jhuber6 created this revision.
jhuber6 added reviewers: JonChesterfield, yaxunl, saiislam, arsenm, 
carlo.bertolli, MaskRay, jdoerfert, tianshilei1992.
Herald added subscribers: kosarev, StephenFan, t-tye, tpr, dstuttard, jvesely, 
kzhuravl.
Herald added a project: All.
jhuber6 requested review of this revision.
Herald added subscribers: cfe-commits, wdng.
Herald added a project: clang.

The AMDGPU library uses several control constants to change code paths
for the math functions and intrinsics. These are normally included using
several individual bitcode libraries at link time. However, this is
problematic because it requires us to know the AMDGPU architecture at
link time, which should not be strictly necessary. This patch adds new
code that emits the constants that would normally be included by the
bitcode libraries. This removes around six libraries we would otherwise
need to include, and we can now link these libraries in unconditionally
like we do with libdevice.


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D130096

Files:
  clang/include/clang/Basic/CodeGenOptions.def
  clang/include/clang/Driver/Options.td
  clang/lib/CodeGen/CodeGenModule.cpp
  clang/lib/CodeGen/TargetInfo.cpp
  clang/lib/CodeGen/TargetInfo.h
  clang/test/CodeGen/amdgcn-occl-constants.c

Index: clang/test/CodeGen/amdgcn-occl-constants.c
===================================================================
--- /dev/null
+++ clang/test/CodeGen/amdgcn-occl-constants.c
@@ -0,0 +1,21 @@
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx90a -S -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx90a -fgpu-fast-relaxed-math \
+// RUN:   -S -emit-llvm -o - %s | FileCheck %s --check-prefix=FAST
+
+void foo() {}
+
+// CHECK: @__oclc_daz_opt = weak_odr hidden local_unnamed_addr addrspace(4) constant i8 0, align 1
+// CHECK: @__oclc_wavefrontsize64 = weak_odr hidden local_unnamed_addr addrspace(4) constant i8 1, align 1
+// CHECK: @__oclc_finite_only_opt = weak_odr hidden local_unnamed_addr addrspace(4) constant i8 0, align 1
+// CHECK: @__oclc_unsafe_math_opt = weak_odr hidden local_unnamed_addr addrspace(4) constant i8 0, align 1
+// CHECK: @__oclc_correctly_rounded_sqrt32 = weak_odr hidden local_unnamed_addr addrspace(4) constant i8 1, align 1
+// CHECK: @__oclc_ISA_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 9010, align 4
+// CHECK: @__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 400, align 4
+
+// FAST: @__oclc_daz_opt = weak_odr hidden local_unnamed_addr addrspace(4) constant i8 0, align 1
+// FAST: @__oclc_wavefrontsize64 = weak_odr hidden local_unnamed_addr addrspace(4) constant i8 1, align 1
+// FAST: @__oclc_finite_only_opt = weak_odr hidden local_unnamed_addr addrspace(4) constant i8 1, align 1
+// FAST: @__oclc_unsafe_math_opt = weak_odr hidden local_unnamed_addr addrspace(4) constant i8 1, align 1
+// FAST: @__oclc_correctly_rounded_sqrt32 = weak_odr hidden local_unnamed_addr addrspace(4) constant i8 1, align 1
+// FAST: @__oclc_ISA_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 9010, align 4
+// FAST: @__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 400, align 4
Index: clang/lib/CodeGen/TargetInfo.h
===================================================================
--- clang/lib/CodeGen/TargetInfo.h
+++ clang/lib/CodeGen/TargetInfo.h
@@ -63,6 +63,9 @@
       CodeGen::CodeGenModule &CGM,
       const llvm::MapVector<GlobalDecl, StringRef> &MangledDeclNames) const {}
 
+  /// Hook for targets to emit extra target-specific globals; no-op by default.
+  virtual void emitTargetGlobals(CodeGen::CodeGenModule &CGM) const {}
+
   /// Any further codegen related checks that need to be done on a function call
   /// in a target specific manner.
   virtual void checkFunctionCallABI(CodeGenModule &CGM, SourceLocation CallLoc,
Index: clang/lib/CodeGen/TargetInfo.cpp
===================================================================
--- clang/lib/CodeGen/TargetInfo.cpp
+++ clang/lib/CodeGen/TargetInfo.cpp
@@ -34,6 +34,7 @@
 #include "llvm/IR/IntrinsicsS390.h"
 #include "llvm/IR/Type.h"
 #include "llvm/Support/MathExtras.h"
+#include "llvm/Support/TargetParser.h"
 #include "llvm/Support/raw_ostream.h"
 #include <algorithm> // std::sort
 
@@ -9307,6 +9308,8 @@
   void setFunctionDeclAttributes(const FunctionDecl *FD, llvm::Function *F,
                                  CodeGenModule &CGM) const;
 
+  void emitTargetGlobals(CodeGen::CodeGenModule &CGM) const override;
+
   void setTargetAttributes(const Decl *D, llvm::GlobalValue *GV,
                            CodeGen::CodeGenModule &M) const override;
   unsigned getOpenCLKernelCallingConv() const override;
@@ -9422,6 +9425,61 @@
   }
 }
 
+void AMDGPUTargetCodeGenInfo::emitTargetGlobals(
+    CodeGen::CodeGenModule &CGM) const {
+  if (!CGM.getTriple().isAMDGCN())
+    return;
+  StringRef CPU = CGM.getTarget().getTargetOpts().CPU;
+  // Look up the GPU kind and feature bits; unrecognized CPUs emit nothing.
+  llvm::AMDGPU::GPUKind Kind = llvm::AMDGPU::parseArchAMDGCN(CPU);
+  unsigned Features = llvm::AMDGPU::getArchAttrAMDGCN(Kind);
+  if (Kind == llvm::AMDGPU::GK_NONE)
+    return;
+
+  unsigned Minor; // Last two chars after the non-digit prefix, read as hex.
+  unsigned Major; // Remaining leading chars, read as decimal.
+  StringRef Identifier = CPU.drop_while([](char C) { return !isDigit(C); });
+  if (Identifier.take_back(2).getAsInteger(16, Minor) ||
+      Identifier.drop_back(2).getAsInteger(10, Major))
+    return;
+
+  auto AddGlobal = [&](StringRef Name, unsigned Value, unsigned Size = 8) {
+    if (CGM.getModule().getNamedGlobal(Name))
+      return; // Keep an existing global of this name untouched.
+
+    auto *Type =
+        llvm::IntegerType::getIntNTy(CGM.getModule().getContext(), Size);
+    auto *GV = new llvm::GlobalVariable(
+        CGM.getModule(), Type, true,
+        llvm::GlobalValue::LinkageTypes::WeakODRLinkage,
+        llvm::ConstantInt::get(Type, Value), Name, nullptr,
+        llvm::GlobalValue::ThreadLocalMode::NotThreadLocal, 4); // addrspace(4)
+    GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Local);
+    GV->setVisibility(llvm::GlobalValue::VisibilityTypes::HiddenVisibility);
+    GV->setAlignment(CGM.getDataLayout().getABITypeAlign(Type));
+  };
+
+  // TODO: Add flags to toggle these as-needed.
+  bool DenormAtZero = !((Features & llvm::AMDGPU::FEATURE_FAST_FMA_F32) &&
+                        (Features & llvm::AMDGPU::FEATURE_FAST_DENORMAL_F32));
+  bool Wavefront64 = !(Features & llvm::AMDGPU::FEATURE_WAVE32);
+  bool FastRelaxedMath = CGM.getCodeGenOpts().GPUFastRelaxedMath;
+  bool FiniteOnly = false;
+  bool UnsafeMath = false;
+  bool CorrectSqrt = true;
+
+  // Control constants for math operations.
+  AddGlobal("__oclc_daz_opt", DenormAtZero);
+  AddGlobal("__oclc_wavefrontsize64", Wavefront64);
+  AddGlobal("__oclc_finite_only_opt", FiniteOnly || FastRelaxedMath);
+  AddGlobal("__oclc_unsafe_math_opt", UnsafeMath || FastRelaxedMath);
+  AddGlobal("__oclc_correctly_rounded_sqrt32", CorrectSqrt);
+
+  // Control constants for the system.
+  AddGlobal("__oclc_ISA_version", Minor + Major * 1000, 32); // gfx90a -> 9010
+  AddGlobal("__oclc_ABI_version", 400, 32);
+}
+
 void AMDGPUTargetCodeGenInfo::setTargetAttributes(
     const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const {
   if (requiresAMDGPUProtectedVisibility(D, GV)) {
Index: clang/lib/CodeGen/CodeGenModule.cpp
===================================================================
--- clang/lib/CodeGen/CodeGenModule.cpp
+++ clang/lib/CodeGen/CodeGenModule.cpp
@@ -919,6 +919,7 @@
   if (getCodeGenOpts().SkipRaxSetup)
     getModule().addModuleFlag(llvm::Module::Override, "SkipRaxSetup", 1);
 
+  getTargetCodeGenInfo().emitTargetGlobals(*this);
   getTargetCodeGenInfo().emitTargetMetadata(*this, MangledDeclNames);
 
   EmitBackendOptionsMetadata(getCodeGenOpts());
Index: clang/include/clang/Driver/Options.td
===================================================================
--- clang/include/clang/Driver/Options.td
+++ clang/include/clang/Driver/Options.td
@@ -964,6 +964,10 @@
   Alias<fgpu_flush_denormals_to_zero>;
 def fno_cuda_flush_denormals_to_zero : Flag<["-"], "fno-cuda-flush-denormals-to-zero">,
   Alias<fno_gpu_flush_denormals_to_zero>;
+defm gpu_fast_relaxed_math : BoolFOption<"gpu-fast-relaxed-math",
+  CodeGenOpts<"GPUFastRelaxedMath">, DefaultFalse,
+  PosFlag<SetTrue, [CC1Option], "Use relaxed math for the AMDGPU math library">,
+  NegFlag<SetFalse>>;
 defm gpu_rdc : BoolFOption<"gpu-rdc",
   LangOpts<"GPURelocatableDeviceCode">, DefaultFalse,
   PosFlag<SetTrue, [CC1Option], "Generate relocatable device code, also known as separate compilation mode">,
Index: clang/include/clang/Basic/CodeGenOptions.def
===================================================================
--- clang/include/clang/Basic/CodeGenOptions.def
+++ clang/include/clang/Basic/CodeGenOptions.def
@@ -467,6 +467,8 @@
 /// propagate signaling NaN inputs per IEEE 754-2008 (AMDGPU Only)
 CODEGENOPT(EmitIEEENaNCompliantInsts, 1, 1)
 
+CODEGENOPT(GPUFastRelaxedMath, 1, 0)
+
 // Whether to emit Swift Async function extended frame information: auto,
 // never, always.
 ENUM_CODEGENOPT(SwiftAsyncFramePointer, SwiftAsyncFramePointerKind, 2,
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to