yaxunl created this revision.
yaxunl added reviewers: tra, rjmccall.

nvcc supports accessing file-scope static device variables in host code by host 
APIs
like cudaMemcpyToSymbol etc.

HIP let users access device variables in host code by shadow variables. In host 
compilation,
clang emits a shadow variable for each device variable, and calls 
`__hipRegisterVariable` to
register it in init function. The address of the shadow variable and the device 
side mangled
name of the device variable is passed to `__hipRegisterVariable`. Runtime looks 
up the symbol
by name in the device binary (so called code object, which is actually elf 
format) to find the
address of the device variable.

The problem with static device variables is that they have internal linkage, 
therefore their
name may be changed by the linker if there are multiple symbols with the same 
name. Also
they end up as local symbols in the elf file, whereas the runtime only look up 
the global symbols.

To support accessing file-scope static device variables in host code, we need 
to give them
a unique name and external linkage. This can be done by postfixing each static 
device variable with
a unique CUID (Compilation Unit ID). Also we have to make sure the host 
compilation and device
compilation of the same compilation unit use identical CUID.

This patch added a 32bit unique CUID for each input file, which is represented 
by InputAction.
clang initially creates an InputAction for each input file for the host 
compilation. In HIP action
builder, each InputAction is cloned for each GPU arch, and the CUID is also 
cloned. In this way,
we guarantee the corresponding device and host compilation for the same file 
shared the
same CUID, therefore the postfixed device variable and shadow variable share 
the same name.
On the other hand, different compilation units have different CUID, therefore a 
static variable
with the same name but in a different compilation unit will have a different 
name.

Since the static device variables have different name across compilation units, 
now we let
them have external linkage so that they can be looked up by the runtime.


https://reviews.llvm.org/D80858

Files:
  clang/include/clang/Basic/LangOptions.def
  clang/include/clang/Driver/Action.h
  clang/include/clang/Driver/CC1Options.td
  clang/lib/AST/ASTContext.cpp
  clang/lib/CodeGen/CGCUDANV.cpp
  clang/lib/CodeGen/CodeGenModule.cpp
  clang/lib/Driver/Action.cpp
  clang/lib/Driver/Driver.cpp
  clang/lib/Driver/ToolChains/Clang.cpp
  clang/lib/Frontend/CompilerInvocation.cpp
  clang/test/CodeGenCUDA/static-device-var.cu
  clang/test/Driver/hip-cuid.hip

Index: clang/test/Driver/hip-cuid.hip
===================================================================
--- /dev/null
+++ clang/test/Driver/hip-cuid.hip
@@ -0,0 +1,20 @@
+// REQUIRES: clang-driver
+// REQUIRES: x86-registered-target
+// REQUIRES: amdgpu-registered-target
+
+// RUN: %clang -### -x hip \
+// RUN:   -target x86_64-unknown-linux-gnu \
+// RUN:   --offload-arch=gfx900 \
+// RUN:   --offload-arch=gfx906 \
+// RUN:   -c -nogpulib \
+// RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
+// RUN: 2>&1 | FileCheck %s
+
+// CHECK: "{{.*}}clang{{.*}}" "-cc1" "-triple" "amdgcn-amd-amdhsa"
+// CHECK-SAME: "-hip-cuid=[[CUID:[^"]+]]"
+
+// CHECK: "{{.*}}clang{{.*}}" "-cc1" "-triple" "amdgcn-amd-amdhsa"
+// CHECK-SAME: "-hip-cuid=[[CUID]]"
+
+// CHECK: "{{.*}}clang{{.*}}" "-cc1" "-triple" "x86_64-unknown-linux-gnu"
+// CHECK-SAME: "-hip-cuid=[[CUID]]"
Index: clang/test/CodeGenCUDA/static-device-var.cu
===================================================================
--- /dev/null
+++ clang/test/CodeGenCUDA/static-device-var.cu
@@ -0,0 +1,57 @@
+// REQUIRES: x86-registered-target
+// REQUIRES: amdgpu-registered-target
+
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device \
+// RUN:    -emit-llvm -o - -x hip %s | FileCheck -check-prefix=INT-DEV %s
+
+// RUN: %clang_cc1 -triple x86_64-gnu-linux \
+// RUN:    -emit-llvm -o - -x hip %s | FileCheck -check-prefix=INT-HOST %s
+
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -hip-cuid=123 \
+// RUN:    -emit-llvm -o - -x hip %s | FileCheck -check-prefix=EXT-DEV %s
+
+// RUN: %clang_cc1 -triple x86_64-gnu-linux -hip-cuid=123 \
+// RUN:    -emit-llvm -o - -x hip %s | FileCheck -check-prefix=EXT-HOST %s
+
+#include "Inputs/cuda.h"
+
+// Test normal static device variables
+// INT-DEV: @_ZL1x = internal addrspace(1) global i32 0
+// INT-HOST-DAG: @_ZL1x = internal global i32 undef
+// INT-HOST-DAG: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"_ZL1x\00"
+
+// Test externalized static device variables
+// EXT-DEV: @_ZL1x.hip.static.123 = addrspace(1) externally_initialized global i32 0
+// EXT-HOST-DAG: @_ZL1x.hip.static.123 = internal global i32 undef
+// EXT-HOST-DAG: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"_ZL1x.hip.static.123\00"
+
+static __device__ int x;
+
+// Test normal static device variables
+// INT-DEV: @_ZL1y = internal addrspace(4) global i32 0
+// INT-HOST-DAG: @_ZL1y = internal global i32 undef
+// INT-HOST-DAG: @[[DEVNAMEY:[0-9]+]] = {{.*}}c"_ZL1y\00"
+
+// Test externalized static device variables
+// EXT-DEV: @_ZL1y.hip.static.123 = addrspace(4) externally_initialized global i32 0
+// EXT-HOST-DAG: @_ZL1y.hip.static.123 = internal global i32 undef
+// EXT-HOST-DAG: @[[DEVNAMEY:[0-9]+]] = {{.*}}c"_ZL1y.hip.static.123\00"
+
+static __constant__ int y;
+
+__global__ void kernel(int *a) {
+  a[0] = x;
+  a[1] = y;
+}
+
+int* getDeviceSymbol(int *x);
+
+void foo() {
+  getDeviceSymbol(&x);
+  getDeviceSymbol(&y);
+}
+
+// INT-HOST: __hipRegisterVar({{.*}}@_ZL1x{{.*}}@[[DEVNAMEX]]
+// INT-HOST: __hipRegisterVar({{.*}}@_ZL1y{{.*}}@[[DEVNAMEY]]
+// EXT-HOST: __hipRegisterVar({{.*}}@_ZL1x.hip.static.123{{.*}}@[[DEVNAMEX]]
+// EXT-HOST: __hipRegisterVar({{.*}}@_ZL1y.hip.static.123{{.*}}@[[DEVNAMEY]]
Index: clang/lib/Frontend/CompilerInvocation.cpp
===================================================================
--- clang/lib/Frontend/CompilerInvocation.cpp
+++ clang/lib/Frontend/CompilerInvocation.cpp
@@ -2579,12 +2579,16 @@
   }
   Opts.HIPUseNewLaunchAPI = Args.hasArg(OPT_fhip_new_launch_api);
   Opts.HIPLambdaHostDevice = Args.hasArg(OPT_fhip_lambda_host_device);
-  if (Opts.HIP)
+  if (Opts.HIP) {
     Opts.GPUMaxThreadsPerBlock = getLastArgIntValue(
         Args, OPT_gpu_max_threads_per_block_EQ, Opts.GPUMaxThreadsPerBlock);
-  else if (Args.hasArg(OPT_gpu_max_threads_per_block_EQ))
-    Diags.Report(diag::warn_ignored_hip_only_option)
-        << Args.getLastArg(OPT_gpu_max_threads_per_block_EQ)->getAsString(Args);
+    Opts.HIPCUID = getLastArgIntValue(Args, OPT_hip_cuid_EQ, Opts.HIPCUID);
+  } else {
+    for (auto Opt : {OPT_gpu_max_threads_per_block_EQ, OPT_hip_cuid_EQ})
+      if (Args.hasArg(Opt))
+        Diags.Report(diag::warn_ignored_hip_only_option)
+            << Args.getLastArg(Opt)->getAsString(Args);
+  }
 
   if (Opts.ObjC) {
     if (Arg *arg = Args.getLastArg(OPT_fobjc_runtime_EQ)) {
Index: clang/lib/Driver/ToolChains/Clang.cpp
===================================================================
--- clang/lib/Driver/ToolChains/Clang.cpp
+++ clang/lib/Driver/ToolChains/Clang.cpp
@@ -6004,8 +6004,17 @@
       CmdArgs.push_back("-fcuda-short-ptr");
   }
 
-  if (IsHIP)
+  if (IsHIP) {
+    // Determine the original source input.
+    const Action *SourceAction = &JA;
+    while (SourceAction->getKind() != Action::InputClass) {
+      assert(!SourceAction->getInputs().empty() && "unexpected root action!");
+      SourceAction = SourceAction->getInputs()[0];
+    }
+    CmdArgs.push_back(Args.MakeArgString(
+        Twine("-hip-cuid=") + Twine(cast<InputAction>(SourceAction)->getId())));
     CmdArgs.push_back("-fcuda-allow-variadic-functions");
+  }
 
   // OpenMP offloading device jobs take the argument -fopenmp-host-ir-file-path
   // to specify the result of the compile phase on the host, so the meaningful
Index: clang/lib/Driver/Driver.cpp
===================================================================
--- clang/lib/Driver/Driver.cpp
+++ clang/lib/Driver/Driver.cpp
@@ -2410,7 +2410,7 @@
                                                  : types::TY_CUDA_DEVICE;
         for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) {
           CudaDeviceActions.push_back(
-              C.MakeAction<InputAction>(IA->getInputArg(), Ty));
+              C.MakeAction<InputAction>(IA->getInputArg(), Ty, IA->getId()));
         }
 
         return ABRT_Success;
Index: clang/lib/Driver/Action.cpp
===================================================================
--- clang/lib/Driver/Action.cpp
+++ clang/lib/Driver/Action.cpp
@@ -8,6 +8,7 @@
 
 #include "clang/Driver/Action.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Process.h"
 #include <cassert>
 #include <string>
 
@@ -163,8 +164,11 @@
 
 void InputAction::anchor() {}
 
-InputAction::InputAction(const Arg &_Input, types::ID _Type)
-    : Action(InputClass, _Type), Input(_Input) {}
+InputAction::InputAction(const Arg &_Input, types::ID _Type, unsigned _Id)
+    : Action(InputClass, _Type), Input(_Input), Id(_Id) {
+  if (!Id)
+    Id = llvm::sys::Process::GetRandomNumber();
+}
 
 void BindArchAction::anchor() {}
 
Index: clang/lib/CodeGen/CodeGenModule.cpp
===================================================================
--- clang/lib/CodeGen/CodeGenModule.cpp
+++ clang/lib/CodeGen/CodeGenModule.cpp
@@ -1087,6 +1087,15 @@
       }
     }
 
+  // Make unique name for device side static file-scope variable for HIP.
+  if (CGM.getLangOpts().HIP && CGM.getLangOpts().HIPCUID) {
+    if (const auto *VD = dyn_cast<VarDecl>(ND)) {
+      if ((VD->hasAttr<CUDADeviceAttr>() || VD->hasAttr<CUDAConstantAttr>()) &&
+          VD->isFileVarDecl() && VD->getStorageClass() == SC_Static) {
+        Out << ".hip.static." << CGM.getLangOpts().HIPCUID;
+      }
+    }
+  }
   return std::string(Out.str());
 }
 
Index: clang/lib/CodeGen/CGCUDANV.cpp
===================================================================
--- clang/lib/CodeGen/CGCUDANV.cpp
+++ clang/lib/CodeGen/CGCUDANV.cpp
@@ -238,6 +238,19 @@
     DeviceSideName = std::string(Out.str());
   } else
     DeviceSideName = std::string(ND->getIdentifier()->getName());
+
+  // Make unique name for device side static file-scope variable for HIP.
+  if (CGM.getLangOpts().HIP && CGM.getLangOpts().HIPCUID) {
+    if (const auto *VD = dyn_cast<VarDecl>(ND)) {
+      if ((VD->hasAttr<CUDADeviceAttr>() || VD->hasAttr<CUDAConstantAttr>()) &&
+          VD->isFileVarDecl() && VD->getStorageClass() == SC_Static) {
+        SmallString<256> Buffer;
+        llvm::raw_svector_ostream Out(Buffer);
+        Out << DeviceSideName << ".hip.static." << CGM.getLangOpts().HIPCUID;
+        DeviceSideName = std::string(Out.str());
+      }
+    }
+  }
   return DeviceSideName;
 }
 
Index: clang/lib/AST/ASTContext.cpp
===================================================================
--- clang/lib/AST/ASTContext.cpp
+++ clang/lib/AST/ASTContext.cpp
@@ -10055,12 +10055,19 @@
   } else if (D->hasAttr<DLLExportAttr>()) {
     if (L == GVA_DiscardableODR)
       return GVA_StrongODR;
-  } else if (Context.getLangOpts().CUDA && Context.getLangOpts().CUDAIsDevice &&
-             D->hasAttr<CUDAGlobalAttr>()) {
+  } else if (Context.getLangOpts().CUDA && Context.getLangOpts().CUDAIsDevice) {
     // Device-side functions with __global__ attribute must always be
     // visible externally so they can be launched from host.
-    if (L == GVA_DiscardableODR || L == GVA_Internal)
+    if (D->hasAttr<CUDAGlobalAttr>() &&
+        (L == GVA_DiscardableODR || L == GVA_Internal))
       return GVA_StrongODR;
+    // Externalize device side static file-scope variable for HIP.
+    if (Context.getLangOpts().HIP && Context.getLangOpts().HIPCUID &&
+        (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>()) &&
+        isa<VarDecl>(D) && cast<VarDecl>(D)->isFileVarDecl() &&
+        cast<VarDecl>(D)->getStorageClass() == SC_Static) {
+      return GVA_StrongExternal;
+    }
   }
   return L;
 }
Index: clang/include/clang/Driver/CC1Options.td
===================================================================
--- clang/include/clang/Driver/CC1Options.td
+++ clang/include/clang/Driver/CC1Options.td
@@ -891,6 +891,14 @@
   HelpText<"Don't treat unattributed constexpr functions as __host__ __device__.">;
 
 //===----------------------------------------------------------------------===//
+// HIP Options
+//===----------------------------------------------------------------------===//
+
+def hip_cuid_EQ : Joined<["-"], "hip-cuid=">,
+  HelpText<"A unique id to identify a HIP compilation unit, which can be used to "
+           "identify a device-side static declaration in host compilation">;
+
+//===----------------------------------------------------------------------===//
 // OpenMP Options
 //===----------------------------------------------------------------------===//
 
Index: clang/include/clang/Driver/Action.h
===================================================================
--- clang/include/clang/Driver/Action.h
+++ clang/include/clang/Driver/Action.h
@@ -213,14 +213,17 @@
 
 class InputAction : public Action {
   const llvm::opt::Arg &Input;
-
+  unsigned Id;
   virtual void anchor();
 
 public:
-  InputAction(const llvm::opt::Arg &Input, types::ID Type);
+  InputAction(const llvm::opt::Arg &Input, types::ID Type, unsigned Id = 0);
 
   const llvm::opt::Arg &getInputArg() const { return Input; }
 
+  void setId(unsigned _Id) { Id = _Id; }
+  unsigned getId() const { return Id; }
+
   static bool classof(const Action *A) {
     return A->getKind() == InputClass;
   }
Index: clang/include/clang/Basic/LangOptions.def
===================================================================
--- clang/include/clang/Basic/LangOptions.def
+++ clang/include/clang/Basic/LangOptions.def
@@ -232,6 +232,7 @@
 LANGOPT(GPURelocatableDeviceCode, 1, 0, "generate relocatable device code")
 LANGOPT(GPUAllowDeviceInit, 1, 0, "allowing device side global init functions for HIP")
 LANGOPT(GPUMaxThreadsPerBlock, 32, 256, "default max threads per block for kernel launch bounds for HIP")
+LANGOPT(HIPCUID, 32, 0, "default compilation unit id for HIP")
 
 LANGOPT(SYCL              , 1, 0, "SYCL")
 LANGOPT(SYCLIsDevice      , 1, 0, "Generate code for SYCL device")
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to