This revision was automatically updated to reflect the committed changes.
Closed by commit rGb008ea304d43: [CUDA][HIP] Fix device variable linkage 
(authored by yaxunl).
Herald added a project: clang.

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D95901/new/

https://reviews.llvm.org/D95901

Files:
  clang/lib/AST/ASTContext.cpp
  clang/lib/CodeGen/CGCUDANV.cpp
  clang/lib/CodeGen/CodeGenModule.cpp
  clang/test/CodeGenCUDA/device-stub.cu
  clang/test/CodeGenCUDA/device-var-linkage.cu
  clang/test/CodeGenCUDA/managed-var.cu

Index: clang/test/CodeGenCUDA/managed-var.cu
===================================================================
--- clang/test/CodeGenCUDA/managed-var.cu
+++ clang/test/CodeGenCUDA/managed-var.cu
@@ -10,17 +10,19 @@
 
 // RUN: %clang_cc1 -triple x86_64-gnu-linux -std=c++11 \
 // RUN:   -emit-llvm -o - -x hip %s | FileCheck \
-// RUN:   -check-prefixes=HOST %s
+// RUN:   -check-prefixes=HOST,NORDC %s
 
 // RUN: %clang_cc1 -triple x86_64-gnu-linux -std=c++11 \
 // RUN:   -emit-llvm -fgpu-rdc -o - -x hip %s | FileCheck \
-// RUN:   -check-prefixes=HOST %s
+// RUN:   -check-prefixes=HOST,RDC %s
 
 #include "Inputs/cuda.h"
 
-// DEV-DAG: @x = {{.*}}addrspace(1) externally_initialized global i32 undef
-// HOST-DAG: @x = internal global i32 1
-// HOST-DAG: @x.managed = internal global i32* null
+// DEV-DAG: @x = external addrspace(1) externally_initialized global i32
+// NORDC-DAG: @x = internal global i32 1
+// RDC-DAG: @x = dso_local global i32 1
+// NORDC-DAG: @x.managed = internal global i32* null
+// RDC-DAG: @x.managed = dso_local global i32* null
 // HOST-DAG: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"x\00"
 
 struct vec {
@@ -31,11 +33,28 @@
 __managed__ vec v[100];
 __managed__ vec v2[100] = {{1, 1, 1}};
 
+// DEV-DAG: @ex = external addrspace(1) global i32
+// HOST-DAG: @ex = external global i32
+extern __managed__ int ex;
+
+// DEV-DAG: @_ZL2sx = external addrspace(1) externally_initialized global i32
+// HOST-DAG: @_ZL2sx = internal global i32 1
+// HOST-DAG: @_ZL2sx.managed = internal global i32* null
+static __managed__ int sx = 1;
+
+// HOST-NOT: @ex.managed
+
+// Force ex and sx mitted in device compilation.
 __global__ void foo(int *z) {
-  *z = x;
+  *z = x + ex + sx;
   v[1].x = 2;
 }
 
+// Force ex and sx emitted in host compilatioin.
+int foo2() {
+  return ex + sx;
+}
+
 // HOST-LABEL: define {{.*}}@_Z4loadv()
 // HOST:  %ld.managed = load i32*, i32** @x.managed, align 4
 // HOST:  %0 = load i32, i32* %ld.managed, align 4
@@ -97,4 +116,6 @@
 }
 
 // HOST-DAG: __hipRegisterManagedVar({{.*}}@x.managed {{.*}}@x {{.*}}@[[DEVNAMEX]]{{.*}}, i64 4, i32 4)
+// HOST-DAG: __hipRegisterManagedVar({{.*}}@_ZL2sx.managed {{.*}}@_ZL2sx
+// HOST-NOT: __hipRegisterManagedVar({{.*}}@ex.managed {{.*}}@ex
 // HOST-DAG: declare void @__hipRegisterManagedVar(i8**, i8*, i8*, i8*, i64, i32)
Index: clang/test/CodeGenCUDA/device-var-linkage.cu
===================================================================
--- /dev/null
+++ clang/test/CodeGenCUDA/device-var-linkage.cu
@@ -0,0 +1,65 @@
+// RUN: %clang_cc1 -triple nvptx -fcuda-is-device \
+// RUN:   -emit-llvm -o - -x hip %s \
+// RUN:   | FileCheck -check-prefixes=DEV,NORDC %s
+// RUN: %clang_cc1 -triple nvptx -fcuda-is-device \
+// RUN:   -fgpu-rdc -emit-llvm -o - -x hip %s \
+// RUN:   | FileCheck -check-prefixes=DEV,RDC %s
+// RUN: %clang_cc1 -triple nvptx \
+// RUN:   -emit-llvm -o - -x hip %s \
+// RUN:   | FileCheck -check-prefixes=HOST,NORDC-H %s
+// RUN: %clang_cc1 -triple nvptx \
+// RUN:   -fgpu-rdc -emit-llvm -o - -x hip %s \
+// RUN:   | FileCheck -check-prefixes=HOST,RDC-H %s
+
+#include "Inputs/cuda.h"
+
+// DEV-DAG: @v1 = dso_local addrspace(1) externally_initialized global i32 0
+// NORDC-H-DAG: @v1 = internal global i32 undef
+// RDC-H-DAG: @v1 = dso_local global i32 undef
+__device__ int v1;
+// DEV-DAG: @v2 = dso_local addrspace(4) externally_initialized global i32 0
+// NORDC-H-DAG: @v2 = internal global i32 undef
+// RDC-H-DAG: @v2 = dso_local global i32 undef
+__constant__ int v2;
+// DEV-DAG: @v3 = external addrspace(1) externally_initialized global i32
+// NORDC-H-DAG: @v3 = internal global i32 0
+// RDC-H-DAG: @v3 = dso_local global i32 0
+__managed__ int v3;
+
+// DEV-DAG: @ev1 = external addrspace(1) global i32
+// HOST-DAG: @ev1 = external global i32
+extern __device__ int ev1;
+// DEV-DAG: @ev2 = external addrspace(4) global i32
+// HOST-DAG: @ev2 = external global i32
+extern __constant__ int ev2;
+// DEV-DAG: @ev3 = external addrspace(1) global i32
+// HOST-DAG: @ev3 = external global i32
+extern __managed__ int ev3;
+
+// NORDC-DAG: @_ZL3sv1 = dso_local addrspace(1) externally_initialized global i32 0
+// RDC-DAG: @_ZL3sv1 = internal addrspace(1) global i32 0
+// HOST-DAG: @_ZL3sv1 = internal global i32 undef
+static __device__ int sv1;
+// NORDC-DAG: @_ZL3sv2 = dso_local addrspace(4) externally_initialized global i32 0
+// RDC-DAG: @_ZL3sv2 = internal addrspace(4) global i32 0
+// HOST-DAG: @_ZL3sv2 = internal global i32 undef
+static __constant__ int sv2;
+// DEV-DAG: @_ZL3sv3 = external addrspace(1) externally_initialized global i32
+// HOST-DAG: @_ZL3sv3 = internal global i32 0
+static __managed__ int sv3;
+
+__device__ __host__ int work(int *x);
+
+__device__ __host__ int fun1() {
+  return work(&ev1) + work(&ev2) + work(&ev3) + work(&sv1) + work(&sv2) + work(&sv3);
+}
+
+// HOST: hipRegisterVar({{.*}}@v1
+// HOST: hipRegisterVar({{.*}}@v2
+// HOST: hipRegisterManagedVar({{.*}}@v3
+// HOST-NOT: hipRegisterVar({{.*}}@ev1
+// HOST-NOT: hipRegisterVar({{.*}}@ev2
+// HOST-NOT: hipRegisterManagedVar({{.*}}@ev3
+// HOST: hipRegisterVar({{.*}}@_ZL3sv1
+// HOST: hipRegisterVar({{.*}}@_ZL3sv2
+// HOST: hipRegisterManagedVar({{.*}}@_ZL3sv3
Index: clang/test/CodeGenCUDA/device-stub.cu
===================================================================
--- clang/test/CodeGenCUDA/device-stub.cu
+++ clang/test/CodeGenCUDA/device-stub.cu
@@ -30,9 +30,13 @@
 // RUN:   | FileCheck %s -allow-deprecated-dag-overlap \
 // RUN:       --check-prefixes=ALL,LNX,RDC,CUDA,CUDARDC,CUDA-NEW
 // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -std=c++17 \
+// RUN:     -target-sdk-version=9.2 -fcuda-include-gpubinary %t -o - \
+// RUN:   | FileCheck %s -allow-deprecated-dag-overlap \
+// RUN:       --check-prefixes=ALL,LNX,NORDC,CUDA,CUDANORDC,CUDA-NEW,LNX_17,NORDC17
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -std=c++17 \
 // RUN:     -target-sdk-version=9.2 -fgpu-rdc -fcuda-include-gpubinary %t -o - \
 // RUN:   | FileCheck %s -allow-deprecated-dag-overlap \
-// RUN:       --check-prefixes=ALL,LNX,RDC,CUDA,CUDARDC,CUDA-NEW,LNX_17
+// RUN:       --check-prefixes=ALL,LNX,RDC,CUDA,CUDARDC,CUDA-NEW,LNX_17,RDC17
 // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
 // RUN:     -target-sdk-version=9.2 -o - \
 // RUN:   | FileCheck -allow-deprecated-dag-overlap %s -check-prefix=NOGPUBIN
@@ -45,7 +49,7 @@
 // RUN:   | FileCheck -allow-deprecated-dag-overlap %s -check-prefixes=NOGLOBALS,HIPNOGLOBALS
 // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
 // RUN:     -fgpu-rdc -fcuda-include-gpubinary %t -o - -x hip \
-// RUN:   | FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=ALL,LNX,NORDC,HIP,HIPEF
+// RUN:   | FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=ALL,LNX,RDC,HIP,HIPEF
 // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -o - -x hip\
 // RUN:   | FileCheck -allow-deprecated-dag-overlap %s -check-prefixes=ALL,LNX,NORDC,HIP,HIPNEF
 
@@ -56,15 +60,18 @@
 #include "Inputs/cuda.h"
 
 #ifndef NOGLOBALS
-// LNX-DAG: @device_var = internal global i32
+// NORDC-DAG: @device_var = internal global i32
+// RDC-DAG: @device_var = dso_local global i32
 // WIN-DAG: @"?device_var@@3HA" = internal global i32
 __device__ int device_var;
 
-// LNX-DAG: @constant_var = internal global i32
+// NORDC-DAG: @constant_var = internal global i32
+// RDC-DAG: @constant_var = dso_local global i32
 // WIN-DAG: @"?constant_var@@3HA" = internal global i32
 __constant__ int constant_var;
 
-// LNX-DAG: @shared_var = internal global i32
+// NORDC-DAG: @shared_var = internal global i32
+// RDC-DAG: @shared_var = dso_local global i32
 // WIN-DAG: @"?shared_var@@3HA" = internal global i32
 __shared__ int shared_var;
 
@@ -87,18 +94,21 @@
 
 // external device-side variables with definitions should generate
 // definitions for the shadows.
-// LNX-DAG: @ext_device_var_def = internal global i32 undef,
+// NORDC-DAG: @ext_device_var_def = internal global i32 undef,
+// RDC-DAG: @ext_device_var_def = dso_local global i32 undef,
 // WIN-DAG: @"?ext_device_var_def@@3HA" = internal global i32 undef
 extern __device__ int ext_device_var_def;
 __device__ int ext_device_var_def = 1;
-// LNX-DAG: @ext_device_var_def = internal global i32 undef,
+// NORDC-DAG: @ext_device_var_def = internal global i32 undef,
+// RDC-DAG: @ext_device_var_def = dso_local global i32 undef,
 // WIN-DAG: @"?ext_constant_var_def@@3HA" = internal global i32 undef
 __constant__ int ext_constant_var_def = 2;
 
 #if __cplusplus > 201402L
-/// FIXME: Reject __device__ constexpr and inline variables in Sema.
-// LNX_17: @inline_var = internal global i32 undef, comdat, align 4{{$}}
-// LNX_17: @_ZN1C17member_inline_varE = internal constant i32 undef, comdat, align 4{{$}}
+// NORDC17: @inline_var = internal global i32 undef, comdat, align 4{{$}}
+// RDC17: @inline_var = linkonce_odr global i32 undef, comdat, align 4{{$}}
+// NORDC17: @_ZN1C17member_inline_varE = internal constant i32 undef, comdat, align 4{{$}}
+// RDC17: @_ZN1C17member_inline_varE = linkonce_odr constant i32 undef, comdat, align 4{{$}}
 __device__ inline int inline_var = 3;
 struct C {
   __device__ static constexpr int member_inline_var = 4;
@@ -151,13 +161,13 @@
 // CUDANORDC: @__[[PREFIX]]_gpubin_handle = internal global i8** null
 // HIPNEF: @__[[PREFIX]]_gpubin_handle = linkonce hidden global i8** null
 // * constant unnamed string with NVModuleID
-// RDC: [[MODULE_ID_GLOBAL:@.*]] = private constant
+// CUDARDC: [[MODULE_ID_GLOBAL:@.*]] = private constant
 // CUDARDC-SAME: c"[[MODULE_ID:.+]]\00", section "__nv_module_id", align 32
 // * Make sure our constructor was added to global ctor list.
 // LNX: @llvm.global_ctors = appending global {{.*}}@__[[PREFIX]]_module_ctor
 // * Alias to global symbol containing the NVModuleID.
-// RDC: @__fatbinwrap[[MODULE_ID]] ={{.*}} alias { i32, i32, i8*, i8* }
-// RDC-SAME: { i32, i32, i8*, i8* }* @__[[PREFIX]]_fatbin_wrapper
+// CUDARDC: @__fatbinwrap[[MODULE_ID]] ={{.*}} alias { i32, i32, i8*, i8* }
+// CUDARDC-SAME: { i32, i32, i8*, i8* }* @__[[PREFIX]]_fatbin_wrapper
 
 // Test that we build the correct number of calls to cudaSetupArgument followed
 // by a call to cudaLaunch.
@@ -214,25 +224,33 @@
 // HIP-NEXT: icmp eq i8** {{.*}}, null
 // HIP-NEXT: br i1 {{.*}}, label %if, label %exit
 // HIP: if:
-// NORDC: call{{.*}}[[PREFIX]]RegisterFatBinary{{.*}}__[[PREFIX]]_fatbin_wrapper
+// CUDANORDC: call{{.*}}[[PREFIX]]RegisterFatBinary{{.*}}__[[PREFIX]]_fatbin_wrapper
+//   .. stores return value in __[[PREFIX]]_gpubin_handle
+// CUDANORDC-NEXT: store{{.*}}__[[PREFIX]]_gpubin_handle
+//   .. and then calls __[[PREFIX]]_register_globals
+// HIP: call{{.*}}[[PREFIX]]RegisterFatBinary{{.*}}__[[PREFIX]]_fatbin_wrapper
 //   .. stores return value in __[[PREFIX]]_gpubin_handle
-// NORDC-NEXT: store{{.*}}__[[PREFIX]]_gpubin_handle
+// HIP-NEXT: store{{.*}}__[[PREFIX]]_gpubin_handle
 //   .. and then calls __[[PREFIX]]_register_globals
 // HIP-NEXT: br label %exit
 // HIP: exit:
 // HIP-NEXT: load i8**, i8*** @__hip_gpubin_handle
-// NORDC-NEXT: call void @__[[PREFIX]]_register_globals
+// CUDANORDC-NEXT: call void @__[[PREFIX]]_register_globals
+// HIP-NEXT: call void @__[[PREFIX]]_register_globals
 // * In separate mode we also register a destructor.
-// NORDC-NEXT: call i32 @atexit(void (i8*)* @__[[PREFIX]]_module_dtor)
+// CUDANORDC-NEXT: call i32 @atexit(void (i8*)* @__[[PREFIX]]_module_dtor)
+// HIP-NEXT: call i32 @atexit(void (i8*)* @__[[PREFIX]]_module_dtor)
 
 // With relocatable device code we call __[[PREFIX]]RegisterLinkedBinary%NVModuleID%
-// RDC: call{{.*}}__[[PREFIX]]RegisterLinkedBinary[[MODULE_ID]](
-// RDC-SAME: __[[PREFIX]]_register_globals, {{.*}}__[[PREFIX]]_fatbin_wrapper
-// RDC-SAME: [[MODULE_ID_GLOBAL]]
+// CUDARDC: call{{.*}}__[[PREFIX]]RegisterLinkedBinary[[MODULE_ID]](
+// CUDARDC-SAME: __[[PREFIX]]_register_globals, {{.*}}__[[PREFIX]]_fatbin_wrapper
+// CUDARDC-SAME: [[MODULE_ID_GLOBAL]]
 
 // Test that we've created destructor.
-// NORDC: define internal void @__[[PREFIX]]_module_dtor
-// NORDC: load{{.*}}__[[PREFIX]]_gpubin_handle
+// CUDANORDC: define internal void @__[[PREFIX]]_module_dtor
+// HIP: define internal void @__[[PREFIX]]_module_dtor
+// CUDANORDC: load{{.*}}__[[PREFIX]]_gpubin_handle
+// HIP: load{{.*}}__[[PREFIX]]_gpubin_handle
 // CUDANORDC-NEXT: call void @__[[PREFIX]]UnregisterFatBinary
 // HIP-NEXT: icmp ne i8** {{.*}}, null
 // HIP-NEXT: br i1 {{.*}}, label %if, label %exit
Index: clang/lib/CodeGen/CodeGenModule.cpp
===================================================================
--- clang/lib/CodeGen/CodeGenModule.cpp
+++ clang/lib/CodeGen/CodeGenModule.cpp
@@ -4169,8 +4169,12 @@
   bool NeedsGlobalDtor =
       D->needsDestruction(getContext()) == QualType::DK_cxx_destructor;
 
+  bool IsHIPManagedVarOnDevice =
+      getLangOpts().CUDAIsDevice && D->hasAttr<HIPManagedAttr>();
+
   const VarDecl *InitDecl;
-  const Expr *InitExpr = D->getAnyInitializer(InitDecl);
+  const Expr *InitExpr =
+      IsHIPManagedVarOnDevice ? nullptr : D->getAnyInitializer(InitDecl);
 
   Optional<ConstantEmitter> emitter;
 
@@ -4190,8 +4194,6 @@
       (D->getType()->isCUDADeviceBuiltinSurfaceType() ||
        D->getType()->isCUDADeviceBuiltinTextureType() ||
        D->hasAttr<HIPManagedAttr>());
-  // HIP pinned shadow of initialized host-side global variables are also
-  // left undefined.
   if (getLangOpts().CUDA &&
       (IsCUDASharedVar || IsCUDAShadowVar || IsCUDADeviceShadowVar))
     Init = llvm::UndefValue::get(getTypes().ConvertType(ASTTy));
@@ -4302,7 +4304,10 @@
     }
   }
 
-  GV->setInitializer(Init);
+  // HIP managed variables need to be emitted as declarations in device
+  // compilation.
+  if (!IsHIPManagedVarOnDevice)
+    GV->setInitializer(Init);
   if (emitter)
     emitter->finalize(GV);
 
Index: clang/lib/CodeGen/CGCUDANV.cpp
===================================================================
--- clang/lib/CodeGen/CGCUDANV.cpp
+++ clang/lib/CodeGen/CGCUDANV.cpp
@@ -546,6 +546,8 @@
             /*Init=*/llvm::ConstantPointerNull::get(Var->getType()),
             Twine(Var->getName() + ".managed"), /*InsertBefore=*/nullptr,
             llvm::GlobalVariable::NotThreadLocal);
+        ManagedVar->setDSOLocal(Var->isDSOLocal());
+        ManagedVar->setVisibility(Var->getVisibility());
         replaceManagedVar(Var, ManagedVar);
         llvm::Value *Args[] = {
             &GpuBinaryHandlePtr,
@@ -932,11 +934,16 @@
 
 void CGNVCUDARuntime::internalizeDeviceSideVar(
     const VarDecl *D, llvm::GlobalValue::LinkageTypes &Linkage) {
-  // Host-side shadows of external declarations of device-side
-  // global variables become internal definitions. These have to
-  // be internal in order to prevent name conflicts with global
-  // host variables with the same name in a different TUs.
+  // For -fno-gpu-rdc, host-side shadows of external declarations of device-side
+  // global variables become internal definitions. These have to be internal in
+  // order to prevent name conflicts with global host variables with the same
+  // name in a different TUs.
   //
+  // For -fgpu-rdc, the shadow variables should not be internalized because
+  // they may be accessed by different TU.
+  if (CGM.getLangOpts().GPURelocatableDeviceCode)
+    return;
+
   // __shared__ variables are odd. Shadows do get created, but
   // they are not registered with the CUDA runtime, so they
   // can't really be used to access their device-side
Index: clang/lib/AST/ASTContext.cpp
===================================================================
--- clang/lib/AST/ASTContext.cpp
+++ clang/lib/AST/ASTContext.cpp
@@ -11435,16 +11435,22 @@
 }
 
 bool ASTContext::mayExternalizeStaticVar(const Decl *D) const {
-  return !getLangOpts().GPURelocatableDeviceCode &&
-         ((D->hasAttr<CUDADeviceAttr>() &&
-           !D->getAttr<CUDADeviceAttr>()->isImplicit()) ||
-          (D->hasAttr<CUDAConstantAttr>() &&
-           !D->getAttr<CUDAConstantAttr>()->isImplicit())) &&
-         isa<VarDecl>(D) && cast<VarDecl>(D)->isFileVarDecl() &&
-         cast<VarDecl>(D)->getStorageClass() == SC_Static;
+  bool IsStaticVar =
+      isa<VarDecl>(D) && cast<VarDecl>(D)->getStorageClass() == SC_Static;
+  bool IsExplicitDeviceVar = (D->hasAttr<CUDADeviceAttr>() &&
+                              !D->getAttr<CUDADeviceAttr>()->isImplicit()) ||
+                             (D->hasAttr<CUDAConstantAttr>() &&
+                              !D->getAttr<CUDAConstantAttr>()->isImplicit());
+  // CUDA/HIP: static managed variables need to be externalized since it is
+  // a declaration in IR, therefore cannot have internal linkage.
+  // ToDo: externalize static variables for -fgpu-rdc.
+  return IsStaticVar &&
+         (D->hasAttr<HIPManagedAttr>() ||
+          (!getLangOpts().GPURelocatableDeviceCode && IsExplicitDeviceVar));
 }
 
 bool ASTContext::shouldExternalizeStaticVar(const Decl *D) const {
   return mayExternalizeStaticVar(D) &&
-         CUDAStaticDeviceVarReferencedByHost.count(cast<VarDecl>(D));
+         (D->hasAttr<HIPManagedAttr>() ||
+          CUDAStaticDeviceVarReferencedByHost.count(cast<VarDecl>(D)));
 }
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to