https://github.com/Acthinks created https://github.com/llvm/llvm-project/pull/149716
Closes #147373 >From f941f3065638f8c7c25fc1eeca1883cb9dfdb3cf Mon Sep 17 00:00:00 2001 From: Acthinks <yang...@mail.ustc.edu.cn> Date: Fri, 11 Jul 2025 15:58:09 +0800 Subject: [PATCH] [clang][cuda] support __managed__ variables Closes #147373 --- clang/include/clang/Basic/Attr.td | 2 +- clang/lib/CodeGen/CGCUDANV.cpp | 56 ++++++++-- clang/lib/CodeGen/Targets/NVPTX.cpp | 6 +- clang/test/CodeGenCUDA/Inputs/cuda.h | 2 - clang/test/CodeGenCUDA/anon-ns.cu | 12 +- clang/test/CodeGenCUDA/device-var-linkage.cu | 74 +++++++------ clang/test/CodeGenCUDA/managed-var.cu | 103 +++++++++++------- clang/test/CodeGenCUDA/offloading-entries.cu | 102 ++++++++--------- clang/test/Driver/linker-wrapper-image.c | 15 ++- .../Frontend/Offloading/OffloadWrapper.cpp | 52 +++++++-- 10 files changed, 269 insertions(+), 155 deletions(-) diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index 224cb6a32af28..9ecdf2322ab64 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -1562,7 +1562,7 @@ def CUDAGridConstant : InheritableAttr { def HIPManaged : InheritableAttr { let Spellings = [GNU<"managed">, Declspec<"__managed__">]; let Subjects = SubjectList<[Var]>; - let LangOpts = [HIP]; + let LangOpts = [HIP, CUDA]; let Documentation = [HIPManagedAttrDocs]; } diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp index dd26be74e561b..2a71b90a808d1 100644 --- a/clang/lib/CodeGen/CGCUDANV.cpp +++ b/clang/lib/CodeGen/CGCUDANV.cpp @@ -42,7 +42,7 @@ class CGNVCUDARuntime : public CGCUDARuntime { StringRef Prefix; private: - llvm::IntegerType *IntTy, *SizeTy; + llvm::IntegerType *IntTy, *SizeTy, *CharTy; llvm::Type *VoidTy; llvm::PointerType *PtrTy; @@ -231,6 +231,7 @@ CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM) SizeTy = CGM.SizeTy; VoidTy = CGM.VoidTy; PtrTy = CGM.UnqualPtrTy; + CharTy = CGM.CharTy; if (CGM.getLangOpts().OffloadViaLLVM) Prefix = "llvm"; @@ -547,10 +548,11 @@ void 
CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF, } // Replace the original variable Var with the address loaded from variable -// ManagedVar populated by HIP runtime. +// ManagedVar populated by HIP/CUDA runtime. static void replaceManagedVar(llvm::GlobalVariable *Var, llvm::GlobalVariable *ManagedVar) { SmallVector<SmallVector<llvm::User *, 8>, 8> WorkList; + for (auto &&VarUse : Var->uses()) { WorkList.push_back({VarUse.getUser()}); } @@ -661,8 +663,15 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() { addUnderscoredPrefixToName("RegisterVar")); // void __hipRegisterManagedVar(void **, char *, char *, const char *, // size_t, unsigned) - llvm::Type *RegisterManagedVarParams[] = {PtrTy, PtrTy, PtrTy, - PtrTy, VarSizeTy, IntTy}; + // void __cudaRegisterManagedVar(void **, void **, char *, const char *, + // int, size_t, int, int) + SmallVector<llvm::Type *, 8> RegisterManagedVarParams; + if (CGM.getLangOpts().HIP) + RegisterManagedVarParams = {PtrTy, PtrTy, PtrTy, PtrTy, VarSizeTy, IntTy}; + else + RegisterManagedVarParams = {PtrTy, PtrTy, PtrTy, PtrTy, + IntTy, VarSizeTy, IntTy, IntTy}; + llvm::FunctionCallee RegisterManagedVar = CGM.CreateRuntimeFunction( llvm::FunctionType::get(VoidTy, RegisterManagedVarParams, false), addUnderscoredPrefixToName("RegisterManagedVar")); @@ -693,13 +702,23 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() { "HIP managed variables not transformed"); auto *ManagedVar = CGM.getModule().getNamedGlobal( Var->getName().drop_back(StringRef(".managed").size())); - llvm::Value *Args[] = { - &GpuBinaryHandlePtr, - ManagedVar, - Var, - VarName, - llvm::ConstantInt::get(VarSizeTy, VarSize), - llvm::ConstantInt::get(IntTy, Var->getAlignment())}; + SmallVector<llvm::Value *, 8> Args; + if (CGM.getLangOpts().HIP) + Args = {&GpuBinaryHandlePtr, + ManagedVar, + Var, + VarName, + llvm::ConstantInt::get(VarSizeTy, VarSize), + llvm::ConstantInt::get(IntTy, Var->getAlignment())}; + else + Args = 
{&GpuBinaryHandlePtr, + ManagedVar, + VarName, + VarName, + llvm::ConstantInt::get(IntTy, Info.Flags.isExtern()), + llvm::ConstantInt::get(VarSizeTy, VarSize), + llvm::ConstantInt::get(IntTy, Info.Flags.isConstant()), + llvm::ConstantInt::get(IntTy, 0)}; if (!Var->isDeclaration()) Builder.CreateCall(RegisterManagedVar, Args); } else { @@ -965,6 +984,18 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() { "__cudaRegisterFatBinaryEnd"); CtorBuilder.CreateCall(RegisterFatbinEndFunc, RegisterFatbinCall); } + // Call __cudaInitModule(GpuBinaryHandle) for managed variables + for (auto &&Info : DeviceVars) { + llvm::GlobalVariable *Var = Info.Var; + if (!Var->isDeclaration() && Info.Flags.isManaged()) { + llvm::FunctionCallee NvInitManagedRtWithModule = + CGM.CreateRuntimeFunction( + llvm::FunctionType::get(CharTy, PtrTy, false), + "__cudaInitModule"); + CtorBuilder.CreateCall(NvInitManagedRtWithModule, GpuBinaryHandle); + break; + } + } } else { // Generate a unique module ID. SmallString<64> ModuleID; @@ -1158,6 +1189,9 @@ void CGNVCUDARuntime::handleVarRegistration(const VarDecl *D, // transformed managed variable. The transformed managed variable contains // the address of managed memory which will be allocated by the runtime. 
void CGNVCUDARuntime::transformManagedVars() { + // CUDA managed variables are accessed directly in device code + if (!CGM.getLangOpts().HIP && CGM.getLangOpts().CUDAIsDevice) + return; for (auto &&Info : DeviceVars) { llvm::GlobalVariable *Var = Info.Var; if (Info.Flags.getKind() == DeviceVarFlags::Variable && diff --git a/clang/lib/CodeGen/Targets/NVPTX.cpp b/clang/lib/CodeGen/Targets/NVPTX.cpp index 82bdfe2666b52..1c36e200f2ce5 100644 --- a/clang/lib/CodeGen/Targets/NVPTX.cpp +++ b/clang/lib/CodeGen/Targets/NVPTX.cpp @@ -241,8 +241,8 @@ RValue NVPTXABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr, void NVPTXTargetCodeGenInfo::setTargetAttributes( const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const { - if (GV->isDeclaration()) - return; + // if (GV->isDeclaration()) + // return; const VarDecl *VD = dyn_cast_or_null<VarDecl>(D); if (VD) { if (M.getLangOpts().CUDA) { @@ -250,6 +250,8 @@ void NVPTXTargetCodeGenInfo::setTargetAttributes( addNVVMMetadata(GV, "surface", 1); else if (VD->getType()->isCUDADeviceBuiltinTextureType()) addNVVMMetadata(GV, "texture", 1); + else if (VD->hasAttr<HIPManagedAttr>()) + addNVVMMetadata(GV, "managed", 1); return; } } diff --git a/clang/test/CodeGenCUDA/Inputs/cuda.h b/clang/test/CodeGenCUDA/Inputs/cuda.h index dc85eae0c5178..4630060852d21 100644 --- a/clang/test/CodeGenCUDA/Inputs/cuda.h +++ b/clang/test/CodeGenCUDA/Inputs/cuda.h @@ -8,9 +8,7 @@ #define __global__ __attribute__((global)) #define __host__ __attribute__((host)) #define __shared__ __attribute__((shared)) -#if __HIP__ #define __managed__ __attribute__((managed)) -#endif #define __launch_bounds__(...) 
__attribute__((launch_bounds(__VA_ARGS__))) #define __grid_constant__ __attribute__((grid_constant)) #else diff --git a/clang/test/CodeGenCUDA/anon-ns.cu b/clang/test/CodeGenCUDA/anon-ns.cu index d931f31d0207c..d7398ab71502a 100644 --- a/clang/test/CodeGenCUDA/anon-ns.cu +++ b/clang/test/CodeGenCUDA/anon-ns.cu @@ -34,26 +34,26 @@ // CUDA-DAG: define weak_odr {{.*}}void @[[KERN:_ZN12_GLOBAL__N_16kernelEv__intern__b04fd23c98500190]]( // CUDA-DAG: define weak_odr {{.*}}void @[[KTX:_Z2ktIN12_GLOBAL__N_11XEEvT___intern__b04fd23c98500190]]( // CUDA-DAG: define weak_odr {{.*}}void @[[KTL:_Z2ktIN12_GLOBAL__N_1UlvE_EEvT___intern__b04fd23c98500190]]( +// CUDA-DAG: @[[VM:_ZN12_GLOBAL__N_12vmE__static__b04fd23c98500190]] = addrspace(1) externally_initialized global // CUDA-DAG: @[[VC:_ZN12_GLOBAL__N_12vcE__static__b04fd23c98500190]] = addrspace(4) externally_initialized constant // CUDA-DAG: @[[VT:_Z2vtIN12_GLOBAL__N_11XEE__static__b04fd23c98500190]] = addrspace(1) externally_initialized global // COMMON-DAG: @_ZN12_GLOBAL__N_12vdE = internal addrspace(1) global // COMNEG-NOT: @{{.*}} = {{.*}} c"_ZN12_GLOBAL__N_12vdE{{.*}}\00" -// HIP-DAG: @llvm.compiler.used = {{.*}}@[[VM]]{{.*}}@[[VT]]{{.*}}@[[VC]] -// CUDA-DAG: @llvm.compiler.used = {{.*}}@[[VT]]{{.*}}@[[VC]] +// COMMON-DAG: @llvm.compiler.used = {{.*}}@[[VM]]{{.*}}@[[VT]]{{.*}}@[[VC]] // COMMON-DAG: @[[KERNSTR:.*]] = {{.*}} c"[[KERN]]\00" // COMMON-DAG: @[[KTXSTR:.*]] = {{.*}} c"[[KTX]]\00" // COMMON-DAG: @[[KTLSTR:.*]] = {{.*}} c"[[KTL]]\00" -// HIP-DAG: @[[VMSTR:.*]] = {{.*}} c"[[VM]]\00" +// COMMON-DAG: @[[VMSTR:.*]] = {{.*}} c"[[VM]]\00" // COMMON-DAG: @[[VCSTR:.*]] = {{.*}} c"[[VC]]\00" // COMMON-DAG: @[[VTSTR:.*]] = {{.*}} c"[[VT]]\00" // COMMON-DAG: call i32 @__{{.*}}RegisterFunction({{.*}}@[[KERNSTR]] // COMMON-DAG: call i32 @__{{.*}}RegisterFunction({{.*}}@[[KTXSTR]] // COMMON-DAG: call i32 @__{{.*}}RegisterFunction({{.*}}@[[KTLSTR]] -// HIP-DAG: call void @__{{.*}}RegisterManagedVar({{.*}}@[[VMSTR]] +// 
COMMON-DAG: call void @__{{.*}}RegisterManagedVar({{.*}}@[[VMSTR]] // COMMON-DAG: call void @__{{.*}}RegisterVar({{.*}}@[[VCSTR]] // COMMON-DAG: call void @__{{.*}}RegisterVar({{.*}}@[[VTSTR]] @@ -67,9 +67,7 @@ namespace { struct X {}; X x; auto lambda = [](){}; -#if __HIP__ __managed__ int vm = 1; -#endif __constant__ int vc = 2; // C should not be externalized since it is used by device code only. @@ -89,9 +87,7 @@ void test() { // A, B, and tempVar<X> should be externalized since they are // used by host code. -#if __HIP__ getSymbol(&vm); -#endif getSymbol(&vc); getSymbol(&vt<X>); } diff --git a/clang/test/CodeGenCUDA/device-var-linkage.cu b/clang/test/CodeGenCUDA/device-var-linkage.cu index 4c57323d85f9d..1acd5cd993b31 100644 --- a/clang/test/CodeGenCUDA/device-var-linkage.cu +++ b/clang/test/CodeGenCUDA/device-var-linkage.cu @@ -1,18 +1,29 @@ // RUN: %clang_cc1 -triple amdgcn -fcuda-is-device \ // RUN: -emit-llvm -o - -x hip %s \ -// RUN: | FileCheck -check-prefixes=DEV,NORDC %s +// RUN: | FileCheck -check-prefixes=DEV,HIP-D,NORDC,HIP-NORDC %s // RUN: %clang_cc1 -triple amdgcn -fcuda-is-device \ // RUN: -fgpu-rdc -cuid=abc -emit-llvm -o - -x hip %s \ -// RUN: | FileCheck -check-prefixes=DEV,RDC %s +// RUN: | FileCheck -check-prefixes=DEV,HIP-D %s // RUN: %clang_cc1 -triple x86_64-unknown-gnu-linux \ // RUN: -emit-llvm -o - -x hip %s \ -// RUN: | FileCheck -check-prefixes=HOST,NORDC-H %s +// RUN: | FileCheck -check-prefixes=HOST,HIP-H,NORDC-H %s // RUN: %clang_cc1 -triple x86_64-unknown-gnu-linux \ // RUN: -fgpu-rdc -cuid=abc -emit-llvm -o - -x hip %s \ -// RUN: | FileCheck -check-prefixes=HOST,RDC-H %s +// RUN: | FileCheck -check-prefixes=HOST,HIP-H,RDC-H %s + // RUN: %clang_cc1 -triple nvptx -fcuda-is-device \ -// RUN: -fgpu-rdc -cuid=abc -emit-llvm -o - %s \ -// RUN: | FileCheck -check-prefixes=CUDA %s +// RUN: -emit-llvm -o - -x cuda %s \ +// RUN: | FileCheck -check-prefixes=DEV,CUDA-D,NORDC,CUDA-NORDC %s +// RUN: %clang_cc1 -triple nvptx -fcuda-is-device \ 
+// RUN: -fgpu-rdc -cuid=abc -emit-llvm -o - -x cuda %s \ +// RUN: | FileCheck -check-prefixes=DEV,CUDA-D %s + +// RUN: %clang_cc1 -triple x86_64-unknown-gnu-linux \ +// RUN: -emit-llvm -o - -x cuda %s \ +// RUN: | FileCheck -check-prefixes=HOST,NORDC-H %s +// RUN: %clang_cc1 -triple x86_64-unknown-gnu-linux \ +// RUN: -fgpu-rdc -cuid=abc -emit-llvm -o - -x cuda %s \ +// RUN: | FileCheck -check-prefixes=HOST,RDC-H %s #include "Inputs/cuda.h" @@ -24,12 +35,11 @@ __device__ int v1; // NORDC-H-DAG: @v2 = internal global i32 undef // RDC-H-DAG: @v2 = global i32 undef __constant__ int v2; -// DEV-DAG: @v3 = addrspace(1) externally_initialized global ptr addrspace(1) null +// HIP-D-DAG: @v3 = addrspace(1) externally_initialized global ptr addrspace(1) null +// CUDA-D-DAG: @v3 = addrspace(1) externally_initialized global i32 0, align 4 // NORDC-H-DAG: @v3 = internal externally_initialized global ptr null // RDC-H-DAG: @v3 = externally_initialized global ptr null -#if __HIP__ __managed__ int v3; -#endif // DEV-DAG: @ev1 = external addrspace(1) global i32 // HOST-DAG: @ev1 = external global i32 @@ -37,45 +47,41 @@ extern __device__ int ev1; // DEV-DAG: @ev2 = external addrspace(4) global i32 // HOST-DAG: @ev2 = external global i32 extern __constant__ int ev2; -// DEV-DAG: @ev3 = external addrspace(1) externally_initialized global ptr addrspace(1) +// HIP-D-DAG: @ev3 = external addrspace(1) externally_initialized global ptr addrspace(1) +// CUDA-D-DAG: @ev3 = external addrspace(1) global i32, align 4 // HOST-DAG: @ev3 = external externally_initialized global ptr -#if __HIP__ extern __managed__ int ev3; -#endif // NORDC-DAG: @_ZL3sv1 = addrspace(1) externally_initialized global i32 0 -// RDC-DAG: @_ZL3sv1.static.[[HASH:.*]] = addrspace(1) externally_initialized global i32 0 +// HIP-RDC-DAG: @_ZL3sv1.static.[[HASH:.*]] = addrspace(1) externally_initialized global i32 0 +// CUDA-RDC-DAG: @_ZL3sv1__static__[[HASH:.*]] = addrspace(1) externally_initialized global i32 0 // 
HOST-DAG: @_ZL3sv1 = internal global i32 undef -// CUDA-DAG: @_ZL3sv1__static__[[HASH:.*]] = addrspace(1) externally_initialized global i32 0 static __device__ int sv1; // NORDC-DAG: @_ZL3sv2 = addrspace(4) externally_initialized constant i32 0 -// RDC-DAG: @_ZL3sv2.static.[[HASH]] = addrspace(4) externally_initialized constant i32 0 +// HIP-RDC-DAG: @_ZL3sv2.static.[[HASH]] = addrspace(4) externally_initialized constant i32 0 +// CUDA-RDC-DAG: @_ZL3sv2__static__[[HASH]] = addrspace(4) externally_initialized constant i32 0 // HOST-DAG: @_ZL3sv2 = internal global i32 undef -// CUDA-DAG: @_ZL3sv2__static__[[HASH]] = addrspace(4) externally_initialized constant i32 0 static __constant__ int sv2; -// NORDC-DAG: @_ZL3sv3 = addrspace(1) externally_initialized global ptr addrspace(1) null -// RDC-DAG: @_ZL3sv3.static.[[HASH]] = addrspace(1) externally_initialized global ptr addrspace(1) null +// HIP-NORDC-DAG: @_ZL3sv3 = addrspace(1) externally_initialized global ptr addrspace(1) null +// CUDA-NORDC-DAG: @_ZL3sv3 = addrspace(1) externally_initialized global i32 0, align 4 +// HIP-RDC-DAG: @_ZL3sv3.static.[[HASH]] = addrspace(1) externally_initialized global ptr addrspace(1) null +// CUDA-RDC-DAG: @_ZL3sv3__static__[[HASH]] = addrspace(1) externally_initialized global i32 0, align 4 // HOST-DAG: @_ZL3sv3 = internal externally_initialized global ptr null -#if __HIP__ static __managed__ int sv3; -#endif __device__ __host__ int work(int *x); __device__ __host__ int fun1() { - return work(&ev1) + work(&ev2) + work(&sv1) + work(&sv2) -#if __HIP__ - + work(&ev3) + work(&sv3) -#endif - ; + return work(&ev1) + work(&ev2) + work(&sv1) + work(&sv2) + + work(&ev3) + work(&sv3); } -// HOST: hipRegisterVar({{.*}}@v1 -// HOST: hipRegisterVar({{.*}}@v2 -// HOST: hipRegisterManagedVar({{.*}}@v3 -// HOST-NOT: hipRegisterVar({{.*}}@ev1 -// HOST-NOT: hipRegisterVar({{.*}}@ev2 -// HOST-NOT: hipRegisterManagedVar({{.*}}@ev3 -// HOST: hipRegisterVar({{.*}}@_ZL3sv1 -// HOST: 
hipRegisterVar({{.*}}@_ZL3sv2 -// HOST: hipRegisterManagedVar({{.*}}@_ZL3sv3 +// HIP-H: hipRegisterVar({{.*}}@v1 +// HIP-H: hipRegisterVar({{.*}}@v2 +// HIP-H: hipRegisterManagedVar({{.*}}@v3 +// HIP-H-NOT: hipRegisterVar({{.*}}@ev1 +// HIP-H-NOT: hipRegisterVar({{.*}}@ev2 +// HIP-H-NOT: hipRegisterManagedVar({{.*}}@ev3 +// HIP-H: hipRegisterVar({{.*}}@_ZL3sv1 +// HIP-H: hipRegisterVar({{.*}}@_ZL3sv2 +// HIP-H: hipRegisterManagedVar({{.*}}@_ZL3sv3 diff --git a/clang/test/CodeGenCUDA/managed-var.cu b/clang/test/CodeGenCUDA/managed-var.cu index 0e7a7be85ac8e..def9ede515280 100644 --- a/clang/test/CodeGenCUDA/managed-var.cu +++ b/clang/test/CodeGenCUDA/managed-var.cu @@ -1,61 +1,84 @@ // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -std=c++11 \ // RUN: -emit-llvm -o - -x hip %s | FileCheck \ -// RUN: -check-prefixes=COMMON,DEV,NORDC-D %s +// RUN: -check-prefixes=COMMON,DEV,HIP-D,HIP-NORDC-D %s // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -std=c++11 \ // RUN: -emit-llvm -fgpu-rdc -cuid=abc -o - -x hip %s > %t.dev -// RUN: cat %t.dev | FileCheck -check-prefixes=COMMON,DEV,RDC-D %s +// RUN: cat %t.dev | FileCheck -check-prefixes=COMMON,DEV,HIP-D,HIP-RDC-D %s // RUN: %clang_cc1 -triple x86_64-gnu-linux -std=c++11 \ // RUN: -emit-llvm -o - -x hip %s | FileCheck \ -// RUN: -check-prefixes=COMMON,HOST,NORDC %s +// RUN: -check-prefixes=COMMON,HOST,HIP-H,NORDC,HIP-NORDC %s // RUN: %clang_cc1 -triple x86_64-gnu-linux -std=c++11 \ // RUN: -emit-llvm -fgpu-rdc -cuid=abc -o - -x hip %s > %t.host -// RUN: cat %t.host | FileCheck -check-prefixes=COMMON,HOST,RDC %s +// RUN: cat %t.host | FileCheck -check-prefixes=COMMON,HOST,HIP-H,RDC,HIP-RDC %s // Check device and host compilation use the same postfix for static // variable name. 
// RUN: cat %t.dev %t.host | FileCheck -check-prefix=POSTFIX %s +// RUN: %clang_cc1 -triple nvptx64 -fcuda-is-device -std=c++11 \ +// RUN: -emit-llvm -o - -x cuda %s | FileCheck \ +// RUN: -check-prefixes=COMMON,DEV,CUDA-D,CUDA-NORDC-D %s + +// RUN: %clang_cc1 -triple nvptx64 -fcuda-is-device -std=c++11 \ +// RUN: -emit-llvm -fgpu-rdc -cuid=abc -o - -x cuda %s > %t.dev +// RUN: cat %t.dev | FileCheck -check-prefixes=COMMON,DEV,CUDA-D,CUDA-RDC-D %s + +// RUN: %clang_cc1 -triple x86_64-gnu-linux -std=c++11 \ +// RUN: -emit-llvm -o - -x cuda %s | FileCheck \ +// RUN: -check-prefixes=COMMON,HOST,NORDC %s + +// RUN: %clang_cc1 -triple x86_64-gnu-linux -std=c++11 \ +// RUN: -emit-llvm -fgpu-rdc -cuid=abc -o - -x cuda %s > %t.host +// RUN: cat %t.host | FileCheck -check-prefixes=COMMON,HOST,RDC %s + #include "Inputs/cuda.h" struct vec { float x,y,z; }; -// DEV-DAG: @x.managed = addrspace(1) externally_initialized global i32 1, align 4 -// DEV-DAG: @x = addrspace(1) externally_initialized global ptr addrspace(1) null +// HIP-D-DAG: @x.managed = addrspace(1) externally_initialized global i32 1, align 4 +// HIP-D-DAG: @x = addrspace(1) externally_initialized global ptr addrspace(1) null +// CUDA-D-DAG: @x = addrspace(1) externally_initialized global i32 1, align 4 // NORDC-DAG: @x.managed = internal global i32 1 // RDC-DAG: @x.managed = global i32 1 // NORDC-DAG: @x = internal externally_initialized global ptr null // RDC-DAG: @x = externally_initialized global ptr null -// HOST-DAG: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"x\00" +// HIP-H-DAG: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"x\00" __managed__ int x = 1; -// DEV-DAG: @v.managed = addrspace(1) externally_initialized global [100 x %struct.vec] zeroinitializer, align 4 -// DEV-DAG: @v = addrspace(1) externally_initialized global ptr addrspace(1) null +// HIP-D-DAG: @v.managed = addrspace(1) externally_initialized global [100 x %struct.vec] zeroinitializer, align 4 +// HIP-D-DAG: @v = addrspace(1) externally_initialized global ptr 
addrspace(1) null +// CUDA-D-DAG: @v = addrspace(1) externally_initialized global [100 x %struct.vec] zeroinitializer, align 4 __managed__ vec v[100]; -// DEV-DAG: @v2.managed = addrspace(1) externally_initialized global <{ %struct.vec, [99 x %struct.vec] }> <{ %struct.vec { float 1.000000e+00, float 1.000000e+00, float 1.000000e+00 }, [99 x %struct.vec] zeroinitializer }>, align 4 -// DEV-DAG: @v2 = addrspace(1) externally_initialized global ptr addrspace(1) null +// HIP-D-DAG: @v2.managed = addrspace(1) externally_initialized global <{ %struct.vec, [99 x %struct.vec] }> <{ %struct.vec { float 1.000000e+00, float 1.000000e+00, float 1.000000e+00 }, [99 x %struct.vec] zeroinitializer }>, align 4 +// HIP-D-DAG: @v2 = addrspace(1) externally_initialized global ptr addrspace(1) null +// CUDA-D-DAG: @v2 = addrspace(1) externally_initialized global <{ %struct.vec, [99 x %struct.vec] }> <{ %struct.vec { float 1.000000e+00, float 1.000000e+00, float 1.000000e+00 }, [99 x %struct.vec] zeroinitializer }>, align 4 __managed__ vec v2[100] = {{1, 1, 1}}; -// DEV-DAG: @ex.managed = external addrspace(1) global i32, align 4 -// DEV-DAG: @ex = external addrspace(1) externally_initialized global ptr addrspace(1) +// HIP-D-DAG: @ex.managed = external addrspace(1) global i32, align 4 +// HIP-D-DAG: @ex = external addrspace(1) externally_initialized global ptr addrspace(1) +// CUDA-D-DAG: @ex = external addrspace(1) global i32, align 4 // HOST-DAG: @ex.managed = external global i32 // HOST-DAG: @ex = external externally_initialized global ptr extern __managed__ int ex; -// NORDC-D-DAG: @_ZL2sx.managed = addrspace(1) externally_initialized global i32 1, align 4 -// NORDC-D-DAG: @_ZL2sx = addrspace(1) externally_initialized global ptr addrspace(1) null -// RDC-D-DAG: @_ZL2sx.static.[[HASH:.*]].managed = addrspace(1) externally_initialized global i32 1, align 4 -// RDC-D-DAG: @_ZL2sx.static.[[HASH]] = addrspace(1) externally_initialized global ptr addrspace(1) null +// HIP-NORDC-D-DAG: 
@_ZL2sx.managed = addrspace(1) externally_initialized global i32 1, align 4 +// HIP-NORDC-D-DAG: @_ZL2sx = addrspace(1) externally_initialized global ptr addrspace(1) null +// CUDA-NORDC-D-DAG: @_ZL2sx = addrspace(1) externally_initialized global i32 1, align 4 +// HIP-RDC-D-DAG: @_ZL2sx.static.[[HASH:.*]].managed = addrspace(1) externally_initialized global i32 1, align 4 +// HIP-RDC-D-DAG: @_ZL2sx.static.[[HASH]] = addrspace(1) externally_initialized global ptr addrspace(1) null +// CUDA-RDC-D-DAG: @_ZL2sx__static__[[HASH:.*]] = addrspace(1) externally_initialized global i32 1, align 4 +// // HOST-DAG: @_ZL2sx.managed = internal global i32 1 // HOST-DAG: @_ZL2sx = internal externally_initialized global ptr null -// NORDC-DAG: @[[DEVNAMESX:[0-9]+]] = {{.*}}c"_ZL2sx\00" -// RDC-DAG: @[[DEVNAMESX:[0-9]+]] = {{.*}}c"_ZL2sx.static.[[HASH:.*]]\00" +// HIP-NORDC-DAG: @[[DEVNAMESX:[0-9]+]] = {{.*}}c"_ZL2sx\00" +// HIP-RDC-DAG: @[[DEVNAMESX:[0-9]+]] = {{.*}}c"_ZL2sx.static.[[HASH:.*]]\00" // POSTFIX: @_ZL2sx.static.[[HASH:.*]] = addrspace(1) externally_initialized global ptr addrspace(1) null // POSTFIX: @[[DEVNAMESX:[0-9]+]] = {{.*}}c"_ZL2sx.static.[[HASH]]\00" @@ -81,10 +104,11 @@ int foo2() { } // COMMON-LABEL: define {{.*}}@_Z4loadv() -// DEV: %ld.managed = load ptr addrspace(1), ptr addrspace(1) @x, align 4 -// DEV: %0 = addrspacecast ptr addrspace(1) %ld.managed to ptr -// DEV: %1 = load i32, ptr %0, align 4 -// DEV: ret i32 %1 +// HIP-D: %ld.managed = load ptr addrspace(1), ptr addrspace(1) @x, align 4 +// HIP-D: %0 = addrspacecast ptr addrspace(1) %ld.managed to ptr +// HIP-D: [[RET:%.*]] = load i32, ptr %0, align 4 +// CUDA-D: [[RET:%.*]] = load i32, ptr addrspacecast (ptr addrspace(1) @x to ptr), align 4 +// DEV: ret i32 [[RET]] // HOST: %ld.managed = load ptr, ptr @x, align 4 // HOST: %0 = load i32, ptr %ld.managed, align 4 // HOST: ret i32 %0 @@ -93,9 +117,10 @@ __device__ __host__ int load() { } // COMMON-LABEL: define {{.*}}@_Z5storev() -// DEV: %ld.managed 
= load ptr addrspace(1), ptr addrspace(1) @x, align 4 -// DEV: %0 = addrspacecast ptr addrspace(1) %ld.managed to ptr -// DEV: store i32 2, ptr %0, align 4 +// HIP-D: %ld.managed = load ptr addrspace(1), ptr addrspace(1) @x, align 4 +// HIP-D: %0 = addrspacecast ptr addrspace(1) %ld.managed to ptr +// HIP-D: store i32 2, ptr %0, align 4 +// CUDA-D: store i32 2, ptr addrspacecast (ptr addrspace(1) @x to ptr), align 4 // HOST: %ld.managed = load ptr, ptr @x, align 4 // HOST: store i32 2, ptr %ld.managed, align 4 __device__ __host__ void store() { @@ -103,10 +128,11 @@ __device__ __host__ void store() { } // COMMON-LABEL: define {{.*}}@_Z10addr_takenv() -// DEV: %0 = addrspacecast ptr addrspace(1) %ld.managed to ptr -// DEV: store ptr %0, ptr %p.ascast, align 8 -// DEV: %1 = load ptr, ptr %p.ascast, align 8 -// DEV: store i32 3, ptr %1, align 4 +// HIP-D: %0 = addrspacecast ptr addrspace(1) %ld.managed to ptr +// HIP-D: store ptr %0, ptr %p.ascast, align 8 +// CUDA-D: store ptr addrspacecast (ptr addrspace(1) @x to ptr), ptr %p, align 8 +// DEV: [[LOAD:%.*]] = load ptr, ptr {{%p.*}}, align 8 +// DEV: store i32 3, ptr [[LOAD]], align 4 // HOST: %ld.managed = load ptr, ptr @x, align 4 // HOST: store ptr %ld.managed, ptr %p, align 8 // HOST: %0 = load ptr, ptr %p, align 8 @@ -152,10 +178,11 @@ float addr_taken2() { } // COMMON-LABEL: define {{.*}}@_Z5load4v() -// DEV: %ld.managed = load ptr addrspace(1), ptr addrspace(1) @ex, align 4 -// DEV: %0 = addrspacecast ptr addrspace(1) %ld.managed to ptr -// DEV: %1 = load i32, ptr %0, align 4 -// DEV: ret i32 %1 +// HIP-D: %ld.managed = load ptr addrspace(1), ptr addrspace(1) @ex, align 4 +// HIP-D: %0 = addrspacecast ptr addrspace(1) %ld.managed to ptr +// HIP-D: [[LOAD:%.*]] = load i32, ptr %0, align 4 +// CUDA-D: [[LOAD:%.*]] = load i32, ptr addrspacecast (ptr addrspace(1) @ex to ptr), align 4 +// DEV: ret i32 [[LOAD]] // HOST: %ld.managed = load ptr, ptr @ex, align 4 // HOST: %0 = load i32, ptr %ld.managed, align 4 // HOST: 
ret i32 %0 @@ -163,7 +190,7 @@ __device__ __host__ int load4() { return ex; } -// HOST-DAG: __hipRegisterManagedVar({{.*}}, ptr @x, ptr @x.managed, ptr @[[DEVNAMEX]], i64 4, i32 4) -// HOST-DAG: __hipRegisterManagedVar({{.*}}, ptr @_ZL2sx, ptr @_ZL2sx.managed, ptr @[[DEVNAMESX]] -// HOST-NOT: __hipRegisterManagedVar({{.*}}, ptr @ex, ptr @ex.managed -// HOST-DAG: declare void @__hipRegisterManagedVar(ptr, ptr, ptr, ptr, i64, i32) +// HIP-H-DAG: __hipRegisterManagedVar({{.*}}, ptr @x, ptr @x.managed, ptr @[[DEVNAMEX]], i64 4, i32 4) +// HIP-H-DAG: __hipRegisterManagedVar({{.*}}, ptr @_ZL2sx, ptr @_ZL2sx.managed, ptr @[[DEVNAMESX]] +// HIP-H-NOT: __hipRegisterManagedVar({{.*}}, ptr @ex, ptr @ex.managed +// HIP-H-DAG: declare void @__hipRegisterManagedVar(ptr, ptr, ptr, ptr, i64, i32) diff --git a/clang/test/CodeGenCUDA/offloading-entries.cu b/clang/test/CodeGenCUDA/offloading-entries.cu index fe03cc83b9d21..84fac7bb29105 100644 --- a/clang/test/CodeGenCUDA/offloading-entries.cu +++ b/clang/test/CodeGenCUDA/offloading-entries.cu @@ -17,63 +17,65 @@ #define __managed__ __attribute__((managed)) //. 
-// CUDA: @managed = global i32 undef, align 4 -// CUDA: @.offloading.entry_name = internal unnamed_addr constant [8 x i8] c"_Z3foov\00", section ".llvm.rodata.offloading" -// CUDA: @.offloading.entry._Z3foov = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 2, i32 0, ptr @_Z18__device_stub__foov, ptr @.offloading.entry_name, i64 0, i64 0, ptr null }, section "llvm_offload_entries" -// CUDA: @.offloading.entry_name.1 = internal unnamed_addr constant [11 x i8] c"_Z6kernelv\00", section ".llvm.rodata.offloading" -// CUDA: @.offloading.entry._Z6kernelv = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 2, i32 0, ptr @_Z21__device_stub__kernelv, ptr @.offloading.entry_name.1, i64 0, i64 0, ptr null }, section "llvm_offload_entries" -// CUDA: @.offloading.entry_name.2 = internal unnamed_addr constant [4 x i8] c"var\00", section ".llvm.rodata.offloading" -// CUDA: @.offloading.entry.var = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 2, i32 0, ptr @var, ptr @.offloading.entry_name.2, i64 4, i64 0, ptr null }, section "llvm_offload_entries" -// CUDA: @.offloading.entry_name.3 = internal unnamed_addr constant [8 x i8] c"managed\00", section ".llvm.rodata.offloading" -// CUDA: @.offloading.entry.managed = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 2, i32 0, ptr @managed, ptr @.offloading.entry_name.3, i64 4, i64 0, ptr null }, section "llvm_offload_entries" -// CUDA: @.offloading.entry_name.4 = internal unnamed_addr constant [5 x i8] c"surf\00", section ".llvm.rodata.offloading" -// CUDA: @.offloading.entry.surf = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 2, i32 2, ptr @surf, ptr @.offloading.entry_name.4, i64 4, i64 1, ptr null }, section "llvm_offload_entries" -// CUDA: @.offloading.entry_name.5 = internal unnamed_addr constant [4 x i8] c"tex\00", section ".llvm.rodata.offloading" -// CUDA: @.offloading.entry.tex = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 2, i32 35, ptr 
@tex, ptr @.offloading.entry_name.5, i64 1, i64 1, ptr null }, section "llvm_offload_entries" +// CUDA: @managed.managed = global i32 0, align 4 +// CUDA: @managed = externally_initialized global ptr null +// CUDA: @.offloading.entry_name = internal unnamed_addr constant [8 x i8] c"_Z3foov\00", section ".llvm.rodata.offloading", align 1 +// CUDA: @.offloading.entry._Z3foov = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 2, i32 0, ptr @_Z18__device_stub__foov, ptr @.offloading.entry_name, i64 0, i64 0, ptr null }, section "llvm_offload_entries", align 8 +// CUDA: @.offloading.entry_name.1 = internal unnamed_addr constant [11 x i8] c"_Z6kernelv\00", section ".llvm.rodata.offloading", align 1 +// CUDA: @.offloading.entry._Z6kernelv = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 2, i32 0, ptr @_Z21__device_stub__kernelv, ptr @.offloading.entry_name.1, i64 0, i64 0, ptr null }, section "llvm_offload_entries", align 8 +// CUDA: @.offloading.entry_name.2 = internal unnamed_addr constant [4 x i8] c"var\00", section ".llvm.rodata.offloading", align 1 +// CUDA: @.offloading.entry.var = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 2, i32 0, ptr @var, ptr @.offloading.entry_name.2, i64 4, i64 0, ptr null }, section "llvm_offload_entries", align 8 +// CUDA: @.offloading.entry_name.3 = internal unnamed_addr constant [8 x i8] c"managed\00", section ".llvm.rodata.offloading", align 1 +// CUDA: @.offloading.entry.managed = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 2, i32 1, ptr @managed.managed, ptr @.offloading.entry_name.3, i64 4, i64 4, ptr @managed }, section "llvm_offload_entries", align 8 +// CUDA: @.offloading.entry_name.4 = internal unnamed_addr constant [5 x i8] c"surf\00", section ".llvm.rodata.offloading", align 1 +// CUDA: @.offloading.entry.surf = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 2, i32 2, ptr @surf, ptr @.offloading.entry_name.4, i64 4, i64 1, ptr null }, section 
"llvm_offload_entries", align 8 +// CUDA: @.offloading.entry_name.5 = internal unnamed_addr constant [4 x i8] c"tex\00", section ".llvm.rodata.offloading", align 1 +// CUDA: @.offloading.entry.tex = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 2, i32 35, ptr @tex, ptr @.offloading.entry_name.5, i64 1, i64 1, ptr null }, section "llvm_offload_entries", align 8 //. // HIP: @managed.managed = global i32 0, align 4 // HIP: @managed = externally_initialized global ptr null -// HIP: @.offloading.entry_name = internal unnamed_addr constant [8 x i8] c"_Z3foov\00", section ".llvm.rodata.offloading" -// HIP: @.offloading.entry._Z3foov = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 4, i32 0, ptr @_Z3foov, ptr @.offloading.entry_name, i64 0, i64 0, ptr null }, section "llvm_offload_entries" -// HIP: @.offloading.entry_name.1 = internal unnamed_addr constant [11 x i8] c"_Z6kernelv\00", section ".llvm.rodata.offloading" -// HIP: @.offloading.entry._Z6kernelv = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 4, i32 0, ptr @_Z6kernelv, ptr @.offloading.entry_name.1, i64 0, i64 0, ptr null }, section "llvm_offload_entries" -// HIP: @.offloading.entry_name.2 = internal unnamed_addr constant [4 x i8] c"var\00", section ".llvm.rodata.offloading" -// HIP: @.offloading.entry.var = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 4, i32 0, ptr @var, ptr @.offloading.entry_name.2, i64 4, i64 0, ptr null }, section "llvm_offload_entries" -// HIP: @.offloading.entry_name.3 = internal unnamed_addr constant [8 x i8] c"managed\00", section ".llvm.rodata.offloading" -// HIP: @.offloading.entry.managed = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 4, i32 1, ptr @managed.managed, ptr @.offloading.entry_name.3, i64 4, i64 4, ptr @managed }, section "llvm_offload_entries" -// HIP: @.offloading.entry_name.4 = internal unnamed_addr constant [5 x i8] c"surf\00", section ".llvm.rodata.offloading" -// HIP: 
@.offloading.entry.surf = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 4, i32 2, ptr @surf, ptr @.offloading.entry_name.4, i64 4, i64 1, ptr null }, section "llvm_offload_entries" -// HIP: @.offloading.entry_name.5 = internal unnamed_addr constant [4 x i8] c"tex\00", section ".llvm.rodata.offloading" -// HIP: @.offloading.entry.tex = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 4, i32 35, ptr @tex, ptr @.offloading.entry_name.5, i64 1, i64 1, ptr null }, section "llvm_offload_entries" +// HIP: @.offloading.entry_name = internal unnamed_addr constant [8 x i8] c"_Z3foov\00", section ".llvm.rodata.offloading", align 1 +// HIP: @.offloading.entry._Z3foov = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 4, i32 0, ptr @_Z3foov, ptr @.offloading.entry_name, i64 0, i64 0, ptr null }, section "llvm_offload_entries", align 8 +// HIP: @.offloading.entry_name.1 = internal unnamed_addr constant [11 x i8] c"_Z6kernelv\00", section ".llvm.rodata.offloading", align 1 +// HIP: @.offloading.entry._Z6kernelv = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 4, i32 0, ptr @_Z6kernelv, ptr @.offloading.entry_name.1, i64 0, i64 0, ptr null }, section "llvm_offload_entries", align 8 +// HIP: @.offloading.entry_name.2 = internal unnamed_addr constant [4 x i8] c"var\00", section ".llvm.rodata.offloading", align 1 +// HIP: @.offloading.entry.var = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 4, i32 0, ptr @var, ptr @.offloading.entry_name.2, i64 4, i64 0, ptr null }, section "llvm_offload_entries", align 8 +// HIP: @.offloading.entry_name.3 = internal unnamed_addr constant [8 x i8] c"managed\00", section ".llvm.rodata.offloading", align 1 +// HIP: @.offloading.entry.managed = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 4, i32 1, ptr @managed.managed, ptr @.offloading.entry_name.3, i64 4, i64 4, ptr @managed }, section "llvm_offload_entries", align 8 +// HIP: @.offloading.entry_name.4 = internal 
unnamed_addr constant [5 x i8] c"surf\00", section ".llvm.rodata.offloading", align 1 +// HIP: @.offloading.entry.surf = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 4, i32 2, ptr @surf, ptr @.offloading.entry_name.4, i64 4, i64 1, ptr null }, section "llvm_offload_entries", align 8 +// HIP: @.offloading.entry_name.5 = internal unnamed_addr constant [4 x i8] c"tex\00", section ".llvm.rodata.offloading", align 1 +// HIP: @.offloading.entry.tex = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 4, i32 35, ptr @tex, ptr @.offloading.entry_name.5, i64 1, i64 1, ptr null }, section "llvm_offload_entries", align 8 //. -// CUDA-COFF: @managed = dso_local global i32 undef, align 4 -// CUDA-COFF: @.offloading.entry_name = internal unnamed_addr constant [8 x i8] c"_Z3foov\00", section ".llvm.rodata.offloading" -// CUDA-COFF: @.offloading.entry._Z3foov = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 2, i32 0, ptr @_Z18__device_stub__foov, ptr @.offloading.entry_name, i64 0, i64 0, ptr null }, section "llvm_offload_entries$OE" -// CUDA-COFF: @.offloading.entry_name.1 = internal unnamed_addr constant [11 x i8] c"_Z6kernelv\00", section ".llvm.rodata.offloading" -// CUDA-COFF: @.offloading.entry._Z6kernelv = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 2, i32 0, ptr @_Z21__device_stub__kernelv, ptr @.offloading.entry_name.1, i64 0, i64 0, ptr null }, section "llvm_offload_entries$OE" -// CUDA-COFF: @.offloading.entry_name.2 = internal unnamed_addr constant [4 x i8] c"var\00", section ".llvm.rodata.offloading" -// CUDA-COFF: @.offloading.entry.var = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 2, i32 0, ptr @var, ptr @.offloading.entry_name.2, i64 4, i64 0, ptr null }, section "llvm_offload_entries$OE" -// CUDA-COFF: @.offloading.entry_name.3 = internal unnamed_addr constant [8 x i8] c"managed\00", section ".llvm.rodata.offloading" -// CUDA-COFF: @.offloading.entry.managed = weak constant 
%struct.__tgt_offload_entry { i64 0, i16 1, i16 2, i32 0, ptr @managed, ptr @.offloading.entry_name.3, i64 4, i64 0, ptr null }, section "llvm_offload_entries$OE" -// CUDA-COFF: @.offloading.entry_name.4 = internal unnamed_addr constant [5 x i8] c"surf\00", section ".llvm.rodata.offloading" -// CUDA-COFF: @.offloading.entry.surf = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 2, i32 2, ptr @surf, ptr @.offloading.entry_name.4, i64 4, i64 1, ptr null }, section "llvm_offload_entries$OE" -// CUDA-COFF: @.offloading.entry_name.5 = internal unnamed_addr constant [4 x i8] c"tex\00", section ".llvm.rodata.offloading" -// CUDA-COFF: @.offloading.entry.tex = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 2, i32 35, ptr @tex, ptr @.offloading.entry_name.5, i64 1, i64 1, ptr null }, section "llvm_offload_entries$OE" +// CUDA-COFF: @managed.managed = dso_local global i32 0, align 4 +// CUDA-COFF: @managed = dso_local externally_initialized global ptr null +// CUDA-COFF: @.offloading.entry_name = internal unnamed_addr constant [8 x i8] c"_Z3foov\00", section ".llvm.rodata.offloading", align 1 +// CUDA-COFF: @.offloading.entry._Z3foov = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 2, i32 0, ptr @_Z18__device_stub__foov, ptr @.offloading.entry_name, i64 0, i64 0, ptr null }, section "llvm_offload_entries$OE", align 8 +// CUDA-COFF: @.offloading.entry_name.1 = internal unnamed_addr constant [11 x i8] c"_Z6kernelv\00", section ".llvm.rodata.offloading", align 1 +// CUDA-COFF: @.offloading.entry._Z6kernelv = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 2, i32 0, ptr @_Z21__device_stub__kernelv, ptr @.offloading.entry_name.1, i64 0, i64 0, ptr null }, section "llvm_offload_entries$OE", align 8 +// CUDA-COFF: @.offloading.entry_name.2 = internal unnamed_addr constant [4 x i8] c"var\00", section ".llvm.rodata.offloading", align 1 +// CUDA-COFF: @.offloading.entry.var = weak constant %struct.__tgt_offload_entry { i64 0, 
i16 1, i16 2, i32 0, ptr @var, ptr @.offloading.entry_name.2, i64 4, i64 0, ptr null }, section "llvm_offload_entries$OE", align 8 +// CUDA-COFF: @.offloading.entry_name.3 = internal unnamed_addr constant [8 x i8] c"managed\00", section ".llvm.rodata.offloading", align 1 +// CUDA-COFF: @.offloading.entry.managed = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 2, i32 1, ptr @managed.managed, ptr @.offloading.entry_name.3, i64 4, i64 4, ptr @managed }, section "llvm_offload_entries$OE", align 8 +// CUDA-COFF: @.offloading.entry_name.4 = internal unnamed_addr constant [5 x i8] c"surf\00", section ".llvm.rodata.offloading", align 1 +// CUDA-COFF: @.offloading.entry.surf = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 2, i32 2, ptr @surf, ptr @.offloading.entry_name.4, i64 4, i64 1, ptr null }, section "llvm_offload_entries$OE", align 8 +// CUDA-COFF: @.offloading.entry_name.5 = internal unnamed_addr constant [4 x i8] c"tex\00", section ".llvm.rodata.offloading", align 1 +// CUDA-COFF: @.offloading.entry.tex = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 2, i32 35, ptr @tex, ptr @.offloading.entry_name.5, i64 1, i64 1, ptr null }, section "llvm_offload_entries$OE", align 8 //. 
// HIP-COFF: @managed.managed = dso_local global i32 0, align 4 // HIP-COFF: @managed = dso_local externally_initialized global ptr null -// HIP-COFF: @.offloading.entry_name = internal unnamed_addr constant [8 x i8] c"_Z3foov\00", section ".llvm.rodata.offloading" -// HIP-COFF: @.offloading.entry._Z3foov = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 4, i32 0, ptr @_Z3foov, ptr @.offloading.entry_name, i64 0, i64 0, ptr null }, section "llvm_offload_entries$OE" -// HIP-COFF: @.offloading.entry_name.1 = internal unnamed_addr constant [11 x i8] c"_Z6kernelv\00", section ".llvm.rodata.offloading" -// HIP-COFF: @.offloading.entry._Z6kernelv = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 4, i32 0, ptr @_Z6kernelv, ptr @.offloading.entry_name.1, i64 0, i64 0, ptr null }, section "llvm_offload_entries$OE" -// HIP-COFF: @.offloading.entry_name.2 = internal unnamed_addr constant [4 x i8] c"var\00", section ".llvm.rodata.offloading" -// HIP-COFF: @.offloading.entry.var = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 4, i32 0, ptr @var, ptr @.offloading.entry_name.2, i64 4, i64 0, ptr null }, section "llvm_offload_entries$OE" -// HIP-COFF: @.offloading.entry_name.3 = internal unnamed_addr constant [8 x i8] c"managed\00", section ".llvm.rodata.offloading" -// HIP-COFF: @.offloading.entry.managed = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 4, i32 1, ptr @managed.managed, ptr @.offloading.entry_name.3, i64 4, i64 4, ptr @managed }, section "llvm_offload_entries$OE" -// HIP-COFF: @.offloading.entry_name.4 = internal unnamed_addr constant [5 x i8] c"surf\00", section ".llvm.rodata.offloading" -// HIP-COFF: @.offloading.entry.surf = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 4, i32 2, ptr @surf, ptr @.offloading.entry_name.4, i64 4, i64 1, ptr null }, section "llvm_offload_entries$OE" -// HIP-COFF: @.offloading.entry_name.5 = internal unnamed_addr constant [4 x i8] c"tex\00", section 
".llvm.rodata.offloading" -// HIP-COFF: @.offloading.entry.tex = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 4, i32 35, ptr @tex, ptr @.offloading.entry_name.5, i64 1, i64 1, ptr null }, section "llvm_offload_entries$OE" +// HIP-COFF: @.offloading.entry_name = internal unnamed_addr constant [8 x i8] c"_Z3foov\00", section ".llvm.rodata.offloading", align 1 +// HIP-COFF: @.offloading.entry._Z3foov = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 4, i32 0, ptr @_Z3foov, ptr @.offloading.entry_name, i64 0, i64 0, ptr null }, section "llvm_offload_entries$OE", align 8 +// HIP-COFF: @.offloading.entry_name.1 = internal unnamed_addr constant [11 x i8] c"_Z6kernelv\00", section ".llvm.rodata.offloading", align 1 +// HIP-COFF: @.offloading.entry._Z6kernelv = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 4, i32 0, ptr @_Z6kernelv, ptr @.offloading.entry_name.1, i64 0, i64 0, ptr null }, section "llvm_offload_entries$OE", align 8 +// HIP-COFF: @.offloading.entry_name.2 = internal unnamed_addr constant [4 x i8] c"var\00", section ".llvm.rodata.offloading", align 1 +// HIP-COFF: @.offloading.entry.var = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 4, i32 0, ptr @var, ptr @.offloading.entry_name.2, i64 4, i64 0, ptr null }, section "llvm_offload_entries$OE", align 8 +// HIP-COFF: @.offloading.entry_name.3 = internal unnamed_addr constant [8 x i8] c"managed\00", section ".llvm.rodata.offloading", align 1 +// HIP-COFF: @.offloading.entry.managed = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 4, i32 1, ptr @managed.managed, ptr @.offloading.entry_name.3, i64 4, i64 4, ptr @managed }, section "llvm_offload_entries$OE", align 8 +// HIP-COFF: @.offloading.entry_name.4 = internal unnamed_addr constant [5 x i8] c"surf\00", section ".llvm.rodata.offloading", align 1 +// HIP-COFF: @.offloading.entry.surf = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 4, i32 2, ptr @surf, ptr 
@.offloading.entry_name.4, i64 4, i64 1, ptr null }, section "llvm_offload_entries$OE", align 8 +// HIP-COFF: @.offloading.entry_name.5 = internal unnamed_addr constant [4 x i8] c"tex\00", section ".llvm.rodata.offloading", align 1 +// HIP-COFF: @.offloading.entry.tex = weak constant %struct.__tgt_offload_entry { i64 0, i16 1, i16 4, i32 35, ptr @tex, ptr @.offloading.entry_name.5, i64 1, i64 1, ptr null }, section "llvm_offload_entries$OE", align 8 //. // CUDA-LABEL: @_Z18__device_stub__foov( // CUDA-NEXT: entry: diff --git a/clang/test/Driver/linker-wrapper-image.c b/clang/test/Driver/linker-wrapper-image.c index c0de56d58196a..cfd3027e33b15 100644 --- a/clang/test/Driver/linker-wrapper-image.c +++ b/clang/test/Driver/linker-wrapper-image.c @@ -61,6 +61,7 @@ // CUDA: @.fatbin_image = internal constant [0 x i8] zeroinitializer, section ".nv_fatbin" // CUDA-NEXT: @.fatbin_wrapper = internal constant %fatbin_wrapper { i32 1180844977, i32 1, ptr @.fatbin_image, ptr null }, section ".nvFatBinSegment", align 8 // CUDA-NEXT: @.cuda.binary_handle = internal global ptr null +// CUDA-NEXT: @.nv_inited_managed_rt = internal global i8 0 // CUDA: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 101, ptr @.cuda.fatbin_reg, ptr null }] @@ -70,7 +71,16 @@ // CUDA-NEXT: store ptr %0, ptr @.cuda.binary_handle, align 8 // CUDA-NEXT: call void @.cuda.globals_reg(ptr %0) // CUDA-NEXT: call void @__cudaRegisterFatBinaryEnd(ptr %0) -// CUDA-NEXT: %1 = call i32 @atexit(ptr @.cuda.fatbin_unreg) +// CUDA-NEXT: %1 = load i8, ptr @.nv_inited_managed_rt, align 1 +// CUDA-NEXT: %2 = icmp ne i8 %1, 0 +// CUDA-NEXT: br i1 %2, label %if.then, label %if.end + +// CUDA: if.then: +// CUDA-NEXT: %3 = call i8 @__cudaInitModule(ptr %0) +// CUDA-NEXT: br label %if.end + +// CUDA: if.end: +// CUDA-NEXT: %4 = call i32 @atexit(ptr @.cuda.fatbin_unreg) // CUDA-NEXT: ret void // CUDA-NEXT: } // @@ -134,7 +144,8 @@ // CUDA-NEXT: br label %if.end // // CUDA: sw.managed: 
-// CUDA-NEXT: call void @__cudaRegisterManagedVar(ptr %0, ptr %aux_addr, ptr %addr, ptr %name, i64 %size, i32 %9) +// CUDA-NEXT: call void @__cudaRegisterManagedVar(ptr %0, ptr %aux_addr, ptr %name, ptr %name, i32 %extern, i64 %size, i32 %constant, i32 0) +// CUDA-NEXT: store i8 1, ptr @.nv_inited_managed_rt, align 1 // CUDA-NEXT: br label %if.end // // CUDA: sw.surface: diff --git a/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp b/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp index cfddc06fbc00b..0c9b84f593a16 100644 --- a/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp +++ b/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp @@ -356,11 +356,19 @@ Function *createRegisterGlobalsFunction(Module &M, bool IsHIP, FunctionCallee RegVar = M.getOrInsertFunction( IsHIP ? "__hipRegisterVar" : "__cudaRegisterVar", RegVarTy); - // Get the __cudaRegisterSurface function declaration. + // Get the __cudaRegisterManagedVar function declaration. + SmallVector<llvm::Type *, 8> RegisterManagedVarParams; + if (IsHIP) + RegisterManagedVarParams = {Int8PtrPtrTy, Int8PtrTy, + Int8PtrTy, Int8PtrTy, + getSizeTTy(M), Type::getInt32Ty(C)}; + else + RegisterManagedVarParams = {Int8PtrPtrTy, Int8PtrPtrTy, + Int8PtrTy, Int8PtrTy, + Type::getInt32Ty(C), getSizeTTy(M), + Type::getInt32Ty(C), Type::getInt32Ty(C)}; FunctionType *RegManagedVarTy = - FunctionType::get(Type::getVoidTy(C), - {Int8PtrPtrTy, Int8PtrTy, Int8PtrTy, Int8PtrTy, - getSizeTTy(M), Type::getInt32Ty(C)}, + FunctionType::get(Type::getVoidTy(C), RegisterManagedVarParams, /*isVarArg=*/false); FunctionCallee RegManagedVar = M.getOrInsertFunction( IsHIP ? "__hipRegisterManagedVar" : "__cudaRegisterManagedVar", @@ -498,8 +506,21 @@ Function *createRegisterGlobalsFunction(Module &M, bool IsHIP, // Create managed variable registration code. 
Builder.SetInsertPoint(SwManagedBB); - Builder.CreateCall(RegManagedVar, {RegGlobalsFn->arg_begin(), AuxAddr, Addr, - Name, Size, Data}); + if (IsHIP) + Builder.CreateCall(RegManagedVar, {RegGlobalsFn->arg_begin(), AuxAddr, Addr, + Name, Size, Data}); + else { + llvm::GlobalVariable *NvInitManagedRt = new llvm::GlobalVariable( + M, Type::getInt8Ty(C), /*isConstant=*/false, + GlobalValue::InternalLinkage, ConstantInt::get(Type::getInt8Ty(C), 0), + ".nv_inited_managed_rt"); + Builder.CreateCall(RegManagedVar, + {RegGlobalsFn->arg_begin(), AuxAddr, Name, Name, Extern, + Size, Const, ConstantInt::get(Type::getInt32Ty(C), 0)}); + + Builder.CreateStore(ConstantInt::get(Type::getInt8Ty(C), 1), + NvInitManagedRt); + } Builder.CreateBr(IfEndBB); Switch->addCase(Builder.getInt32(llvm::offloading::OffloadGlobalManagedEntry), SwManagedBB); @@ -602,8 +623,25 @@ void createRegisterFatbinFunction(Module &M, GlobalVariable *FatbinDesc, Suffix, EmitSurfacesAndTextures), Handle); - if (!IsHIP) + if (!IsHIP) { CtorBuilder.CreateCall(RegFatbinEnd, Handle); + auto *IfThenBB = BasicBlock::Create(C, "if.then", CtorFunc); + auto *IfEndBB = BasicBlock::Create(C, "if.end", CtorFunc); + auto *NvInitManagedRt = M.getNamedGlobal(".nv_inited_managed_rt"); + auto *InitManagedRt = + CtorBuilder.CreateLoad(Type::getInt8Ty(C), NvInitManagedRt); + auto *Cmp = CtorBuilder.CreateICmpNE( + InitManagedRt, ConstantInt::get(Type::getInt8Ty(C), 0)); + CtorBuilder.CreateCondBr(Cmp, IfThenBB, IfEndBB); + CtorBuilder.SetInsertPoint(IfThenBB); + // Call __cudaInitModule(GpuBinaryHandle) for managed variables + llvm::FunctionCallee NvInitManagedRtWithModule = M.getOrInsertFunction( + "__cudaInitModule", + llvm::FunctionType::get(Type::getInt8Ty(C), PtrTy, false)); + CtorBuilder.CreateCall(NvInitManagedRtWithModule, Handle); + CtorBuilder.CreateBr(IfEndBB); + CtorBuilder.SetInsertPoint(IfEndBB); + } CtorBuilder.CreateCall(AtExit, DtorFunc); CtorBuilder.CreateRetVoid(); 
_______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits