yaxunl updated this revision to Diff 322213.
yaxunl retitled this revision from "[HIP] Simplify kernel launching" to "[HIP] 
Emit kernel symbol".
yaxunl edited the summary of this revision.
yaxunl added a comment.

Revised by Artem's comments.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D86376/new/

https://reviews.llvm.org/D86376

Files:
  clang/lib/CodeGen/CGCUDANV.cpp
  clang/test/CodeGenCUDA/Inputs/cuda.h
  clang/test/CodeGenCUDA/cxx-call-kernel.cpp
  clang/test/CodeGenCUDA/kernel-dbg-info.cu
  clang/test/CodeGenCUDA/kernel-stub-name.cu
  clang/test/CodeGenCUDA/unnamed-types.cu

Index: clang/test/CodeGenCUDA/unnamed-types.cu
===================================================================
--- clang/test/CodeGenCUDA/unnamed-types.cu
+++ clang/test/CodeGenCUDA/unnamed-types.cu
@@ -54,7 +54,7 @@
               [] __device__ (float x) { return x + 5.f; });
 }
 // HOST: @__hip_register_globals
-// HOST: __hipRegisterFunction{{.*}}@_Z17__device_stub__k0IZZ2f1PfENKUlS0_E_clES0_EUlfE_EvS0_T_{{.*}}@0
-// HOST: __hipRegisterFunction{{.*}}@_Z17__device_stub__k1IZ2f1PfEUlfE_Z2f1S0_EUlffE_Z2f1S0_EUlfE0_EvS0_T_T0_T1_{{.*}}@1
+// HOST: __hipRegisterFunction{{.*}}@_Z2k0IZZ2f1PfENKUlS0_E_clES0_EUlfE_EvS0_T_{{.*}}@0
+// HOST: __hipRegisterFunction{{.*}}@_Z2k1IZ2f1PfEUlfE_Z2f1S0_EUlffE_Z2f1S0_EUlfE0_EvS0_T_T0_T1_{{.*}}@1
 // MSVC: __hipRegisterFunction{{.*}}@"??$k0@V<lambda_1>@?0???R1?0??f1@@YAXPEAM@Z@QEBA@0@Z@@@YAXPEAMV<lambda_1>@?0???R0?0??f1@@YAX0@Z@QEBA@0@Z@@Z{{.*}}@0
 // MSVC: __hipRegisterFunction{{.*}}@"??$k1@V<lambda_2>@?0??f1@@YAXPEAM@Z@V<lambda_3>@?0??2@YAX0@Z@V<lambda_4>@?0??2@YAX0@Z@@@YAXPEAMV<lambda_2>@?0??f1@@YAX0@Z@V<lambda_3>@?0??1@YAX0@Z@V<lambda_4>@?0??1@YAX0@Z@@Z{{.*}}@1
Index: clang/test/CodeGenCUDA/kernel-stub-name.cu
===================================================================
--- clang/test/CodeGenCUDA/kernel-stub-name.cu
+++ clang/test/CodeGenCUDA/kernel-stub-name.cu
@@ -6,6 +6,12 @@
 
 #include "Inputs/cuda.h"
 
+// Kernel handles
+
+// CHECK: @[[HCKERN:ckernel]] = constant i8* null
+// CHECK: @[[HNSKERN:_ZN2ns8nskernelEv]] = constant i8* null
+// CHECK: @[[HTKERN:_Z10kernelfuncIiEvv]] = linkonce_odr constant i8* null
+
 extern "C" __global__ void ckernel() {}
 
 namespace ns {
@@ -26,9 +32,9 @@
 // Non-template kernel stub functions
 
 // CHECK: define{{.*}}@[[CSTUB:__device_stub__ckernel]]
-// CHECK: call{{.*}}@hipLaunchByPtr{{.*}}@[[CSTUB]]
+// CHECK: call{{.*}}@hipLaunchByPtr{{.*}}@[[HCKERN]]
 // CHECK: define{{.*}}@[[NSSTUB:_ZN2ns23__device_stub__nskernelEv]]
-// CHECK: call{{.*}}@hipLaunchByPtr{{.*}}@[[NSSTUB]]
+// CHECK: call{{.*}}@hipLaunchByPtr{{.*}}@[[HNSKERN]]
 
 // CHECK-LABEL: define{{.*}}@_Z8hostfuncv()
 // CHECK: call void @[[CSTUB]]()
@@ -45,11 +51,11 @@
 // Template kernel stub functions
 
 // CHECK: define{{.*}}@[[TSTUB]]
-// CHECK: call{{.*}}@hipLaunchByPtr{{.*}}@[[TSTUB]]
+// CHECK: call{{.*}}@hipLaunchByPtr{{.*}}@[[HTKERN]]
 
 // CHECK: declare{{.*}}@[[DSTUB]]
 
 // CHECK-LABEL: define{{.*}}@__hip_register_globals
-// CHECK: call{{.*}}@__hipRegisterFunction{{.*}}@[[CSTUB]]{{.*}}@[[CKERN]]
-// CHECK: call{{.*}}@__hipRegisterFunction{{.*}}@[[NSSTUB]]{{.*}}@[[NSKERN]]
-// CHECK: call{{.*}}@__hipRegisterFunction{{.*}}@[[TSTUB]]{{.*}}@[[TKERN]]
+// CHECK: call{{.*}}@__hipRegisterFunction{{.*}}@[[HCKERN]]{{.*}}@[[CKERN]]
+// CHECK: call{{.*}}@__hipRegisterFunction{{.*}}@[[HNSKERN]]{{.*}}@[[NSKERN]]
+// CHECK: call{{.*}}@__hipRegisterFunction{{.*}}@[[HTKERN]]{{.*}}@[[TKERN]]
Index: clang/test/CodeGenCUDA/kernel-dbg-info.cu
===================================================================
--- clang/test/CodeGenCUDA/kernel-dbg-info.cu
+++ clang/test/CodeGenCUDA/kernel-dbg-info.cu
@@ -30,6 +30,9 @@
   *a = 1;
 }
 
+// Kernel symbol for launching kernel.
+// CHECK: @[[SYM:ckernel]] = constant i8* null
+
 // Device side kernel names
 // CHECK: @[[CKERN:[0-9]*]] = {{.*}} c"ckernel\00"
 
@@ -40,7 +43,7 @@
 // Make sure there is no !dbg between function attributes and '{'
 // CHECK: define{{.*}} void @[[CSTUB:__device_stub__ckernel]]{{.*}} #{{[0-9]+}} {
 // CHECK-NOT: call {{.*}}@hipLaunchByPtr{{.*}}!dbg
-// CHECK: call {{.*}}@hipLaunchByPtr{{.*}}@[[CSTUB]]
+// CHECK: call {{.*}}@hipLaunchByPtr{{.*}}@[[SYM]]
 // CHECK-NOT: ret {{.*}}!dbg
 
 // CHECK-LABEL: define {{.*}}@_Z8hostfuncPi{{.*}}!dbg
Index: clang/test/CodeGenCUDA/cxx-call-kernel.cpp
===================================================================
--- /dev/null
+++ clang/test/CodeGenCUDA/cxx-call-kernel.cpp
@@ -0,0 +1,19 @@
+// RUN: %clang_cc1 -x hip -emit-llvm-bc %s -o %t.hip.bc
+// RUN: %clang_cc1 -mlink-bitcode-file %t.hip.bc -DHIP_PLATFORM -emit-llvm \
+// RUN:   %s -o - | FileCheck %s
+
+#include "Inputs/cuda.h"
+
+// CHECK: @_Z2g1i = constant i8* null
+#if __HIP__
+__global__ void g1(int x) {}
+#else
+extern void g1(int x);
+
+// CHECK: call i32 @hipLaunchKernel{{.*}}@_Z2g1i
+void test() {
+  hipLaunchKernel((void*)g1, 1, 1, nullptr, 0, 0);
+}
+
+// CHECK: __hipRegisterFunction{{.*}}@_Z2g1i
+#endif
Index: clang/test/CodeGenCUDA/Inputs/cuda.h
===================================================================
--- clang/test/CodeGenCUDA/Inputs/cuda.h
+++ clang/test/CodeGenCUDA/Inputs/cuda.h
@@ -2,6 +2,7 @@
 
 #include <stddef.h>
 
+#if __HIP__ || __CUDA__
 #define __constant__ __attribute__((constant))
 #define __device__ __attribute__((device))
 #define __global__ __attribute__((global))
@@ -11,13 +12,22 @@
 #define __managed__ __attribute__((managed))
 #endif
 #define __launch_bounds__(...) __attribute__((launch_bounds(__VA_ARGS__)))
+#else
+#define __constant__
+#define __device__
+#define __global__
+#define __host__
+#define __shared__
+#define __managed__
+#define __launch_bounds__(...)
+#endif
 
 struct dim3 {
   unsigned x, y, z;
   __host__ __device__ dim3(unsigned x, unsigned y = 1, unsigned z = 1) : x(x), y(y), z(z) {}
 };
 
-#ifdef __HIP__
+#if __HIP__ || HIP_PLATFORM
 typedef struct hipStream *hipStream_t;
 typedef enum hipError {} hipError_t;
 int hipConfigureCall(dim3 gridSize, dim3 blockSize, size_t sharedSize = 0,
Index: clang/lib/CodeGen/CGCUDANV.cpp
===================================================================
--- clang/lib/CodeGen/CGCUDANV.cpp
+++ clang/lib/CodeGen/CGCUDANV.cpp
@@ -42,12 +42,16 @@
   llvm::LLVMContext &Context;
   /// Convenience reference to the current module
   llvm::Module &TheModule;
-  /// Keeps track of kernel launch stubs emitted in this module
+  /// Keeps track of kernel launch stubs and handles emitted in this module
   struct KernelInfo {
-    llvm::Function *Kernel;
+    llvm::Function *Kernel; // stub function to help launch kernel
     const Decl *D;
   };
   llvm::SmallVector<KernelInfo, 16> EmittedKernels;
+  // Map a device stub function to a symbol for identifying kernel in host code.
+  // For CUDA, the symbol for identifying the kernel is the same as the device
+  // stub function. For HIP, they are different.
+  llvm::DenseMap<llvm::Function *, llvm::GlobalValue *> KernelHandles;
   struct VarInfo {
     llvm::GlobalVariable *Var;
     const VarDecl *D;
@@ -270,6 +274,18 @@
 void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
                                      FunctionArgList &Args) {
   EmittedKernels.push_back({CGF.CurFn, CGF.CurFuncDecl});
+  llvm::GlobalValue *KernelHandle = CGF.CurFn;
+  if (CGF.getLangOpts().HIP) {
+    auto Linkage = CGF.CurFn->getLinkage();
+    auto *Var = new llvm::GlobalVariable(
+        TheModule, VoidPtrTy, /*isConstant=*/true, Linkage,
+        /*Initializer=*/llvm::ConstantPointerNull::get(VoidPtrTy),
+        CGM.getMangledName(GlobalDecl(cast<FunctionDecl>(CGF.CurFuncDecl),
+                                      KernelReferenceKind::Kernel)));
+    Var->setAlignment(CGM.getPointerAlign().getAsAlign());
+    KernelHandle = Var;
+  }
+  KernelHandles[CGF.CurFn] = KernelHandle;
   if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
                          CudaFeature::CUDA_USES_NEW_LAUNCH) ||
       (CGF.getLangOpts().HIP && CGF.getLangOpts().HIPUseNewLaunchAPI))
@@ -348,7 +364,8 @@
                                ShmemSize.getPointer(), Stream.getPointer()});
 
   // Emit the call to cudaLaunch
-  llvm::Value *Kernel = CGF.Builder.CreatePointerCast(CGF.CurFn, VoidPtrTy);
+  llvm::Value *Kernel =
+      CGF.Builder.CreatePointerCast(KernelHandles[CGF.CurFn], VoidPtrTy);
   CallArgList LaunchKernelArgs;
   LaunchKernelArgs.add(RValue::get(Kernel),
                        cudaLaunchKernelFD->getParamDecl(0)->getType());
@@ -403,7 +420,8 @@
 
   // Emit the call to cudaLaunch
   llvm::FunctionCallee cudaLaunchFn = getLaunchFn();
-  llvm::Value *Arg = CGF.Builder.CreatePointerCast(CGF.CurFn, CharPtrTy);
+  llvm::Value *Arg =
+      CGF.Builder.CreatePointerCast(KernelHandles[CGF.CurFn], CharPtrTy);
   CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg);
   CGF.EmitBranch(EndBlock);
 
@@ -497,7 +515,7 @@
     llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy);
     llvm::Value *Args[] = {
         &GpuBinaryHandlePtr,
-        Builder.CreateBitCast(I.Kernel, VoidPtrTy),
+        Builder.CreateBitCast(KernelHandles[I.Kernel], VoidPtrTy),
         KernelName,
         KernelName,
         llvm::ConstantInt::get(IntTy, -1),
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to