[clang] 29f32e6 - [CIR][CUDA][HIP] Emit host-side kernel calls (#179809)

via cfe-commits Tue, 24 Feb 2026 22:51:38 -0800

Author: David Rivera
Date: 2026-02-25T01:51:26-05:00
New Revision: 29f32e674a18978755c9eeb9624c5d0edcf254ea


URL: 
https://github.com/llvm/llvm-project/commit/29f32e674a18978755c9eeb9624c5d0edcf254ea
DIFF: 
https://github.com/llvm/llvm-project/commit/29f32e674a18978755c9eeb9624c5d0edcf254ea.diff

LOG: [CIR][CUDA][HIP] Emit host-side kernel calls (#179809)

Related: https://github.com/llvm/llvm-project/issues/179278,
https://github.com/llvm/llvm-project/issues/175871

Added: 
    

Modified: 
    clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp
    clang/lib/CIR/CodeGen/CIRGenCUDARuntime.cpp
    clang/lib/CIR/CodeGen/CIRGenCUDARuntime.h
    clang/lib/CIR/CodeGen/CIRGenExpr.cpp
    clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp
    clang/lib/CIR/CodeGen/CIRGenFunction.h
    clang/lib/CIR/CodeGen/CIRGenModule.cpp
    clang/test/CIR/CodeGenCUDA/kernel-call.cu

Removed: 
    


################################################################################
diff  --git a/clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp 
b/clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp
index 451c28c3cccc1..8b8e99023eceb 100644
--- a/clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp
@@ -49,6 +49,12 @@ class CIRGenNVCUDARuntime : public CIRGenCUDARuntime {
   mlir::Value prepareKernelArgs(CIRGenFunction &cgf, mlir::Location loc,
                                 FunctionArgList &args);
   mlir::Operation *getKernelHandle(cir::FuncOp fn, GlobalDecl gd) override;
+
+  mlir::Operation *getKernelStub(mlir::Operation *handle) override {
+    auto it = kernelStubs.find(handle);
+    assert(it != kernelStubs.end());
+    return it->second;
+  }
   std::string addPrefixToName(StringRef funcName) const;
   std::string addUnderscoredPrefixToName(StringRef funcName) const;
 

diff  --git a/clang/lib/CIR/CodeGen/CIRGenCUDARuntime.cpp 
b/clang/lib/CIR/CodeGen/CIRGenCUDARuntime.cpp
index 14189ad7a52f3..25d981ef2f64b 100644
--- a/clang/lib/CIR/CodeGen/CIRGenCUDARuntime.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenCUDARuntime.cpp
@@ -13,8 +13,33 @@
 
//===----------------------------------------------------------------------===//
 
 #include "CIRGenCUDARuntime.h"
+#include "CIRGenBuilder.h"
+#include "CIRGenFunction.h"
+#include "clang/AST/ExprCXX.h"
 
 using namespace clang;
 using namespace CIRGen;
 
 CIRGenCUDARuntime::~CIRGenCUDARuntime() {}
+
+RValue CIRGenCUDARuntime::emitCUDAKernelCallExpr(CIRGenFunction &cgf,
+                                                 const CUDAKernelCallExpr 
*expr,
+                                                 ReturnValueSlot retValue) {
+
+  CIRGenBuilderTy &builder = cgm.getBuilder();
+  mlir::Location loc =
+      cgf.currSrcLoc ? cgf.currSrcLoc.value() : builder.getUnknownLoc();
+
+  cgf.emitIfOnBoolExpr(
+      expr->getConfig(),
+      [&](mlir::OpBuilder &b, mlir::Location l) { cir::YieldOp::create(b, l); 
},
+      loc,
+      [&](mlir::OpBuilder &b, mlir::Location l) {
+        CIRGenCallee callee = cgf.emitCallee(expr->getCallee());
+        cgf.emitCall(expr->getCallee()->getType(), callee, expr, retValue);
+        cir::YieldOp::create(b, l);
+      },
+      loc);
+
+  return RValue::get(nullptr);
+}

diff  --git a/clang/lib/CIR/CodeGen/CIRGenCUDARuntime.h 
b/clang/lib/CIR/CodeGen/CIRGenCUDARuntime.h
index 83eb0c02188ba..ba33602511e3b 100644
--- a/clang/lib/CIR/CodeGen/CIRGenCUDARuntime.h
+++ b/clang/lib/CIR/CodeGen/CIRGenCUDARuntime.h
@@ -40,7 +40,13 @@ class CIRGenCUDARuntime {
   virtual void emitDeviceStub(CIRGenFunction &cgf, cir::FuncOp fn,
                               FunctionArgList &args) = 0;
 
+  virtual RValue emitCUDAKernelCallExpr(CIRGenFunction &cgf,
+                                        const CUDAKernelCallExpr *expr,
+                                        ReturnValueSlot retValue);
+
   virtual mlir::Operation *getKernelHandle(cir::FuncOp fn, GlobalDecl gd) = 0;
+
+  virtual mlir::Operation *getKernelStub(mlir::Operation *handle) = 0;
 };
 
 CIRGenCUDARuntime *createNVCUDARuntime(CIRGenModule &cgm);

diff  --git a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp 
b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
index d8519cec27fae..f8ad143977ff1 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
@@ -1980,7 +1980,12 @@ CIRGenCallee CIRGenFunction::emitDirectCallee(const 
GlobalDecl &gd) {
 
   cir::FuncOp callee = emitFunctionDeclPointer(cgm, gd);
 
-  assert(!cir::MissingFeatures::hip());
+  if ((cgm.getLangOpts().CUDA || cgm.getLangOpts().HIP) &&
+      !cgm.getLangOpts().CUDAIsDevice && fd->hasAttr<CUDAGlobalAttr>()) {
+    mlir::Operation *handle = cgm.getCUDARuntime().getKernelHandle(callee, gd);
+    callee =
+        mlir::cast<cir::FuncOp>(*cgm.getCUDARuntime().getKernelStub(handle));
+  }
 
   return CIRGenCallee::forDirect(callee, gd);
 }
@@ -2131,10 +2136,8 @@ RValue CIRGenFunction::emitCallExpr(const 
clang::CallExpr *e,
   if (const auto *ce = dyn_cast<CXXMemberCallExpr>(e))
     return emitCXXMemberCallExpr(ce, returnValue);
 
-  if (isa<CUDAKernelCallExpr>(e)) {
-    cgm.errorNYI(e->getSourceRange(), "call to CUDA kernel");
-    return RValue::get(nullptr);
-  }
+  if (const auto *cudaKernelCallExpr = dyn_cast<CUDAKernelCallExpr>(e))
+    return emitCUDAKernelCallExpr(cudaKernelCallExpr, returnValue);
 
   if (const auto *operatorCall = dyn_cast<CXXOperatorCallExpr>(e)) {
     // If the callee decl is a CXXMethodDecl, we need to emit this as a C++

diff  --git a/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp 
b/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp
index 5d28ac6097c80..518b3e55bf79d 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp
@@ -265,6 +265,14 @@ CIRGenFunction::emitCXXOperatorMemberCallExpr(const 
CXXOperatorCallExpr *e,
       /*IsArrow=*/false, e->getArg(0));
 }
 
+RValue CIRGenFunction::emitCUDAKernelCallExpr(const CUDAKernelCallExpr *expr,
+                                              ReturnValueSlot returnValue) {
+  // Emit as a device kernel call if CUDA device code is to be generated.
+  if (!getLangOpts().HIP && getLangOpts().CUDAIsDevice)
+    cgm.errorNYI("CUDA Device side kernel call");
+  return cgm.getCUDARuntime().emitCUDAKernelCallExpr(*this, expr, returnValue);
+}
+
 RValue CIRGenFunction::emitCXXMemberOrOperatorCall(
     const CXXMethodDecl *md, const CIRGenCallee &callee,
     ReturnValueSlot returnValue, mlir::Value thisPtr, mlir::Value 
implicitParam,

diff  --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h 
b/clang/lib/CIR/CodeGen/CIRGenFunction.h
index 4a9a1710fdfae..bb29d47dbb0e5 100644
--- a/clang/lib/CIR/CodeGen/CIRGenFunction.h
+++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h
@@ -1589,6 +1589,9 @@ class CIRGenFunction : public CIRGenTypeCache {
                                        const CXXMethodDecl *md,
                                        ReturnValueSlot returnValue);
 
+  RValue emitCUDAKernelCallExpr(const CUDAKernelCallExpr *expr,
+                                ReturnValueSlot returnValue);
+
   RValue emitCXXPseudoDestructorExpr(const CXXPseudoDestructorExpr *expr);
 
   RValue emitNewOrDeleteBuiltinCall(const FunctionProtoType *type,

diff  --git a/clang/lib/CIR/CodeGen/CIRGenModule.cpp 
b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
index 953f9221b07a4..223b53731359a 100644
--- a/clang/lib/CIR/CodeGen/CIRGenModule.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
@@ -1820,7 +1820,10 @@ cir::FuncOp 
CIRGenModule::getAddrOfFunction(clang::GlobalDecl gd,
       cast<FunctionDecl>(gd.getDecl())->hasAttr<CUDAGlobalAttr>()) {
     mlir::Operation *handle = getCUDARuntime().getKernelHandle(func, gd);
 
-    if (isForDefinition)
+    // For HIP the kernel handle is a GlobalOp, which cannot be cast to
+    // FuncOp. Return the stub directly in that case.
+    bool isHIPHandle = mlir::isa<cir::GlobalOp>(*handle);
+    if (isForDefinition || isHIPHandle)
       return func;
     return mlir::dyn_cast<cir::FuncOp>(*handle);
   }

diff  --git a/clang/test/CIR/CodeGenCUDA/kernel-call.cu 
b/clang/test/CIR/CodeGenCUDA/kernel-call.cu
index d1ecb88a7ee51..2d37b6eef73af 100644
--- a/clang/test/CIR/CodeGenCUDA/kernel-call.cu
+++ b/clang/test/CIR/CodeGenCUDA/kernel-call.cu
@@ -1,5 +1,6 @@
 // Based on clang/test/CodeGenCUDA/kernel-call.cu.
-// Tests device stub body emission for CUDA and HIP kernels.
+// Tests device stub body emission and kernel launch for CUDA/HIP.
+
 
 // RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -target-sdk-version=9.2 \
 // RUN:   -emit-cir %s -x cuda -o %t.cir
@@ -9,6 +10,9 @@
 // RUN:   -x hip -emit-cir %s -o %t.hip.cir
 // RUN: FileCheck --input-file=%t.hip.cir %s --check-prefix=HIP-NEW
 
+// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -target-sdk-version=9.2 \
+// RUN:   -emit-cir %s -x cuda -fcuda-is-device -o %t.device.cir
+// RUN: FileCheck --input-file=%t.device.cir %s --check-prefix=DEVICE
 
 #include "Inputs/cuda.h"
 
@@ -59,3 +63,56 @@
 // HIP-NEW: cir.get_global @_Z6kernelif : !cir.ptr<!cir.func<(!s32i, 
!cir.float)>>
 // HIP-NEW: cir.call @hipLaunchKernel({{.*}}) : (!cir.ptr<!void> {{.*}}, 
!rec_dim3, !rec_dim3, !cir.ptr<!cir.ptr<!void>>{{.*}}, !u64i{{.*}}, 
!cir.ptr<!rec_hipStream>{{.*}}) -> (!u32i {llvm.noundef})
 __global__ void kernel(int x, float y) {}
+
+// ===----------------------------------------------------------------------===
+// Kernel launch site checks
+// ===----------------------------------------------------------------------===
+
+
+// Device compilation should not emit main
+// DEVICE-NOT: @main
+
+// CUDA-NEW-LABEL: cir.func {{.*}} @main
+// HIP-NEW-LABEL: cir.func {{.*}} @main
+int main(void) {
+  // Check dim3 temporaries are allocated for grid and block dimensions
+  // CUDA-NEW-DAG: cir.alloca !rec_dim3, {{.*}} ["agg.tmp0"]
+  // CUDA-NEW-DAG: cir.alloca !rec_dim3, {{.*}} ["agg.tmp1"]
+  // HIP-NEW-DAG: cir.alloca !rec_dim3, {{.*}} ["agg.tmp0"]
+  // HIP-NEW-DAG: cir.alloca !rec_dim3, {{.*}} ["agg.tmp1"]
+  //
+  // Check dim3 constructors are called for grid and block dimensions
+  // CUDA-NEW: cir.call @_ZN4dim3C1Ejjj({{.*}}) : (!cir.ptr<!rec_dim3> 
{llvm.align = 4 : i64, llvm.dereferenceable = 12 : i64, llvm.nonnull, 
llvm.noundef}, !u32i {llvm.noundef}, !u32i {llvm.noundef}, !u32i 
{llvm.noundef}) -> ()
+  // CUDA-NEW: cir.call @_ZN4dim3C1Ejjj({{.*}}) : (!cir.ptr<!rec_dim3> 
{llvm.align = 4 : i64, llvm.dereferenceable = 12 : i64, llvm.nonnull, 
llvm.noundef}, !u32i {llvm.noundef}, !u32i {llvm.noundef}, !u32i 
{llvm.noundef}) -> ()
+  // HIP-NEW: cir.call @_ZN4dim3C1Ejjj({{.*}}) : (!cir.ptr<!rec_dim3> 
{llvm.align = 4 : i64, llvm.dereferenceable = 12 : i64, llvm.nonnull, 
llvm.noundef}, !u32i {llvm.noundef}, !u32i {llvm.noundef}, !u32i 
{llvm.noundef}) -> ()
+  // HIP-NEW: cir.call @_ZN4dim3C1Ejjj({{.*}}) : (!cir.ptr<!rec_dim3> 
{llvm.align = 4 : i64, llvm.dereferenceable = 12 : i64, llvm.nonnull, 
llvm.noundef}, !u32i {llvm.noundef}, !u32i {llvm.noundef}, !u32i 
{llvm.noundef}) -> ()
+  //
+  // Check default shared memory (0) and null stream are set
+  // CUDA-NEW: cir.const #cir.int<0> : !u64i
+  // CUDA-NEW: cir.const #cir.ptr<null> : !cir.ptr<!rec_cudaStream>
+  // HIP-NEW: cir.const #cir.int<0> : !u64i
+  // HIP-NEW: cir.const #cir.ptr<null> : !cir.ptr<!rec_hipStream>
+  //
+  // Check Push call configuration is called with grid, block, shared mem, 
stream
+  // CUDA-NEW: cir.call @__cudaPushCallConfiguration({{.*}}) : (!rec_dim3, 
!rec_dim3, !u64i {llvm.noundef}, !cir.ptr<!rec_cudaStream> {llvm.noundef}) -> 
!s32i
+  // HIP-NEW: cir.call @__hipPushCallConfiguration({{.*}}) : (!rec_dim3, 
!rec_dim3, !u64i {llvm.noundef}, !cir.ptr<!rec_hipStream> {llvm.noundef}) -> 
!u32i
+  //
+  // Check the config result is cast to bool for the conditional
+  // CUDA-NEW: cir.cast int_to_bool {{.*}} : !s32i -> !cir.bool
+  // HIP-NEW: cir.cast int_to_bool {{.*}} : !u32i -> !cir.bool
+  //
+  // Check conditional launch: if config fails (true), skip; else call kernel
+  // CUDA-NEW: cir.if %{{.*}} {
+  // CUDA-NEW: } else {
+  // CUDA-NEW:   cir.const #cir.int<42> : !s32i
+  // CUDA-NEW:   cir.const #cir.fp<1.000000e+00> : !cir.float
+  // CUDA-NEW:   cir.call @_Z21__device_stub__kernelif({{.*}}) {cu.kernel_name 
= #cir.cu.kernel_name<_Z6kernelif>} : (!s32i {llvm.noundef}, !cir.float 
{llvm.noundef}) -> ()
+  // CUDA-NEW: }
+  // HIP-NEW: cir.if %{{.*}} {
+  // HIP-NEW: } else {
+  // HIP-NEW:   cir.const #cir.int<42> : !s32i
+  // HIP-NEW:   cir.const #cir.fp<1.000000e+00> : !cir.float
+  // HIP-NEW:   cir.call @_Z21__device_stub__kernelif({{.*}}) {cu.kernel_name 
= #cir.cu.kernel_name<_Z6kernelif>} : (!s32i {llvm.noundef}, !cir.float 
{llvm.noundef}) -> ()
+  // HIP-NEW: }
+  kernel<<<1, 1>>>(42, 1.0f);
+}


        
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] 29f32e6 - [CIR][CUDA][HIP] Emit host-side kernel calls (#179809)

Reply via email to