Author: David Rivera Date: 2026-02-25T01:51:26-05:00 New Revision: 29f32e674a18978755c9eeb9624c5d0edcf254ea
URL: https://github.com/llvm/llvm-project/commit/29f32e674a18978755c9eeb9624c5d0edcf254ea DIFF: https://github.com/llvm/llvm-project/commit/29f32e674a18978755c9eeb9624c5d0edcf254ea.diff LOG: [CIR][CUDA][HIP] Emit host-side kernel calls (#179809) Related: https://github.com/llvm/llvm-project/issues/179278, https://github.com/llvm/llvm-project/issues/175871 Added: Modified: clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp clang/lib/CIR/CodeGen/CIRGenCUDARuntime.cpp clang/lib/CIR/CodeGen/CIRGenCUDARuntime.h clang/lib/CIR/CodeGen/CIRGenExpr.cpp clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp clang/lib/CIR/CodeGen/CIRGenFunction.h clang/lib/CIR/CodeGen/CIRGenModule.cpp clang/test/CIR/CodeGenCUDA/kernel-call.cu Removed: ################################################################################ diff --git a/clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp b/clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp index 451c28c3cccc1..8b8e99023eceb 100644 --- a/clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp @@ -49,6 +49,12 @@ class CIRGenNVCUDARuntime : public CIRGenCUDARuntime { mlir::Value prepareKernelArgs(CIRGenFunction &cgf, mlir::Location loc, FunctionArgList &args); mlir::Operation *getKernelHandle(cir::FuncOp fn, GlobalDecl gd) override; + + mlir::Operation *getKernelStub(mlir::Operation *handle) override { + auto it = kernelStubs.find(handle); + assert(it != kernelStubs.end()); + return it->second; + } std::string addPrefixToName(StringRef funcName) const; std::string addUnderscoredPrefixToName(StringRef funcName) const; diff --git a/clang/lib/CIR/CodeGen/CIRGenCUDARuntime.cpp b/clang/lib/CIR/CodeGen/CIRGenCUDARuntime.cpp index 14189ad7a52f3..25d981ef2f64b 100644 --- a/clang/lib/CIR/CodeGen/CIRGenCUDARuntime.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenCUDARuntime.cpp @@ -13,8 +13,33 @@ //===----------------------------------------------------------------------===// #include "CIRGenCUDARuntime.h" +#include "CIRGenBuilder.h" +#include "CIRGenFunction.h" +#include "clang/AST/ExprCXX.h" using namespace clang; using namespace CIRGen; CIRGenCUDARuntime::~CIRGenCUDARuntime() {} + +RValue CIRGenCUDARuntime::emitCUDAKernelCallExpr(CIRGenFunction &cgf, + const CUDAKernelCallExpr *expr, + ReturnValueSlot retValue) { + + CIRGenBuilderTy &builder = cgm.getBuilder(); + mlir::Location loc = + cgf.currSrcLoc ? cgf.currSrcLoc.value() : builder.getUnknownLoc(); + + cgf.emitIfOnBoolExpr( + expr->getConfig(), + [&](mlir::OpBuilder &b, mlir::Location l) { cir::YieldOp::create(b, l); }, + loc, + [&](mlir::OpBuilder &b, mlir::Location l) { + CIRGenCallee callee = cgf.emitCallee(expr->getCallee()); + cgf.emitCall(expr->getCallee()->getType(), callee, expr, retValue); + cir::YieldOp::create(b, l); + }, + loc); + + return RValue::get(nullptr); +} diff --git a/clang/lib/CIR/CodeGen/CIRGenCUDARuntime.h b/clang/lib/CIR/CodeGen/CIRGenCUDARuntime.h index 83eb0c02188ba..ba33602511e3b 100644 --- a/clang/lib/CIR/CodeGen/CIRGenCUDARuntime.h +++ b/clang/lib/CIR/CodeGen/CIRGenCUDARuntime.h @@ -40,7 +40,13 @@ class CIRGenCUDARuntime { virtual void emitDeviceStub(CIRGenFunction &cgf, cir::FuncOp fn, FunctionArgList &args) = 0; + virtual RValue emitCUDAKernelCallExpr(CIRGenFunction &cgf, + const CUDAKernelCallExpr *expr, + ReturnValueSlot retValue); + virtual mlir::Operation *getKernelHandle(cir::FuncOp fn, GlobalDecl gd) = 0; + + virtual mlir::Operation *getKernelStub(mlir::Operation *handle) = 0; }; CIRGenCUDARuntime *createNVCUDARuntime(CIRGenModule &cgm); diff --git a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp index d8519cec27fae..f8ad143977ff1 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp @@ -1980,7 +1980,12 @@ CIRGenCallee CIRGenFunction::emitDirectCallee(const GlobalDecl &gd) { cir::FuncOp callee = emitFunctionDeclPointer(cgm, gd); - assert(!cir::MissingFeatures::hip()); + if ((cgm.getLangOpts().CUDA || cgm.getLangOpts().HIP) && + !cgm.getLangOpts().CUDAIsDevice && fd->hasAttr<CUDAGlobalAttr>()) { + mlir::Operation *handle = cgm.getCUDARuntime().getKernelHandle(callee, gd); + callee = + mlir::cast<cir::FuncOp>(*cgm.getCUDARuntime().getKernelStub(handle)); + } return CIRGenCallee::forDirect(callee, gd); } @@ -2131,10 +2136,8 @@ RValue CIRGenFunction::emitCallExpr(const clang::CallExpr *e, if (const auto *ce = dyn_cast<CXXMemberCallExpr>(e)) return emitCXXMemberCallExpr(ce, returnValue); - if (isa<CUDAKernelCallExpr>(e)) { - cgm.errorNYI(e->getSourceRange(), "call to CUDA kernel"); - return RValue::get(nullptr); - } + if (const auto *cudaKernelCallExpr = dyn_cast<CUDAKernelCallExpr>(e)) + return emitCUDAKernelCallExpr(cudaKernelCallExpr, returnValue); if (const auto *operatorCall = dyn_cast<CXXOperatorCallExpr>(e)) { // If the callee decl is a CXXMethodDecl, we need to emit this as a C++ diff --git a/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp b/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp index 5d28ac6097c80..518b3e55bf79d 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp @@ -265,6 +265,14 @@ CIRGenFunction::emitCXXOperatorMemberCallExpr(const CXXOperatorCallExpr *e, /*IsArrow=*/false, e->getArg(0)); } +RValue CIRGenFunction::emitCUDAKernelCallExpr(const CUDAKernelCallExpr *expr, + ReturnValueSlot returnValue) { + // Emit as a device kernel call if CUDA device code is to be generated. + if (!getLangOpts().HIP && getLangOpts().CUDAIsDevice) + cgm.errorNYI("CUDA Device side kernel call"); + return cgm.getCUDARuntime().emitCUDAKernelCallExpr(*this, expr, returnValue); +} + RValue CIRGenFunction::emitCXXMemberOrOperatorCall( const CXXMethodDecl *md, const CIRGenCallee &callee, ReturnValueSlot returnValue, mlir::Value thisPtr, mlir::Value implicitParam, diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h index 4a9a1710fdfae..bb29d47dbb0e5 100644 --- a/clang/lib/CIR/CodeGen/CIRGenFunction.h +++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h @@ -1589,6 +1589,9 @@ class CIRGenFunction : public CIRGenTypeCache { const CXXMethodDecl *md, ReturnValueSlot returnValue); + RValue emitCUDAKernelCallExpr(const CUDAKernelCallExpr *expr, + ReturnValueSlot returnValue); + RValue emitCXXPseudoDestructorExpr(const CXXPseudoDestructorExpr *expr); RValue emitNewOrDeleteBuiltinCall(const FunctionProtoType *type, diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.cpp b/clang/lib/CIR/CodeGen/CIRGenModule.cpp index 953f9221b07a4..223b53731359a 100644 --- a/clang/lib/CIR/CodeGen/CIRGenModule.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenModule.cpp @@ -1820,7 +1820,10 @@ cir::FuncOp CIRGenModule::getAddrOfFunction(clang::GlobalDecl gd, cast<FunctionDecl>(gd.getDecl())->hasAttr<CUDAGlobalAttr>()) { mlir::Operation *handle = getCUDARuntime().getKernelHandle(func, gd); - if (isForDefinition) + // For HIP the kernel handle is a GlobalOp, which cannot be cast to + // FuncOp. Return the stub directly in that case. + bool isHIPHandle = mlir::isa<cir::GlobalOp>(*handle); + if (isForDefinition || isHIPHandle) return func; return mlir::dyn_cast<cir::FuncOp>(*handle); } diff --git a/clang/test/CIR/CodeGenCUDA/kernel-call.cu b/clang/test/CIR/CodeGenCUDA/kernel-call.cu index d1ecb88a7ee51..2d37b6eef73af 100644 --- a/clang/test/CIR/CodeGenCUDA/kernel-call.cu +++ b/clang/test/CIR/CodeGenCUDA/kernel-call.cu @@ -1,5 +1,6 @@ // Based on clang/test/CodeGenCUDA/kernel-call.cu. -// Tests device stub body emission for CUDA and HIP kernels. +// Tests device stub body emission and kernel launch for CUDA/HIP. + // RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -target-sdk-version=9.2 \ // RUN: -emit-cir %s -x cuda -o %t.cir @@ -9,6 +10,9 @@ // RUN: -x hip -emit-cir %s -o %t.hip.cir // RUN: FileCheck --input-file=%t.hip.cir %s --check-prefix=HIP-NEW +// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -target-sdk-version=9.2 \ +// RUN: -emit-cir %s -x cuda -fcuda-is-device -o %t.device.cir +// RUN: FileCheck --input-file=%t.device.cir %s --check-prefix=DEVICE #include "Inputs/cuda.h" @@ -59,3 +63,56 @@ // HIP-NEW: cir.get_global @_Z6kernelif : !cir.ptr<!cir.func<(!s32i, !cir.float)>> // HIP-NEW: cir.call @hipLaunchKernel({{.*}}) : (!cir.ptr<!void> {{.*}}, !rec_dim3, !rec_dim3, !cir.ptr<!cir.ptr<!void>>{{.*}}, !u64i{{.*}}, !cir.ptr<!rec_hipStream>{{.*}}) -> (!u32i {llvm.noundef}) __global__ void kernel(int x, float y) {} + +// ===----------------------------------------------------------------------=== +// Kernel launch site checks +// ===----------------------------------------------------------------------=== + + +// Device compilation should not emit main +// DEVICE-NOT: @main + +// CUDA-NEW-LABEL: cir.func {{.*}} @main +// HIP-NEW-LABEL: cir.func {{.*}} @main +int main(void) { + // Check dim3 temporaries are allocated for grid and block dimensions + // CUDA-NEW-DAG: cir.alloca !rec_dim3, {{.*}} ["agg.tmp0"] + // CUDA-NEW-DAG: cir.alloca !rec_dim3, {{.*}} ["agg.tmp1"] + // HIP-NEW-DAG: cir.alloca !rec_dim3, {{.*}} ["agg.tmp0"] + // HIP-NEW-DAG: cir.alloca !rec_dim3, {{.*}} ["agg.tmp1"] + // + // Check dim3 constructors are called for grid and block dimensions + // CUDA-NEW: cir.call @_ZN4dim3C1Ejjj({{.*}}) : (!cir.ptr<!rec_dim3> {llvm.align = 4 : i64, llvm.dereferenceable = 12 : i64, llvm.nonnull, llvm.noundef}, !u32i {llvm.noundef}, !u32i {llvm.noundef}, !u32i {llvm.noundef}) -> () + // CUDA-NEW: cir.call @_ZN4dim3C1Ejjj({{.*}}) : (!cir.ptr<!rec_dim3> {llvm.align = 4 : i64, llvm.dereferenceable = 12 : i64, llvm.nonnull, llvm.noundef}, !u32i {llvm.noundef}, !u32i {llvm.noundef}, !u32i {llvm.noundef}) -> () + // HIP-NEW: cir.call @_ZN4dim3C1Ejjj({{.*}}) : (!cir.ptr<!rec_dim3> {llvm.align = 4 : i64, llvm.dereferenceable = 12 : i64, llvm.nonnull, llvm.noundef}, !u32i {llvm.noundef}, !u32i {llvm.noundef}, !u32i {llvm.noundef}) -> () + // HIP-NEW: cir.call @_ZN4dim3C1Ejjj({{.*}}) : (!cir.ptr<!rec_dim3> {llvm.align = 4 : i64, llvm.dereferenceable = 12 : i64, llvm.nonnull, llvm.noundef}, !u32i {llvm.noundef}, !u32i {llvm.noundef}, !u32i {llvm.noundef}) -> () + // + // Check default shared memory (0) and null stream are set + // CUDA-NEW: cir.const #cir.int<0> : !u64i + // CUDA-NEW: cir.const #cir.ptr<null> : !cir.ptr<!rec_cudaStream> + // HIP-NEW: cir.const #cir.int<0> : !u64i + // HIP-NEW: cir.const #cir.ptr<null> : !cir.ptr<!rec_hipStream> + // + // Check Push call configuration is called with grid, block, shared mem, stream + // CUDA-NEW: cir.call @__cudaPushCallConfiguration({{.*}}) : (!rec_dim3, !rec_dim3, !u64i {llvm.noundef}, !cir.ptr<!rec_cudaStream> {llvm.noundef}) -> !s32i + // HIP-NEW: cir.call @__hipPushCallConfiguration({{.*}}) : (!rec_dim3, !rec_dim3, !u64i {llvm.noundef}, !cir.ptr<!rec_hipStream> {llvm.noundef}) -> !u32i + // + // Check the config result is cast to bool for the conditional + // CUDA-NEW: cir.cast int_to_bool {{.*}} : !s32i -> !cir.bool + // HIP-NEW: cir.cast int_to_bool {{.*}} : !u32i -> !cir.bool + // + // Check conditional launch: if config fails (true), skip; else call kernel + // CUDA-NEW: cir.if %{{.*}} { + // CUDA-NEW: } else { + // CUDA-NEW: cir.const #cir.int<42> : !s32i + // CUDA-NEW: cir.const #cir.fp<1.000000e+00> : !cir.float + // CUDA-NEW: cir.call @_Z21__device_stub__kernelif({{.*}}) {cu.kernel_name = #cir.cu.kernel_name<_Z6kernelif>} : (!s32i {llvm.noundef}, !cir.float {llvm.noundef}) -> () + // CUDA-NEW: } + // HIP-NEW: cir.if %{{.*}} { + // HIP-NEW: } else { + // HIP-NEW: cir.const #cir.int<42> : !s32i + // HIP-NEW: cir.const #cir.fp<1.000000e+00> : !cir.float + // HIP-NEW: cir.call @_Z21__device_stub__kernelif({{.*}}) {cu.kernel_name = #cir.cu.kernel_name<_Z6kernelif>} : (!s32i {llvm.noundef}, !cir.float {llvm.noundef}) -> () + // HIP-NEW: } + kernel<<<1, 1>>>(42, 1.0f); +} _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
