https://github.com/darkbuck updated https://github.com/llvm/llvm-project/pull/165519
From 613b656c7c36c8c41d54fa30dc7a8131467c69eb Mon Sep 17 00:00:00 2001
From: Michael Liao <[email protected]>
Date: Sat, 18 Oct 2025 19:46:39 -0400
Subject: [PATCH] [CUDA] Add device-side kernel launch support

- CUDA's dynamic parallelism extension allows device-side kernel
  launches, which share the same syntax as host-side launches, e.g.,

    kernel<<<Dg, Db, Ns, S>>>(arguments);

  but differ in code generation. A device-side kernel launch is
  eventually translated into the following sequence:

    config = cudaGetParameterBuffer(alignment, size);
    // Set up arguments by copying them into `config`.
    cudaLaunchDevice(func, config, Dg, Db, Ns, S);

- To support device-side kernel launches, 'CUDAKernelCallExpr' is
  reused, but its config expr is set to a call to 'cudaLaunchDevice'.
  During code generation, 'CUDAKernelCallExpr' is expanded into the
  sequence above.

- Device-side kernel launches require the code to be compiled as
  relocatable device code and linked with '-lcudadevrt', so
  'clang-nvlink-wrapper' is modified to forward archives containing fat
  binaries directly to nvlink.
---
 clang/include/clang/AST/ASTContext.h          |  16 +++
 .../clang/Basic/DiagnosticSemaKinds.td        |   8 ++
 clang/include/clang/Sema/SemaCUDA.h           |   5 +
 clang/include/clang/Serialization/ASTReader.h |   2 +-
 clang/lib/CodeGen/CGCUDARuntime.cpp           | 106 ++++++++++++++++++
 clang/lib/CodeGen/CGCUDARuntime.h             |   4 +
 clang/lib/CodeGen/CGExprCXX.cpp               |   6 +
 clang/lib/Sema/SemaCUDA.cpp                   |  99 +++++++++++++++-
 clang/lib/Sema/SemaDecl.cpp                   |  32 ++++--
 clang/lib/Serialization/ASTReader.cpp         |   8 +-
 clang/lib/Serialization/ASTWriter.cpp         |  37 +++---
 clang/test/CodeGenCUDA/Inputs/cuda.h          |   8 +-
 clang/test/CodeGenCUDA/device-kernel-call.cu  |  35 ++++++
 clang/test/SemaCUDA/Inputs/cuda.h             |   7 ++
 .../test/SemaCUDA/call-kernel-from-kernel.cu  |   5 +-
 clang/test/SemaCUDA/device-kernel-call.cu     |  23 ++++
 clang/test/SemaCUDA/function-overload.cu      |  26 ++---
 clang/test/SemaCUDA/function-target.cu        |   4 +-
 clang/test/SemaCUDA/reference-to-kernel-fn.cu |   4 +-
 .../ClangNVLinkWrapper.cpp                    |  46 ++++++++
 20 files changed, 426 insertions(+), 55 deletions(-)
 create mode 100644 clang/test/CodeGenCUDA/device-kernel-call.cu
 create mode 100644 clang/test/SemaCUDA/device-kernel-call.cu
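For context, here is the user-visible feature and the lowering described above as a minimal sketch (illustrative only: the kernel names are made up, the code must be compiled with -fgpu-rdc, and the lowered form shown in comments approximates what this patch generates rather than quoting its output verbatim):

  __global__ void child(int x) { /* ... */ }

  __global__ void parent() {
    // Source form: same syntax as a host-side launch.
    child<<<1, 1>>>(42);

    // Approximate lowering per this patch:
    //   void *config = cudaGetParameterBuffer(/*alignment=*/64,
    //                                         /*size=*/sizeof(int));
    //   if (config) {
    //     *(int *)config = 42; // copy each kernel argument into the buffer
    //     cudaLaunchDevice((void *)child, config, /*gridDim=*/dim3(1),
    //                      /*blockDim=*/dim3(1), /*sharedMem=*/0,
    //                      /*stream=*/nullptr);
    //   }
  }

Note the null check on the buffer: the launch only happens when cudaGetParameterBuffer succeeds, which mirrors the dkcall.configok/dkcall.end blocks in the CodeGen test below.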
diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h
index 6e9e737dcae4f..303e8f0e9a7a4 100644
--- a/clang/include/clang/AST/ASTContext.h
+++ b/clang/include/clang/AST/ASTContext.h
@@ -500,6 +500,10 @@ class ASTContext : public RefCountedBase<ASTContext> {
   /// Declaration for the CUDA cudaConfigureCall function.
   FunctionDecl *cudaConfigureCallDecl = nullptr;
+  /// Declaration for the CUDA cudaGetParameterBuffer function.
+  FunctionDecl *cudaGetParameterBufferDecl = nullptr;
+  /// Declaration for the CUDA cudaLaunchDevice function.
+  FunctionDecl *cudaLaunchDeviceDecl = nullptr;
 
   /// Keeps track of all declaration attributes.
   ///
@@ -1653,6 +1657,18 @@ class ASTContext : public RefCountedBase<ASTContext> {
     return cudaConfigureCallDecl;
   }
 
+  void setcudaGetParameterBufferDecl(FunctionDecl *FD) {
+    cudaGetParameterBufferDecl = FD;
+  }
+
+  FunctionDecl *getcudaGetParameterBufferDecl() {
+    return cudaGetParameterBufferDecl;
+  }
+
+  void setcudaLaunchDeviceDecl(FunctionDecl *FD) { cudaLaunchDeviceDecl = FD; }
+
+  FunctionDecl *getcudaLaunchDeviceDecl() { return cudaLaunchDeviceDecl; }
+
   /// Returns true iff we need copy/dispose helpers for the given type.
   bool BlockRequiresCopying(QualType Ty, const VarDecl *D);
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 3e864475f22a1..b18585b545226 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -9512,6 +9512,8 @@ def err_kern_is_nonstatic_method : Error<
   "kernel function %0 must be a free function or static member function">;
 def err_config_scalar_return : Error<
   "CUDA special function '%0' must have scalar return type">;
+def err_config_pointer_return
+    : Error<"CUDA special function '%0' must have pointer return type">;
 def err_kern_call_not_global_function : Error<
   "kernel call to non-global function %0">;
 def err_global_call_not_config : Error<
@@ -13707,4 +13709,10 @@ def err_amdgcn_load_lds_size_invalid_value : Error<"invalid size value">;
 def note_amdgcn_load_lds_size_valid_value : Note<"size must be %select{1, 2, or 4|1, 2, 4, 12 or 16}0">;
 def err_amdgcn_coop_atomic_invalid_as : Error<"cooperative atomic requires a global or generic pointer">;
+
+def err_cuda_device_kernel_launch_not_supported
+    : Error<"device-side kernel call/launch is not supported">;
+def err_cuda_device_kernel_launch_require_rdc
+    : Error<"kernel launch from __device__ or __global__ function requires "
+            "relocatable device code (i.e. requires -fgpu-rdc)">;
 } // end of sema component.
diff --git a/clang/include/clang/Sema/SemaCUDA.h b/clang/include/clang/Sema/SemaCUDA.h
index dbc1432860d89..dbb4290f5d149 100644
--- a/clang/include/clang/Sema/SemaCUDA.h
+++ b/clang/include/clang/Sema/SemaCUDA.h
@@ -273,6 +273,11 @@ class SemaCUDA : public SemaBase {
   /// of the function that will be called to configure kernel call, with the
   /// parameters specified via <<<>>>.
   std::string getConfigureFuncName() const;
+  /// Return the name of the parameter buffer allocation function for the
+  /// device kernel launch.
+  std::string getGetParameterBufferFuncName() const;
+  /// Return the name of the device kernel launch function.
+  std::string getLaunchDeviceFuncName() const;
 
   /// Record variables that are potentially ODR-used in CUDA/HIP.
   void recordPotentialODRUsedVariable(MultiExprArg Args,
diff --git a/clang/include/clang/Serialization/ASTReader.h b/clang/include/clang/Serialization/ASTReader.h
index a27cfe8a9b307..d276f0d21b958 100644
--- a/clang/include/clang/Serialization/ASTReader.h
+++ b/clang/include/clang/Serialization/ASTReader.h
@@ -1005,7 +1005,7 @@ class ASTReader
   ///
   /// The AST context tracks a few important decls, currently cudaConfigureCall,
   /// directly.
-  SmallVector<GlobalDeclID, 2> CUDASpecialDeclRefs;
+  SmallVector<GlobalDeclID, 4> CUDASpecialDeclRefs;
 
   /// The floating point pragma option settings.
   SmallVector<uint64_t, 1> FPPragmaOptions;
diff --git a/clang/lib/CodeGen/CGCUDARuntime.cpp b/clang/lib/CodeGen/CGCUDARuntime.cpp
index 121a481213396..9cbdb641d00a1 100644
--- a/clang/lib/CodeGen/CGCUDARuntime.cpp
+++ b/clang/lib/CodeGen/CGCUDARuntime.cpp
@@ -22,6 +22,112 @@ using namespace CodeGen;
 
 CGCUDARuntime::~CGCUDARuntime() {}
 
+static llvm::Value *emitGetParamBuf(CodeGenFunction &CGF,
+                                    const CUDAKernelCallExpr *E) {
+  auto *GetParamBuf = CGF.getContext().getcudaGetParameterBufferDecl();
+  const FunctionProtoType *GetParamBufProto =
+      GetParamBuf->getType()->getAs<FunctionProtoType>();
+
+  DeclRefExpr *DRE = DeclRefExpr::Create(
+      CGF.getContext(), {}, {}, GetParamBuf,
+      /*RefersToEnclosingVariableOrCapture=*/false, GetParamBuf->getNameInfo(),
+      GetParamBuf->getType(), VK_PRValue);
+  auto *ImpCast = ImplicitCastExpr::Create(
+      CGF.getContext(), CGF.getContext().getPointerType(GetParamBuf->getType()),
+      CK_FunctionToPointerDecay, DRE, nullptr, VK_PRValue, FPOptionsOverride());
+
+  CGCallee Callee = CGF.EmitCallee(ImpCast);
+  CallArgList Args;
+  // Use 64B alignment.
+  Args.add(RValue::get(CGF.CGM.getSize(CharUnits::fromQuantity(64))),
+           CGF.getContext().getSizeType());
+  // Calculate parameter sizes.
+  const PointerType *PT = E->getCallee()->getType()->getAs<PointerType>();
+  const FunctionProtoType *FTP =
+      PT->getPointeeType()->getAs<FunctionProtoType>();
+  CharUnits Offset = CharUnits::Zero();
+  for (auto ArgTy : FTP->getParamTypes()) {
+    auto TInfo = CGF.CGM.getContext().getTypeInfoInChars(ArgTy);
+    Offset = Offset.alignTo(TInfo.Align) + TInfo.Width;
+  }
+  Args.add(RValue::get(CGF.CGM.getSize(Offset)),
+           CGF.getContext().getSizeType());
+  const CGFunctionInfo &CallInfo = CGF.CGM.getTypes().arrangeFreeFunctionCall(
+      Args, GetParamBufProto, /*ChainCall=*/false);
+  auto Ret = CGF.EmitCall(CallInfo, Callee, /*ReturnValue=*/{}, Args);
+
+  return Ret.getScalarVal();
+}
+
+RValue CGCUDARuntime::EmitCUDADeviceKernelCallExpr(
+    CodeGenFunction &CGF, const CUDAKernelCallExpr *E,
+    ReturnValueSlot ReturnValue, llvm::CallBase **CallOrInvoke) {
+  ASTContext &Ctx = CGM.getContext();
+  assert(Ctx.getcudaLaunchDeviceDecl() == E->getConfig()->getDirectCallee());
+
+  llvm::BasicBlock *ConfigOKBlock = CGF.createBasicBlock("dkcall.configok");
+  llvm::BasicBlock *ContBlock = CGF.createBasicBlock("dkcall.end");
+
+  llvm::Value *Config = emitGetParamBuf(CGF, E);
+  CGF.Builder.CreateCondBr(
+      CGF.Builder.CreateICmpNE(Config,
+                               llvm::Constant::getNullValue(Config->getType())),
+      ConfigOKBlock, ContBlock);
+
+  CodeGenFunction::ConditionalEvaluation eval(CGF);
+
+  eval.begin(CGF);
+  CGF.EmitBlock(ConfigOKBlock);
+
+  QualType KernelCalleeFuncTy =
+      E->getCallee()->getType()->getAs<PointerType>()->getPointeeType();
+  CGCallee KernelCallee = CGF.EmitCallee(E->getCallee());
+  // Emit kernel arguments.
+  CallArgList KernelCallArgs;
+  CGF.EmitCallArgs(KernelCallArgs,
+                   KernelCalleeFuncTy->getAs<FunctionProtoType>(),
+                   E->arguments(), E->getDirectCallee());
+  // Copy emitted kernel arguments into that parameter buffer.
+  RawAddress CfgBase(Config, CGM.Int8Ty,
+                     /*Alignment=*/CharUnits::fromQuantity(64));
+  CharUnits Offset = CharUnits::Zero();
+  for (auto &Arg : KernelCallArgs) {
+    auto TInfo = CGM.getContext().getTypeInfoInChars(Arg.getType());
+    Offset = Offset.alignTo(TInfo.Align);
+    Address Addr =
+        CGF.Builder.CreateConstInBoundsGEP(CfgBase, Offset.getQuantity());
+    Arg.copyInto(CGF, Addr);
+    Offset += TInfo.Width;
+  }
+  // Make the `cudaLaunchDevice` call, i.e. E->getConfig().
+  const CallExpr *LaunchCall = E->getConfig();
+  QualType LaunchCalleeFuncTy = LaunchCall->getCallee()
+                                    ->getType()
+                                    ->getAs<PointerType>()
+                                    ->getPointeeType();
+  CGCallee LaunchCallee = CGF.EmitCallee(LaunchCall->getCallee());
+  CallArgList LaunchCallArgs;
+  CGF.EmitCallArgs(LaunchCallArgs,
+                   LaunchCalleeFuncTy->getAs<FunctionProtoType>(),
+                   LaunchCall->arguments(), LaunchCall->getDirectCallee());
+  // Replace the func and parameter buffer arguments.
+  LaunchCallArgs[0] = CallArg(RValue::get(KernelCallee.getFunctionPointer()),
+                              CGM.getContext().VoidPtrTy);
+  LaunchCallArgs[1] = CallArg(RValue::get(Config), CGM.getContext().VoidPtrTy);
+  const CGFunctionInfo &LaunchCallInfo = CGM.getTypes().arrangeFreeFunctionCall(
+      LaunchCallArgs, LaunchCalleeFuncTy->getAs<FunctionProtoType>(),
+      /*ChainCall=*/false);
+  CGF.EmitCall(LaunchCallInfo, LaunchCallee, ReturnValue, LaunchCallArgs,
+               CallOrInvoke,
+               /*IsMustTail=*/false, E->getExprLoc());
+  CGF.EmitBranch(ContBlock);
+
+  CGF.EmitBlock(ContBlock);
+  eval.end(CGF);
+
+  return RValue::get(nullptr);
+}
+
 RValue CGCUDARuntime::EmitCUDAKernelCallExpr(CodeGenFunction &CGF,
                                              const CUDAKernelCallExpr *E,
                                              ReturnValueSlot ReturnValue,
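Both the size computation in emitGetParamBuf and the argument-copy loop in EmitCUDADeviceKernelCallExpr use the same align-then-advance layout: round the running offset up to the argument's ABI alignment, place the argument, then advance by its width. A standalone sketch of that arithmetic (a hypothetical helper, not part of the patch):

  #include <cstddef>
  #include <vector>

  struct ArgInfo { size_t Size, Align; };

  // Round Offset up to the next multiple of Align (Align > 0).
  static size_t alignTo(size_t Offset, size_t Align) {
    return (Offset + Align - 1) / Align * Align;
  }

  // Byte offsets of each argument in the parameter buffer, computed the way
  // the codegen above walks the kernel's argument list.
  std::vector<size_t> paramBufferOffsets(const std::vector<ArgInfo> &Args) {
    std::vector<size_t> Offsets;
    size_t Offset = 0;
    for (const ArgInfo &A : Args) {
      Offset = alignTo(Offset, A.Align); // pad to the argument's alignment
      Offsets.push_back(Offset);
      Offset += A.Size;                  // advance past the argument
    }
    return Offsets; // the final Offset is the size passed to
                    // cudaGetParameterBuffer
  }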
diff --git a/clang/lib/CodeGen/CGCUDARuntime.h b/clang/lib/CodeGen/CGCUDARuntime.h
index 86f776004ee7c..64fb9a31422e0 100644
--- a/clang/lib/CodeGen/CGCUDARuntime.h
+++ b/clang/lib/CodeGen/CGCUDARuntime.h
@@ -88,6 +88,10 @@ class CGCUDARuntime {
                                        ReturnValueSlot ReturnValue,
                                        llvm::CallBase **CallOrInvoke = nullptr);
 
+  virtual RValue EmitCUDADeviceKernelCallExpr(
+      CodeGenFunction &CGF, const CUDAKernelCallExpr *E,
+      ReturnValueSlot ReturnValue, llvm::CallBase **CallOrInvoke = nullptr);
+
   /// Emits a kernel launch stub.
   virtual void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) = 0;
diff --git a/clang/lib/CodeGen/CGExprCXX.cpp b/clang/lib/CodeGen/CGExprCXX.cpp
index f64cf9f8a6c2d..8a2c021b2210f 100644
--- a/clang/lib/CodeGen/CGExprCXX.cpp
+++ b/clang/lib/CodeGen/CGExprCXX.cpp
@@ -503,6 +503,12 @@ RValue CodeGenFunction::EmitCXXOperatorMemberCallExpr(
 RValue CodeGenFunction::EmitCUDAKernelCallExpr(const CUDAKernelCallExpr *E,
                                                ReturnValueSlot ReturnValue,
                                                llvm::CallBase **CallOrInvoke) {
+  auto *FD = E->getConfig()->getDirectCallee();
+  // Emit as a device kernel call if the config is a call to
+  // 'cudaLaunchDevice'.
+  if (FD && CGM.getContext().getcudaLaunchDeviceDecl() == FD)
+    return CGM.getCUDARuntime().EmitCUDADeviceKernelCallExpr(
+        *this, E, ReturnValue, CallOrInvoke);
   return CGM.getCUDARuntime().EmitCUDAKernelCallExpr(*this, E, ReturnValue,
                                                      CallOrInvoke);
 }
diff --git a/clang/lib/Sema/SemaCUDA.cpp b/clang/lib/Sema/SemaCUDA.cpp
index 31735a0f5feb3..dd9bcab56b083 100644
--- a/clang/lib/Sema/SemaCUDA.cpp
+++ b/clang/lib/Sema/SemaCUDA.cpp
@@ -52,16 +52,94 @@ bool SemaCUDA::PopForceHostDevice() {
 ExprResult SemaCUDA::ActOnExecConfigExpr(Scope *S, SourceLocation LLLLoc,
                                          MultiExprArg ExecConfig,
                                          SourceLocation GGGLoc) {
-  FunctionDecl *ConfigDecl = getASTContext().getcudaConfigureCallDecl();
+  bool IsDeviceKernelCall = false;
+  switch (CurrentTarget()) {
+  case CUDAFunctionTarget::Global:
+  case CUDAFunctionTarget::Device:
+    IsDeviceKernelCall = true;
+    break;
+  case CUDAFunctionTarget::HostDevice:
+    if (getLangOpts().CUDAIsDevice) {
+      IsDeviceKernelCall = true;
+      if (FunctionDecl *Caller =
+              SemaRef.getCurFunctionDecl(/*AllowLambda=*/true);
+          Caller && isImplicitHostDeviceFunction(Caller)) {
+        // During device compilation, a config call in an HD function is
+        // treated as a device kernel call. For implicit HD functions (such
+        // as lambdas), however, we also need to check whether RDC is
+        // enabled.
+        if (!getLangOpts().GPURelocatableDeviceCode)
+          IsDeviceKernelCall = false;
+        // HIP doesn't support device-side kernel calls yet, so still treat
+        // it as a host-side kernel call.
+        if (getLangOpts().HIP)
+          IsDeviceKernelCall = false;
+      }
+    }
+    break;
+  default:
+    break;
+  }
+
+  if (IsDeviceKernelCall && getLangOpts().HIP)
+    return ExprError(
+        Diag(LLLLoc, diag::err_cuda_device_kernel_launch_not_supported));
+
+  if (IsDeviceKernelCall && !getLangOpts().GPURelocatableDeviceCode)
+    return ExprError(
+        Diag(LLLLoc, diag::err_cuda_device_kernel_launch_require_rdc));
+
+  FunctionDecl *ConfigDecl = IsDeviceKernelCall
+                                 ? getASTContext().getcudaLaunchDeviceDecl()
+                                 : getASTContext().getcudaConfigureCallDecl();
   if (!ConfigDecl)
     return ExprError(Diag(LLLLoc, diag::err_undeclared_var_use)
-                     << getConfigureFuncName());
+                     << (IsDeviceKernelCall ? getLaunchDeviceFuncName()
+                                            : getConfigureFuncName()));
+
+  // Additional check on the launch function if it's a device kernel call.
+  if (IsDeviceKernelCall) {
+    auto *GetParamBuf = getASTContext().getcudaGetParameterBufferDecl();
+    if (!GetParamBuf)
+      return ExprError(Diag(LLLLoc, diag::err_undeclared_var_use)
+                       << getGetParameterBufferFuncName());
+  }
+
   QualType ConfigQTy = ConfigDecl->getType();
 
   DeclRefExpr *ConfigDR = new (getASTContext()) DeclRefExpr(
       getASTContext(), ConfigDecl, false, ConfigQTy, VK_LValue, LLLLoc);
   SemaRef.MarkFunctionReferenced(LLLLoc, ConfigDecl);
 
+  if (IsDeviceKernelCall) {
+    SmallVector<Expr *> Args;
+    // Use a null pointer as the kernel function, which may not be resolvable
+    // here; for example, resolving the kernel function may require the
+    // actual kernel arguments.
+    llvm::APInt Zero(SemaRef.Context.getTypeSize(SemaRef.Context.IntTy), 0);
+    Args.push_back(IntegerLiteral::Create(SemaRef.Context, Zero,
+                                          SemaRef.Context.IntTy, LLLLoc));
+    // Use a null pointer as the placeholder of the parameter buffer, which
+    // should be replaced with the actual allocation later, in the codegen.
+    Args.push_back(IntegerLiteral::Create(SemaRef.Context, Zero,
+                                          SemaRef.Context.IntTy, LLLLoc));
+    // Add the original config arguments.
+    llvm::append_range(Args, ExecConfig);
+    // Add the default blockDim if it's missing.
+    if (Args.size() < 4) {
+      llvm::APInt One(SemaRef.Context.getTypeSize(SemaRef.Context.IntTy), 1);
+      Args.push_back(IntegerLiteral::Create(SemaRef.Context, One,
+                                            SemaRef.Context.IntTy, LLLLoc));
+    }
+    // Add the default sharedMemSize if it's missing.
+    if (Args.size() < 5)
+      Args.push_back(IntegerLiteral::Create(SemaRef.Context, Zero,
+                                            SemaRef.Context.IntTy, LLLLoc));
+    // Add the default stream if it's missing.
+    if (Args.size() < 6)
+      Args.push_back(new (SemaRef.Context) CXXNullPtrLiteralExpr(
+          SemaRef.Context.NullPtrTy, LLLLoc));
+    return SemaRef.BuildCallExpr(S, ConfigDR, LLLLoc, Args, GGGLoc, nullptr,
+                                 /*IsExecConfig=*/true);
+  }
+
   return SemaRef.BuildCallExpr(S, ConfigDR, LLLLoc, ExecConfig, GGGLoc, nullptr,
                                /*IsExecConfig=*/true);
 }
@@ -246,12 +324,12 @@ SemaCUDA::IdentifyPreference(const FunctionDecl *Caller,
       CalleeTarget == CUDAFunctionTarget::InvalidTarget)
     return CFP_Never;
 
-  // (a) Can't call global from some contexts until we support CUDA's
-  // dynamic parallelism.
+  // (a) Calling a global function from a global or device context is allowed
+  // as part of CUDA's dynamic parallelism support.
   if (CalleeTarget == CUDAFunctionTarget::Global &&
       (CallerTarget == CUDAFunctionTarget::Global ||
        CallerTarget == CUDAFunctionTarget::Device))
-    return CFP_Never;
+    return CFP_Native;
 
   // (b) Calling HostDevice is OK for everyone.
   if (CalleeTarget == CUDAFunctionTarget::HostDevice)
@@ -279,7 +357,8 @@ SemaCUDA::IdentifyPreference(const FunctionDecl *Caller,
   if (CallerTarget == CUDAFunctionTarget::HostDevice) {
     // It's OK to call a compilation-mode matching function from an HD one.
     if ((getLangOpts().CUDAIsDevice &&
-         CalleeTarget == CUDAFunctionTarget::Device) ||
+         (CalleeTarget == CUDAFunctionTarget::Device ||
+          CalleeTarget == CUDAFunctionTarget::Global)) ||
         (!getLangOpts().CUDAIsDevice &&
          (CalleeTarget == CUDAFunctionTarget::Host ||
           CalleeTarget == CUDAFunctionTarget::Global)))
@@ -1103,6 +1182,14 @@ std::string SemaCUDA::getConfigureFuncName() const {
   return "cudaConfigureCall";
 }
 
+std::string SemaCUDA::getGetParameterBufferFuncName() const {
+  return "cudaGetParameterBuffer";
+}
+
+std::string SemaCUDA::getLaunchDeviceFuncName() const {
+  return "cudaLaunchDevice";
+}
+
 // Record any local constexpr variables that are passed one way on the host
 // and another on the device.
 void SemaCUDA::recordPotentialODRUsedVariable(
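The argument-defaulting above means a device-side config with fewer than four entries is completed before the call is rebuilt against cudaLaunchDevice. A sketch of the resulting equivalence (illustrative kernel, assuming -fgpu-rdc):

  __global__ void k(int v);

  __device__ void caller() {
    // Sema rewrites both of these into the same cudaLaunchDevice config:
    // sharedMem defaults to 0 and the stream to nullptr here; a missing
    // blockDim would likewise default to 1.
    k<<<128, 32>>>(1);
    k<<<128, 32, 0, nullptr>>>(1);
  }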
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index 25b89d65847ad..4954c0864b91b 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -11050,14 +11050,30 @@ Sema::ActOnFunctionDeclarator(Scope *S, Declarator &D, DeclContext *DC,
   }
 
   if (getLangOpts().CUDA) {
-    IdentifierInfo *II = NewFD->getIdentifier();
-    if (II && II->isStr(CUDA().getConfigureFuncName()) &&
-        !NewFD->isInvalidDecl() &&
-        NewFD->getDeclContext()->getRedeclContext()->isTranslationUnit()) {
-      if (!R->castAs<FunctionType>()->getReturnType()->isScalarType())
-        Diag(NewFD->getLocation(), diag::err_config_scalar_return)
-            << CUDA().getConfigureFuncName();
-      Context.setcudaConfigureCallDecl(NewFD);
+    if (IdentifierInfo *II = NewFD->getIdentifier()) {
+      if (II->isStr(CUDA().getConfigureFuncName()) && !NewFD->isInvalidDecl() &&
+          NewFD->getDeclContext()->getRedeclContext()->isTranslationUnit()) {
+        if (!R->castAs<FunctionType>()->getReturnType()->isScalarType())
+          Diag(NewFD->getLocation(), diag::err_config_scalar_return)
+              << CUDA().getConfigureFuncName();
+        Context.setcudaConfigureCallDecl(NewFD);
+      }
+      if (II->isStr(CUDA().getGetParameterBufferFuncName()) &&
+          !NewFD->isInvalidDecl() &&
+          NewFD->getDeclContext()->getRedeclContext()->isTranslationUnit()) {
+        if (!R->castAs<FunctionType>()->getReturnType()->isPointerType())
+          Diag(NewFD->getLocation(), diag::err_config_pointer_return)
+              << CUDA().getGetParameterBufferFuncName();
+        Context.setcudaGetParameterBufferDecl(NewFD);
+      }
+      if (II->isStr(CUDA().getLaunchDeviceFuncName()) &&
+          !NewFD->isInvalidDecl() &&
+          NewFD->getDeclContext()->getRedeclContext()->isTranslationUnit()) {
+        if (!R->castAs<FunctionType>()->getReturnType()->isScalarType())
+          Diag(NewFD->getLocation(), diag::err_config_scalar_return)
+              << CUDA().getLaunchDeviceFuncName();
+        Context.setcudaLaunchDeviceDecl(NewFD);
+      }
     }
   }
diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp
index 55c52154c4113..5c82cafc49177 100644
--- a/clang/lib/Serialization/ASTReader.cpp
+++ b/clang/lib/Serialization/ASTReader.cpp
@@ -5580,9 +5580,13 @@ void ASTReader::InitializeContext() {
 
   // If there were any CUDA special declarations, deserialize them.
   if (!CUDASpecialDeclRefs.empty()) {
-    assert(CUDASpecialDeclRefs.size() == 1 && "More decl refs than expected!");
+    assert(CUDASpecialDeclRefs.size() == 3 &&
+           "Unexpected number of CUDA special decl refs!");
     Context.setcudaConfigureCallDecl(
-        cast<FunctionDecl>(GetDecl(CUDASpecialDeclRefs[0])));
+        cast_or_null<FunctionDecl>(GetDecl(CUDASpecialDeclRefs[0])));
+    Context.setcudaGetParameterBufferDecl(
+        cast_or_null<FunctionDecl>(GetDecl(CUDASpecialDeclRefs[1])));
+    Context.setcudaLaunchDeviceDecl(
+        cast_or_null<FunctionDecl>(GetDecl(CUDASpecialDeclRefs[2])));
   }
 
   // Re-export any modules that were imported by a non-module AST file.
diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp
index 547497cbd87d9..1871e48df35ff 100644
--- a/clang/lib/Serialization/ASTWriter.cpp
+++ b/clang/lib/Serialization/ASTWriter.cpp
@@ -5706,8 +5706,13 @@ void ASTWriter::PrepareWritingSpecialDecls(Sema &SemaRef) {
     GetDeclRef(SemaRef.getStdAlignValT());
   }
 
-  if (Context.getcudaConfigureCallDecl())
+  if (Context.getcudaConfigureCallDecl() ||
+      Context.getcudaGetParameterBufferDecl() ||
+      Context.getcudaLaunchDeviceDecl()) {
     GetDeclRef(Context.getcudaConfigureCallDecl());
+    GetDeclRef(Context.getcudaGetParameterBufferDecl());
+    GetDeclRef(Context.getcudaLaunchDeviceDecl());
+  }
 
   // Writing all of the known namespaces.
   for (const auto &I : SemaRef.KnownNamespaces)
@@ -5834,19 +5839,19 @@ void ASTWriter::WriteSpecialDeclRecords(Sema &SemaRef) {
     Stream.EmitRecord(PENDING_IMPLICIT_INSTANTIATIONS, PendingInstantiations);
   }
 
+  auto AddEmittedDeclRefOrZero = [this](RecordData &Refs, Decl *D) {
+    if (!D || !wasDeclEmitted(D))
+      Refs.push_back(0);
+    else
+      AddDeclRef(D, Refs);
+  };
+
   // Write the record containing declaration references of Sema.
   RecordData SemaDeclRefs;
   if (SemaRef.StdNamespace || SemaRef.StdBadAlloc || SemaRef.StdAlignValT) {
-    auto AddEmittedDeclRefOrZero = [this, &SemaDeclRefs](Decl *D) {
-      if (!D || !wasDeclEmitted(D))
-        SemaDeclRefs.push_back(0);
-      else
-        AddDeclRef(D, SemaDeclRefs);
-    };
-
-    AddEmittedDeclRefOrZero(SemaRef.getStdNamespace());
-    AddEmittedDeclRefOrZero(SemaRef.getStdBadAlloc());
-    AddEmittedDeclRefOrZero(SemaRef.getStdAlignValT());
+    AddEmittedDeclRefOrZero(SemaDeclRefs, SemaRef.getStdNamespace());
+    AddEmittedDeclRefOrZero(SemaDeclRefs, SemaRef.getStdBadAlloc());
+    AddEmittedDeclRefOrZero(SemaDeclRefs, SemaRef.getStdAlignValT());
   }
   if (!SemaDeclRefs.empty())
     Stream.EmitRecord(SEMA_DECL_REFS, SemaDeclRefs);
@@ -5862,9 +5867,13 @@ void ASTWriter::WriteSpecialDeclRecords(Sema &SemaRef) {
 
   // Write the record containing CUDA-specific declaration references.
   RecordData CUDASpecialDeclRefs;
-  if (auto *CudaCallDecl = Context.getcudaConfigureCallDecl();
-      CudaCallDecl && wasDeclEmitted(CudaCallDecl)) {
-    AddDeclRef(CudaCallDecl, CUDASpecialDeclRefs);
+  if (auto *CudaCallDecl = Context.getcudaConfigureCallDecl(),
+       *CudaGetParamDecl = Context.getcudaGetParameterBufferDecl(),
+       *CudaLaunchDecl = Context.getcudaLaunchDeviceDecl();
+      CudaCallDecl || CudaGetParamDecl || CudaLaunchDecl) {
+    AddEmittedDeclRefOrZero(CUDASpecialDeclRefs, CudaCallDecl);
+    AddEmittedDeclRefOrZero(CUDASpecialDeclRefs, CudaGetParamDecl);
+    AddEmittedDeclRefOrZero(CUDASpecialDeclRefs, CudaLaunchDecl);
     Stream.EmitRecord(CUDA_SPECIAL_DECL_REFS, CUDASpecialDeclRefs);
   }
diff --git a/clang/test/CodeGenCUDA/Inputs/cuda.h b/clang/test/CodeGenCUDA/Inputs/cuda.h
index e7ad784335027..421fa4dd7dbae 100644
--- a/clang/test/CodeGenCUDA/Inputs/cuda.h
+++ b/clang/test/CodeGenCUDA/Inputs/cuda.h
@@ -72,7 +72,13 @@ extern "C" cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim,
 extern "C" cudaError_t cudaLaunchKernel_ptsz(const void *func, dim3 gridDim,
                                              dim3 blockDim, void **args,
                                              size_t sharedMem, cudaStream_t stream);
-
+extern "C" __device__ cudaError_t cudaLaunchDevice(void *func,
+                                                   void *parameterBuffer,
+                                                   dim3 gridDim, dim3 blockDim,
+                                                   unsigned int sharedMem,
+                                                   cudaStream_t stream);
+extern "C" __device__ void *cudaGetParameterBuffer(size_t alignment,
+                                                   size_t size);
 #endif
 
 extern "C" __device__ int printf(const char*, ...);
diff --git a/clang/test/CodeGenCUDA/device-kernel-call.cu b/clang/test/CodeGenCUDA/device-kernel-call.cu
new file mode 100644
index 0000000000000..eff2b37bd298d
--- /dev/null
+++ b/clang/test/CodeGenCUDA/device-kernel-call.cu
@@ -0,0 +1,35 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6
+// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fcuda-is-device -fgpu-rdc -emit-llvm %s -o - | FileCheck %s
+
+#include "Inputs/cuda.h"
+
+// CHECK-LABEL: define dso_local ptx_kernel void @_Z2g2i(
+// CHECK-SAME: i32 noundef [[X:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[X_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT: store i32 [[X]], ptr [[X_ADDR]], align 4
+// CHECK-NEXT: ret void
+//
+__global__ void g2(int x) {}
+
+// CHECK-LABEL: define dso_local ptx_kernel void @_Z2g1v(
+// CHECK-SAME: ) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[AGG_TMP:%.*]] = alloca [[STRUCT_DIM3:%.*]], align 4
+// CHECK-NEXT: [[AGG_TMP1:%.*]] = alloca [[STRUCT_DIM3]], align 4
+// CHECK-NEXT: [[CALL:%.*]] = call ptr @cudaGetParameterBuffer(i64 noundef 64, i64 noundef 4) #[[ATTR3:[0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = icmp ne ptr [[CALL]], null
+// CHECK-NEXT: br i1 [[TMP0]], label %[[DKCALL_CONFIGOK:.*]], label %[[DKCALL_END:.*]]
+// CHECK: [[DKCALL_CONFIGOK]]:
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[CALL]], i64 0
+// CHECK-NEXT: store i32 42, ptr [[TMP1]], align 64
+// CHECK-NEXT: call void @_ZN4dim3C1Ejjj(ptr noundef nonnull align 4 dereferenceable(12) [[AGG_TMP]], i32 noundef 1, i32 noundef 1, i32 noundef 1) #[[ATTR3]]
+// CHECK-NEXT: call void @_ZN4dim3C1Ejjj(ptr noundef nonnull align 4 dereferenceable(12) [[AGG_TMP1]], i32 noundef 1, i32 noundef 1, i32 noundef 1) #[[ATTR3]]
+// CHECK-NEXT: [[CALL2:%.*]] = call i32 @cudaLaunchDevice(ptr noundef @_Z2g2i, ptr noundef [[CALL]], ptr noundef byval([[STRUCT_DIM3]]) align 4 [[AGG_TMP]], ptr noundef byval([[STRUCT_DIM3]]) align 4 [[AGG_TMP1]], i32 noundef 0, ptr noundef null) #[[ATTR3]]
+// CHECK-NEXT: br label %[[DKCALL_END]]
+// CHECK: [[DKCALL_END]]:
+// CHECK-NEXT: ret void
+//
+__global__ void g1(void) {
+  g2<<<1, 1>>>(42);
+}
diff --git a/clang/test/SemaCUDA/Inputs/cuda.h b/clang/test/SemaCUDA/Inputs/cuda.h
index 2bf45e03d91c7..de6f7fb635421 100644
--- a/clang/test/SemaCUDA/Inputs/cuda.h
+++ b/clang/test/SemaCUDA/Inputs/cuda.h
@@ -46,6 +46,13 @@ extern "C" int __cudaPushCallConfiguration(dim3 gridSize, dim3 blockSize,
 extern "C" cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim,
                                         dim3 blockDim, void **args,
                                         size_t sharedMem, cudaStream_t stream);
+extern "C" __device__ cudaError_t cudaLaunchDevice(void *func,
+                                                   void *parameterBuffer,
+                                                   dim3 gridDim, dim3 blockDim,
+                                                   unsigned int sharedMem,
+                                                   cudaStream_t stream);
+extern "C" __device__ void *cudaGetParameterBuffer(size_t alignment,
+                                                   size_t size);
 #endif
 
 // Host- and device-side placement new overloads.
diff --git a/clang/test/SemaCUDA/call-kernel-from-kernel.cu b/clang/test/SemaCUDA/call-kernel-from-kernel.cu
index 5f8832f3cd070..01dba44339520 100644
--- a/clang/test/SemaCUDA/call-kernel-from-kernel.cu
+++ b/clang/test/SemaCUDA/call-kernel-from-kernel.cu
@@ -1,9 +1,12 @@
 // RUN: %clang_cc1 %s --std=c++11 -triple nvptx -o - \
 // RUN:   -verify -fcuda-is-device -fsyntax-only -verify-ignore-unexpected=note
+// RUN: %clang_cc1 %s --std=c++11 -fgpu-rdc -triple nvptx -o - \
+// RUN:   -verify=rdc -fcuda-is-device -fsyntax-only -verify-ignore-unexpected=note
+// rdc-no-diagnostics
 
 #include "Inputs/cuda.h"
 
 __global__ void kernel1();
 __global__ void kernel2() {
-  kernel1<<<1,1>>>(); // expected-error {{reference to __global__ function 'kernel1' in __global__ function}}
+  kernel1<<<1,1>>>(); // expected-error {{kernel launch from __device__ or __global__ function requires relocatable device code (i.e. requires -fgpu-rdc)}}
 }
diff --git a/clang/test/SemaCUDA/device-kernel-call.cu b/clang/test/SemaCUDA/device-kernel-call.cu
new file mode 100644
index 0000000000000..fea6deac02e55
--- /dev/null
+++ b/clang/test/SemaCUDA/device-kernel-call.cu
@@ -0,0 +1,23 @@
+// RUN: %clang_cc1 -fcuda-is-device -verify=nordc %s
+// RUN: %clang_cc1 -fcuda-is-device -fgpu-rdc -verify=rdc %s
+// RUN: %clang_cc1 -x hip -fcuda-is-device -verify=hip %s
+
+// rdc-no-diagnostics
+
+#include "Inputs/cuda.h"
+
+__global__ void g2(int x) {}
+
+// CHECK-LABEL: define{{.*}}g1
+__global__ void g1(void) {
+  // CHECK: [[CONFIG:%.*]] = call{{.*}}_Z22cudaGetParameterBuffermm(i64{{.*}}64, i64{{.*}}4)
+  // CHECK-NEXT: [[FLAG:%.*]] = icmp ne ptr [[CONFIG]], null
+  // CHECK-NEXT: br i1 [[FLAG]], label %[[THEN:.*]], label %[[ENDIF:.*]]
+  // CHECK: [[THEN]]:
+  // CHECK-NEXT: [[PPTR:%.*]] = getelementptr{{.*}}i8, ptr [[CONFIG]], i64 0
+  // CHECK-NEXT: store i32 42, ptr [[PPTR]]
+  // CHECK: = call{{.*}} i32 @_Z16cudaLaunchDevicePvS_4dim3S0_jP10cudaStream(ptr{{.*}} @_Z2g2i, ptr{{.*}} [[CONFIG]],
+  g2<<<1, 1>>>(42);
+  // nordc-error@-1 {{kernel launch from __device__ or __global__ function requires relocatable device code (i.e. requires -fgpu-rdc)}}
+  // hip-error@-2 {{device-side kernel call/launch is not supported}}
+}
diff --git a/clang/test/SemaCUDA/function-overload.cu b/clang/test/SemaCUDA/function-overload.cu
index 3d05839af7528..11f84a912ea7b 100644
--- a/clang/test/SemaCUDA/function-overload.cu
+++ b/clang/test/SemaCUDA/function-overload.cu
@@ -91,10 +91,7 @@ __host__ HostReturnTy h() { return HostReturnTy(); }
 // devdefer-note@-4 1+ {{candidate function not viable: call to __host__ function from __global__ function}}
 
 __global__ void g() {}
-// dev-note@-1 1+ {{'g' declared here}}
-// devdefer-note@-2 1+ {{candidate function not viable: call to __global__ function from __device__ function}}
 // expected-note@-3 0+ {{candidate function not viable: call to __global__ function from __host__ __device__ function}}
-// devdefer-note@-4 1+ {{candidate function not viable: call to __global__ function from __global__ function}}
 
 extern "C" __device__ DeviceReturnTy cd() { return DeviceReturnTy(); }
 // host-note@-1 1+ {{'cd' declared here}}
@@ -144,9 +141,9 @@ __device__ void devicef() {
   DeviceFnPtr fp_cdh = cdh;
   DeviceReturnTy ret_cdh = cdh();
 
-  GlobalFnPtr fp_g = g; // dev-error {{reference to __global__ function 'g' in __device__ function}}
-  g(); // devdefer-error {{no matching function for call to 'g'}}
-  g<<<0,0>>>(); // dev-error {{reference to __global__ function 'g' in __device__ function}}
+  GlobalFnPtr fp_g = g;
+  g(); // expected-error {{call to global function 'g' not configured}}
+  g<<<0,0>>>(); // expected-error {{kernel launch from __device__ or __global__ function requires relocatable device code (i.e. requires -fgpu-rdc)}}
 }
 
 __global__ void globalf() {
@@ -165,9 +162,9 @@ __global__ void globalf() {
   DeviceFnPtr fp_cdh = cdh;
   DeviceReturnTy ret_cdh = cdh();
 
-  GlobalFnPtr fp_g = g; // dev-error {{reference to __global__ function 'g' in __global__ function}}
-  g(); // devdefer-error {{no matching function for call to 'g'}}
-  g<<<0,0>>>(); // dev-error {{reference to __global__ function 'g' in __global__ function}}
+  GlobalFnPtr fp_g = g;
+  g(); // expected-error {{call to global function 'g' not configured}}
+  g<<<0,0>>>(); // expected-error {{kernel launch from __device__ or __global__ function requires relocatable device code (i.e. requires -fgpu-rdc)}}
 }
 
 __host__ __device__ void hostdevicef() {
@@ -199,20 +196,13 @@ __host__ __device__ void hostdevicef() {
   CurrentReturnTy ret_cdh = cdh();
 
   GlobalFnPtr fp_g = g;
-#if defined(__CUDA_ARCH__)
-  // expected-error@-2 {{reference to __global__ function 'g' in __host__ __device__ function}}
-#endif
   g();
-#if defined (__CUDA_ARCH__)
-  // expected-error@-2 {{reference to __global__ function 'g' in __host__ __device__ function}}
-#else
-  // expected-error@-4 {{call to global function 'g' not configured}}
-#endif
+  // expected-error@-1 {{call to global function 'g' not configured}}
 
   g<<<0,0>>>();
 #if defined(__CUDA_ARCH__)
-  // expected-error@-2 {{reference to __global__ function 'g' in __host__ __device__ function}}
+  // expected-error@-2 {{kernel launch from __device__ or __global__ function requires relocatable device code (i.e. requires -fgpu-rdc)}}
 #endif
 }
diff --git a/clang/test/SemaCUDA/function-target.cu b/clang/test/SemaCUDA/function-target.cu
index 64444b6676248..66704a320cee1 100644
--- a/clang/test/SemaCUDA/function-target.cu
+++ b/clang/test/SemaCUDA/function-target.cu
@@ -24,11 +24,11 @@ __host__ void h1(void) {
 __host__ void d1h(void); // expected-note {{candidate function not viable: call to __host__ function from __device__ function}}
 __device__ void d1d(void);
 __host__ __device__ void d1hd(void);
-__global__ void d1g(void); // dev-note {{'d1g' declared here}}
+__global__ void d1g(void);
 
 __device__ void d1(void) {
   d1h(); // expected-error {{no matching function}}
   d1d();
   d1hd();
-  d1g<<<1, 1>>>(); // dev-error {{reference to __global__ function 'd1g' in __device__ function}}
+  d1g<<<1, 1>>>(); // expected-error {{kernel launch from __device__ or __global__ function requires relocatable device code (i.e. requires -fgpu-rdc)}}
 }
diff --git a/clang/test/SemaCUDA/reference-to-kernel-fn.cu b/clang/test/SemaCUDA/reference-to-kernel-fn.cu
index 70a1cda6ab0c8..bdb70fc8b55d1 100644
--- a/clang/test/SemaCUDA/reference-to-kernel-fn.cu
+++ b/clang/test/SemaCUDA/reference-to-kernel-fn.cu
@@ -8,6 +8,7 @@
 //   device-side kernel launches.)
 
 // host-no-diagnostics
+// dev-no-diagnostics
 
 #include "Inputs/cuda.h"
 
@@ -19,11 +20,10 @@ typedef void (*fn_ptr_t)();
 
 __host__ __device__ fn_ptr_t get_ptr_hd() {
   return kernel;
-  // dev-error@-1 {{reference to __global__ function}}
 }
 __host__ fn_ptr_t get_ptr_h() { return kernel; }
 
 __device__ fn_ptr_t get_ptr_d() {
-  return kernel; // dev-error {{reference to __global__ function}}
+  return kernel;
 }
diff --git a/clang/tools/clang-nvlink-wrapper/ClangNVLinkWrapper.cpp b/clang/tools/clang-nvlink-wrapper/ClangNVLinkWrapper.cpp
index 58eb671c61989..07fa67f9b956c 100644
--- a/clang/tools/clang-nvlink-wrapper/ClangNVLinkWrapper.cpp
+++ b/clang/tools/clang-nvlink-wrapper/ClangNVLinkWrapper.cpp
@@ -16,6 +16,7 @@
 #include "clang/Basic/Version.h"
 
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/BinaryFormat/Magic.h"
 #include "llvm/Bitcode/BitcodeWriter.h"
@@ -165,6 +166,41 @@ void diagnosticHandler(const DiagnosticInfo &DI) {
   }
 }
 
+// Check if the archive has any object file with a fat binary.
+bool hasFatBinary(const Archive &LibFile) {
+  Error Err = Error::success();
+  for (auto &C : LibFile.children(Err)) {
+    auto ChildBufferOrErr = C.getMemoryBufferRef();
+    if (!ChildBufferOrErr)
+      return false;
+    auto ObjFileOrErr = ObjectFile::createObjectFile(*ChildBufferOrErr);
+    if (!ObjFileOrErr)
+      return false;
+    const auto &Obj = **ObjFileOrErr;
+    // Skip device object files.
+    if (Obj.getArch() == Triple::nvptx || Obj.getArch() == Triple::nvptx64)
+      return false;
+    // For host object files, search for the fat binary section.
+    for (const auto &Sec : Obj.sections()) {
+      auto NameOrErr = Sec.getName();
+      if (!NameOrErr)
+        continue;
+      // Search for fat binary sections.
+      if (*NameOrErr != "__nv_relfatbin" && *NameOrErr != ".nv_fatbin")
+        continue;
+      auto ContentOrErr = Sec.getContents();
+      if (!ContentOrErr ||
+          identify_magic(*ContentOrErr) != file_magic::cuda_fatbinary)
+        continue;
+      return true;
+    }
+  }
+  // Consume Err to ensure it's checked.
+  if (Err)
+    return false;
+  return false;
+}
+
 Expected<StringRef> createTempFile(const ArgList &Args, const Twine &Prefix,
                                    StringRef Extension) {
   SmallString<128> OutputFile;
@@ -487,6 +523,9 @@ Expected<SmallVector<StringRef>> getInput(const ArgList &Args) {
   for (const opt::Arg *Arg : Args.filtered(OPT_library_path))
     LibraryPaths.push_back(Arg->getValue());
 
+  // Archives (with fat binaries) forwarded to nvlink.
+  SmallVector<const char *> ForwardArchives;
+
   bool WholeArchive = false;
   SmallVector<std::pair<std::unique_ptr<MemoryBuffer>, bool>> InputFiles;
   for (const opt::Arg *Arg : Args.filtered(
@@ -525,6 +564,11 @@ Expected<SmallVector<StringRef>> getInput(const ArgList &Args) {
           object::Archive::create(Buffer);
       if (!LibFile)
         return LibFile.takeError();
+      // Skip extracting archives with fat binaries. Forward them to nvlink.
+      if (hasFatBinary(**LibFile)) {
+        ForwardArchives.emplace_back(Args.MakeArgString(*Filename));
+        break;
+      }
       Error Err = Error::success();
       for (auto Child : (*LibFile)->children(Err)) {
         auto ChildBufferOrErr = Child.getMemoryBufferRef();
@@ -687,6 +731,8 @@ Expected<SmallVector<StringRef>> getInput(const ArgList &Args) {
       return E;
     Files.emplace_back(Args.MakeArgString(*TempFileOrErr));
   }
+  // Append archives to be forwarded.
+  append_range(Files, ForwardArchives);
 
   return Files;
 }
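To summarize the user-visible behavior exercised by the Sema tests above (a sketch; the diagnostics are quoted from this patch):

  __global__ void child();

  __global__ void parent() {
    child<<<1, 1>>>();
    // CUDA without -fgpu-rdc: error: kernel launch from __device__ or
    //   __global__ function requires relocatable device code (i.e. requires
    //   -fgpu-rdc)
    // CUDA with -fgpu-rdc: accepted and lowered to cudaGetParameterBuffer +
    //   cudaLaunchDevice; the final link additionally needs -lcudadevrt.
    // HIP: error: device-side kernel call/launch is not supported
  }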
