cdevadas created this revision. cdevadas added reviewers: rjmccall, Anastasia, yaxunl, arsenm. Herald added subscribers: Naghasan, ldrumm, kerbowa, t-tye, tpr, dstuttard, jvesely, kzhuravl. cdevadas requested review of this revision. Herald added subscribers: cfe-commits, wdng. Herald added a project: clang.
In OpenCL, a kernel is allowed to call other kernels as if they were regular functions. To support this, clang emits the amdgpu_kernel calling convention for both the caller and the callee. A backend pass in our downstream compiler alters such calls by introducing regular function bodies that are clones of the callee kernels. This implementation currently limits us in certain ways; for instance, it imposes the restriction that the byref attribute cannot be used for callee kernels. To avoid such limitations, this patch brings in those cloned functions early on and prevents clang from generating amdgpu_kernel call sites. A new function body will be added for each kernel in the compilation unit, with the expectation that the unused clones will be removed at link time. Repository: rG LLVM Github Monorepo https://reviews.llvm.org/D120566 Files: clang/lib/CodeGen/CodeGenModule.cpp clang/lib/CodeGen/TargetInfo.cpp clang/lib/CodeGen/TargetInfo.h clang/test/CodeGenOpenCL/amdgpu-kernel-calls.cl clang/test/CodeGenOpenCL/visibility.cl
Index: clang/test/CodeGenOpenCL/visibility.cl =================================================================== --- clang/test/CodeGenOpenCL/visibility.cl +++ clang/test/CodeGenOpenCL/visibility.cl @@ -94,23 +94,6 @@ ext_func_default(); } -// FVIS-DEFAULT: declare amdgpu_kernel void @ext_kern() -// FVIS-PROTECTED: declare protected amdgpu_kernel void @ext_kern() -// FVIS-HIDDEN: declare protected amdgpu_kernel void @ext_kern() - -// FVIS-DEFAULT: declare protected amdgpu_kernel void @ext_kern_hidden() -// FVIS-PROTECTED: declare protected amdgpu_kernel void @ext_kern_hidden() -// FVIS-HIDDEN: declare protected amdgpu_kernel void @ext_kern_hidden() - -// FVIS-DEFAULT: declare protected amdgpu_kernel void @ext_kern_protected() -// FVIS-PROTECTED: declare protected amdgpu_kernel void @ext_kern_protected() -// FVIS-HIDDEN: declare protected amdgpu_kernel void @ext_kern_protected() - -// FVIS-DEFAULT: declare amdgpu_kernel void @ext_kern_default() -// FVIS-PROTECTED: declare amdgpu_kernel void @ext_kern_default() -// FVIS-HIDDEN: declare amdgpu_kernel void @ext_kern_default() - - // FVIS-DEFAULT: declare void @ext_func() // FVIS-PROTECTED: declare protected void @ext_func() // FVIS-HIDDEN: declare hidden void @ext_func() @@ -126,3 +109,21 @@ // FVIS-DEFAULT: declare void @ext_func_default() // FVIS-PROTECTED: declare void @ext_func_default() // FVIS-HIDDEN: declare void @ext_func_default() + +// A kernel call will be emitted as a call to its cloned function +// of non-kernel convention. 
+// FVIS-DEFAULT: declare void @__amdgpu_ext_kern_kernel_body() +// FVIS-PROTECTED: declare void @__amdgpu_ext_kern_kernel_body() +// FVIS-HIDDEN: declare void @__amdgpu_ext_kern_kernel_body() + +// FVIS-DEFAULT: declare void @__amdgpu_ext_kern_hidden_kernel_body() +// FVIS-PROTECTED: declare void @__amdgpu_ext_kern_hidden_kernel_body() +// FVIS-HIDDEN: declare void @__amdgpu_ext_kern_hidden_kernel_body() + +// FVIS-DEFAULT: declare void @__amdgpu_ext_kern_protected_kernel_body() +// FVIS-PROTECTED: declare void @__amdgpu_ext_kern_protected_kernel_body() +// FVIS-HIDDEN: declare void @__amdgpu_ext_kern_protected_kernel_body() + +// FVIS-DEFAULT: declare void @__amdgpu_ext_kern_default_kernel_body() +// FVIS-PROTECTED: declare void @__amdgpu_ext_kern_default_kernel_body() +// FVIS-HIDDEN: declare void @__amdgpu_ext_kern_default_kernel_body() Index: clang/test/CodeGenOpenCL/amdgpu-kernel-calls.cl =================================================================== --- /dev/null +++ clang/test/CodeGenOpenCL/amdgpu-kernel-calls.cl @@ -0,0 +1,60 @@ +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -S -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s + +// AMDGPU disallows kernel callsites from another kernels. For each kernel, clang codegen will introduce +// a cloned function body with a non-kernel calling convention and amdgpu_kernel callsites will get +// transformed to call appropriate clones. 
+ +extern kernel void test_extern_kernel_callee(global int *in); + +// CHECK: define dso_local amdgpu_kernel void @test_kernel_callee(i32 addrspace(1)* noundef align 4 %in) +// CHECK-NEXT: entry: +// CHECK-NEXT: [[IN_ADDR:%.*]] = alloca i32 addrspace(1)*, align 8, addrspace(5) +// CHECK-NEXT: store i32 addrspace(1)* [[IN:%.*]], i32 addrspace(1)* addrspace(5)* [[IN_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(5)* [[IN_ADDR]], align 8 +// CHECK-NEXT: store i32 10, i32 addrspace(1)* [[TMP0]], align 4 +// CHECK-NEXT: ret void +// +kernel void test_kernel_callee(global int *in) { + *in = (int)(10); +} + +// CHECK: define dso_local amdgpu_kernel void @test_kernel_caller(i32 addrspace(1)* noundef align 4 %in) +// CHECK-NEXT: entry: +// CHECK-NEXT: [[IN_ADDR:%.*]] = alloca i32 addrspace(1)*, align 8, addrspace(5) +// CHECK-NEXT: store i32 addrspace(1)* [[IN:%.*]], i32 addrspace(1)* addrspace(5)* [[IN_ADDR]], align 8 +// CHECK-NEXT: %{{.*}} = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(5)* [[IN_ADDR]], align 8 +// CHECK-NEXT: call void @__amdgpu_test_kernel_callee_kernel_body( +// CHECK-NOT: call amdgpu_kernel void @test_kernel_callee( +// CHECK-NEXT: %{{.*}} = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(5)* [[IN_ADDR]], align 8 +// CHECK-NEXT: call void @__amdgpu_test_extern_kernel_callee_kernel_body( +// CHECK-NOT: call amdgpu_kernel void @test_kernel_callee( +// CHECK-NEXT: ret void +// +kernel void test_kernel_caller(global int *in) { + test_kernel_callee(in); + test_extern_kernel_callee(in); +} + +// CHECK: declare amdgpu_kernel void @test_extern_kernel_callee(i32 addrspace(1)* noundef align 4) + +// CHECK: define dso_local void @__amdgpu_test_kernel_callee_kernel_body(i32 addrspace(1)* noundef align 4 %in) +// CHECK-NEXT: entry: +// CHECK-NEXT: [[IN_ADDR:%.*]] = alloca i32 addrspace(1)*, align 8, addrspace(5) +// CHECK-NEXT: store i32 addrspace(1)* %in, i32 addrspace(1)* addrspace(5)* [[IN_ADDR]], 
align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(5)* [[IN_ADDR]], align 8 +// CHECK-NEXT: store i32 10, i32 addrspace(1)* [[TMP0]], align 4 +// CHECK-NEXT: ret void + +// CHECK: define dso_local void @__amdgpu_test_kernel_caller_kernel_body(i32 addrspace(1)* noundef align 4 %in) +// CHECK-NEXT: entry: +// CHECK-NEXT: [[IN_ADDR:%.*]] = alloca i32 addrspace(1)*, align 8, addrspace(5) +// CHECK-NEXT: store i32 addrspace(1)* [[IN:%.*]], i32 addrspace(1)* addrspace(5)* [[IN_ADDR]], align 8 +// CHECK-NEXT: %{{.*}} = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(5)* [[IN_ADDR]], align 8 +// CHECK-NEXT: call void @__amdgpu_test_kernel_callee_kernel_body( +// CHECK-NEXT: %{{.*}} = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(5)* [[IN_ADDR]], align 8 +// CHECK-NEXT: call void @__amdgpu_test_extern_kernel_callee_kernel_body( +// CHECK-NEXT: ret void +// + +// CHECK: declare void @__amdgpu_test_extern_kernel_callee_kernel_body(i32 addrspace(1)*) Index: clang/lib/CodeGen/TargetInfo.h =================================================================== --- clang/lib/CodeGen/TargetInfo.h +++ clang/lib/CodeGen/TargetInfo.h @@ -247,6 +247,10 @@ llvm::StringRef Value, llvm::SmallString<32> &Opt) const {} + /// Clean up and other special handling at the end when all functions are + /// codegenerated. + virtual void finalizeModule(llvm::Module &M) const {} + /// Get LLVM calling convention for OpenCL kernel. 
virtual unsigned getOpenCLKernelCallingConv() const; Index: clang/lib/CodeGen/TargetInfo.cpp =================================================================== --- clang/lib/CodeGen/TargetInfo.cpp +++ clang/lib/CodeGen/TargetInfo.cpp @@ -19,9 +19,9 @@ #include "CodeGenFunction.h" #include "clang/AST/Attr.h" #include "clang/AST/RecordLayout.h" +#include "clang/Basic/Builtins.h" #include "clang/Basic/CodeGenOptions.h" #include "clang/Basic/DiagnosticFrontend.h" -#include "clang/Basic/Builtins.h" #include "clang/CodeGen/CGFunctionInfo.h" #include "clang/CodeGen/SwiftCallingConv.h" #include "llvm/ADT/SmallBitVector.h" @@ -34,6 +34,7 @@ #include "llvm/IR/IntrinsicsS390.h" #include "llvm/IR/Type.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/Cloning.h" #include <algorithm> // std::sort using namespace clang; @@ -9217,6 +9218,7 @@ llvm::Value *BlockLiteral) const override; bool shouldEmitStaticExternCAliases() const override; void setCUDAKernelCallingConvention(const FunctionType *&FT) const override; + void finalizeModule(llvm::Module &M) const override; }; } @@ -9233,6 +9235,26 @@ cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinTextureType())); } +static llvm::Function *getKernelClone(llvm::Function &F) { + llvm::Module *M = F.getParent(); + SmallString<128> MangledName("__amdgpu_"); + MangledName.append(F.getName()); + MangledName.append("_kernel_body"); + llvm::Function *NewF = M->getFunction(MangledName); + if (!NewF) { + llvm::ValueToValueMapTy ignored; + NewF = F.empty() + ? 
llvm::Function::Create(F.getFunctionType(), + llvm::GlobalVariable::ExternalLinkage, + "", M) + : CloneFunction(&F, ignored); + NewF->setCallingConv(llvm::CallingConv::C); + NewF->setName(MangledName); + } + + return NewF; +} + void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes( const FunctionDecl *FD, llvm::Function *F, CodeGenModule &M) const { const auto *ReqdWGS = @@ -9435,6 +9457,30 @@ FT, FT->getExtInfo().withCallingConv(CC_OpenCLKernel)); } +void AMDGPUTargetCodeGenInfo::finalizeModule(llvm::Module &M) const { + // Insert a cloned function body for each kernel and adjust the kernel + // callsite to use its equivalent clone function. For extern kernel calls, + // insert a declaration node since the body isn't available. + if (!getABIInfo().getContext().getLangOpts().OpenCL) + return; + + for (auto &F : M) { + if (F.getCallingConv() != llvm::CallingConv::AMDGPU_KERNEL) + continue; + + llvm::Function *Clone = getKernelClone(F); + for (llvm::Function::user_iterator UI = F.user_begin(), UE = F.user_end(); + UI != UE;) { + auto *CI = dyn_cast<llvm::CallInst>(*UI++); + if (!CI) + continue; + + CI->setCalledFunction(Clone); + CI->setCallingConv(llvm::CallingConv::C); + } + } +} + //===----------------------------------------------------------------------===// // SPARC v8 ABI Implementation. // Based on the SPARC Compliance Definition version 2.4.1. Index: clang/lib/CodeGen/CodeGenModule.cpp =================================================================== --- clang/lib/CodeGen/CodeGenModule.cpp +++ clang/lib/CodeGen/CodeGenModule.cpp @@ -576,6 +576,7 @@ "amdgpu_code_object_version", getTarget().getTargetOpts().CodeObjectVersion); } + getTargetCodeGenInfo().finalizeModule(TheModule); } emitLLVMUsed();
_______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits