yaxunl created this revision. yaxunl added a reviewer: tra. yaxunl requested review of this revision.
Currently for explicit template function instantiation in CUDA/HIP device compilation clang emits instantiated kernel with external linkage and instantiated device function with internal linkage. This is fine for -fno-gpu-rdc since there is only one TU. However this causes duplicate symbols for kernels for -fgpu-rdc if the same instantiation happen in multiple TU. Or missing symbols if a device function calls an explicitly instantiated template function in a different TU. To make explicit template function instantiation work for -fgpu-rdc we need to follow the C++ linkage paradigm, i.e. use weak_odr linkage. https://reviews.llvm.org/D90311 Files: clang/lib/CodeGen/CodeGenModule.cpp clang/test/CodeGenCUDA/device-fun-linkage.cu Index: clang/test/CodeGenCUDA/device-fun-linkage.cu =================================================================== --- /dev/null +++ clang/test/CodeGenCUDA/device-fun-linkage.cu @@ -0,0 +1,19 @@ +// RUN: %clang_cc1 -triple nvptx -fcuda-is-device \ +// RUN: -emit-llvm -o - %s \ +// RUN: | FileCheck -check-prefix=NORDC %s +// RUN: %clang_cc1 -triple nvptx -fcuda-is-device \ +// RUN: -fgpu-rdc -emit-llvm -o - %s \ +// RUN: | FileCheck -check-prefix=RDC %s + +#include "Inputs/cuda.h" + +// NORDC: define internal void @_Z4funcIiEvv() +// NORDC: define void @_Z6kernelIiEvv() +// RDC: define weak_odr void @_Z4funcIiEvv() +// RDC: define weak_odr void @_Z6kernelIiEvv() + +template <typename T> __device__ void func() {} +template <typename T> __global__ void kernel() {} + +template __device__ void func<int>(); +template __global__ void kernel<int>(); Index: clang/lib/CodeGen/CodeGenModule.cpp =================================================================== --- clang/lib/CodeGen/CodeGenModule.cpp +++ clang/lib/CodeGen/CodeGenModule.cpp @@ -4389,13 +4389,16 @@ // and must all be equivalent. However, we are not allowed to // throw away these explicit instantiations. // - // We don't currently support CUDA device code spread out across multiple TUs, + // CUDA/HIP: For -fno-gpu-rdc case, device code is limited to one TU, // so say that CUDA templates are either external (for kernels) or internal. - // This lets llvm perform aggressive inter-procedural optimizations. + // This lets llvm perform aggressive inter-procedural optimizations. For + // -fgpu-rdc case, device function calls across multiple TU's are allowed, + // therefore we need to follow the normal linkage paradigm. if (Linkage == GVA_StrongODR) { - if (Context.getLangOpts().AppleKext) + if (getLangOpts().AppleKext) return llvm::Function::ExternalLinkage; - if (Context.getLangOpts().CUDA && Context.getLangOpts().CUDAIsDevice) + if (getLangOpts().CUDA && getLangOpts().CUDAIsDevice && + !getLangOpts().GPURelocatableDeviceCode) return D->hasAttr<CUDAGlobalAttr>() ? llvm::Function::ExternalLinkage : llvm::Function::InternalLinkage; return llvm::Function::WeakODRLinkage;
Index: clang/test/CodeGenCUDA/device-fun-linkage.cu =================================================================== --- /dev/null +++ clang/test/CodeGenCUDA/device-fun-linkage.cu @@ -0,0 +1,19 @@ +// RUN: %clang_cc1 -triple nvptx -fcuda-is-device \ +// RUN: -emit-llvm -o - %s \ +// RUN: | FileCheck -check-prefix=NORDC %s +// RUN: %clang_cc1 -triple nvptx -fcuda-is-device \ +// RUN: -fgpu-rdc -emit-llvm -o - %s \ +// RUN: | FileCheck -check-prefix=RDC %s + +#include "Inputs/cuda.h" + +// NORDC: define internal void @_Z4funcIiEvv() +// NORDC: define void @_Z6kernelIiEvv() +// RDC: define weak_odr void @_Z4funcIiEvv() +// RDC: define weak_odr void @_Z6kernelIiEvv() + +template <typename T> __device__ void func() {} +template <typename T> __global__ void kernel() {} + +template __device__ void func<int>(); +template __global__ void kernel<int>(); Index: clang/lib/CodeGen/CodeGenModule.cpp =================================================================== --- clang/lib/CodeGen/CodeGenModule.cpp +++ clang/lib/CodeGen/CodeGenModule.cpp @@ -4389,13 +4389,16 @@ // and must all be equivalent. However, we are not allowed to // throw away these explicit instantiations. // - // We don't currently support CUDA device code spread out across multiple TUs, + // CUDA/HIP: For -fno-gpu-rdc case, device code is limited to one TU, // so say that CUDA templates are either external (for kernels) or internal. - // This lets llvm perform aggressive inter-procedural optimizations. + // This lets llvm perform aggressive inter-procedural optimizations. For + // -fgpu-rdc case, device function calls across multiple TU's are allowed, + // therefore we need to follow the normal linkage paradigm. if (Linkage == GVA_StrongODR) { - if (Context.getLangOpts().AppleKext) + if (getLangOpts().AppleKext) return llvm::Function::ExternalLinkage; - if (Context.getLangOpts().CUDA && Context.getLangOpts().CUDAIsDevice) + if (getLangOpts().CUDA && getLangOpts().CUDAIsDevice && + !getLangOpts().GPURelocatableDeviceCode) return D->hasAttr<CUDAGlobalAttr>() ? llvm::Function::ExternalLinkage : llvm::Function::InternalLinkage; return llvm::Function::WeakODRLinkage;
_______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits