https://github.com/LewisCrawford updated https://github.com/llvm/llvm-project/pull/138706
>From 8bbb3f88048e8041e79d609c982b11ba75199e0e Mon Sep 17 00:00:00 2001 From: Lewis Crawford <lcrawf...@nvidia.com> Date: Tue, 6 May 2025 15:16:39 +0000 Subject: [PATCH 1/2] [NVPTX] Add errors for incorrect CUDA addrpaces The CUDA API only accepts kernel params in the global and generic address spaces, so display an error message when attempting to emit pointers outside those address-spaces from CUDA (but still allow them for OpenCL). --- llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp | 8 +++++++ llvm/test/CodeGen/NVPTX/kernel-param-align.ll | 5 ++-- llvm/test/CodeGen/NVPTX/lower-args-cuda.ll | 13 +++++++++++ llvm/test/CodeGen/NVPTX/lower-args-nvcl.ll | 17 ++++++++++++++ llvm/test/CodeGen/NVPTX/lower-args.ll | 23 ------------------- 5 files changed, 41 insertions(+), 25 deletions(-) create mode 100644 llvm/test/CodeGen/NVPTX/lower-args-cuda.ll create mode 100644 llvm/test/CodeGen/NVPTX/lower-args-nvcl.ll diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp index 0e5207cf9b04c..a5a45f7ea89df 100644 --- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp @@ -1399,6 +1399,8 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { if (PTy) { O << "\t.param .u" << PTySizeInBits << " .ptr"; + bool IsCUDA = static_cast<NVPTXTargetMachine &>(TM).getDrvInterface() == + NVPTX::CUDA; switch (PTy->getAddressSpace()) { default: break; @@ -1406,12 +1408,18 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { O << " .global"; break; case ADDRESS_SPACE_SHARED: + if (IsCUDA) + report_fatal_error(".shared ptr kernel args unsupported in CUDA."); O << " .shared"; break; case ADDRESS_SPACE_CONST: + if (IsCUDA) + report_fatal_error(".const ptr kernel args unsupported in CUDA."); O << " .const"; break; case ADDRESS_SPACE_LOCAL: + if (IsCUDA) + report_fatal_error(".local ptr kernel args unsupported in CUDA."); O << " .local"; break; } diff --git a/llvm/test/CodeGen/NVPTX/kernel-param-align.ll b/llvm/test/CodeGen/NVPTX/kernel-param-align.ll index a56b85de80143..e85ccf34bb6ac 100644 --- a/llvm/test/CodeGen/NVPTX/kernel-param-align.ll +++ b/llvm/test/CodeGen/NVPTX/kernel-param-align.ll @@ -1,5 +1,6 @@ -; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_60 | FileCheck %s -; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_60 | %ptxas -arch=sm_60 - %} +; RUN: llc < %s -mcpu=sm_60 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -mcpu=sm_60 | %ptxas -arch=sm_60 - %} +target triple = "nvptx64-nvidia-nvcl" %struct.Large = type { [16 x double] } diff --git a/llvm/test/CodeGen/NVPTX/lower-args-cuda.ll b/llvm/test/CodeGen/NVPTX/lower-args-cuda.ll new file mode 100644 index 0000000000000..7361ab28badb9 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/lower-args-cuda.ll @@ -0,0 +1,13 @@ +; RUN: not --crash llc < %s -mcpu=sm_75 -o /dev/null 2>&1 | FileCheck %s + +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +; Make sure we exit with an error message for this input, as pointers to the +; shared address-space are only supported as kernel args in NVCL, not CUDA. +; CHECK: .shared ptr kernel args unsupported in CUDA. +define ptx_kernel void @ptr_nongeneric(ptr addrspace(1) %out, ptr addrspace(3) %in) { + %v = load i32, ptr addrspace(3) %in, align 4 + store i32 %v, ptr addrspace(1) %out, align 4 + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/lower-args-nvcl.ll b/llvm/test/CodeGen/NVPTX/lower-args-nvcl.ll new file mode 100644 index 0000000000000..44b44e0c17626 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/lower-args-nvcl.ll @@ -0,0 +1,17 @@ +; RUN: opt < %s -S -nvptx-lower-args | FileCheck %s --check-prefixes COMMON,IR +; RUN: llc < %s -mcpu=sm_20 | FileCheck %s --check-prefixes COMMON,PTX +; RUN: %if ptxas %{ llc < %s -mcpu=sm_20 | %ptxas-verify %} + +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-nvcl" + +; COMMON-LABEL: ptr_nongeneric +define ptx_kernel void @ptr_nongeneric(ptr addrspace(1) %out, ptr addrspace(3) %in) { +; IR-NOT: addrspacecast +; PTX-NOT: cvta.to.global +; PTX: ld.shared.u32 +; PTX st.global.u32 + %v = load i32, ptr addrspace(3) %in, align 4 + store i32 %v, ptr addrspace(1) %out, align 4 + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/lower-args.ll b/llvm/test/CodeGen/NVPTX/lower-args.ll index 246408ecf6a3a..269e36d6f3728 100644 --- a/llvm/test/CodeGen/NVPTX/lower-args.ll +++ b/llvm/test/CodeGen/NVPTX/lower-args.ll @@ -140,29 +140,6 @@ define ptx_kernel void @ptr_generic(ptr %out, ptr %in) { ret void } -define ptx_kernel void @ptr_nongeneric(ptr addrspace(1) %out, ptr addrspace(3) %in) { -; IR-LABEL: define ptx_kernel void @ptr_nongeneric( -; IR-SAME: ptr addrspace(1) [[OUT:%.*]], ptr addrspace(3) [[IN:%.*]]) { -; IR-NEXT: [[V:%.*]] = load i32, ptr addrspace(3) [[IN]], align 4 -; IR-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4 -; IR-NEXT: ret void -; -; PTX-LABEL: ptr_nongeneric( -; PTX: { -; PTX-NEXT: .reg .b32 %r<2>; -; PTX-NEXT: .reg .b64 %rd<3>; -; PTX-EMPTY: -; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.param.b64 %rd1, [ptr_nongeneric_param_0]; -; PTX-NEXT: ld.param.b64 %rd2, [ptr_nongeneric_param_1]; -; PTX-NEXT: ld.shared.b32 %r1, [%rd2]; -; PTX-NEXT: st.global.b32 [%rd1], %r1; -; PTX-NEXT: ret; - %v = load i32, ptr addrspace(3) %in, align 4 - store i32 %v, ptr addrspace(1) %out, align 4 - ret void -} - define ptx_kernel void @ptr_as_int(i64 noundef %i, i32 noundef %v) { ; IRC-LABEL: define ptx_kernel void @ptr_as_int( ; IRC-SAME: i64 noundef [[I:%.*]], i32 noundef [[V:%.*]]) { >From 785379561e76e8cc13faf8b7a7f2728c035f4303 Mon Sep 17 00:00:00 2001 From: Lewis Crawford <lcrawf...@nvidia.com> Date: Mon, 12 May 2025 10:54:33 +0000 Subject: [PATCH 2/2] Update clang CodeGenCUDA/memcpy-libcall.cu test Fix a clang test which failed with these new error messages, as it illegally passed in kernel args in the .local address space while using the -cuda triple. --- clang/test/CodeGenCUDA/memcpy-libcall.cu | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/clang/test/CodeGenCUDA/memcpy-libcall.cu b/clang/test/CodeGenCUDA/memcpy-libcall.cu index c20fa2faceb01..5a15201121dd6 100644 --- a/clang/test/CodeGenCUDA/memcpy-libcall.cu +++ b/clang/test/CodeGenCUDA/memcpy-libcall.cu @@ -32,18 +32,20 @@ void __global__ copy_param_to_global(S *global, S param) { // PTX: st.global.b32 } -// PTX-LABEL: .entry _Z19copy_param_to_localPU3AS51SS_( -void __global__ copy_param_to_local(__attribute__((address_space(5))) S *local, - S param) { - __builtin_memcpy(local, ¶m, sizeof(S)); +// PTX-LABEL: .func (.param .b32 func_retval0) _Z19copy_param_to_local1Si( +int __device__ copy_param_to_local(S param, int i) { + S local; + __builtin_memcpy(&local, ¶m, sizeof(S)); // PTX: ld.param.b32 // PTX: st.local.b32 + return local.data[i]; } -// PTX-LABEL: .func _Z21copy_local_to_genericP1SPU3AS5S_( -void __device__ copy_local_to_generic(S *generic, - __attribute__((address_space(5))) S *src) { - __builtin_memcpy(generic, src, sizeof(S)); +// PTX-LABEL: .func _Z21copy_local_to_genericP1Sii( +void __device__ copy_local_to_generic(S *generic, int i, int j) { + S src = {{0, i, 2*i, 3*i, 4*i, 5*i, 6*i, 7*i}}; + src.data[j] = src.data[j+1]; + __builtin_memcpy(generic, &src, sizeof(S)); // PTX: ld.local.b32 // PTX: st.b32 } _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits