tra created this revision. tra added reviewers: jdoerfert, yaxunl. Herald added subscribers: asavonic, bixia. tra requested review of this revision. Herald added a project: clang.
Changes the NVPTX ABI to pass aggregates directly. Only clang-generated IR is affected. The change does not affect ABI on thechange function signatures in the generated PTX Discussion: https://llvm.discourse.group/t/nvptx-calling-convention-for-aggregate-arguments-passed-by-value Currently NVPTX ABI passes aggregate values indirectly as a byval pointer. When we need to pass a *value*, LLVM has to store it in an alloca, so it can have a pointer to pass on. This is a double whammy for NVPTX. LLVM often fails to eliminate that alloca (usually SROA considers such pointer as escaped and gives up) and that is noticeable hit on performance. When we lower IR to PTX, the argument is actually passed by copy, so we end up having to do more work just to get the value loaded back from the alloca. So, we do more work for less performance. Switching to passing aggregates directly allows us to generate better code. Repository: rG LLVM Github Monorepo https://reviews.llvm.org/D118084 Files: clang/lib/CodeGen/TargetInfo.cpp clang/test/CodeGen/nvptx-abi.c clang/test/CodeGenCUDA/kernel-args-alignment.cu clang/test/CodeGenCUDA/kernel-args.cu clang/test/OpenMP/nvptx_unsupported_type_codegen.cpp
Index: clang/test/OpenMP/nvptx_unsupported_type_codegen.cpp =================================================================== --- clang/test/OpenMP/nvptx_unsupported_type_codegen.cpp +++ clang/test/OpenMP/nvptx_unsupported_type_codegen.cpp @@ -34,7 +34,7 @@ #pragma omp declare target T a = T(); T f = a; -// CHECK: define{{ protected | }}void @{{.+}}foo{{.+}}([[T]]* noundef byval([[T]]) align {{.+}}) +// CHECK: define{{ protected | }}void @{{.+}}foo{{.+}}([[T]] %{{.+}}) void foo(T a = T()) { return; } @@ -54,7 +54,7 @@ } T1 a1 = T1(); T1 f1 = a1; -// CHECK: define{{ protected | }}void @{{.+}}foo1{{.+}}([[T1]]* noundef byval([[T1]]) align {{.+}}) +// CHECK: define{{ protected | }}void @{{.+}}foo1{{.+}}([[T1]] %{{.+}}) void foo1(T1 a = T1()) { return; } @@ -70,4 +70,3 @@ T1 t = bar1(); } #pragma omp end declare target - Index: clang/test/CodeGenCUDA/kernel-args.cu =================================================================== --- clang/test/CodeGenCUDA/kernel-args.cu +++ clang/test/CodeGenCUDA/kernel-args.cu @@ -10,14 +10,14 @@ }; // AMDGCN: define{{.*}} amdgpu_kernel void @_Z6kernel1A(%struct.A addrspace(4)* byref(%struct.A) align 8 %{{.+}}) -// NVPTX: define{{.*}} void @_Z6kernel1A(%struct.A* noundef byval(%struct.A) align 8 %x) +// NVPTX: define{{.*}} void @_Z6kernel1A(%struct.A %x __global__ void kernel(A x) { } class Kernel { public: // AMDGCN: define{{.*}} amdgpu_kernel void @_ZN6Kernel12memberKernelE1A(%struct.A addrspace(4)* byref(%struct.A) align 8 %{{.+}}) - // NVPTX: define{{.*}} void @_ZN6Kernel12memberKernelE1A(%struct.A* noundef byval(%struct.A) align 8 %x) + // NVPTX: define{{.*}} void @_ZN6Kernel12memberKernelE1A(%struct.A %x static __global__ void memberKernel(A x){} template<typename T> static __global__ void templateMemberKernel(T x) {} }; @@ -31,10 +31,10 @@ void test() { Kernel K; // AMDGCN: define{{.*}} amdgpu_kernel void @_Z14templateKernelI1AEvT_(%struct.A addrspace(4)* byref(%struct.A) align 8 %{{.+}} - // NVPTX: define{{.*}} void @_Z14templateKernelI1AEvT_(%struct.A* noundef byval(%struct.A) align 8 %x) + // NVPTX: define{{.*}} void @_Z14templateKernelI1AEvT_(%struct.A %x launch((void*)templateKernel<A>); // AMDGCN: define{{.*}} amdgpu_kernel void @_ZN6Kernel20templateMemberKernelI1AEEvT_(%struct.A addrspace(4)* byref(%struct.A) align 8 %{{.+}} - // NVPTX: define{{.*}} void @_ZN6Kernel20templateMemberKernelI1AEEvT_(%struct.A* noundef byval(%struct.A) align 8 %x) + // NVPTX: define{{.*}} void @_ZN6Kernel20templateMemberKernelI1AEEvT_(%struct.A %x launch((void*)Kernel::templateMemberKernel<A>); } Index: clang/test/CodeGenCUDA/kernel-args-alignment.cu =================================================================== --- clang/test/CodeGenCUDA/kernel-args-alignment.cu +++ clang/test/CodeGenCUDA/kernel-args-alignment.cu @@ -36,5 +36,5 @@ // HOST-OLD: call i32 @cudaSetupArgument({{[^,]*}}, i64 8, i64 24) // DEVICE-LABEL: @_Z6kernelc1SPi -// DEVICE-SAME: i8{{[^,]*}}, %struct.S* noundef byval(%struct.S) align 8{{[^,]*}}, i32* +// DEVICE-SAME: i8{{[^,]*}}, %struct.S %{{[^,]*}}, i32* __global__ void kernel(char a, S s, int *b) {} Index: clang/test/CodeGen/nvptx-abi.c =================================================================== --- clang/test/CodeGen/nvptx-abi.c +++ clang/test/CodeGen/nvptx-abi.c @@ -21,14 +21,14 @@ void foo(float4_t x) { // CHECK-LABEL: @foo -// CHECK: %struct.float4_s* noundef byval(%struct.float4_s) align 4 %x +// CHECK: (%struct.float4_s %x } void fooN(float4_t x, float4_t y, float4_t z) { // CHECK-LABEL: @fooN -// CHECK: %struct.float4_s* noundef byval(%struct.float4_s) align 4 %x -// CHECK: %struct.float4_s* noundef byval(%struct.float4_s) align 4 %y -// CHECK: %struct.float4_s* noundef byval(%struct.float4_s) align 4 %z +// CHECK-SAME: %struct.float4_s %x +// CHECK-SAME: %struct.float4_s %y +// CHECK-SAME: %struct.float4_s %z } typedef struct nested_s { @@ -39,5 +39,5 @@ void baz(nested_t x) { // CHECK-LABEL: @baz -// CHECK: %struct.nested_s* noundef byval(%struct.nested_s) align 8 %x) +// CHECK: (%struct.nested_s %x } Index: clang/lib/CodeGen/TargetInfo.cpp =================================================================== --- clang/lib/CodeGen/TargetInfo.cpp +++ clang/lib/CodeGen/TargetInfo.cpp @@ -7180,7 +7180,10 @@ return ABIArgInfo::getDirect( CGInfo.getCUDADeviceBuiltinTextureDeviceType()); } - return getNaturalAlignIndirect(Ty, /* byval */ true); + // We want to pass whole aggregate value as one argument. + auto AI = ABIArgInfo::getDirect(); + AI.setCanBeFlattened(false); + return AI; } if (const auto *EIT = Ty->getAs<BitIntType>()) {
_______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits