[PATCH] D118084: [CUDA, NVPTX] Pass byval aggregates directly

Artem Belevich via Phabricator via cfe-commits Mon, 24 Jan 2022 15:37:46 -0800

tra created this revision.
tra added reviewers: jdoerfert, yaxunl.
Herald added subscribers: asavonic, bixia.
tra requested review of this revision.
Herald added a project: clang.


Changes the NVPTX ABI to pass aggregates directly.  Only clang-generated IR is
affected. The change does not affect ABI on thechange function signatures in the
generated PTX

Discussion: 
https://llvm.discourse.group/t/nvptx-calling-convention-for-aggregate-arguments-passed-by-value

Currently NVPTX ABI passes aggregate values indirectly as a byval pointer.  When
we need to pass a *value*, LLVM has to store it in an alloca, so it can have a
pointer to pass on. This is a double whammy for NVPTX. LLVM often fails to
eliminate that alloca (usually SROA considers such pointer as escaped and gives
up) and that is noticeable hit on performance. When we lower IR to PTX, the
argument is actually passed by copy, so we end up having to do more work just to
get the value loaded back from the alloca. So, we do more work for less
performance. Switching to passing aggregates directly allows us to generate
better code.


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D118084

Files:
  clang/lib/CodeGen/TargetInfo.cpp
  clang/test/CodeGen/nvptx-abi.c
  clang/test/CodeGenCUDA/kernel-args-alignment.cu
  clang/test/CodeGenCUDA/kernel-args.cu
  clang/test/OpenMP/nvptx_unsupported_type_codegen.cpp

Index: clang/test/OpenMP/nvptx_unsupported_type_codegen.cpp
===================================================================
--- clang/test/OpenMP/nvptx_unsupported_type_codegen.cpp
+++ clang/test/OpenMP/nvptx_unsupported_type_codegen.cpp
@@ -34,7 +34,7 @@
 #pragma omp declare target
 T a = T();
 T f = a;
-// CHECK: define{{ protected | }}void @{{.+}}foo{{.+}}([[T]]* noundef byval([[T]]) align {{.+}})
+// CHECK: define{{ protected | }}void @{{.+}}foo{{.+}}([[T]] %{{.+}})
 void foo(T a = T()) {
   return;
 }
@@ -54,7 +54,7 @@
 }
 T1 a1 = T1();
 T1 f1 = a1;
-// CHECK: define{{ protected | }}void @{{.+}}foo1{{.+}}([[T1]]* noundef byval([[T1]]) align {{.+}})
+// CHECK: define{{ protected | }}void @{{.+}}foo1{{.+}}([[T1]] %{{.+}})
 void foo1(T1 a = T1()) {
   return;
 }
@@ -70,4 +70,3 @@
   T1 t = bar1();
 }
 #pragma omp end declare target
-
Index: clang/test/CodeGenCUDA/kernel-args.cu
===================================================================
--- clang/test/CodeGenCUDA/kernel-args.cu
+++ clang/test/CodeGenCUDA/kernel-args.cu
@@ -10,14 +10,14 @@
 };
 
 // AMDGCN: define{{.*}} amdgpu_kernel void @_Z6kernel1A(%struct.A addrspace(4)* byref(%struct.A) align 8 %{{.+}})
-// NVPTX: define{{.*}} void @_Z6kernel1A(%struct.A* noundef byval(%struct.A) align 8 %x)
+// NVPTX: define{{.*}} void @_Z6kernel1A(%struct.A %x
 __global__ void kernel(A x) {
 }
 
 class Kernel {
 public:
   // AMDGCN: define{{.*}} amdgpu_kernel void @_ZN6Kernel12memberKernelE1A(%struct.A addrspace(4)* byref(%struct.A) align 8 %{{.+}})
-  // NVPTX: define{{.*}} void @_ZN6Kernel12memberKernelE1A(%struct.A* noundef byval(%struct.A) align 8 %x)
+  // NVPTX: define{{.*}} void @_ZN6Kernel12memberKernelE1A(%struct.A %x
   static __global__ void memberKernel(A x){}
   template<typename T> static __global__ void templateMemberKernel(T x) {}
 };
@@ -31,10 +31,10 @@
 void test() {
   Kernel K;
   // AMDGCN: define{{.*}} amdgpu_kernel void @_Z14templateKernelI1AEvT_(%struct.A addrspace(4)* byref(%struct.A) align 8 %{{.+}}
-  // NVPTX: define{{.*}} void @_Z14templateKernelI1AEvT_(%struct.A* noundef byval(%struct.A) align 8 %x)
+  // NVPTX: define{{.*}} void @_Z14templateKernelI1AEvT_(%struct.A %x
   launch((void*)templateKernel<A>);
 
   // AMDGCN: define{{.*}} amdgpu_kernel void @_ZN6Kernel20templateMemberKernelI1AEEvT_(%struct.A addrspace(4)* byref(%struct.A) align 8 %{{.+}}
-  // NVPTX: define{{.*}} void @_ZN6Kernel20templateMemberKernelI1AEEvT_(%struct.A* noundef byval(%struct.A) align 8 %x)
+  // NVPTX: define{{.*}} void @_ZN6Kernel20templateMemberKernelI1AEEvT_(%struct.A %x
   launch((void*)Kernel::templateMemberKernel<A>);
 }
Index: clang/test/CodeGenCUDA/kernel-args-alignment.cu
===================================================================
--- clang/test/CodeGenCUDA/kernel-args-alignment.cu
+++ clang/test/CodeGenCUDA/kernel-args-alignment.cu
@@ -36,5 +36,5 @@
 // HOST-OLD: call i32 @cudaSetupArgument({{[^,]*}}, i64 8, i64 24)
 
 // DEVICE-LABEL: @_Z6kernelc1SPi
-// DEVICE-SAME: i8{{[^,]*}}, %struct.S* noundef byval(%struct.S) align 8{{[^,]*}}, i32*
+// DEVICE-SAME: i8{{[^,]*}}, %struct.S %{{[^,]*}}, i32*
 __global__ void kernel(char a, S s, int *b) {}
Index: clang/test/CodeGen/nvptx-abi.c
===================================================================
--- clang/test/CodeGen/nvptx-abi.c
+++ clang/test/CodeGen/nvptx-abi.c
@@ -21,14 +21,14 @@
 
 void foo(float4_t x) {
 // CHECK-LABEL: @foo
-// CHECK: %struct.float4_s* noundef byval(%struct.float4_s) align 4 %x
+// CHECK: (%struct.float4_s %x
 }
 
 void fooN(float4_t x, float4_t y, float4_t z) {
 // CHECK-LABEL: @fooN
-// CHECK: %struct.float4_s* noundef byval(%struct.float4_s) align 4 %x
-// CHECK: %struct.float4_s* noundef byval(%struct.float4_s) align 4 %y
-// CHECK: %struct.float4_s* noundef byval(%struct.float4_s) align 4 %z
+// CHECK-SAME: %struct.float4_s %x
+// CHECK-SAME: %struct.float4_s %y
+// CHECK-SAME: %struct.float4_s %z
 }
 
 typedef struct nested_s {
@@ -39,5 +39,5 @@
 
 void baz(nested_t x) {
 // CHECK-LABEL: @baz
-// CHECK: %struct.nested_s* noundef byval(%struct.nested_s) align 8 %x)
+// CHECK: (%struct.nested_s %x
 }
Index: clang/lib/CodeGen/TargetInfo.cpp
===================================================================
--- clang/lib/CodeGen/TargetInfo.cpp
+++ clang/lib/CodeGen/TargetInfo.cpp
@@ -7180,7 +7180,10 @@
         return ABIArgInfo::getDirect(
             CGInfo.getCUDADeviceBuiltinTextureDeviceType());
     }
-    return getNaturalAlignIndirect(Ty, /* byval */ true);
+    // We want to pass whole aggregate value as one argument.
+    auto AI = ABIArgInfo::getDirect();
+    AI.setCanBeFlattened(false);
+    return AI;
   }
 
   if (const auto *EIT = Ty->getAs<BitIntType>()) {

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[PATCH] D118084: [CUDA, NVPTX] Pass byval aggregates directly

Reply via email to