yaxunl created this revision.
yaxunl added a reviewer: rjmccall.
Herald added a subscriber: pengfei.
yaxunl requested review of this revision.
HIP supports the _Float16 and __fp16 types. On the x86_64 host they are storage-only types. Since they have the same size and alignment as a 16-bit integer, they should be passed by value the same way a 16-bit integer is. Currently clang passes them on the stack when they appear in a struct, which is inefficient. It also causes an interoperability problem with gcc: because gcc has no _Float16 type, a 16-bit integer is used in its place when passing arguments, and that integer is passed in a register. This patch changes the x86_64 target codegen info so that _Float16 and __fp16 are likewise passed in a register.

https://reviews.llvm.org/D97318

Files:
  clang/lib/CodeGen/TargetInfo.cpp
  clang/test/CodeGenCUDA/float16.cu
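To illustrate the interoperability problem the summary describes, here is a minimal host-side sketch (not part of the patch; the names are made up, and it assumes a clang build that accepts _Float16 on the x86_64 host, as HIP host compilation does):

  // interop_sketch.cpp -- illustrative only.
  #include <cstdint>
  #include <cstdio>
  #include <cstring>

  struct HalfBox { _Float16 x; };  // clang side: storage-only half in a struct

  // gcc has no _Float16 on x86_64, so equivalent gcc code wraps a 16-bit
  // integer instead; the SysV ABI passes that struct in a register.
  struct BitsBox { int16_t x; };

  void consume(BitsBox b) {
    std::printf("0x%04x\n", (unsigned)(uint16_t)b.x);
  }

  int main() {
    HalfBox h;
    const uint16_t one = 0x3C00;          // 1.0 as raw IEEE-754 half bits
    std::memcpy(&h.x, &one, sizeof one);  // set the half via its bit pattern
    BitsBox b;
    std::memcpy(&b.x, &h.x, sizeof b);    // reinterpret the same two bytes
    consume(b);                           // prints 0x3c00
    return 0;
  }

With this patch, clang coerces HalfBox to i16 and passes it in a register too, so a HalfBox passed by clang-compiled code arrives where gcc-compiled code expecting a BitsBox looks for it.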
Index: clang/test/CodeGenCUDA/float16.cu
===================================================================
--- /dev/null
+++ clang/test/CodeGenCUDA/float16.cu
@@ -0,0 +1,94 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm -o - -x hip %s \
+// RUN:   -fhip-new-launch-api | FileCheck %s
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -emit-llvm -o - -x hip %s \
+// RUN:   -fcuda-is-device | FileCheck -check-prefix=DEV %s
+
+#include "Inputs/cuda.h"
+
+// CHECK: %struct.A = type { i16 }
+struct A {
+  short x;
+};
+
+// CHECK: %struct.B = type { half }
+struct B {
+  _Float16 x;
+};
+
+// CHECK: %struct.C = type { half }
+struct C {
+  __fp16 x;
+};
+
+// Check that a struct containing _Float16 is coerced and passed to the
+// kernel in the same way as one containing int16.
+
+// CHECK: define dso_local void @_Z20__device_stub__kern11A(i16 %x.coerce)
+// CHECK: %x = alloca %struct.A, align 2
+// CHECK: %coerce.dive = getelementptr inbounds %struct.A, %struct.A* %x, i32 0, i32 0
+// CHECK: store i16 %x.coerce, i16* %coerce.dive, align 2
+// CHECK: %kernel_args = alloca i8*, i64 1, align 16
+// CHECK: %[[PTR:.*]] = bitcast %struct.A* %x to i8*
+// CHECK: %[[ARGS:.*]] = getelementptr i8*, i8** %kernel_args, i32 0
+// CHECK: store i8* %[[PTR]], i8** %[[ARGS]], align 8
+// DEV: define dso_local amdgpu_kernel void @_Z5kern11A(i16 %x.coerce)
+// DEV: %coerce.dive = getelementptr inbounds %struct.A, %struct.A* %x1, i32 0, i32 0
+// DEV: store i16 %x.coerce, i16* %coerce.dive, align 2
+__global__ void kern1(A x) {
+  x.x += 1;
+}
+
+// CHECK: define dso_local void @_Z20__device_stub__kern21B(i16 %x.coerce)
+// CHECK: %x = alloca %struct.B, align 2
+// CHECK: %coerce.dive = getelementptr inbounds %struct.B, %struct.B* %x, i32 0, i32 0
+// CHECK: %[[PTR:.*]] = bitcast half* %coerce.dive to i16*
+// CHECK: store i16 %x.coerce, i16* %[[PTR]], align 2
+// CHECK: %kernel_args = alloca i8*, i64 1, align 16
+// CHECK: %[[PTR:.*]] = bitcast %struct.B* %x to i8*
+// CHECK: %[[ARGS:.*]] = getelementptr i8*, i8** %kernel_args, i32 0
+// CHECK: store i8* %[[PTR]], i8** %[[ARGS]], align 8
+// DEV: define dso_local amdgpu_kernel void @_Z5kern21B(half %x.coerce)
+// DEV: %coerce.dive = getelementptr inbounds %struct.B, %struct.B* %x1, i32 0, i32 0
+// DEV: store half %x.coerce, half* %coerce.dive, align 2
+// DEV: %[[HALF:.*]] = load half, half* %x2, align 2
+// DEV: %add = fadd contract half %[[HALF]], 0xH3C00
+// DEV: store half %add, half* %x2, align 2
__global__ void kern2(B x) {
+  x.x += 1;
+}
+
+// CHECK: define dso_local void @_Z20__device_stub__kern31C(i16 %x.coerce)
+// CHECK: %x = alloca %struct.C, align 2
+// CHECK: %coerce.dive = getelementptr inbounds %struct.C, %struct.C* %x, i32 0, i32 0
+// CHECK: %[[PTR:.*]] = bitcast half* %coerce.dive to i16*
+// CHECK: store i16 %x.coerce, i16* %[[PTR]], align 2
+// CHECK: %kernel_args = alloca i8*, i64 1, align 16
+// CHECK: %[[PTR:.*]] = bitcast %struct.C* %x to i8*
+// CHECK: %[[ARGS:.*]] = getelementptr i8*, i8** %kernel_args, i32 0
+// CHECK: store i8* %[[PTR]], i8** %[[ARGS]], align 8
+// DEV: define dso_local amdgpu_kernel void @_Z5kern31C(half %x.coerce)
+// DEV: %coerce.dive = getelementptr inbounds %struct.C, %struct.C* %x1, i32 0, i32 0
+// DEV: store half %x.coerce, half* %coerce.dive, align 2
+// DEV: %[[HALF:.*]] = load half, half* %x2, align 2
+// DEV: %conv = fpext half %[[HALF]] to float
+// DEV: %add = fadd contract float %conv, 1.000000e+00
+// DEV: %[[HALF:.*]] = fptrunc float %add to half
+// DEV: store half %[[HALF]], half* %x2, align 2
+__global__ void kern3(C x) {
+  x.x += 1;
+}
+
+// CHECK: define dso_local void @_Z4fun11A(i16 %x.coerce)
+void fun1(A x) {
+  kern1<<<1, 1>>>(x);
+}
+
+// CHECK: define dso_local void @_Z4fun21B(i16 %x.coerce)
+void fun2(B x) {
+  kern2<<<1, 1>>>(x);
+}
+
+// CHECK: define dso_local void @_Z5func31C(i16 %x.coerce)
+void func3(C x) {
+  kern3<<<1, 1>>>(x);
+}
Index: clang/lib/CodeGen/TargetInfo.cpp
===================================================================
--- clang/lib/CodeGen/TargetInfo.cpp
+++ clang/lib/CodeGen/TargetInfo.cpp
@@ -2817,6 +2817,12 @@
         Current = SSE;
       } else
         llvm_unreachable("unexpected long double representation!");
+    } else if (k == BuiltinType::Float16 || k == BuiltinType::Half) {
+      // AMD64 does not support _Float16 or __fp16. In languages that support
+      // them, _Float16 and __fp16 are storage-only types, equivalent to a
+      // 16-bit integer. We need this to interoperate with gcc, where a
+      // 16-bit integer is used in place of _Float16 or __fp16.
+      Lo = Integer;
     }
     // FIXME: _Decimal32 and _Decimal64 are SSE.
     // FIXME: _float128 and _Decimal128 are (SSE, SSEUp).
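For a quick way to see the effect outside the test suite, a minimal host-side sketch (illustrative only; assumes a clang build that accepts _Float16 on the x86_64 host, as the HIP host compilation above does):

  // half_abi.cpp -- inspect the lowering with: clang++ -S -emit-llvm half_abi.cpp
  // Without this patch the struct parameter goes to the stack (as the
  // summary notes); with it, the function lowers to take `i16 %x.coerce`
  // in a general-purpose register, matching gcc's int16 convention.
  struct B { _Float16 x; };
  _Float16 get(B b) { return b.x; }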