yaxunl created this revision.
yaxunl added a reviewer: rjmccall.
Herald added a subscriber: pengfei.
yaxunl requested review of this revision.
HIP supports the _Float16 and __fp16 types. On the x86_64 host they are storage-only types. Since they have the same size and alignment as a 16-bit integer, they should be passed by value the same way a 16-bit integer is. Currently clang passes them on the stack when they appear in a struct, which is inefficient. It also causes an interoperability problem with gcc: because gcc has no _Float16 type, a 16-bit integer is used in its place when passing arguments, and that integer is passed in a register. This patch changes the x86_64 target codegen info so that _Float16 and __fp16 are likewise passed in a register.

https://reviews.llvm.org/D97318

Files:
  clang/lib/CodeGen/TargetInfo.cpp
  clang/test/CodeGenCUDA/float16.cu
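To illustrate the interoperability problem the summary describes, here is a minimal host-side sketch (not part of the patch; the names are made up, and it assumes a clang build that accepts _Float16 on the x86_64 host, as HIP host compilation does):

  // interop_sketch.cpp -- illustrative only.
  #include <cstdint>
  #include <cstdio>
  #include <cstring>

  struct HalfBox { _Float16 x; };  // clang side: storage-only half in a struct

  // gcc has no _Float16 on x86_64, so equivalent gcc code wraps a 16-bit
  // integer instead; the SysV ABI passes that struct in a register.
  struct BitsBox { int16_t x; };

  void consume(BitsBox b) {
    std::printf("0x%04x\n", (unsigned)(uint16_t)b.x);
  }

  int main() {
    HalfBox h;
    const uint16_t one = 0x3C00;          // 1.0 as raw IEEE-754 half bits
    std::memcpy(&h.x, &one, sizeof one);  // set the half via its bit pattern
    BitsBox b;
    std::memcpy(&b.x, &h.x, sizeof b);    // reinterpret the same two bytes
    consume(b);                           // prints 0x3c00
    return 0;
  }

With this patch, clang coerces HalfBox to i16 and passes it in a register too, so a HalfBox passed by clang-compiled code arrives where gcc-compiled code expecting a BitsBox looks for it.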
Index: clang/test/CodeGenCUDA/float16.cu
===================================================================
--- /dev/null
+++ clang/test/CodeGenCUDA/float16.cu
@@ -0,0 +1,94 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm -o - -x hip %s \
+// RUN:   -fhip-new-launch-api | FileCheck %s
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -emit-llvm -o - -x hip %s \
+// RUN:   -fcuda-is-device | FileCheck -check-prefix=DEV %s
+
+#include "Inputs/cuda.h"
+
+// CHECK: %struct.A = type { i16 }
+struct A {
+  short x;
+};
+
+// CHECK: %struct.B = type { half }
+struct B {
+  _Float16 x;
+};
+
+// CHECK: %struct.C = type { half }
+struct C {
+  __fp16 x;
+};
+
+// Check that a struct containing _Float16 is coerced and passed to the
+// kernel in the same way as one containing int16.
+
+// CHECK: define dso_local void @_Z20__device_stub__kern11A(i16 %x.coerce)
+// CHECK: %x = alloca %struct.A, align 2
+// CHECK: %coerce.dive = getelementptr inbounds %struct.A, %struct.A* %x, i32 0, i32 0
+// CHECK: store i16 %x.coerce, i16* %coerce.dive, align 2
+// CHECK: %kernel_args = alloca i8*, i64 1, align 16
+// CHECK: %[[PTR:.*]] = bitcast %struct.A* %x to i8*
+// CHECK: %[[ARGS:.*]] = getelementptr i8*, i8** %kernel_args, i32 0
+// CHECK: store i8* %[[PTR]], i8** %[[ARGS]], align 8
+// DEV: define dso_local amdgpu_kernel void @_Z5kern11A(i16 %x.coerce)
+// DEV: %coerce.dive = getelementptr inbounds %struct.A, %struct.A* %x1, i32 0, i32 0
+// DEV: store i16 %x.coerce, i16* %coerce.dive, align 2
+__global__ void kern1(A x) {
+  x.x += 1;
+}
+
+// CHECK: define dso_local void @_Z20__device_stub__kern21B(i16 %x.coerce)
+// CHECK: %x = alloca %struct.B, align 2
+// CHECK: %coerce.dive = getelementptr inbounds %struct.B, %struct.B* %x, i32 0, i32 0
+// CHECK: %[[PTR:.*]] = bitcast half* %coerce.dive to i16*
+// CHECK: store i16 %x.coerce, i16* %[[PTR]], align 2
+// CHECK: %kernel_args = alloca i8*, i64 1, align 16
+// CHECK: %[[PTR:.*]] = bitcast %struct.B* %x to i8*
+// CHECK: %[[ARGS:.*]] = getelementptr i8*, i8** %kernel_args, i32 0
+// CHECK: store i8* %[[PTR]], i8** %[[ARGS]], align 8
+// DEV: define dso_local amdgpu_kernel void @_Z5kern21B(half %x.coerce)
+// DEV: %coerce.dive = getelementptr inbounds %struct.B, %struct.B* %x1, i32 0, i32 0
+// DEV: store half %x.coerce, half* %coerce.dive, align 2
+// DEV: %[[HALF:.*]] = load half, half* %x2, align 2
+// DEV: %add = fadd contract half %[[HALF]], 0xH3C00
+// DEV: store half %add, half* %x2, align 2
__global__ void kern2(B x) {
+  x.x += 1;
+}
+
+// CHECK: define dso_local void @_Z20__device_stub__kern31C(i16 %x.coerce)
+// CHECK: %x = alloca %struct.C, align 2
+// CHECK: %coerce.dive = getelementptr inbounds %struct.C, %struct.C* %x, i32 0, i32 0
+// CHECK: %[[PTR:.*]] = bitcast half* %coerce.dive to i16*
+// CHECK: store i16 %x.coerce, i16* %[[PTR]], align 2
+// CHECK: %kernel_args = alloca i8*, i64 1, align 16
+// CHECK: %[[PTR:.*]] = bitcast %struct.C* %x to i8*
+// CHECK: %[[ARGS:.*]] = getelementptr i8*, i8** %kernel_args, i32 0
+// CHECK: store i8* %[[PTR]], i8** %[[ARGS]], align 8
+// DEV: define dso_local amdgpu_kernel void @_Z5kern31C(half %x.coerce)
+// DEV: %coerce.dive = getelementptr inbounds %struct.C, %struct.C* %x1, i32 0, i32 0
+// DEV: store half %x.coerce, half* %coerce.dive, align 2
+// DEV: %[[HALF:.*]] = load half, half* %x2, align 2
+// DEV: %conv = fpext half %[[HALF]] to float
+// DEV: %add = fadd contract float %conv, 1.000000e+00
+// DEV: %[[HALF:.*]] = fptrunc float %add to half
+// DEV: store half %[[HALF]], half* %x2, align 2
+__global__ void kern3(C x) {
+  x.x += 1;
+}
+
+// CHECK: define dso_local void @_Z4fun11A(i16 %x.coerce)
+void fun1(A x) {
+  kern1<<<1, 1>>>(x);
+}
+
+// CHECK: define dso_local void @_Z4fun21B(i16 %x.coerce)
+void fun2(B x) {
+  kern2<<<1, 1>>>(x);
+}
+
+// CHECK: define dso_local void @_Z5func31C(i16 %x.coerce)
+void func3(C x) {
+  kern3<<<1, 1>>>(x);
+}
Index: clang/lib/CodeGen/TargetInfo.cpp
===================================================================
--- clang/lib/CodeGen/TargetInfo.cpp
+++ clang/lib/CodeGen/TargetInfo.cpp
@@ -2817,6 +2817,12 @@
         Current = SSE;
       } else
         llvm_unreachable("unexpected long double representation!");
+    } else if (k == BuiltinType::Float16 || k == BuiltinType::Half) {
+      // AMD64 does not support _Float16 or __fp16. In languages that support
+      // them, _Float16 and __fp16 are storage-only types, equivalent to a
+      // 16-bit integer. We need this to interoperate with gcc, where a
+      // 16-bit integer is used in place of _Float16 or __fp16.
+      Lo = Integer;
     }
     // FIXME: _Decimal32 and _Decimal64 are SSE.
     // FIXME: _float128 and _Decimal128 are (SSE, SSEUp).
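For a quick way to see the effect outside the test suite, a minimal host-side sketch (illustrative only; assumes a clang build that accepts _Float16 on the x86_64 host, as the HIP host compilation above does):

  // half_abi.cpp -- inspect the lowering with: clang++ -S -emit-llvm half_abi.cpp
  // Without this patch the struct parameter goes to the stack (as the
  // summary notes); with it, the function lowers to take `i16 %x.coerce`
  // in a general-purpose register, matching gcc's int16 convention.
  struct B { _Float16 x; };
  _Float16 get(B b) { return b.x; }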