llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-clang Author: Rana Pratap Reddy (ranapratap55) <details> <summary>Changes</summary> Change the type signature of `gfx1250 WMMA/SWMMAC` builtins from `__fp16` to `_Float16` in the tablegen builtin definitions. --- Patch is 48.57 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/183493.diff 2 Files Affected: - (modified) clang/include/clang/Basic/BuiltinsAMDGPU.td (+16-16) - (added) clang/test/CodeGenHIP/builtins-amdgcn-gfx1250-wmma-f16.hip (+469) ``````````diff diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.td b/clang/include/clang/Basic/BuiltinsAMDGPU.td index 40c0828eef1ba..38e35bd7d3b71 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.td +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.td @@ -948,15 +948,15 @@ def __builtin_amdgcn_wmma_f32_16x16x64_fp8_fp8 : AMDGPUBuiltin<"_ExtVector<8, fl def __builtin_amdgcn_wmma_f32_16x16x64_fp8_bf8 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<8, int>, _ExtVector<8, int>, _Constant short, _ExtVector<8, float>, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">; def __builtin_amdgcn_wmma_f32_16x16x64_bf8_fp8 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<8, int>, _ExtVector<8, int>, _Constant short, _ExtVector<8, float>, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">; def __builtin_amdgcn_wmma_f32_16x16x64_bf8_bf8 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<8, int>, _ExtVector<8, int>, _Constant short, _ExtVector<8, float>, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">; -def __builtin_amdgcn_wmma_f16_16x16x64_fp8_fp8 : AMDGPUBuiltin<"_ExtVector<8, __fp16>(_ExtVector<8, int>, _ExtVector<8, int>, _Constant short, _ExtVector<8, __fp16>, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">; -def __builtin_amdgcn_wmma_f16_16x16x64_fp8_bf8 : AMDGPUBuiltin<"_ExtVector<8, __fp16>(_ExtVector<8, int>, _ExtVector<8, int>, _Constant short, _ExtVector<8, __fp16>, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">; -def __builtin_amdgcn_wmma_f16_16x16x64_bf8_fp8 : AMDGPUBuiltin<"_ExtVector<8, __fp16>(_ExtVector<8, int>, _ExtVector<8, int>, _Constant short, _ExtVector<8, __fp16>, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">; -def __builtin_amdgcn_wmma_f16_16x16x64_bf8_bf8 : AMDGPUBuiltin<"_ExtVector<8, __fp16>(_ExtVector<8, int>, _ExtVector<8, int>, _Constant short, _ExtVector<8, __fp16>, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">; +def __builtin_amdgcn_wmma_f16_16x16x64_fp8_fp8 : AMDGPUBuiltin<"_ExtVector<8, _Float16>(_ExtVector<8, int>, _ExtVector<8, int>, _Constant short, _ExtVector<8, _Float16>, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">; +def __builtin_amdgcn_wmma_f16_16x16x64_fp8_bf8 : AMDGPUBuiltin<"_ExtVector<8, _Float16>(_ExtVector<8, int>, _ExtVector<8, int>, _Constant short, _ExtVector<8, _Float16>, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">; +def __builtin_amdgcn_wmma_f16_16x16x64_bf8_fp8 : AMDGPUBuiltin<"_ExtVector<8, _Float16>(_ExtVector<8, int>, _ExtVector<8, int>, _Constant short, _ExtVector<8, _Float16>, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">; +def __builtin_amdgcn_wmma_f16_16x16x64_bf8_bf8 : AMDGPUBuiltin<"_ExtVector<8, _Float16>(_ExtVector<8, int>, _ExtVector<8, int>, _Constant short, _ExtVector<8, _Float16>, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">; def __builtin_amdgcn_wmma_i32_16x16x64_iu8 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, _ExtVector<8, int>, _Constant bool, _ExtVector<8, int>, _ExtVector<8, int>, _Constant bool, _Constant bool, ...)", [Const], "gfx1250-insts,wavefrontsize32">; -def __builtin_amdgcn_wmma_f16_16x16x128_fp8_fp8 : AMDGPUBuiltin<"_ExtVector<8, __fp16>(_ExtVector<16, int>, _ExtVector<16, int>, _Constant short, _ExtVector<8, __fp16>, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">; -def __builtin_amdgcn_wmma_f16_16x16x128_fp8_bf8 : AMDGPUBuiltin<"_ExtVector<8, __fp16>(_ExtVector<16, int>, _ExtVector<16, int>, _Constant short, _ExtVector<8, __fp16>, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">; -def __builtin_amdgcn_wmma_f16_16x16x128_bf8_fp8 : AMDGPUBuiltin<"_ExtVector<8, __fp16>(_ExtVector<16, int>, _ExtVector<16, int>, _Constant short, _ExtVector<8, __fp16>, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">; -def __builtin_amdgcn_wmma_f16_16x16x128_bf8_bf8 : AMDGPUBuiltin<"_ExtVector<8, __fp16>(_ExtVector<16, int>, _ExtVector<16, int>, _Constant short, _ExtVector<8, __fp16>, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">; +def __builtin_amdgcn_wmma_f16_16x16x128_fp8_fp8 : AMDGPUBuiltin<"_ExtVector<8, _Float16>(_ExtVector<16, int>, _ExtVector<16, int>, _Constant short, _ExtVector<8, _Float16>, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">; +def __builtin_amdgcn_wmma_f16_16x16x128_fp8_bf8 : AMDGPUBuiltin<"_ExtVector<8, _Float16>(_ExtVector<16, int>, _ExtVector<16, int>, _Constant short, _ExtVector<8, _Float16>, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">; +def __builtin_amdgcn_wmma_f16_16x16x128_bf8_fp8 : AMDGPUBuiltin<"_ExtVector<8, _Float16>(_ExtVector<16, int>, _ExtVector<16, int>, _Constant short, _ExtVector<8, _Float16>, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">; +def __builtin_amdgcn_wmma_f16_16x16x128_bf8_bf8 : AMDGPUBuiltin<"_ExtVector<8, _Float16>(_ExtVector<16, int>, _ExtVector<16, int>, _Constant short, _ExtVector<8, _Float16>, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">; def __builtin_amdgcn_wmma_f32_16x16x128_f8f6f4 : AMDGPUBuiltin<"_ExtVector<8, float>(_Constant int, _ExtVector<16, int>, _Constant int, _ExtVector<16, int>, _Constant short, _ExtVector<8, float>)", [Const], "gfx1250-insts,wavefrontsize32">; def __builtin_amdgcn_wmma_f32_16x16x128_fp8_fp8 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<16, int>, _ExtVector<16, int>, _Constant short, _ExtVector<8, float>, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">; def __builtin_amdgcn_wmma_f32_16x16x128_fp8_bf8 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<16, int>, _ExtVector<16, int>, _Constant short, _ExtVector<8, float>, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">; @@ -964,8 +964,8 @@ def __builtin_amdgcn_wmma_f32_16x16x128_bf8_fp8 : AMDGPUBuiltin<"_ExtVector<8, f def __builtin_amdgcn_wmma_f32_16x16x128_bf8_bf8 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<16, int>, _ExtVector<16, int>, _Constant short, _ExtVector<8, float>, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">; def __builtin_amdgcn_wmma_scale_f32_16x16x128_f8f6f4 : AMDGPUBuiltin<"_ExtVector<8, float>(_Constant int, _ExtVector<16, int>, _Constant int, _ExtVector<16, int>, _Constant short, _ExtVector<8, float>, _Constant int, _Constant int, int, _Constant int, _Constant int, int, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">; def __builtin_amdgcn_wmma_scale16_f32_16x16x128_f8f6f4 : AMDGPUBuiltin<"_ExtVector<8, float>(_Constant int, _ExtVector<16, int>, _Constant int, _ExtVector<16, int>, _Constant short, _ExtVector<8, float>, _Constant int, _Constant int, long int, _Constant int, _Constant int, long int, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">; -def __builtin_amdgcn_wmma_f32_16x16x32_f16 : AMDGPUBuiltin<"_ExtVector<8, float>(_Constant bool, _ExtVector<16, __fp16>, _Constant bool, _ExtVector<16, __fp16>, _Constant short, _ExtVector<8, float>, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">; -def __builtin_amdgcn_wmma_f16_16x16x32_f16 : AMDGPUBuiltin<"_ExtVector<8, __fp16>(_Constant bool, _ExtVector<16, __fp16>, _Constant bool, _ExtVector<16, __fp16>, _Constant short, _ExtVector<8, __fp16>, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">; +def __builtin_amdgcn_wmma_f32_16x16x32_f16 : AMDGPUBuiltin<"_ExtVector<8, float>(_Constant bool, _ExtVector<16, _Float16>, _Constant bool, _ExtVector<16, _Float16>, _Constant short, _ExtVector<8, float>, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">; +def __builtin_amdgcn_wmma_f16_16x16x32_f16 : AMDGPUBuiltin<"_ExtVector<8, _Float16>(_Constant bool, _ExtVector<16, _Float16>, _Constant bool, _ExtVector<16, _Float16>, _Constant short, _ExtVector<8, _Float16>, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">; def __builtin_amdgcn_wmma_f32_32x16x128_f4 : AMDGPUBuiltin<"_ExtVector<16, float>(_ExtVector<16, int>, _ExtVector<8, int>, _Constant short, _ExtVector<16, float>)", [Const], "gfx1250-insts,wavefrontsize32">; def __builtin_amdgcn_wmma_scale_f32_32x16x128_f4 : AMDGPUBuiltin<"_ExtVector<16, float>(_ExtVector<16, int>, _ExtVector<8, int>, _Constant short, _ExtVector<16, float>, _Constant int, _Constant int, int, _Constant int, _Constant int, int, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">; def __builtin_amdgcn_wmma_scale16_f32_32x16x128_f4 : AMDGPUBuiltin<"_ExtVector<16, float>(_ExtVector<16, int>, _ExtVector<8, int>, _Constant short, _ExtVector<16, float>, _Constant int, _Constant int, long int, _Constant int, _Constant int, long int, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">; @@ -976,13 +976,13 @@ def __builtin_amdgcn_swmmac_f32_16x16x128_fp8_fp8 : AMDGPUBuiltin<"_ExtVector<8, def __builtin_amdgcn_swmmac_f32_16x16x128_fp8_bf8 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<8, int>, _ExtVector<16, int>, _ExtVector<8, float>, _ExtVector<2, int>, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">; def __builtin_amdgcn_swmmac_f32_16x16x128_bf8_fp8 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<8, int>, _ExtVector<16, int>, _ExtVector<8, float>, _ExtVector<2, int>, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">; def __builtin_amdgcn_swmmac_f32_16x16x128_bf8_bf8 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<8, int>, _ExtVector<16, int>, _ExtVector<8, float>, _ExtVector<2, int>, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">; -def __builtin_amdgcn_swmmac_f16_16x16x128_fp8_fp8 : AMDGPUBuiltin<"_ExtVector<8, __fp16>(_ExtVector<8, int>, _ExtVector<16, int>, _ExtVector<8, __fp16>, _ExtVector<2, int>, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">; -def __builtin_amdgcn_swmmac_f16_16x16x128_fp8_bf8 : AMDGPUBuiltin<"_ExtVector<8, __fp16>(_ExtVector<8, int>, _ExtVector<16, int>, _ExtVector<8, __fp16>, _ExtVector<2, int>, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">; -def __builtin_amdgcn_swmmac_f16_16x16x128_bf8_fp8 : AMDGPUBuiltin<"_ExtVector<8, __fp16>(_ExtVector<8, int>, _ExtVector<16, int>, _ExtVector<8, __fp16>, _ExtVector<2, int>, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">; -def __builtin_amdgcn_swmmac_f16_16x16x128_bf8_bf8 : AMDGPUBuiltin<"_ExtVector<8, __fp16>(_ExtVector<8, int>, _ExtVector<16, int>, _ExtVector<8, __fp16>, _ExtVector<2, int>, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">; +def __builtin_amdgcn_swmmac_f16_16x16x128_fp8_fp8 : AMDGPUBuiltin<"_ExtVector<8, _Float16>(_ExtVector<8, int>, _ExtVector<16, int>, _ExtVector<8, _Float16>, _ExtVector<2, int>, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">; +def __builtin_amdgcn_swmmac_f16_16x16x128_fp8_bf8 : AMDGPUBuiltin<"_ExtVector<8, _Float16>(_ExtVector<8, int>, _ExtVector<16, int>, _ExtVector<8, _Float16>, _ExtVector<2, int>, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">; +def __builtin_amdgcn_swmmac_f16_16x16x128_bf8_fp8 : AMDGPUBuiltin<"_ExtVector<8, _Float16>(_ExtVector<8, int>, _ExtVector<16, int>, _ExtVector<8, _Float16>, _ExtVector<2, int>, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">; +def __builtin_amdgcn_swmmac_f16_16x16x128_bf8_bf8 : AMDGPUBuiltin<"_ExtVector<8, _Float16>(_ExtVector<8, int>, _ExtVector<16, int>, _ExtVector<8, _Float16>, _ExtVector<2, int>, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">; def __builtin_amdgcn_swmmac_i32_16x16x128_iu8 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, _ExtVector<8, int>, _Constant bool, _ExtVector<16, int>, _ExtVector<8, int>, _ExtVector<2, int>, _Constant bool, _Constant bool, ...)", [Const], "gfx1250-insts,wavefrontsize32">; -def __builtin_amdgcn_swmmac_f32_16x16x64_f16 : AMDGPUBuiltin<"_ExtVector<8, float>(_Constant bool, _ExtVector<16, __fp16>, _Constant bool, _ExtVector<32, __fp16>, _ExtVector<8, float>, int, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">; -def __builtin_amdgcn_swmmac_f16_16x16x64_f16 : AMDGPUBuiltin<"_ExtVector<8, __fp16>(_Constant bool, _ExtVector<16, __fp16>, _Constant bool, _ExtVector<32, __fp16>, _ExtVector<8, __fp16>, int, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">; +def __builtin_amdgcn_swmmac_f32_16x16x64_f16 : AMDGPUBuiltin<"_ExtVector<8, float>(_Constant bool, _ExtVector<16, _Float16>, _Constant bool, _ExtVector<32, _Float16>, _ExtVector<8, float>, int, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">; +def __builtin_amdgcn_swmmac_f16_16x16x64_f16 : AMDGPUBuiltin<"_ExtVector<8, _Float16>(_Constant bool, _ExtVector<16, _Float16>, _Constant bool, _ExtVector<32, _Float16>, _ExtVector<8, _Float16>, int, _Constant bool, _Constant bool)", [Const], "gfx1250-insts,wavefrontsize32">; // GFX12.5 128B cooperative atomics def __builtin_amdgcn_cooperative_atomic_load_32x4B : AMDGPUBuiltin<"int(int *, _Constant int, char const *)", [Const], "gfx1250-insts,wavefrontsize32">; diff --git a/clang/test/CodeGenHIP/builtins-amdgcn-gfx1250-wmma-f16.hip b/clang/test/CodeGenHIP/builtins-amdgcn-gfx1250-wmma-f16.hip new file mode 100644 index 0000000000000..06f8afac153fd --- /dev/null +++ b/clang/test/CodeGenHIP/builtins-amdgcn-gfx1250-wmma-f16.hip @@ -0,0 +1,469 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1250 -emit-llvm -fcuda-is-device -o - %s | FileCheck %s + +#define __device__ __attribute__((device)) + +typedef _Float16 v8h __attribute__((ext_vector_type(8))); +typedef _Float16 v16h __attribute__((ext_vector_type(16))); +typedef _Float16 v32h __attribute__((ext_vector_type(32))); +typedef int v2i __attribute__((ext_vector_type(2))); +typedef int v8i __attribute__((ext_vector_type(8))); +typedef int v16i __attribute__((ext_vector_type(16))); +typedef float v8f __attribute__((ext_vector_type(8))); + +// CHECK-LABEL: define dso_local void @_Z30test_wmma_f16_16x16x64_fp8_fp8PDv8_DF16_Dv8_iS1_S_( +// CHECK-SAME: ptr noundef [[OUT:%.*]], <8 x i32> noundef [[A:%.*]], <8 x i32> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <8 x i32>, align 32, addrspace(5) +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <8 x i32>, align 32, addrspace(5) +// CHECK-NEXT: [[C_ADDR:%.*]] = alloca <8 x half>, align 16, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// CHECK-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr +// CHECK-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <8 x i32> [[A]], ptr [[A_ADDR_ASCAST]], align 32 +// CHECK-NEXT: store <8 x i32> [[B]], ptr [[B_ADDR_ASCAST]], align 32 +// CHECK-NEXT: store <8 x half> [[C]], ptr [[C_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr [[A_ADDR_ASCAST]], align 32 +// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr [[B_ADDR_ASCAST]], align 32 +// CHECK-NEXT: [[TMP2:%.*]] = load <8 x half>, ptr [[C_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP3:%.*]] = call contract <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.fp8.v8f16.v8i32(<8 x i32> [[TMP0]], <8 x i32> [[TMP1]], i16 0, <8 x half> [[TMP2]], i1 false, i1 true) +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <8 x half> [[TMP3]], ptr [[TMP4]], align 16 +// CHECK-NEXT: ret void +// +__device__ void test_wmma_f16_16x16x64_fp8_fp8(v8h *out, v8i a, v8i b, v8h c) { + *out = __builtin_amdgcn_wmma_f16_16x16x64_fp8_fp8(a, b, 0, c, false, true); +} + +// CHECK-LABEL: define dso_local void @_Z30test_wmma_f16_16x16x64_fp8_bf8PDv8_DF16_Dv8_iS1_S_( +// CHECK-SAME: ptr noundef [[OUT:%.*]], <8 x i32> noundef [[A:%.*]], <8 x i32> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <8 x i32>, align 32, addrspace(5) +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <8 x i32>, align 32, addrspace(5) +// CHECK-NEXT: [[C_ADDR:%.*]] = alloca <8 x half>, align 16, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// CHECK-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr +// CHECK-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <8 x i32> [[A]], ptr [[A_ADDR_ASCAST]], align 32 +// CHECK-NEXT: store <8 x i32> [[B]], ptr [[B_ADDR_ASCAST]], align 32 +// CHECK-NEXT: store <8 x half> [[C]], ptr [[C_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr [[A_ADDR_ASCAST]], align 32 +// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr [[B_ADDR_ASCAST]], align 32 +// CHECK-NEXT: [[TMP2:%.*]] = load <8 x half>, ptr [[C_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP3:%.*]] = call contract <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.bf8.v8f16.v8i32(<8 x i32> [[TMP0]], <8 x i32> [[TMP1]], i16 0, <8 x half> [[TMP2]], i1 false, i1 true) +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <8 x half> [[TMP3]], ptr [[TMP4]], align 16 +// CHECK-NEXT: ret void +// +__device__ void test_wmma_f16_16x16x64_fp8_bf8(v8h *out, v8i a, v8i b, v8h c) { + *out = __builtin_amdgcn_wmma_f16_16x16x64_fp8_bf8(a, b, 0, c, false, true); +} + +// CHECK-LABEL: define dso_local void @_Z30test_wmma_f16_16x16x64_bf8_fp8PDv8_DF16_Dv8_iS1_S_( +// CHECK-SAME: ptr noundef [[OUT:%.*]], <8 x i32> noundef [[A:%.*]], <8 x i32> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <8 x i32>, align 32, addrspace(5) +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <8 x i32>, align 32, addrspace(5) +// CHECK-NEXT: [[C_ADDR:%.*]] = alloca <8 x half>, align 16, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// CHECK-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr +// CHECK-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <8 x i32> [[A]], ptr [[A_ADDR_ASCAST]], align 32 +// CHECK-NEXT: store <8 x i32> [[B]], ptr [[B_ADDR_ASCAST]], align 32 +// CHECK-NEXT: store <8 x half> [[C]], ptr [[C_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr [[A_ADDR_ASCAST]], align 32 +// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr [[B_ADDR_ASCAST]], align 32 +// CHECK-NEXT: [[TMP2:%.*]] = load <8 x half>, ptr [[C_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP3:%.*]] = call contract <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.fp8.v8f16.v8i32(<8 x i32> [[TMP0]], <8 x i32> [[TMP1]], i16 0, <8 x half> [[TMP2]], i1 false, i1 true) +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <8 x half> [[TMP3]], ptr [[TMP4]], align 16 +// CHECK-NEXT: ret void +// +__device__ void test_wmma_f16_16x16x64_bf8_fp8(v8h *out, v8i a, v8i b, ... [truncated] `````````` </details> https://github.com/llvm/llvm-project/pull/183493 _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
