Author: Matt Arsenault Date: 2024-11-25T19:47:48-08:00 New Revision: aa7eb5723cb4499f35ed1c5455f668ccc078e7c2
URL: https://github.com/llvm/llvm-project/commit/aa7eb5723cb4499f35ed1c5455f668ccc078e7c2 DIFF: https://github.com/llvm/llvm-project/commit/aa7eb5723cb4499f35ed1c5455f668ccc078e7c2.diff LOG: AMDGPU: Add support for v_dot2_f32_bf16 instruction for gfx950 (#117597) v_dot2_f32_bf16 was added in gfx11 along with v_dot2_f16_f16 and v_dot2_bf16_bf16. All three instructions were part of Dot9 instructions in the compiler. This patch will split existing dot9 (v_dot2_f16_f16, v_dot2_bf16_bf16, v_dot2_f32_bf16) into new dot9 (v_dot2_f16_f16 and v_dot2_bf16_bf16), and dot12 (v_dot2_f32_bf16). All necessary changes to gfx11 and gfx12 are updated to reflect this change. Co-authored-by: Sirish Pande <sirish.pa...@amd.com> Added: llvm/test/MC/AMDGPU/gfx950_dlops.s Modified: clang/include/clang/Basic/BuiltinsAMDGPU.def clang/test/CodeGenOpenCL/amdgpu-features.cl clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl llvm/lib/Target/AMDGPU/AMDGPU.td llvm/lib/Target/AMDGPU/GCNSubtarget.h llvm/lib/Target/AMDGPU/VOP3PInstructions.td llvm/lib/TargetParser/TargetParser.cpp llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_vop3.txt Removed: ################################################################################ diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index fd449697e91216..7d0019eead96b6 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -263,7 +263,7 @@ TARGET_BUILTIN(__builtin_amdgcn_global_load_lds, "vv*1v*3IUiIiIUi", "t", "gfx940 TARGET_BUILTIN(__builtin_amdgcn_fdot2, "fV2hV2hfIb", "nc", "dot10-insts") TARGET_BUILTIN(__builtin_amdgcn_fdot2_f16_f16, "hV2hV2hh", "nc", "dot9-insts") TARGET_BUILTIN(__builtin_amdgcn_fdot2_bf16_bf16, "sV2sV2ss", "nc", "dot9-insts") -TARGET_BUILTIN(__builtin_amdgcn_fdot2_f32_bf16, "fV2sV2sfIb", "nc", "dot9-insts") +TARGET_BUILTIN(__builtin_amdgcn_fdot2_f32_bf16, "fV2sV2sfIb", "nc", "dot12-insts") TARGET_BUILTIN(__builtin_amdgcn_sdot2, "SiV2SsV2SsSiIb", "nc", "dot2-insts") TARGET_BUILTIN(__builtin_amdgcn_udot2, "UiV2UsV2UsUiIb", "nc", "dot2-insts") TARGET_BUILTIN(__builtin_amdgcn_sdot4, "SiSiSiSiIb", "nc", "dot1-insts") diff --git a/clang/test/CodeGenOpenCL/amdgpu-features.cl b/clang/test/CodeGenOpenCL/amdgpu-features.cl index db7fd76ec91189..0b698035ee54c7 100644 --- a/clang/test/CodeGenOpenCL/amdgpu-features.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-features.cl @@ -89,7 +89,7 @@ // GFX941: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,+xf32-insts" // GFX942: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,+xf32-insts" // GFX9_4_Generic: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" -// GFX950: "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" +// GFX950: "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot12-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" // GFX1010: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" // GFX1011: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" // GFX1012: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" @@ -101,17 +101,17 @@ // GFX1034: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" // GFX1035: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" // GFX1036: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" -// GFX1100: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" -// GFX1101: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" -// GFX1102: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" -// GFX1103: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" -// GFX1150: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" -// GFX1151: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" -// GFX1152: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" -// GFX1153: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" -// GFX1200: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" -// GFX1201: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" +// GFX1100: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" +// GFX1101: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" +// GFX1102: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" +// GFX1103: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" +// GFX1150: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" +// GFX1151: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" +// GFX1152: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" +// GFX1153: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" +// GFX1200: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" +// GFX1201: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" -// GFX1103-W64: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize64" +// GFX1103-W64: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize64" kernel void test() {} diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl index 5db280f339e713..f409d2a110d753 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl @@ -23,8 +23,8 @@ kernel void builtins_amdgcn_dl_insts_err( sOut[0] = __builtin_amdgcn_fdot2_bf16_bf16(v2ssA, v2ssB, sC); // expected-error {{'__builtin_amdgcn_fdot2_bf16_bf16' needs target feature dot9-insts}} - fOut[3] = __builtin_amdgcn_fdot2_f32_bf16(v2ssA, v2ssB, fC, false); // expected-error {{'__builtin_amdgcn_fdot2_f32_bf16' needs target feature dot9-insts}} - fOut[4] = __builtin_amdgcn_fdot2_f32_bf16(v2ssA, v2ssB, fC, true); // expected-error {{'__builtin_amdgcn_fdot2_f32_bf16' needs target feature dot9-insts}} + fOut[3] = __builtin_amdgcn_fdot2_f32_bf16(v2ssA, v2ssB, fC, false); // expected-error {{'__builtin_amdgcn_fdot2_f32_bf16' needs target feature dot12-insts}} + fOut[4] = __builtin_amdgcn_fdot2_f32_bf16(v2ssA, v2ssB, fC, true); // expected-error {{'__builtin_amdgcn_fdot2_f32_bf16' needs target feature dot12-insts}} siOut[0] = __builtin_amdgcn_sdot2(v2ssA, v2ssB, siC, false); // expected-error {{'__builtin_amdgcn_sdot2' needs target feature dot2-insts}} siOut[1] = __builtin_amdgcn_sdot2(v2ssA, v2ssB, siC, true); // expected-error {{'__builtin_amdgcn_sdot2' needs target feature dot2-insts}} diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl index 34eae17827ff7c..390d9a55e9b16d 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl @@ -215,3 +215,28 @@ void test_ashr_pk_i8_i32(global int* out, uint src0, uint src1, uint src2) { void test_ashr_pk_u8_i32(global int* out, uint src0, uint src1, uint src2) { *out = __builtin_amdgcn_ashr_pk_u8_i32(src0, src1, src2); } + +// CHECK-LABEL: define dso_local void @builtins_amdgcn_dl_insts( +// CHECK-SAME: ptr addrspace(1) noundef [[OUT:%.*]], float noundef [[FC:%.*]], <2 x i16> noundef [[V2SSA:%.*]], <2 x i16> noundef [[V2SSB:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[FC_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: [[V2SSA_ADDR:%.*]] = alloca <2 x i16>, align 4, addrspace(5) +// CHECK-NEXT: [[V2SSB_ADDR:%.*]] = alloca <2 x i16>, align 4, addrspace(5) +// CHECK-NEXT: store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: store float [[FC]], ptr addrspace(5) [[FC_ADDR]], align 4 +// CHECK-NEXT: store <2 x i16> [[V2SSA]], ptr addrspace(5) [[V2SSA_ADDR]], align 4 +// CHECK-NEXT: store <2 x i16> [[V2SSB]], ptr addrspace(5) [[V2SSB_ADDR]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i16>, ptr addrspace(5) [[V2SSA_ADDR]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i16> [[TMP0]] to <2 x bfloat> +// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i16>, ptr addrspace(5) [[V2SSB_ADDR]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i16> [[TMP2]] to <2 x bfloat> +// CHECK-NEXT: [[TMP4:%.*]] = load float, ptr addrspace(5) [[FC_ADDR]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> [[TMP1]], <2 x bfloat> [[TMP3]], float [[TMP4]], i1 false) +// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: store float [[TMP5]], ptr addrspace(1) [[TMP6]], align 4 +// CHECK-NEXT: ret void +// +void builtins_amdgcn_dl_insts(global float *out, float fC, short2 v2ssA, short2 v2ssB) { + *out = __builtin_amdgcn_fdot2_f32_bf16(v2ssA, v2ssB, fC, false); +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index e4e427ef43339b..0a53f661f33657 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -732,7 +732,7 @@ def FeatureDot8Insts : SubtargetFeature<"dot8-insts", def FeatureDot9Insts : SubtargetFeature<"dot9-insts", "HasDot9Insts", "true", - "Has v_dot2_f16_f16, v_dot2_bf16_bf16, v_dot2_f32_bf16 instructions" + "Has v_dot2_f16_f16, v_dot2_bf16_bf16 instructions" >; def FeatureDot10Insts : SubtargetFeature<"dot10-insts", @@ -747,6 +747,12 @@ def FeatureDot11Insts : SubtargetFeature<"dot11-insts", "Has v_dot4_f32_fp8_fp8, v_dot4_f32_fp8_bf8, v_dot4_f32_bf8_fp8, v_dot4_f32_bf8_bf8 instructions" >; +def FeatureDot12Insts : SubtargetFeature<"dot12-insts", + "HasDot12Insts", + "true", + "Has v_dot2_f32_bf16 instructions" +>; + def FeatureMAIInsts : SubtargetFeature<"mai-insts", "HasMAIInsts", "true", @@ -1578,7 +1584,8 @@ def FeatureISAVersion9_5_Common : FeatureSet< FeatureFP8ConversionScaleInsts, FeatureBF8ConversionScaleInsts, FeatureFP4ConversionScaleInsts, - FeatureFP6BF6ConversionScaleInsts + FeatureFP6BF6ConversionScaleInsts, + FeatureDot12Insts ])>; def FeatureISAVersion9_4_0 : FeatureSet< @@ -1706,6 +1713,7 @@ def FeatureISAVersion11_Common : FeatureSet< FeatureDot8Insts, FeatureDot9Insts, FeatureDot10Insts, + FeatureDot12Insts, FeatureNSAEncoding, FeaturePartialNSAEncoding, FeatureShaderCyclesRegister, @@ -1789,6 +1797,7 @@ def FeatureISAVersion12 : FeatureSet< FeatureDot9Insts, FeatureDot10Insts, FeatureDot11Insts, + FeatureDot12Insts, FeatureNSAEncoding, FeaturePartialNSAEncoding, FeatureShaderCyclesHiLoRegisters, @@ -2361,6 +2370,9 @@ def HasDot10Insts : Predicate<"Subtarget->hasDot10Insts()">, def HasDot11Insts : Predicate<"Subtarget->hasDot11Insts()">, AssemblerPredicate<(all_of FeatureDot11Insts)>; +def HasDot12Insts : Predicate<"Subtarget->hasDot12Insts()">, + AssemblerPredicate<(all_of FeatureDot12Insts)>; + def HasGetWaveIdInst : Predicate<"Subtarget->hasGetWaveIdInst()">, AssemblerPredicate<(all_of FeatureGetWaveIdInst)>; diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 390849dd2e0564..7e994b84426bf0 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -156,6 +156,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool HasDot9Insts = false; bool HasDot10Insts = false; bool HasDot11Insts = false; + bool HasDot12Insts = false; bool HasMAIInsts = false; bool HasFP8Insts = false; bool HasFP8ConversionInsts = false; @@ -825,6 +826,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, return HasDot11Insts; } + bool hasDot12Insts() const { + return HasDot12Insts; + } + bool hasMAIInsts() const { return HasMAIInsts; } diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 5d8dc5ccd18e55..ee68eb32d9173a 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -406,12 +406,12 @@ def DOT2_BF16_Profile let HasSrc1Mods = 1; } -let SubtargetPredicate = HasDot9Insts in { +let SubtargetPredicate = HasDot12Insts in { defm V_DOT2_F32_BF16 : VOP3PInst<"v_dot2_f32_bf16", DOT2_BF16_Profile, int_amdgcn_fdot2_f32_bf16, 1>; -} // End SubtargetPredicate = HasDot9Insts +} // End SubtargetPredicate = HasDot12Insts } // End let IsDOT = 1 @@ -2118,6 +2118,7 @@ defm V_MFMA_F32_16X16X128_F8F6F4 : VOP3P_Real_MFMA_F8F6F4_gfx950_mc <0x2d, "v_mf defm V_MFMA_SCALE_F32_16X16X128_F8F6F4 : VOP3PX_Real_ScaledMFMA_F8F6F4_mc <0x2d>; defm V_MFMA_F32_32X32X64_F8F6F4 : VOP3P_Real_MFMA_F8F6F4_gfx950_mc <0x2e, "v_mfma_f32_32x32x64_f8f6f4">; defm V_MFMA_SCALE_F32_32X32X64_F8F6F4 : VOP3PX_Real_ScaledMFMA_F8F6F4_mc <0x2e>; +defm V_DOT2_F32_BF16 : VOP3P_Real_vi<0x1a>; defm V_MFMA_I32_32X32X16I8 : VOP3P_Real_MFMA_gfx940 <0x56, "v_mfma_i32_32x32x16_i8">; defm V_MFMA_I32_16X16X32I8 : VOP3P_Real_MFMA_gfx940 <0x57, "v_mfma_i32_16x16x32_i8">; diff --git a/llvm/lib/TargetParser/TargetParser.cpp b/llvm/lib/TargetParser/TargetParser.cpp index 23532b9214a892..110b94a57b802c 100644 --- a/llvm/lib/TargetParser/TargetParser.cpp +++ b/llvm/lib/TargetParser/TargetParser.cpp @@ -374,6 +374,7 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T, Features["dot9-insts"] = true; Features["dot10-insts"] = true; Features["dot11-insts"] = true; + Features["dot12-insts"] = true; Features["dl-insts"] = true; Features["atomic-ds-pk-add-16-insts"] = true; Features["atomic-flat-pk-add-16-insts"] = true; @@ -406,6 +407,7 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T, Features["dot8-insts"] = true; Features["dot9-insts"] = true; Features["dot10-insts"] = true; + Features["dot12-insts"] = true; Features["dl-insts"] = true; Features["16-bit-insts"] = true; Features["dpp"] = true; @@ -475,6 +477,7 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T, Features["permlane16-swap"] = true; Features["permlane32-swap"] = true; Features["ashr-pk-insts"] = true; + Features["dot12-insts"] = true; Features["gfx950-insts"] = true; [[fallthrough]]; case GK_GFX942: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll index b0ef568fbdce31..fca418e3f82002 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11 ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11 +; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX950 +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX950-ISEL declare float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float %c, i1 %clamp) @@ -18,6 +20,39 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f32_bf16_clamp( ; GFX11-NEXT: v_dot2_f32_bf16 v0, s2, s3, v0 clamp ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX950-LABEL: test_llvm_amdgcn_fdot2_f32_bf16_clamp: +; GFX950: ; %bb.0: ; %entry +; GFX950-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX950-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dword s0, s[12:13], 0x0 +; GFX950-NEXT: s_load_dword s1, s[14:15], 0x0 +; GFX950-NEXT: s_load_dword s2, s[10:11], 0x0 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v1, s0 +; GFX950-NEXT: v_mov_b32_e32 v2, s1 +; GFX950-NEXT: v_dot2_f32_bf16 v1, s2, v1, v2 clamp +; GFX950-NEXT: s_nop 2 +; GFX950-NEXT: global_store_dword v0, v1, s[8:9] +; GFX950-NEXT: s_endpgm +; +; GFX950-ISEL-LABEL: test_llvm_amdgcn_fdot2_f32_bf16_clamp: +; GFX950-ISEL: ; %bb.0: ; %entry +; GFX950-ISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX950-ISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-ISEL-NEXT: s_load_dword s0, s[12:13], 0x0 +; GFX950-ISEL-NEXT: s_load_dword s1, s[14:15], 0x0 +; GFX950-ISEL-NEXT: s_load_dword s2, s[10:11], 0x0 +; GFX950-ISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-ISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX950-ISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX950-ISEL-NEXT: v_dot2_f32_bf16 v0, s2, v0, v1 clamp +; GFX950-ISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-ISEL-NEXT: s_nop 1 +; GFX950-ISEL-NEXT: global_store_dword v1, v0, s[8:9] +; GFX950-ISEL-NEXT: s_endpgm + ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b, @@ -46,6 +81,39 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f32_bf16_no_clamp( ; GFX11-NEXT: v_dot2_f32_bf16 v0, s2, s3, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_endpgm +; +; GFX950-LABEL: test_llvm_amdgcn_fdot2_f32_bf16_no_clamp: +; GFX950: ; %bb.0: ; %entry +; GFX950-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX950-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_load_dword s0, s[12:13], 0x0 +; GFX950-NEXT: s_load_dword s1, s[14:15], 0x0 +; GFX950-NEXT: s_load_dword s2, s[10:11], 0x0 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v1, s0 +; GFX950-NEXT: v_mov_b32_e32 v2, s1 +; GFX950-NEXT: v_dot2_f32_bf16 v1, s2, v1, v2 +; GFX950-NEXT: s_nop 2 +; GFX950-NEXT: global_store_dword v0, v1, s[8:9] +; GFX950-NEXT: s_endpgm +; +; GFX950-ISEL-LABEL: test_llvm_amdgcn_fdot2_f32_bf16_no_clamp: +; GFX950-ISEL: ; %bb.0: ; %entry +; GFX950-ISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX950-ISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-ISEL-NEXT: s_load_dword s0, s[12:13], 0x0 +; GFX950-ISEL-NEXT: s_load_dword s1, s[14:15], 0x0 +; GFX950-ISEL-NEXT: s_load_dword s2, s[10:11], 0x0 +; GFX950-ISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-ISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX950-ISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX950-ISEL-NEXT: v_dot2_f32_bf16 v0, s2, v0, v1 +; GFX950-ISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-ISEL-NEXT: s_nop 1 +; GFX950-ISEL-NEXT: global_store_dword v1, v0, s[8:9] +; GFX950-ISEL-NEXT: s_endpgm + ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b, diff --git a/llvm/test/MC/AMDGPU/gfx950_dlops.s b/llvm/test/MC/AMDGPU/gfx950_dlops.s new file mode 100644 index 00000000000000..4ae60ac785f496 --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx950_dlops.s @@ -0,0 +1,61 @@ +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx950 -show-encoding %s | FileCheck --check-prefix=GFX950 %s + +v_dot2_f32_bf16 v5, v1, v2, v3 +// GFX950: v_dot2_f32_bf16 v5, v1, v2, v3 ; encoding: [0x05,0x40,0x9a,0xd3,0x01,0x05,0x0e,0x1c] + +v_dot2_f32_bf16 v5, v1, v2, s3 +// GFX950: v_dot2_f32_bf16 v5, v1, v2, s3 ; encoding: [0x05,0x40,0x9a,0xd3,0x01,0x05,0x0e,0x18] + +v_dot2_f32_bf16 v2, v1, 0, v2 +// GFX950: v_dot2_f32_bf16 v2, v1, 0, v2 ; encoding: [0x02,0x40,0x9a,0xd3,0x01,0x01,0x09,0x1c] + +v_dot2_f32_bf16 v2, v1, 0.5, v2 +// GFX950: v_dot2_f32_bf16 v2, v1, 0.5, v2 ; encoding: [0x02,0x40,0x9a,0xd3,0x01,0xe1,0x09,0x1c] + +v_dot2_f32_bf16 v2, v1, -0.5, v2 +// GFX950: v_dot2_f32_bf16 v2, v1, -0.5, v2 ; encoding: [0x02,0x40,0x9a,0xd3,0x01,0xe3,0x09,0x1c] + +v_dot2_f32_bf16 v2, v1, 1.0, v2 +// GFX950: v_dot2_f32_bf16 v2, v1, 1.0, v2 ; encoding: [0x02,0x40,0x9a,0xd3,0x01,0xe5,0x09,0x1c] + +v_dot2_f32_bf16 v2, v1, -1.0, v2 +// GFX950: v_dot2_f32_bf16 v2, v1, -1.0, v2 ; encoding: [0x02,0x40,0x9a,0xd3,0x01,0xe7,0x09,0x1c] + +v_dot2_f32_bf16 v2, v1, 2.0, v2 +// GFX950: v_dot2_f32_bf16 v2, v1, 2.0, v2 ; encoding: [0x02,0x40,0x9a,0xd3,0x01,0xe9,0x09,0x1c] + +v_dot2_f32_bf16 v2, v1, -2.0, v2 +// GFX950: v_dot2_f32_bf16 v2, v1, -2.0, v2 ; encoding: [0x02,0x40,0x9a,0xd3,0x01,0xeb,0x09,0x1c] + +v_dot2_f32_bf16 v2, v1, 4.0, v2 +// GFX950: v_dot2_f32_bf16 v2, v1, 4.0, v2 ; encoding: [0x02,0x40,0x9a,0xd3,0x01,0xed,0x09,0x1c] + +v_dot2_f32_bf16 v2, v1, -4.0, v2 +// GFX950: v_dot2_f32_bf16 v2, v1, -4.0, v2 ; encoding: [0x02,0x40,0x9a,0xd3,0x01,0xef,0x09,0x1c] + +v_dot2_f32_bf16 v2, v1, 0.15915494, v2 +// GFX950: v_dot2_f32_bf16 v2, v1, 0.15915494, v2 ; encoding: [0x02,0x40,0x9a,0xd3,0x01,0xf1,0x09,0x1c] + +v_dot2_f32_bf16 v2, 0.5, v1, v2 +// GFX950: v_dot2_f32_bf16 v2, 0.5, v1, v2 ; encoding: [0x02,0x40,0x9a,0xd3,0xf0,0x02,0x0a,0x1c] + +v_dot2_f32_bf16 v2, -0.5, v1, v2 +// GFX950: v_dot2_f32_bf16 v2, -0.5, v1, v2 ; encoding: [0x02,0x40,0x9a,0xd3,0xf1,0x02,0x0a,0x1c] + +v_dot2_f32_bf16 v2, 1.0, v1, v2 +// GFX950: v_dot2_f32_bf16 v2, 1.0, v1, v2 ; encoding: [0x02,0x40,0x9a,0xd3,0xf2,0x02,0x0a,0x1c] + +v_dot2_f32_bf16 v2, -1.0, v1, v2 +// GFX950: v_dot2_f32_bf16 v2, -1.0, v1, v2 ; encoding: [0x02,0x40,0x9a,0xd3,0xf3,0x02,0x0a,0x1c] + +v_dot2_f32_bf16 v2, 2.0, v1, v2 +// GFX950: v_dot2_f32_bf16 v2, 2.0, v1, v2 ; encoding: [0x02,0x40,0x9a,0xd3,0xf4,0x02,0x0a,0x1c] + +v_dot2_f32_bf16 v2, -2.0, v1, v2 +// GFX950: v_dot2_f32_bf16 v2, -2.0, v1, v2 ; encoding: [0x02,0x40,0x9a,0xd3,0xf5,0x02,0x0a,0x1c] + +v_dot2_f32_bf16 v2, 4.0, v1, v2 +// GFX950: v_dot2_f32_bf16 v2, 4.0, v1, v2 ; encoding: [0x02,0x40,0x9a,0xd3,0xf6,0x02,0x0a,0x1c] + +v_dot2_f32_bf16 v2, -4.0, v1, v2 +// GFX950: v_dot2_f32_bf16 v2, -4.0, v1, v2 ; encoding: [0x02,0x40,0x9a,0xd3,0xf7,0x02,0x0a,0x1c] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_vop3.txt index ccc55352413777..ca8b1750a579e2 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_vop3.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_vop3.txt @@ -780,6 +780,66 @@ # GFX950: v_ashr_pk_u8_i32 v1, v2, v3, v4 op_sel:[0,0,0,1] ; encoding: [0x01,0x40,0x66,0xd2,0x02,0x07,0x12,0x04] 0x01,0x40,0x66,0xd2,0x02,0x07,0x12,0x04 +# GFX950: v_dot2_f32_bf16 v5, v1, v2, v3 ; encoding: [0x05,0x40,0x9a,0xd3,0x01,0x05,0x0e,0x1c] +0x05,0x40,0x9a,0xd3,0x01,0x05,0x0e,0x1c + +# GFX950: v_dot2_f32_bf16 v5, v1, v2, s3 ; encoding: [0x05,0x40,0x9a,0xd3,0x01,0x05,0x0e,0x18] +0x05,0x40,0x9a,0xd3,0x01,0x05,0x0e,0x18 + +# GFX950: v_dot2_f32_bf16 v2, v1, 0, v2 ; encoding: [0x02,0x40,0x9a,0xd3,0x01,0x01,0x09,0x1c] +0x02,0x40,0x9a,0xd3,0x01,0x01,0x09,0x1c + +# GFX950: v_dot2_f32_bf16 v2, v1, 0.5, v2 ; encoding: [0x02,0x40,0x9a,0xd3,0x01,0xe1,0x09,0x1c] +0x02,0x40,0x9a,0xd3,0x01,0xe1,0x09,0x1c + +# GFX950: v_dot2_f32_bf16 v2, v1, -0.5, v2 ; encoding: [0x02,0x40,0x9a,0xd3,0x01,0xe3,0x09,0x1c] +0x02,0x40,0x9a,0xd3,0x01,0xe3,0x09,0x1c + +# GFX950: v_dot2_f32_bf16 v2, v1, 1.0, v2 ; encoding: [0x02,0x40,0x9a,0xd3,0x01,0xe5,0x09,0x1c] +0x02,0x40,0x9a,0xd3,0x01,0xe5,0x09,0x1c + +# GFX950: v_dot2_f32_bf16 v2, v1, -1.0, v2 ; encoding: [0x02,0x40,0x9a,0xd3,0x01,0xe7,0x09,0x1c] +0x02,0x40,0x9a,0xd3,0x01,0xe7,0x09,0x1c + +# GFX950: v_dot2_f32_bf16 v2, v1, 2.0, v2 ; encoding: [0x02,0x40,0x9a,0xd3,0x01,0xe9,0x09,0x1c] +0x02,0x40,0x9a,0xd3,0x01,0xe9,0x09,0x1c + +# GFX950: v_dot2_f32_bf16 v2, v1, -2.0, v2 ; encoding: [0x02,0x40,0x9a,0xd3,0x01,0xeb,0x09,0x1c] +0x02,0x40,0x9a,0xd3,0x01,0xeb,0x09,0x1c + +# GFX950: v_dot2_f32_bf16 v2, v1, 4.0, v2 ; encoding: [0x02,0x40,0x9a,0xd3,0x01,0xed,0x09,0x1c] +0x02,0x40,0x9a,0xd3,0x01,0xed,0x09,0x1c + +# GFX950: v_dot2_f32_bf16 v2, v1, -4.0, v2 ; encoding: [0x02,0x40,0x9a,0xd3,0x01,0xef,0x09,0x1c] +0x02,0x40,0x9a,0xd3,0x01,0xef,0x09,0x1c + +# GFX950: v_dot2_f32_bf16 v2, v1, 0.15915494, v2 ; encoding: [0x02,0x40,0x9a,0xd3,0x01,0xf1,0x09,0x1c] +0x02,0x40,0x9a,0xd3,0x01,0xf1,0x09,0x1c + +# GFX950: v_dot2_f32_bf16 v2, 0.5, v1, v2 ; encoding: [0x02,0x40,0x9a,0xd3,0xf0,0x02,0x0a,0x1c] +0x02,0x40,0x9a,0xd3,0xf0,0x02,0x0a,0x1c + +# GFX950: v_dot2_f32_bf16 v2, -0.5, v1, v2 ; encoding: [0x02,0x40,0x9a,0xd3,0xf1,0x02,0x0a,0x1c] +0x02,0x40,0x9a,0xd3,0xf1,0x02,0x0a,0x1c + +# GFX950: v_dot2_f32_bf16 v2, 1.0, v1, v2 ; encoding: [0x02,0x40,0x9a,0xd3,0xf2,0x02,0x0a,0x1c] +0x02,0x40,0x9a,0xd3,0xf2,0x02,0x0a,0x1c + +# GFX950: v_dot2_f32_bf16 v2, -1.0, v1, v2 ; encoding: [0x02,0x40,0x9a,0xd3,0xf3,0x02,0x0a,0x1c] +0x02,0x40,0x9a,0xd3,0xf3,0x02,0x0a,0x1c + +# GFX950: v_dot2_f32_bf16 v2, 2.0, v1, v2 ; encoding: [0x02,0x40,0x9a,0xd3,0xf4,0x02,0x0a,0x1c] +0x02,0x40,0x9a,0xd3,0xf4,0x02,0x0a,0x1c + +# GFX950: v_dot2_f32_bf16 v2, -2.0, v1, v2 ; encoding: [0x02,0x40,0x9a,0xd3,0xf5,0x02,0x0a,0x1c] +0x02,0x40,0x9a,0xd3,0xf5,0x02,0x0a,0x1c + +# GFX950: v_dot2_f32_bf16 v2, 4.0, v1, v2 ; encoding: [0x02,0x40,0x9a,0xd3,0xf6,0x02,0x0a,0x1c] +0x02,0x40,0x9a,0xd3,0xf6,0x02,0x0a,0x1c + +# GFX950: v_dot2_f32_bf16 v2, -4.0, v1, v2 ; encoding: [0x02,0x40,0x9a,0xd3,0xf7,0x02,0x0a,0x1c] +0x02,0x40,0x9a,0xd3,0xf7,0x02,0x0a,0x1c + # GFX950: v_cvt_scalef32_2xpk16_fp6_f32 v[20:25], v[10:25], v[10:25], v6 ; encoding: [0x14,0x00,0x52,0xd2,0x0a,0x15,0x1a,0x04] 0x14,0x00,0x52,0xd2,0x0a,0x15,0x1a,0x04 _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits