llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-backend-amdgpu Author: Matt Arsenault (arsenm) <details> <summary>Changes</summary> A few instructions changed rate. --- Patch is 306.15 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/117261.diff 7 Files Affected: - (modified) llvm/lib/Target/AMDGPU/GCNProcessors.td (+1-1) - (modified) llvm/lib/Target/AMDGPU/SISchedule.td (+63) - (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll (+1360-820) - (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll (+192-120) - (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll (+456-253) - (modified) llvm/test/CodeGen/AMDGPU/mai-hazards-mfma-scale.gfx950.mir (+24-13) - (modified) llvm/test/tools/llvm-mca/AMDGPU/gfx950.s (+302-15) ``````````diff diff --git a/llvm/lib/Target/AMDGPU/GCNProcessors.td b/llvm/lib/Target/AMDGPU/GCNProcessors.td index 3403cbab526d46..508f2dd83108d9 100644 --- a/llvm/lib/Target/AMDGPU/GCNProcessors.td +++ b/llvm/lib/Target/AMDGPU/GCNProcessors.td @@ -204,7 +204,7 @@ def : ProcessorModel<"gfx942", SIDPGFX940FullSpeedModel, FeatureISAVersion9_4_2.Features >; -def : ProcessorModel<"gfx950", SIDPGFX940FullSpeedModel, +def : ProcessorModel<"gfx950", SIDPGFX950FullSpeedModel, FeatureISAVersion9_5_0.Features >; diff --git a/llvm/lib/Target/AMDGPU/SISchedule.td b/llvm/lib/Target/AMDGPU/SISchedule.td index a60b1f28e9d34c..117add324db565 100644 --- a/llvm/lib/Target/AMDGPU/SISchedule.td +++ b/llvm/lib/Target/AMDGPU/SISchedule.td @@ -64,6 +64,7 @@ def Write8PassMAI : SchedWrite; def Write16PassMAI : SchedWrite; def Write4PassDGEMM : SchedWrite; def Write8PassDGEMM : SchedWrite; +def Write16PassDGEMM : SchedWrite; // Scalar float instructions def WriteSFPU : SchedWrite; @@ -94,6 +95,7 @@ def SIFullSpeedModel : SISchedMachineModel; def SIQuarterSpeedModel : SISchedMachineModel; def SIDPFullSpeedModel : SISchedMachineModel; def SIDPGFX940FullSpeedModel : SISchedMachineModel; +def SIDPGFX950FullSpeedModel : SISchedMachineModel; def GFX10SpeedModel : SISchedMachineModel; def GFX11SpeedModel : SISchedMachineModel; def GFX12SpeedModel : SISchedMachineModel; @@ -169,6 +171,8 @@ multiclass SICommonWriteRes { def : HWVALUWriteRes<Write4PassDGEMM, 4>; let ReleaseAtCycles = [8] in def : HWVALUWriteRes<Write8PassDGEMM, 8>; + let ReleaseAtCycles = [16] in + def : HWVALUWriteRes<Write16PassDGEMM, 16>; let ReleaseAtCycles = [2] in def : HWWriteRes<Write2PassMAI, [HWXDL], 2>; @@ -201,6 +205,13 @@ def WriteCopy : SchedWriteVariant<[ SchedVar<PredIsVGPR64Copy, [Write64Bit]>, SchedVar<NoSchedPred, [WriteSALU]>]>; +// Check if any matrix inputs are interpreted as f8 in an f8f6f4 mfma +// instruction. +def PredIsF8_MFMA_SCALE : SchedPredicate<[{ + TII->getNamedOperand(*MI, AMDGPU::OpName::cbsz)->getImm() <= AMDGPU::MFMAScaleFormats::FP8_E5M2 || + TII->getNamedOperand(*MI, AMDGPU::OpName::blgp)->getImm() <= AMDGPU::MFMAScaleFormats::FP8_E5M2 +}]>; + let SchedModel = SIFullSpeedModel in { defm : SICommonWriteRes; @@ -299,6 +310,58 @@ def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_SMFMAC_.32_32X32X")>; } // End SchedModel = SIDPGFX940FullSpeedModel + +let SchedModel = SIDPGFX950FullSpeedModel in { +defm : SICommonWriteRes; + +def : HWVALUWriteRes<WriteFloatFMA, 1>; +def : HWVALUWriteRes<WriteDouble, 1>; +def : HWVALUWriteRes<WriteDoubleAdd, 1>; +def : HWVALUWriteRes<WriteDoubleCvt, 1>; +def : HWVALUWriteRes<WriteTrans64, 4>; +def : HWVALUWriteRes<WriteIntMul, 1>; +def : HWVALUWriteRes<Write64Bit, 1>; + +def : InstRW<[WriteCopy], (instrs COPY)>; +def : InstRW<[Write64Bit], (instregex "^V_ACCVGPR_WRITE_B32_e64$")>; +def : InstRW<[Write2PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_4X4X")>; + +def : InstRW<[Write4PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_16X16X8X")>; +def : InstRW<[Write4PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_16X16X16")>; +def : InstRW<[Write4PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_16X16X32")>; +def : InstRW<[Write4PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_16X16X64")>; +def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_16X16X[14][FBI]")>; + +def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_32X32X4XF")>; +def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_32X32X8")>; +def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_32X32X16")>; +def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_32X32X32_")>; +def : InstRW<[Write16PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_32X32X[124][FBI]")>; + +def : InstRW<[Write4PassDGEMM, MIMFMARead], (instregex "^V_MFMA_.64_4X4X")>; +def : InstRW<[Write16PassDGEMM, MIMFMARead], (instregex "^V_MFMA_.64_16X16X")>; + +def : InstRW<[Write4PassMAI, MIMFMARead], (instregex "^V_SMFMAC_.32_16X16X")>; +def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_SMFMAC_.32_32X32X")>; + + +// If either matrix format is f8, the instruction takes 2x as many +// cycles. TODO: This isn't reflected in MCA. +def WriteMFMAScale_16X16X128_F8F6F4 : SchedWriteVariant<[ + SchedVar<PredIsF8_MFMA_SCALE, [Write8PassMAI]>, + SchedVar<NoSchedPred, [Write4PassMAI]>]>; +def WriteMFMAScale_32X32X64_F8F6F4 : SchedWriteVariant<[ + SchedVar<PredIsF8_MFMA_SCALE, [Write16PassMAI]>, + SchedVar<NoSchedPred, [Write8PassMAI]>]>; + +def : InstRW<[WriteMFMAScale_16X16X128_F8F6F4, MIMFMARead], + (instregex "^V_MFMA(_SCALE)?_.32_16X16X128_F8F6F4")>; +def : InstRW<[WriteMFMAScale_32X32X64_F8F6F4, MIMFMARead], + (instregex "^V_MFMA(_SCALE)?_.32_32X32X64_F8F6F4")>; + +} // End SchedModel = SIDPGFX950FullSpeedModel + + let SchedModel = GFX10SpeedModel in { // The latency values are 1 / (operations / cycle). diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll index d0ae669ffb3d68..5d149f7c0c62ef 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 -; RUN: llc -march=amdgcn -mcpu=gfx950 -global-isel=0 < %s | FileCheck -enable-var-scope --check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=gfx950 -global-isel=1 < %s | FileCheck -enable-var-scope --check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=gfx950 -global-isel=0 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,SDAG %s +; RUN: llc -march=amdgcn -mcpu=gfx950 -global-isel=1 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GISEL %s declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half>, <8 x half>, <4 x float>, i32 immarg, i32 immarg, i32 immarg) declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half>, <8 x half>, <16 x float>, i32 immarg, i32 immarg, i32 immarg) @@ -49,52 +49,366 @@ define <4 x float> @test_mfma_f32_16x16x32_f16__flags(<8 x half> %arg0, <8 x hal ret <4 x float> %result } -define <4 x float> @test_mfma_f32_16x16x32_f16__mac(<4 x float> %arg2, <8 x half> %arg0, <8 x half> %arg1) { -; GCN-LABEL: test_mfma_f32_16x16x32_f16__mac: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v0 -; GCN-NEXT: v_accvgpr_write_b32 a1, v1 -; GCN-NEXT: v_accvgpr_write_b32 a2, v2 -; GCN-NEXT: v_accvgpr_write_b32 a3, v3 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[4:7], v[8:11], a[0:3] -; GCN-NEXT: s_nop 6 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] +define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrspace(1) %out, <8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2) #0 { +; SDAG-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; SDAG-NEXT: v_mov_b32_e32 v8, 0 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; SDAG-NEXT: v_accvgpr_write_b32 a1, s1 +; SDAG-NEXT: v_accvgpr_write_b32 a2, s2 +; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] +; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7] +; SDAG-NEXT: s_endpgm +; +; GISEL-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s0 +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GISEL-NEXT: v_accvgpr_write_b32 a1, s1 +; GISEL-NEXT: v_accvgpr_write_b32 a2, s2 +; GISEL-NEXT: v_accvgpr_write_b32 a3, s3 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] +; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: s_nop 5 +; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; GISEL-NEXT: s_endpgm %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0) - ret <4 x float> %result + store <4 x float> %result, ptr addrspace(1) %out + ret void } -define <4 x float> @test_mfma_f32_16x16x32_f16___flags__mac(<4 x float> %arg2, <8 x half> %arg0, <8 x half> %arg1) { -; GCN-LABEL: test_mfma_f32_16x16x32_f16___flags__mac: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_accvgpr_write_b32 a0, v0 -; GCN-NEXT: v_accvgpr_write_b32 a1, v1 -; GCN-NEXT: v_accvgpr_write_b32 a2, v2 -; GCN-NEXT: v_accvgpr_write_b32 a3, v3 -; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[4:7], v[8:11], a[0:3] cbsz:1 abid:1 blgp:1 -; GCN-NEXT: s_nop 6 -; GCN-NEXT: v_accvgpr_read_b32 v0, a0 -; GCN-NEXT: v_accvgpr_read_b32 v1, a1 -; GCN-NEXT: v_accvgpr_read_b32 v2, a2 -; GCN-NEXT: v_accvgpr_read_b32 v3, a3 -; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, i32 1, i32 1, i32 1) - ret <4 x float> %result +define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr addrspace(1) %out, <8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2) #0 { +; SDAG-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; SDAG-NEXT: v_mov_b32_e32 v8, 0 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; SDAG-NEXT: v_accvgpr_write_b32 a1, s1 +; SDAG-NEXT: v_accvgpr_write_b32 a2, s2 +; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1 +; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7] +; SDAG-NEXT: s_endpgm +; +; GISEL-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s0 +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GISEL-NEXT: v_accvgpr_write_b32 a1, s1 +; GISEL-NEXT: v_accvgpr_write_b32 a2, s2 +; GISEL-NEXT: v_accvgpr_write_b32 a3, s3 +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1 +; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: s_nop 5 +; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; GISEL-NEXT: s_endpgm + %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, i32 3, i32 2, i32 1) + store <4 x float> %result, ptr addrspace(1) %out + ret void } ; -------------------------------------------------------------------- ; llvm.amdgcn.mfma.f32.32x32x16.f16 ; -------------------------------------------------------------------- -define <16 x float> @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2) { -; GCN-LABEL: test_mfma_f32_32x32x16_f16: +define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2) #1 { +; SDAG-LABEL: test_mfma_f32_32x32x16_f16: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 +; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; SDAG-NEXT: v_mov_b64_e32 v[12:13], 48 +; SDAG-NEXT: v_mov_b64_e32 v[14:15], 32 +; SDAG-NEXT: v_mov_b64_e32 v[16:17], 16 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25] +; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27] +; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[28:29] +; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 +; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; SDAG-NEXT: v_accvgpr_write_b32 a1, s9 +; SDAG-NEXT: v_accvgpr_write_b32 a2, s10 +; SDAG-NEXT: v_accvgpr_write_b32 a3, s11 +; SDAG-NEXT: v_accvgpr_write_b32 a4, s12 +; SDAG-NEXT: v_accvgpr_write_b32 a5, s13 +; SDAG-NEXT: v_accvgpr_write_b32 a6, s14 +; SDAG-NEXT: v_accvgpr_write_b32 a7, s15 +; SDAG-NEXT: v_accvgpr_write_b32 a8, s16 +; SDAG-NEXT: v_accvgpr_write_b32 a9, s17 +; SDAG-NEXT: v_accvgpr_write_b32 a10, s18 +; SDAG-NEXT: v_accvgpr_write_b32 a11, s19 +; SDAG-NEXT: v_accvgpr_write_b32 a12, s20 +; SDAG-NEXT: v_accvgpr_write_b32 a13, s21 +; SDAG-NEXT: v_accvgpr_write_b32 a14, s22 +; SDAG-NEXT: v_accvgpr_write_b32 a15, s23 +; SDAG-NEXT: v_mov_b64_e32 v[18:19], 0 +; SDAG-NEXT: v_mov_b32_e32 v8, s16 +; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] +; SDAG-NEXT: v_mov_b32_e32 v0, s20 +; SDAG-NEXT: v_mov_b32_e32 v1, s21 +; SDAG-NEXT: v_mov_b32_e32 v2, s22 +; SDAG-NEXT: v_mov_b32_e32 v3, s23 +; SDAG-NEXT: v_mov_b32_e32 v9, s17 +; SDAG-NEXT: v_mov_b32_e32 v10, s18 +; SDAG-NEXT: v_mov_b32_e32 v11, s19 +; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: global_store_dwordx4 v[16:17], a[20:23], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: global_store_dwordx4 v[18:19], a[16:19], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: global_store_dwordx4 v[14:15], v[8:11], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mov_b32_e32 v0, s8 +; SDAG-NEXT: v_mov_b32_e32 v1, s9 +; SDAG-NEXT: v_mov_b32_e32 v2, s10 +; SDAG-NEXT: v_mov_b32_e32 v3, s11 +; SDAG-NEXT: global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mov_b32_e32 v0, s12 +; SDAG-NEXT: v_mov_b32_e32 v1, s13 +; SDAG-NEXT: v_mov_b32_e32 v2, s14 +; SDAG-NEXT: v_mov_b32_e32 v3, s15 +; SDAG-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1 +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_endpgm +; +; GISEL-LABEL: test_mfma_f32_32x32x16_f16: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 +; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; GISEL-NEXT: v_mov_b64_e32 v[20:21], 0 +; GISEL-NEXT: v_mov_b64_e32 v[26:27], 48 +; GISEL-NEXT: v_mov_b64_e32 v[22:23], 16 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29] +; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 +; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 +; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 +; GISEL-NEXT: v_accvgpr_write_b32 a4, s12 +; GISEL-NEXT: v_accvgpr_write_b32 a5, s13 +; GISEL-NEXT: v_accvgpr_write_b32 a6, s14 +; GISEL-NEXT: v_accvgpr_write_b32 a7, s15 +; GISEL-NEXT: v_accvgpr_write_b32 a8, s16 +; GISEL-NEXT: v_accvgpr_write_b32 a9, s17 +; GISEL-NEXT: v_accvgpr_write_b32 a10, s18 +; GISEL-NEXT: v_accvgpr_write_b32 a11, s19 +; GISEL-NEXT: v_accvgpr_write_b32 a12, s20 +; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 +; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 +; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] +; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[24:25], 32 +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19] +; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: global_store_dwordx4 v[20:21], a[16:19], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v[22:23], a[20:23], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v[24:25], a[24:27], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v[26:27], a[28:31], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v[20:21], v[8:11], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v[22:23], v[12:15], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1 +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: s_endpgm + %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0) + store volatile <16 x float> %result, ptr addrspace(1) null + store volatile <16 x float> %arg2, ptr addrspace(1) null + ret void +} + +define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2) #1 { +; SDAG-LABEL: test_mfma_f32_32x32x16_f16__flags: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 +; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; SDAG-NEXT: v_mov_b64_e32 v[12:13], 48 +; SDAG-NEXT: v_mov_b64_e32 v[14:15], 32 +; SDAG-NEXT: v_mov_b64_e32 v[16:17], 16 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25] +; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27] +; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[28:29] +; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 +; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; SDAG-NEXT: v_accvgpr_write_b32 a1, s9 +; SDAG-NEXT: v_accvgpr_write_b32 a2, s10 +; SDAG-NEXT: v_accvgpr_write_b32 a3, s11 +; SDAG-NEXT: v_accvgpr_write_b32 a4, s12 +; SDAG-NEXT: v_accvgpr_write_b32 a5, s13 +; SDAG-NEXT: v_accvgpr_write_b32 a6, s14 +; SDAG-NEXT: v_accvgpr_write_b32 a7, s15 +; SDAG-NEXT: v_accvgpr_write_b32 a8, s16 +; SDAG-NEXT: v_accvgpr_write_b32 a9, s17 +; SDAG-NEXT: v_accvgpr_write_b32 a10, s18 +; SDAG-NEXT: v_accvgpr_write_b32 a11, s19 +; SDAG-NEXT: v_accvgpr_write_b32 a12, s20 +; SDAG-NEXT: v_accvgpr_write_b32 a13, s21 +; SDAG-NEXT: v_accvgpr_write_b32 a14, s22 +; SDAG-NEXT: v_accvgpr_write_b32 a15, s23 +; SDAG-NEXT: v_mov_b64_e32 v[18:19], 0 +; SDAG-NEXT: v_mov_b32_e32 v8, s16 +; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1 +; SDAG-NEXT: v_mov_b32_e32 v0, s20 +; SDAG-NEXT: v_mov_b32_e32 v1, s21 +; SDAG-NEXT: v_mov_b32_e32 v2, s22 +; SDAG-NEXT: v_mov_b32_e32 v3, s23 ... [truncated] `````````` </details> https://github.com/llvm/llvm-project/pull/117261 _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits