https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/117826
>From e9df017c277f38ff299d45a444e34e7af79401bb Mon Sep 17 00:00:00 2001 From: Matt Arsenault <matthew.arsena...@amd.com> Date: Wed, 17 Jul 2024 19:36:13 +0400 Subject: [PATCH] AMDGPU: Verify f8f6f4 formats in assembler Verify the register widths of the corresponding operands match the floating point format expected size. --- .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 32 +++++ llvm/lib/Target/AMDGPU/SIInstrInfo.td | 9 ++ .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 3 +- llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 17 ++- llvm/test/MC/AMDGPU/mai-gfx950-err.s | 127 ++++++++++++++++++ llvm/test/tools/llvm-mca/AMDGPU/gfx950.s | 91 ++++++------- 6 files changed, 224 insertions(+), 55 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 3ff588c4d0a452..37a8c7fef7d0af 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -4196,6 +4196,38 @@ bool AMDGPUAsmParser::validateMFMA(const MCInst &Inst, if ((Desc.TSFlags & SIInstrFlags::IsMAI) == 0) return true; + int BlgpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::blgp); + if (BlgpIdx != -1) { + if (const MFMA_F8F6F4_Info *Info = AMDGPU::isMFMA_F8F6F4(Opc)) { + int CbszIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::cbsz); + + unsigned CBSZ = Inst.getOperand(CbszIdx).getImm(); + unsigned BLGP = Inst.getOperand(BlgpIdx).getImm(); + + // Validate the correct register size was used for the floating point + // format operands + + bool Success = true; + if (Info->NumRegsSrcA != mfmaScaleF8F6F4FormatToNumRegs(CBSZ)) { + int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); + Error(getRegLoc(mc2PseudoReg(Inst.getOperand(Src0Idx).getReg()), + Operands), + "wrong register tuple size for cbsz value " + Twine(CBSZ)); + Success = false; + } + + if (Info->NumRegsSrcB != mfmaScaleF8F6F4FormatToNumRegs(BLGP)) { + int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); + Error(getRegLoc(mc2PseudoReg(Inst.getOperand(Src1Idx).getReg()), + Operands), + "wrong register tuple size for blgp value " + Twine(BLGP)); + Success = false; + } + + return Success; + } + } + const int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2); if (Src2Idx == -1) return true; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index b27e2529f4a807..cddb87655ee8c3 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -3230,6 +3230,15 @@ def getMFMA_F8F6F4_WithSize : GenericTable { let PrimaryKeyName = "getMFMA_F8F6F4_InstWithNumRegs" ; } +def isMFMA_F8F6F4Table : GenericTable { + let FilterClass = "MFMA_F8F6F4_WithSizeTable"; + let CppTypeName = "MFMA_F8F6F4_Info"; +// let Fields = [ "Opcode" ]; + let Fields = [ "Opcode", "F8F8Opcode", "NumRegsSrcA", "NumRegsSrcB" ]; + let PrimaryKey = [ "Opcode" ]; + let PrimaryKeyName = "isMFMA_F8F6F4" ; +} + def FP8DstByteSelTable : GenericTable { let FilterClass = "VOP3_Pseudo"; let CppTypeName = "FP8DstByteSelInfo"; diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index b233e898589396..ab5f0694c07f95 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -417,6 +417,7 @@ struct FP8DstByteSelInfo { #define GET_WMMAOpcode3AddrMappingTable_IMPL #define GET_getMFMA_F8F6F4_WithSize_DECL #define GET_getMFMA_F8F6F4_WithSize_IMPL +#define GET_isMFMA_F8F6F4Table_IMPL #include "AMDGPUGenSearchableTables.inc" int getMTBUFBaseOpcode(unsigned Opc) { @@ -525,7 +526,7 @@ bool getMAIIsGFX940XDL(unsigned Opc) { return Info ? Info->is_gfx940_xdl : false; } -static uint8_t mfmaScaleF8F6F4FormatToNumRegs(unsigned EncodingVal) { +uint8_t mfmaScaleF8F6F4FormatToNumRegs(unsigned EncodingVal) { switch (EncodingVal) { case MFMAScaleFormats::FP6_E2M3: case MFMAScaleFormats::FP6_E3M2: diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index b0581711961b13..9f7fbec6a542f7 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -96,6 +96,13 @@ struct MAIInstInfo { bool is_gfx940_xdl; }; +struct MFMA_F8F6F4_Info { + unsigned Opcode; + unsigned F8F8Opcode; + uint8_t NumRegsSrcA; + uint8_t NumRegsSrcB; +}; + #define GET_MIMGBaseOpcode_DECL #define GET_MIMGDim_DECL #define GET_MIMGEncoding_DECL @@ -103,6 +110,8 @@ struct MAIInstInfo { #define GET_MIMGMIPMapping_DECL #define GET_MIMGBiASMapping_DECL #define GET_MAIInstInfoTable_DECL +#define GET_MAIInstInfoTable_DECL +#define GET_isMFMA_F8F6F4Table_DECL #include "AMDGPUGenSearchableTables.inc" namespace IsaInfo { @@ -581,12 +590,8 @@ unsigned getVOPDEncodingFamily(const MCSubtargetInfo &ST); LLVM_READONLY CanBeVOPD getCanBeVOPD(unsigned Opc); -struct MFMA_F8F6F4_Info { - unsigned Opcode; - unsigned F8F8Opcode; - uint8_t NumRegsSrcA; - uint8_t NumRegsSrcB; -}; +LLVM_READNONE +uint8_t mfmaScaleF8F6F4FormatToNumRegs(unsigned EncodingVal); LLVM_READONLY const MFMA_F8F6F4_Info *getMFMA_F8F6F4_WithFormatArgs(unsigned CBSZ, diff --git a/llvm/test/MC/AMDGPU/mai-gfx950-err.s b/llvm/test/MC/AMDGPU/mai-gfx950-err.s index a6dff076392c85..e700b0b3cabfe4 100644 --- a/llvm/test/MC/AMDGPU/mai-gfx950-err.s +++ b/llvm/test/MC/AMDGPU/mai-gfx950-err.s @@ -29,3 +29,130 @@ v_mfma_ld_scale_b32 v0, v0 neg_hi:[0,1] v_mfma_ld_scale_b32 v0, v0 neg_lo:[0,1] neg_hi:[0,1] // CHECK: :[[@LINE-1]]:28: error: not a valid operand + + +v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[0:3] cbsz:2 +// CHECK: :[[@LINE-1]]:37: error: wrong register tuple size for cbsz value 2 + +v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[0:3] blgp:2 +// CHECK: :[[@LINE-1]]:46: error: wrong register tuple size for blgp value 2 + +v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[0:3] cbsz:2 blgp:2 +// CHECK: :[[@LINE-1]]:37: error: wrong register tuple size for cbsz value 2 +// CHECK: :[[@LINE-2]]:46: error: wrong register tuple size for blgp value 2 + + +v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[12:19], a[0:3] cbsz:2 +// CHECK: :[[@LINE-1]]:37: error: wrong register tuple size for cbsz value 2 + +v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[12:19], a[0:3] blgp:2 +// CHECK: :[[@LINE-1]]:46: error: wrong register tuple size for blgp value 2 + +v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[12:19], a[0:3] cbsz:2 blgp:2 +// CHECK: :[[@LINE-1]]:37: error: wrong register tuple size for cbsz value 2 +// CHECK: :[[@LINE-2]]:46: error: wrong register tuple size for blgp value 2 + +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[0:3] v20, v21 cbsz:2 +// CHECK: :[[@LINE-1]]:43: error: wrong register tuple size for cbsz value 2 + +v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[12:19], a[0:3] v20, v21 cbsz:2 +// CHECK: :[[@LINE-1]]:43: error: wrong register tuple size for cbsz value 2 + +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[0:3] v20, v21 blgp:2 +// CHECK: :[[@LINE-1]]:52: error: wrong register tuple size for blgp value 2 + +v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[12:19], a[0:3] v20, v21 blgp:2 +// CHECK: :[[@LINE-1]]:52: error: wrong register tuple size for blgp value 2 + +v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15] cbsz:2 +// CHECK: :[[@LINE-1]]:37: error: wrong register tuple size for cbsz value 2 + +v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15] blgp:2 +// CHECK: :[[@LINE-1]]:47: error: wrong register tuple size for blgp value 2 + +v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[16:23], a[24:31], a[0:15] cbsz:2 +// CHECK: :[[@LINE-1]]:37: error: wrong register tuple size for cbsz value 2 + +v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[16:23], a[24:31], a[0:15] blgp:2 +// CHECK: :[[@LINE-1]]:47: error: wrong register tuple size for blgp value 2 + +v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15] v32, v33 cbsz:2 +// CHECK: :[[@LINE-1]]:43: error: wrong register tuple size for cbsz value 2 + +v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:23], a[24:31], a[0:15] v32, v33 cbsz:2 +// CHECK: :[[@LINE-1]]:43: error: wrong register tuple size for cbsz value 2 + + + +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:7], v[12:19], v[0:3] v20, v21 +// CHECK: :[[@LINE-1]]:43: error: wrong register tuple size for cbsz value 0 + +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:7], v[12:19], v[0:3] v20, v21 cbsz:1 +// CHECK: :[[@LINE-1]]:43: error: wrong register tuple size for cbsz value 1 + +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:7], v[12:19], v[0:3] v20, v21 cbsz:2 +// CHECK: :[[@LINE-1]]:43: error: wrong register tuple size for cbsz value 2 + +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:7], v[12:19], v[0:3] v20, v21 cbsz:3 +// CHECK: :[[@LINE-1]]:43: error: wrong register tuple size for cbsz value 3 + + +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:15], v[0:3] v20, v21 +// CHECK: :[[@LINE-1]]:52: error: wrong register tuple size for blgp value 0 + +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:15], v[0:3] v20, v21 blgp:1 +// CHECK: :[[@LINE-1]]:52: error: wrong register tuple size for blgp value 1 + +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:15], v[0:3] v20, v21 blgp:2 +// CHECK: :[[@LINE-1]]:52: error: wrong register tuple size for blgp value 2 + +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:15], v[0:3] v20, v21 blgp:3 +// CHECK: :[[@LINE-1]]:52: error: wrong register tuple size for blgp value 3 + +v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:7], a[12:19], a[0:3] v20, v21 +// CHECK: :[[@LINE-1]]:43: error: wrong register tuple size for cbsz value 0 + +v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:7], a[12:19], a[0:3] v20, v21 cbsz:1 +// CHECK: :[[@LINE-1]]:43: error: wrong register tuple size for cbsz value 1 + +v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:7], a[12:19], a[0:3] v20, v21 cbsz:2 +// CHECK: :[[@LINE-1]]:43: error: wrong register tuple size for cbsz value 2 + +v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:7], a[12:19], a[0:3] v20, v21 cbsz:3 +// CHECK: :[[@LINE-1]]:43: error: wrong register tuple size for cbsz value 3 + +v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[12:15], a[0:3] v20, v21 +// CHECK: :[[@LINE-1]]:52: error: wrong register tuple size for blgp value 0 + +v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[12:15], a[0:3] v20, v21 blgp:1 +// CHECK: :[[@LINE-1]]:52: error: wrong register tuple size for blgp value 1 + +v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[12:15], a[0:3] v20, v21 blgp:2 +// CHECK: :[[@LINE-1]]:52: error: wrong register tuple size for blgp value 2 + +v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[12:15], a[0:3] v20, v21 blgp:3 +// CHECK: :[[@LINE-1]]:52: error: wrong register tuple size for blgp value 3 + +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:7], v[12:19], v[0:3] v20, v21 cbsz:3 +// CHECK: :[[@LINE-1]]:43: error: wrong register tuple size for cbsz value 3 + +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[0:3] v20, v21 cbsz:3 +// CHECK: :[[@LINE-1]]:43: error: wrong register tuple size for cbsz value 3 + +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[12:19], v[4:7], v[0:3] v20, v21 blgp:3 +// CHECK: :[[@LINE-1]]:53: error: wrong register tuple size for blgp value 3 + +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[12:19], v[4:11], v[0:3] v20, v21 blgp:3 +// CHECK: :[[@LINE-1]]:53: error: wrong register tuple size for blgp value 3 + +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[0:3] v20, v21 cbsz:4 +// CHECK: :[[@LINE-1]]:43: error: wrong register tuple size for cbsz value 4 + +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[0:3] v20, v21 cbsz:4 +// CHECK: :[[@LINE-1]]:43: error: wrong register tuple size for cbsz value 4 + +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[12:19], v[4:9], v[0:3] v20, v21 blgp:4 +// CHECK: :[[@LINE-1]]:53: error: wrong register tuple size for blgp value 4 + +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[12:19], v[4:11], v[0:3] v20, v21 blgp:4 +// CHECK: :[[@LINE-1]]:53: error: wrong register tuple size for blgp value 4 diff --git a/llvm/test/tools/llvm-mca/AMDGPU/gfx950.s b/llvm/test/tools/llvm-mca/AMDGPU/gfx950.s index 03b4041f77bcef..33665b655086c1 100644 --- a/llvm/test/tools/llvm-mca/AMDGPU/gfx950.s +++ b/llvm/test/tools/llvm-mca/AMDGPU/gfx950.s @@ -1,9 +1,9 @@ # RUN: llvm-mca -mtriple=amdgcn -mcpu=gfx950 --timeline --iterations=1 --timeline-max-cycles=0 < %s | FileCheck %s # CHECK: Iterations: 1 -# CHECK: Instructions: 129 -# CHECK: Total Cycles: 1069 -# CHECK: Total uOps: 129 +# CHECK: Instructions: 133 +# CHECK: Total Cycles: 1101 +# CHECK: Total uOps: 133 v_mfma_f32_16x16x32_f16 a[0:3], a[0:3], a[0:3], a[0:3] blgp:1 v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[0:3], a[4:7] @@ -24,46 +24,37 @@ v_mfma_ld_scale_b32 v0, v0 ;; TODO: test vdc/adc v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] blgp:1 -v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] blgp:2 -v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] blgp:3 -v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] blgp:4 +v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:9], v[0:3] blgp:2 +v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:9], v[0:3] blgp:3 +v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:7], v[0:3] blgp:4 v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] cbsz:1 -v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] cbsz:2 -v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] cbsz:3 -v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] cbsz:4 -v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] cbsz:2 blgp:1 -v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] cbsz:1 blgp:2 +v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[4:11], v[0:3] cbsz:2 +v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[4:11], v[0:3] cbsz:3 +v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:7], v[4:11], v[0:3] cbsz:4 +v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[4:11], v[0:3] cbsz:2 blgp:1 +v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:9], v[0:3] cbsz:1 blgp:2 ;; FIXME: should have different cycle count depending on whether either matrix is f8 v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15] v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15] blgp:1 -v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15] blgp:2 -v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15] blgp:3 -v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15] blgp:4 +v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:9], v[0:15] blgp:2 +v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:9], v[0:15] blgp:3 +v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:7], v[0:15] blgp:4 v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15] cbsz:1 -v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15] cbsz:2 -v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15] cbsz:3 -v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15] cbsz:4 -v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15] cbsz:2 +v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:9], v[4:11], v[0:15] cbsz:2 +v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:9], v[4:11], v[0:15] cbsz:3 +v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:7], v[4:11], v[0:15] cbsz:4 +v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:9], v[4:11], v[0:15] cbsz:2 v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15] blgp:1 ;; FIXME: should have different cycle count depending on whether either matrix is f8 v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3], v5, v5 - -;; FIXME -;; v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3], v5, v5 blgp:1 -;; v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3], v5, v5 blgp:2 -;; v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3], v5, v5 blgp:3 -;; v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3], v5, v5 blgp:4 +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3], v5, v5 blgp:1 +v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[4:9], v[0:3], v5, v5 cbsz:2 blgp:2 v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15], v5, v5 - -;; FIXME -;; v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15], v5, v5 blgp:1 -;; v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15], v5, v5 blgp:2 -;; v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15], v5, v5 blgp:3 -;; v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15], v5, v5 blgp:4 - +v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[4:9], v[4:11], v[0:15], v5, v5 cbsz:2 blgp:1 +v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[4:9], v[4:9], v[0:15], v5, v5 cbsz:2 blgp:2 ;; TODO: These results are wrong v_smfmac_f32_16x16x64_f16 v[10:13], a[2:5], v[4:11], v3 cbsz:3 abid:1 @@ -213,28 +204,32 @@ v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[2:3], v[4:7], v1 cbsz:3 abid:1 # CHECK-NEXT: - - - - 1.00 - - v_mfma_ld_scale_b32 v0, v0 # CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] # CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] blgp:1 -# CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] blgp:2 -# CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] blgp:3 -# CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] blgp:4 +# CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:9], v[0:3] blgp:2 +# CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:9], v[0:3] blgp:3 +# CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:7], v[0:3] blgp:4 # CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] cbsz:1 -# CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] cbsz:2 -# CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] cbsz:3 -# CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] cbsz:4 -# CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] cbsz:2 blgp:1 -# CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] cbsz:1 blgp:2 +# CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[4:11], v[0:3] cbsz:2 +# CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[4:11], v[0:3] cbsz:3 +# CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:7], v[4:11], v[0:3] cbsz:4 +# CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[4:11], v[0:3] cbsz:2 blgp:1 +# CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:9], v[0:3] cbsz:1 blgp:2 # CHECK-NEXT: - - - - - - 8.00 v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15] # CHECK-NEXT: - - - - - - 8.00 v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15] blgp:1 -# CHECK-NEXT: - - - - - - 8.00 v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15] blgp:2 -# CHECK-NEXT: - - - - - - 8.00 v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15] blgp:3 -# CHECK-NEXT: - - - - - - 8.00 v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15] blgp:4 +# CHECK-NEXT: - - - - - - 8.00 v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:9], v[0:15] blgp:2 +# CHECK-NEXT: - - - - - - 8.00 v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:9], v[0:15] blgp:3 +# CHECK-NEXT: - - - - - - 8.00 v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:7], v[0:15] blgp:4 # CHECK-NEXT: - - - - - - 8.00 v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15] cbsz:1 -# CHECK-NEXT: - - - - - - 8.00 v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15] cbsz:2 -# CHECK-NEXT: - - - - - - 8.00 v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15] cbsz:3 -# CHECK-NEXT: - - - - - - 8.00 v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15] cbsz:4 -# CHECK-NEXT: - - - - - - 8.00 v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15] cbsz:2 +# CHECK-NEXT: - - - - - - 8.00 v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:9], v[4:11], v[0:15] cbsz:2 +# CHECK-NEXT: - - - - - - 8.00 v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:9], v[4:11], v[0:15] cbsz:3 +# CHECK-NEXT: - - - - - - 8.00 v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:7], v[4:11], v[0:15] cbsz:4 +# CHECK-NEXT: - - - - - - 8.00 v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:9], v[4:11], v[0:15] cbsz:2 # CHECK-NEXT: - - - - - - 8.00 v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15] blgp:1 -# CHECK-NEXT: - - - - - - 4.00 v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3], v5, v5 op_sel_hi:[0,0,0] -# CHECK-NEXT: - - - - - - 8.00 v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15], v5, v5 op_sel_hi:[0,0,0] +# CHECK-NEXT: - - - - - - 4.00 v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3], v5, v5 op_sel_hi:[0,0,0] +# CHECK-NEXT: - - - - - - 4.00 v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3], v5, v5 op_sel_hi:[0,0,0] blgp:1 +# CHECK-NEXT: - - - - - - 4.00 v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[4:9], v[0:3], v5, v5 op_sel_hi:[0,0,0] cbsz:2 blgp:2 +# CHECK-NEXT: - - - - - - 8.00 v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15], v5, v5 op_sel_hi:[0,0,0] +# CHECK-NEXT: - - - - - - 8.00 v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[4:9], v[4:11], v[0:15], v5, v5 op_sel_hi:[0,0,0] cbsz:2 blgp:1 +# CHECK-NEXT: - - - - - - 8.00 v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[4:9], v[4:9], v[0:15], v5, v5 op_sel_hi:[0,0,0] cbsz:2 blgp:2 # CHECK-NEXT: - - - - - - 4.00 v_smfmac_f32_16x16x64_f16 v[10:13], a[2:5], v[4:11], v3 cbsz:3 abid:1 # CHECK-NEXT: - - - - - - 8.00 v_smfmac_f32_32x32x32_f16 v[10:25], a[2:5], v[4:11], v3 cbsz:3 abid:1 # CHECK-NEXT: - - - - - - 4.00 v_smfmac_f32_16x16x64_bf16 v[10:13], a[2:5], v[4:11], v3 cbsz:3 abid:1 _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits