llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-backend-amdgpu Author: Matt Arsenault (arsenm) <details> <summary>Changes</summary> Co-authored-by: Shilei Tian <shilei.tian@<!-- -->amd.com> --- Patch is 55.68 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/117820.diff 11 Files Affected: - (modified) clang/include/clang/Basic/BuiltinsAMDGPU.def (+7) - (modified) clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl (+62) - (modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (+11) - (modified) llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp (+6) - (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.td (+25-19) - (modified) llvm/lib/Target/AMDGPU/SIRegisterInfo.td (+1) - (modified) llvm/lib/Target/AMDGPU/VOP3Instructions.td (+12) - (added) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.ll (+636) - (modified) llvm/test/MC/AMDGPU/gfx950_asm_features.s (+24) - (modified) llvm/test/MC/AMDGPU/gfx950_err.s (+18) - (modified) llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_vop3.txt (+18) ``````````diff diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 2c617a90a4fde9..61039938267feb 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -602,5 +602,12 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk_fp4_f16, "UiUiV2hUifIi", "nc" TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk_fp4_bf16, "UiUiV2yUifIi", "nc", "fp4-cvt-scale-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk_fp4_f32, "UiUiV2fUifIi", "nc", "fp4-cvt-scale-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk32_bf6_bf16, "V6UiV32yUif", "nc", "f16bf16-to-fp6bf6-cvt-scale-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk32_bf6_f16, "V6UiV32hUif", "nc", "f16bf16-to-fp6bf6-cvt-scale-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk32_bf6_f32, "V6UiV32fUif", "nc", "f16bf16-to-fp6bf6-cvt-scale-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk32_fp6_bf16, "V6UiV32yUif", "nc", "f16bf16-to-fp6bf6-cvt-scale-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk32_fp6_f16, "V6UiV32hUif", "nc", "f16bf16-to-fp6bf6-cvt-scale-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk32_fp6_f32, "V6UiV32fUif", "nc", "f16bf16-to-fp6bf6-cvt-scale-insts") + #undef BUILTIN #undef TARGET_BUILTIN diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl index 691be592e3a4bc..64403f0bf94ebd 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl @@ -1338,3 +1338,65 @@ void test_cvt_scalef32_sr_pk_fp4_f32(global unsigned *out, float2 src, uint seed *out = __builtin_amdgcn_cvt_scalef32_sr_pk_fp4_f32(*out, src, seed, scale, 2); *out = __builtin_amdgcn_cvt_scalef32_sr_pk_fp4_f32(*out, src, seed, scale, 3); } + +// CHECK-LABEL: @test_cvt_scalef32_sr_pk32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[OUT6_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[SRCBF32_ADDR:%.*]] = alloca <32 x bfloat>, align 64, addrspace(5) +// CHECK-NEXT: [[SRCH32_ADDR:%.*]] = alloca <32 x half>, align 64, addrspace(5) +// CHECK-NEXT: [[SRCF32_ADDR:%.*]] = alloca <32 x float>, align 128, addrspace(5) +// CHECK-NEXT: [[SRC1_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[SRC2_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: store ptr addrspace(1) [[OUT6:%.*]], ptr addrspace(5) [[OUT6_ADDR]], align 8 +// CHECK-NEXT: store <32 x bfloat> [[SRCBF32:%.*]], ptr addrspace(5) [[SRCBF32_ADDR]], align 64 +// CHECK-NEXT: store <32 x half> [[SRCH32:%.*]], ptr addrspace(5) [[SRCH32_ADDR]], align 64 +// CHECK-NEXT: store <32 x float> [[SRCF32:%.*]], ptr addrspace(5) [[SRCF32_ADDR]], align 128 +// CHECK-NEXT: store i32 [[SRC1:%.*]], ptr addrspace(5) [[SRC1_ADDR]], align 4 +// CHECK-NEXT: store float [[SRC2:%.*]], ptr addrspace(5) [[SRC2_ADDR]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load <32 x bfloat>, ptr addrspace(5) [[SRCBF32_ADDR]], align 64 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[SRC1_ADDR]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(5) [[SRC2_ADDR]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.bf6.bf16(<32 x bfloat> [[TMP0]], i32 [[TMP1]], float [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT6_ADDR]], align 8 +// CHECK-NEXT: store <6 x i32> [[TMP3]], ptr addrspace(1) [[TMP4]], align 32 +// CHECK-NEXT: [[TMP5:%.*]] = load <32 x half>, ptr addrspace(5) [[SRCH32_ADDR]], align 64 +// CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(5) [[SRC1_ADDR]], align 4 +// CHECK-NEXT: [[TMP7:%.*]] = load float, ptr addrspace(5) [[SRC2_ADDR]], align 4 +// CHECK-NEXT: [[TMP8:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.bf6.f16(<32 x half> [[TMP5]], i32 [[TMP6]], float [[TMP7]]) +// CHECK-NEXT: [[TMP9:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT6_ADDR]], align 8 +// CHECK-NEXT: store <6 x i32> [[TMP8]], ptr addrspace(1) [[TMP9]], align 32 +// CHECK-NEXT: [[TMP10:%.*]] = load <32 x float>, ptr addrspace(5) [[SRCF32_ADDR]], align 128 +// CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr addrspace(5) [[SRC1_ADDR]], align 4 +// CHECK-NEXT: [[TMP12:%.*]] = load float, ptr addrspace(5) [[SRC2_ADDR]], align 4 +// CHECK-NEXT: [[TMP13:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.bf6.f32(<32 x float> [[TMP10]], i32 [[TMP11]], float [[TMP12]]) +// CHECK-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT6_ADDR]], align 8 +// CHECK-NEXT: store <6 x i32> [[TMP13]], ptr addrspace(1) [[TMP14]], align 32 +// CHECK-NEXT: [[TMP15:%.*]] = load <32 x bfloat>, ptr addrspace(5) [[SRCBF32_ADDR]], align 64 +// CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr addrspace(5) [[SRC1_ADDR]], align 4 +// CHECK-NEXT: [[TMP17:%.*]] = load float, ptr addrspace(5) [[SRC2_ADDR]], align 4 +// CHECK-NEXT: [[TMP18:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.fp6.bf16(<32 x bfloat> [[TMP15]], i32 [[TMP16]], float [[TMP17]]) +// CHECK-NEXT: [[TMP19:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT6_ADDR]], align 8 +// CHECK-NEXT: store <6 x i32> [[TMP18]], ptr addrspace(1) [[TMP19]], align 32 +// CHECK-NEXT: [[TMP20:%.*]] = load <32 x half>, ptr addrspace(5) [[SRCH32_ADDR]], align 64 +// CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr addrspace(5) [[SRC1_ADDR]], align 4 +// CHECK-NEXT: [[TMP22:%.*]] = load float, ptr addrspace(5) [[SRC2_ADDR]], align 4 +// CHECK-NEXT: [[TMP23:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.fp6.f16(<32 x half> [[TMP20]], i32 [[TMP21]], float [[TMP22]]) +// CHECK-NEXT: [[TMP24:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT6_ADDR]], align 8 +// CHECK-NEXT: store <6 x i32> [[TMP23]], ptr addrspace(1) [[TMP24]], align 32 +// CHECK-NEXT: [[TMP25:%.*]] = load <32 x float>, ptr addrspace(5) [[SRCF32_ADDR]], align 128 +// CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr addrspace(5) [[SRC1_ADDR]], align 4 +// CHECK-NEXT: [[TMP27:%.*]] = load float, ptr addrspace(5) [[SRC2_ADDR]], align 4 +// CHECK-NEXT: [[TMP28:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.fp6.f32(<32 x float> [[TMP25]], i32 [[TMP26]], float [[TMP27]]) +// CHECK-NEXT: [[TMP29:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT6_ADDR]], align 8 +// CHECK-NEXT: store <6 x i32> [[TMP28]], ptr addrspace(1) [[TMP29]], align 32 +// CHECK-NEXT: ret void +// +void test_cvt_scalef32_sr_pk32(global uint6 *out6, bfloat32 srcbf32, half32 srch32, float32 srcf32, unsigned src1, float src2) +{ + *out6 = __builtin_amdgcn_cvt_scalef32_sr_pk32_bf6_bf16(srcbf32, src1, src2); + *out6 = __builtin_amdgcn_cvt_scalef32_sr_pk32_bf6_f16(srch32, src1, src2); + *out6 = __builtin_amdgcn_cvt_scalef32_sr_pk32_bf6_f32(srcf32, src1, src2); + *out6 = __builtin_amdgcn_cvt_scalef32_sr_pk32_fp6_bf16(srcbf32, src1, src2); + *out6 = __builtin_amdgcn_cvt_scalef32_sr_pk32_fp6_f16(srch32, src1, src2); + *out6 = __builtin_amdgcn_cvt_scalef32_sr_pk32_fp6_f32(srcf32, src1, src2); +} diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 99a29dadef56de..73f3559ab05a48 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -602,6 +602,10 @@ class AMDGPUCvtScaleF32ToFP6BF6Intrinsic<LLVMType DstTy, LLVMType Src0Ty, LLVMTy [DstTy], [Src0Ty, Src1Ty, llvm_float_ty], [IntrNoMem, IntrSpeculatable] >, ClangBuiltin<"__builtin_amdgcn_"#name>; +class AMDGPUCvtScaleF32SRIntrinsic<LLVMType DstTy, LLVMType Src0Ty, string name> : DefaultAttrsIntrinsic< + [DstTy], [Src0Ty, llvm_i32_ty, llvm_float_ty], [IntrNoMem, IntrSpeculatable] +>, ClangBuiltin<"__builtin_amdgcn_"#name>; + def int_amdgcn_cvt_scalef32_pk32_fp6_f16 : AMDGPUCvtScaleF32Intrinsic<llvm_v6i32_ty, llvm_v32f16_ty, "cvt_scalef32_pk32_fp6_f16">; def int_amdgcn_cvt_scalef32_pk32_bf6_f16 : AMDGPUCvtScaleF32Intrinsic<llvm_v6i32_ty, llvm_v32f16_ty, "cvt_scalef32_pk32_bf6_f16">; def int_amdgcn_cvt_scalef32_pk32_fp6_bf16 : AMDGPUCvtScaleF32Intrinsic<llvm_v6i32_ty, llvm_v32bf16_ty, "cvt_scalef32_pk32_fp6_bf16">; @@ -609,6 +613,13 @@ def int_amdgcn_cvt_scalef32_pk32_bf6_bf16 : AMDGPUCvtScaleF32Intrinsic<llvm_v6i3 def int_amdgcn_cvt_scalef32_2xpk16_fp6_f32 : AMDGPUCvtScaleF32ToFP6BF6Intrinsic<llvm_v6i32_ty, llvm_v16f32_ty, llvm_v16f32_ty, "cvt_scalef32_2xpk16_fp6_f32">; def int_amdgcn_cvt_scalef32_2xpk16_bf6_f32 : AMDGPUCvtScaleF32ToFP6BF6Intrinsic<llvm_v6i32_ty, llvm_v16f32_ty, llvm_v16f32_ty, "cvt_scalef32_2xpk16_bf6_f32">; +def int_amdgcn_cvt_scalef32_sr_pk32_bf6_bf16 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v6i32_ty, llvm_v32bf16_ty, "cvt_scalef32_sr_pk32_bf6_bf16">; +def int_amdgcn_cvt_scalef32_sr_pk32_bf6_f16 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v6i32_ty, llvm_v32f16_ty, "cvt_scalef32_sr_pk32_bf6_f16">; +def int_amdgcn_cvt_scalef32_sr_pk32_bf6_f32 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v6i32_ty, llvm_v32f32_ty, "cvt_scalef32_sr_pk32_bf6_f32">; +def int_amdgcn_cvt_scalef32_sr_pk32_fp6_bf16 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v6i32_ty, llvm_v32bf16_ty, "cvt_scalef32_sr_pk32_fp6_bf16">; +def int_amdgcn_cvt_scalef32_sr_pk32_fp6_f16 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v6i32_ty, llvm_v32f16_ty, "cvt_scalef32_sr_pk32_fp6_f16">; +def int_amdgcn_cvt_scalef32_sr_pk32_fp6_f32 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v6i32_ty, llvm_v32f32_ty, "cvt_scalef32_sr_pk32_fp6_f32">; + class AMDGPUCvtScaleFP4FP8BF8ToF1632Intrinsic<LLVMType DstTy, string name> : DefaultAttrsIntrinsic< [DstTy], [llvm_i32_ty, // src diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index ad100f37f8710c..158603a7aff879 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -4578,6 +4578,12 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_cvt_scalef32_sr_pk_fp4_f16: case Intrinsic::amdgcn_cvt_scalef32_sr_pk_fp4_bf16: case Intrinsic::amdgcn_cvt_scalef32_sr_pk_fp4_f32: + case Intrinsic::amdgcn_cvt_scalef32_sr_pk32_bf6_bf16: + case Intrinsic::amdgcn_cvt_scalef32_sr_pk32_bf6_f16: + case Intrinsic::amdgcn_cvt_scalef32_sr_pk32_bf6_f32: + case Intrinsic::amdgcn_cvt_scalef32_sr_pk32_fp6_bf16: + case Intrinsic::amdgcn_cvt_scalef32_sr_pk32_fp6_f16: + case Intrinsic::amdgcn_cvt_scalef32_sr_pk32_fp6_f32: case Intrinsic::amdgcn_ashr_pk_i8_i32: case Intrinsic::amdgcn_ashr_pk_u8_i32: case Intrinsic::amdgcn_cvt_scalef32_2xpk16_fp6_f32: diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 119a4d63704777..b27e2529f4a807 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -1779,25 +1779,27 @@ class getSDWASrcForVT <ValueType VT> { // given VT. class getVOP3SrcForVT<ValueType VT, bit IsTrue16 = 0> { RegisterOperand ret = - !cond(!eq(VT, f64) : VSrc_f64, - !eq(VT, f32) : VSrc_f32, - !eq(VT, f16) : !if(IsTrue16, VSrcT_f16, VSrc_f16), - !eq(VT, bf16) : !if(IsTrue16, VSrcT_bf16, VSrc_bf16), - !eq(VT, i16) : !if(IsTrue16, VSrcT_b16, VSrc_b16), - !eq(VT, i1) : SSrc_i1, - !eq(VT, v2f32) : VSrc_v2f32, - !eq(VT, v2i32) : VSrc_v2b32, - !eq(VT, v2f16) : VSrc_v2f16, - !eq(VT, v2bf16) : VSrc_v2bf16, - !eq(VT, v2i16) : VSrc_v2b16, - !eq(VT, v4f16) : AVSrc_64, - !eq(VT, v4bf16) : AVSrc_64, - !eq(VT.Size, 512) : VRegSrc_512, - !eq(VT.Size, 192) : VRegSrc_192, - !eq(VT.Size, 128) : VRegSrc_128, - !eq(VT.Size, 96) : VRegSrc_96, - !eq(VT.Size, 64) : VSrc_b64, - 1 : VSrc_b32); + !cond(!eq(VT, f64) : VSrc_f64, + !eq(VT, f32) : VSrc_f32, + !eq(VT, f16) : !if(IsTrue16, VSrcT_f16, VSrc_f16), + !eq(VT, bf16) : !if(IsTrue16, VSrcT_bf16, VSrc_bf16), + !eq(VT, i16) : !if(IsTrue16, VSrcT_b16, VSrc_b16), + !eq(VT, i1) : SSrc_i1, + !eq(VT, v2f32) : VSrc_v2f32, + !eq(VT, v2i32) : VSrc_v2b32, + !eq(VT, v2f16) : VSrc_v2f16, + !eq(VT, v2bf16) : VSrc_v2bf16, + !eq(VT, v2i16) : VSrc_v2b16, + !eq(VT, v4f16) : AVSrc_64, + !eq(VT, v4bf16) : AVSrc_64, + !eq(VT.Size, 1024) : VRegSrc_1024, + !eq(VT.Size, 512) : VRegSrc_512, + !eq(VT.Size, 256) : VRegSrc_256, + !eq(VT.Size, 192) : VRegSrc_192, + !eq(VT.Size, 128) : VRegSrc_128, + !eq(VT.Size, 96) : VRegSrc_96, + !eq(VT.Size, 64) : VSrc_b64, + 1 : VSrc_b32); } // Returns the vreg register class to use for sources of VOP3 instructions for the @@ -2856,6 +2858,10 @@ def VOP_I32_BF16_I32_F32 : VOPProfile<[i32, bf16, i32, f32]>; def VOP_I32_F16_I32_F32 : VOPProfile<[i32, f16, i32, f32]>; def VOP_I32_F32_I32_F32 : VOPProfile<[i32, f32, i32, f32]>; +def VOP_V6I32_V32BF16_I32_F32 : VOPProfile<[v6i32, v32bf16, i32, f32]>; +def VOP_V6I32_V32F16_I32_F32 : VOPProfile<[v6i32, v32f16, i32, f32]>; +def VOP_V6I32_V32F32_I32_F32 : VOPProfile<[v6i32, v32f32, i32, f32]>; + def VOP_I64_I64_I32 : VOPProfile <[i64, i64, i32, untyped]>; def VOP_I64_I32_I64 : VOPProfile <[i64, i32, i64, untyped]>; def VOP_I64_I64_I64 : VOPProfile <[i64, i64, i64, untyped]>; diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index 51fdd4211a5cf6..6a349d2bf06ea2 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -1252,6 +1252,7 @@ def VRegSrc_128: SrcReg9<VReg_128, "OPW128">; def VRegSrc_192: SrcReg9<VReg_192, "OPW192">; def VRegSrc_256: SrcReg9<VReg_256, "OPW256">; def VRegSrc_512: SrcReg9<VReg_512, "OPW512">; +def VRegSrc_1024: SrcReg9<VReg_1024, "OPW1024">; def VRegOrLdsSrc_32 : SrcReg9<VRegOrLds_32, "OPW32">; // True 16 Operands diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index c78f5c108e4d53..3a79532cecb917 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -1117,6 +1117,12 @@ let SubtargetPredicate = HasF16BF16ToFP6BF6ConversionScaleInsts, mayRaiseFPExcep defm V_CVT_SCALEF32_PK32_BF6_F16 : VOP3Inst<"v_cvt_scalef32_pk32_bf6_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V6I32_V32F16_F32>, int_amdgcn_cvt_scalef32_pk32_bf6_f16>; defm V_CVT_SCALEF32_PK32_FP6_BF16 : VOP3Inst<"v_cvt_scalef32_pk32_fp6_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V6I32_V32BF16_F32>, int_amdgcn_cvt_scalef32_pk32_fp6_bf16>; defm V_CVT_SCALEF32_PK32_BF6_BF16 : VOP3Inst<"v_cvt_scalef32_pk32_bf6_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V6I32_V32BF16_F32>, int_amdgcn_cvt_scalef32_pk32_bf6_bf16>; + defm V_CVT_SCALEF32_SR_PK32_BF6_BF16 : VOP3Inst<"v_cvt_scalef32_sr_pk32_bf6_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V6I32_V32BF16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk32_bf6_bf16>; + defm V_CVT_SCALEF32_SR_PK32_BF6_F16 : VOP3Inst<"v_cvt_scalef32_sr_pk32_bf6_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V6I32_V32F16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk32_bf6_f16>; + defm V_CVT_SCALEF32_SR_PK32_BF6_F32 : VOP3Inst<"v_cvt_scalef32_sr_pk32_bf6_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V6I32_V32F32_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk32_bf6_f32>; + defm V_CVT_SCALEF32_SR_PK32_FP6_BF16 : VOP3Inst<"v_cvt_scalef32_sr_pk32_fp6_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V6I32_V32BF16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk32_fp6_bf16>; + defm V_CVT_SCALEF32_SR_PK32_FP6_F16 : VOP3Inst<"v_cvt_scalef32_sr_pk32_fp6_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V6I32_V32F16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk32_fp6_f16>; + defm V_CVT_SCALEF32_SR_PK32_FP6_F32 : VOP3Inst<"v_cvt_scalef32_sr_pk32_fp6_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V6I32_V32F32_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk32_fp6_f32>; } let SubtargetPredicate = HasGFX950Insts, mayRaiseFPException = 0 in { @@ -2203,6 +2209,12 @@ defm V_CVT_SCALEF32_PK32_FP6_F16 : VOP3_Real_gfx9<0x258, "v_cvt_scalef32_pk32_f defm V_CVT_SCALEF32_PK32_FP6_BF16 : VOP3_Real_gfx9<0x259, "v_cvt_scalef32_pk32_fp6_bf16">; defm V_CVT_SCALEF32_PK32_BF6_F16 : VOP3_Real_gfx9<0x25a, "v_cvt_scalef32_pk32_bf6_f16">; defm V_CVT_SCALEF32_PK32_BF6_BF16 : VOP3_Real_gfx9<0x25b, "v_cvt_scalef32_pk32_bf6_bf16">; +defm V_CVT_SCALEF32_SR_PK32_BF6_BF16 : VOP3_Real_gfx9<0x25f, "v_cvt_scalef32_sr_pk32_bf6_bf16">; +defm V_CVT_SCALEF32_SR_PK32_BF6_F16 : VOP3_Real_gfx9<0x25e, "v_cvt_scalef32_sr_pk32_bf6_f16">; +defm V_CVT_SCALEF32_SR_PK32_BF6_F32 : VOP3_Real_gfx9<0x255, "v_cvt_scalef32_sr_pk32_bf6_f32">; +defm V_CVT_SCALEF32_SR_PK32_FP6_BF16 : VOP3_Real_gfx9<0x25d, "v_cvt_scalef32_sr_pk32_fp6_bf16">; +defm V_CVT_SCALEF32_SR_PK32_FP6_F16 : VOP3_Real_gfx9<0x25c, "v_cvt_scalef32_sr_pk32_fp6_f16">; +defm V_CVT_SCALEF32_SR_PK32_FP6_F32 : VOP3_Real_gfx9<0x254, "v_cvt_scalef32_sr_pk32_fp6_f32">; } let OtherPredicates = [HasF32ToF16BF16ConversionSRInsts] in { diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.ll new file mode 100644 index 00000000000000..3e9ac6cbe3ba6e --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.ll @@ -0,0 +1,636 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX950-SDAG %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX950-GISEL %s + +declare <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.bf6.bf16(<32 x bfloat> %src, i32 %sr, float %scale) +declare <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.bf6.f16(<32 x half> %src, i32 %sr, float %scale) +declare <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.bf6.f32(<32 x float> %src, i32 %sr, float %scale) +declare <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.fp6.bf16(<32 x bfloat> %src, i32 %sr, float %scale) +declare <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.fp6.f16(<32 x half> %src, i32 %sr, float %scale) +declare <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.fp6.f32(<32 x float> %src, i32 %sr, float %scale) + +define amdgpu_ps void @test_scalef32_sr_pk32_bf6_bf16_vv(<32 x bfloat> %src, i32 %sr, float %scale, ptr addrspace(1) %out) { +; GFX950-SDAG-LABEL: test_scalef32_sr_pk32_bf6_bf16_vv: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_cvt_scalef32_sr_pk32_bf6_bf16 v[0:5], v[0:15], v16, v17 +; GFX950-SDAG-NEXT: global_store_dwordx2 v[18:19], v[4:5], off offset:16 +; GFX950-SDAG-NEXT: global_store_dwordx4 v[18:19], v[0:3], off +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: test_scalef32_sr_pk32_bf6_bf16_vv: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v20, 16, v0 +; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v21, 16, v1 +; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v22, 16, v2 +; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v23, 16, v3 +; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v24, 16, v4 +; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v25, 16, v5 +; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v26, 16, v6 +; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v27, 16, v7 +; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v28, 16, v8 +; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v29, 16, v9 +; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v30, 16, v10 +; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v31, 16, v11 +; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v32, 16, v12 +; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v34, 16, v14 +; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v35, 16, v15 +; GFX950-GISEL-NEXT: v_mov_b32_sdwa v0, v20 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX950-GISEL-NEXT: v_mov_b32_sdwa v1, v21 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX950-GISEL-NEXT: v_mov_b32_sdwa v2, v22 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX950-GISEL-NEXT: v_mov_b32_sdwa v3, v23 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; ... [truncated] `````````` </details> https://github.com/llvm/llvm-project/pull/117820 _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits