llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-clang Author: Stanislav Mekhanoshin (rampitec) <details> <summary>Changes</summary> --- Patch is 122.29 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/151415.diff 28 Files Affected: - (modified) clang/include/clang/Basic/BuiltinsAMDGPU.def (+4) - (modified) clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl (+138) - (modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (+24) - (modified) llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp (+4) - (modified) llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp (+23-1) - (modified) llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp (+3) - (modified) llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp (+3) - (modified) llvm/lib/Target/AMDGPU/SIDefines.h (+1) - (modified) llvm/lib/Target/AMDGPU/SIFoldOperands.cpp (+1) - (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.cpp (+2) - (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.td (+1) - (modified) llvm/lib/Target/AMDGPU/SIRegisterInfo.td (+2) - (modified) llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp (+3) - (modified) llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h (+1) - (modified) llvm/lib/Target/AMDGPU/VOP3Instructions.td (+51) - (modified) llvm/lib/Target/AMDGPU/VOPInstructions.td (+16) - (added) llvm/test/CodeGen/AMDGPU/code-size-estimate-gfx1250.ll (+28) - (added) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.f16.ll (+539) - (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s (+145) - (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s (+145) - (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3_dpp16-fake16.s (+64) - (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3_dpp16.s (+64) - (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3_dpp8-fake16.s (+64) - (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3_dpp8.s (+64) - (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3_err.s (+25) - (modified) llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3.txt (+167-3) - (modified) llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_dpp16.txt (+64) - (modified) llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_dpp8.txt (+64) ``````````diff diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index ec00fadf3039a..172ac467f7cad 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -702,6 +702,10 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_f16_fp8, "hiIi", "nc", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_f16_bf8, "hiIi", "nc", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_f16_fp8, "V2hs", "nc", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_f16_bf8, "V2hs", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f16, "sV2h", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_bf8_f16, "sV2h", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f16, "ihiUiIi", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_bf8_f16, "ihiUiIi", "nc", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_sat_pk4_i4_i8, "UsUi", "nc", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_sat_pk4_u4_u8, "UsUi", "nc", "gfx1250-insts") diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl index 2595442ba7f9e..1c67fc3879bff 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl @@ -398,6 +398,144 @@ void test_cvt_pk_f16_bf8(global half2* out, short a) out[0] = __builtin_amdgcn_cvt_pk_f16_bf8(a); } +// CHECK-LABEL: @test_cvt_pk_bf8_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <2 x half>, align 4, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <2 x half> [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr [[A_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.amdgcn.cvt.pk.bf8.f16(<2 x half> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i16 [[TMP1]], ptr addrspace(1) [[TMP2]], align 2 +// CHECK-NEXT: ret void +// +void test_cvt_pk_bf8_f16(global short* out, half2 a) +{ + *out = __builtin_amdgcn_cvt_pk_bf8_f16(a); +} + +// CHECK-LABEL: @test_cvt_pk_fp8_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <2 x half>, align 4, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <2 x half> [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr [[A_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.amdgcn.cvt.pk.fp8.f16(<2 x half> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i16 [[TMP1]], ptr addrspace(1) [[TMP2]], align 2 +// CHECK-NEXT: ret void +// +void test_cvt_pk_fp8_f16(global short* out, half2 a) +{ + *out = __builtin_amdgcn_cvt_pk_fp8_f16(a); +} + +// CHECK-LABEL: @test_cvt_sr_bf8_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca half, align 2, addrspace(5) +// CHECK-NEXT: [[SR_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[OLD_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: [[SR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SR_ADDR]] to ptr +// CHECK-NEXT: [[OLD_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OLD_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store half [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2 +// CHECK-NEXT: store i32 [[SR:%.*]], ptr [[SR_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[OLD:%.*]], ptr [[OLD_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load half, ptr [[A_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.cvt.sr.bf8.f16(half [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], i32 0) +// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = load half, ptr [[A_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.cvt.sr.bf8.f16(half [[TMP5]], i32 [[TMP6]], i32 [[TMP7]], i32 1) +// CHECK-NEXT: [[TMP9:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[TMP8]], ptr addrspace(1) [[TMP9]], align 4 +// CHECK-NEXT: [[TMP10:%.*]] = load half, ptr [[A_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.cvt.sr.bf8.f16(half [[TMP10]], i32 [[TMP11]], i32 [[TMP12]], i32 2) +// CHECK-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[TMP13]], ptr addrspace(1) [[TMP14]], align 4 +// CHECK-NEXT: [[TMP15:%.*]] = load half, ptr [[A_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP18:%.*]] = call i32 @llvm.amdgcn.cvt.sr.bf8.f16(half [[TMP15]], i32 [[TMP16]], i32 [[TMP17]], i32 3) +// CHECK-NEXT: [[TMP19:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[TMP18]], ptr addrspace(1) [[TMP19]], align 4 +// CHECK-NEXT: ret void +// +void test_cvt_sr_bf8_f16(global int* out, half a, uint sr, int old) +{ + *out = __builtin_amdgcn_cvt_sr_bf8_f16(a, sr, old, 0); + *out = __builtin_amdgcn_cvt_sr_bf8_f16(a, sr, old, 1); + *out = __builtin_amdgcn_cvt_sr_bf8_f16(a, sr, old, 2); + *out = __builtin_amdgcn_cvt_sr_bf8_f16(a, sr, old, 3); +} + +// CHECK-LABEL: @test_cvt_sr_fp8_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca half, align 2, addrspace(5) +// CHECK-NEXT: [[SR_ADDR:%.*]] = alloca i16, align 2, addrspace(5) +// CHECK-NEXT: [[OLD_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: [[SR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SR_ADDR]] to ptr +// CHECK-NEXT: [[OLD_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OLD_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store half [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2 +// CHECK-NEXT: store i16 [[SR:%.*]], ptr [[SR_ADDR_ASCAST]], align 2 +// CHECK-NEXT: store i32 [[OLD:%.*]], ptr [[OLD_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load half, ptr [[A_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr [[SR_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP1]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.cvt.sr.fp8.f16(half [[TMP0]], i32 [[CONV]], i32 [[TMP2]], i32 0) +// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = load half, ptr [[A_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP6:%.*]] = load i16, ptr [[SR_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[CONV1:%.*]] = sext i16 [[TMP6]] to i32 +// CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.cvt.sr.fp8.f16(half [[TMP5]], i32 [[CONV1]], i32 [[TMP7]], i32 1) +// CHECK-NEXT: [[TMP9:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[TMP8]], ptr addrspace(1) [[TMP9]], align 4 +// CHECK-NEXT: [[TMP10:%.*]] = load half, ptr [[A_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP11:%.*]] = load i16, ptr [[SR_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[CONV2:%.*]] = sext i16 [[TMP11]] to i32 +// CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.cvt.sr.fp8.f16(half [[TMP10]], i32 [[CONV2]], i32 [[TMP12]], i32 2) +// CHECK-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[TMP13]], ptr addrspace(1) [[TMP14]], align 4 +// CHECK-NEXT: [[TMP15:%.*]] = load half, ptr [[A_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP16:%.*]] = load i16, ptr [[SR_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[CONV3:%.*]] = sext i16 [[TMP16]] to i32 +// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP18:%.*]] = call i32 @llvm.amdgcn.cvt.sr.fp8.f16(half [[TMP15]], i32 [[CONV3]], i32 [[TMP17]], i32 3) +// CHECK-NEXT: [[TMP19:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[TMP18]], ptr addrspace(1) [[TMP19]], align 4 +// CHECK-NEXT: ret void +// +void test_cvt_sr_fp8_f16(global int* out, half a, short sr, int old) +{ + *out = __builtin_amdgcn_cvt_sr_fp8_f16(a, sr, old, 0); + *out = __builtin_amdgcn_cvt_sr_fp8_f16(a, sr, old, 1); + *out = __builtin_amdgcn_cvt_sr_fp8_f16(a, sr, old, 2); + *out = __builtin_amdgcn_cvt_sr_fp8_f16(a, sr, old, 3); +} + // CHECK-LABEL: @test_sat_pk4_i4_i8( // CHECK-NEXT: entry: // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index c29f6c740f1a4..4a50558ca8e86 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -605,6 +605,30 @@ def int_amdgcn_cvt_pk_f16_bf8 : DefaultAttrsIntrinsic< [llvm_v2f16_ty], [llvm_i16_ty], [IntrNoMem, IntrSpeculatable] >, ClangBuiltin<"__builtin_amdgcn_cvt_pk_f16_bf8">; +def int_amdgcn_cvt_pk_fp8_f16 + : DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_v2f16_ty], + [IntrNoMem, IntrSpeculatable]>, + ClangBuiltin<"__builtin_amdgcn_cvt_pk_fp8_f16">; + +def int_amdgcn_cvt_pk_bf8_f16 + : DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_v2f16_ty], + [IntrNoMem, IntrSpeculatable]>, + ClangBuiltin<"__builtin_amdgcn_cvt_pk_bf8_f16">; + +// llvm.amdgcn.cvt.sr.fp8.f16 i32 vdst, half src, i32 seed, i32 old, imm byte_sel [0..3] +// byte_sel selects byte to write in vdst. +def int_amdgcn_cvt_sr_fp8_f16 : DefaultAttrsIntrinsic< + [llvm_i32_ty], [llvm_half_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>] +>, ClangBuiltin<"__builtin_amdgcn_cvt_sr_fp8_f16">; + +// llvm.amdgcn.cvt.sr.bf8.f16 i32 vdst, half src, i32 seed, i32 old, imm byte_sel [0..3] +// byte_sel selects byte to write in vdst. +def int_amdgcn_cvt_sr_bf8_f16 : DefaultAttrsIntrinsic< + [llvm_i32_ty], [llvm_half_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>] +>, ClangBuiltin<"__builtin_amdgcn_cvt_sr_bf8_f16">; + class AMDGPUCvtScaleF32Intrinsic<LLVMType DstTy, LLVMType Src0Ty, string name> : DefaultAttrsIntrinsic< [DstTy], [Src0Ty, llvm_float_ty], [IntrNoMem, IntrSpeculatable] >, ClangBuiltin<"__builtin_amdgcn_"#name>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 25f428ac6c8a9..6bca2fec17c6d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -4577,6 +4577,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_cvt_sr_pk_bf16_f32: case Intrinsic::amdgcn_cvt_pk_f16_fp8: case Intrinsic::amdgcn_cvt_pk_f16_bf8: + case Intrinsic::amdgcn_cvt_pk_fp8_f16: + case Intrinsic::amdgcn_cvt_pk_bf8_f16: + case Intrinsic::amdgcn_cvt_sr_fp8_f16: + case Intrinsic::amdgcn_cvt_sr_bf8_f16: case Intrinsic::amdgcn_sat_pk4_i4_i8: case Intrinsic::amdgcn_sat_pk4_u4_u8: case Intrinsic::amdgcn_fmed3: diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 44e65b3588888..a4ea8cf423f84 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -689,6 +689,8 @@ class AMDGPUOperand : public MCParsedAsmOperand { bool isVSrc_v2f16() const { return isVSrc_f16() || isLiteralImm(MVT::v2f16); } + bool isVSrc_NoInline_v2f16() const { return isVSrc_v2f16(); } + bool isVISrcB32() const { return isRegOrInlineNoMods(AMDGPU::VGPR_32RegClassID, MVT::i32); } @@ -2036,6 +2038,7 @@ static const fltSemantics *getOpFltSemantics(uint8_t OperandType) { case AMDGPU::OPERAND_REG_INLINE_C_FP16: case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: case AMDGPU::OPERAND_REG_IMM_V2FP16: + case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16: case AMDGPU::OPERAND_KIMM16: return &APFloat::IEEEhalf(); case AMDGPU::OPERAND_REG_IMM_BF16: @@ -2405,6 +2408,7 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: case AMDGPU::OPERAND_REG_IMM_V2INT16: case AMDGPU::OPERAND_REG_IMM_V2FP16: + case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16: case AMDGPU::OPERAND_REG_IMM_V2FP32: case AMDGPU::OPERAND_REG_IMM_V2INT32: case AMDGPU::OPERAND_KIMM32: @@ -2456,6 +2460,9 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo setImmKindConst(); return; } + [[fallthrough]]; + + case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16: Inst.addOperand(MCOperand::createImm(Lo_32(Val))); setImmKindLiteral(); @@ -3761,6 +3768,9 @@ bool AMDGPUAsmParser::isInlineConstant(const MCInst &Inst, OperandType == AMDGPU::OPERAND_REG_INLINE_C_BF16) return AMDGPU::isInlinableLiteralBF16(Val, hasInv2PiInlineImm()); + if (OperandType == AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16) + return false; + llvm_unreachable("invalid operand type"); } default: @@ -9421,7 +9431,19 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands, Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_dpp_gfx12 || Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_dpp8_gfx12 || Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_dpp_gfx12 || - Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_dpp8_gfx12)) { + Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_dpp8_gfx12 || + Opc == AMDGPU::V_CVT_SR_FP8_F16_t16_e64_dpp_gfx1250 || + Opc == AMDGPU::V_CVT_SR_FP8_F16_fake16_e64_dpp_gfx1250 || + Opc == AMDGPU::V_CVT_SR_FP8_F16_t16_e64_dpp8_gfx1250 || + Opc == AMDGPU::V_CVT_SR_FP8_F16_fake16_e64_dpp8_gfx1250 || + Opc == AMDGPU::V_CVT_SR_FP8_F16_t16_e64_gfx1250 || + Opc == AMDGPU::V_CVT_SR_FP8_F16_fake16_e64_gfx1250 || + Opc == AMDGPU::V_CVT_SR_BF8_F16_t16_e64_dpp_gfx1250 || + Opc == AMDGPU::V_CVT_SR_BF8_F16_fake16_e64_dpp_gfx1250 || + Opc == AMDGPU::V_CVT_SR_BF8_F16_t16_e64_dpp8_gfx1250 || + Opc == AMDGPU::V_CVT_SR_BF8_F16_fake16_e64_dpp8_gfx1250 || + Opc == AMDGPU::V_CVT_SR_BF8_F16_t16_e64_gfx1250 || + Opc == AMDGPU::V_CVT_SR_BF8_F16_fake16_e64_gfx1250)) { Inst.addOperand(Inst.getOperand(0)); } diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index 11b072e05a6e1..15088ac25863f 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -540,6 +540,8 @@ void AMDGPUInstPrinter::printImmediateV216(uint32_t Imm, uint8_t OpType, printImmediateBFloat16(static_cast<uint16_t>(Imm), STI, O)) return; break; + case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16: + break; default: llvm_unreachable("bad operand type"); } @@ -770,6 +772,7 @@ void AMDGPUInstPrinter::printRegularOperand(const MCInst *MI, unsigned OpNo, case AMDGPU::OPERAND_REG_IMM_V2INT16: case AMDGPU::OPERAND_REG_IMM_V2BF16: case AMDGPU::OPERAND_REG_IMM_V2FP16: + case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16: case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: case AMDGPU::OPERAND_REG_INLINE_C_V2BF16: case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp index c49ad798c1824..f3580842c6ff0 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp @@ -341,6 +341,9 @@ std::optional<uint64_t> AMDGPUMCCodeEmitter::getLitEncoding( return AMDGPU::getInlineEncodingV2BF16(static_cast<uint32_t>(Imm)) .value_or(255); + case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16: + return 255; + case AMDGPU::OPERAND_KIMM32: case AMDGPU::OPERAND_KIMM16: case AMDGPU::OPERAND_KIMM64: diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h index 40b8bcde005c0..c56414519a6fe 100644 --- a/llvm/lib/Target/AMDGPU/SIDefines.h +++ b/llvm/lib/Target/AMDGPU/SIDefines.h @@ -208,6 +208,7 @@ enum OperandType : unsigned { OPERAND_REG_IMM_V2BF16, OPERAND_REG_IMM_V2FP16, OPERAND_REG_IMM_V2INT16, + OPERAND_REG_IMM_NOINLINE_V2FP16, OPERAND_REG_IMM_V2INT32, OPERAND_REG_IMM_V2FP32, diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index b77da4d612dd4..e934152d08acb 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -468,6 +468,7 @@ bool SIFoldOperandsImpl::canUseImmWithOpSel(const MachineInstr *MI, case AMDGPU::OPERAND_REG_IMM_V2FP16: case AMDGPU::OPERAND_REG_IMM_V2BF16: case AMDGPU::OPERAND_REG_IMM_V2INT16: + case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16: case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: case AMDGPU::OPERAND_REG_INLINE_C_V2BF16: case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index c2da937552240..044a681bfed3d 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -4438,6 +4438,8 @@ bool SIInstrInfo::isInlineConstant(int64_t Imm, uint8_t OperandType) const { case AMDGPU::OPERAND_REG_IMM_V2BF16: case AMDGPU::OPERAND_REG_INLINE_C_V2BF16: return AMDGPU::isInlinableLiteralV2BF16(Imm); + case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16: + return false; case AMDGPU::OPERAND_REG_IMM_FP16: case AMDGPU::OPERAND_REG_INLINE_C_FP16: { if (isInt<16>(Imm) || isUInt<16>(Imm)) { diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 7324b6a347447..efcc88e564e65 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -2859,6 +2859,7 @@ def VOP_I16_F16 : VOPProf... [truncated] `````````` </details> https://github.com/llvm/llvm-project/pull/151415 _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits