https://github.com/jmmartinez updated https://github.com/llvm/llvm-project/pull/133741
From 41af38793161b0f1535c98c4695c36e081ef2f67 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Manuel=20Martinez=20Caama=C3=B1o?= <juama...@amd.com> Date: Thu, 27 Mar 2025 17:46:34 +0100 Subject: [PATCH 1/9] [Clang][AMDGPU] Add __builtin_amdgcn_cvt_off_f32_i4 This builtin maps to V_CVT_OFF_F32_I4 which treats its input as a 4-bit signed integer and returns 0.0625f * src . SWDEV-518861 --- clang/include/clang/Basic/BuiltinsAMDGPU.def | 1 + .../builtins-amdgcn-cvt-off-f32-i4.cl | 15 ++ .../builtins-amdgcn-cvt-off-f32-i4-err.cl | 8 + llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 6 + llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 1 + .../AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 9 + .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 1 + llvm/lib/Target/AMDGPU/VOP1Instructions.td | 5 + .../AMDGPU/llvm.amdgcn.cvt.off.f32.i4.ll | 23 +++ .../AMDGPU/simplify-amdgcn.cvt.off.f32.i4.ll | 158 ++++++++++++++++++ 10 files changed, 227 insertions(+) create mode 100644 clang/test/CodeGenOpenCL/builtins-amdgcn-cvt-off-f32-i4.cl create mode 100644 clang/test/SemaOpenCL/builtins-amdgcn-cvt-off-f32-i4-err.cl create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.off.f32.i4.ll create mode 100644 llvm/test/CodeGen/AMDGPU/simplify-amdgcn.cvt.off.f32.i4.ll diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 44ef404aee72f..f38148cc795dc 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -140,6 +140,7 @@ BUILTIN(__builtin_amdgcn_cvt_pknorm_u16, "E2Usff", "nc") BUILTIN(__builtin_amdgcn_cvt_pk_i16, "E2sii", "nc") BUILTIN(__builtin_amdgcn_cvt_pk_u16, "E2UsUiUi", "nc") BUILTIN(__builtin_amdgcn_cvt_pk_u8_f32, "UifUiUi", "nc") +BUILTIN(__builtin_amdgcn_cvt_off_f32_i4, "fUi", "nc") BUILTIN(__builtin_amdgcn_sad_u8, "UiUiUiUi", "nc") BUILTIN(__builtin_amdgcn_msad_u8, "UiUiUiUi", "nc") BUILTIN(__builtin_amdgcn_sad_hi_u8, "UiUiUiUi", "nc") diff --git 
a/clang/test/CodeGenOpenCL/builtins-amdgcn-cvt-off-f32-i4.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-cvt-off-f32-i4.cl new file mode 100644 index 0000000000000..6dc235f9cc6c7 --- /dev/null +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-cvt-off-f32-i4.cl @@ -0,0 +1,15 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// RUN: %clang_cc1 %s -O0 -triple amdgcn-amd-amdhsa -cl-std=CL1.2 \ +// RUN: -emit-llvm -o - | FileCheck %s + +// CHECK-LABEL: @test_builtin_amdgcn_cvt_off_f32_i4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: store i32 [[N:%.*]], ptr addrspace(5) [[N_ADDR]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[N_ADDR]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.amdgcn.cvt.off.f32.i4(i32 [[TMP0]]) +// CHECK-NEXT: ret float [[TMP1]] +// +float test_builtin_amdgcn_cvt_off_f32_i4(unsigned n) { + return __builtin_amdgcn_cvt_off_f32_i4(n); +} diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-cvt-off-f32-i4-err.cl b/clang/test/SemaOpenCL/builtins-amdgcn-cvt-off-f32-i4-err.cl new file mode 100644 index 0000000000000..f5b02b80c37da --- /dev/null +++ b/clang/test/SemaOpenCL/builtins-amdgcn-cvt-off-f32-i4-err.cl @@ -0,0 +1,8 @@ +// RUN: %clang_cc1 -triple amdgcn-- -verify -S -o - %s + +void test_builtin_amdgcn_cvt_off_f32_i4(unsigned n) { + struct A{ unsigned x; } a; + __builtin_amdgcn_cvt_off_f32_i4(n, n); // expected-error {{too many arguments to function call, expected 1, have 2}} + __builtin_amdgcn_cvt_off_f32_i4(); // expected-error {{too few arguments to function call, expected 1, have 0}} + __builtin_amdgcn_cvt_off_f32_i4(a); // expected-error {{passing '__private struct A' to parameter of incompatible type 'unsigned int'}} +} diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index f53016f62abbe..ebac0f9029791 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ 
b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -3375,6 +3375,12 @@ def int_amdgcn_cvt_sr_fp8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_sr_fp8_f32">, [llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<3>>]>; +// llvm.amdgcn.cvt.off.fp32.i4 int srcA +def int_amdgcn_cvt_off_f32_i4: ClangBuiltin<"__builtin_amdgcn_cvt_off_f32_i4">, + DefaultAttrsIntrinsic<[llvm_float_ty], + [llvm_i32_ty], + [IntrNoMem, IntrSpeculatable]>; + //===----------------------------------------------------------------------===// // gfx950 intrinsics //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 3246e575ea6a9..533ad349f7500 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -6042,6 +6042,7 @@ bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op, // TODO: Handle more intrinsics switch (IntrinsicID) { case Intrinsic::amdgcn_cubeid: + case Intrinsic::amdgcn_cvt_off_f32_i4: return true; case Intrinsic::amdgcn_frexp_mant: { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp index 7cd97e95b0189..535fda7393bc1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp @@ -729,6 +729,15 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { break; } + case Intrinsic::amdgcn_cvt_off_f32_i4: { + ConstantInt *CArg = dyn_cast<ConstantInt>(II.getArgOperand(0)); + if (!CArg) + break; + int CI4BitAsInt = CArg->getValue().trunc(4).getSExtValue(); + float ResVal = 0.0625 * CI4BitAsInt; + Constant *Res = ConstantFP::get(II.getType(), ResVal); + return IC.replaceInstUsesWith(II, Res); + } case Intrinsic::amdgcn_ubfe: case Intrinsic::amdgcn_sbfe: { // Decompose simple cases into standard shifts. 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 7df1e634b21ba..1d0e81db5a5db 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -4585,6 +4585,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_dot4_f32_bf8_bf8: case Intrinsic::amdgcn_cvt_f32_fp8: case Intrinsic::amdgcn_cvt_f32_bf8: + case Intrinsic::amdgcn_cvt_off_f32_i4: case Intrinsic::amdgcn_cvt_pk_f32_fp8: case Intrinsic::amdgcn_cvt_pk_f32_bf8: case Intrinsic::amdgcn_cvt_pk_fp8_f32: diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index def06c1e9a0d7..1dae2e432eb8c 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -1578,6 +1578,11 @@ class MovDPP8Pattern<Predicate Pred, Instruction Inst, ValueType vt> : GCNPat < let OtherPredicates = [Pred]; } +def : GCNPat < + (f32 (int_amdgcn_cvt_off_f32_i4 i32:$src)), + (V_CVT_OFF_F32_I4_e32 VGPR_32:$src) +>; + foreach vt = Reg32Types.types in { def : MovDPP8Pattern<isGFX10Only, V_MOV_B32_dpp8_gfx10, vt>; def : MovDPP8Pattern<isGFX11Only, V_MOV_B32_dpp8_gfx11, vt>; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.off.f32.i4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.off.f32.i4.ll new file mode 100644 index 0000000000000..a25d9c30a2331 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.off.f32.i4.ll @@ -0,0 +1,23 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs %s -o - | FileCheck %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs %s --global-isel -o - | FileCheck %s + +declare float @llvm.amdgcn.cvt.off.f32.i4(i32) + +define amdgpu_cs float @cvt_var(i32 %a) { +; CHECK-LABEL: cvt_var: +; CHECK: ; %bb.0: +; CHECK-NEXT: 
v_cvt_off_f32_i4_e32 v0, v0 +; CHECK-NEXT: ; return to shader part epilog + %ret = call float @llvm.amdgcn.cvt.off.f32.i4(i32 %a) + ret float %ret +} + +define amdgpu_cs float @cvt_imm() { +; CHECK-LABEL: cvt_imm: +; CHECK: ; %bb.0: +; CHECK-NEXT: v_cvt_off_f32_i4_e32 v0, 4 +; CHECK-NEXT: ; return to shader part epilog + %ret = call float @llvm.amdgcn.cvt.off.f32.i4(i32 4) + ret float %ret +} diff --git a/llvm/test/CodeGen/AMDGPU/simplify-amdgcn.cvt.off.f32.i4.ll b/llvm/test/CodeGen/AMDGPU/simplify-amdgcn.cvt.off.f32.i4.ll new file mode 100644 index 0000000000000..bac02bd61d0a9 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/simplify-amdgcn.cvt.off.f32.i4.ll @@ -0,0 +1,158 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=instcombine < %s | FileCheck %s + +declare float @llvm.amdgcn.cvt.off.f32.i4(i32) + +define float @cvt_var(i32 %a) { +; CHECK-LABEL: define float @cvt_var( +; CHECK-SAME: i32 [[A:%.*]]) { +; CHECK-NEXT: [[RET:%.*]] = call float @llvm.amdgcn.cvt.off.f32.i4(i32 [[A]]) +; CHECK-NEXT: ret float [[RET]] +; + %ret = call float @llvm.amdgcn.cvt.off.f32.i4(i32 %a) + ret float %ret +} + +define float @cvt_imm_0() { +; CHECK-LABEL: define float @cvt_imm_0() { +; CHECK-NEXT: ret float 0.000000e+00 +; + %ret = call float @llvm.amdgcn.cvt.off.f32.i4(i32 0) + ret float %ret +} + +define float @cvt_imm_1() { +; CHECK-LABEL: define float @cvt_imm_1() { +; CHECK-NEXT: ret float 6.250000e-02 +; + %ret = call float @llvm.amdgcn.cvt.off.f32.i4(i32 1) + ret float %ret +} + +define float @cvt_imm_2() { +; CHECK-LABEL: define float @cvt_imm_2() { +; CHECK-NEXT: ret float 1.250000e-01 +; + %ret = call float @llvm.amdgcn.cvt.off.f32.i4(i32 2) + ret float %ret +} + +define float @cvt_imm_3() { +; CHECK-LABEL: define float @cvt_imm_3() { +; CHECK-NEXT: ret float 1.875000e-01 +; + %ret = call float @llvm.amdgcn.cvt.off.f32.i4(i32 3) + ret float %ret +} + +define float 
@cvt_imm_4() { +; CHECK-LABEL: define float @cvt_imm_4() { +; CHECK-NEXT: ret float 2.500000e-01 +; + %ret = call float @llvm.amdgcn.cvt.off.f32.i4(i32 4) + ret float %ret +} + +define float @cvt_imm_5() { +; CHECK-LABEL: define float @cvt_imm_5() { +; CHECK-NEXT: ret float 3.125000e-01 +; + %ret = call float @llvm.amdgcn.cvt.off.f32.i4(i32 5) + ret float %ret +} + +define float @cvt_imm_6() { +; CHECK-LABEL: define float @cvt_imm_6() { +; CHECK-NEXT: ret float 3.750000e-01 +; + %ret = call float @llvm.amdgcn.cvt.off.f32.i4(i32 6) + ret float %ret +} + +define float @cvt_imm_7() { +; CHECK-LABEL: define float @cvt_imm_7() { +; CHECK-NEXT: ret float 4.375000e-01 +; + %ret = call float @llvm.amdgcn.cvt.off.f32.i4(i32 7) + ret float %ret +} + +define float @cvt_imm_8() { +; CHECK-LABEL: define float @cvt_imm_8() { +; CHECK-NEXT: ret float -5.000000e-01 +; + %ret = call float @llvm.amdgcn.cvt.off.f32.i4(i32 8) + ret float %ret +} + +define float @cvt_imm_9() { +; CHECK-LABEL: define float @cvt_imm_9() { +; CHECK-NEXT: ret float -4.375000e-01 +; + %ret = call float @llvm.amdgcn.cvt.off.f32.i4(i32 9) + ret float %ret +} + +define float @cvt_imm_10() { +; CHECK-LABEL: define float @cvt_imm_10() { +; CHECK-NEXT: ret float -3.750000e-01 +; + %ret = call float @llvm.amdgcn.cvt.off.f32.i4(i32 10) + ret float %ret +} + +define float @cvt_imm_11() { +; CHECK-LABEL: define float @cvt_imm_11() { +; CHECK-NEXT: ret float -3.125000e-01 +; + %ret = call float @llvm.amdgcn.cvt.off.f32.i4(i32 11) + ret float %ret +} + +define float @cvt_imm_12() { +; CHECK-LABEL: define float @cvt_imm_12() { +; CHECK-NEXT: ret float -2.500000e-01 +; + %ret = call float @llvm.amdgcn.cvt.off.f32.i4(i32 12) + ret float %ret +} + +define float @cvt_imm_13() { +; CHECK-LABEL: define float @cvt_imm_13() { +; CHECK-NEXT: ret float -1.875000e-01 +; + %ret = call float @llvm.amdgcn.cvt.off.f32.i4(i32 13) + ret float %ret +} + +define float @cvt_imm_14() { +; CHECK-LABEL: define float @cvt_imm_14() { +; 
CHECK-NEXT: ret float -1.250000e-01 +; + %ret = call float @llvm.amdgcn.cvt.off.f32.i4(i32 14) + ret float %ret +} + +define float @cvt_imm_15() { +; CHECK-LABEL: define float @cvt_imm_15() { +; CHECK-NEXT: ret float -6.250000e-02 +; + %ret = call float @llvm.amdgcn.cvt.off.f32.i4(i32 15) + ret float %ret +} + +define float @cvt_imm_underflow() { +; CHECK-LABEL: define float @cvt_imm_underflow() { +; CHECK-NEXT: ret float -6.250000e-02 +; + %ret = call float @llvm.amdgcn.cvt.off.f32.i4(i32 -1) + ret float %ret +} + +define float @cvt_imm_overflow() { +; CHECK-LABEL: define float @cvt_imm_overflow() { +; CHECK-NEXT: ret float 0.000000e+00 +; + %ret = call float @llvm.amdgcn.cvt.off.f32.i4(i32 16) + ret float %ret +} From dc78e8a6e014b1addf2e99af3cdd0bc6b979eb4c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Manuel=20Martinez=20Caama=C3=B1o?= <juama...@amd.com> Date: Tue, 1 Apr 2025 09:47:43 +0200 Subject: [PATCH 2/9] [Review] Turn builtin input from unsigned to signed --- clang/include/clang/Basic/BuiltinsAMDGPU.def | 2 +- clang/test/SemaOpenCL/builtins-amdgcn-cvt-off-f32-i4-err.cl | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index f38148cc795dc..b1480675753b1 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -140,7 +140,7 @@ BUILTIN(__builtin_amdgcn_cvt_pknorm_u16, "E2Usff", "nc") BUILTIN(__builtin_amdgcn_cvt_pk_i16, "E2sii", "nc") BUILTIN(__builtin_amdgcn_cvt_pk_u16, "E2UsUiUi", "nc") BUILTIN(__builtin_amdgcn_cvt_pk_u8_f32, "UifUiUi", "nc") -BUILTIN(__builtin_amdgcn_cvt_off_f32_i4, "fUi", "nc") +BUILTIN(__builtin_amdgcn_cvt_off_f32_i4, "fi", "nc") BUILTIN(__builtin_amdgcn_sad_u8, "UiUiUiUi", "nc") BUILTIN(__builtin_amdgcn_msad_u8, "UiUiUiUi", "nc") BUILTIN(__builtin_amdgcn_sad_hi_u8, "UiUiUiUi", "nc") diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-cvt-off-f32-i4-err.cl 
b/clang/test/SemaOpenCL/builtins-amdgcn-cvt-off-f32-i4-err.cl index f5b02b80c37da..30ffbfc130a94 100644 --- a/clang/test/SemaOpenCL/builtins-amdgcn-cvt-off-f32-i4-err.cl +++ b/clang/test/SemaOpenCL/builtins-amdgcn-cvt-off-f32-i4-err.cl @@ -1,8 +1,8 @@ // RUN: %clang_cc1 -triple amdgcn-- -verify -S -o - %s -void test_builtin_amdgcn_cvt_off_f32_i4(unsigned n) { +void test_builtin_amdgcn_cvt_off_f32_i4(int n) { struct A{ unsigned x; } a; __builtin_amdgcn_cvt_off_f32_i4(n, n); // expected-error {{too many arguments to function call, expected 1, have 2}} __builtin_amdgcn_cvt_off_f32_i4(); // expected-error {{too few arguments to function call, expected 1, have 0}} - __builtin_amdgcn_cvt_off_f32_i4(a); // expected-error {{passing '__private struct A' to parameter of incompatible type 'unsigned int'}} + __builtin_amdgcn_cvt_off_f32_i4(a); // expected-error {{passing '__private struct A' to parameter of incompatible type 'int'}} } From 1cf0a9c62b1039f1ff92667a84dff2f659f57110 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Manuel=20Martinez=20Caama=C3=B1o?= <juama...@amd.com> Date: Tue, 1 Apr 2025 10:45:43 +0200 Subject: [PATCH 3/9] [Review] Added signed OpenCL CodeGen test --- .../builtins-amdgcn-cvt-off-f32-i4.cl | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-cvt-off-f32-i4.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-cvt-off-f32-i4.cl index 6dc235f9cc6c7..f554d2f72f869 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-cvt-off-f32-i4.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-cvt-off-f32-i4.cl @@ -2,7 +2,7 @@ // RUN: %clang_cc1 %s -O0 -triple amdgcn-amd-amdhsa -cl-std=CL1.2 \ // RUN: -emit-llvm -o - | FileCheck %s -// CHECK-LABEL: @test_builtin_amdgcn_cvt_off_f32_i4( +// CHECK-LABEL: @test_builtin_amdgcn_cvt_off_f32_i4_ui( // CHECK-NEXT: entry: // CHECK-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5) // CHECK-NEXT: store i32 [[N:%.*]], ptr addrspace(5) [[N_ADDR]], align 4 
@@ -10,6 +10,18 @@ // CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.amdgcn.cvt.off.f32.i4(i32 [[TMP0]]) // CHECK-NEXT: ret float [[TMP1]] // -float test_builtin_amdgcn_cvt_off_f32_i4(unsigned n) { +float test_builtin_amdgcn_cvt_off_f32_i4_ui(unsigned n) { + return __builtin_amdgcn_cvt_off_f32_i4(n); +} + +// CHECK-LABEL: @test_builtin_amdgcn_cvt_off_f32_i4_i( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: store i32 [[N:%.*]], ptr addrspace(5) [[N_ADDR]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[N_ADDR]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.amdgcn.cvt.off.f32.i4(i32 [[TMP0]]) +// CHECK-NEXT: ret float [[TMP1]] +// +float test_builtin_amdgcn_cvt_off_f32_i4_i(int n) { return __builtin_amdgcn_cvt_off_f32_i4(n); } From c10a091e804fd896fdcacc50a49589b858af3eba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Manuel=20Martinez=20Caama=C3=B1o?= <juama...@amd.com> Date: Tue, 1 Apr 2025 15:33:29 +0200 Subject: [PATCH 4/9] [Review] Add line in the Release notes --- clang/docs/ReleaseNotes.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index daad01919ecd4..231bab37c1c91 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -165,6 +165,7 @@ Non-comprehensive list of changes in this release - Support parsing the `cc` operand modifier and alias it to the `c` modifier (#GH127719). - Added `__builtin_elementwise_exp10`. +- For AMDGPU targets, added `__builtin_amdgcn_cvt_off_f32_i4` that maps to the `v_cvt_off_f32_i4` instruction. 
New Compiler Flags ------------------ From 73efe546b97bb5d1d1dc83dcb1de7e7023276d2d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Manuel=20Martinez=20Caama=C3=B1o?= <juama...@amd.com> Date: Tue, 1 Apr 2025 16:39:37 +0200 Subject: [PATCH 5/9] [Review] Move pattern into instruction declaration --- llvm/lib/Target/AMDGPU/VOP1Instructions.td | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 1dae2e432eb8c..170e794af1b4d 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -317,7 +317,7 @@ defm V_CVT_F32_BF16 : VOP1Inst_t16 <"v_cvt_f32_bf16", VOP_F32_BF16>; let ReadsModeReg = 0, mayRaiseFPException = 0 in { defm V_CVT_RPI_I32_F32 : VOP1Inst <"v_cvt_rpi_i32_f32", VOP_I32_F32, cvt_rpi_i32_f32>; defm V_CVT_FLR_I32_F32 : VOP1Inst <"v_cvt_flr_i32_f32", VOP_I32_F32, cvt_flr_i32_f32>; -defm V_CVT_OFF_F32_I4 : VOP1Inst <"v_cvt_off_f32_i4", VOP1_F32_I32>; +defm V_CVT_OFF_F32_I4 : VOP1Inst <"v_cvt_off_f32_i4", VOP1_F32_I32, int_amdgcn_cvt_off_f32_i4>; } // End ReadsModeReg = 0, mayRaiseFPException = 0 } // End SchedRW = [WriteFloatCvt] @@ -1578,11 +1578,6 @@ class MovDPP8Pattern<Predicate Pred, Instruction Inst, ValueType vt> : GCNPat < let OtherPredicates = [Pred]; } -def : GCNPat < - (f32 (int_amdgcn_cvt_off_f32_i4 i32:$src)), - (V_CVT_OFF_F32_I4_e32 VGPR_32:$src) ->; - foreach vt = Reg32Types.types in { def : MovDPP8Pattern<isGFX10Only, V_MOV_B32_dpp8_gfx10, vt>; def : MovDPP8Pattern<isGFX11Only, V_MOV_B32_dpp8_gfx11, vt>; From 89520eec0984418ae776c11e5a6cb987048f4562 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Manuel=20Martinez=20Caama=C3=B1o?= <juama...@amd.com> Date: Tue, 1 Apr 2025 17:15:27 +0200 Subject: [PATCH 6/9] [Review] Handle undef & poison and update tests --- .../Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 10 ++++++++++ .../CodeGen/AMDGPU/llvm.amdgcn.cvt.off.f32.i4.ll | 8 ++++++-- 
.../AMDGPU/simplify-amdgcn.cvt.off.f32.i4.ll | 16 ++++++++++++++++ 3 files changed, 32 insertions(+), 2 deletions(-) rename llvm/test/{CodeGen => Transforms/InstCombine}/AMDGPU/simplify-amdgcn.cvt.off.f32.i4.ll (90%) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp index 535fda7393bc1..6c48037649cd7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp @@ -730,9 +730,19 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { break; } case Intrinsic::amdgcn_cvt_off_f32_i4: { + Value* Arg = II.getArgOperand(0); + Type *Ty = II.getType(); + + if (isa<PoisonValue>(Arg)) + return IC.replaceInstUsesWith(II, PoisonValue::get(Ty)); + + if(IC.getSimplifyQuery().isUndefValue(Arg)) + return IC.replaceInstUsesWith(II, Constant::getNullValue(Ty)); + ConstantInt *CArg = dyn_cast<ConstantInt>(II.getArgOperand(0)); if (!CArg) break; + int CI4BitAsInt = CArg->getValue().trunc(4).getSExtValue(); float ResVal = 0.0625 * CI4BitAsInt; Constant *Res = ConstantFP::get(II.getType(), ResVal); diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.off.f32.i4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.off.f32.i4.ll index a25d9c30a2331..e504eb7a5a124 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.off.f32.i4.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.off.f32.i4.ll @@ -1,6 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs %s -o - | FileCheck %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs %s --global-isel -o - | FileCheck %s +; RUN: llc --global-isel=0 -mtriple=amdgcn -mcpu=tahiti %s -o - | FileCheck %s +; RUN: llc --global-isel=1 -mtriple=amdgcn -mcpu=tahiti %s -o - | FileCheck %s +; RUN: llc --global-isel=0 -mtriple=amdgcn -mcpu=tonga %s -o - | FileCheck %s +; RUN: llc 
--global-isel=1 -mtriple=amdgcn -mcpu=tonga %s -o - | FileCheck %s +; RUN: llc --global-isel=0 -mtriple=amdgcn -mcpu=gfx90a %s -o - | FileCheck %s +; RUN: llc --global-isel=1 -mtriple=amdgcn -mcpu=gfx90a %s -o - | FileCheck %s declare float @llvm.amdgcn.cvt.off.f32.i4(i32) diff --git a/llvm/test/CodeGen/AMDGPU/simplify-amdgcn.cvt.off.f32.i4.ll b/llvm/test/Transforms/InstCombine/AMDGPU/simplify-amdgcn.cvt.off.f32.i4.ll similarity index 90% rename from llvm/test/CodeGen/AMDGPU/simplify-amdgcn.cvt.off.f32.i4.ll rename to llvm/test/Transforms/InstCombine/AMDGPU/simplify-amdgcn.cvt.off.f32.i4.ll index bac02bd61d0a9..0850c4e92ef02 100644 --- a/llvm/test/CodeGen/AMDGPU/simplify-amdgcn.cvt.off.f32.i4.ll +++ b/llvm/test/Transforms/InstCombine/AMDGPU/simplify-amdgcn.cvt.off.f32.i4.ll @@ -156,3 +156,19 @@ define float @cvt_imm_overflow() { %ret = call float @llvm.amdgcn.cvt.off.f32.i4(i32 16) ret float %ret } + +define float @cvt_imm_poison() { +; CHECK-LABEL: define float @cvt_imm_poison() { +; CHECK-NEXT: ret float poison +; + %ret = call float @llvm.amdgcn.cvt.off.f32.i4(i32 poison) + ret float %ret +} + +define float @cvt_imm_undef() { +; CHECK-LABEL: define float @cvt_imm_undef() { +; CHECK-NEXT: ret float 0.000000e+00 +; + %ret = call float @llvm.amdgcn.cvt.off.f32.i4(i32 undef) + ret float %ret +} From eea660c46bdc789be4ac02bb3089712d9d1b7e6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Manuel=20Martinez=20Caama=C3=B1o?= <juama...@amd.com> Date: Tue, 1 Apr 2025 17:16:59 +0200 Subject: [PATCH 7/9] CI4BitAsInt -> CArg4BitAsInt --- llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp index 6c48037649cd7..ab7d8b28152b8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp @@ -743,8 +743,8 @@ 
GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { if (!CArg) break; - int CI4BitAsInt = CArg->getValue().trunc(4).getSExtValue(); - float ResVal = 0.0625 * CI4BitAsInt; + int CArg4BitAsInt = CArg->getValue().trunc(4).getSExtValue(); + float ResVal = 0.0625 * CArg4BitAsInt; Constant *Res = ConstantFP::get(II.getType(), ResVal); return IC.replaceInstUsesWith(II, Res); } From 4b7a154af1afbc2105dd43189e99ff9ecdc17a93 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Manuel=20Martinez=20Caama=C3=B1o?= <juama...@amd.com> Date: Tue, 1 Apr 2025 17:26:35 +0200 Subject: [PATCH 8/9] [Review] Forgot the constexpr case --- .../AMDGPU/simplify-amdgcn.cvt.off.f32.i4.ll | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/simplify-amdgcn.cvt.off.f32.i4.ll b/llvm/test/Transforms/InstCombine/AMDGPU/simplify-amdgcn.cvt.off.f32.i4.ll index 0850c4e92ef02..1082c6ddb898b 100644 --- a/llvm/test/Transforms/InstCombine/AMDGPU/simplify-amdgcn.cvt.off.f32.i4.ll +++ b/llvm/test/Transforms/InstCombine/AMDGPU/simplify-amdgcn.cvt.off.f32.i4.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=instcombine < %s | FileCheck %s -declare float @llvm.amdgcn.cvt.off.f32.i4(i32) +@gv = constant i32 0 define float @cvt_var(i32 %a) { ; CHECK-LABEL: define float @cvt_var( @@ -157,18 +157,27 @@ define float @cvt_imm_overflow() { ret float %ret } -define float @cvt_imm_poison() { -; CHECK-LABEL: define float @cvt_imm_poison() { +define float @cvt_poison() { +; CHECK-LABEL: define float @cvt_poison() { ; CHECK-NEXT: ret float poison ; %ret = call float @llvm.amdgcn.cvt.off.f32.i4(i32 poison) ret float %ret } -define float @cvt_imm_undef() { -; CHECK-LABEL: define float @cvt_imm_undef() { +define float @cvt_undef() { +; CHECK-LABEL: define float @cvt_undef() { ; CHECK-NEXT: ret float 0.000000e+00 ; %ret 
= call float @llvm.amdgcn.cvt.off.f32.i4(i32 undef) ret float %ret } + +define float @cvt_constexpr() { +; CHECK-LABEL: define float @cvt_constexpr() { +; CHECK-NEXT: [[RET:%.*]] = call float @llvm.amdgcn.cvt.off.f32.i4(i32 ptrtoint (ptr @gv to i32)) +; CHECK-NEXT: ret float [[RET]] +; + %ret = call float @llvm.amdgcn.cvt.off.f32.i4(i32 ptrtoint (ptr @gv to i32)) + ret float %ret +} From 57f299e49f02c5de4fa0e8bbbf3452496f80e323 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Manuel=20Martinez=20Caama=C3=B1o?= <juama...@amd.com> Date: Tue, 1 Apr 2025 17:26:35 +0200 Subject: [PATCH 9/9] [Review] Tabulate the operation result to avoid operations that could depend on the host --- llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp index ab7d8b28152b8..2dd85afdfd2ca 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp @@ -743,9 +743,13 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { if (!CArg) break; - int CArg4BitAsInt = CArg->getValue().trunc(4).getSExtValue(); - float ResVal = 0.0625 * CArg4BitAsInt; - Constant *Res = ConstantFP::get(II.getType(), ResVal); + // Tabulated 0.0625 * (sext (CArg & 0xf)). + constexpr size_t ResValsSize = 16; + const float ResVals[ResValsSize] = { + 0.0, 0.0625, 0.125, 0.1875, 0.25, 0.3125, 0.375, 0.4375, + -0.5, -0.4375, -0.375, -0.3125, -0.25, -0.1875, -0.125, -0.0625}; + Constant *Res = + ConstantFP::get(Ty, ResVals[CArg->getZExtValue() % ResValsSize]); return IC.replaceInstUsesWith(II, Res); } case Intrinsic::amdgcn_ubfe: _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits