https://github.com/piotrAMD updated https://github.com/llvm/llvm-project/pull/77772
>From 1b2085465dd0988459a4c71dab6cd65b1de065be Mon Sep 17 00:00:00 2001 From: Piotr Sobczak <piotr.sobc...@amd.com> Date: Thu, 11 Jan 2024 14:52:59 +0100 Subject: [PATCH 1/3] [AMDGPU] Add global_load_tr for GFX12 Support new amdgcn_global_load_tr instructions for load with transpose. * MC layer support for GLOBAL_LOAD_TR_B64/GLOBAL_LOAD_TR_B128 * Intrinsics int_amdgcn_global_load_tr_b64/int_amdgcn_global_load_tr_b128 * Clang builtins amdgcn_global_load_tr_b64/amdgcn_global_load_tr_b128 --- clang/include/clang/Basic/BuiltinsAMDGPU.def | 7 ++ clang/lib/CodeGen/CGBuiltin.cpp | 45 ++++++++ ...uiltins-amdgcn-global-load-tr-gfx11-err.cl | 26 +++++ ...ins-amdgcn-global-load-tr-gfx12-w32-err.cl | 15 +++ ...ins-amdgcn-global-load-tr-gfx12-w64-err.cl | 16 +++ .../builtins-amdgcn-global-load-tr-w32.cl | 48 ++++++++ .../builtins-amdgcn-global-load-tr-w64.cl | 47 ++++++++ llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 21 ++++ .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 2 + .../Disassembler/AMDGPUDisassembler.cpp | 4 + llvm/lib/Target/AMDGPU/FLATInstructions.td | 33 ++++++ .../AMDGPU/llvm.amdgcn.global.load.tr-w32.ll | 106 ++++++++++++++++++ .../AMDGPU/llvm.amdgcn.global.load.tr-w64.ll | 106 ++++++++++++++++++ llvm/test/MC/AMDGPU/gfx11_unsupported.s | 6 + .../test/MC/AMDGPU/gfx12_asm_global_load_tr.s | 103 +++++++++++++++++ .../AMDGPU/gfx12_dasm_global_load_tr.txt | 34 ++++++ 16 files changed, 619 insertions(+) create mode 100644 clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx11-err.cl create mode 100644 clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w32-err.cl create mode 100644 clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w64-err.cl create mode 100644 clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w32.cl create mode 100644 clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w64.cl create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll create mode 100644 
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll create mode 100644 llvm/test/MC/AMDGPU/gfx12_asm_global_load_tr.s create mode 100644 llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_global_load_tr.txt diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index e562ef04a30194..098c309f808537 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -423,6 +423,13 @@ TARGET_BUILTIN(__builtin_amdgcn_s_wakeup_barrier, "vi", "n", "gfx12-insts") TARGET_BUILTIN(__builtin_amdgcn_s_barrier_leave, "b", "n", "gfx12-insts") TARGET_BUILTIN(__builtin_amdgcn_s_get_barrier_state, "Uii", "n", "gfx12-insts") +TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b64_v2i32, "V2iV2i*1", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v8i16, "V8sV8s*1", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v8f16, "V8hV8h*1", "nc", "gfx12-insts,wavefrontsize32") + +TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b64_i32, "ii*1", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v4i16, "V4sV4s*1", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v4f16, "V4hV4h*1", "nc", "gfx12-insts,wavefrontsize64") #undef BUILTIN #undef TARGET_BUILTIN diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 998fcc3af58175..dc634b1c388f46 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -18178,6 +18178,51 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, llvm::Function *F = CGM.getIntrinsic(IID, {ArgTy}); return Builder.CreateCall(F, {Addr, Val, ZeroI32, ZeroI32, ZeroI1}); } + case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32: + case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32: + case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16: + case 
AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16: + case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16: + case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4f16: { + + Intrinsic::ID IID; + llvm::Type *ArgTy; + switch (BuiltinID) { + case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32: + ArgTy = llvm::FixedVectorType::get( + llvm::Type::getInt32Ty(getLLVMContext()), 2); + IID = Intrinsic::amdgcn_global_load_tr_b64; + break; + case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32: + ArgTy = llvm::Type::getInt32Ty(getLLVMContext()); + IID = Intrinsic::amdgcn_global_load_tr_b64; + break; + case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16: + ArgTy = llvm::FixedVectorType::get( + llvm::Type::getInt16Ty(getLLVMContext()), 8); + IID = Intrinsic::amdgcn_global_load_tr_b128; + break; + case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16: + ArgTy = llvm::FixedVectorType::get( + llvm::Type::getHalfTy(getLLVMContext()), 8); + IID = Intrinsic::amdgcn_global_load_tr_b128; + break; + case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16: + ArgTy = llvm::FixedVectorType::get( + llvm::Type::getInt16Ty(getLLVMContext()), 4); + IID = Intrinsic::amdgcn_global_load_tr_b128; + break; + case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4f16: + ArgTy = llvm::FixedVectorType::get( + llvm::Type::getHalfTy(getLLVMContext()), 4); + IID = Intrinsic::amdgcn_global_load_tr_b128; + break; + } + + llvm::Value *Addr = EmitScalarExpr(E->getArg(0)); + llvm::Function *F = CGM.getIntrinsic(IID, {ArgTy}); + return Builder.CreateCall(F, {Addr}); + } case AMDGPU::BI__builtin_amdgcn_read_exec: return EmitAMDGCNBallotForExec(*this, E, Int64Ty, Int64Ty, false); case AMDGPU::BI__builtin_amdgcn_read_exec_lo: diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx11-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx11-err.cl new file mode 100644 index 00000000000000..10e2325cdea75c --- /dev/null +++ 
b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx11-err.cl @@ -0,0 +1,26 @@ +// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx1100 -emit-llvm \ +// RUN: -verify -S -o - %s + +// REQUIRES: amdgpu-registered-target + +typedef int v2i __attribute__((ext_vector_type(2))); +typedef half v8h __attribute__((ext_vector_type(8))); +typedef short v8s __attribute__((ext_vector_type(8))); + +typedef half v4h __attribute__((ext_vector_type(4))); +typedef short v4s __attribute__((ext_vector_type(4))); + + + +void amdgcn_global_load_tr(global v2i* v2i_inptr, global v8s* v8s_inptr, global v8h* v8h_inptr, + global int* int_inptr, global v4s* v4s_inptr, global v4h* v4h_inptr) +{ + v2i out_1 = __builtin_amdgcn_global_load_tr_b64_v2i32(v2i_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b64_v2i32' needs target feature gfx12-insts,wavefrontsize32}} + v8s out_2 = __builtin_amdgcn_global_load_tr_b128_v8i16(v8s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8i16' needs target feature gfx12-insts,wavefrontsize32}} + v8h out_3 = __builtin_amdgcn_global_load_tr_b128_v8f16(v8h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8f16' needs target feature gfx12-insts,wavefrontsize32}} + + int out_4 = __builtin_amdgcn_global_load_tr_b64_i32(int_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b64_i32' needs target feature gfx12-insts,wavefrontsize64}} + v4s out_5 = __builtin_amdgcn_global_load_tr_b128_v4i16(v4s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4i16' needs target feature gfx12-insts,wavefrontsize64}} + v4h out_6 = __builtin_amdgcn_global_load_tr_b128_v4f16(v4h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4f16' needs target feature gfx12-insts,wavefrontsize64}} +} + diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w32-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w32-err.cl new file mode 100644 
index 00000000000000..299a793a7b31e1 --- /dev/null +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w32-err.cl @@ -0,0 +1,15 @@ +// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx1200 -target-feature +wavefrontsize32 -emit-llvm \ +// RUN: -verify -S -o - %s + +// REQUIRES: amdgpu-registered-target + +typedef half v4h __attribute__((ext_vector_type(4))); +typedef short v4s __attribute__((ext_vector_type(4))); + +void amdgcn_global_load_tr(global int* int_inptr, global v4s* v4s_inptr, global v4h* v4h_inptr) +{ + int out_4 = __builtin_amdgcn_global_load_tr_b64_i32(int_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b64_i32' needs target feature gfx12-insts,wavefrontsize64}} + v4s out_5 = __builtin_amdgcn_global_load_tr_b128_v4i16(v4s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4i16' needs target feature gfx12-insts,wavefrontsize64}} + v4h out_6 = __builtin_amdgcn_global_load_tr_b128_v4f16(v4h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4f16' needs target feature gfx12-insts,wavefrontsize64}} +} + diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w64-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w64-err.cl new file mode 100644 index 00000000000000..79f374af240c7e --- /dev/null +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w64-err.cl @@ -0,0 +1,16 @@ +// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx1200 -target-feature +wavefrontsize64 -emit-llvm \ +// RUN: -verify -S -o - %s + +// REQUIRES: amdgpu-registered-target + +typedef int v2i __attribute__((ext_vector_type(2))); +typedef half v8h __attribute__((ext_vector_type(8))); +typedef short v8s __attribute__((ext_vector_type(8))); + +void amdgcn_global_load_tr(global v2i* v2i_inptr, global v8s* v8s_inptr, global v8h* v8h_inptr) +{ + v2i out_1 = __builtin_amdgcn_global_load_tr_b64_v2i32(v2i_inptr); // 
expected-error{{'__builtin_amdgcn_global_load_tr_b64_v2i32' needs target feature gfx12-insts,wavefrontsize32}} + v8s out_2 = __builtin_amdgcn_global_load_tr_b128_v8i16(v8s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8i16' needs target feature gfx12-insts,wavefrontsize32}} + v8h out_3 = __builtin_amdgcn_global_load_tr_b128_v8f16(v8h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8f16' needs target feature gfx12-insts,wavefrontsize32}} +} + diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w32.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w32.cl new file mode 100644 index 00000000000000..df523827e668d4 --- /dev/null +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w32.cl @@ -0,0 +1,48 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize32 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200 + +typedef int v2i __attribute__((ext_vector_type(2))); +typedef half v8h __attribute__((ext_vector_type(8))); +typedef short v8s __attribute__((ext_vector_type(8))); + +// Wave32 + +// +// amdgcn_global_load_tr_b64 +// + +// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b64_v2i32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32(ptr addrspace(1) [[INPTR:%.*]]) +// CHECK-GFX1200-NEXT: ret <2 x i32> [[TMP0]] +// +v2i test_amdgcn_global_load_tr_b64_v2i32(global v2i* inptr) +{ + return __builtin_amdgcn_global_load_tr_b64_v2i32(inptr); +} + +// +// amdgcn_global_load_tr_b128 +// + +// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b128_v8i16( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16(ptr addrspace(1) [[INPTR:%.*]]) +// CHECK-GFX1200-NEXT: ret <8 x 
i16> [[TMP0]] +// +v8s test_amdgcn_global_load_tr_b128_v8i16(global v8s* inptr) +{ + return __builtin_amdgcn_global_load_tr_b128_v8i16(inptr); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b128_v8f16( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.global.load.tr.b128.v8f16(ptr addrspace(1) [[INPTR:%.*]]) +// CHECK-GFX1200-NEXT: ret <8 x half> [[TMP0]] +// +v8h test_amdgcn_global_load_tr_b128_v8f16(global v8h* inptr) +{ + return __builtin_amdgcn_global_load_tr_b128_v8f16(inptr); +} + diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w64.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w64.cl new file mode 100644 index 00000000000000..06b51216407377 --- /dev/null +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w64.cl @@ -0,0 +1,47 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize64 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200 + +typedef half v4h __attribute__((ext_vector_type(4))); +typedef short v4s __attribute__((ext_vector_type(4))); + +// Wave64 + +// +// amdgcn_global_load_tr_b64 +// + +// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b64_i32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.global.load.tr.b64.i32(ptr addrspace(1) [[INPTR:%.*]]) +// CHECK-GFX1200-NEXT: ret i32 [[TMP0]] +// +int test_amdgcn_global_load_tr_b64_i32(global int* inptr) +{ + return __builtin_amdgcn_global_load_tr_b64_i32(inptr); +} + +// +// amdgcn_global_load_tr_b128 +// + +// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b128_v4i16( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i16> @llvm.amdgcn.global.load.tr.b128.v4i16(ptr addrspace(1) [[INPTR:%.*]]) +// CHECK-GFX1200-NEXT: ret <4 x i16> 
[TMP0]] +// +v4s test_amdgcn_global_load_tr_b128_v4i16(global v4s* inptr) +{ + return __builtin_amdgcn_global_load_tr_b128_v4i16(inptr); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b128_v4f16( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x half> @llvm.amdgcn.global.load.tr.b128.v4f16(ptr addrspace(1) [[INPTR:%.*]]) +// CHECK-GFX1200-NEXT: ret <4 x half> [[TMP0]] +// +v4h test_amdgcn_global_load_tr_b128_v4f16(global v4h* inptr) +{ + return __builtin_amdgcn_global_load_tr_b128_v4f16(inptr); +} + diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index e5596258847f9f..ad850c9c31490c 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2496,6 +2496,27 @@ def int_amdgcn_flat_atomic_fmax_num : AMDGPUAtomicRtn<llvm_anyfloat_ty>; def int_amdgcn_global_atomic_fmin_num : AMDGPUAtomicRtn<llvm_anyfloat_ty>; def int_amdgcn_global_atomic_fmax_num : AMDGPUAtomicRtn<llvm_anyfloat_ty>; +class AMDGPUGlobalLoadTr<LLVMType data_ty> : + Intrinsic< + [data_ty], + [global_ptr_ty], + [IntrReadMem, IntrWillReturn, IntrConvergent, NoCapture<ArgIndex<0>>, IntrNoCallback, IntrNoFree], + "", + [SDNPMemOperand] + >; + +// Wave32 +// <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32(ptr addrspace(1)) +// <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16(ptr addrspace(1)) +// <8 x half> @llvm.amdgcn.global.load.tr.b128.v8f16(ptr addrspace(1)) +// Wave64 +// i32 @llvm.amdgcn.global.load.tr.b64.i32(ptr addrspace(1)) +// <4 x i16> @llvm.amdgcn.global.load.tr.b128.v4i16(ptr addrspace(1)) +// <4 x half> @llvm.amdgcn.global.load.tr.b128.v4f16(ptr addrspace(1)) + +def int_amdgcn_global_load_tr_b64 : AMDGPUGlobalLoadTr<llvm_any_ty>; +def int_amdgcn_global_load_tr_b128 : AMDGPUGlobalLoadTr<llvm_any_ty>; + //===----------------------------------------------------------------------===// // Deep learning intrinsics. 
//===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 391c2b9ec256ea..0cfab44a7a0354 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -4837,6 +4837,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_global_atomic_fadd_v2bf16: case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: case Intrinsic::amdgcn_global_atomic_ordered_add_b64: + case Intrinsic::amdgcn_global_load_tr_b64: + case Intrinsic::amdgcn_global_load_tr_b128: return getDefaultMappingAllVGPR(MI); case Intrinsic::amdgcn_ds_ordered_add: case Intrinsic::amdgcn_ds_ordered_swap: diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index 9dff3f6c2efd02..441032a37dfd9e 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -544,6 +544,10 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, Res = tryDecodeInst(DecoderTableGFX1296, MI, DecW, Address, CS); if (Res) break; + + Res = tryDecodeInst(DecoderTableGFX12W6496, MI, DecW, Address, CS); + if (Res) + break; } // Reinitialize Bytes Bytes = Bytes_.slice(0, MaxInstBytesNum); diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index 16a8b770e0577d..47c3d806e487e3 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -995,6 +995,17 @@ defm SCRATCH_LOAD_LDS_DWORD : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_d } // End SubtargetPredicate = HasFlatScratchInsts +let SubtargetPredicate = isGFX12Plus in { + let WaveSizePredicate = isWave32 in { + defm GLOBAL_LOAD_TR_B128_w32 : FLAT_Global_Load_Pseudo 
<"global_load_tr_b128_w32", VReg_128>; + defm GLOBAL_LOAD_TR_B64_w32 : FLAT_Global_Load_Pseudo <"global_load_tr_b64_w32", VReg_64>; + } + let WaveSizePredicate = isWave64 in { + defm GLOBAL_LOAD_TR_B128_w64 : FLAT_Global_Load_Pseudo <"global_load_tr_b128_w64", VReg_64>; + defm GLOBAL_LOAD_TR_B64_w64 : FLAT_Global_Load_Pseudo <"global_load_tr_b64_w64", VGPR_32>; + } +} // End SubtargetPredicate = isGFX12Plus + let SubtargetPredicate = isGFX10Plus, is_flat_global = 1 in { defm GLOBAL_ATOMIC_FCMPSWAP : FLAT_Global_Atomic_Pseudo<"global_atomic_fcmpswap", VGPR_32, f32, v2f32, VReg_64>; @@ -1559,6 +1570,17 @@ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR_X2", "atomic_load_xor_global", i let OtherPredicates = [isGFX12Plus] in { defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_ORDERED_ADD_B64", "int_amdgcn_global_atomic_ordered_add_b64", i64, i64, /* isIntr */ 1>; + + let WaveSizePredicate = isWave32 in { + defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B64_w32, int_amdgcn_global_load_tr_b64, v2i32>; + defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w32, int_amdgcn_global_load_tr_b128, v8i16>; + defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w32, int_amdgcn_global_load_tr_b128, v8f16>; + } + let WaveSizePredicate = isWave64 in { + defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B64_w64, int_amdgcn_global_load_tr_b64, i32>; + defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w64, int_amdgcn_global_load_tr_b128, v4i16>; + defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w64, int_amdgcn_global_load_tr_b128, v4f16>; + } } let OtherPredicates = [isGFX10Plus] in { @@ -2686,6 +2708,17 @@ defm GLOBAL_ATOMIC_DEC_U64 : VGLOBAL_Real_Atomics_gfx12<0x04d, "GLOBAL_A defm GLOBAL_ATOMIC_MIN_NUM_F32 : VGLOBAL_Real_Atomics_gfx12<0x051, "GLOBAL_ATOMIC_FMIN", "global_atomic_min_num_f32", true, "global_atomic_min_f32">; defm GLOBAL_ATOMIC_MAX_NUM_F32 : VGLOBAL_Real_Atomics_gfx12<0x052, "GLOBAL_ATOMIC_FMAX", "global_atomic_max_num_f32", true, "global_atomic_max_f32">; defm GLOBAL_ATOMIC_ADD_F32 : 
VGLOBAL_Real_Atomics_gfx12<0x056, "GLOBAL_ATOMIC_ADD_F32", "global_atomic_add_f32">; + +let WaveSizePredicate = isWave32, DecoderNamespace = "GFX12" in { + defm GLOBAL_LOAD_TR_B128_w32 : VGLOBAL_Real_AllAddr_gfx12<0x057, "GLOBAL_LOAD_TR_B128_w32", "global_load_tr_b128">; + defm GLOBAL_LOAD_TR_B64_w32 : VGLOBAL_Real_AllAddr_gfx12<0x058, "GLOBAL_LOAD_TR_B64_w32", "global_load_tr_b64">; +} + +let WaveSizePredicate = isWave64, DecoderNamespace = "GFX12W64" in { + defm GLOBAL_LOAD_TR_B128_w64 : VGLOBAL_Real_AllAddr_gfx12<0x057, "GLOBAL_LOAD_TR_B128_w64", "global_load_tr_b128">; + defm GLOBAL_LOAD_TR_B64_w64 : VGLOBAL_Real_AllAddr_gfx12<0x058, "GLOBAL_LOAD_TR_B64_w64", "global_load_tr_b64">; +} + defm GLOBAL_ATOMIC_ORDERED_ADD_B64 : VGLOBAL_Real_Atomics_gfx12<0x073, "GLOBAL_ATOMIC_ORDERED_ADD_B64", "global_atomic_ordered_add_b64">; defm GLOBAL_INV : VFLAT_Real_Base_gfx12<0x02b, "GLOBAL_INV", "global_inv">; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll new file mode 100644 index 00000000000000..89a9138d4d2c62 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll @@ -0,0 +1,106 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12-SDAG-W32 %s +; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12-GISEL-W32 %s + +declare <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32.p1(ptr addrspace(1)) +declare <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16.p1(ptr addrspace(1)) +declare <8 x half> @llvm.amdgcn.global.load.tr.b128.v8f16.p1(ptr addrspace(1)) + +define amdgpu_kernel void @global_load_tr_b64(ptr addrspace(1) %addr, ptr addrspace(1) %use) { +; GFX12-SDAG-W32-LABEL: 
global_load_tr_b64: +; GFX12-SDAG-W32: ; %bb.0: ; %entry +; GFX12-SDAG-W32-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-W32-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-W32-NEXT: global_load_tr_b64 v[0:1], v2, s[0:1] offset:32 th:TH_LOAD_NT +; GFX12-SDAG-W32-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-W32-NEXT: global_inv scope:SCOPE_SYS +; GFX12-SDAG-W32-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-SDAG-W32-NEXT: s_nop 0 +; GFX12-SDAG-W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-W32-NEXT: s_endpgm +; +; GFX12-GISEL-W32-LABEL: global_load_tr_b64: +; GFX12-GISEL-W32: ; %bb.0: ; %entry +; GFX12-GISEL-W32-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-GISEL-W32-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-W32-NEXT: global_load_tr_b64 v[0:1], v2, s[0:1] offset:32 th:TH_LOAD_NT +; GFX12-GISEL-W32-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-W32-NEXT: global_inv scope:SCOPE_SYS +; GFX12-GISEL-W32-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-GISEL-W32-NEXT: s_nop 0 +; GFX12-GISEL-W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-W32-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 + %val = call <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32.p1(ptr addrspace(1) %gep) + store <2 x i32> %val, ptr addrspace(1) %use + ret void +} + +define amdgpu_kernel void @global_load_tr_b128_i16(ptr addrspace(1) %addr, ptr addrspace(1) %use) { +; GFX12-SDAG-W32-LABEL: global_load_tr_b128_i16: +; GFX12-SDAG-W32: ; %bb.0: ; %entry +; GFX12-SDAG-W32-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-W32-NEXT: v_mov_b32_e32 v4, 0 +; GFX12-SDAG-W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-W32-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32 th:TH_LOAD_NT +; GFX12-SDAG-W32-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-W32-NEXT: global_inv scope:SCOPE_SYS +; GFX12-SDAG-W32-NEXT: global_store_b128 v4, v[0:3], s[2:3] +; GFX12-SDAG-W32-NEXT: s_nop 
0 +; GFX12-SDAG-W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-W32-NEXT: s_endpgm +; +; GFX12-GISEL-W32-LABEL: global_load_tr_b128_i16: +; GFX12-GISEL-W32: ; %bb.0: ; %entry +; GFX12-GISEL-W32-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-GISEL-W32-NEXT: v_mov_b32_e32 v4, 0 +; GFX12-GISEL-W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-W32-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32 th:TH_LOAD_NT +; GFX12-GISEL-W32-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-W32-NEXT: global_inv scope:SCOPE_SYS +; GFX12-GISEL-W32-NEXT: global_store_b128 v4, v[0:3], s[2:3] +; GFX12-GISEL-W32-NEXT: s_nop 0 +; GFX12-GISEL-W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-W32-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 + %val = call <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16.p1(ptr addrspace(1) %gep) + store <8 x i16> %val, ptr addrspace(1) %use + ret void +} + +define amdgpu_kernel void @global_load_tr_b128_half(ptr addrspace(1) %addr, ptr addrspace(1) %use) { +; GFX12-SDAG-W32-LABEL: global_load_tr_b128_half: +; GFX12-SDAG-W32: ; %bb.0: ; %entry +; GFX12-SDAG-W32-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-W32-NEXT: v_mov_b32_e32 v4, 0 +; GFX12-SDAG-W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-W32-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32 th:TH_LOAD_NT +; GFX12-SDAG-W32-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-W32-NEXT: global_inv scope:SCOPE_SYS +; GFX12-SDAG-W32-NEXT: global_store_b128 v4, v[0:3], s[2:3] +; GFX12-SDAG-W32-NEXT: s_nop 0 +; GFX12-SDAG-W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-W32-NEXT: s_endpgm +; +; GFX12-GISEL-W32-LABEL: global_load_tr_b128_half: +; GFX12-GISEL-W32: ; %bb.0: ; %entry +; GFX12-GISEL-W32-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-GISEL-W32-NEXT: v_mov_b32_e32 v4, 0 +; GFX12-GISEL-W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-W32-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32 th:TH_LOAD_NT +; GFX12-GISEL-W32-NEXT: s_waitcnt 
vmcnt(0) +; GFX12-GISEL-W32-NEXT: global_inv scope:SCOPE_SYS +; GFX12-GISEL-W32-NEXT: global_store_b128 v4, v[0:3], s[2:3] +; GFX12-GISEL-W32-NEXT: s_nop 0 +; GFX12-GISEL-W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-W32-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 + %val = call <8 x half> @llvm.amdgcn.global.load.tr.b128.v8f16.p1(ptr addrspace(1) %gep) + store <8 x half> %val, ptr addrspace(1) %use + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll new file mode 100644 index 00000000000000..73dc4fa506c7ba --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll @@ -0,0 +1,106 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=-wavefrontsize32,+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12-SDAG-W64 %s +; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=-wavefrontsize32,+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12-GISEL-W64 %s + +declare i32 @llvm.amdgcn.global.load.tr.b64.i32.p1(ptr addrspace(1)) +declare <4 x i16> @llvm.amdgcn.global.load.tr.b128.v4i16.p1(ptr addrspace(1)) +declare <4 x half> @llvm.amdgcn.global.load.tr.b128.v4f16.p1(ptr addrspace(1)) + +define amdgpu_kernel void @global_load_tr_b64(ptr addrspace(1) %addr, ptr addrspace(1) %use) { +; GFX12-SDAG-W64-LABEL: global_load_tr_b64: +; GFX12-SDAG-W64: ; %bb.0: ; %entry +; GFX12-SDAG-W64-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-SDAG-W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-W64-NEXT: global_load_tr_b64 v1, v0, s[0:1] offset:32 th:TH_LOAD_NT +; GFX12-SDAG-W64-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-W64-NEXT: global_inv scope:SCOPE_SYS +; GFX12-SDAG-W64-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX12-SDAG-W64-NEXT: s_nop 0 +; 
GFX12-SDAG-W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-W64-NEXT: s_endpgm +; +; GFX12-GISEL-W64-LABEL: global_load_tr_b64: +; GFX12-GISEL-W64: ; %bb.0: ; %entry +; GFX12-GISEL-W64-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-GISEL-W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-GISEL-W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-W64-NEXT: global_load_tr_b64 v1, v0, s[0:1] offset:32 th:TH_LOAD_NT +; GFX12-GISEL-W64-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-W64-NEXT: global_inv scope:SCOPE_SYS +; GFX12-GISEL-W64-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX12-GISEL-W64-NEXT: s_nop 0 +; GFX12-GISEL-W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-W64-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 + %val = call i32 @llvm.amdgcn.global.load.tr.b64.i32.p1(ptr addrspace(1) %gep) + store i32 %val, ptr addrspace(1) %use + ret void +} + +define amdgpu_kernel void @global_load_tr_b128_i16(ptr addrspace(1) %addr, ptr addrspace(1) %use) { +; GFX12-SDAG-W64-LABEL: global_load_tr_b128_i16: +; GFX12-SDAG-W64: ; %bb.0: ; %entry +; GFX12-SDAG-W64-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-W64-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-W64-NEXT: global_load_tr_b128 v[0:1], v2, s[0:1] offset:32 th:TH_LOAD_NT +; GFX12-SDAG-W64-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-W64-NEXT: global_inv scope:SCOPE_SYS +; GFX12-SDAG-W64-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-SDAG-W64-NEXT: s_nop 0 +; GFX12-SDAG-W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-W64-NEXT: s_endpgm +; +; GFX12-GISEL-W64-LABEL: global_load_tr_b128_i16: +; GFX12-GISEL-W64: ; %bb.0: ; %entry +; GFX12-GISEL-W64-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-GISEL-W64-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-W64-NEXT: global_load_tr_b128 v[0:1], v2, s[0:1] offset:32 th:TH_LOAD_NT +; GFX12-GISEL-W64-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-W64-NEXT: 
global_inv scope:SCOPE_SYS +; GFX12-GISEL-W64-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-GISEL-W64-NEXT: s_nop 0 +; GFX12-GISEL-W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-W64-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 + %val = call <4 x i16> @llvm.amdgcn.global.load.tr.b128.v4i16.p1(ptr addrspace(1) %gep) + store <4 x i16> %val, ptr addrspace(1) %use + ret void +} + +define amdgpu_kernel void @global_load_tr_b128_half(ptr addrspace(1) %addr, ptr addrspace(1) %use) { +; GFX12-SDAG-W64-LABEL: global_load_tr_b128_half: +; GFX12-SDAG-W64: ; %bb.0: ; %entry +; GFX12-SDAG-W64-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-W64-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-SDAG-W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-W64-NEXT: global_load_tr_b128 v[0:1], v2, s[0:1] offset:32 th:TH_LOAD_NT +; GFX12-SDAG-W64-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-W64-NEXT: global_inv scope:SCOPE_SYS +; GFX12-SDAG-W64-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-SDAG-W64-NEXT: s_nop 0 +; GFX12-SDAG-W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-W64-NEXT: s_endpgm +; +; GFX12-GISEL-W64-LABEL: global_load_tr_b128_half: +; GFX12-GISEL-W64: ; %bb.0: ; %entry +; GFX12-GISEL-W64-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-GISEL-W64-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-W64-NEXT: global_load_tr_b128 v[0:1], v2, s[0:1] offset:32 th:TH_LOAD_NT +; GFX12-GISEL-W64-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-W64-NEXT: global_inv scope:SCOPE_SYS +; GFX12-GISEL-W64-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-GISEL-W64-NEXT: s_nop 0 +; GFX12-GISEL-W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-W64-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 + %val = call <4 x half> @llvm.amdgcn.global.load.tr.b128.v4f16.p1(ptr addrspace(1) %gep) + store <4 x half> %val, ptr addrspace(1) %use + ret void +} diff --git 
a/llvm/test/MC/AMDGPU/gfx11_unsupported.s b/llvm/test/MC/AMDGPU/gfx11_unsupported.s index e01eb05e85588d..ab7e97b482da9d 100644 --- a/llvm/test/MC/AMDGPU/gfx11_unsupported.s +++ b/llvm/test/MC/AMDGPU/gfx11_unsupported.s @@ -211,6 +211,12 @@ global_load_lds_ubyte v[2:3], off global_load_lds_ushort v[2:3], off // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU +global_load_tr_b128 v[1:4], v5, s[2:3] +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +global_load_tr_b64 v[1:2], v[3:4], off +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + image_atomic_fcmpswap v[1:2], v2, s[12:19] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_global_load_tr.s b/llvm/test/MC/AMDGPU/gfx12_asm_global_load_tr.s new file mode 100644 index 00000000000000..597e0d29b43afd --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx12_asm_global_load_tr.s @@ -0,0 +1,103 @@ +// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=W64 %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=W32 %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W64-ERR --implicit-check-not=error: %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=W32-ERR --implicit-check-not=error: %s + +global_load_tr_b128 v[1:4], v0, s[0:1] offset:-64 +// W64-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +// W32: encoding: [0x00,0xc0,0x15,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] + +global_load_tr_b128 v[1:4], v0, s[0:1] offset:64 +// W64-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid 
for this GPU or mode +// W32: encoding: [0x00,0xc0,0x15,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] + +global_load_tr_b128 v[1:4], v5, s[2:3] +// W64-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +// W32: encoding: [0x02,0xc0,0x15,0xee,0x01,0x00,0x00,0x00,0x05,0x00,0x00,0x00] + +global_load_tr_b128 v[1:4], v[0:1], off offset:-64 +// W64-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +// W32: encoding: [0x7c,0xc0,0x15,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] + +global_load_tr_b128 v[1:4], v[0:1], off offset:64 +// W64-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +// W32: encoding: [0x7c,0xc0,0x15,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] + +global_load_tr_b128 v[1:4], v[5:6], off +// W64-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +// W32: encoding: [0x7c,0xc0,0x15,0xee,0x01,0x00,0x00,0x00,0x05,0x00,0x00,0x00] + +global_load_tr_b64 v[1:2], v0, s[0:1] offset:-64 +// W64-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +// W32: encoding: [0x00,0x00,0x16,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] + +global_load_tr_b64 v[1:2], v0, s[0:1] offset:64 +// W64-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +// W32: encoding: [0x00,0x00,0x16,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] + +global_load_tr_b64 v[1:2], v3, s[2:3] +// W64-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +// W32: encoding: [0x02,0x00,0x16,0xee,0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00] + +global_load_tr_b64 v[1:2], v[0:1], off offset:-64 +// W64-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +// W32: encoding: [0x7c,0x00,0x16,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] + +global_load_tr_b64 v[1:2], v[0:1], off offset:64 +// W64-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +// 
W32: encoding: [0x7c,0x00,0x16,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] + +global_load_tr_b64 v[1:2], v[3:4], off +// W64-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +// W32: encoding: [0x7c,0x00,0x16,0xee,0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00] + + + +global_load_tr_b128 v[1:2], v0, s[0:1] offset:-64 +// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +// W64: encoding: [0x00,0xc0,0x15,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] + +global_load_tr_b128 v[1:2], v0, s[0:1] offset:64 +// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +// W64: encoding: [0x00,0xc0,0x15,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] + +global_load_tr_b128 v[1:2], v5, s[2:3] +// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +// W64: encoding: [0x02,0xc0,0x15,0xee,0x01,0x00,0x00,0x00,0x05,0x00,0x00,0x00] + +global_load_tr_b128 v[1:2], v[0:1], off offset:-64 +// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +// W64: encoding: [0x7c,0xc0,0x15,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] + +global_load_tr_b128 v[1:2], v[0:1], off offset:64 +// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +// W64: encoding: [0x7c,0xc0,0x15,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] + +global_load_tr_b128 v[1:2], v[5:6], off +// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +// W64: encoding: [0x7c,0xc0,0x15,0xee,0x01,0x00,0x00,0x00,0x05,0x00,0x00,0x00] + +global_load_tr_b64 v1, v0, s[0:1] offset:-64 +// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +// W64: encoding: [0x00,0x00,0x16,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] + +global_load_tr_b64 v1, v0, s[0:1] offset:64 +// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +// W64: encoding: 
[0x00,0x00,0x16,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] + +global_load_tr_b64 v1, v3, s[2:3] +// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +// W64: encoding: [0x02,0x00,0x16,0xee,0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00] + +global_load_tr_b64 v1, v[0:1], off offset:-64 +// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +// W64: encoding: [0x7c,0x00,0x16,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] + +global_load_tr_b64 v1, v[0:1], off offset:64 +// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +// W64: encoding: [0x7c,0x00,0x16,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] + +global_load_tr_b64 v1, v[3:4], off +// W32-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +// W64: encoding: [0x7c,0x00,0x16,0xee,0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00] + diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_global_load_tr.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_global_load_tr.txt new file mode 100644 index 00000000000000..e8498d4aef0a1b --- /dev/null +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_global_load_tr.txt @@ -0,0 +1,34 @@ +# RUN: llvm-mc -disassemble -arch=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=W64 %s +# RUN: llvm-mc -disassemble -arch=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=W32 %s + +# W32: global_load_tr_b128 v[1:4], v0, s[0:1] offset:64 ; encoding: [0x00,0xc0,0x15,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +# W64: global_load_tr_b128 v[1:2], v0, s[0:1] offset:64 ; encoding: [0x00,0xc0,0x15,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x00,0xc0,0x15,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# W32: global_load_tr_b128 v[1:4], v5, s[2:3] ; encoding: [0x02,0xc0,0x15,0xee,0x01,0x00,0x00,0x00,0x05,0x00,0x00,0x00] +# W64: global_load_tr_b128 v[1:2], 
v5, s[2:3] ; encoding: [0x02,0xc0,0x15,0xee,0x01,0x00,0x00,0x00,0x05,0x00,0x00,0x00] +0x02,0xc0,0x15,0xee,0x01,0x00,0x00,0x00,0x05,0x00,0x00,0x00 + +# W32: global_load_tr_b128 v[1:4], v[0:1], off offset:64 ; encoding: [0x7c,0xc0,0x15,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +# W64: global_load_tr_b128 v[1:2], v[0:1], off offset:64 ; encoding: [0x7c,0xc0,0x15,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x7c,0xc0,0x15,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# W32: global_load_tr_b128 v[1:4], v[5:6], off ; encoding: [0x7c,0xc0,0x15,0xee,0x01,0x00,0x00,0x00,0x05,0x00,0x00,0x00] +# W64: global_load_tr_b128 v[1:2], v[5:6], off ; encoding: [0x7c,0xc0,0x15,0xee,0x01,0x00,0x00,0x00,0x05,0x00,0x00,0x00] +0x7c,0xc0,0x15,0xee,0x01,0x00,0x00,0x00,0x05,0x00,0x00,0x00 + +# W32: global_load_tr_b64 v[1:2], v0, s[0:1] offset:64 ; encoding: [0x00,0x00,0x16,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +# W64: global_load_tr_b64 v1, v0, s[0:1] offset:64 ; encoding: [0x00,0x00,0x16,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x00,0x00,0x16,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# W32: global_load_tr_b64 v[1:2], v3, s[2:3] ; encoding: [0x02,0x00,0x16,0xee,0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00] +# W64: global_load_tr_b64 v1, v3, s[2:3] ; encoding: [0x02,0x00,0x16,0xee,0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00] +0x02,0x00,0x16,0xee,0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00 + +# W32: global_load_tr_b64 v[1:2], v[0:1], off offset:64 ; encoding: [0x7c,0x00,0x16,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +# W64: global_load_tr_b64 v1, v[0:1], off offset:64 ; encoding: [0x7c,0x00,0x16,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x7c,0x00,0x16,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# W32: global_load_tr_b64 v[1:2], v[3:4], off ; encoding: [0x7c,0x00,0x16,0xee,0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00] +# W64: global_load_tr_b64 v1, v[3:4], off ; encoding: [0x7c,0x00,0x16,0xee,0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00] 
+0x7c,0x00,0x16,0xee,0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00 >From b49e50fc0162daadb163c9773ea9d23e76196daf Mon Sep 17 00:00:00 2001 From: Piotr Sobczak <piotr.sobc...@amd.com> Date: Fri, 12 Jan 2024 13:33:57 +0100 Subject: [PATCH 2/3] Common up intrinsic variants --- clang/include/clang/Basic/BuiltinsAMDGPU.def | 12 ++--- clang/lib/CodeGen/CGBuiltin.cpp | 45 ++++++++----------- ...uiltins-amdgcn-global-load-tr-gfx11-err.cl | 12 ++--- ...ins-amdgcn-global-load-tr-gfx12-w32-err.cl | 6 +-- ...ins-amdgcn-global-load-tr-gfx12-w64-err.cl | 6 +-- .../builtins-amdgcn-global-load-tr-w32.cl | 28 ++++++------ .../builtins-amdgcn-global-load-tr-w64.cl | 28 ++++++------ llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 15 +++---- .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 3 +- llvm/lib/Target/AMDGPU/FLATInstructions.td | 12 ++--- .../AMDGPU/llvm.amdgcn.global.load.tr-w32.ll | 12 ++--- .../AMDGPU/llvm.amdgcn.global.load.tr-w64.ll | 12 ++--- 12 files changed, 91 insertions(+), 100 deletions(-) diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 098c309f808537..9b745819454aef 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -423,13 +423,13 @@ TARGET_BUILTIN(__builtin_amdgcn_s_wakeup_barrier, "vi", "n", "gfx12-insts") TARGET_BUILTIN(__builtin_amdgcn_s_barrier_leave, "b", "n", "gfx12-insts") TARGET_BUILTIN(__builtin_amdgcn_s_get_barrier_state, "Uii", "n", "gfx12-insts") -TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b64_v2i32, "V2iV2i*1", "nc", "gfx12-insts,wavefrontsize32") -TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v8i16, "V8sV8s*1", "nc", "gfx12-insts,wavefrontsize32") -TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v8f16, "V8hV8h*1", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_v2i32, "V2iV2i*1", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_v8i16, "V8sV8s*1", "nc", 
"gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_v8f16, "V8hV8h*1", "nc", "gfx12-insts,wavefrontsize32") -TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b64_i32, "ii*1", "nc", "gfx12-insts,wavefrontsize64") -TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v4i16, "V4sV4s*1", "nc", "gfx12-insts,wavefrontsize64") -TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v4f16, "V4hV4h*1", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_i32, "ii*1", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_v4i16, "V4sV4s*1", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_v4f16, "V4hV4h*1", "nc", "gfx12-insts,wavefrontsize64") #undef BUILTIN #undef TARGET_BUILTIN diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index dc634b1c388f46..f9794ebd6be33b 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -18178,49 +18178,42 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, llvm::Function *F = CGM.getIntrinsic(IID, {ArgTy}); return Builder.CreateCall(F, {Addr, Val, ZeroI32, ZeroI32, ZeroI1}); } - case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32: - case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32: - case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16: - case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16: - case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16: - case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4f16: { + case AMDGPU::BI__builtin_amdgcn_global_load_tr_i32: + case AMDGPU::BI__builtin_amdgcn_global_load_tr_v2i32: + case AMDGPU::BI__builtin_amdgcn_global_load_tr_v4f16: + case AMDGPU::BI__builtin_amdgcn_global_load_tr_v4i16: + case AMDGPU::BI__builtin_amdgcn_global_load_tr_v8f16: + case AMDGPU::BI__builtin_amdgcn_global_load_tr_v8i16: { - Intrinsic::ID IID; llvm::Type *ArgTy; switch (BuiltinID) { - case 
AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32: - ArgTy = llvm::FixedVectorType::get( - llvm::Type::getInt32Ty(getLLVMContext()), 2); - IID = Intrinsic::amdgcn_global_load_tr_b64; - break; - case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32: + case AMDGPU::BI__builtin_amdgcn_global_load_tr_i32: ArgTy = llvm::Type::getInt32Ty(getLLVMContext()); - IID = Intrinsic::amdgcn_global_load_tr_b64; break; - case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16: + case AMDGPU::BI__builtin_amdgcn_global_load_tr_v2i32: ArgTy = llvm::FixedVectorType::get( - llvm::Type::getInt16Ty(getLLVMContext()), 8); - IID = Intrinsic::amdgcn_global_load_tr_b128; + llvm::Type::getInt32Ty(getLLVMContext()), 2); break; - case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16: + case AMDGPU::BI__builtin_amdgcn_global_load_tr_v4f16: ArgTy = llvm::FixedVectorType::get( - llvm::Type::getHalfTy(getLLVMContext()), 8); - IID = Intrinsic::amdgcn_global_load_tr_b128; + llvm::Type::getHalfTy(getLLVMContext()), 4); break; - case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16: + case AMDGPU::BI__builtin_amdgcn_global_load_tr_v4i16: ArgTy = llvm::FixedVectorType::get( llvm::Type::getInt16Ty(getLLVMContext()), 4); - IID = Intrinsic::amdgcn_global_load_tr_b128; break; - case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4f16: + case AMDGPU::BI__builtin_amdgcn_global_load_tr_v8f16: ArgTy = llvm::FixedVectorType::get( - llvm::Type::getHalfTy(getLLVMContext()), 4); - IID = Intrinsic::amdgcn_global_load_tr_b128; + llvm::Type::getHalfTy(getLLVMContext()), 8); + break; + case AMDGPU::BI__builtin_amdgcn_global_load_tr_v8i16: + ArgTy = llvm::FixedVectorType::get( + llvm::Type::getInt16Ty(getLLVMContext()), 8); break; } llvm::Value *Addr = EmitScalarExpr(E->getArg(0)); - llvm::Function *F = CGM.getIntrinsic(IID, {ArgTy}); + llvm::Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_global_load_tr, {ArgTy}); return Builder.CreateCall(F, {Addr}); } case AMDGPU::BI__builtin_amdgcn_read_exec: diff 
--git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx11-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx11-err.cl index 10e2325cdea75c..f7afb7cb97edad 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx11-err.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx11-err.cl @@ -15,12 +15,12 @@ typedef short v4s __attribute__((ext_vector_type(4))); void amdgcn_global_load_tr(global v2i* v2i_inptr, global v8s* v8s_inptr, global v8h* v8h_inptr, global int* int_inptr, global v4s* v4s_inptr, global v4h* v4h_inptr) { - v2i out_1 = __builtin_amdgcn_global_load_tr_b64_v2i32(v2i_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b64_v2i32' needs target feature gfx12-insts,wavefrontsize32}} - v8s out_2 = __builtin_amdgcn_global_load_tr_b128_v8i16(v8s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8i16' needs target feature gfx12-insts,wavefrontsize32}} - v8h out_3 = __builtin_amdgcn_global_load_tr_b128_v8f16(v8h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8f16' needs target feature gfx12-insts,wavefrontsize32}} + v2i out_1 = __builtin_amdgcn_global_load_tr_v2i32(v2i_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_v2i32' needs target feature gfx12-insts,wavefrontsize32}} + v8s out_2 = __builtin_amdgcn_global_load_tr_v8i16(v8s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_v8i16' needs target feature gfx12-insts,wavefrontsize32}} + v8h out_3 = __builtin_amdgcn_global_load_tr_v8f16(v8h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_v8f16' needs target feature gfx12-insts,wavefrontsize32}} - int out_4 = __builtin_amdgcn_global_load_tr_b64_i32(int_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b64_i32' needs target feature gfx12-insts,wavefrontsize64}} - v4s out_5 = __builtin_amdgcn_global_load_tr_b128_v4i16(v4s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4i16' needs target feature 
gfx12-insts,wavefrontsize64}} - v4h out_6 = __builtin_amdgcn_global_load_tr_b128_v4f16(v4h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4f16' needs target feature gfx12-insts,wavefrontsize64}} + int out_4 = __builtin_amdgcn_global_load_tr_i32(int_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_i32' needs target feature gfx12-insts,wavefrontsize64}} + v4s out_5 = __builtin_amdgcn_global_load_tr_v4i16(v4s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_v4i16' needs target feature gfx12-insts,wavefrontsize64}} + v4h out_6 = __builtin_amdgcn_global_load_tr_v4f16(v4h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_v4f16' needs target feature gfx12-insts,wavefrontsize64}} } diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w32-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w32-err.cl index 299a793a7b31e1..04ac0a66db7ce7 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w32-err.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w32-err.cl @@ -8,8 +8,8 @@ typedef short v4s __attribute__((ext_vector_type(4))); void amdgcn_global_load_tr(global int* int_inptr, global v4s* v4s_inptr, global v4h* v4h_inptr) { - int out_4 = __builtin_amdgcn_global_load_tr_b64_i32(int_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b64_i32' needs target feature gfx12-insts,wavefrontsize64}} - v4s out_5 = __builtin_amdgcn_global_load_tr_b128_v4i16(v4s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4i16' needs target feature gfx12-insts,wavefrontsize64}} - v4h out_6 = __builtin_amdgcn_global_load_tr_b128_v4f16(v4h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4f16' needs target feature gfx12-insts,wavefrontsize64}} + int out_4 = __builtin_amdgcn_global_load_tr_i32(int_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_i32' needs target feature gfx12-insts,wavefrontsize64}} + v4s out_5 = 
__builtin_amdgcn_global_load_tr_v4i16(v4s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_v4i16' needs target feature gfx12-insts,wavefrontsize64}} + v4h out_6 = __builtin_amdgcn_global_load_tr_v4f16(v4h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_v4f16' needs target feature gfx12-insts,wavefrontsize64}} } diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w64-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w64-err.cl index 79f374af240c7e..113b54b853a9f4 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w64-err.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w64-err.cl @@ -9,8 +9,8 @@ typedef short v8s __attribute__((ext_vector_type(8))); void amdgcn_global_load_tr(global v2i* v2i_inptr, global v8s* v8s_inptr, global v8h* v8h_inptr) { - v2i out_1 = __builtin_amdgcn_global_load_tr_b64_v2i32(v2i_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b64_v2i32' needs target feature gfx12-insts,wavefrontsize32}} - v8s out_2 = __builtin_amdgcn_global_load_tr_b128_v8i16(v8s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8i16' needs target feature gfx12-insts,wavefrontsize32}} - v8h out_3 = __builtin_amdgcn_global_load_tr_b128_v8f16(v8h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8f16' needs target feature gfx12-insts,wavefrontsize32}} + v2i out_1 = __builtin_amdgcn_global_load_tr_v2i32(v2i_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_v2i32' needs target feature gfx12-insts,wavefrontsize32}} + v8s out_2 = __builtin_amdgcn_global_load_tr_v8i16(v8s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_v8i16' needs target feature gfx12-insts,wavefrontsize32}} + v8h out_3 = __builtin_amdgcn_global_load_tr_v8f16(v8h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_v8f16' needs target feature gfx12-insts,wavefrontsize32}} } diff --git 
a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w32.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w32.cl index df523827e668d4..b5fcad68a47020 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w32.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w32.cl @@ -9,40 +9,40 @@ typedef short v8s __attribute__((ext_vector_type(8))); // Wave32 // -// amdgcn_global_load_tr_b64 +// amdgcn_global_load_tr // -// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b64_v2i32( +// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_v2i32( // CHECK-GFX1200-NEXT: entry: -// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32(ptr addrspace(1) [[INPTR:%.*]]) +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <2 x i32> @llvm.amdgcn.global.load.tr.v2i32(ptr addrspace(1) [[INPTR:%.*]]) // CHECK-GFX1200-NEXT: ret <2 x i32> [[TMP0]] // -v2i test_amdgcn_global_load_tr_b64_v2i32(global v2i* inptr) +v2i test_amdgcn_global_load_tr_v2i32(global v2i* inptr) { - return __builtin_amdgcn_global_load_tr_b64_v2i32(inptr); + return __builtin_amdgcn_global_load_tr_v2i32(inptr); } // -// amdgcn_global_load_tr_b128 +// amdgcn_global_load_tr // -// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b128_v8i16( +// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_v8i16( // CHECK-GFX1200-NEXT: entry: -// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16(ptr addrspace(1) [[INPTR:%.*]]) +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.global.load.tr.v8i16(ptr addrspace(1) [[INPTR:%.*]]) // CHECK-GFX1200-NEXT: ret <8 x i16> [[TMP0]] // -v8s test_amdgcn_global_load_tr_b128_v8i16(global v8s* inptr) +v8s test_amdgcn_global_load_tr_v8i16(global v8s* inptr) { - return __builtin_amdgcn_global_load_tr_b128_v8i16(inptr); + return __builtin_amdgcn_global_load_tr_v8i16(inptr); } -// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b128_v8f16( +// CHECK-GFX1200-LABEL: 
@test_amdgcn_global_load_tr_v8f16( // CHECK-GFX1200-NEXT: entry: -// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.global.load.tr.b128.v8f16(ptr addrspace(1) [[INPTR:%.*]]) +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.global.load.tr.v8f16(ptr addrspace(1) [[INPTR:%.*]]) // CHECK-GFX1200-NEXT: ret <8 x half> [[TMP0]] // -v8h test_amdgcn_global_load_tr_b128_v8f16(global v8h* inptr) +v8h test_amdgcn_global_load_tr_v8f16(global v8h* inptr) { - return __builtin_amdgcn_global_load_tr_b128_v8f16(inptr); + return __builtin_amdgcn_global_load_tr_v8f16(inptr); } diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w64.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w64.cl index 06b51216407377..9c48ac071b4d3f 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w64.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w64.cl @@ -8,40 +8,40 @@ typedef short v4s __attribute__((ext_vector_type(4))); // Wave64 // -// amdgcn_global_load_tr_b64 +// amdgcn_global_load_tr // -// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b64_i32( +// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_i32( // CHECK-GFX1200-NEXT: entry: -// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.global.load.tr.b64.i32(ptr addrspace(1) [[INPTR:%.*]]) +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.global.load.tr.i32(ptr addrspace(1) [[INPTR:%.*]]) // CHECK-GFX1200-NEXT: ret i32 [[TMP0]] // -int test_amdgcn_global_load_tr_b64_i32(global int* inptr) +int test_amdgcn_global_load_tr_i32(global int* inptr) { - return __builtin_amdgcn_global_load_tr_b64_i32(inptr); + return __builtin_amdgcn_global_load_tr_i32(inptr); } // -// amdgcn_global_load_tr_b128 +// amdgcn_global_load_tr // -// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b128_v4i16( +// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_v4i16( // CHECK-GFX1200-NEXT: entry: -// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail 
call <4 x i16> @llvm.amdgcn.global.load.tr.b128.v4i16(ptr addrspace(1) [[INPTR:%.*]]) +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i16> @llvm.amdgcn.global.load.tr.v4i16(ptr addrspace(1) [[INPTR:%.*]]) // CHECK-GFX1200-NEXT: ret <4 x i16> [[TMP0]] // -v4s test_amdgcn_global_load_tr_b128_v4i16(global v4s* inptr) +v4s test_amdgcn_global_load_tr_v4i16(global v4s* inptr) { - return __builtin_amdgcn_global_load_tr_b128_v4i16(inptr); + return __builtin_amdgcn_global_load_tr_v4i16(inptr); } -// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b128_v4f16( +// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_v4f16( // CHECK-GFX1200-NEXT: entry: -// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x half> @llvm.amdgcn.global.load.tr.b128.v4f16(ptr addrspace(1) [[INPTR:%.*]]) +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x half> @llvm.amdgcn.global.load.tr.v4f16(ptr addrspace(1) [[INPTR:%.*]]) // CHECK-GFX1200-NEXT: ret <4 x half> [[TMP0]] // -v4h test_amdgcn_global_load_tr_b128_v4f16(global v4h* inptr) +v4h test_amdgcn_global_load_tr_v4f16(global v4h* inptr) { - return __builtin_amdgcn_global_load_tr_b128_v4f16(inptr); + return __builtin_amdgcn_global_load_tr_v4f16(inptr); } diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index ad850c9c31490c..3a419b4b56caf2 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2506,16 +2506,15 @@ class AMDGPUGlobalLoadTr<LLVMType data_ty> : >; // Wave32 -// <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32(ptr addrspace(1)) -// <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16(ptr addrspace(1)) -// <8 x half> @llvm.amdgcn.global.load.tr.b128.v8f16(ptr addrspace(1)) +// <2 x i32> @llvm.amdgcn.global.load.tr.v2i32(ptr addrspace(1)) -> global_load_tr_b64 +// <8 x i16> @llvm.amdgcn.global.load.tr.v8i16(ptr addrspace(1)) -> global_load_tr_b128 +// <8 x half> @llvm.amdgcn.global.load.tr.v8f16(ptr addrspace(1)) -> global_load_tr_b128 // 
Wave64 -// <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32(ptr addrspace(1)) -// <4 x i16> @llvm.amdgcn.global.load.tr.b128.v4i16(ptr addrspace(1)) -// <4 x half> @llvm.amdgcn.global.load.tr.b128.v4f16(ptr addrspace(1)) +// i32 @llvm.amdgcn.global.load.tr.i32(ptr addrspace(1)) -> global_load_tr_b64 +// <4 x i16> @llvm.amdgcn.global.load.tr.v4i16(ptr addrspace(1)) -> global_load_tr_b128 +// <4 x half> @llvm.amdgcn.global.load.tr.v4f16(ptr addrspace(1)) -> global_load_tr_b128 -def int_amdgcn_global_load_tr_b64 : AMDGPUGlobalLoadTr<llvm_any_ty>; -def int_amdgcn_global_load_tr_b128 : AMDGPUGlobalLoadTr<llvm_any_ty>; +def int_amdgcn_global_load_tr : AMDGPUGlobalLoadTr<llvm_any_ty>; //===----------------------------------------------------------------------===// // Deep learning intrinsics. diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 0cfab44a7a0354..410dd352a8459a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -4837,8 +4837,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_global_atomic_fadd_v2bf16: case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: case Intrinsic::amdgcn_global_atomic_ordered_add_b64: - case Intrinsic::amdgcn_global_load_tr_b64: - case Intrinsic::amdgcn_global_load_tr_b128: + case Intrinsic::amdgcn_global_load_tr: return getDefaultMappingAllVGPR(MI); case Intrinsic::amdgcn_ds_ordered_add: case Intrinsic::amdgcn_ds_ordered_swap: diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index 47c3d806e487e3..c6df1c4132f940 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -1572,14 +1572,14 @@ let OtherPredicates = [isGFX12Plus] in { defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_ORDERED_ADD_B64", "int_amdgcn_global_atomic_ordered_add_b64", i64, i64, /* isIntr 
*/ 1>; let WaveSizePredicate = isWave32 in { - defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B64_w32, int_amdgcn_global_load_tr_b64, v2i32>; - defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w32, int_amdgcn_global_load_tr_b128, v8i16>; - defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w32, int_amdgcn_global_load_tr_b128, v8f16>; + defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B64_w32, int_amdgcn_global_load_tr, v2i32>; + defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w32, int_amdgcn_global_load_tr, v8i16>; + defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w32, int_amdgcn_global_load_tr, v8f16>; } let WaveSizePredicate = isWave64 in { - defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B64_w64, int_amdgcn_global_load_tr_b64, i32>; - defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w64, int_amdgcn_global_load_tr_b128, v4i16>; - defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w64, int_amdgcn_global_load_tr_b128, v4f16>; + defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B64_w64, int_amdgcn_global_load_tr, i32>; + defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w64, int_amdgcn_global_load_tr, v4i16>; + defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w64, int_amdgcn_global_load_tr, v4f16>; } } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll index 89a9138d4d2c62..398c84b9f79aec 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll @@ -2,9 +2,9 @@ ; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12-SDAG-W32 %s ; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12-GISEL-W32 %s -declare <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32.p1(ptr addrspace(1)) -declare <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16.p1(ptr addrspace(1)) 
-declare <8 x half> @llvm.amdgcn.global.load.tr.b128.v8f16.p1(ptr addrspace(1)) +declare <2 x i32> @llvm.amdgcn.global.load.tr.v2i32.p1(ptr addrspace(1)) +declare <8 x i16> @llvm.amdgcn.global.load.tr.v8i16.p1(ptr addrspace(1)) +declare <8 x half> @llvm.amdgcn.global.load.tr.v8f16.p1(ptr addrspace(1)) define amdgpu_kernel void @global_load_tr_b64(ptr addrspace(1) %addr, ptr addrspace(1) %use) { ; GFX12-SDAG-W32-LABEL: global_load_tr_b64: @@ -34,7 +34,7 @@ define amdgpu_kernel void @global_load_tr_b64(ptr addrspace(1) %addr, ptr addrsp ; GFX12-GISEL-W32-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 - %val = call <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32.p1(ptr addrspace(1) %gep) + %val = call <2 x i32> @llvm.amdgcn.global.load.tr.v2i32.p1(ptr addrspace(1) %gep) store <2 x i32> %val, ptr addrspace(1) %use ret void } @@ -67,7 +67,7 @@ define amdgpu_kernel void @global_load_tr_b128_i16(ptr addrspace(1) %addr, ptr a ; GFX12-GISEL-W32-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 - %val = call <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16.p1(ptr addrspace(1) %gep) + %val = call <8 x i16> @llvm.amdgcn.global.load.tr.v8i16.p1(ptr addrspace(1) %gep) store <8 x i16> %val, ptr addrspace(1) %use ret void } @@ -100,7 +100,7 @@ define amdgpu_kernel void @global_load_tr_b128_half(ptr addrspace(1) %addr, ptr ; GFX12-GISEL-W32-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 - %val = call <8 x half> @llvm.amdgcn.global.load.tr.b128.v8f16.p1(ptr addrspace(1) %gep) + %val = call <8 x half> @llvm.amdgcn.global.load.tr.v8f16.p1(ptr addrspace(1) %gep) store <8 x half> %val, ptr addrspace(1) %use ret void } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll index 73dc4fa506c7ba..04151cd2db2e0b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll +++ 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll @@ -2,9 +2,9 @@ ; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=-wavefrontsize32,+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12-SDAG-W64 %s ; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=-wavefrontsize32,+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12-GISEL-W64 %s -declare i32 @llvm.amdgcn.global.load.tr.b64.i32.p1(ptr addrspace(1)) -declare <4 x i16> @llvm.amdgcn.global.load.tr.b128.v4i16.p1(ptr addrspace(1)) -declare <4 x half> @llvm.amdgcn.global.load.tr.b128.v4f16.p1(ptr addrspace(1)) +declare i32 @llvm.amdgcn.global.load.tr.i32.p1(ptr addrspace(1)) +declare <4 x i16> @llvm.amdgcn.global.load.tr.v4i16.p1(ptr addrspace(1)) +declare <4 x half> @llvm.amdgcn.global.load.tr.v4f16.p1(ptr addrspace(1)) define amdgpu_kernel void @global_load_tr_b64(ptr addrspace(1) %addr, ptr addrspace(1) %use) { ; GFX12-SDAG-W64-LABEL: global_load_tr_b64: @@ -34,7 +34,7 @@ define amdgpu_kernel void @global_load_tr_b64(ptr addrspace(1) %addr, ptr addrsp ; GFX12-GISEL-W64-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 - %val = call i32 @llvm.amdgcn.global.load.tr.b64.i32.p1(ptr addrspace(1) %gep) + %val = call i32 @llvm.amdgcn.global.load.tr.i32.p1(ptr addrspace(1) %gep) store i32 %val, ptr addrspace(1) %use ret void } @@ -67,7 +67,7 @@ define amdgpu_kernel void @global_load_tr_b128_i16(ptr addrspace(1) %addr, ptr a ; GFX12-GISEL-W64-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 - %val = call <4 x i16> @llvm.amdgcn.global.load.tr.b128.v4i16.p1(ptr addrspace(1) %gep) + %val = call <4 x i16> @llvm.amdgcn.global.load.tr.v4i16.p1(ptr addrspace(1) %gep) store <4 x i16> %val, ptr addrspace(1) %use ret void } @@ -100,7 +100,7 @@ define amdgpu_kernel void @global_load_tr_b128_half(ptr addrspace(1) %addr, ptr ; GFX12-GISEL-W64-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr 
addrspace(1) %addr, i32 4 - %val = call <4 x half> @llvm.amdgcn.global.load.tr.b128.v4f16.p1(ptr addrspace(1) %gep) + %val = call <4 x half> @llvm.amdgcn.global.load.tr.v4f16.p1(ptr addrspace(1) %gep) store <4 x half> %val, ptr addrspace(1) %use ret void } >From 4659b3c2d07b1423a155c207fc237edd1c4e4934 Mon Sep 17 00:00:00 2001 From: Piotr Sobczak <piotr.sobc...@amd.com> Date: Fri, 12 Jan 2024 13:58:38 +0100 Subject: [PATCH 3/3] Appease clang formatter --- clang/lib/CodeGen/CGBuiltin.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index f9794ebd6be33b..16b57a6ba51b01 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -18213,7 +18213,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, } llvm::Value *Addr = EmitScalarExpr(E->getArg(0)); - llvm::Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_global_load_tr, {ArgTy}); + llvm::Function *F = + CGM.getIntrinsic(Intrinsic::amdgcn_global_load_tr, {ArgTy}); return Builder.CreateCall(F, {Addr}); } case AMDGPU::BI__builtin_amdgcn_read_exec: _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits