https://github.com/mariusz-sikora-at-amd created https://github.com/llvm/llvm-project/pull/75917
- image_atomic_pk_add_f16 - image_atomic_pk_add_bf16 - ds_pk_add_bf16 - ds_pk_add_f16 - ds_pk_add_rtn_bf16 - ds_pk_add_rtn_f16 - flat_atomic_pk_add_f16 - flat_atomic_pk_add_bf16 - global_atomic_pk_add_f16 - global_atomic_pk_add_bf16 - buffer_atomic_pk_add_f16 - buffer_atomic_pk_add_bf16 >From f0920d06a57b3bc77b50baf94c4616be597e74c3 Mon Sep 17 00:00:00 2001 From: Mariusz Sikora <mariusz.sik...@amd.com> Date: Mon, 18 Dec 2023 20:08:18 +0100 Subject: [PATCH] [AMDGPU][GFX12] Add 16 bit atomic fadd instructions - image_atomic_pk_add_f16 - image_atomic_pk_add_bf16 - ds_pk_add_bf16 - ds_pk_add_f16 - ds_pk_add_rtn_bf16 - ds_pk_add_rtn_f16 - flat_atomic_pk_add_f16 - flat_atomic_pk_add_bf16 - global_atomic_pk_add_f16 - global_atomic_pk_add_bf16 - buffer_atomic_pk_add_f16 - buffer_atomic_pk_add_bf16 --- clang/test/CodeGenOpenCL/amdgpu-features.cl | 4 +- .../builtins-fp-atomics-gfx12.cl | 92 ++++ llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 45 ++ llvm/lib/Target/AMDGPU/AMDGPU.td | 4 + llvm/lib/Target/AMDGPU/AMDGPUGISel.td | 1 + llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 1 + llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 1 + .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 26 +- .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 2 + .../Target/AMDGPU/AMDGPUSearchableTables.td | 4 + llvm/lib/Target/AMDGPU/BUFInstructions.td | 21 + llvm/lib/Target/AMDGPU/DSInstructions.td | 12 +- llvm/lib/Target/AMDGPU/FLATInstructions.td | 4 + llvm/lib/Target/AMDGPU/MIMGInstructions.td | 2 + llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 35 +- llvm/lib/Target/AMDGPU/SIInstrInfo.td | 1 + llvm/lib/Target/AMDGPU/SIInstructions.td | 1 + llvm/lib/TargetParser/TargetParser.cpp | 4 + .../test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll | 433 ++++++++++++++++++ .../AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll | 60 +++ llvm/test/MC/AMDGPU/gfx11_unsupported.s | 18 + llvm/test/MC/AMDGPU/gfx12_asm_ds.s | 75 +++ llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mubuf.s | 132 ++++++ llvm/test/MC/AMDGPU/gfx12_asm_vflat.s | 60 +++ 
llvm/test/MC/AMDGPU/gfx12_asm_vimage.s | 54 +++ .../MC/Disassembler/AMDGPU/gfx12_dasm_ds.txt | 60 +++ .../AMDGPU/gfx12_dasm_vbuffer_mubuf.txt | 84 ++++ .../Disassembler/AMDGPU/gfx12_dasm_vflat.txt | 60 +++ .../Disassembler/AMDGPU/gfx12_dasm_vimage.txt | 54 +++ 29 files changed, 1329 insertions(+), 21 deletions(-) create mode 100644 clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl create mode 100644 llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll diff --git a/clang/test/CodeGenOpenCL/amdgpu-features.cl b/clang/test/CodeGenOpenCL/amdgpu-features.cl index 8959634572b44e..fe1798406967e8 100644 --- a/clang/test/CodeGenOpenCL/amdgpu-features.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-features.cl @@ -100,8 +100,8 @@ // GFX1103: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" // GFX1150: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" // GFX1151: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" -// GFX1200: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" -// GFX1201: 
"target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" +// GFX1200: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" +// GFX1201: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" // GFX1103-W64: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize64" diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl new file mode 100644 index 00000000000000..20ff12c3376370 --- /dev/null +++ b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl @@ -0,0 +1,92 @@ +// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx1200 \ +// RUN: %s -S -emit-llvm -o - | FileCheck %s + +// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx1200 \ +// RUN: -S -o - %s | FileCheck -check-prefix=GFX12 %s + +// REQUIRES: amdgpu-registered-target + +typedef half __attribute__((ext_vector_type(2))) half2; +typedef short __attribute__((ext_vector_type(2))) short2; + +// CHECK-LABEL: test_local_add_2bf16 +// CHECK: call <2 
x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %{{.*}}, <2 x i16> % +// GFX12-LABEL: test_local_add_2bf16 +// GFX12: ds_pk_add_rtn_bf16 +short2 test_local_add_2bf16(__local short2 *addr, short2 x) { + return __builtin_amdgcn_ds_atomic_fadd_v2bf16(addr, x); +} + +// CHECK-LABEL: test_local_add_2bf16_noret +// CHECK: call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %{{.*}}, <2 x i16> % +// GFX12-LABEL: test_local_add_2bf16_noret +// GFX12: ds_pk_add_bf16 +void test_local_add_2bf16_noret(__local short2 *addr, short2 x) { + __builtin_amdgcn_ds_atomic_fadd_v2bf16(addr, x); +} + +// CHECK-LABEL: test_local_add_2f16 +// CHECK: call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %{{.*}}, <2 x half> % +// GFX12-LABEL: test_local_add_2f16 +// GFX12: ds_pk_add_rtn_f16 +half2 test_local_add_2f16(__local half2 *addr, half2 x) { + return __builtin_amdgcn_ds_atomic_fadd_v2f16(addr, x); +} + +// CHECK-LABEL: test_local_add_2f16_noret +// CHECK: call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %{{.*}}, <2 x half> % +// GFX12-LABEL: test_local_add_2f16_noret +// GFX12: ds_pk_add_f16 +void test_local_add_2f16_noret(__local half2 *addr, half2 x) { + __builtin_amdgcn_ds_atomic_fadd_v2f16(addr, x); +} + +// CHECK-LABEL: test_flat_add_2f16 +// CHECK: call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %{{.*}}, <2 x half> %{{.*}}) +// GFX12-LABEL: test_flat_add_2f16 +// GFX12: flat_atomic_pk_add_f16 +half2 test_flat_add_2f16(__generic half2 *addr, half2 x) { + return __builtin_amdgcn_flat_atomic_fadd_v2f16(addr, x); +} + +// CHECK-LABEL: test_flat_add_2bf16 +// CHECK: call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %{{.*}}, <2 x i16> %{{.*}}) +// GFX12-LABEL: test_flat_add_2bf16 +// GFX12: flat_atomic_pk_add_bf16 +short2 test_flat_add_2bf16(__generic short2 *addr, short2 x) { + return __builtin_amdgcn_flat_atomic_fadd_v2bf16(addr, x); +} + +// CHECK-LABEL: test_global_add_half2 +// CHECK: call <2 x half> 
@llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %{{.*}}, <2 x half> %{{.*}}) +// GFX12-LABEL: test_global_add_half2 +// GFX12: global_atomic_pk_add_f16 v2, v[0:1], v2, off th:TH_ATOMIC_RETURN +void test_global_add_half2(__global half2 *addr, half2 x) { + half2 *rtn; + *rtn = __builtin_amdgcn_global_atomic_fadd_v2f16(addr, x); +} + +// CHECK-LABEL: test_global_add_half2_noret +// CHECK: call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %{{.*}}, <2 x half> %{{.*}}) +// GFX12-LABEL: test_global_add_half2_noret +// GFX12: global_atomic_pk_add_f16 v[0:1], v2, off +void test_global_add_half2_noret(__global half2 *addr, half2 x) { + __builtin_amdgcn_global_atomic_fadd_v2f16(addr, x); +} + +// CHECK-LABEL: test_global_add_2bf16 +// CHECK: call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %{{.*}}, <2 x i16> %{{.*}}) +// GFX12-LABEL: test_global_add_2bf16 +// GFX12: global_atomic_pk_add_bf16 v2, v[0:1], v2, off th:TH_ATOMIC_RETURN +void test_global_add_2bf16(__global short2 *addr, short2 x) { + short2 *rtn; + *rtn = __builtin_amdgcn_global_atomic_fadd_v2bf16(addr, x); +} + +// CHECK-LABEL: test_global_add_2bf16_noret +// CHECK: call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %{{.*}}, <2 x i16> %{{.*}}) +// GFX12-LABEL: test_global_add_2bf16_noret +// GFX12: global_atomic_pk_add_bf16 v[0:1], v2, off +void test_global_add_2bf16_noret(__global short2 *addr, short2 x) { + __builtin_amdgcn_global_atomic_fadd_v2bf16(addr, x); +} diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 51bd9b63c127ed..bea39743525b23 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -1026,6 +1026,9 @@ defset list<AMDGPUImageDimIntrinsic> AMDGPUImageDimAtomicIntrinsics = { defm int_amdgcn_image_atomic_cmpswap : AMDGPUImageDimAtomicX<"ATOMIC_CMPSWAP", [AMDGPUArg<LLVMMatchType<0>, "src">, 
AMDGPUArg<LLVMMatchType<0>, "cmp">]>; + + defm int_amdgcn_image_atomic_pk_add_f16 : AMDGPUImageDimFloatAtomic<"ATOMIC_PK_ADD_F16">; + defm int_amdgcn_image_atomic_pk_add_bf16 : AMDGPUImageDimAtomic<"ATOMIC_PK_ADD_BF16">; } ////////////////////////////////////////////////////////////////////////// @@ -1294,6 +1297,26 @@ def int_amdgcn_raw_ptr_buffer_atomic_cmpswap : Intrinsic< // gfx908 intrinsic def int_amdgcn_raw_buffer_atomic_fadd : AMDGPURawBufferAtomic<llvm_anyfloat_ty>; def int_amdgcn_raw_ptr_buffer_atomic_fadd : AMDGPURawPtrBufferAtomic<llvm_anyfloat_ty>; +// gfx12+ intrinsic +def int_amdgcn_raw_buffer_atomic_fadd_v2bf16 : Intrinsic < + [llvm_v2i16_ty], + [llvm_v2i16_ty, + llvm_v4i32_ty, + llvm_i32_ty, + llvm_i32_ty, + llvm_i32_ty], + [ImmArg<ArgIndex<4>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, + AMDGPURsrcIntrinsic<1, 0>; +def int_amdgcn_raw_ptr_buffer_atomic_fadd_v2bf16 : Intrinsic < + [llvm_v2i16_ty], + [llvm_v2i16_ty, + AMDGPUBufferRsrcTy, + llvm_i32_ty, + llvm_i32_ty, + llvm_i32_ty], + [IntrArgMemOnly, NoCapture<ArgIndex<1>>, + ImmArg<ArgIndex<4>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, + AMDGPURsrcIntrinsic<1, 0>; class AMDGPUStructBufferAtomic<LLVMType data_ty = llvm_any_ty> : Intrinsic < [data_ty], @@ -1368,6 +1391,28 @@ def int_amdgcn_struct_ptr_buffer_atomic_cmpswap : Intrinsic< // gfx908 intrinsic def int_amdgcn_struct_buffer_atomic_fadd : AMDGPUStructBufferAtomic<llvm_anyfloat_ty>; def int_amdgcn_struct_ptr_buffer_atomic_fadd : AMDGPUStructPtrBufferAtomic<llvm_anyfloat_ty>; +// gfx12 intrinsic +def int_amdgcn_struct_buffer_atomic_fadd_v2bf16 : Intrinsic < + [llvm_v2i16_ty], + [llvm_v2i16_ty, + llvm_v4i32_ty, + llvm_i32_ty, + llvm_i32_ty, + llvm_i32_ty, + llvm_i32_ty], + [ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, + AMDGPURsrcIntrinsic<1, 0>; +def int_amdgcn_struct_ptr_buffer_atomic_fadd_v2bf16 : Intrinsic < + [llvm_v2i16_ty], + [llvm_v2i16_ty, + 
AMDGPUBufferRsrcTy, + llvm_i32_ty, + llvm_i32_ty, + llvm_i32_ty, + llvm_i32_ty], + [IntrArgMemOnly, NoCapture<ArgIndex<1>>, + ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, + AMDGPURsrcIntrinsic<1, 0>; // gfx90a intrinsics def int_amdgcn_struct_buffer_atomic_fmin : AMDGPUStructBufferAtomic<llvm_anyfloat_ty>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 060fb66d38f7bc..14b2155938eef6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1473,6 +1473,10 @@ def FeatureISAVersion12 : FeatureSet< FeatureArchitectedFlatScratch, FeatureAtomicFaddRtnInsts, FeatureAtomicFaddNoRtnInsts, + FeatureAtomicDsPkAdd16Insts, + FeatureAtomicFlatPkAdd16Insts, + FeatureAtomicBufferGlobalPkAddF16Insts, + FeatureAtomicGlobalPkAddBF16Inst, FeatureFlatAtomicFaddF32Inst, FeatureImageInsts, FeatureExtendedImageInsts, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td index 2b85024a9b40be..f426b7f38428da 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -261,6 +261,7 @@ def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_XOR, SIbuffer_atomic_xor>; def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_INC, SIbuffer_atomic_inc>; def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_DEC, SIbuffer_atomic_dec>; def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FADD, SIbuffer_atomic_fadd>; +def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FADD_BF16, SIbuffer_atomic_fadd_bf16>; def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FMIN, SIbuffer_atomic_fmin>; def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FMAX, SIbuffer_atomic_fmax>; def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_CMPSWAP, SIbuffer_atomic_cmpswap>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 156a264a7c1faa..8205bdac4e2c5d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ 
-5425,6 +5425,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP) NODE_NAME_CASE(BUFFER_ATOMIC_CSUB) NODE_NAME_CASE(BUFFER_ATOMIC_FADD) + NODE_NAME_CASE(BUFFER_ATOMIC_FADD_BF16) NODE_NAME_CASE(BUFFER_ATOMIC_FMIN) NODE_NAME_CASE(BUFFER_ATOMIC_FMAX) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index 827fb106b55199..8d972c46447b8b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -587,6 +587,7 @@ enum NodeType : unsigned { BUFFER_ATOMIC_CMPSWAP, BUFFER_ATOMIC_CSUB, BUFFER_ATOMIC_FADD, + BUFFER_ATOMIC_FADD_BF16, BUFFER_ATOMIC_FMIN, BUFFER_ATOMIC_FMAX, diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index fbee2888945185..32964e5c2e3ef1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -5872,6 +5872,9 @@ static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { case Intrinsic::amdgcn_struct_buffer_atomic_fadd: case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd: return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; + case Intrinsic::amdgcn_raw_buffer_atomic_fadd_v2bf16: + case Intrinsic::amdgcn_struct_buffer_atomic_fadd_v2bf16: + return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD_BF16; case Intrinsic::amdgcn_raw_buffer_atomic_fmin: case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin: case Intrinsic::amdgcn_struct_buffer_atomic_fmin: @@ -6079,6 +6082,10 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic( Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg(); LLT Ty = MRI->getType(VData); + const bool IsAtomicPacked16Bit = + (BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 || + BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16); + // Check for 16 bit addresses and pack if true. 
LLT GradTy = MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg()); @@ -6087,7 +6094,7 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic( const bool IsG16 = ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16; const bool IsA16 = AddrTy == S16; - const bool IsD16 = Ty.getScalarType() == S16; + const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType() == S16; int DMaskLanes = 0; if (!BaseOpcode->Atomic) { @@ -6129,7 +6136,7 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic( LLT Ty = MRI->getType(VData0); // TODO: Allow atomic swap and bit ops for v2s16/v4s16 - if (Ty.isVector()) + if (Ty.isVector() && !IsAtomicPacked16Bit) return false; if (BaseOpcode->AtomicX2) { @@ -6265,9 +6272,18 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic( if (NumElts > 4 || DMaskLanes > 4) return false; + // Image atomic instructions use DMask to specify the width of their + // input/output data: 32 bits (s32, v2s16) or 64 bits (s64, v4s16). + // DMaskLanes for image atomics defaults to 0. + // We must make sure that atomic variants (especially packed ones) are not + // truncated from v2s16 or v4s16 to s16. + // + // changeElementCount is still needed for image loads, where Ty is always scalar. const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes; const LLT AdjustedTy = - Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts)); + DMaskLanes == 0 + ? Ty + : Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts)); // The raw dword aligned data component of the load.
The only legal cases // where this matters should be when using the packed D16 format, for @@ -7069,6 +7085,10 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd: case Intrinsic::amdgcn_struct_buffer_atomic_fadd: case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd: + case Intrinsic::amdgcn_raw_buffer_atomic_fadd_v2bf16: + case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd_v2bf16: + case Intrinsic::amdgcn_struct_buffer_atomic_fadd_v2bf16: + case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd_v2bf16: return legalizeBufferAtomic(MI, B, IntrID); case Intrinsic::trap: return legalizeTrapIntrinsic(MI, MRI, B); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index c9412f720c62ec..d8d19f65190bb4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -2920,6 +2920,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl( return; } case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: + case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD_BF16: case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN: case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: { applyDefaultMapping(OpdMapper); @@ -4200,6 +4201,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC: case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: + case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD_BF16: case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN: case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: { // vdata_out diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td index beb670669581f1..f4415aaa6b1ff9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td @@ -278,6 +278,7 @@ def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_xor>; def : 
SourceOfDivergence<int_amdgcn_raw_buffer_atomic_inc>; def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_dec>; def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fadd>; +def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fadd_v2bf16>; def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fmin>; def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fmax>; def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_cmpswap>; @@ -294,6 +295,7 @@ def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_xor>; def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_inc>; def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_dec>; def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_fadd>; +def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_fadd_v2bf16>; def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_fmin>; def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_fmax>; def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_cmpswap>; @@ -310,6 +312,7 @@ def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_xor>; def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_inc>; def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_dec>; def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fadd>; +def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fadd_v2bf16>; def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fmin>; def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fmax>; def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_cmpswap>; @@ -326,6 +329,7 @@ def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_xor>; def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_inc>; def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_dec>; def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_fadd>; +def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_fadd_v2bf16>; def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_fmin>; def : 
SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_fmax>; def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_cmpswap>; diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td index 43d35fa5291ca0..2a6ad3b0d4a25b 100644 --- a/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -1245,6 +1245,13 @@ defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_RTN < "buffer_atomic_pk_add_f16", VGPR_32, v2f16, null_frag >; +let SubtargetPredicate = isGFX12Plus in { +let FPAtomic = 1 in +defm BUFFER_ATOMIC_PK_ADD_BF16 : MUBUF_Pseudo_Atomics < + "buffer_atomic_pk_add_bf16", VGPR_32, v2i16 +>; +} + //===----------------------------------------------------------------------===// // MTBUF Instructions //===----------------------------------------------------------------------===// @@ -1708,6 +1715,10 @@ defm : SIBufferAtomicPat<"SIbuffer_atomic_dec", i64, "BUFFER_ATOMIC_DEC_X2">; let SubtargetPredicate = HasAtomicCSubNoRtnInsts in defm : SIBufferAtomicPat<"SIbuffer_atomic_csub", i32, "BUFFER_ATOMIC_CSUB", ["noret"]>; +let SubtargetPredicate = isGFX12Plus in { + defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_fadd_bf16", v2i16, "BUFFER_ATOMIC_PK_ADD_BF16_VBUFFER">; +} + let SubtargetPredicate = isGFX6GFX7GFX10Plus in { defm : SIBufferAtomicPat<"SIbuffer_atomic_fmin", f32, "BUFFER_ATOMIC_FMIN">; defm : SIBufferAtomicPat<"SIbuffer_atomic_fmax", f32, "BUFFER_ATOMIC_FMAX">; @@ -1772,14 +1783,22 @@ let OtherPredicates = [HasAtomicFaddNoRtnInsts] in defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", f32, "BUFFER_ATOMIC_ADD_F32", ["noret"]>; let OtherPredicates = [HasAtomicBufferGlobalPkAddF16NoRtnInsts] in { + let SubtargetPredicate = isGFX9Only in defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_fadd", v2f16, "BUFFER_ATOMIC_PK_ADD_F16", ["noret"]>; + + let SubtargetPredicate = isGFX12Plus in + defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_fadd", v2f16, "BUFFER_ATOMIC_PK_ADD_F16_VBUFFER", 
["noret"]>; } // End OtherPredicates = [HasAtomicBufferGlobalPkAddF16NoRtnInsts] let OtherPredicates = [HasAtomicFaddRtnInsts] in defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", f32, "BUFFER_ATOMIC_ADD_F32", ["ret"]>; let OtherPredicates = [HasAtomicBufferGlobalPkAddF16Insts] in { + let SubtargetPredicate = isGFX9Only in defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_fadd", v2f16, "BUFFER_ATOMIC_PK_ADD_F16", ["ret"]>; + + let SubtargetPredicate = isGFX12Plus in + defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_fadd", v2f16, "BUFFER_ATOMIC_PK_ADD_F16_VBUFFER", ["ret"]>; } // End OtherPredicates = [HasAtomicBufferGlobalPkAddF16Insts] let OtherPredicates = [isGFX90APlus] in { @@ -2634,6 +2653,8 @@ defm BUFFER_ATOMIC_SWAP : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x033, defm BUFFER_ATOMIC_SWAP_X2 : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x041, "buffer_atomic_swap_b64">; defm BUFFER_ATOMIC_XOR : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x03E, "buffer_atomic_xor_b32">; defm BUFFER_ATOMIC_XOR_X2 : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x04B, "buffer_atomic_xor_b64">; +defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Real_Atomic_gfx12<0x059>; +defm BUFFER_ATOMIC_PK_ADD_BF16 : MUBUF_Real_Atomic_gfx12<0x05a>; //===----------------------------------------------------------------------===// // MUBUF - GFX10. 
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td index 3a895923fa4b98..3f208aaa453df6 100644 --- a/llvm/lib/Target/AMDGPU/DSInstructions.td +++ b/llvm/lib/Target/AMDGPU/DSInstructions.td @@ -487,10 +487,10 @@ let SubtargetPredicate = isGFX90APlus in { } // End SubtargetPredicate = isGFX90APlus let SubtargetPredicate = HasAtomicDsPkAdd16Insts in { - defm DS_PK_ADD_F16 : DS_1A1D_NORET_mc_gfx9<"ds_pk_add_f16">; - defm DS_PK_ADD_RTN_F16 : DS_1A1D_RET_mc_gfx9<"ds_pk_add_rtn_f16", VGPR_32, "ds_pk_add_f16">; - defm DS_PK_ADD_BF16 : DS_1A1D_NORET_mc_gfx9<"ds_pk_add_bf16">; - defm DS_PK_ADD_RTN_BF16 : DS_1A1D_RET_mc_gfx9<"ds_pk_add_rtn_bf16", VGPR_32, "ds_pk_add_bf16">; + defm DS_PK_ADD_F16 : DS_1A1D_NORET_mc<"ds_pk_add_f16">; + defm DS_PK_ADD_RTN_F16 : DS_1A1D_RET_mc<"ds_pk_add_rtn_f16", VGPR_32, "ds_pk_add_f16">; + defm DS_PK_ADD_BF16 : DS_1A1D_NORET_mc<"ds_pk_add_bf16">; + defm DS_PK_ADD_RTN_BF16 : DS_1A1D_RET_mc<"ds_pk_add_rtn_bf16", VGPR_32, "ds_pk_add_bf16">; } // End SubtargetPredicate = HasAtomicDsPkAdd16Insts defm DS_CMPSTORE_B32 : DS_1A2D_NORET_mc<"ds_cmpstore_b32">; @@ -1236,6 +1236,10 @@ defm DS_MIN_NUM_RTN_F64 : DS_Real_Renamed_gfx12<0x072, DS_MIN_RTN_F64, "ds_min defm DS_MAX_NUM_RTN_F64 : DS_Real_Renamed_gfx12<0x073, DS_MAX_RTN_F64, "ds_max_num_rtn_f64">; defm DS_SUB_CLAMP_U32 : DS_Real_gfx12<0x099>; defm DS_SUB_CLAMP_RTN_U32 : DS_Real_gfx12<0x0a9>; +defm DS_PK_ADD_F16 : DS_Real_gfx12<0x09a>; +defm DS_PK_ADD_RTN_F16 : DS_Real_gfx12<0x0aa>; +defm DS_PK_ADD_BF16 : DS_Real_gfx12<0x09b>; +defm DS_PK_ADD_RTN_BF16 : DS_Real_gfx12<0x0ab>; //===----------------------------------------------------------------------===// // GFX11. 
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index 0dd2b3f5c2c912..d959282f6a5a97 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -2597,6 +2597,8 @@ defm FLAT_ATOMIC_DEC_U64 : VFLAT_Real_Atomics_gfx12<0x04d, "FLAT_ATOMI defm FLAT_ATOMIC_MIN_NUM_F32 : VFLAT_Real_Atomics_gfx12<0x051, "FLAT_ATOMIC_FMIN", "flat_atomic_min_num_f32", true, "flat_atomic_min_f32">; defm FLAT_ATOMIC_MAX_NUM_F32 : VFLAT_Real_Atomics_gfx12<0x052, "FLAT_ATOMIC_FMAX", "flat_atomic_max_num_f32", true, "flat_atomic_max_f32">; defm FLAT_ATOMIC_ADD_F32 : VFLAT_Real_Atomics_gfx12<0x056, "FLAT_ATOMIC_ADD_F32", "flat_atomic_add_f32">; +defm FLAT_ATOMIC_PK_ADD_F16 : VFLAT_Real_Atomics_gfx12<0x059, "FLAT_ATOMIC_PK_ADD_F16", "flat_atomic_pk_add_f16">; +defm FLAT_ATOMIC_PK_ADD_BF16 : VFLAT_Real_Atomics_gfx12<0x05a, "FLAT_ATOMIC_PK_ADD_BF16", "flat_atomic_pk_add_bf16">; // ENC_VGLOBAL. defm GLOBAL_LOAD_U8 : VGLOBAL_Real_AllAddr_gfx12<0x010, "GLOBAL_LOAD_UBYTE", "global_load_u8", true>; @@ -2654,6 +2656,8 @@ defm GLOBAL_ATOMIC_DEC_U64 : VGLOBAL_Real_Atomics_gfx12<0x04d, "GLOBAL_A defm GLOBAL_ATOMIC_MIN_NUM_F32 : VGLOBAL_Real_Atomics_gfx12<0x051, "GLOBAL_ATOMIC_FMIN", "global_atomic_min_num_f32", true, "global_atomic_min_f32">; defm GLOBAL_ATOMIC_MAX_NUM_F32 : VGLOBAL_Real_Atomics_gfx12<0x052, "GLOBAL_ATOMIC_FMAX", "global_atomic_max_num_f32", true, "global_atomic_max_f32">; defm GLOBAL_ATOMIC_ADD_F32 : VGLOBAL_Real_Atomics_gfx12<0x056, "GLOBAL_ATOMIC_ADD_F32", "global_atomic_add_f32">; +defm GLOBAL_ATOMIC_PK_ADD_F16 : VGLOBAL_Real_Atomics_gfx12<0x059, "GLOBAL_ATOMIC_PK_ADD_F16", "global_atomic_pk_add_f16">; +defm GLOBAL_ATOMIC_PK_ADD_BF16 : VGLOBAL_Real_Atomics_gfx12<0x05a, "GLOBAL_ATOMIC_PK_ADD_BF16", "global_atomic_pk_add_bf16">; // ENC_VSCRATCH. 
defm SCRATCH_LOAD_U8 : VSCRATCH_Real_AllAddr_gfx12<0x10, "SCRATCH_LOAD_UBYTE", "scratch_load_u8", true>; diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td index 240366c8e7daae..28ba445b908d9e 100644 --- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td +++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td @@ -1553,6 +1553,8 @@ defm IMAGE_ATOMIC_DEC : MIMG_Atomic_Renamed <mimgopc<0x16, 0x16, 0x1c> defm IMAGE_ATOMIC_FCMPSWAP : MIMG_Atomic <mimgopc<MIMG.NOP, MIMG.NOP, 0x1d, MIMG.NOP>, "image_atomic_fcmpswap", 1, 1>; defm IMAGE_ATOMIC_FMIN : MIMG_Atomic <mimgopc<MIMG.NOP, MIMG.NOP, 0x1e, MIMG.NOP>, "image_atomic_fmin", 0, 1>; defm IMAGE_ATOMIC_FMAX : MIMG_Atomic <mimgopc<MIMG.NOP, MIMG.NOP, 0x1f, MIMG.NOP>, "image_atomic_fmax", 0, 1>; +defm IMAGE_ATOMIC_PK_ADD_F16 : MIMG_Atomic <mimgopc<0x86, MIMG.NOP, MIMG.NOP, MIMG.NOP, MIMG.NOP>, "image_atomic_pk_add_f16", 0, 1>; +defm IMAGE_ATOMIC_PK_ADD_BF16 : MIMG_Atomic <mimgopc<0x87, MIMG.NOP, MIMG.NOP, MIMG.NOP, MIMG.NOP>, "image_atomic_pk_add_bf16", 0, 1>; defm IMAGE_SAMPLE : MIMG_Sampler_WQM <mimgopc<0x1b, 0x1b, 0x20>, AMDGPUSample>; let OtherPredicates = [HasExtendedImageInsts] in { diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 4f4bc45e49b43e..b2b6940abe709a 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -7035,17 +7035,17 @@ static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, // Re-construct the required return value for a image load intrinsic. 
// This is more complicated due to the optional use TexFailCtrl which means the required // return type is an aggregate -static SDValue constructRetValue(SelectionDAG &DAG, - MachineSDNode *Result, - ArrayRef<EVT> ResultTypes, - bool IsTexFail, bool Unpacked, bool IsD16, - int DMaskPop, int NumVDataDwords, +static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, + ArrayRef<EVT> ResultTypes, bool IsTexFail, + bool Unpacked, bool IsD16, int DMaskPop, + int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL) { // Determine the required return type. This is the same regardless of IsTexFail flag EVT ReqRetVT = ResultTypes[0]; int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1; - int NumDataDwords = (!IsD16 || (IsD16 && Unpacked)) ? - ReqRetNumElts : (ReqRetNumElts + 1) / 2; + int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit) + ? (ReqRetNumElts + 1) / 2 + : ReqRetNumElts; int MaskPopDwords = (!IsD16 || (IsD16 && Unpacked)) ? DMaskPop : (DMaskPop + 1) / 2; @@ -7070,7 +7070,7 @@ static SDValue constructRetValue(SelectionDAG &DAG, } } - if (DataDwordVT.isVector()) + if (DataDwordVT.isVector() && !IsAtomicPacked16Bit) Data = padEltsToUndef(DAG, DL, DataDwordVT, Data, NumDataDwords - MaskPopDwords); @@ -7177,6 +7177,7 @@ SDValue SITargetLowering::lowerImage(SDValue Op, SDValue VData; int NumVDataDwords; bool AdjustRetType = false; + bool IsAtomicPacked16Bit = false; // Offset of intrinsic arguments const unsigned ArgOffset = WithChain ? 
2 : 1; @@ -7187,6 +7188,10 @@ SDValue SITargetLowering::lowerImage(SDValue Op, if (BaseOpcode->Atomic) { VData = Op.getOperand(2); + IsAtomicPacked16Bit = + (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 || + Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16); + bool Is64Bit = VData.getValueSizeInBits() == 64; if (BaseOpcode->AtomicX2) { SDValue VData2 = Op.getOperand(3); @@ -7516,10 +7521,9 @@ SDValue SITargetLowering::lowerImage(SDValue Op, } if (BaseOpcode->Store) return SDValue(NewNode, 0); - return constructRetValue(DAG, NewNode, - OrigResultTypes, IsTexFail, - Subtarget->hasUnpackedD16VMem(), IsD16, - DMaskLanes, NumVDataDwords, DL); + return constructRetValue(DAG, NewNode, OrigResultTypes, IsTexFail, + Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes, + NumVDataDwords, IsAtomicPacked16Bit, DL); } SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc, @@ -8383,9 +8387,15 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, case Intrinsic::amdgcn_raw_buffer_atomic_fadd: case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd: return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD); + case Intrinsic::amdgcn_raw_buffer_atomic_fadd_v2bf16: + return lowerRawBufferAtomicIntrin(Op, DAG, + AMDGPUISD::BUFFER_ATOMIC_FADD_BF16); case Intrinsic::amdgcn_struct_buffer_atomic_fadd: case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd: return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD); + case Intrinsic::amdgcn_struct_buffer_atomic_fadd_v2bf16: + return lowerStructBufferAtomicIntrin(Op, DAG, + AMDGPUISD::BUFFER_ATOMIC_FADD_BF16); case Intrinsic::amdgcn_raw_buffer_atomic_fmin: case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin: return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN); @@ -15277,6 +15287,7 @@ bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N, case AMDGPUISD::BUFFER_ATOMIC_CMPSWAP: case AMDGPUISD::BUFFER_ATOMIC_CSUB: case AMDGPUISD::BUFFER_ATOMIC_FADD: + 
case AMDGPUISD::BUFFER_ATOMIC_FADD_BF16: case AMDGPUISD::BUFFER_ATOMIC_FMIN: case AMDGPUISD::BUFFER_ATOMIC_FMAX: // Target-specific read-modify-write atomics are sources of divergence. diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 173c877b8d29ef..2e6c147805e066 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -195,6 +195,7 @@ defm SIbuffer_atomic_inc : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_INC">; defm SIbuffer_atomic_dec : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_DEC">; defm SIbuffer_atomic_csub : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_CSUB">; defm SIbuffer_atomic_fadd : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FADD">; +defm SIbuffer_atomic_fadd_bf16 : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FADD_BF16">; defm SIbuffer_atomic_fmin : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FMIN">; defm SIbuffer_atomic_fmax : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FMAX">; diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index f9bc623abcd04b..6dddb9217ca3a5 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -3707,6 +3707,7 @@ def G_AMDGPU_BUFFER_ATOMIC_XOR : BufferAtomicGenericInstruction; def G_AMDGPU_BUFFER_ATOMIC_INC : BufferAtomicGenericInstruction; def G_AMDGPU_BUFFER_ATOMIC_DEC : BufferAtomicGenericInstruction; def G_AMDGPU_BUFFER_ATOMIC_FADD : BufferAtomicGenericInstruction; +def G_AMDGPU_BUFFER_ATOMIC_FADD_BF16 : BufferAtomicGenericInstruction; def G_AMDGPU_BUFFER_ATOMIC_FMIN : BufferAtomicGenericInstruction; def G_AMDGPU_BUFFER_ATOMIC_FMAX : BufferAtomicGenericInstruction; diff --git a/llvm/lib/TargetParser/TargetParser.cpp b/llvm/lib/TargetParser/TargetParser.cpp index d741d2ce7942df..824e0979e22ffb 100644 --- a/llvm/lib/TargetParser/TargetParser.cpp +++ b/llvm/lib/TargetParser/TargetParser.cpp @@ -281,6 +281,10 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, 
const Triple &T, Features["dot9-insts"] = true; Features["dot10-insts"] = true; Features["dl-insts"] = true; + Features["atomic-ds-pk-add-16-insts"] = true; + Features["atomic-flat-pk-add-16-insts"] = true; + Features["atomic-buffer-global-pk-add-f16-insts"] = true; + Features["atomic-global-pk-add-bf16-inst"] = true; Features["16-bit-insts"] = true; Features["dpp"] = true; Features["gfx8-insts"] = true; diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll new file mode 100644 index 00000000000000..0b06013a7324e0 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll @@ -0,0 +1,433 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -march=amdgcn -mcpu=gfx1200 -global-isel=0 -verify-machineinstrs | FileCheck %s -check-prefix=GFX12 +; RUN: llc < %s -march=amdgcn -mcpu=gfx1200 -global-isel=1 -verify-machineinstrs | FileCheck %s -check-prefix=GFX12-GISEL + +declare <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32 immarg) +declare <2 x i16> @llvm.amdgcn.struct.buffer.atomic.fadd.v2bf16(<2 x i16>, <4 x i32>, i32, i32, i32, i32 immarg) +declare <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32) +declare <2 x i16> @llvm.amdgcn.raw.buffer.atomic.fadd.v2bf16(<2 x i16> %val, <4 x i32> %rsrc, i32, i32, i32) +declare <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data) +declare <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %ptr, <2 x i16> %data) +declare <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32, i32, i1) +declare <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data) +declare <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data) +declare <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %data) + 
+define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr, <2 x half> %data) { +; GFX12-LABEL: local_atomic_fadd_v2f16_noret: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: ds_pk_add_f16 v0, v1 +; GFX12-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: local_atomic_fadd_v2f16_noret: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-GISEL-NEXT: ds_pk_add_f16 v0, v1 +; GFX12-GISEL-NEXT: s_endpgm + %ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32 0, i32 0, i1 0) + ret void +} + +define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr, <2 x i16> %data) { +; GFX12-LABEL: local_atomic_fadd_v2bf16_noret: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: ds_pk_add_bf16 v0, v1 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: local_atomic_fadd_v2bf16_noret: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 +; GFX12-GISEL-NEXT: ds_pk_add_bf16 v1, v0 +; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: buffer_gl0_inv +; GFX12-GISEL-NEXT: buffer_gl1_inv +; GFX12-GISEL-NEXT: s_endpgm + %ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data) + ret void +} + +define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half> %data) { +; GFX12-LABEL: local_atomic_fadd_v2f16_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: ds_pk_add_rtn_f16 v0, v0, v1 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-GISEL-LABEL: local_atomic_fadd_v2f16_rtn: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: ds_pk_add_rtn_f16 v0, v0, v1 +; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] + %ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32 0, i32 0, i1 0) + ret <2 x half> %ret +} + +define <2 x i16> @local_atomic_fadd_v2bf16_rtn(ptr addrspace(3) %ptr, <2 x i16> %data) { +; GFX12-LABEL: local_atomic_fadd_v2bf16_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-NEXT: ds_pk_add_rtn_bf16 v0, v0, v1 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: buffer_gl0_inv +; GFX12-NEXT: buffer_gl1_inv +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-GISEL-LABEL: local_atomic_fadd_v2bf16_rtn: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX12-GISEL-NEXT: ds_pk_add_rtn_bf16 v0, v0, v1 +; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: buffer_gl0_inv +; GFX12-GISEL-NEXT: buffer_gl1_inv +; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] + %ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data) + ret <2 x i16> %ret +} + +define amdgpu_kernel void @flat_atomic_fadd_v2f16_noret(ptr %ptr, <2 x half> %data) { +; GFX12-LABEL: flat_atomic_fadd_v2f16_noret: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 +; GFX12-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: flat_atomic_fadd_v2f16_noret: +; GFX12-GISEL: ; %bb.0: +; 
GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-GISEL-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 +; GFX12-GISEL-NEXT: s_endpgm + %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data) + ret void +} + +define <2 x half> @flat_atomic_fadd_v2f16_rtn(ptr %ptr, <2 x half> %data) { +; GFX12-LABEL: flat_atomic_fadd_v2f16_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-GISEL-LABEL: flat_atomic_fadd_v2f16_rtn: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] + %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data) + ret <2 x half> %ret +} + +define amdgpu_kernel void @flat_atomic_fadd_v2bf16_noret(ptr %ptr, <2 x i16> %data) { +; GFX12-LABEL: flat_atomic_fadd_v2bf16_noret: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 +; GFX12-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: flat_atomic_fadd_v2bf16_noret: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-GISEL-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 +; GFX12-GISEL-NEXT: s_endpgm + %ret = call <2 x i16> 
@llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %data) + ret void +} + +define <2 x i16> @flat_atomic_fadd_v2bf16_rtn(ptr %ptr, <2 x i16> %data) { +; GFX12-LABEL: flat_atomic_fadd_v2bf16_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-GISEL-LABEL: flat_atomic_fadd_v2bf16_rtn: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] + %ret = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %data) + ret <2 x i16> %ret +} + +define amdgpu_kernel void @global_atomic_fadd_v2bf16_noret(ptr addrspace(1) %ptr, <2 x i16> %data) { +; GFX12-LABEL: global_atomic_fadd_v2bf16_noret: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v1, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: global_atomic_fadd_v2bf16_noret: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX12-GISEL-NEXT: global_atomic_pk_add_bf16 v1, v0, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %ret = call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %ptr, <2 x i16> %data) + ret void +} + +define <2 x i16> @global_atomic_fadd_v2bf16_rtn(ptr addrspace(1) %ptr, <2 x i16> %data) { +; GFX12-LABEL: 
global_atomic_fadd_v2bf16_rtn: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-GISEL-LABEL: global_atomic_fadd_v2bf16_rtn: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN +; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] + %ret = call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %ptr, <2 x i16> %data) + ret <2 x i16> %ret +} + +define void @global_atomic_pk_add_v2f16(ptr addrspace(1) %ptr, <2 x half> %data) { +; GFX12-LABEL: global_atomic_pk_add_v2f16: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-GISEL-LABEL: global_atomic_pk_add_v2f16: +; GFX12-GISEL: ; %bb.0: ; %main_body +; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off +; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] +main_body: + %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data) + ret void +} + +define <2 x half> @global_atomic_pk_add_v2f16_rtn(ptr addrspace(1) %ptr, <2 x half> %data) { +; GFX12-LABEL: global_atomic_pk_add_v2f16_rtn: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-GISEL-LABEL: global_atomic_pk_add_v2f16_rtn: +; GFX12-GISEL: ; %bb.0: ; %main_body +; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, 
off th:TH_ATOMIC_RETURN +; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] +main_body: + %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data) + ret <2 x half> %ret +} + +define amdgpu_ps void @raw_buffer_atomic_add_v2f16_noret_offset(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +; GFX12-LABEL: raw_buffer_atomic_add_v2f16_noret_offset: +; GFX12: ; %bb.0: +; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, off, s[0:3], s4 offset:92 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: raw_buffer_atomic_add_v2f16_noret_offset: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: buffer_atomic_pk_add_f16 v0, off, s[0:3], s4 offset:92 +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 92, i32 %soffset, i32 0) + ret void +} + +define amdgpu_ps void @raw_buffer_atomic_add_v2f16_noret(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +; GFX12-LABEL: raw_buffer_atomic_add_v2f16_noret: +; GFX12: ; %bb.0: +; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], s4 offen +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: raw_buffer_atomic_add_v2f16_noret: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], s4 offen +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + ret void +} + +define amdgpu_ps <2 x half> @raw_buffer_atomic_add_v2f16_ret_offset(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 
inreg %soffset) { +; GFX12-LABEL: raw_buffer_atomic_add_v2f16_ret_offset: +; GFX12: ; %bb.0: +; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, off, s[0:3], s4 offset:92 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: ; return to shader part epilog +; +; GFX12-GISEL-LABEL: raw_buffer_atomic_add_v2f16_ret_offset: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: buffer_atomic_pk_add_f16 v0, off, s[0:3], s4 offset:92 th:TH_ATOMIC_RETURN +; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: ; return to shader part epilog + %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 92, i32 %soffset, i32 0) + ret <2 x half> %ret +} + +define amdgpu_ps <2 x half> @raw_buffer_atomic_add_v2f16_ret(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +; GFX12-LABEL: raw_buffer_atomic_add_v2f16_ret: +; GFX12: ; %bb.0: +; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], s4 offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: ; return to shader part epilog +; +; GFX12-GISEL-LABEL: raw_buffer_atomic_add_v2f16_ret: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], s4 offen th:TH_ATOMIC_RETURN +; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: ; return to shader part epilog + %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + ret <2 x half> %ret +} + +define amdgpu_ps float @struct_buffer_atomic_add_v2f16_ret(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +; GFX12-LABEL: struct_buffer_atomic_add_v2f16_ret: +; GFX12: ; %bb.0: +; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v[1:2], s[0:3], s4 idxen offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: ; return to shader part epilog +; +; GFX12-GISEL-LABEL: struct_buffer_atomic_add_v2f16_ret: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: 
buffer_atomic_pk_add_f16 v0, v[1:2], s[0:3], s4 idxen offen th:TH_ATOMIC_RETURN +; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: ; return to shader part epilog + %orig = call <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + %r = bitcast <2 x half> %orig to float + ret float %r +} + +define amdgpu_ps void @struct_buffer_atomic_add_v2f16_noret(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +; GFX12-LABEL: struct_buffer_atomic_add_v2f16_noret: +; GFX12: ; %bb.0: +; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v[1:2], s[0:3], s4 idxen offen +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: struct_buffer_atomic_add_v2f16_noret: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: buffer_atomic_pk_add_f16 v0, v[1:2], s[0:3], s4 idxen offen +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %orig = call <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + ret void +} + +define amdgpu_ps float @struct_buffer_atomic_add_v2bf16_ret(<2 x i16> %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +; GFX12-LABEL: struct_buffer_atomic_add_v2bf16_ret: +; GFX12: ; %bb.0: +; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v[1:2], s[0:3], s4 idxen offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: ; return to shader part epilog +; +; GFX12-GISEL-LABEL: struct_buffer_atomic_add_v2bf16_ret: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: buffer_atomic_pk_add_bf16 v0, v[1:2], s[0:3], s4 idxen offen th:TH_ATOMIC_RETURN +; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: ; return to shader part epilog + %orig = call <2 x i16> @llvm.amdgcn.struct.buffer.atomic.fadd.v2bf16(<2 x i16> 
%val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + %r = bitcast <2 x i16> %orig to float + ret float %r +} + +define amdgpu_ps void @struct_buffer_atomic_add_v2bf16_noret(<2 x i16> %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +; GFX12-LABEL: struct_buffer_atomic_add_v2bf16_noret: +; GFX12: ; %bb.0: +; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v[1:2], s[0:3], s4 idxen offen +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: struct_buffer_atomic_add_v2bf16_noret: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: buffer_atomic_pk_add_bf16 v0, v[1:2], s[0:3], s4 idxen offen +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %orig = call <2 x i16> @llvm.amdgcn.struct.buffer.atomic.fadd.v2bf16(<2 x i16> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) + ret void +} + +define amdgpu_ps void @raw_buffer_atomic_add_v2bf16(<2 x i16> %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +; GFX12-LABEL: raw_buffer_atomic_add_v2bf16: +; GFX12: ; %bb.0: +; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], s4 offen +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: raw_buffer_atomic_add_v2bf16: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], s4 offen +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm + %ret = call <2 x i16> @llvm.amdgcn.raw.buffer.atomic.fadd.v2bf16(<2 x i16> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + ret void +} + +define amdgpu_ps float @raw_buffer_atomic_add_v2bf16_ret(<2 x i16> %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) { +; GFX12-LABEL: raw_buffer_atomic_add_v2bf16_ret: +; GFX12: ; %bb.0: +; GFX12-NEXT: 
buffer_atomic_pk_add_bf16 v0, v1, s[0:3], s4 offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: ; return to shader part epilog +; +; GFX12-GISEL-LABEL: raw_buffer_atomic_add_v2bf16_ret: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], s4 offen th:TH_ATOMIC_RETURN +; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: ; return to shader part epilog + %orig = call <2 x i16> @llvm.amdgcn.raw.buffer.atomic.fadd.v2bf16(<2 x i16> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) + %ret = bitcast <2 x i16> %orig to float + ret float %ret +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll new file mode 100644 index 00000000000000..9affd307c6a970 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll @@ -0,0 +1,60 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s +; RUN: llc -march=amdgcn -global-isel=1 -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s + +define amdgpu_ps float @atomic_pk_add_f16_1d_v2(<8 x i32> inreg %rsrc, <2 x half> %data, i32 %s) { +; GFX12-LABEL: atomic_pk_add_f16_1d_v2: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: image_atomic_pk_add_f16 v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: ; return to shader part epilog +main_body: + %out = call <2 x half> @llvm.amdgcn.image.atomic.pk.add.f16.1d.v2f16.v2f16(<2 x half> %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + %out_float = bitcast <2 x half> %out to float + ret float %out_float +} + +define amdgpu_ps float @atomic_pk_add_f16_1d_v4(<8 x i32> inreg %rsrc, <4 x half> %data, i32 %s) { +; GFX12-LABEL: atomic_pk_add_f16_1d_v4: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: image_atomic_pk_add_f16 
v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: ; return to shader part epilog +main_body: + %out = call <4 x half> @llvm.amdgcn.image.atomic.pk.add.f16.1d.v4f16.v4f16(<4 x half> %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + %out_v2f32 = bitcast <4 x half> %out to <2 x float> + %out0 = extractelement <2 x float> %out_v2f32, i32 0 + ret float %out0 +} + +define amdgpu_ps float @atomic_pk_add_bf16_1d_v2(<8 x i32> inreg %rsrc, <2 x i16> %data, i32 %s) { +; GFX12-LABEL: atomic_pk_add_bf16_1d_v2: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: image_atomic_pk_add_bf16 v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: ; return to shader part epilog +main_body: + %out = call <2 x i16> @llvm.amdgcn.image.atomic.pk.add.bf16.1d.v2i16.v2i16(<2 x i16> %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + %out_i32 = bitcast <2 x i16> %out to i32 + %out_float = bitcast i32 %out_i32 to float + ret float %out_float +} + +define amdgpu_ps float @atomic_pk_add_bf16_1d_v4(<8 x i32> inreg %rsrc, <4 x i16> %data, i32 %s) { +; GFX12-LABEL: atomic_pk_add_bf16_1d_v4: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: image_atomic_pk_add_bf16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: ; return to shader part epilog +main_body: + %out = call <4 x i16> @llvm.amdgcn.image.atomic.pk.add.bf16.1d.v4i16.v4i16(<4 x i16> %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + %out_v2i32 = bitcast <4 x i16> %out to <2 x i32> + %out0 = extractelement <2 x i32> %out_v2i32, i32 0 + %out_float = bitcast i32 %out0 to float + ret float %out_float +} + +declare <2 x half> @llvm.amdgcn.image.atomic.pk.add.f16.1d.v2f16.v2f16(<2 x half>, i32, <8 x i32>, i32, i32) +declare <4 x half> @llvm.amdgcn.image.atomic.pk.add.f16.1d.v4f16.v4f16(<4 x half>, i32, <8 x i32>, i32, i32) +declare <2 x i16> 
@llvm.amdgcn.image.atomic.pk.add.bf16.1d.v2i16.v2i16(<2 x i16>, i32, <8 x i32>, i32, i32) +declare <4 x i16> @llvm.amdgcn.image.atomic.pk.add.bf16.1d.v4i16.v4i16(<4 x i16>, i32, <8 x i32>, i32, i32) diff --git a/llvm/test/MC/AMDGPU/gfx11_unsupported.s b/llvm/test/MC/AMDGPU/gfx11_unsupported.s index 89078c1ad4e049..d530d2e09c7628 100644 --- a/llvm/test/MC/AMDGPU/gfx11_unsupported.s +++ b/llvm/test/MC/AMDGPU/gfx11_unsupported.s @@ -22,6 +22,9 @@ buffer_atomic_min_f64 v[2:3], off, s[12:15], s4 offset:4095 buffer_atomic_pk_add_f16 v0, v2, s[4:7], 0 idxen glc // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU +buffer_atomic_pk_add_bf16 v5, off, s[8:11], s3 offset:8388607 +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + buffer_inv // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU @@ -169,9 +172,15 @@ flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] glc flat_atomic_pk_add_bf16 a4, v[2:3], a1 sc0 // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU +flat_atomic_pk_add_bf16 v1, v[2:3], v2 th:TH_ATOMIC_RETURN +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + flat_atomic_pk_add_f16 a4, v[2:3], a1 sc0 // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU +flat_atomic_pk_add_f16 v1, v[2:3], v2 th:TH_ATOMIC_RETURN +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off glc // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU @@ -196,6 +205,9 @@ global_atomic_pk_add_bf16 a4, v[2:3], a1, off sc0 global_atomic_pk_add_f16 v0, v[0:1], v2, off glc // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU +global_atomic_pk_add_f16 v1, v0, v2, s[0:1] offset:-64 th:TH_ATOMIC_RETURN +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + global_load_lds_dword v2, 
s[4:5] offset:4 // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU @@ -220,6 +232,12 @@ image_atomic_fmax v4, v32, s[96:103] dmask:0x1 dim:SQ_RSRC_IMG_1D glc image_atomic_fmin v4, v32, s[96:103] dmask:0x1 dim:SQ_RSRC_IMG_1D glc // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU +image_atomic_pk_add_f16 v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +image_atomic_pk_add_bf16 v4, [v4, v5, v6], s[8:15] dmask:0x1 dim:SQ_RSRC_IMG_3D +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + image_gather4_b_cl_o v[252:255], v[1:8], s[8:15], s[12:15] dmask:0x1 // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_ds.s b/llvm/test/MC/AMDGPU/gfx12_asm_ds.s index ba32dc8820eaad..2cc7d6b085c9e9 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_ds.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_ds.s @@ -939,6 +939,81 @@ ds_permute_b32 v5, v1, v2 offset:0 ds_permute_b32 v255, v255, v255 offset:4 // GFX12: [0x04,0x00,0xc8,0xda,0xff,0xff,0x00,0xff] +ds_pk_add_f16 v2, v1 +// GFX12: [0x00,0x00,0x68,0xda,0x02,0x01,0x00,0x00] + +ds_pk_add_f16 v2, v1 offset:0 +// GFX12: [0x00,0x00,0x68,0xda,0x02,0x01,0x00,0x00] + +ds_pk_add_f16 v2, v1 offset:4660 +// GFX12: [0x34,0x12,0x68,0xda,0x02,0x01,0x00,0x00] + +ds_pk_add_f16 v2, v1 offset:65535 +// GFX12: [0xff,0xff,0x68,0xda,0x02,0x01,0x00,0x00] + +ds_pk_add_f16 v255, v255 +// GFX12: [0x00,0x00,0x68,0xda,0xff,0xff,0x00,0x00] + +ds_pk_add_f16 v255, v255 offset:0 +// GFX12: [0x00,0x00,0x68,0xda,0xff,0xff,0x00,0x00] + +ds_pk_add_f16 v255, v255 offset:4660 +// GFX12: [0x34,0x12,0x68,0xda,0xff,0xff,0x00,0x00] + +ds_pk_add_f16 v255, v255 offset:65535 +// GFX12: [0xff,0xff,0x68,0xda,0xff,0xff,0x00,0x00] + +ds_pk_add_f16 v0, v0 +// GFX12: [0x00,0x00,0x68,0xda,0x00,0x00,0x00,0x00] + +ds_pk_add_bf16 v2, v1 +// GFX12: 
[0x00,0x00,0x6c,0xda,0x02,0x01,0x00,0x00] + +ds_pk_add_bf16 v2, v1 offset:0 +// GFX12: [0x00,0x00,0x6c,0xda,0x02,0x01,0x00,0x00] + +ds_pk_add_bf16 v255, v255 +// GFX12: [0x00,0x00,0x6c,0xda,0xff,0xff,0x00,0x00] + +ds_pk_add_bf16 v255, v255 offset:4660 +// GFX12: [0x34,0x12,0x6c,0xda,0xff,0xff,0x00,0x00] + +ds_pk_add_bf16 v0, v0 +// GFX12: [0x00,0x00,0x6c,0xda,0x00,0x00,0x00,0x00] + +ds_pk_add_bf16 v0, v0 offset:65535 +// GFX12: [0xff,0xff,0x6c,0xda,0x00,0x00,0x00,0x00] + +ds_pk_add_rtn_f16 v3, v2, v1 +// GFX12: [0x00,0x00,0xa8,0xda,0x02,0x01,0x00,0x03] + +ds_pk_add_rtn_f16 v3, v2, v1 offset:4660 +// GFX12: [0x34,0x12,0xa8,0xda,0x02,0x01,0x00,0x03] + +ds_pk_add_rtn_f16 v255, v0, v200 +// GFX12: [0x00,0x00,0xa8,0xda,0x00,0xc8,0x00,0xff] + +ds_pk_add_rtn_f16 v255, v0, v200 offset:65535 +// GFX12: [0xff,0xff,0xa8,0xda,0x00,0xc8,0x00,0xff] + +ds_pk_add_rtn_f16 v255, v255, v255 +// GFX12: [0x00,0x00,0xa8,0xda,0xff,0xff,0x00,0xff] + +ds_pk_add_rtn_bf16 v3, v2, v1 +// GFX12: [0x00,0x00,0xac,0xda,0x02,0x01,0x00,0x03] + +ds_pk_add_rtn_bf16 v3, v2, v1 offset:4660 +// GFX12: [0x34,0x12,0xac,0xda,0x02,0x01,0x00,0x03] + +ds_pk_add_rtn_bf16 v255, v0, v200 +// GFX12: [0x00,0x00,0xac,0xda,0x00,0xc8,0x00,0xff] + +ds_pk_add_rtn_bf16 v255, v255, v255 +// GFX12: [0x00,0x00,0xac,0xda,0xff,0xff,0x00,0xff] + +ds_pk_add_rtn_bf16 v255, v255, v255 offset:65535 +// GFX12: [0xff,0xff,0xac,0xda,0xff,0xff,0x00,0xff] + ds_read2_b32 v[5:6], v1 // GFX12: [0x00,0x00,0xdc,0xd8,0x01,0x00,0x00,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mubuf.s b/llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mubuf.s index a7a256cfd2b8fe..ca9fb549769bc8 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mubuf.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mubuf.s @@ -2527,6 +2527,138 @@ buffer_store_format_xyzw v[1:4], off, s[12:15], s4 offset:8388607 th:TH_STORE_NT buffer_store_format_xyzw v[1:4], off, s[12:15], s4 offset:8388607 th:TH_STORE_BYPASS scope:SCOPE_SYS // GFX12: encoding: 
[0x04,0xc0,0x01,0xc4,0x01,0x18,0xbc,0x00,0x00,0xff,0xff,0x7f] +buffer_atomic_pk_add_f16 v5, off, s[8:11], s3 offset:8388607 +// GFX12: encoding: [0x03,0x40,0x16,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] + +buffer_atomic_pk_add_f16 v255, off, s[8:11], s3 offset:8388607 +// GFX12: encoding: [0x03,0x40,0x16,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] + +buffer_atomic_pk_add_f16 v5, off, s[12:15], s3 offset:8388607 +// GFX12: encoding: [0x03,0x40,0x16,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] + +buffer_atomic_pk_add_f16 v5, off, s[96:99], s3 offset:8388607 +// GFX12: encoding: [0x03,0x40,0x16,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f] + +buffer_atomic_pk_add_f16 v5, off, s[8:11], s101 offset:8388607 +// GFX12: encoding: [0x65,0x40,0x16,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] + +buffer_atomic_pk_add_f16 v5, off, s[8:11], m0 offset:8388607 +// GFX12: encoding: [0x7d,0x40,0x16,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] + +buffer_atomic_pk_add_f16 v5, v0, s[8:11], s3 idxen offset:8388607 +// GFX12: encoding: [0x03,0x40,0x16,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f] + +buffer_atomic_pk_add_f16 v5, v0, s[8:11], s3 offen offset:8388607 +// GFX12: encoding: [0x03,0x40,0x16,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f] + +buffer_atomic_pk_add_f16 v5, off, s[8:11], s3 +// GFX12: encoding: [0x03,0x40,0x16,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] + +buffer_atomic_pk_add_f16 v5, off, s[8:11], s3 +// GFX12: encoding: [0x03,0x40,0x16,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] + +buffer_atomic_pk_add_f16 v5, off, s[8:11], s3 offset:7 +// GFX12: encoding: [0x03,0x40,0x16,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00] + +buffer_atomic_pk_add_f16 v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN +// GFX12: encoding: [0x03,0x40,0x16,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f] + +buffer_atomic_pk_add_f16 v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN scope:SCOPE_SE +// GFX12: encoding: 
[0x03,0x40,0x16,0xc4,0x05,0x10,0x94,0x00,0x00,0xff,0xff,0x7f] + +buffer_atomic_pk_add_f16 v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_CASCADE_NT scope:SCOPE_DEV +// GFX12: encoding: [0x03,0x40,0x16,0xc4,0x05,0x10,0xe8,0x00,0x00,0xff,0xff,0x7f] + +buffer_atomic_pk_add_f16 v5, off, s[8:11], 0 offset:8388607 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +buffer_atomic_pk_add_f16 v5, off, s[8:11], -1 offset:8388607 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +buffer_atomic_pk_add_f16 v5, off, s[8:11], 0.5 offset:8388607 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +buffer_atomic_pk_add_f16 v5, off, s[8:11], -4.0 offset:8388607 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +buffer_atomic_pk_add_f16 v5, off, s[8:11], s3 offset:8388607 glc +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_pk_add_f16 v5, off, s[8:11], s3 offset:8388607 slc +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_pk_add_f16 v5, off, s[8:11], s3 offset:8388607 dlc +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_pk_add_f16 v5, off, s[8:11], s3 offset:8388607 glc slc dlc +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_pk_add_bf16 v5, off, s[8:11], s3 offset:8388607 +// GFX12: encoding: [0x03,0x80,0x16,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] + +buffer_atomic_pk_add_bf16 v255, off, s[8:11], s3 offset:8388607 +// GFX12: encoding: [0x03,0x80,0x16,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] + +buffer_atomic_pk_add_bf16 v5, off, s[12:15], s3 offset:8388607 +// GFX12: encoding: [0x03,0x80,0x16,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] + +buffer_atomic_pk_add_bf16 v5, off, s[96:99], s3 offset:8388607 +// GFX12: 
encoding: [0x03,0x80,0x16,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f] + +buffer_atomic_pk_add_bf16 v5, off, s[8:11], s101 offset:8388607 +// GFX12: encoding: [0x65,0x80,0x16,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] + +buffer_atomic_pk_add_bf16 v5, off, s[8:11], m0 offset:8388607 +// GFX12: encoding: [0x7d,0x80,0x16,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] + +buffer_atomic_pk_add_bf16 v5, v0, s[8:11], s3 idxen offset:8388607 +// GFX12: encoding: [0x03,0x80,0x16,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f] + +buffer_atomic_pk_add_bf16 v5, v0, s[8:11], s3 offen offset:8388607 +// GFX12: encoding: [0x03,0x80,0x16,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f] + +buffer_atomic_pk_add_bf16 v5, off, s[8:11], s3 +// GFX12: encoding: [0x03,0x80,0x16,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] + +buffer_atomic_pk_add_bf16 v5, off, s[8:11], s3 +// GFX12: encoding: [0x03,0x80,0x16,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] + +buffer_atomic_pk_add_bf16 v5, off, s[8:11], s3 offset:7 +// GFX12: encoding: [0x03,0x80,0x16,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00] + +buffer_atomic_pk_add_bf16 v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN +// GFX12: encoding: [0x03,0x80,0x16,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f] + +buffer_atomic_pk_add_bf16 v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN scope:SCOPE_SE +// GFX12: encoding: [0x03,0x80,0x16,0xc4,0x05,0x10,0x94,0x00,0x00,0xff,0xff,0x7f] + +buffer_atomic_pk_add_bf16 v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_CASCADE_NT scope:SCOPE_DEV +// GFX12: encoding: [0x03,0x80,0x16,0xc4,0x05,0x10,0xe8,0x00,0x00,0xff,0xff,0x7f] + +buffer_atomic_pk_add_bf16 v5, off, s[8:11], 0 offset:8388607 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_pk_add_bf16 v5, off, s[8:11], -1 offset:8388607 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_pk_add_bf16 v5, off, s[8:11], 0.5 offset:8388607 +// GFX12-ERR: 
:[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_pk_add_bf16 v5, off, s[8:11], -4.0 offset:8388607 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_pk_add_bf16 v5, off, s[8:11], s3 offset:8388607 glc +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_pk_add_bf16 v5, off, s[8:11], s3 offset:8388607 slc +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_pk_add_bf16 v5, off, s[8:11], s3 offset:8388607 dlc +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_pk_add_bf16 v5, off, s[8:11], s3 offset:8388607 glc slc dlc +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + buffer_atomic_add_f32 v5, off, s[8:11], s3 offset:8388607 // GFX12: encoding: [0x03,0x80,0x15,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vflat.s b/llvm/test/MC/AMDGPU/gfx12_asm_vflat.s index c0ffc5247d90e8..c435bc5a16dbed 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vflat.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vflat.s @@ -882,6 +882,66 @@ global_atomic_sub_clamp_u32 v1, v[0:1], v2, off offset:64 th:TH_ATOMIC_RETURN global_atomic_sub_clamp_u32 v[0:1], v2, off offset:64 // GFX12: encoding: [0x7c,0xc0,0x0d,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +global_atomic_pk_add_f16 v1, v0, v2, s[0:1] offset:-64 th:TH_ATOMIC_RETURN +// GFX12: encoding: [0x00,0x40,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] + +global_atomic_pk_add_f16 v1, v0, v2, s[0:1] th:TH_ATOMIC_RETURN +// GFX12: encoding: [0x00,0x40,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0x00,0x00,0x00] + +global_atomic_pk_add_f16 v1, v0, v2, s[0:1] offset:64 th:TH_ATOMIC_RETURN +// GFX12: encoding: [0x00,0x40,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] + +global_atomic_pk_add_f16 v1, v[0:1], v2, off offset:-64 th:TH_ATOMIC_RETURN +// GFX12: encoding: 
[0x7c,0x40,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] + +global_atomic_pk_add_f16 v1, v[0:1], v2, off offset:64 th:TH_ATOMIC_RETURN +// GFX12: encoding: [0x7c,0x40,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] + +global_atomic_pk_add_f16 v0, v2, s[0:1] offset:-64 +// GFX12: encoding: [0x00,0x40,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] + +global_atomic_pk_add_f16 v0, v2, s[0:1] +// GFX12: encoding: [0x00,0x40,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00] + +global_atomic_pk_add_f16 v0, v2, s[0:1] offset:64 +// GFX12: encoding: [0x00,0x40,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] + +global_atomic_pk_add_f16 v[0:1], v2, off offset:-64 +// GFX12: encoding: [0x7c,0x40,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] + +global_atomic_pk_add_f16 v[0:1], v2, off offset:64 +// GFX12: encoding: [0x7c,0x40,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] + +global_atomic_pk_add_bf16 v1, v0, v2, s[0:1] offset:-64 th:TH_ATOMIC_RETURN +// GFX12: encoding: [0x00,0x80,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] + +global_atomic_pk_add_bf16 v1, v0, v2, s[0:1] th:TH_ATOMIC_RETURN +// GFX12: encoding: [0x00,0x80,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0x00,0x00,0x00] + +global_atomic_pk_add_bf16 v1, v0, v2, s[0:1] offset:64 th:TH_ATOMIC_RETURN +// GFX12: encoding: [0x00,0x80,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] + +global_atomic_pk_add_bf16 v1, v[0:1], v2, off offset:-64 th:TH_ATOMIC_RETURN +// GFX12: encoding: [0x7c,0x80,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] + +global_atomic_pk_add_bf16 v1, v[0:1], v2, off offset:64 th:TH_ATOMIC_RETURN +// GFX12: encoding: [0x7c,0x80,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] + +global_atomic_pk_add_bf16 v0, v2, s[0:1] offset:-64 +// GFX12: encoding: [0x00,0x80,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] + +global_atomic_pk_add_bf16 v0, v2, s[0:1] +// GFX12: encoding: [0x00,0x80,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00] + +global_atomic_pk_add_bf16 v0, v2, 
s[0:1] offset:64 +// GFX12: encoding: [0x00,0x80,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] + +global_atomic_pk_add_bf16 v[0:1], v2, off offset:-64 +// GFX12: encoding: [0x7c,0x80,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] + +global_atomic_pk_add_bf16 v[0:1], v2, off offset:64 +// GFX12: encoding: [0x7c,0x80,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] + global_atomic_dec_u32 v0, v2, s[0:1] offset:-64 // GFX12: encoding: [0x00,0x00,0x10,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vimage.s b/llvm/test/MC/AMDGPU/gfx12_asm_vimage.s index 72bc164c5e9bdc..093d4de5525f44 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vimage.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vimage.s @@ -883,6 +883,60 @@ image_atomic_dec_uint v[254:255], [v4, v5, v6, v7], s[96:103] dmask:0x3 dim:SQ_R image_atomic_dec_uint v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_NT // GFX12: encoding: [0x00,0x80,0x45,0xd0,0x00,0x00,0x20,0x00,0x00,0x00,0x00,0x00] +image_atomic_pk_add_f16 v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +// GFX12: encoding: [0x00,0x80,0x61,0xd0,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00] + +image_atomic_pk_add_f16 v1, [v2, v3], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D +// GFX12: encoding: [0x01,0x80,0x61,0xd0,0x01,0x08,0x00,0x00,0x02,0x03,0x00,0x00] + +image_atomic_pk_add_f16 v4, [v4, v5, v6], s[8:15] dmask:0x1 dim:SQ_RSRC_IMG_3D +// GFX12: encoding: [0x02,0x80,0x61,0xd0,0x04,0x10,0x00,0x00,0x04,0x05,0x06,0x00] + +image_atomic_pk_add_f16 v255, [v4, v5, v6], s[8:15] dmask:0x1 dim:SQ_RSRC_IMG_CUBE +// GFX12: encoding: [0x03,0x80,0x61,0xd0,0xff,0x10,0x00,0x00,0x04,0x05,0x06,0x00] + +image_atomic_pk_add_f16 v[0:1], [v4, v5], s[8:15] dmask:0x3 dim:SQ_RSRC_IMG_1D_ARRAY +// GFX12: encoding: [0x04,0x80,0xe1,0xd0,0x00,0x10,0x00,0x00,0x04,0x05,0x00,0x00] + +image_atomic_pk_add_f16 v[1:2], [v4, v5, v6], s[8:15] dmask:0x3 dim:SQ_RSRC_IMG_2D_ARRAY +// GFX12: encoding: [0x05,0x80,0xe1,0xd0,0x01,0x10,0x00,0x00,0x04,0x05,0x06,0x00] + 
+image_atomic_pk_add_f16 v[3:4], [v4, v5, v6], s[8:15] dmask:0x3 dim:SQ_RSRC_IMG_2D_MSAA +// GFX12: encoding: [0x06,0x80,0xe1,0xd0,0x03,0x10,0x00,0x00,0x04,0x05,0x06,0x00] + +image_atomic_pk_add_f16 v[254:255], [v4, v5, v6, v7], s[96:103] dmask:0x3 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY +// GFX12: encoding: [0x07,0x80,0xe1,0xd0,0xfe,0xc0,0x00,0x00,0x04,0x05,0x06,0x07] + +image_atomic_pk_add_f16 v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_NT +// GFX12: encoding: [0x00,0x80,0x61,0xd0,0x00,0x00,0x20,0x00,0x00,0x00,0x00,0x00] + +image_atomic_pk_add_bf16 v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +// GFX12: encoding: [0x00,0xc0,0x61,0xd0,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00] + +image_atomic_pk_add_bf16 v1, [v2, v3], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D +// GFX12: encoding: [0x01,0xc0,0x61,0xd0,0x01,0x08,0x00,0x00,0x02,0x03,0x00,0x00] + +image_atomic_pk_add_bf16 v4, [v4, v5, v6], s[8:15] dmask:0x1 dim:SQ_RSRC_IMG_3D +// GFX12: encoding: [0x02,0xc0,0x61,0xd0,0x04,0x10,0x00,0x00,0x04,0x05,0x06,0x00] + +image_atomic_pk_add_bf16 v255, [v4, v5, v6], s[8:15] dmask:0x1 dim:SQ_RSRC_IMG_CUBE +// GFX12: encoding: [0x03,0xc0,0x61,0xd0,0xff,0x10,0x00,0x00,0x04,0x05,0x06,0x00] + +image_atomic_pk_add_bf16 v[0:1], [v4, v5], s[8:15] dmask:0x3 dim:SQ_RSRC_IMG_1D_ARRAY +// GFX12: encoding: [0x04,0xc0,0xe1,0xd0,0x00,0x10,0x00,0x00,0x04,0x05,0x00,0x00] + +image_atomic_pk_add_bf16 v[1:2], [v4, v5, v6], s[8:15] dmask:0x3 dim:SQ_RSRC_IMG_2D_ARRAY +// GFX12: encoding: [0x05,0xc0,0xe1,0xd0,0x01,0x10,0x00,0x00,0x04,0x05,0x06,0x00] + +image_atomic_pk_add_bf16 v[3:4], [v4, v5, v6], s[8:15] dmask:0x3 dim:SQ_RSRC_IMG_2D_MSAA +// GFX12: encoding: [0x06,0xc0,0xe1,0xd0,0x03,0x10,0x00,0x00,0x04,0x05,0x06,0x00] + +image_atomic_pk_add_bf16 v[254:255], [v4, v5, v6, v7], s[96:103] dmask:0x3 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY +// GFX12: encoding: [0x07,0xc0,0xe1,0xd0,0xfe,0xc0,0x00,0x00,0x04,0x05,0x06,0x07] + +image_atomic_pk_add_bf16 v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_NT +// GFX12: 
encoding: [0x00,0xc0,0x61,0xd0,0x00,0x00,0x20,0x00,0x00,0x00,0x00,0x00] + image_bvh_intersect_ray v[4:7], [v9, v10, v[11:13], v[14:16], v[17:19]], s[4:7] // GFX12: encoding: [0x10,0x40,0xc6,0xd3,0x04,0x08,0x00,0x11,0x09,0x0a,0x0b,0x0e] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_ds.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_ds.txt index d3c0e714949907..d52c4fef0b8ad4 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_ds.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_ds.txt @@ -1659,6 +1659,66 @@ # GFX12: ds_permute_b32 v255, v254, v253 offset:65535 ; encoding: [0xff,0xff,0xc8,0xda,0xfe,0xfd,0x00,0xff] 0xff,0xff,0xc8,0xda,0xfe,0xfd,0x00,0xff +# GFX12: ds_pk_add_f16 v2, v1 ; encoding: [0x00,0x00,0x68,0xda,0x02,0x01,0x00,0x00] +0x00,0x00,0x68,0xda,0x02,0x01,0x00,0x00 + +# GFX12: ds_pk_add_f16 v2, v1 offset:65535 ; encoding: [0xff,0xff,0x68,0xda,0x02,0x01,0x00,0x00] +0xff,0xff,0x68,0xda,0x02,0x01,0x00,0x00 + +# GFX12: ds_pk_add_f16 v255, v255 ; encoding: [0x00,0x00,0x68,0xda,0xff,0xff,0x00,0x00] +0x00,0x00,0x68,0xda,0xff,0xff,0x00,0x00 + +# GFX12: ds_pk_add_f16 v0, v0 ; encoding: [0x00,0x00,0x68,0xda,0x00,0x00,0x00,0x00] +0x00,0x00,0x68,0xda,0x00,0x00,0x00,0x00 + +# GFX12: ds_pk_add_f16 v0, v0 offset:4660 ; encoding: [0x34,0x12,0x68,0xda,0x00,0x00,0x00,0x00] +0x34,0x12,0x68,0xda,0x00,0x00,0x00,0x00 + +# GFX12: ds_pk_add_bf16 v2, v1 ; encoding: [0x00,0x00,0x6c,0xda,0x02,0x01,0x00,0x00] +0x00,0x00,0x6c,0xda,0x02,0x01,0x00,0x00 + +# GFX12: ds_pk_add_f16 v0, v0 offset:4660 ; encoding: [0x34,0x12,0x68,0xda,0x00,0x00,0x00,0x00] +0x34,0x12,0x68,0xda,0x00,0x00,0x00,0x00 + +# GFX12: ds_pk_add_bf16 v255, v255 ; encoding: [0x00,0x00,0x6c,0xda,0xff,0xff,0x00,0x00] +0x00,0x00,0x6c,0xda,0xff,0xff,0x00,0x00 + +# GFX12: ds_pk_add_bf16 v255, v255 offset:65535 ; encoding: [0xff,0xff,0x6c,0xda,0xff,0xff,0x00,0x00] +0xff,0xff,0x6c,0xda,0xff,0xff,0x00,0x00 + +# GFX12: ds_pk_add_bf16 v0, v0 ; encoding: [0x00,0x00,0x6c,0xda,0x00,0x00,0x00,0x00] 
+0x00,0x00,0x6c,0xda,0x00,0x00,0x00,0x00 + +# GFX12: ds_pk_add_rtn_f16 v3, v2, v1 ; encoding: [0x00,0x00,0xa8,0xda,0x02,0x01,0x00,0x03] +0x00,0x00,0xa8,0xda,0x02,0x01,0x00,0x03 + +# GFX12: ds_pk_add_rtn_f16 v3, v2, v1 offset:4660 ; encoding: [0x34,0x12,0xa8,0xda,0x02,0x01,0x00,0x03] +0x34,0x12,0xa8,0xda,0x02,0x01,0x00,0x03 + +# GFX12: ds_pk_add_rtn_f16 v255, v0, v200 ; encoding: [0x00,0x00,0xa8,0xda,0x00,0xc8,0x00,0xff] +0x00,0x00,0xa8,0xda,0x00,0xc8,0x00,0xff + +# GFX12: ds_pk_add_rtn_f16 v255, v0, v200 offset:65535 ; encoding: [0xff,0xff,0xa8,0xda,0x00,0xc8,0x00,0xff] +0xff,0xff,0xa8,0xda,0x00,0xc8,0x00,0xff + +# GFX12: ds_pk_add_rtn_f16 v255, v255, v255 ; encoding: [0x00,0x00,0xa8,0xda,0xff,0xff,0x00,0xff] +0x00,0x00,0xa8,0xda,0xff,0xff,0x00,0xff + +# GFX12: ds_pk_add_rtn_bf16 v3, v2, v1 ; encoding: [0x00,0x00,0xac,0xda,0x02,0x01,0x00,0x03] +0x00,0x00,0xac,0xda,0x02,0x01,0x00,0x03 + +# GFX12: ds_pk_add_rtn_bf16 v3, v2, v1 offset:4660 ; encoding: [0x34,0x12,0xac,0xda,0x02,0x01,0x00,0x03] +0x34,0x12,0xac,0xda,0x02,0x01,0x00,0x03 + +# GFX12: ds_pk_add_rtn_bf16 v255, v0, v200 ; encoding: [0x00,0x00,0xac,0xda,0x00,0xc8,0x00,0xff] +0x00,0x00,0xac,0xda,0x00,0xc8,0x00,0xff + +# GFX12: ds_pk_add_rtn_bf16 v255, v255, v255 ; encoding: [0x00,0x00,0xac,0xda,0xff,0xff,0x00,0xff] +0x00,0x00,0xac,0xda,0xff,0xff,0x00,0xff + +# GFX12: ds_pk_add_rtn_bf16 v255, v255, v255 offset:65535 ; encoding: [0xff,0xff,0xac,0xda,0xff,0xff,0x00,0xff] +0xff,0xff,0xac,0xda,0xff,0xff,0x00,0xff + # GFX12: ds_load_2addr_b32 v[254:255], v1 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0xdc,0xd8,0x01,0x00,0x00,0xfe] 0x7f,0xff,0xdc,0xd8,0x01,0x00,0x00,0xfe diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vbuffer_mubuf.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vbuffer_mubuf.txt index ff8437155e12ed..d882fcb6c69a0b 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vbuffer_mubuf.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vbuffer_mubuf.txt @@ -1560,6 +1560,90 @@ # GFX12: 
buffer_store_format_xyzw v[1:4], off, s[12:15], s4 offset:8388607 th:TH_STORE_BYPASS scope:SCOPE_SYS ; encoding: [0x04,0xc0,0x01,0xc4,0x01,0x18,0xbc,0x00,0x00,0xff,0xff,0x7f] 0x04,0xc0,0x01,0xc4,0x01,0x18,0xbc,0x00,0x00,0xff,0xff,0x7f +# GFX12: buffer_atomic_pk_add_f16 v5, off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x40,0x16,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] +0x03,0x40,0x16,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f + +# GFX12: buffer_atomic_pk_add_f16 v255, off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x40,0x16,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] +0x03,0x40,0x16,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f + +# GFX12: buffer_atomic_pk_add_f16 v5, off, s[12:15], s3 offset:8388607 ; encoding: [0x03,0x40,0x16,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] +0x03,0x40,0x16,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f + +# GFX12: buffer_atomic_pk_add_f16 v5, off, s[96:99], s3 offset:8388607 ; encoding: [0x03,0x40,0x16,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f] +0x03,0x40,0x16,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f + +# GFX12: buffer_atomic_pk_add_f16 v5, off, s[8:11], s101 offset:8388607 ; encoding: [0x65,0x40,0x16,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] +0x65,0x40,0x16,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f + +# GFX12: buffer_atomic_pk_add_f16 v5, off, s[8:11], m0 offset:8388607 ; encoding: [0x7d,0x40,0x16,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] +0x7d,0x40,0x16,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f + +# GFX12: buffer_atomic_pk_add_f16 v5, v0, s[8:11], s3 idxen offset:8388607 ; encoding: [0x03,0x40,0x16,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f] +0x03,0x40,0x16,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f + +# GFX12: buffer_atomic_pk_add_f16 v5, v0, s[8:11], s3 offen offset:8388607 ; encoding: [0x03,0x40,0x16,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f] +0x03,0x40,0x16,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f + +# GFX12: buffer_atomic_pk_add_f16 v5, off, s[8:11], s3 ; encoding: 
[0x03,0x40,0x16,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] +0x03,0x40,0x16,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00 + +# GFX12: buffer_atomic_pk_add_f16 v5, off, s[8:11], s3 ; encoding: [0x03,0x40,0x16,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] +0x03,0x40,0x16,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00 + +# GFX12: buffer_atomic_pk_add_f16 v5, off, s[8:11], s3 offset:7 ; encoding: [0x03,0x40,0x16,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00] +0x03,0x40,0x16,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00 + +# GFX12: buffer_atomic_pk_add_f16 v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN ; encoding: [0x03,0x40,0x16,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f] +0x03,0x40,0x16,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f + +# GFX12: buffer_atomic_pk_add_f16 v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; encoding: [0x03,0x40,0x16,0xc4,0x05,0x10,0x94,0x00,0x00,0xff,0xff,0x7f] +0x03,0x40,0x16,0xc4,0x05,0x10,0x94,0x00,0x00,0xff,0xff,0x7f + +# GFX12: buffer_atomic_pk_add_f16 v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_CASCADE_NT scope:SCOPE_DEV ; encoding: [0x03,0x40,0x16,0xc4,0x05,0x10,0xe8,0x00,0x00,0xff,0xff,0x7f] +0x03,0x40,0x16,0xc4,0x05,0x10,0xe8,0x00,0x00,0xff,0xff,0x7f + +# GFX12: buffer_atomic_pk_add_bf16 v5, off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x80,0x16,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] +0x03,0x80,0x16,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f + +# GFX12: buffer_atomic_pk_add_bf16 v255, off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x80,0x16,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] +0x03,0x80,0x16,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f + +# GFX12: buffer_atomic_pk_add_bf16 v5, off, s[12:15], s3 offset:8388607 ; encoding: [0x03,0x80,0x16,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] +0x03,0x80,0x16,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f + +# GFX12: buffer_atomic_pk_add_bf16 v5, off, s[96:99], s3 offset:8388607 ; encoding: 
[0x03,0x80,0x16,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f] +0x03,0x80,0x16,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f + +# GFX12: buffer_atomic_pk_add_bf16 v5, off, s[8:11], s101 offset:8388607 ; encoding: [0x65,0x80,0x16,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] +0x65,0x80,0x16,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f + +# GFX12: buffer_atomic_pk_add_bf16 v5, off, s[8:11], m0 offset:8388607 ; encoding: [0x7d,0x80,0x16,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] +0x7d,0x80,0x16,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f + +# GFX12: buffer_atomic_pk_add_bf16 v5, v0, s[8:11], s3 idxen offset:8388607 ; encoding: [0x03,0x80,0x16,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f] +0x03,0x80,0x16,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f + +# GFX12: buffer_atomic_pk_add_bf16 v5, v0, s[8:11], s3 offen offset:8388607 ; encoding: [0x03,0x80,0x16,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f] +0x03,0x80,0x16,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f + +# GFX12: buffer_atomic_pk_add_bf16 v5, off, s[8:11], s3 ; encoding: [0x03,0x80,0x16,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] +0x03,0x80,0x16,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00 + +# GFX12: buffer_atomic_pk_add_bf16 v5, off, s[8:11], s3 ; encoding: [0x03,0x80,0x16,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] +0x03,0x80,0x16,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00 + +# GFX12: buffer_atomic_pk_add_bf16 v5, off, s[8:11], s3 offset:7 ; encoding: [0x03,0x80,0x16,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00] +0x03,0x80,0x16,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00 + +# GFX12: buffer_atomic_pk_add_bf16 v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN ; encoding: [0x03,0x80,0x16,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f] +0x03,0x80,0x16,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f + +# GFX12: buffer_atomic_pk_add_bf16 v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; encoding: [0x03,0x80,0x16,0xc4,0x05,0x10,0x94,0x00,0x00,0xff,0xff,0x7f] 
+0x03,0x80,0x16,0xc4,0x05,0x10,0x94,0x00,0x00,0xff,0xff,0x7f + +# GFX12: buffer_atomic_pk_add_bf16 v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_CASCADE_NT scope:SCOPE_DEV ; encoding: [0x03,0x80,0x16,0xc4,0x05,0x10,0xe8,0x00,0x00,0xff,0xff,0x7f] +0x03,0x80,0x16,0xc4,0x05,0x10,0xe8,0x00,0x00,0xff,0xff,0x7f + # GFX12: buffer_atomic_add_f32 v5, off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x80,0x15,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] 0x03,0x80,0x15,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vflat.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vflat.txt index d7f9daf295845a..78a96f68918c46 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vflat.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vflat.txt @@ -525,6 +525,66 @@ # GFX12: global_atomic_sub_clamp_u32 v[0:1], v2, off offset:64 ; encoding: [0x7c,0xc0,0x0d,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] 0x7c,0xc0,0x0d,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 +# GFX12: global_atomic_pk_add_f16 v1, v0, v2, s[0:1] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x40,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x00,0x40,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX12: global_atomic_pk_add_f16 v1, v0, v2, s[0:1] th:TH_ATOMIC_RETURN ; encoding: [0x00,0x40,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0x00,0x00,0x00] +0x00,0x40,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0x00,0x00,0x00 + +# GFX12: global_atomic_pk_add_f16 v1, v0, v2, s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x40,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x00,0x40,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX12: global_atomic_pk_add_f16 v1, v[0:1], v2, off offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX12: global_atomic_pk_add_f16 v1, v[0:1], v2, off offset:64 th:TH_ATOMIC_RETURN ; encoding: 
[0x7c,0x40,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0x40,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX12: global_atomic_pk_add_f16 v0, v2, s[0:1] offset:-64 ; encoding: [0x00,0x40,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x00,0x40,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX12: global_atomic_pk_add_f16 v0, v2, s[0:1] ; encoding: [0x00,0x40,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00] +0x00,0x40,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00 + +# GFX12: global_atomic_pk_add_f16 v0, v2, s[0:1] offset:64 ; encoding: [0x00,0x40,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x00,0x40,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX12: global_atomic_pk_add_f16 v[0:1], v2, off offset:-64 ; encoding: [0x7c,0x40,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX12: global_atomic_pk_add_f16 v[0:1], v2, off offset:64 ; encoding: [0x7c,0x40,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x40,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX12: global_atomic_pk_add_bf16 v1, v0, v2, s[0:1] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x80,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x00,0x80,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX12: global_atomic_pk_add_bf16 v1, v0, v2, s[0:1] th:TH_ATOMIC_RETURN ; encoding: [0x00,0x80,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0x00,0x00,0x00] +0x00,0x80,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0x00,0x00,0x00 + +# GFX12: global_atomic_pk_add_bf16 v1, v0, v2, s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x80,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x00,0x80,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX12: global_atomic_pk_add_bf16 v1, v[0:1], v2, off offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x80,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX12: 
global_atomic_pk_add_bf16 v1, v[0:1], v2, off offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0x80,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX12: global_atomic_pk_add_bf16 v0, v2, s[0:1] offset:-64 ; encoding: [0x00,0x80,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x00,0x80,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX12: global_atomic_pk_add_bf16 v0, v2, s[0:1] ; encoding: [0x00,0x80,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00] +0x00,0x80,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00 + +# GFX12: global_atomic_pk_add_bf16 v0, v2, s[0:1] offset:64 ; encoding: [0x00,0x80,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x00,0x80,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX12: global_atomic_pk_add_bf16 v[0:1], v2, off offset:-64 ; encoding: [0x7c,0x80,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x80,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX12: global_atomic_pk_add_bf16 v[0:1], v2, off offset:64 ; encoding: [0x7c,0x80,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x80,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + # GFX12: global_atomic_dec_u32 v0, v2, s[0:1] offset:64 ; encoding: [0x00,0x00,0x10,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] 0x00,0x00,0x10,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vimage.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vimage.txt index eff2e09e31b1fe..46331d6476171c 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vimage.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vimage.txt @@ -883,6 +883,60 @@ # GFX12: image_atomic_dec_uint v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_NT ; encoding: [0x00,0x80,0x45,0xd0,0x00,0x00,0x20,0x00,0x00,0x00,0x00,0x00] 0x00,0x80,0x45,0xd0,0x00,0x00,0x20,0x00,0x00,0x00,0x00,0x00 +# GFX12: image_atomic_pk_add_f16 v0, v0, s[0:7] dmask:0x1 
dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x80,0x61,0xd0,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00] +0x00,0x80,0x61,0xd0,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 + +# GFX12: image_atomic_pk_add_f16 v1, [v2, v3], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D ; encoding: [0x01,0x80,0x61,0xd0,0x01,0x08,0x00,0x00,0x02,0x03,0x00,0x00] +0x01,0x80,0x61,0xd0,0x01,0x08,0x00,0x00,0x02,0x03,0x00,0x00 + +# GFX12: image_atomic_pk_add_f16 v4, [v4, v5, v6], s[8:15] dmask:0x1 dim:SQ_RSRC_IMG_3D ; encoding: [0x02,0x80,0x61,0xd0,0x04,0x10,0x00,0x00,0x04,0x05,0x06,0x00] +0x02,0x80,0x61,0xd0,0x04,0x10,0x00,0x00,0x04,0x05,0x06,0x00 + +# GFX12: image_atomic_pk_add_f16 v255, [v4, v5, v6], s[8:15] dmask:0x1 dim:SQ_RSRC_IMG_CUBE ; encoding: [0x03,0x80,0x61,0xd0,0xff,0x10,0x00,0x00,0x04,0x05,0x06,0x00] +0x03,0x80,0x61,0xd0,0xff,0x10,0x00,0x00,0x04,0x05,0x06,0x00 + +# GFX12: image_atomic_pk_add_f16 v[0:1], [v4, v5], s[8:15] dmask:0x3 dim:SQ_RSRC_IMG_1D_ARRAY ; encoding: [0x04,0x80,0xe1,0xd0,0x00,0x10,0x00,0x00,0x04,0x05,0x00,0x00] +0x04,0x80,0xe1,0xd0,0x00,0x10,0x00,0x00,0x04,0x05,0x00,0x00 + +# GFX12: image_atomic_pk_add_f16 v[1:2], [v4, v5, v6], s[8:15] dmask:0x3 dim:SQ_RSRC_IMG_2D_ARRAY ; encoding: [0x05,0x80,0xe1,0xd0,0x01,0x10,0x00,0x00,0x04,0x05,0x06,0x00] +0x05,0x80,0xe1,0xd0,0x01,0x10,0x00,0x00,0x04,0x05,0x06,0x00 + +# GFX12: image_atomic_pk_add_f16 v[3:4], [v4, v5, v6], s[8:15] dmask:0x3 dim:SQ_RSRC_IMG_2D_MSAA ; encoding: [0x06,0x80,0xe1,0xd0,0x03,0x10,0x00,0x00,0x04,0x05,0x06,0x00] +0x06,0x80,0xe1,0xd0,0x03,0x10,0x00,0x00,0x04,0x05,0x06,0x00 + +# GFX12: image_atomic_pk_add_f16 v[254:255], [v4, v5, v6, v7], s[96:103] dmask:0x3 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY ; encoding: [0x07,0x80,0xe1,0xd0,0xfe,0xc0,0x00,0x00,0x04,0x05,0x06,0x07] +0x07,0x80,0xe1,0xd0,0xfe,0xc0,0x00,0x00,0x04,0x05,0x06,0x07 + +# GFX12: image_atomic_pk_add_f16 v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_NT ; encoding: [0x00,0x80,0x61,0xd0,0x00,0x00,0x20,0x00,0x00,0x00,0x00,0x00] 
+0x00,0x80,0x61,0xd0,0x00,0x00,0x20,0x00,0x00,0x00,0x00,0x00 + +# GFX12: image_atomic_pk_add_bf16 v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0xc0,0x61,0xd0,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00] +0x00,0xc0,0x61,0xd0,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 + +# GFX12: image_atomic_pk_add_bf16 v1, [v2, v3], s[4:11] dmask:0x1 dim:SQ_RSRC_IMG_2D ; encoding: [0x01,0xc0,0x61,0xd0,0x01,0x08,0x00,0x00,0x02,0x03,0x00,0x00] +0x01,0xc0,0x61,0xd0,0x01,0x08,0x00,0x00,0x02,0x03,0x00,0x00 + +# GFX12: image_atomic_pk_add_bf16 v4, [v4, v5, v6], s[8:15] dmask:0x1 dim:SQ_RSRC_IMG_3D ; encoding: [0x02,0xc0,0x61,0xd0,0x04,0x10,0x00,0x00,0x04,0x05,0x06,0x00] +0x02,0xc0,0x61,0xd0,0x04,0x10,0x00,0x00,0x04,0x05,0x06,0x00 + +# GFX12: image_atomic_pk_add_bf16 v255, [v4, v5, v6], s[8:15] dmask:0x1 dim:SQ_RSRC_IMG_CUBE ; encoding: [0x03,0xc0,0x61,0xd0,0xff,0x10,0x00,0x00,0x04,0x05,0x06,0x00] +0x03,0xc0,0x61,0xd0,0xff,0x10,0x00,0x00,0x04,0x05,0x06,0x00 + +# GFX12: image_atomic_pk_add_bf16 v[0:1], [v4, v5], s[8:15] dmask:0x3 dim:SQ_RSRC_IMG_1D_ARRAY ; encoding: [0x04,0xc0,0xe1,0xd0,0x00,0x10,0x00,0x00,0x04,0x05,0x00,0x00] +0x04,0xc0,0xe1,0xd0,0x00,0x10,0x00,0x00,0x04,0x05,0x00,0x00 + +# GFX12: image_atomic_pk_add_bf16 v[1:2], [v4, v5, v6], s[8:15] dmask:0x3 dim:SQ_RSRC_IMG_2D_ARRAY ; encoding: [0x05,0xc0,0xe1,0xd0,0x01,0x10,0x00,0x00,0x04,0x05,0x06,0x00] +0x05,0xc0,0xe1,0xd0,0x01,0x10,0x00,0x00,0x04,0x05,0x06,0x00 + +# GFX12: image_atomic_pk_add_bf16 v[3:4], [v4, v5, v6], s[8:15] dmask:0x3 dim:SQ_RSRC_IMG_2D_MSAA ; encoding: [0x06,0xc0,0xe1,0xd0,0x03,0x10,0x00,0x00,0x04,0x05,0x06,0x00] +0x06,0xc0,0xe1,0xd0,0x03,0x10,0x00,0x00,0x04,0x05,0x06,0x00 + +# GFX12: image_atomic_pk_add_bf16 v[254:255], [v4, v5, v6, v7], s[96:103] dmask:0x3 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY ; encoding: [0x07,0xc0,0xe1,0xd0,0xfe,0xc0,0x00,0x00,0x04,0x05,0x06,0x07] +0x07,0xc0,0xe1,0xd0,0xfe,0xc0,0x00,0x00,0x04,0x05,0x06,0x07 + +# GFX12: image_atomic_pk_add_bf16 v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D 
th:TH_ATOMIC_NT ; encoding: [0x00,0xc0,0x61,0xd0,0x00,0x00,0x20,0x00,0x00,0x00,0x00,0x00] +0x00,0xc0,0x61,0xd0,0x00,0x00,0x20,0x00,0x00,0x00,0x00,0x00 + # GFX12: image_bvh_intersect_ray v[4:7], [v9, v10, v[11:13], v[14:16], v[17:19]], s[4:7] ; encoding: [0x10,0x40,0xc6,0xd3,0x04,0x08,0x00,0x11,0x09,0x0a,0x0b,0x0e] 0x10,0x40,0xc6,0xd3,0x04,0x08,0x00,0x11,0x09,0x0a,0x0b,0x0e _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits