llvmbot wrote:
@llvm/pr-subscribers-llvm-ir

Author: Aaditya (easyonaadit)

Changes

Currently there are no plans to push these into a public header. Initial use is for testing purposes.

---

Patch is 523.30 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/127013.diff

27 Files Affected:

- (modified) clang/include/clang/Basic/BuiltinsAMDGPU.def (+18)
- (modified) clang/lib/CodeGen/CGBuiltin.cpp (+53)
- (modified) clang/test/CodeGenOpenCL/builtins-amdgcn.cl (+77)
- (modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (+8-2)
- (modified) llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp (+10-1)
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+114-10)
- (modified) llvm/lib/Target/AMDGPU/SIInstructions.td (+23-8)
- (added) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll (+1237)
- (added) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.and.ll (+986)
- (added) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.max.ll (+986)
- (added) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.min.ll (+986)
- (added) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.or.ll (+986)
- (added) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll (+1286)
- (added) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.uadd.ll (+1240)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll (+12-12)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll (+12-12)
- (added) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.usub.ll (+1286)
- (added) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.xor.ll (+1290)
- (added) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.add.mir (+90)
- (added) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.and.mir (+89)
- (added) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.max.mir (+89)
- (added) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.min.mir (+89)
- (added) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.or.mir (+89)
- (added) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.sub.mir (+92)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.umax.mir (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.umin.mir (+2-2)
- (added) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.reduce.xor.mir (+92)

``````````diff
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 39e295aced96b..2018f41f1007f 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -346,6 +346,24 @@ BUILTIN(__builtin_amdgcn_endpgm, "v", "nr")
 BUILTIN(__builtin_amdgcn_get_fpenv, "WUi", "n")
 BUILTIN(__builtin_amdgcn_set_fpenv, "vWUi", "n")
 
+//===----------------------------------------------------------------------===//
+
+// Wave Reduction builtins.
+
+//===----------------------------------------------------------------------===//
+
+BUILTIN(__builtin_amdgcn_wave_reduce_add_i32, "ii", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_uadd_i32, "ii", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_sub_i32, "ii", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_usub_i32, "ii", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_min_i32, "ii", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_umin_i32, "ii", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_max_i32, "ii", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_umax_i32, "ii", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_and_i32, "ii", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_or_i32, "ii", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_xor_i32, "ii", "nc")
+
 //===----------------------------------------------------------------------===//
 // R600-NI only builtins.
 //===----------------------------------------------------------------------===//
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 361e4c4bf2e2e..1cd4fe6a974a0 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -20212,6 +20212,59 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
     llvm::Value *Env = EmitScalarExpr(E->getArg(0));
     return Builder.CreateCall(F, {Env});
   }
+  case AMDGPU::BI__builtin_amdgcn_wave_reduce_add_i32:
+  case AMDGPU::BI__builtin_amdgcn_wave_reduce_uadd_i32:
+  case AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_i32:
+  case AMDGPU::BI__builtin_amdgcn_wave_reduce_usub_i32:
+  case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i32:
+  case AMDGPU::BI__builtin_amdgcn_wave_reduce_umin_i32:
+  case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i32:
+  case AMDGPU::BI__builtin_amdgcn_wave_reduce_umax_i32:
+  case AMDGPU::BI__builtin_amdgcn_wave_reduce_and_i32:
+  case AMDGPU::BI__builtin_amdgcn_wave_reduce_or_i32:
+  case AMDGPU::BI__builtin_amdgcn_wave_reduce_xor_i32: {
+    Intrinsic::ID IID;
+    switch (BuiltinID) {
+    case AMDGPU::BI__builtin_amdgcn_wave_reduce_add_i32:
+      IID = Intrinsic::amdgcn_wave_reduce_add;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_wave_reduce_uadd_i32:
+      IID = Intrinsic::amdgcn_wave_reduce_uadd;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_i32:
+      IID = Intrinsic::amdgcn_wave_reduce_sub;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_wave_reduce_usub_i32:
+      IID = Intrinsic::amdgcn_wave_reduce_usub;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i32:
+      IID = Intrinsic::amdgcn_wave_reduce_min;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_wave_reduce_umin_i32:
+      IID = Intrinsic::amdgcn_wave_reduce_umin;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i32:
+      IID = Intrinsic::amdgcn_wave_reduce_max;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_wave_reduce_umax_i32:
+      IID = Intrinsic::amdgcn_wave_reduce_umax;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_wave_reduce_and_i32:
+      IID = Intrinsic::amdgcn_wave_reduce_and;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_wave_reduce_or_i32:
+      IID = Intrinsic::amdgcn_wave_reduce_or;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_wave_reduce_xor_i32:
+      IID = Intrinsic::amdgcn_wave_reduce_xor;
+      break;
+    }
+    llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
+    llvm::Function *F = CGM.getIntrinsic(IID, {Src0->getType()});
+    llvm::Value *Strategy =
+        llvm::ConstantInt::get(llvm::Type::getInt32Ty(getLLVMContext()), 0);
+    return Builder.CreateCall(F, {Src0, Strategy});
+  }
   case AMDGPU::BI__builtin_amdgcn_read_exec:
     return EmitAMDGCNBallotForExec(*this, E, Int64Ty, Int64Ty, false);
   case AMDGPU::BI__builtin_amdgcn_read_exec_lo:
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
index ded5f6b5ac4fd..94a7b5ff9bbb2 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
@@ -398,6 +398,83 @@ void test_s_sendmsghalt_var(int in)
   __builtin_amdgcn_s_sendmsghalt(1, in);
 }
 
+// CHECK-LABEL: @test_wave_reduce_add_i32
+// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.add.i32(
+void test_wave_reduce_add_i32(global int* out, int in)
+{
+  *out = __builtin_amdgcn_wave_reduce_add_i32(in);
+}
+
+// CHECK-LABEL: @test_wave_reduce_uadd_i32
+// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.uadd.i32(
+void test_wave_reduce_uadd_i32(global int* out, int in)
+{
+  *out = __builtin_amdgcn_wave_reduce_uadd_i32(in);
+}
+
+// CHECK-LABEL: @test_wave_reduce_sub_i32
+// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.sub.i32(
+void test_wave_reduce_sub_i32(global int* out, int in)
+{
+  *out = __builtin_amdgcn_wave_reduce_sub_i32(in);
+}
+
+// CHECK-LABEL: @test_wave_reduce_usub_i32
+// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.usub.i32(
+void test_wave_reduce_usub_i32(global int* out, int in)
+{
+  *out = __builtin_amdgcn_wave_reduce_usub_i32(in);
+}
+
+// CHECK-LABEL: @test_wave_reduce_min_i32
+// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.min.i32(
+void test_wave_reduce_min_i32(global int* out, int in)
+{
+  *out = __builtin_amdgcn_wave_reduce_min_i32(in);
+}
+
+// CHECK-LABEL: @test_wave_reduce_umin_i32
+// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.umin.i32(
+void test_wave_reduce_umin_i32(global int* out, int in)
+{
+  *out = __builtin_amdgcn_wave_reduce_umin_i32(in);
+}
+
+// CHECK-LABEL: @test_wave_reduce_max_i32
+// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.max.i32(
+void test_wave_reduce_max_i32(global int* out, int in)
+{
+  *out = __builtin_amdgcn_wave_reduce_max_i32(in);
+}
+
+// CHECK-LABEL: @test_wave_reduce_umax_i32
+// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.umax.i32(
+void test_wave_reduce_umax_i32(global int* out, int in)
+{
+  *out = __builtin_amdgcn_wave_reduce_umax_i32(in);
+}
+
+// CHECK-LABEL: @test_wave_reduce_and_i32
+// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.and.i32(
+void test_wave_reduce_and_i32(global int* out, int in)
+{
+  *out = __builtin_amdgcn_wave_reduce_and_i32(in);
+}
+
+// CHECK-LABEL: @test_wave_reduce_or_i32
+// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.or.i32(
+void test_wave_reduce_or_i32(global int* out, int in)
+{
+  *out = __builtin_amdgcn_wave_reduce_or_i32(in);
+}
+
+// CHECK-LABEL: @test_wave_reduce_xor_i32
+// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.xor.i32(
+void test_wave_reduce_xor_i32(global int* out, int in)
+{
+  *out = __builtin_amdgcn_wave_reduce_xor_i32(in);
+}
+
 // CHECK-LABEL: @test_s_barrier
 // CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.s.barrier(
 void test_s_barrier()
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index eb7bde6999491..fac4228d3bc1f 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2327,8 +2327,14 @@ class AMDGPUWaveReduce<LLVMType data_ty = llvm_anyint_ty> : Intrinsic<
     ],
     [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree, ImmArg<ArgIndex<1>>]>;
 
-def int_amdgcn_wave_reduce_umin : AMDGPUWaveReduce;
-def int_amdgcn_wave_reduce_umax : AMDGPUWaveReduce;
+multiclass AMDGPUWaveReduceOps<list<string> Operations> {
+  foreach Op = Operations in { def Op : AMDGPUWaveReduce; }
+}
+
+defvar Operations = [
+  "umin", "min", "umax", "max", "uadd", "add", "usub", "sub", "and", "or", "xor"
+];
+defm int_amdgcn_wave_reduce_ : AMDGPUWaveReduceOps<Operations>;
 
 def int_amdgcn_readfirstlane :
   Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 2e5f42c3bdc40..7d8fb718a88ed 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4981,8 +4981,17 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
     OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, MaskSize);
     break;
   }
+  case Intrinsic::amdgcn_wave_reduce_add:
+  case Intrinsic::amdgcn_wave_reduce_uadd:
+  case Intrinsic::amdgcn_wave_reduce_sub:
+  case Intrinsic::amdgcn_wave_reduce_usub:
+  case Intrinsic::amdgcn_wave_reduce_min:
   case Intrinsic::amdgcn_wave_reduce_umin:
-  case Intrinsic::amdgcn_wave_reduce_umax: {
+  case Intrinsic::amdgcn_wave_reduce_max:
+  case Intrinsic::amdgcn_wave_reduce_umax:
+  case Intrinsic::amdgcn_wave_reduce_and:
+  case Intrinsic::amdgcn_wave_reduce_or:
+  case Intrinsic::amdgcn_wave_reduce_xor: {
     unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
     unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index b632c50dae0e3..6711809bf0d31 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4940,6 +4940,28 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
   return LoopBB;
 }
 
+static uint32_t getIdentityValueForWaveReduction(unsigned Opc) {
+  switch (Opc) {
+  case AMDGPU::S_MIN_U32:
+    return std::numeric_limits<uint32_t>::max();
+  case AMDGPU::S_MIN_I32:
+    return std::numeric_limits<int32_t>::max();
+  case AMDGPU::S_MAX_U32:
+    return std::numeric_limits<uint32_t>::min();
+  case AMDGPU::S_MAX_I32:
+    return std::numeric_limits<int32_t>::min();
+  case AMDGPU::S_ADD_I32:
+  case AMDGPU::S_SUB_I32:
+  case AMDGPU::S_OR_B32:
+  case AMDGPU::S_XOR_B32:
+    return std::numeric_limits<uint32_t>::min();
+  case AMDGPU::S_AND_B32:
+    return std::numeric_limits<uint32_t>::max();
+  default:
+    llvm_unreachable("Unexpected opcode in getIdentityValueForWaveReduction");
+  }
+}
+
 static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
                                           MachineBasicBlock &BB,
                                           const GCNSubtarget &ST,
@@ -4955,13 +4977,78 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
   Register DstReg = MI.getOperand(0).getReg();
   MachineBasicBlock *RetBB = nullptr;
   if (isSGPR) {
-    // These operations with a uniform value i.e. SGPR are idempotent.
-    // Reduced value will be same as given sgpr.
-    // clang-format off
-    BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg)
-        .addReg(SrcReg);
-    // clang-format on
-    RetBB = &BB;
+    switch (Opc) {
+    case AMDGPU::S_MIN_U32:
+    case AMDGPU::S_MIN_I32:
+    case AMDGPU::S_MAX_U32:
+    case AMDGPU::S_MAX_I32:
+    case AMDGPU::S_AND_B32:
+    case AMDGPU::S_OR_B32: {
+      // Idempotent operations.
+      BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
+      RetBB = &BB;
+      break;
+    }
+    case AMDGPU::S_XOR_B32:
+    case AMDGPU::S_ADD_I32:
+    case AMDGPU::S_SUB_I32: {
+      const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
+      const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
+      Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
+      Register ActiveLanes = MRI.createVirtualRegister(DstRegClass);
+
+      bool IsWave32 = ST.isWave32();
+      unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+      unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+      unsigned CountReg =
+          IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
+
+      auto Exec =
+          BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
+
+      auto NewAccumulator = BuildMI(BB, MI, DL, TII->get(CountReg), ActiveLanes)
+                                .addReg(Exec->getOperand(0).getReg());
+
+      switch (Opc) {
+      case AMDGPU::S_XOR_B32: {
+        // Performing an XOR operation on a uniform value
+        // depends on the parity of the number of active lanes.
+        // For even parity, the result will be 0, for odd
+        // parity the result will be the same as the input value.
+        Register ParityRegister = MRI.createVirtualRegister(DstRegClass);
+
+        auto ParityReg =
+            BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
+                .addReg(NewAccumulator->getOperand(0).getReg())
+                .addImm(1);
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
+            .addReg(SrcReg)
+            .addReg(ParityReg->getOperand(0).getReg());
+        break;
+      }
+      case AMDGPU::S_SUB_I32: {
+        Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
+
+        // Take the negation of the source operand.
+        auto InvertedValReg =
+            BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), NegatedVal)
+                .addImm(-1)
+                .addReg(SrcReg);
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
+            .addReg(InvertedValReg->getOperand(0).getReg())
+            .addReg(NewAccumulator->getOperand(0).getReg());
+        break;
+      }
+      case AMDGPU::S_ADD_I32: {
+        BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
+            .addReg(SrcReg)
+            .addReg(NewAccumulator->getOperand(0).getReg());
+        break;
+      }
+      }
+      RetBB = &BB;
+    }
+    }
   } else {
     // TODO: Implement DPP Strategy and switch based on immediate strategy
     // operand. For now, for all the cases (default, Iterative and DPP we use
@@ -4997,9 +5084,8 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
   unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
 
   // Create initail values of induction variable from Exec, Accumulator and
-  // insert branch instr to newly created ComputeBlockk
-  uint32_t InitalValue =
-      (Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;
+  // insert branch instr to newly created ComputeBlock
+  uint32_t InitalValue = getIdentityValueForWaveReduction(Opc);
   auto TmpSReg =
       BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
   BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
@@ -5071,8 +5157,26 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
   switch (MI.getOpcode()) {
   case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
+  case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32);
   case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
+  case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32);
+  case AMDGPU::WAVE_REDUCE_UADD_PSEUDO_U32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
+  case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
+  case AMDGPU::WAVE_REDUCE_USUB_PSEUDO_U32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
+  case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
+  case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
+  case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
+  case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
+    return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B32);
   case AMDGPU::S_UADDO_PSEUDO:
   case AMDGPU::S_USUBO_PSEUDO: {
     const DebugLoc &DL = MI.getDebugLoc();
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 6e08aff24ec23..8a3e93a4faa95 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -277,16 +277,31 @@ def : GCNPat <(vt (int_amdgcn_set_inactive vt:$src, vt:$inactive)),
 def : GCNPat<(i32 (int_amdgcn_set_inactive_chain_arg i32:$src, i32:$inactive)),
     (V_SET_INACTIVE_B32 0, VGPR_32:$src, 0, VGPR_32:$inactive, (IMPLICIT_DEF))>;
 
-let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
-  def WAVE_REDUCE_UMIN_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
-    (ins VSrc_b32: $src, VSrc_b32:$strategy),
-    [(set i32:$sdst, (int_amdgcn_wave_reduce_umin i32:$src, i32:$strategy))]> {
+// clang-format off
+defvar int_amdgcn_wave_reduce_ = "int_amdgcn_wave_reduce_";
+multiclass
+    AMDGPUWaveReducePseudoGenerator<string Op, string DataType, string Size> {
+  let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
+    def !toupper(Op) #"_PSEUDO_" #DataType #Size
+        : VPseudoInstSI<(outs SGPR_32 : $sdst),
+                        (ins VSrc_b32 : $src, VSrc_b32 : $strategy),
+                        [(set i32 : $sdst, (!cast<AMDGPUWaveReduce>(int_amdgcn_wave_reduce_ #Op) i32 : $src, i32 : $strategy))]> {}
   }
+}
+// clang-format on
 
-  def WAVE_REDUCE_UMAX_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
-    (ins VSrc_b32: $src, VSrc_b32:$strategy),
-    [(set i32:$sdst, (int_amdgcn_wave_reduce_umax i32:$src, i32:$strategy))]> {
-  }
+// Input list : [Operation_name,
+//               type - Signed(I)/Unsigned(U)/Float(F)/Bitwise(B),
+//               Size_in_bits]
+defvar Operations = [
+  ["umin", "U", "32"], ["min", "I", "32"], ["umax", "U", "32"],
+  ["max", "I", "32"], ["uadd", "U", "32"], ["add", "I", "32"],
+  ["usub", "U", "32"], ["sub", "I", "32"], ["and", "B", "32"],
+  ["or", "B", "32"], ["xor", "B", "32"]
+];
+
+foreach Op = Operations in {
+  defm WAVE_REDUCE_ : AMDGPUWaveReducePseudoGenerator<Op[0], Op[1], Op[2]>;
 }
 
 let usesCustomInserter = 1, Defs = [VCC] in {
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll
new file mode 100644
index 0000000000000..e93e8d4108ba0
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll
@@ -0,0 +1,1237 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 < %s | FileCheck -check-prefixes=GFX8DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 < %s | FileCheck -check-prefixes=GFX8GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 < %s | FileCheck -check-prefixes=GFX9DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 < %s | FileCheck -check-prefixes=GFX9GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1064DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1064GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1032DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1032GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1164DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1164GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1132DAGISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1132GISEL %s
+
+define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
+; GFX8DAGISEL-LABEL: uniform_value:
+; GFX8DAGISEL: ; %bb.0... [truncated]
``````````

https://github.com/llvm/llvm-project/pull/127013
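For readers who want to try the new builtins before they are exposed through a public header, a minimal OpenCL kernel in the spirit of the added builtins-amdgcn.cl tests could look like the sketch below. The kernel name, argument names, and the choice of the add reduction are illustrative, not part of the patch.

```c
// Illustrative sketch only (not from the patch): every active work-item in the
// wave contributes its value, and each lane receives the wave-wide sum back.
kernel void wave_sum(global int *out, global const int *in) {
  int gid = get_global_id(0);
  // New builtin added by this patch: int in, int out, reduced across the wave.
  int wave_total = __builtin_amdgcn_wave_reduce_add_i32(in[gid]);
  out[gid] = wave_total;
}
```

The signed/unsigned min, max, sub, and the bitwise and/or/xor variants follow the same int-in/int-out shape, matching the "ii" prototypes declared in BuiltinsAMDGPU.def above.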