llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-amdgpu
@llvm/pr-subscribers-llvm-globalisel

Author: Christudasan Devadasan (cdevadas)

<details>
<summary>Changes</summary>

Use the constrained (early-clobber) multi-dword load opcodes when merging individual loads into a single multi-dword load, on targets that support XNACK replay. A short sketch of the resulting transform follows the diff below.

---

Patch is 1023.60 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/96162.diff

116 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/GCNSubtarget.h (+1)
- (modified) llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp (+63-16)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll (+84-85)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll (+51-51)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll (+50-50)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll (+42-42)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll (+102-102)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll (+65-65)
- (modified) llvm/test/CodeGen/AMDGPU/add.v2i16.ll (+21-21)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll (+204-204)
- (modified) llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll (+24-24)
- (modified) llvm/test/CodeGen/AMDGPU/bfe-patterns.ll (+20-20)
- (modified) llvm/test/CodeGen/AMDGPU/bfm.ll (+5-5)
- (modified) llvm/test/CodeGen/AMDGPU/bitreverse.ll (+157-157)
- (modified) llvm/test/CodeGen/AMDGPU/build_vector.ll (+25-25)
- (modified) llvm/test/CodeGen/AMDGPU/calling-conventions.ll (+25-25)
- (modified) llvm/test/CodeGen/AMDGPU/cluster_stores.ll (+60-60)
- (modified) llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll (+10-10)
- (modified) llvm/test/CodeGen/AMDGPU/ctlz.ll (+227-227)
- (modified) llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll (+223-223)
- (modified) llvm/test/CodeGen/AMDGPU/ctpop16.ll (+52-48)
- (modified) llvm/test/CodeGen/AMDGPU/ctpop64.ll (+24-24)
- (modified) llvm/test/CodeGen/AMDGPU/cttz.ll (+185-185)
- (modified) llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll (+212-212)
- (modified) llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll (+331-334)
- (modified) llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll (+24-24)
- (modified) llvm/test/CodeGen/AMDGPU/ds_read2.ll (+8-8)
- (modified) llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll (+25-25)
- (modified) llvm/test/CodeGen/AMDGPU/fabs.f16.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/fabs.ll (+17-17)
- (modified) llvm/test/CodeGen/AMDGPU/fcanonicalize.ll (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll (+31-32)
- (modified) llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll (+33-33)
- (modified) llvm/test/CodeGen/AMDGPU/fdiv.ll (+143-143)
- (modified) llvm/test/CodeGen/AMDGPU/flat_atomics.ll (+20-20)
- (modified) llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll (+56-56)
- (modified) llvm/test/CodeGen/AMDGPU/fma-combine.ll (+88-88)
- (modified) llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll (+9-9)
- (modified) llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll (+240-240)
- (modified) llvm/test/CodeGen/AMDGPU/fnearbyint.ll (+9-9)
- (modified) llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll (+23-23)
- (modified) llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/fneg-fabs.ll (+17-17)
- (modified) llvm/test/CodeGen/AMDGPU/fneg.ll (+13-13)
- (modified) llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll (+26-26)
- (modified) llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll (+22-22)
- (modified) llvm/test/CodeGen/AMDGPU/fp-classify.ll (+13-13)
- (modified) llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll (+33-33)
- (modified) llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll (+46-46)
- (modified) llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll (+20-20)
- (modified) llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll (+20-20)
- (modified) llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll (+20-20)
- (modified) llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll (+24-24)
- (modified) llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-ptr-atomics.ll (+38-38)
- (modified) llvm/test/CodeGen/AMDGPU/fp_to_sint.ll (+19-19)
- (modified) llvm/test/CodeGen/AMDGPU/fp_to_uint.ll (+19-19)
- (modified) llvm/test/CodeGen/AMDGPU/fshl.ll (+44-44)
- (modified) llvm/test/CodeGen/AMDGPU/fshr.ll (+20-20)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics.ll (+24-24)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll (+98-98)
- (modified) llvm/test/CodeGen/AMDGPU/half.ll (+41-41)
- (modified) llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll (+88-88)
- (modified) llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll (+124-124)
- (modified) llvm/test/CodeGen/AMDGPU/kernel-args.ll (+53-53)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll (+15-15)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll (+32-32)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll (+50-50)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.csub.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll (+16-16)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll (+25-25)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll (+8-8)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll (+36-36)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll (+184-208)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll (+84-84)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll (+18-18)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll (+66-66)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx11.ll (+14-14)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx12.ll (+14-14)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll (+122-122)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll (+40-40)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll (+41-41)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.exp.ll (+58-58)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.exp10.ll (+58-58)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.exp2.ll (+13-13)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.log.ll (+18-18)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.log10.ll (+18-18)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.log2.ll (+32-32)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.round.ll (+30-30)
- (modified) llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll (+21-21)
- (modified) llvm/test/CodeGen/AMDGPU/madak.ll (+92-92)
- (modified) llvm/test/CodeGen/AMDGPU/memory_clause.ll (+18-18)
- (modified) llvm/test/CodeGen/AMDGPU/merge-s-load.mir (+144-36)
- (modified) llvm/test/CodeGen/AMDGPU/min.ll (+27-27)
- (modified) llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll (+80-80)
- (modified) llvm/test/CodeGen/AMDGPU/mul.ll (+18-11)
- (modified) llvm/test/CodeGen/AMDGPU/mul_int24.ll (+38-38)
- (modified) llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll (+55-55)
- (modified) llvm/test/CodeGen/AMDGPU/or.ll (+8-8)
- (modified) llvm/test/CodeGen/AMDGPU/packed-op-sel.ll (+36-36)
- (modified) llvm/test/CodeGen/AMDGPU/preload-kernargs.ll (+24-24)
- (modified) llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll (+24-24)
- (modified) llvm/test/CodeGen/AMDGPU/rotl.ll (+13-13)
- (modified) llvm/test/CodeGen/AMDGPU/rotr.ll (+11-11)
- (modified) llvm/test/CodeGen/AMDGPU/shl.v2i16.ll (+36-36)
- (modified) llvm/test/CodeGen/AMDGPU/sign_extend.ll (+27-27)
- (modified) llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/sub.ll (+17-17)
- (modified) llvm/test/CodeGen/AMDGPU/sub.v2i16.ll (+36-36)
- (modified) llvm/test/CodeGen/AMDGPU/udiv.ll (+17-17)
- (modified) llvm/test/CodeGen/AMDGPU/v_cndmask.ll (+15-15)
- (modified) llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll (+56-56)
- (modified) llvm/test/CodeGen/AMDGPU/wave32.ll (+27-27)
- (modified) llvm/test/CodeGen/AMDGPU/xor.ll (+5-5)


``````````diff
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index db5b467f22389..19d5b950d7142 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -967,6 +967,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
 
   bool hasLDSFPAtomicAddF32() const { return GFX8Insts; }
   bool hasLDSFPAtomicAddF64() const { return GFX90AInsts; }
+  bool hasXnackReplay() const { return GFX8Insts; }
 
   /// \returns true if the subtarget has the v_permlanex16_b32 instruction.
   bool hasPermLaneX16() const { return getGeneration() >= GFX10; }
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index 8b42d4a1dee7a..0b285d52b539e 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -216,7 +216,8 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
                       CombineInfo &Paired, bool Modify = false);
   static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
                         const CombineInfo &Paired);
-  static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
+  static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired,
+                               const GCNSubtarget *STI = nullptr);
   static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
                                                      const CombineInfo &Paired);
   const TargetRegisterClass *
@@ -343,6 +344,7 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
   case AMDGPU::S_LOAD_DWORD_IMM:
+  case AMDGPU::S_LOAD_DWORD_IMM_ec:
   case AMDGPU::GLOBAL_LOAD_DWORD:
   case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
   case AMDGPU::GLOBAL_STORE_DWORD:
@@ -353,6 +355,7 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
   case AMDGPU::S_LOAD_DWORDX2_IMM:
+  case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
   case AMDGPU::GLOBAL_LOAD_DWORDX2:
   case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
   case AMDGPU::GLOBAL_STORE_DWORDX2:
@@ -363,6 +366,7 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
   case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
   case AMDGPU::S_LOAD_DWORDX3_IMM:
+  case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
   case AMDGPU::GLOBAL_LOAD_DWORDX3:
   case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
   case AMDGPU::GLOBAL_STORE_DWORDX3:
@@ -373,6 +377,7 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
   case AMDGPU::S_LOAD_DWORDX4_IMM:
+  case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
   case AMDGPU::GLOBAL_LOAD_DWORDX4:
   case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
   case AMDGPU::GLOBAL_STORE_DWORDX4:
@@ -383,6 +388,7 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
   case AMDGPU::S_LOAD_DWORDX8_IMM:
+  case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
     return 8;
   case AMDGPU::DS_READ_B32:
   case AMDGPU::DS_READ_B32_gfx9:
@@ -507,6 +513,11 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
   case AMDGPU::S_LOAD_DWORDX3_IMM:
   case AMDGPU::S_LOAD_DWORDX4_IMM:
   case AMDGPU::S_LOAD_DWORDX8_IMM:
+  case AMDGPU::S_LOAD_DWORD_IMM_ec:
+  case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
+  case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
+  case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
+  case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
     return S_LOAD_IMM;
   case AMDGPU::DS_READ_B32:
   case AMDGPU::DS_READ_B32_gfx9:
@@ -591,6 +602,11 @@ static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
   case AMDGPU::S_LOAD_DWORDX3_IMM:
   case AMDGPU::S_LOAD_DWORDX4_IMM:
   case AMDGPU::S_LOAD_DWORDX8_IMM:
+  case AMDGPU::S_LOAD_DWORD_IMM_ec:
+  case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
+  case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
+  case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
+  case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
     return AMDGPU::S_LOAD_DWORD_IMM;
   case AMDGPU::GLOBAL_LOAD_DWORD:
   case AMDGPU::GLOBAL_LOAD_DWORDX2:
@@ -703,6 +719,11 @@ static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
   case AMDGPU::S_LOAD_DWORDX3_IMM:
   case AMDGPU::S_LOAD_DWORDX4_IMM:
   case AMDGPU::S_LOAD_DWORDX8_IMM:
+  case AMDGPU::S_LOAD_DWORD_IMM_ec:
+  case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
+  case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
+  case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
+  case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
    Result.SBase = true;
    return Result;
   case AMDGPU::DS_READ_B32:
@@ -1212,8 +1233,17 @@ void SILoadStoreOptimizer::copyToDestRegs(
 
   // Copy to the old destination registers.
   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
-  const auto *Dest0 = TII->getNamedOperand(*CI.I, OpName);
-  const auto *Dest1 = TII->getNamedOperand(*Paired.I, OpName);
+  auto *Dest0 = TII->getNamedOperand(*CI.I, OpName);
+  auto *Dest1 = TII->getNamedOperand(*Paired.I, OpName);
+
+  // The constrained sload instructions in S_LOAD_IMM class will have
+  // `early-clobber` flag in the dst operand. Remove the flag before using the
+  // MOs in copies.
+  if (Dest0->isEarlyClobber())
+    Dest0->setIsEarlyClobber(false);
+
+  if (Dest1->isEarlyClobber())
+    Dest1->setIsEarlyClobber(false);
 
   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
       .add(*Dest0) // Copy to same destination including flags and sub reg.
@@ -1446,7 +1476,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair(
     MachineBasicBlock::iterator InsertBefore) {
   MachineBasicBlock *MBB = CI.I->getParent();
   DebugLoc DL = CI.I->getDebugLoc();
-  const unsigned Opcode = getNewOpcode(CI, Paired);
+  const unsigned Opcode = getNewOpcode(CI, Paired, STM);
 
   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
 
@@ -1658,7 +1688,8 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
 }
 
 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
-                                            const CombineInfo &Paired) {
+                                            const CombineInfo &Paired,
+                                            const GCNSubtarget *STI) {
   const unsigned Width = CI.Width + Paired.Width;
 
   switch (getCommonInstClass(CI, Paired)) {
@@ -1701,17 +1732,33 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
       return AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
     }
   case S_LOAD_IMM:
-    switch (Width) {
-    default:
-      return 0;
-    case 2:
-      return AMDGPU::S_LOAD_DWORDX2_IMM;
-    case 3:
-      return AMDGPU::S_LOAD_DWORDX3_IMM;
-    case 4:
-      return AMDGPU::S_LOAD_DWORDX4_IMM;
-    case 8:
-      return AMDGPU::S_LOAD_DWORDX8_IMM;
+    // For targets that support XNACK replay, use the constrained load opcode.
+    if (STI && STI->hasXnackReplay()) {
+      switch (Width) {
+      default:
+        return 0;
+      case 2:
+        return AMDGPU::S_LOAD_DWORDX2_IMM_ec;
+      case 3:
+        return AMDGPU::S_LOAD_DWORDX3_IMM_ec;
+      case 4:
+        return AMDGPU::S_LOAD_DWORDX4_IMM_ec;
+      case 8:
+        return AMDGPU::S_LOAD_DWORDX8_IMM_ec;
+      }
+    } else {
+      switch (Width) {
+      default:
+        return 0;
+      case 2:
+        return AMDGPU::S_LOAD_DWORDX2_IMM;
+      case 3:
+        return AMDGPU::S_LOAD_DWORDX3_IMM;
+      case 4:
+        return AMDGPU::S_LOAD_DWORDX4_IMM;
+      case 8:
+        return AMDGPU::S_LOAD_DWORDX8_IMM;
+      }
     }
   case GLOBAL_LOAD:
     switch (Width) {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
index eb20178f9f4d8..3f034eaca4997 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
@@ -468,18 +468,18 @@ define amdgpu_kernel void @load_i8_to_f32(ptr addrspace(1) noalias %out, ptr add
 ;
 ; VI-LABEL: load_i8_to_f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s2
-; VI-NEXT:    v_mov_b32_e32 v2, s3
+; VI-NEXT:    v_mov_b32_e32 v1, s6
+; VI-NEXT:    v_mov_b32_e32 v2, s7
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, v2, v3, vcc
 ; VI-NEXT:    flat_load_ubyte v0, v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v0
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -513,16 +513,16 @@ define amdgpu_kernel void @load_v2i8_to_v2f32(ptr addrspace(1) noalias %out, ptr
 ;
 ; VI-LABEL: load_v2i8_to_v2f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_mov_b32_e32 v0, s6
+; VI-NEXT:    v_mov_b32_e32 v1, s7
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_ushort v1, v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    v_mov_b32_e32 v2, s4
+; VI-NEXT:    v_mov_b32_e32 v3, s5
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
 ; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
@@ -562,16 +562,16 @@ define amdgpu_kernel void @load_v3i8_to_v3f32(ptr addrspace(1) noalias %out, ptr
 ;
 ; VI-LABEL: load_v3i8_to_v3f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_mov_b32_e32 v0, s6
+; VI-NEXT:    v_mov_b32_e32 v1, s7
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v2, v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v4, s1
-; VI-NEXT:    v_mov_b32_e32 v3, s0
+; VI-NEXT:    v_mov_b32_e32 v3, s4
+; VI-NEXT:    v_mov_b32_e32 v4, s5
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
 ; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
@@ -612,16 +612,16 @@ define amdgpu_kernel void @load_v4i8_to_v4f32(ptr addrspace(1) noalias %out, ptr
 ;
 ; VI-LABEL: load_v4i8_to_v4f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_mov_b32_e32 v0, s6
+; VI-NEXT:    v_mov_b32_e32 v1, s7
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v3, v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v5, s1
-; VI-NEXT:    v_mov_b32_e32 v4, s0
+; VI-NEXT:    v_mov_b32_e32 v4, s4
+; VI-NEXT:    v_mov_b32_e32 v5, s5
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
 ; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
@@ -679,11 +679,11 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias
 ;
 ; VI-LABEL: load_v4i8_to_v4f32_unaligned:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_mov_b32_e32 v0, s6
+; VI-NEXT:    v_mov_b32_e32 v1, s7
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 1, v0
@@ -706,12 +706,12 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias
 ; VI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; VI-NEXT:    v_or_b32_e32 v1, v2, v3
 ; VI-NEXT:    v_or_b32_e32 v3, v1, v0
-; VI-NEXT:    v_mov_b32_e32 v5, s1
+; VI-NEXT:    v_mov_b32_e32 v4, s4
 ; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
 ; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
 ; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
 ; VI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v3
-; VI-NEXT:    v_mov_b32_e32 v4, s0
+; VI-NEXT:    v_mov_b32_e32 v5, s5
 ; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; VI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -770,6 +770,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
 ; VI-LABEL: load_v4i8_to_v4f32_2_uses:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; VI-NEXT:    v_mov_b32_e32 v6, 9
 ; VI-NEXT:    v_mov_b32_e32 v7, 8
@@ -779,11 +780,9 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v1, v[0:1]
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    v_mov_b32_e32 v2, 0xff
-; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v5, s1
-; VI-NEXT:    v_mov_b32_e32 v4, s0
+; VI-NEXT:    v_mov_b32_e32 v4, s4
+; VI-NEXT:    v_mov_b32_e32 v5, s5
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
 ; VI-NEXT:    v_and_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -804,8 +803,8 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 24, v6
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
 ; VI-NEXT:    v_or_b32_e32 v2, v0, v2
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_mov_b32_e32 v0, s6
+; VI-NEXT:    v_mov_b32_e32 v1, s7
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
   %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
@@ -858,11 +857,11 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr
 ;
 ; VI-LABEL: load_v7i8_to_v7f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_mov_b32_e32 v0, s6
+; VI-NEXT:    v_mov_b32_e32 v1, s7
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 1, v0
@@ -884,10 +883,10 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr
 ; VI-NEXT:    flat_load_ubyte v4, v[8:9]
 ; VI-NEXT:    flat_load_ubyte v5, v[10:11]
 ; VI-NEXT:    flat_load_ubyte v6, v[12:13]
-; VI-NEXT:    v_mov_b32_e32 v8, s1
-; VI-NEXT:    v_mov_b32_e32 v7, s0
-; VI-NEXT:    s_add_u32 s0, s0, 16
-; VI-NEXT:    s_addc_u32 s1, s1, 0
+; VI-NEXT:    s_add_u32 s0, s4, 16
+; VI-NEXT:    v_mov_b32_e32 v8, s5
+; VI-NEXT:    s_addc_u32 s1, s5, 0
+; VI-NEXT:    v_mov_b32_e32 v7, s4
 ; VI-NEXT:    v_mov_b32_e32 v10, s1
 ; VI-NEXT:    v_mov_b32_e32 v9, s0
 ; VI-NEXT:    s_waitcnt vmcnt(6)
@@ -949,18 +948,18 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr
 ;
 ; VI-LABEL: load_v8i8_to_v8f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_mov_b32_e32 v0, s6
+; VI-NEXT:    v_mov_b32_e32 v1, s7
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[6:7], v[0:1]
-; VI-NEXT:    v_mov_b32_e32 v9, s1
-; VI-NEXT:    v_mov_b32_e32 v8, s0
-; VI-NEXT:    s_add_u32 s0, s0, 16
-; VI-NEXT:    s_addc_u32 s1, s1, 0
+; VI-NEXT:    s_add_u32 s0, s4, 16
+; VI-NEXT:    v_mov_b32_e32 v9, s5
+; VI-NEXT:    s_addc_u32 s1, s5, 0
+; VI-NEXT:    v_mov_b32_e32 v8, s4
 ; VI-NEXT:    v_mov_b32_e32 v11, s1
 ; VI-NEXT:    v_mov_b32_e32 v10, s0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -1005,19 +1004,19 @@ define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(ptr addrspace(1) noalias %ou
 ;
 ; VI-LABEL: i8_zext_inreg_i32_to_f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_mov_b32_e32 v0, s6
+; VI-NEXT:    v_mov_b32_e32 v1, s7
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 2, v0
 ; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1051,18 +1050,18 @@ define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(ptr addrspace(1) noalias %ou
 ;
 ; VI-LABEL: i8_zext_inreg_hi1_to_f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_mov_b32_e32 v0, s6
+; VI-NEXT:    v_mov_b32_e32 v1, s7
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1096,18 +1095,18 @@ define amdgpu_kernel void @i8_zext_i32_to_f32(ptr addrspace(1) noalias %out, ptr
 ;
 ; VI-LABEL: i8_zext_i32_to_f32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s2
-; VI-NEXT:    v_mov_b32_e32 v2, s3
+; VI-NEXT:    v_mov_b32_e32 v1, s6
+; VI-NEXT:    v_mov_b32_e32 v2, s7
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, v2, v3, vcc
 ; VI-NEXT:    flat_load_ubyte v0, v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v0
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1157,11 +1156,1... [truncated]
``````````

</details>

https://github.com/llvm/llvm-project/pull/96162
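For readers skimming the truncated diff, the heart of the change is in `getNewOpcode`: on subtargets where the new `hasXnackReplay()` hook returns true (GFX8 and later in this patch), the merged scalar load now uses the early-clobber `_ec` opcode variant, so the wide result cannot be allocated on top of the still-live base-address register that the hardware would need again if it replays the load after an XNACK. Below is a minimal before/after MIR sketch of the transform; the virtual register names and register classes are illustrative, not taken from the patch or its tests:

```mir
# Before SILoadStoreOptimizer: two adjacent dword loads from the same base.
%base:sgpr_64 = COPY $sgpr0_sgpr1
%lo:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %base, 0, 0
%hi:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %base, 4, 0

# After, on a subtarget with XNACK replay: one constrained (early-clobber)
# multi-dword load, followed by copies back to the original destinations.
early-clobber %merged:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec %base, 0, 0
%lo:sreg_32_xm0_xexec = COPY %merged.sub0
%hi:sreg_32_xm0_xexec = COPY %merged.sub1
```

As the `copyToDestRegs` hunk shows, the early-clobber flag is cleared from the destination operands before they are reused in the `COPY`s, since the constraint only applies to the merged load's own definition.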