llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-clang Author: None (alex-t) <details> <summary>Changes</summary>

We currently lower SI_IF/ELSE, SI_LOOP, and SI_END_CF so that the wave reconverges at the beginning of the control-flow join block, or on the loop exit block. This placement causes numerous problems around spill/split insertion points: LLVM's core register-allocation machinery (the spiller and SplitKit) considers the start of a block the best point to reload spilled registers, so vector reloads are inserted there and execute before the exec mask is restored, and are incorrectly masked out. A similar issue arises when SplitKit splits a live interval at the CF join block: the spills are inserted before the exec mask is restored. (See the short IR sketch at the end of this message for the before/after reconvergence placement.)

---

Patch is 5.20 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/92809.diff

327 Files Affected: - (modified) clang/test/CodeGenCUDA/atomics-remarks-gfx90a.cu (+1-1) - (modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (+2-2) - (modified) llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp (+5-4) - (modified) llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h (+1-1) - (modified) llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp (+8-11) - (modified) llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp (+51-41) - (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+30-4) - (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.cpp (+39-4) - (modified) llvm/lib/Target/AMDGPU/SIInstructions.td (+8-4) - (modified) llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp (+194-350) - (modified) llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp (+3-1) - (modified) llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp (+1-1) - (added) llvm/test/%t (+1) - (modified) llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir (+2-2) - (modified) llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/temporal-divergence.mir (-21) - (modified) llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/uses-value-from-cycle.mir (+1-3) - (modified) llvm/test/Analysis/UniformityAnalysis/AMDGPU/deprecated/hidden-diverge.mir (+2-2) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll (+1-1) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll (+14-12) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.mir (+3-11) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll (+105-74) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.mir (+37-61) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll (+98-62) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.mir (+46-70) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll (+17-12) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.mir (+6-18) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.ll (+4-3) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.mir (-2) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll (+24-14) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll (+88-56) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll (+2-2) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll (+2-2) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll (+35-41) - (modified)
llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-atomicrmw.ll (-2) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll (+3-3) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/is-safe-to-sink-bug.ll (+9-6) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll (+6-6) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i64.ll (+5-6) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll (+72-68) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.make.buffer.rsrc.ll (+2-5) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.add.ll (+8-20) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.cmpswap.ll (+16-40) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.atomic.fadd.ll (+8-20) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.f16.ll (+6-15) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.ll (+4-10) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.ll (+24-60) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll (+12-30) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll (+8-20) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll (+24-60) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.atomic.add.ll (+4-10) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.atomic.cmpswap.ll (+8-20) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.ll (+8-20) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.format.f16.ll (+4-10) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.format.ll (+2-5) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.ll (+12-30) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.store.format.f16.ll (+8-20) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.store.format.f32.ll (+4-10) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.store.ll (+12-30) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.load.f16.ll (+4-10) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.load.ll (+2-5) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.store.f16.ll (+12-30) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.store.i8.ll (+12-30) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.store.ll (+10-25) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.f16.ll (+6-15) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.ll (+4-10) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.f16.ll (+18-45) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.i8.ll (+18-45) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.store.ll (+20-50) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll (+96-240) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.add.ll (+8-20) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.cmpswap.ll 
(+16-40) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.atomic.fadd.ll (+8-20) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.f16.ll (+6-15) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.ll (+4-10) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.ll (+4-10) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f16.ll (+6-15) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.format.f32.ll (+4-10) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.ll (+4-10) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.atomic.add.ll (+4-10) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.atomic.cmpswap.ll (+8-20) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.ll (+8-20) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.format.f16.ll (+4-10) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.format.ll (+2-5) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.ll (+2-5) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.store.format.f16.ll (+4-10) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.store.format.f32.ll (+2-5) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.store.ll (+2-5) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.tbuffer.load.f16.ll (+4-10) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.tbuffer.load.ll (+2-5) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.f16.ll (+6-15) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.ll (+4-10) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll (+214-170) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll (+17-10) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll (+23-22) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll (+22-13) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll (+25-17) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn-s-buffer-load.mir (+4-8) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.load.1d.ll (+8-16) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.sample.1d.ll (+12-24) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.buffer.load.ll (+6-12) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.ptr.buffer.load.ll (+6-12) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll (+48-72) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.load.ll (+6-12) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.store.ll (+6-12) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.ptr.buffer.load.ll (+6-12) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.ptr.buffer.store.ll (+6-12) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-waterfall-agpr.mir (+4-8) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll (+262-257) - (modified) 
llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll (+225-218) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll (+80-73) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll (+76-69) - (modified) llvm/test/CodeGen/AMDGPU/amdpal-callable.ll (+1) - (modified) llvm/test/CodeGen/AMDGPU/atomic-optimizer-strict-wqm.ll (+23-13) - (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll (+318-228) - (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll (+700-548) - (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll (+771-546) - (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll (+210-155) - (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll (+276-192) - (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll (+276-192) - (modified) llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll (+117-94) - (modified) llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll (+12-9) - (modified) llvm/test/CodeGen/AMDGPU/atomics-cas-remarks-gfx90a.ll (+1) - (modified) llvm/test/CodeGen/AMDGPU/bb-prolog-spill-during-regalloc.ll (+24-20) - (modified) llvm/test/CodeGen/AMDGPU/block-should-not-be-in-alive-blocks.mir (+13-14) - (modified) llvm/test/CodeGen/AMDGPU/branch-condition-and.ll (+1) - (modified) llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll (+414-371) - (modified) llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx10-branch-offset-bug.ll (+1) - (modified) llvm/test/CodeGen/AMDGPU/branch-relaxation.ll (+50-35) - (modified) llvm/test/CodeGen/AMDGPU/bug-sdag-emitcopyfromreg.ll (+6-65) - (modified) llvm/test/CodeGen/AMDGPU/bypass-div.ll (+60-36) - (modified) llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll (+1) - (modified) llvm/test/CodeGen/AMDGPU/call-skip.ll (+1) - (modified) llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll (+140-92) - (modified) llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll (+1-1) - (modified) llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll (+5-3) - (modified) llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll (+1) - (modified) llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll (+32-26) - (modified) llvm/test/CodeGen/AMDGPU/collapse-endcf.ll (+483-450) - (modified) llvm/test/CodeGen/AMDGPU/collapse-endcf.mir (+303-198) - (modified) llvm/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir (+1-1) - (modified) llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll (+1) - (modified) llvm/test/CodeGen/AMDGPU/control-flow-optnone.ll (+1) - (modified) llvm/test/CodeGen/AMDGPU/convergence-tokens.ll (+1) - (modified) llvm/test/CodeGen/AMDGPU/convergent-inlineasm.ll (+1) - (modified) llvm/test/CodeGen/AMDGPU/cse-convergent.ll (+14-13) - (modified) llvm/test/CodeGen/AMDGPU/cse-phi-incoming-val.ll (+1) - (modified) llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll (+12-9) - (modified) llvm/test/CodeGen/AMDGPU/dagcombine-lshr-and-cmp.ll (+1-1) - (modified) llvm/test/CodeGen/AMDGPU/dagcombine-v1i8-extractvecelt-crash.ll (+5-3) - (modified) llvm/test/CodeGen/AMDGPU/div_i128.ll (+376-392) - (modified) llvm/test/CodeGen/AMDGPU/div_v2i128.ll (+503-415) - (modified) llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll (+17-8) - (modified) llvm/test/CodeGen/AMDGPU/dpp_combine.mir (+1-1) - (modified) llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir (+1-1) - (modified) llvm/test/CodeGen/AMDGPU/early-tailduplicator-terminator.mir (+7-9) - (modified) llvm/test/CodeGen/AMDGPU/else.ll (+1) - (modified) llvm/test/CodeGen/AMDGPU/endcf-loop-header.ll (+1) - 
(modified) llvm/test/CodeGen/AMDGPU/fix-frame-ptr-reg-copy-livein.ll (+1) - (modified) llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.v2f16.ll (-4) - (modified) llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll (+732-534) - (modified) llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll (+732-534) - (modified) llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll (+22-15) - (modified) llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll (+12-8) - (modified) llvm/test/CodeGen/AMDGPU/fold-fabs.ll (+26-11) - (modified) llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll (+88-56) - (modified) llvm/test/CodeGen/AMDGPU/fptoi.i128.ll (+482-348) - (modified) llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll (+1) - (modified) llvm/test/CodeGen/AMDGPU/function-args.ll (+13-6) - (modified) llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll (+6-6) - (modified) llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll (+2-2) - (modified) llvm/test/CodeGen/AMDGPU/global-atomics-fp-wrong-subtarget.ll (+1) - (modified) llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll (+440-304) - (modified) llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll (+416-272) - (modified) llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll (+732-534) - (modified) llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll (+732-534) - (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll (+1601-969) - (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll (+1198-728) - (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll (+1198-728) - (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll (+1501-907) - (modified) llvm/test/CodeGen/AMDGPU/hoist-cond.ll (+1) - (modified) llvm/test/CodeGen/AMDGPU/hsa.ll (-1) - (modified) llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll (+16-12) - (modified) llvm/test/CodeGen/AMDGPU/i1-copy-phi.ll (+1) - (modified) llvm/test/CodeGen/AMDGPU/i1_copy_phi_with_phi_incoming_value.mir (+8-19) - (modified) llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll (+237-297) - (modified) llvm/test/CodeGen/AMDGPU/image-sample-waterfall.ll (+8-4) - (modified) llvm/test/CodeGen/AMDGPU/indirect-call.ll (+144-174) - (modified) llvm/test/CodeGen/AMDGPU/infinite-loop.ll (+1) - (modified) llvm/test/CodeGen/AMDGPU/inline-asm.ll (+1) - (modified) llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll (+20-10) - (modified) llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll (+136-96) - (modified) llvm/test/CodeGen/AMDGPU/itofp.i128.bf.ll (+74-40) - (modified) llvm/test/CodeGen/AMDGPU/itofp.i128.ll (+464-260) - (modified) llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll (+51-39) - (modified) llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll (+110-66) - (modified) llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.load.format.f16.ll (+10-15) - (modified) llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.load.format.ll (+12-18) - (modified) llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.load.ll (+70-105) - (modified) llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.format.f16.ll (+22-33) - (modified) llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.format.f32.ll (+24-36) - (modified) llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.buffer.store.ll (+66-99) - (modified) llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.format.f16.ll (+10-15) - (modified) llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.format.ll (+12-18) - (modified) llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.load.ll (+70-105) - (modified) 
llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.format.f16.ll (+22-33) - (modified) llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.format.f32.ll (+24-36) - (modified) llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.buffer.store.ll (+66-99) - (modified) llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.load.f16.ll (+16-24) - (modified) llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.load.ll (+18-27) - (modified) llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.store.f16.ll (+20-30) - (modified) llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.ptr.tbuffer.store.ll (+50-75) - (modified) llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.load.f16.ll (+16-24) - (modified) llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.load.ll (+18-27) - (modified) llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.store.f16.ll (+20-30) - (modified) llvm/test/CodeGen/AMDGPU/legalize-amdgcn.raw.tbuffer.store.ll (+50-75) - (modified) llvm/test/CodeGen/AMDGPU/legalize-soffset-mbuf.ll (+64-96) - (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll (+1) - (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.swap.ll (+1) - (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll (+1) - (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i32.ll (+16-8) - (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll (+16-8) - (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll (+1) - (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll (+158-108) - (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll (+158-108) - (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.ll (+1) - (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll (+20-12) - (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll (+22-20) - (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.format.v3f16.ll (+17-16) - (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll (+204-168) - (modified) llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll (+462-350) - (modified) llvm/test/CodeGen/AMDGPU/long-branch-reserve-register.ll (+11-9) - (modified) llvm/test/CodeGen/AMDGPU/loop-live-out-copy-undef-subrange.ll (+4-3) - (modified) llvm/test/CodeGen/AMDGPU/loop-on-function-argument.ll (+5-3) - (modified) llvm/test/CodeGen/AMDGPU/loop_break.ll (+25-17) - (modified) llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll (+22-16) - (modified) llvm/test/CodeGen/AMDGPU/lower-control-flow-live-intervals.mir (+59-47) - (modified) llvm/test/CodeGen/AMDGPU/lower-control-flow-live-variables-update.mir (+96-101) - (modified) llvm/test/CodeGen/AMDGPU/lower-control-flow-live-variables-update.xfail.mir (+2-1) - (modified) llvm/test/CodeGen/AMDGPU/lower-control-flow-other-terminators.mir (+33-39) - (modified) llvm/test/CodeGen/AMDGPU/lower-i1-copies-clear-kills.mir (+4-8) - (modified) llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir (+15-14) - (modified) llvm/test/CodeGen/AMDGPU/machine-sink-lane-mask.mir (+2-6) - (modified) llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll (+24-16) - (modified) llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.mir (+2-6) - (modified) llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll (+226-181) ``````````diff diff --git a/clang/test/CodeGenCUDA/atomics-remarks-gfx90a.cu b/clang/test/CodeGenCUDA/atomics-remarks-gfx90a.cu index 946927d88a1ee..3ca766755a631 100644 --- 
a/clang/test/CodeGenCUDA/atomics-remarks-gfx90a.cu +++ b/clang/test/CodeGenCUDA/atomics-remarks-gfx90a.cu @@ -10,7 +10,7 @@ // GFX90A-CAS: A compare and swap loop was generated for an atomic fadd operation at system memory scope // GFX90A-CAS-LABEL: _Z14atomic_add_casPf // GFX90A-CAS: flat_atomic_cmpswap -// GFX90A-CAS: s_cbranch_execnz +// GFX90A-CAS: s_cbranch_scc1 __device__ float atomic_add_cas(float *p) { return __atomic_fetch_add(p, 1.0f, memory_order_relaxed); } diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index be8048ca2459c..75ad7ed5e3fa2 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -3172,8 +3172,8 @@ def int_amdgcn_loop : Intrinsic<[llvm_i1_ty], [llvm_anyint_ty], [IntrWillReturn, IntrNoCallback, IntrNoFree] >; -def int_amdgcn_end_cf : Intrinsic<[], [llvm_anyint_ty], - [IntrWillReturn, IntrNoCallback, IntrNoFree]>; +def int_amdgcn_wave_reconverge : Intrinsic<[], [llvm_anyint_ty], + [IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; // Represent unreachable in a divergent region. def int_amdgcn_unreachable : Intrinsic<[], [], [IntrConvergent, IntrNoCallback, IntrNoFree]>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index b48a09489653a..9374933986080 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -1553,11 +1553,12 @@ bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const { return true; } -bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const { +bool AMDGPUInstructionSelector::selectWaveReconvergeIntrinsic( + MachineInstr &MI) const { // FIXME: Manually selecting to avoid dealing with the SReg_1 trick // SelectionDAG uses for wave32 vs wave64. 
MachineBasicBlock *BB = MI.getParent(); - BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF)) + BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_WAVE_RECONVERGE)) .add(MI.getOperand(1)); Register Reg = MI.getOperand(1).getReg(); @@ -2083,8 +2084,8 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS( MachineInstr &I) const { unsigned IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID(); switch (IntrinsicID) { - case Intrinsic::amdgcn_end_cf: - return selectEndCfIntrinsic(I); + case Intrinsic::amdgcn_wave_reconverge: + return selectWaveReconvergeIntrinsic(I); case Intrinsic::amdgcn_ds_ordered_add: case Intrinsic::amdgcn_ds_ordered_swap: return selectDSOrderedIntrinsic(I, IntrinsicID); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index f561d5d29efc4..44c89684893f7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -119,7 +119,7 @@ class AMDGPUInstructionSelector final : public InstructionSelector { bool selectReturnAddress(MachineInstr &I) const; bool selectG_INTRINSIC(MachineInstr &I) const; - bool selectEndCfIntrinsic(MachineInstr &MI) const; + bool selectWaveReconvergeIntrinsic(MachineInstr &MI) const; bool selectDSOrderedIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const; bool selectDSGWSIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const; bool selectDSAppendConsume(MachineInstr &MI, bool IsAppend) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 56345d14a331c..368cc98b9a585 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -785,8 +785,6 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop( const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass(); const unsigned MovExecOpc = Subtarget.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; - const unsigned MovExecTermOpc = - Subtarget.isWave32() ? AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term; const unsigned XorTermOpc = Subtarget.isWave32() ? AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term; @@ -949,9 +947,11 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop( B.setInsertPt(*BodyBB, BodyBB->end()); + Register LoopMask = MRI.createVirtualRegister( + TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID)); // Update EXEC, switch all done bits to 0 and all todo bits to 1. B.buildInstr(XorTermOpc) - .addDef(ExecReg) + .addDef(LoopMask) .addReg(ExecReg) .addReg(NewExec); @@ -959,18 +959,15 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop( // s_cbranch_scc0? // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover. - B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB); + B.buildInstr(AMDGPU::SI_WATERFALL_LOOP) + .addReg(LoopMask) + .addReg(NewExec) + .addMBB(LoopBB); // Save the EXEC mask before the loop. BuildMI(MBB, MBB.end(), DL, TII->get(MovExecOpc), SaveExecReg) .addReg(ExecReg); - // Restore the EXEC mask after the loop. - B.setMBB(*RestoreExecBB); - B.buildInstr(MovExecTermOpc) - .addDef(ExecReg) - .addReg(SaveExecReg); - // Set the insert point after the original instruction, so any new // instructions will be in the remainder. 
B.setInsertPt(*RemainderBB, RemainderBB->begin()); @@ -4954,7 +4951,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32); break; } - case Intrinsic::amdgcn_end_cf: { + case Intrinsic::amdgcn_wave_reconverge: { unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); break; diff --git a/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp index 08e1d6b87b0df..68d81a6ffaaff 100644 --- a/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp +++ b/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp @@ -15,6 +15,7 @@ #include "GCNSubtarget.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/UniformityAnalysis.h" +#include "llvm/Analysis/DomTreeUpdater.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" @@ -53,7 +54,7 @@ class SIAnnotateControlFlow : public FunctionPass { Function *Else; Function *IfBreak; Function *Loop; - Function *EndCf; + Function *WaveReconverge; DominatorTree *DT; StackVector Stack; @@ -86,7 +87,7 @@ class SIAnnotateControlFlow : public FunctionPass { bool handleLoop(BranchInst *Term); - bool closeControlFlow(BasicBlock *BB); + bool tryWaveReconverge(BasicBlock *BB); public: static char ID; @@ -141,7 +142,7 @@ void SIAnnotateControlFlow::initialize(Module &M, const GCNSubtarget &ST) { IfBreak = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_if_break, { IntMask }); Loop = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_loop, { IntMask }); - EndCf = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_end_cf, { IntMask }); + WaveReconverge = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_wave_reconverge, { IntMask }); } /// Is the branch condition uniform or did the StructurizeCFG pass @@ -203,8 +204,6 @@ bool SIAnnotateControlFlow::eraseIfUnused(PHINode *Phi) { /// Open a new "If" block bool SIAnnotateControlFlow::openIf(BranchInst *Term) { - if (isUniform(Term)) - return false; IRBuilder<> IRB(Term); Value *IfCall = IRB.CreateCall(If, {Term->getCondition()}); @@ -305,43 +304,43 @@ bool SIAnnotateControlFlow::handleLoop(BranchInst *Term) { } /// Close the last opened control flow -bool SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) { - llvm::Loop *L = LI->getLoopFor(BB); +bool SIAnnotateControlFlow::tryWaveReconverge(BasicBlock *BB) { - assert(Stack.back().first == BB); + if (succ_empty(BB)) + return false; - if (L && L->getHeader() == BB) { - // We can't insert an EndCF call into a loop header, because it will - // get executed on every iteration of the loop, when it should be - // executed only once before the loop. - SmallVector <BasicBlock *, 8> Latches; - L->getLoopLatches(Latches); + BranchInst *Term = dyn_cast<BranchInst>(BB->getTerminator()); + if (Term->getNumSuccessors() == 1) { + // The current BB's single successor is the top of the stack. We need to + // reconverge over that path. + BasicBlock *SingleSucc = *succ_begin(BB); + BasicBlock::iterator InsPt = Term ?
BasicBlock::iterator(Term) : BB->end(); - SmallVector<BasicBlock *, 2> Preds; - for (BasicBlock *Pred : predecessors(BB)) { - if (!is_contained(Latches, Pred)) - Preds.push_back(Pred); + if (isTopOfStack(SingleSucc)) { + Value *Exec = Stack.back().second; + IRBuilder<>(BB, InsPt).CreateCall(WaveReconverge, {Exec}); } - - BB = SplitBlockPredecessors(BB, Preds, "endcf.split", DT, LI, nullptr, - false); - } - - Value *Exec = popSaved(); - BasicBlock::iterator FirstInsertionPt = BB->getFirstInsertionPt(); - if (!isa<UndefValue>(Exec) && !isa<UnreachableInst>(FirstInsertionPt)) { - Instruction *ExecDef = cast<Instruction>(Exec); - BasicBlock *DefBB = ExecDef->getParent(); - if (!DT->dominates(DefBB, BB)) { - // Split edge to make Def dominate Use - FirstInsertionPt = SplitEdge(DefBB, BB, DT, LI)->getFirstInsertionPt(); + } else { + // We have a uniform conditional branch terminating the block. + // This block may be the last in the Then path of the enclosing divergent + // IF. + if (!isUniform(Term)) + // A divergent loop is going to be processed further in another place. + return false; + + for (auto Succ : Term->successors()) { + if (isTopOfStack(Succ)) { + // Just split to make room for the later WAVE_RECONVERGE insertion. + SmallVector<BasicBlock*, 2> Preds; + for (auto P : predecessors(Succ)) { + if (DT->dominates(BB, P)) + Preds.push_back(P); + } + DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); + SplitBlockPredecessors(Succ, Preds, ".reconverge", &DTU, LI, + nullptr, false); + } } - IRBuilder<> IRB(FirstInsertionPt->getParent(), FirstInsertionPt); - // TODO: StructurizeCFG 'Flow' blocks have debug locations from the - // condition, for now just avoid copying these DebugLocs so that stepping - // out of the then/else block in a debugger doesn't step to the condition. - IRB.SetCurrentDebugLocation(DebugLoc()); - IRB.CreateCall(EndCf, {Exec}); } return true; @@ -365,14 +364,20 @@ bool SIAnnotateControlFlow::runOnFunction(Function &F) { if (!Term || Term->isUnconditional()) { if (isTopOfStack(BB)) - Changed |= closeControlFlow(BB); + Stack.pop_back(); + + Changed |= tryWaveReconverge(BB); continue; } if (I.nodeVisited(Term->getSuccessor(1))) { if (isTopOfStack(BB)) - Changed |= closeControlFlow(BB); + Stack.pop_back(); + + // Let's take care of a uniform loop latch that may be closing the Then + // path of the enclosing divergent branch. + Changed |= tryWaveReconverge(BB); if (DT->dominates(Term->getSuccessor(1), BB)) Changed |= handleLoop(Term); @@ -387,10 +392,15 @@ continue; } - Changed |= closeControlFlow(BB); + Stack.pop_back(); } - Changed |= openIf(Term); + if (isUniform(Term)) + // A uniform conditional branch may be in the block that closes the Then + // path of the divergent conditional branch.
+ Changed |= tryWaveReconverge(BB); + else + Changed |= openIf(Term); } if (!Stack.empty()) { diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index d7b6941fcf81d..ea1e7c782e02d 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -6299,7 +6299,7 @@ unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const { return AMDGPUISD::ELSE; case Intrinsic::amdgcn_loop: return AMDGPUISD::LOOP; - case Intrinsic::amdgcn_end_cf: + case Intrinsic::amdgcn_wave_reconverge: llvm_unreachable("should not occur"); default: return 0; @@ -9940,8 +9940,8 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, return SDValue(Load, 0); } - case Intrinsic::amdgcn_end_cf: - return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other, + case Intrinsic::amdgcn_wave_reconverge: + return SDValue(DAG.getMachineNode(AMDGPU::SI_WAVE_RECONVERGE, DL, MVT::Other, Op->getOperand(2), Chain), 0); case Intrinsic::amdgcn_s_barrier_init: case Intrinsic::amdgcn_s_barrier_join: @@ -15740,6 +15740,32 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const { } } + // ISel inserts copies to registers for the successor PHIs + // at the end of the BB. We need to move the SI_WAVE_RECONVERGE right before the + // branch. + for (auto &MBB : MF) { + for (auto &MI : MBB) { + if (MI.getOpcode() == AMDGPU::SI_WAVE_RECONVERGE) { + MachineBasicBlock::iterator I(MI); + MachineBasicBlock::iterator Next = std::next(I); + bool NeedToMove = false; + while (Next != MBB.end() && !Next->isBranch()) { + NeedToMove = true; + Next++; + } + + assert((Next == MBB.end() || !Next->readsRegister(AMDGPU::SCC, TRI)) && + "Malformed CFG detected!\n"); + + if (NeedToMove) { + MBB.splice(Next, &MBB, &MI); + } + + break; + } + } + } + // FIXME: This is a hack to fixup AGPR classes to use the properly aligned // classes if required. Ideally the register class constraints would differ // per-subtarget, but there's no easy way to achieve that right now. This is @@ -16336,7 +16362,7 @@ static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited, default: Result = false; break; - case Intrinsic::amdgcn_end_cf: + case Intrinsic::amdgcn_wave_reconverge: case Intrinsic::amdgcn_loop: Result = true; break; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 08351c49b2231..3412846a5abd9 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -2103,12 +2103,36 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { MI.setDesc(get(AMDGPU::S_MOV_B64)); break; + case AMDGPU::S_CMOV_B64_term: + // This is only a terminator to get the correct spill code placement during + // register allocation. + MI.setDesc(get(AMDGPU::S_CMOV_B64)); + break; + case AMDGPU::S_MOV_B32_term: // This is only a terminator to get the correct spill code placement during // register allocation. MI.setDesc(get(AMDGPU::S_MOV_B32)); break; + case AMDGPU::S_CMOV_B32_term: + // This is only a terminator to get the correct spill code placement during + // register allocation. + MI.setDesc(get(AMDGPU::S_CMOV_B32)); + break; + + case AMDGPU::S_CSELECT_B32_term: + // This is only a terminator to get the correct spill code placement during + // register allocation. + MI.setDesc(get(AMDGPU::S_CSELECT_B32)); + break; + + case AMDGPU::S_CSELECT_B64_term: + // This is only a terminator to get the correct spill code placement during + // register allocation.
+ MI.setDesc(get(AMDGPU::S_CSELECT_B64)); + break; + case AMDGPU::S_XOR_B64_term: // This is only a terminator to get the correct spill code placement during // register allocation. @@ -3088,20 +3112,25 @@ bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, while (I != E && !I->isBranch() && !I->isReturn()) { switch (I->getOpcode()) { case AMDGPU::S_MOV_B64_term: + case AMDGPU::S_CMOV_B64_term: case AMDGPU::S_XOR_B64_term: case AMDGPU::S_OR_B64_term: case AMDGPU::S_ANDN2_B64_term: case AMDGPU::S_AND_B64_term: case AMDGPU::S_AND_SAVEEXEC_B64_term: + case AMDGPU::S_CSELECT_B64_term: case AMDGPU::S_MOV_B32_term: + case AMDGPU::S_CMOV_B32_term: case AMDGPU::S_XOR_B32_term: case AMDGPU::S_OR_B32_term: case AMDGPU::S_ANDN2_B32_term: case AMDGPU::S_AND_B32_term: case AMDGPU::S_AND_SAVEEXEC_B32_term: + case AMDGPU::S_CSELECT_B32_term: break; case AMDGPU::SI_IF: case AMDGPU::SI_ELSE: + case AMDGPU::SI_WAVE_RECONVERGE: case AMDGPU::SI_KILL_I1_TERMINATOR: case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: // FIXME: It's messy that these need to be considered here at all. @@ -6386,6 +6415,7 @@ static void emitLoadScalarOpsFromVGPRLoop( } Register SaveExec = MRI.createVirtualRegister(BoolXExecRC); + Register LoopMask = MRI.createVirtualRegister(BoolXExecRC); MRI.setSimpleHint(SaveExec, CondReg); // Update EXEC to matching lanes, saving original to SaveExec. @@ -6396,11 +6426,14 @@ static void emitLoadScalarOpsFromVGPRLoop( I = BodyBB.end(); // Update EXEC, switch all done bits to 0 and all todo bits to 1. - BuildMI(BodyBB, I, DL, TII.get(XorTermOpc), Exec) + BuildMI(BodyBB, I, DL, TII.get(XorTermOpc), LoopMask) .addReg(Exec) .addReg(SaveExec); - BuildMI(BodyBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)).addMBB(&LoopBB); + BuildMI(BodyBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)) + .addReg(LoopMask) + .addReg(SaveExec) + .addMBB(&LoopBB); } // Build a waterfall loop around \p MI, replacing the VGPR \p ScalarOp register @@ -6502,8 +6535,10 @@ loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, .addImm(0); } + // BuildMI(*BodyBB, BodyBB->end(), DL, TII.get(AMDGPU::S_BRANCH)) + // .addMBB(RemainderBB); // Restore the EXEC mask - BuildMI(*RemainderBB, First, DL, TII.get(MovExecOpc), Exec).addReg(SaveExec); + // BuildMI(*RemainderBB, First, DL, TII.get(MovExecOpc), Exec).addReg(SaveExec); return BodyBB; } @@ -8782,7 +8817,7 @@ void SIInstrInfo::convertNonUniformIfRegion(MachineBasicBlock *IfEntry, .add(Branch->getOperand(0)) .add(Branch->getOperand(1)); MachineInstr *SIEND = - BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_END_CF)) + BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_WAVE_RECONVERGE)) .addReg(DstReg); IfEntry->erase(TI); diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index e7aeaa017306c..c526d5ad662eb 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -350,6 +350,8 @@ class WrapTerminatorInst<SOP_Pseudo base_inst> : SPseudoInstSI< let WaveSizePredicate = isWave64 in { def S_MOV_B64_term : WrapTerminatorInst<S_MOV_B64>; +def S_CMOV_B64_term : WrapTerminatorInst<S_CMOV_B64>; +def S_CSELECT_B64_term : WrapTerminatorInst<S_CSELECT_B64>; def S_XOR_B64_term : WrapTerminatorInst<S_XOR_B64>; def S_OR_B64_term : WrapTerminatorInst<S_OR_B64>; def S_ANDN2_B64_term : WrapTerminatorInst<S_ANDN2_B64>; @@ -359,6 +361,8 @@ def S_AND_SAVEEXEC_B64_term : WrapTerminatorInst<S_AND_SAVEEXEC_B64>; let WaveSizePredicate = isWave32 in { def S_MOV_B32_term : 
WrapTerminatorInst<S_MOV_B32>; +def S_CMOV_B32_term : WrapTerminatorInst<S_CMOV_B32>; +def S_CSELECT_B32_term : WrapTerminatorInst<S_CSELECT_B32>; def S_XOR_B32_term : WrapTerminatorInst<S_XOR_B32>; def S_OR_B32_term : WrapTerminatorInst<S_OR_B32>; def S_ANDN2_B32_term : WrapTerminatorInst<S_ANDN2_B32>; @@ -460,7 +464,7 @@ def SI_ELSE : CFPseudoInstSI < def SI_WATERFALL_LOOP : CFPseudoInstSI < (outs), - (ins brtarget:$target), [], 1> { + (ins SReg_1:$LoopMask, SReg_1:$ExitMask, brtarget:$target), [], 1> { let Size = 8;... [truncated] `````````` </details> https://github.com/llvm/llvm-project/pull/92809
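---

For readers skimming the truncated diff, below is a minimal, hand-written LLVM IR sketch of the reconvergence-placement change. It is illustrative only: the function, block names, and values are hypothetical, the `.i64` mangling assumes wave64, and it shows roughly how the IR looks after SIAnnotateControlFlow rather than an actual test from the patch. The old `llvm.amdgcn.end.cf` sat at the start of the join block, which is exactly where the register allocator prefers to insert reloads, so reloads could execute with a partial exec mask; the new `llvm.amdgcn.wave.reconverge` sits at the end of the divergent predecessor, so the join block begins with exec already restored.

```llvm
declare i32 @llvm.amdgcn.workitem.id.x()
declare { i1, i64 } @llvm.amdgcn.if.i64(i1)
declare void @llvm.amdgcn.wave.reconverge.i64(i64)

define amdgpu_kernel void @sketch(ptr addrspace(1) %out) {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %cc = icmp eq i32 %tid, 0
  ; SIAnnotateControlFlow wraps the divergent branch in llvm.amdgcn.if.
  %if = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %cc)
  %guard = extractvalue { i1, i64 } %if, 0
  %mask = extractvalue { i1, i64 } %if, 1
  br i1 %guard, label %then, label %join

then:
  store i32 1, ptr addrspace(1) %out
  ; New scheme: reconverge at the end of the divergent region's last block,
  ; right before branching to the join.
  call void @llvm.amdgcn.wave.reconverge.i64(i64 %mask)
  br label %join

join:
  ; Old scheme instead placed "call void @llvm.amdgcn.end.cf.i64(i64 %mask)"
  ; here, at the top of the join block, ahead of any reloads the register
  ; allocator inserts at the block start.
  ret void
}
```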