[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Fix isExtractHiElt when selecting fma_mix (PR #102130)
https://github.com/petar-avramovic created https://github.com/llvm/llvm-project/pull/102130

isExtractHiElt should return the new source register instead of the
instruction that defines it. Src = MI.getOperand(0).getReg() is not
correct when MI (for example, G_UNMERGE_VALUES) defines multiple
registers. Refactor the existing code to work with source registers only.

>From b6c03d1785bd713344c6b578869a0b36fc6473e3 Mon Sep 17 00:00:00 2001
From: Petar Avramovic
Date: Tue, 6 Aug 2024 13:50:35 +0200
Subject: [PATCH] AMDGPU/GlobalISel: Fix isExtractHiElt when selecting fma_mix

isExtractHiElt should return the new source register instead of the
instruction that defines it. Src = MI.getOperand(0).getReg() is not
correct when MI (for example, G_UNMERGE_VALUES) defines multiple
registers. Refactor the existing code to work with source registers only.

---
 .../AMDGPU/AMDGPUInstructionSelector.cpp      | 164 --
 .../Target/AMDGPU/AMDGPUInstructionSelector.h |   2 +-
 .../GlobalISel/combine-fma-add-ext-fma.ll     |   8 +-
 3 files changed, 74 insertions(+), 100 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 73f3921b2ff4c..f78699f88de56 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1372,8 +1372,8 @@ bool AMDGPUInstructionSelector::selectIntrinsicCmp(MachineInstr &I) const {
   MachineInstrBuilder SelectedMI;
   MachineOperand &LHS = I.getOperand(2);
   MachineOperand &RHS = I.getOperand(3);
-  auto [Src0, Src0Mods] = selectVOP3ModsImpl(LHS);
-  auto [Src1, Src1Mods] = selectVOP3ModsImpl(RHS);
+  auto [Src0, Src0Mods] = selectVOP3ModsImpl(LHS.getReg());
+  auto [Src1, Src1Mods] = selectVOP3ModsImpl(RHS.getReg());
   Register Src0Reg =
       copyToVGPRIfSrcFolded(Src0, Src0Mods, LHS, &I, /*ForceVGPR*/ true);
   Register Src1Reg =
@@ -2467,14 +2467,48 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
   return false;
 }
 
+static Register stripCopy(Register Reg, MachineRegisterInfo &MRI) {
+  return getDefSrcRegIgnoringCopies(Reg, MRI)->Reg;
+}
+
+static Register stripBitCast(Register Reg, MachineRegisterInfo &MRI) {
+  Register BitcastSrc;
+  if (mi_match(Reg, MRI, m_GBitcast(m_Reg(BitcastSrc))))
+    Reg = BitcastSrc;
+  return Reg;
+}
+
 static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In,
                            Register &Out) {
+  Register Trunc;
+  if (!mi_match(In, MRI, m_GTrunc(m_Reg(Trunc))))
+    return false;
+
   Register LShlSrc;
-  if (mi_match(In, MRI,
-               m_GTrunc(m_GLShr(m_Reg(LShlSrc), m_SpecificICst(16))))) {
-    Out = LShlSrc;
+  Register Cst;
+  if (mi_match(Trunc, MRI, m_GLShr(m_Reg(LShlSrc), m_Reg(Cst)))) {
+    Cst = stripCopy(Cst, MRI);
+    if (mi_match(Cst, MRI, m_SpecificICst(16))) {
+      Out = stripBitCast(LShlSrc, MRI);
+      return true;
+    }
+  }
+
+  MachineInstr *Shuffle = MRI.getVRegDef(Trunc);
+  if (Shuffle->getOpcode() != AMDGPU::G_SHUFFLE_VECTOR)
+    return false;
+
+  assert(MRI.getType(Shuffle->getOperand(0).getReg()) ==
+         LLT::fixed_vector(2, 16));
+
+  ArrayRef<int> Mask = Shuffle->getOperand(3).getShuffleMask();
+  assert(Mask.size() == 2);
+
+  if (Mask[0] == 1 && Mask[1] <= 1) {
+    Out = Shuffle->getOperand(0).getReg();
     return true;
   }
+
   return false;
 }
 
@@ -3550,11 +3584,8 @@ AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
 }
 
-std::pair<Register, unsigned>
-AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root,
-                                              bool IsCanonicalizing,
-                                              bool AllowAbs, bool OpSel) const {
-  Register Src = Root.getReg();
+std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3ModsImpl(
+    Register Src, bool IsCanonicalizing, bool AllowAbs, bool OpSel) const {
   unsigned Mods = 0;
   MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
 
@@ -3617,7 +3648,7 @@ InstructionSelector::ComplexRendererFns
 AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
   Register Src;
   unsigned Mods;
-  std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
+  std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
 
   return {{
       [=](MachineInstrBuilder &MIB) {
@@ -3633,7 +3664,7 @@ InstructionSelector::ComplexRendererFns
 AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const {
   Register Src;
   unsigned Mods;
-  std::tie(Src, Mods) = selectVOP3ModsImpl(Root,
+  std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
                                            /*IsCanonicalizing=*/true,
                                            /*AllowAbs=*/false);
 
@@ -3660,7 +3691,7 @@ InstructionSelector::ComplexRendererFns
 AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
   Register Src;
   unsigned Mods;
-  std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
+  std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
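For illustration, the failure mode looks like this in generic MIR — a hypothetical sketch, not taken from the patch or its tests:

```
%lo:_(s16), %hi:_(s16) = G_UNMERGE_VALUES %v:_(s32)
; When the matched value is %hi, MI.getOperand(0).getReg() on the defining
; G_UNMERGE_VALUES still returns %lo -- the wrong source register.
; Returning the matched source register directly avoids this.
```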
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Fix isExtractHiElt when selecting fma_mix (PR #102130)
petar-avramovic wrote:

> [!WARNING]
> This pull request is not mergeable via GitHub because a downstack PR is
> open. Once all requirements are satisfied, merge this PR as a stack on
> Graphite (https://app.graphite.dev/github/pr/llvm/llvm-project/102130).
> Learn more: https://graphite.dev/docs/merge-pull-requests

* **#102130** 👈 (this PR)
* **#102129**
* `main`

This stack of pull requests is managed by Graphite. Learn more about stacking: https://stacking.dev/

https://github.com/llvm/llvm-project/pull/102130
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Fix isExtractHiElt when selecting fma_mix (PR #102130)
https://github.com/petar-avramovic ready_for_review https://github.com/llvm/llvm-project/pull/102130
[llvm-branch-commits] [llvm] 4ab704d - [AMDGPU][MC] Add tfe disassembler support MIMG opcodes
Author: Petar Avramovic Date: 2021-01-20T10:37:09+01:00 New Revision: 4ab704d62820396af5bd4a4322a5cbc2700a7ec3 URL: https://github.com/llvm/llvm-project/commit/4ab704d62820396af5bd4a4322a5cbc2700a7ec3 DIFF: https://github.com/llvm/llvm-project/commit/4ab704d62820396af5bd4a4322a5cbc2700a7ec3.diff LOG: [AMDGPU][MC] Add tfe disassembler support MIMG opcodes With tfe on there can be a vgpr write to vdata+1. Add tablegen support for 5 register vdata store. This is required for 4 register vdata store with tfe. Differential Revision: https://reviews.llvm.org/D94960 Added: Modified: llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp llvm/lib/Target/AMDGPU/MIMGInstructions.td llvm/test/MC/Disassembler/AMDGPU/gfx10_mimg.txt llvm/test/MC/Disassembler/AMDGPU/gfx8_dasm_all.txt llvm/test/MC/Disassembler/AMDGPU/gfx9_dasm_all.txt llvm/test/MC/Disassembler/AMDGPU/mimg_vi.txt Removed: diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index 08b340c8fd66..4a4aad02938a 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -545,9 +545,8 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const { DstSize = (DstSize + 1) / 2; } - // FIXME: Add tfe support if (MI.getOperand(TFEIdx).getImm()) -return MCDisassembler::Success; +DstSize += 1; if (DstSize == Info->VDataDwords && AddrSize == Info->VAddrDwords) return MCDisassembler::Success; diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td index 7baa6823d16a..54c8cdf196ac 100644 --- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td +++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td @@ -413,6 +413,8 @@ multiclass MIMG_Store op, string asm, bit has_d16, bit mip = 0> { defm _V3 : MIMG_Store_Addr_Helper ; let VDataDwords = 4 in defm _V4 : MIMG_Store_Addr_Helper ; +let VDataDwords = 5 in +defm _V5 : MIMG_Store_Addr_Helper ; } } diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10_mimg.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10_mimg.txt index 1e688c94d793..4b9c899003e4 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx10_mimg.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10_mimg.txt @@ -43,6 +43,54 @@ # GFX10: image_load_mip_pck_sgn v[16:19], v[8:10], s[4:11] dmask:0xf dim:SQ_RSRC_IMG_2D unorm r128 ; encoding: [0x08,0x9f,0x14,0xf0,0x08,0x10,0x01,0x00] 0x08,0x9f,0x14,0xf0,0x08,0x10,0x01,0x00 +# GFX10: image_load v16, v[8:9], s[96:103] dmask:0x1 dim:SQ_RSRC_IMG_2D ; encoding: [0x08,0x01,0x00,0xf0,0x08,0x10,0x18,0x00] +0x08,0x01,0x00,0xf0,0x08,0x10,0x18,0x00 + +# GFX10: image_load v[16:17], v[8:9], s[96:103] dmask:0x3 dim:SQ_RSRC_IMG_2D ; encoding: [0x08,0x03,0x00,0xf0,0x08,0x10,0x18,0x00] +0x08,0x03,0x00,0xf0,0x08,0x10,0x18,0x00 + +# GFX10: image_load v[16:18], v[8:9], s[96:103] dmask:0x7 dim:SQ_RSRC_IMG_2D ; encoding: [0x08,0x07,0x00,0xf0,0x08,0x10,0x18,0x00] +0x08,0x07,0x00,0xf0,0x08,0x10,0x18,0x00 + +# GFX10: image_load v[16:19], v[8:9], s[96:103] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x08,0x0f,0x00,0xf0,0x08,0x10,0x18,0x00] +0x08,0x0f,0x00,0xf0,0x08,0x10,0x18,0x00 + +# GFX10: image_load v[16:17], v[8:9], s[96:103] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe ; encoding: [0x08,0x01,0x01,0xf0,0x08,0x10,0x18,0x00] +0x08,0x01,0x01,0xf0,0x08,0x10,0x18,0x00 + +# GFX10: image_load v[16:18], v[8:9], s[96:103] dmask:0x3 dim:SQ_RSRC_IMG_2D tfe ; encoding: [0x08,0x03,0x01,0xf0,0x08,0x10,0x18,0x00] +0x08,0x03,0x01,0xf0,0x08,0x10,0x18,0x00 + +# GFX10: image_load 
v[16:19], v[8:9], s[96:103] dmask:0x7 dim:SQ_RSRC_IMG_2D tfe ; encoding: [0x08,0x07,0x01,0xf0,0x08,0x10,0x18,0x00] +0x08,0x07,0x01,0xf0,0x08,0x10,0x18,0x00 + +# GFX10: image_load v[16:20], v[8:9], s[96:103] dmask:0xf dim:SQ_RSRC_IMG_2D tfe ; encoding: [0x08,0x0f,0x01,0xf0,0x08,0x10,0x18,0x00] +0x08,0x0f,0x01,0xf0,0x08,0x10,0x18,0x00 + +# GFX10: image_load v16, v[8:9], s[96:103] dmask:0x1 dim:SQ_RSRC_IMG_2D d16 ; encoding: [0x08,0x01,0x00,0xf0,0x08,0x10,0x18,0x80] +0x08,0x01,0x00,0xf0,0x08,0x10,0x18,0x80 + +# GFX10: image_load v16, v[8:9], s[96:103] dmask:0x3 dim:SQ_RSRC_IMG_2D d16 ; encoding: [0x08,0x03,0x00,0xf0,0x08,0x10,0x18,0x80] +0x08,0x03,0x00,0xf0,0x08,0x10,0x18,0x80 + +# GFX10: image_load v[16:17], v[8:9], s[96:103] dmask:0x7 dim:SQ_RSRC_IMG_2D d16 ; encoding: [0x08,0x07,0x00,0xf0,0x08,0x10,0x18,0x80] +0x08,0x07,0x00,0xf0,0x08,0x10,0x18,0x80 + +# GFX10: image_load v[16:17], v[8:9], s[96:103] dmask:0xf dim:SQ_RSRC_IMG_2D d16 ; encoding: [0x08,0x0f,0x00,0xf0,0x08,0x10,0x18,0x80] +0x08,0x0f,0x00,0xf0,0x08,0x10,0x18,0x80 + +# GFX10: image_load v[16:17], v[8:9], s[96:103] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe d16 ; encoding: [0x08,0x01,0x01,0xf0,0x08,0x10,0x18,0x80] +0x08,0x01,0x01,0xf0,0x08,0x10,0x18,0x80 +
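To make the dword accounting concrete, here is a rough sketch of the check convertMIMGInst now performs (simplified; the variable and helper names are assumptions, not the file's actual code):

```cpp
// Number of dwords a MIMG load writes back to VGPRs.
unsigned DstSize = llvm::popcount(DMask); // e.g. dmask:0xf -> 4 dwords
if (IsD16 && !IsGather4)
  DstSize = (DstSize + 1) / 2;            // d16 packs two components per dword
if (HasTFE)
  DstSize += 1;                           // tfe writes a status dword to vdata+1,
                                          // so v[16:19] becomes v[16:20]
```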
[llvm-branch-commits] [llvm] 3a042dc - [AMDGPU] Fix default value of glc for mubuf rtn atomics
Author: Petar Avramovic
Date: 2020-12-07T14:00:08+01:00
New Revision: 3a042dcd2e1ab53244d504cd87a81b577a62a6ea

URL: https://github.com/llvm/llvm-project/commit/3a042dcd2e1ab53244d504cd87a81b577a62a6ea
DIFF: https://github.com/llvm/llvm-project/commit/3a042dcd2e1ab53244d504cd87a81b577a62a6ea.diff

LOG: [AMDGPU] Fix default value of glc for mubuf rtn atomics

Mubuf rtn atomics use GLC_1, thus the default value for the glc operand
should be -1; see https://reviews.llvm.org/D90730. This allows us to
report an error when a rtn atomic requires glc=1 but the input has no
glc operand.

Differential Revision: https://reviews.llvm.org/D92654

Added: 

Modified: 
    llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
    llvm/test/MC/AMDGPU/gfx1030_err.s
    llvm/test/MC/AMDGPU/gfx1030_new.s

Removed: 

diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 45774935287b..56d97588df6e 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -6691,7 +6691,8 @@ void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst,
   addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOffset);
 
   if (!IsAtomic || IsAtomicReturn) {
-    addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC);
+    addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC,
+                          IsAtomicReturn ? -1 : 0);
   }
 
   addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC);

diff --git a/llvm/test/MC/AMDGPU/gfx1030_err.s b/llvm/test/MC/AMDGPU/gfx1030_err.s
index 5a57d3b3048a..dbee18bd2d91 100644
--- a/llvm/test/MC/AMDGPU/gfx1030_err.s
+++ b/llvm/test/MC/AMDGPU/gfx1030_err.s
@@ -140,3 +140,9 @@ ds_write_src2_b32 v1 offset:65535
 
 ds_write_src2_b64 v1 offset:65535
 // GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+buffer_atomic_csub v5, off, s[8:11], s3 offset:4095
+// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: instruction must use glc
+
+global_atomic_csub v2, v[0:1], v2, off offset:100 slc
+// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: instruction must use glc

diff --git a/llvm/test/MC/AMDGPU/gfx1030_new.s b/llvm/test/MC/AMDGPU/gfx1030_new.s
index 8dc977498cd5..94f4ff3a237a 100644
--- a/llvm/test/MC/AMDGPU/gfx1030_new.s
+++ b/llvm/test/MC/AMDGPU/gfx1030_new.s
@@ -27,7 +27,7 @@ global_atomic_csub v2, v0, v2, s[2:3] glc
 global_atomic_csub v2, v0, v2, s[2:3] offset:100 glc slc
 // GFX10: encoding: [0x64,0x80,0xd3,0xdc,0x00,0x02,0x02,0x02]
 
-buffer_atomic_csub v5, off, s[8:11], s3
+buffer_atomic_csub v5, off, s[8:11], s3 glc
 // GFX10: encoding: [0x00,0x40,0xd0,0xe0,0x00,0x05,0x02,0x03]
 
 buffer_atomic_csub v5, off, s[8:11], s3 offset:4095 glc
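For illustration, the assembler behavior after this change, mirroring the tests above:

```
buffer_atomic_csub v5, off, s[8:11], s3 glc   // returning atomic with glc: accepted
buffer_atomic_csub v5, off, s[8:11], s3       // error: instruction must use glc
```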
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RBLegalize rules for load (PR #112882)
https://github.com/petar-avramovic updated https://github.com/llvm/llvm-project/pull/112882 >From f354d303a9addd878dbca7ba88ae71a196173518 Mon Sep 17 00:00:00 2001 From: Petar Avramovic Date: Thu, 17 Oct 2024 16:39:55 +0200 Subject: [PATCH] AMDGPU/GlobalISel: RBLegalize rules for load Add IDs for bit width that cover multiple LLTs: B32 B64 etc. "Predicate" wrapper class for bool predicate functions used to write pretty rules. Predicates can be combined using &&, || and !. Lowering for splitting and widening loads. Write rules for loads to not change existing mir tests from old regbankselect. --- .../Target/AMDGPU/AMDGPURBLegalizeHelper.cpp | 302 - .../Target/AMDGPU/AMDGPURBLegalizeHelper.h| 7 +- .../Target/AMDGPU/AMDGPURBLegalizeRules.cpp | 307 - .../lib/Target/AMDGPU/AMDGPURBLegalizeRules.h | 65 +++- .../AMDGPU/GlobalISel/regbankselect-load.mir | 320 +++--- .../GlobalISel/regbankselect-zextload.mir | 9 +- 6 files changed, 942 insertions(+), 68 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPURBLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURBLegalizeHelper.cpp index a0f6ecedab7a83..f58f0a315096d2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURBLegalizeHelper.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURBLegalizeHelper.cpp @@ -37,6 +37,97 @@ bool RegBankLegalizeHelper::findRuleAndApplyMapping(MachineInstr &MI) { return true; } +void RegBankLegalizeHelper::splitLoad(MachineInstr &MI, + ArrayRef LLTBreakdown, LLT MergeTy) { + MachineFunction &MF = B.getMF(); + assert(MI.getNumMemOperands() == 1); + MachineMemOperand &BaseMMO = **MI.memoperands_begin(); + Register Dst = MI.getOperand(0).getReg(); + const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst); + Register BasePtrReg = MI.getOperand(1).getReg(); + LLT PtrTy = MRI.getType(BasePtrReg); + const RegisterBank *PtrRB = MRI.getRegBankOrNull(BasePtrReg); + LLT OffsetTy = LLT::scalar(PtrTy.getSizeInBits()); + SmallVector LoadPartRegs; + + unsigned ByteOffset = 0; + for (LLT PartTy : LLTBreakdown) { +Register BasePtrPlusOffsetReg; +if (ByteOffset == 0) { + BasePtrPlusOffsetReg = BasePtrReg; +} else { + BasePtrPlusOffsetReg = MRI.createVirtualRegister({PtrRB, PtrTy}); + Register OffsetReg = MRI.createVirtualRegister({PtrRB, OffsetTy}); + B.buildConstant(OffsetReg, ByteOffset); + B.buildPtrAdd(BasePtrPlusOffsetReg, BasePtrReg, OffsetReg); +} +MachineMemOperand *BasePtrPlusOffsetMMO = +MF.getMachineMemOperand(&BaseMMO, ByteOffset, PartTy); +Register PartLoad = MRI.createVirtualRegister({DstRB, PartTy}); +B.buildLoad(PartLoad, BasePtrPlusOffsetReg, *BasePtrPlusOffsetMMO); +LoadPartRegs.push_back(PartLoad); +ByteOffset += PartTy.getSizeInBytes(); + } + + if (!MergeTy.isValid()) { +// Loads are of same size, concat or merge them together. +B.buildMergeLikeInstr(Dst, LoadPartRegs); + } else { +// Load(s) are not all of same size, need to unmerge them to smaller pieces +// of MergeTy type, then merge them all together in Dst. 
+SmallVector MergeTyParts; +for (Register Reg : LoadPartRegs) { + if (MRI.getType(Reg) == MergeTy) { +MergeTyParts.push_back(Reg); + } else { +auto Unmerge = B.buildUnmerge(MergeTy, Reg); +for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i) { + Register UnmergeReg = Unmerge->getOperand(i).getReg(); + MRI.setRegBank(UnmergeReg, *DstRB); + MergeTyParts.push_back(UnmergeReg); +} + } +} +B.buildMergeLikeInstr(Dst, MergeTyParts); + } + MI.eraseFromParent(); +} + +void RegBankLegalizeHelper::widenLoad(MachineInstr &MI, LLT WideTy, + LLT MergeTy) { + MachineFunction &MF = B.getMF(); + assert(MI.getNumMemOperands() == 1); + MachineMemOperand &BaseMMO = **MI.memoperands_begin(); + Register Dst = MI.getOperand(0).getReg(); + const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst); + Register BasePtrReg = MI.getOperand(1).getReg(); + + Register BasePtrPlusOffsetReg; + BasePtrPlusOffsetReg = BasePtrReg; + + MachineMemOperand *BasePtrPlusOffsetMMO = + MF.getMachineMemOperand(&BaseMMO, 0, WideTy); + Register WideLoad = MRI.createVirtualRegister({DstRB, WideTy}); + B.buildLoad(WideLoad, BasePtrPlusOffsetReg, *BasePtrPlusOffsetMMO); + + if (WideTy.isScalar()) { +B.buildTrunc(Dst, WideLoad); + } else { +SmallVector MergeTyParts; +unsigned NumEltsMerge = +MRI.getType(Dst).getSizeInBits() / MergeTy.getSizeInBits(); +auto Unmerge = B.buildUnmerge(MergeTy, WideLoad); +for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i) { + Register UnmergeReg = Unmerge->getOperand(i).getReg(); + MRI.setRegBank(UnmergeReg, *DstRB); + if (i < NumEltsMerge) +MergeTyParts.push_back(UnmergeReg); +} +B.buildMergeLikeInstr(Dst, MergeTyParts); + } + MI.erase
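The "Predicate" wrapper mentioned in the commit message can be pictured like this — a minimal sketch assuming a std::function-based design (names and layout here are illustrative, not the patch's exact code):

```cpp
#include <functional>
#include <utility>

class MachineInstr; // from llvm/CodeGen/MachineInstr.h

// Bool predicate over a MachineInstr that composes with &&, || and !,
// so register-bank legalization rules read declaratively.
class Predicate {
  std::function<bool(const MachineInstr &)> Fn;

public:
  Predicate(std::function<bool(const MachineInstr &)> Fn) : Fn(std::move(Fn)) {}
  bool operator()(const MachineInstr &MI) const { return Fn(MI); }

  Predicate operator&&(const Predicate &RHS) const {
    auto L = Fn, R = RHS.Fn; // copy so the result owns its callables
    return Predicate([L, R](const MachineInstr &MI) { return L(MI) && R(MI); });
  }
  Predicate operator||(const Predicate &RHS) const {
    auto L = Fn, R = RHS.Fn;
    return Predicate([L, R](const MachineInstr &MI) { return L(MI) || R(MI); });
  }
  Predicate operator!() const {
    auto F = Fn;
    return Predicate([F](const MachineInstr &MI) { return !F(MI); });
  }
};
```

A rule condition such as `!isAlign4 || (isAlign4 && isUL)` (predicate names hypothetical) then reads directly off the legality requirement.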
[llvm-branch-commits] [llvm] MachineUniformityAnalysis: Improve isConstantOrUndefValuePhi (PR #112866)
https://github.com/petar-avramovic updated https://github.com/llvm/llvm-project/pull/112866 >From 317c41b80b26e55ee35c5859700d91d36e58cd2a Mon Sep 17 00:00:00 2001 From: Petar Avramovic Date: Thu, 17 Oct 2024 15:43:06 +0200 Subject: [PATCH] MachineUniformityAnalysis: Improve isConstantOrUndefValuePhi Change existing code to match what LLVM-IR version is doing via PHINode::hasConstantOrUndefValue. Most notably this improves number of values that can be allocated to sgpr in AMDGPU's RBSelect. Common case here are phis that appear in structurize-cfg lowering for cycles with multiple exits: Undef incoming value is coming from block that reached cycle exit condition, if other incoming is uniform keep the phi uniform despite the fact it is joining values from pair of blocks that are entered via divergent condition branch. --- llvm/lib/CodeGen/MachineSSAContext.cpp| 23 +- .../AMDGPU/MIR/hidden-diverge-gmir.mir| 28 +++ .../AMDGPU/MIR/hidden-loop-diverge.mir| 4 +- .../AMDGPU/MIR/uses-value-from-cycle.mir | 8 +- .../GlobalISel/divergence-structurizer.mir| 80 -- .../regbankselect-mui-rb-legalize.mir | 70 .../regbankselect-mui-rb-select.mir | 18 ++--- .../AMDGPU/GlobalISel/regbankselect-mui.ll| 81 ++- .../AMDGPU/GlobalISel/regbankselect-mui.mir | 52 ++-- 9 files changed, 188 insertions(+), 176 deletions(-) diff --git a/llvm/lib/CodeGen/MachineSSAContext.cpp b/llvm/lib/CodeGen/MachineSSAContext.cpp index e384187b6e8593..c21838d227e2d3 100644 --- a/llvm/lib/CodeGen/MachineSSAContext.cpp +++ b/llvm/lib/CodeGen/MachineSSAContext.cpp @@ -54,9 +54,28 @@ const MachineBasicBlock *MachineSSAContext::getDefBlock(Register value) const { return F->getRegInfo().getVRegDef(value)->getParent(); } +static bool isUndef(const MachineInstr &MI) { + return MI.getOpcode() == TargetOpcode::G_IMPLICIT_DEF || + MI.getOpcode() == TargetOpcode::IMPLICIT_DEF; +} + +/// MachineInstr equivalent of PHINode::hasConstantOrUndefValue() template <> -bool MachineSSAContext::isConstantOrUndefValuePhi(const MachineInstr &Phi) { - return Phi.isConstantValuePHI(); +bool MachineSSAContext::isConstantOrUndefValuePhi(const MachineInstr &MI) { + if (!MI.isPHI()) +return false; + const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); + Register This = MI.getOperand(0).getReg(); + Register ConstantValue; + for (unsigned i = 1, e = MI.getNumOperands(); i < e; i += 2) { +Register Incoming = MI.getOperand(i).getReg(); +if (Incoming != This && !isUndef(*MRI.getVRegDef(Incoming))) { + if (ConstantValue && ConstantValue != Incoming) +return false; + ConstantValue = Incoming; +} + } + return true; } template <> diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir index ce00edf3363f77..9694a340b5e906 100644 --- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir +++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir @@ -1,24 +1,24 @@ # RUN: llc -mtriple=amdgcn-- -run-pass=print-machine-uniformity -o - %s 2>&1 | FileCheck %s # CHECK-LABEL: MachineUniformityInfo for function: hidden_diverge # CHECK-LABEL: BLOCK bb.0 -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.x) -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_ICMP intpred(slt) -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_XOR %{{[0-9]*}}:_, %{{[0-9]*}}:_ -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS 
intrinsic(@llvm.amdgcn.if) -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if) -# CHECK: DIVERGENT: G_BRCOND %{{[0-9]*}}:_(s1), %bb.1 -# CHECK: DIVERGENT: G_BR %bb.2 +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.x) +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_ICMP intpred(slt) +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_XOR %{{[0-9]*}}:_, %{{[0-9]*}}:_ +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if) +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if) +# CHECK: DIVERGENT: G_BRCOND %{{[0-9]*}}:_(s1), %bb.1 +# CHECK: DIVERGENT: G_BR %bb.2 # CHECK-LABEL: BLOCK bb.1 # CHECK-LABEL: BLOCK bb.2 -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_PHI %{{[0-9]*}}:_(s32), %bb.1, %{{[0-9]*}}:_(s32), %bb.0 -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_PHI %{{[0-9]*}}:_(s1), %bb.1, %{{[0-9]*}}:_(s1), %bb.0 -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC
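The common case from the commit message, as a small generic-MIR sketch with invented register names:

```
; %val is uniform; %undef comes from a G_IMPLICIT_DEF in the block that took
; the cycle exit. Treating undef incomings the way
; PHINode::hasConstantOrUndefValue() does lets this G_PHI stay uniform
; (sgpr-allocatable) even though its predecessors are reached through a
; divergent branch.
%phi:_(s32) = G_PHI %val(s32), %bb.1, %undef(s32), %bb.2
```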
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RBLegalize rules for load (PR #112882)
https://github.com/petar-avramovic ready_for_review https://github.com/llvm/llvm-project/pull/112882
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RBLegalize rules for load (PR #112865)
https://github.com/petar-avramovic closed https://github.com/llvm/llvm-project/pull/112865
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RBLegalize rules for load (PR #112865)
petar-avramovic wrote:

Ignore this, see https://github.com/llvm/llvm-project/pull/112882

https://github.com/llvm/llvm-project/pull/112865
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RBLegalize (PR #112864)
@@ -0,0 +1,334 @@ +//===-- AMDGPURBLegalizeRules.cpp -===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===--===// +// +/// Definitions of RBLegalize Rules for all opcodes. +/// Implementation of container for all the Rules and search. +/// Fast search for most common case when Rule.Predicate checks LLT and +/// uniformity of register in operand 0. +// +//===--===// + +#include "AMDGPURBLegalizeRules.h" +#include "GCNSubtarget.h" +#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" +#include "llvm/IR/IntrinsicsAMDGPU.h" + +using namespace llvm; +using namespace AMDGPU; + +RegBankLLTMapping::RegBankLLTMapping( +std::initializer_list DstOpMappingList, +std::initializer_list SrcOpMappingList, +LoweringMethodID LoweringMethod) +: DstOpMapping(DstOpMappingList), SrcOpMapping(SrcOpMappingList), + LoweringMethod(LoweringMethod) {} + +PredicateMapping::PredicateMapping( +std::initializer_list OpList, +std::function TestFunc) +: OpUniformityAndTypes(OpList), TestFunc(TestFunc) {} + +bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID, + const MachineUniformityInfo &MUI, + const MachineRegisterInfo &MRI) { + switch (UniID) { + case S1: +return MRI.getType(Reg) == LLT::scalar(1); + case S16: +return MRI.getType(Reg) == LLT::scalar(16); + case S32: +return MRI.getType(Reg) == LLT::scalar(32); + case S64: +return MRI.getType(Reg) == LLT::scalar(64); + case P1: +return MRI.getType(Reg) == LLT::pointer(1, 64); + + case UniS1: +return MRI.getType(Reg) == LLT::scalar(1) && MUI.isUniform(Reg); + case UniS16: +return MRI.getType(Reg) == LLT::scalar(16) && MUI.isUniform(Reg); + case UniS32: +return MRI.getType(Reg) == LLT::scalar(32) && MUI.isUniform(Reg); + case UniS64: +return MRI.getType(Reg) == LLT::scalar(64) && MUI.isUniform(Reg); + + case DivS1: +return MRI.getType(Reg) == LLT::scalar(1) && MUI.isDivergent(Reg); + case DivS32: +return MRI.getType(Reg) == LLT::scalar(32) && MUI.isDivergent(Reg); + case DivS64: +return MRI.getType(Reg) == LLT::scalar(64) && MUI.isDivergent(Reg); + case DivP1: +return MRI.getType(Reg) == LLT::pointer(1, 64) && MUI.isDivergent(Reg); + + case _: petar-avramovic wrote: That is "don't check anything", for example used in: ``` addRulesForIOpcs({amdgcn_frexp_exp}) .Any({{DivS32, _, S32}, {{Vgpr32}, {IntrId, Vgpr32}}}) .Any({{DivS32, _, S64}, {{Vgpr32}, {IntrId, Vgpr64}}}); ``` it is used to skip non-virtual-registers(intrinsics ID, imm, physical register or $noreg) in PredicateMapping when writing rules for amdgpu-intrinsics https://github.com/llvm/llvm-project/pull/112864 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RBLegalize (PR #112864)
@@ -0,0 +1,258 @@
+//===- AMDGPURBLegalizeRules ---------------------------------*- C++ -*-==//

petar-avramovic wrote:

I don't think sharing is a good option for this patch. RBLegalizeRules is much more flexible and, I would assume, faster because of the "FastPredicateSlot". If we add more IDs that work with LLTs only, we could rewrite the Legalizer using RBLegalizeRules. The other way around is questionable; I did not consider upgrading LegalityPredicate and LegalizeMutation to work with register banks.

https://github.com/llvm/llvm-project/pull/112864
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RBLegalize (PR #112864)
@@ -0,0 +1,118 @@ +//===- AMDGPURBLegalizeHelper *- C++ -*-==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===--===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPURBLEGALIZEHELPER_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPURBLEGALIZEHELPER_H + +#include "AMDGPURBLegalizeRules.h" +#include "AMDGPURegisterBankInfo.h" +#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" + +namespace llvm { +namespace AMDGPU { + +// Receives list of RegBankLLTMapingApplyID and applies register banks on all +// operands. It is user's responsibility to provide RegBankLLTMapingApplyIDs for +// all register operands, there is no need to specify NonReg for trailing imm +// operands. This finishes selection of register banks if there is no need to +// replace instruction. In other case InstApplyMethod will create new +// instruction(s). +class RegBankLegalizeHelper { + MachineIRBuilder &B; + MachineRegisterInfo &MRI; + const MachineUniformityInfo &MUI; + const RegisterBankInfo &RBI; + const RegBankLegalizeRules &RBLRules; + const RegisterBank *SgprRB; + const RegisterBank *VgprRB; + const RegisterBank *VccRB; + + LLT S1 = LLT::scalar(1); + LLT S16 = LLT::scalar(16); + LLT S32 = LLT::scalar(32); + LLT S64 = LLT::scalar(64); + LLT V2S16 = LLT::fixed_vector(2, 16); + LLT V2S32 = LLT::fixed_vector(2, 32); + LLT V3S32 = LLT::fixed_vector(3, 32); + LLT V4S32 = LLT::fixed_vector(4, 32); + LLT V6S32 = LLT::fixed_vector(6, 32); + LLT V7S32 = LLT::fixed_vector(7, 32); + LLT V8S32 = LLT::fixed_vector(8, 32); + + LLT V3S64 = LLT::fixed_vector(3, 64); + LLT V4S64 = LLT::fixed_vector(4, 64); + LLT V16S64 = LLT::fixed_vector(16, 64); + + LLT P1 = LLT::pointer(1, 64); + LLT P4 = LLT::pointer(4, 64); + LLT P6 = LLT::pointer(6, 32); + +public: + RegBankLegalizeHelper(MachineIRBuilder &B, MachineRegisterInfo &MRI, +const MachineUniformityInfo &MUI, +const RegisterBankInfo &RBI, +const RegBankLegalizeRules &RBLRules) + : B(B), MRI(MRI), MUI(MUI), RBI(RBI), RBLRules(RBLRules), +SgprRB(&RBI.getRegBank(AMDGPU::SGPRRegBankID)), +VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)), +VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {}; + + bool findRuleAndApplyMapping(MachineInstr &MI); + + // Manual apply helpers. + void applyMappingPHI(MachineInstr &MI); + void applyMappingTrivial(MachineInstr &MI); + +private: + Register createVgpr(LLT Ty) { +return MRI.createVirtualRegister({VgprRB, Ty}); + } + Register createSgpr(LLT Ty) { +return MRI.createVirtualRegister({SgprRB, Ty}); + } + Register createVcc() { return MRI.createVirtualRegister({VccRB, S1}); } + + const RegisterBank *getRegBank(Register Reg) { +const RegisterBank *RB = MRI.getRegBankOrNull(Reg); +// This assert is not guaranteed by default. RB-select ensures that all +// instructions that we want to RB-legalize have reg banks on all registers. +// There might be a few exceptions. Workaround for them is to not write +// 'mapping' for register operand that is expected to have reg class. +assert(RB); +return RB; petar-avramovic wrote: > What about the assigned register class case? I intended for RBLegalize to work with register banks only. RBSelect is meant to deal with assigned register classes: - on def in first loop // Assign register banks to ALL def registers on G_ instructions. - on use in second loop // Reassign uses of G_ instructions to only have register banks. 
https://github.com/llvm/llvm-project/pull/112864
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RBLegalize (PR #112864)
@@ -69,6 +81,241 @@ FunctionPass *llvm::createAMDGPURBLegalizePass() { using namespace AMDGPU; +const RegBankLegalizeRules &getRules(const GCNSubtarget &ST, + MachineRegisterInfo &MRI) { + static std::mutex GlobalMutex; + static SmallDenseMap> + CacheForRuleSet; + std::lock_guard Lock(GlobalMutex); + if (!CacheForRuleSet.contains(ST.getGeneration())) { +auto Rules = std::make_unique(ST, MRI); +CacheForRuleSet[ST.getGeneration()] = std::move(Rules); + } else { +CacheForRuleSet[ST.getGeneration()]->refreshRefs(ST, MRI); + } + return *CacheForRuleSet[ST.getGeneration()]; +} + bool AMDGPURBLegalize::runOnMachineFunction(MachineFunction &MF) { + + const GCNSubtarget &ST = MF.getSubtarget(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + + // Setup the instruction builder with CSE. + std::unique_ptr MIRBuilder; + const TargetPassConfig &TPC = getAnalysis(); + GISelCSEAnalysisWrapper &Wrapper = + getAnalysis().getCSEWrapper(); + GISelCSEInfo *CSEInfo = nullptr; + GISelObserverWrapper Observer; + + if (TPC.isGISelCSEEnabled()) { +MIRBuilder = std::make_unique(); +CSEInfo = &Wrapper.get(TPC.getCSEConfig()); +MIRBuilder->setCSEInfo(CSEInfo); +Observer.addObserver(CSEInfo); +MIRBuilder->setChangeObserver(Observer); + } else { +MIRBuilder = std::make_unique(); + } + MIRBuilder->setMF(MF); + + RAIIDelegateInstaller DelegateInstaller(MF, &Observer); + RAIIMFObserverInstaller MFObserverInstaller(MF, Observer); + + const MachineUniformityInfo &MUI = + getAnalysis().getUniformityInfo(); + const RegisterBankInfo &RBI = *MF.getSubtarget().getRegBankInfo(); + + // RegBankLegalizeRules is initialized with assigning sets of IDs to opcodes. + const RegBankLegalizeRules &RBLRules = getRules(ST, MRI); + + // Logic that does legalization based on IDs assigned to Opcode. + RegBankLegalizeHelper RBLegalizeHelper(*MIRBuilder, MRI, MUI, RBI, RBLRules); + + SmallVector AllInst; + + for (auto &MBB : MF) { +for (MachineInstr &MI : MBB) { + AllInst.push_back(&MI); +} + } + + for (auto &MI : AllInst) { +if (!MI->isPreISelOpcode()) + continue; + +unsigned Opc = MI->getOpcode(); + +// Insert point for use operands needs some calculation. +if (Opc == G_PHI) { + RBLegalizeHelper.applyMappingPHI(*MI); + continue; +} + +// Opcodes that support pretty much all combinations of reg banks and LLTs +// (except S1). There is no point in writing rules for them. +if (Opc == G_BUILD_VECTOR || Opc == G_UNMERGE_VALUES || +Opc == G_MERGE_VALUES) { + RBLegalizeHelper.applyMappingTrivial(*MI); + continue; +} + +// Opcodes that also support S1. S1 rules are in RegBankLegalizeRules. +// Remaining reg bank and LLT combinations are trivially accepted. +if ((Opc == G_CONSTANT || Opc == G_FCONSTANT || Opc == G_IMPLICIT_DEF) && +!isS1(MI->getOperand(0).getReg(), MRI)) { + assert(isSgprRB(MI->getOperand(0).getReg(), MRI)); + continue; +} + +if (!RBLegalizeHelper.findRuleAndApplyMapping(*MI)) { + MI->dump(); + llvm_unreachable("failed to match any of the rules"); +} + } + + LLT S1 = LLT::scalar(1); + LLT S16 = LLT::scalar(16); + LLT S32 = LLT::scalar(32); + LLT S64 = LLT::scalar(64); + + // SGPR S1 clean up combines: + // - SGPR S1(S32) to SGPR S1(S32) Copy: anyext + trunc combine. + // In RBLegalize 'S1 Dst' are legalized into S32 as'S1Dst = Trunc S32Dst' + // and 'S1 Src' into 'S32Src = Anyext S1Src'. + // S1 Truncs and Anyexts that come from legalizer will also be cleaned up. + // Note: they can have non-S32 types e.g. S16 = Anyext S1 or S1 = Trunc S64. + // - Sgpr S1(S32) to VCC Copy: G_COPY_VCC_SCC combine. 
+ // Divergent instruction uses Sgpr S1 as input that should be lane mask(VCC) + // Legalizing this use creates Sgpr S1(S32) to VCC Copy. + + // Note: Remaining S1 copies, S1s are either SGPR S1(S32) or VCC S1: + // - VCC to VCC Copy: nothing to do here, just a regular copy. + // - VCC to SGPR S1 Copy: Should not exist in a form of COPY instruction(*). + // Note: For 'uniform-in-VCC to SGPR-S1 copy' G_COPY_SCC_VCC is used + // instead. When only available instruction creates VCC result, use of + // UniformInVcc results in creating G_COPY_SCC_VCC. + + // (*)Explanation for 'SGPR S1(uniform) = COPY VCC(divergent)': + // Copy from divergent to uniform register indicates an error in either: + // - Uniformity analysis: Uniform instruction has divergent input. If one of + // the inputs is divergent, instruction should be divergent! + // - RBLegalizer not executing in waterfall loop (missing implementation) + + using namespace MIPatternMatch; + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + + for (auto &MBB : MF) { +for (auto &MI : make_early_inc_range(MBB)) { + + if (MI.getOpcode() == G_TRUNC && isTriviallyDead(MI, MRI)) { +
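The first cleanup combine described in that comment, as a short MIR sketch (invented virtual registers):

```
; RBLegalize materialized an SGPR S1 def/use pair as:
%src:sgpr(s32) = ...
%s1:sgpr(s1)   = G_TRUNC %src(s32)    ; 'S1 Dst' legalized via trunc
%ext:sgpr(s32) = G_ANYEXT %s1(s1)     ; 'S1 Src' legalized via anyext
; the anyext + trunc combine replaces the pair with a copy:
%ext:sgpr(s32) = COPY %src(s32)
```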
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RBSelect (PR #112863)
@@ -63,4 +70,189 @@ char &llvm::AMDGPURBSelectID = AMDGPURBSelect::ID; FunctionPass *llvm::createAMDGPURBSelectPass() { return new AMDGPURBSelect(); } -bool AMDGPURBSelect::runOnMachineFunction(MachineFunction &MF) { return true; } +bool shouldRBSelect(MachineInstr &MI) { + if (isTargetSpecificOpcode(MI.getOpcode()) && !MI.isPreISelOpcode()) +return false; + + if (MI.getOpcode() == AMDGPU::PHI || MI.getOpcode() == AMDGPU::IMPLICIT_DEF) +return false; + + if (MI.isInlineAsm()) +return false; + + return true; +} + +void setRB(MachineInstr &MI, MachineOperand &DefOP, MachineIRBuilder B, + MachineRegisterInfo &MRI, const RegisterBank &RB) { + Register Reg = DefOP.getReg(); + // Register that already has Register class got it during pre-inst selection + // of another instruction. Maybe cross bank copy was required so we insert a + // copy trat can be removed later. This simplifies post-rb-legalize artifact + // combiner and avoids need to special case some patterns. + if (MRI.getRegClassOrNull(Reg)) { +LLT Ty = MRI.getType(Reg); +Register NewReg = MRI.createVirtualRegister({&RB, Ty}); +DefOP.setReg(NewReg); + +auto &MBB = *MI.getParent(); +B.setInsertPt(MBB, MI.isPHI() ? MBB.getFirstNonPHI() + : std::next(MI.getIterator())); +B.buildCopy(Reg, NewReg); + +// The problem was discoverd for uniform S1 that was used as both +// lane mask(vcc) and regular sgpr S1. +// - lane-mask(vcc) use was by si_if, this use is divergent and requires +// non-trivial sgpr-S1-to-vcc copy. But pre-inst-selection of si_if sets +// sreg_64_xexec(S1) on def of uniform S1 making it lane-mask. +// - the regular regular sgpr S1(uniform) instruction is now broken since +// it uses sreg_64_xexec(S1) which is divergent. + +// "Clear" reg classes from uses on generic instructions and but register +// banks instead. +for (auto &UseMI : MRI.use_instructions(Reg)) { + if (shouldRBSelect(UseMI)) { +for (MachineOperand &Op : UseMI.operands()) { + if (Op.isReg() && Op.isUse() && Op.getReg() == Reg) +Op.setReg(NewReg); +} + } +} + + } else { +MRI.setRegBank(Reg, RB); petar-avramovic wrote: Since this is our pass I felt there was no need to complicate it with observers https://github.com/llvm/llvm-project/pull/112863 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RBSelect (PR #112863)
@@ -63,4 +70,189 @@ char &llvm::AMDGPURBSelectID = AMDGPURBSelect::ID; FunctionPass *llvm::createAMDGPURBSelectPass() { return new AMDGPURBSelect(); } -bool AMDGPURBSelect::runOnMachineFunction(MachineFunction &MF) { return true; } +bool shouldRBSelect(MachineInstr &MI) { + if (isTargetSpecificOpcode(MI.getOpcode()) && !MI.isPreISelOpcode()) +return false; + + if (MI.getOpcode() == AMDGPU::PHI || MI.getOpcode() == AMDGPU::IMPLICIT_DEF) +return false; + + if (MI.isInlineAsm()) +return false; + + return true; +} + +void setRB(MachineInstr &MI, MachineOperand &DefOP, MachineIRBuilder B, + MachineRegisterInfo &MRI, const RegisterBank &RB) { + Register Reg = DefOP.getReg(); + // Register that already has Register class got it during pre-inst selection + // of another instruction. Maybe cross bank copy was required so we insert a + // copy trat can be removed later. This simplifies post-rb-legalize artifact + // combiner and avoids need to special case some patterns. + if (MRI.getRegClassOrNull(Reg)) { +LLT Ty = MRI.getType(Reg); +Register NewReg = MRI.createVirtualRegister({&RB, Ty}); +DefOP.setReg(NewReg); + +auto &MBB = *MI.getParent(); +B.setInsertPt(MBB, MI.isPHI() ? MBB.getFirstNonPHI() + : std::next(MI.getIterator())); +B.buildCopy(Reg, NewReg); + +// The problem was discoverd for uniform S1 that was used as both +// lane mask(vcc) and regular sgpr S1. +// - lane-mask(vcc) use was by si_if, this use is divergent and requires +// non-trivial sgpr-S1-to-vcc copy. But pre-inst-selection of si_if sets +// sreg_64_xexec(S1) on def of uniform S1 making it lane-mask. +// - the regular regular sgpr S1(uniform) instruction is now broken since +// it uses sreg_64_xexec(S1) which is divergent. + +// "Clear" reg classes from uses on generic instructions and but register +// banks instead. +for (auto &UseMI : MRI.use_instructions(Reg)) { + if (shouldRBSelect(UseMI)) { +for (MachineOperand &Op : UseMI.operands()) { + if (Op.isReg() && Op.isUse() && Op.getReg() == Reg) +Op.setReg(NewReg); +} + } +} + + } else { +MRI.setRegBank(Reg, RB); + } +} + +void setRBUse(MachineInstr &MI, MachineOperand &UseOP, MachineIRBuilder B, + MachineRegisterInfo &MRI, const RegisterBank &RB) { + Register Reg = UseOP.getReg(); + + LLT Ty = MRI.getType(Reg); + Register NewReg = MRI.createVirtualRegister({&RB, Ty}); + UseOP.setReg(NewReg); + + if (MI.isPHI()) { +auto DefMI = MRI.getVRegDef(Reg)->getIterator(); +MachineBasicBlock *DefMBB = DefMI->getParent(); +B.setInsertPt(*DefMBB, DefMBB->SkipPHIsAndLabels(std::next(DefMI))); + } else { +B.setInstr(MI); + } + + B.buildCopy(NewReg, Reg); +} + +// Temporal divergence copy: COPY to vgpr with implicit use of $exec inside of +// the cycle +// Note: uniformity analysis does not consider that registers with vgpr def are +// divergent (you can have uniform value in vgpr). 
+// - TODO: implicit use of $exec could be implemented as indicator that +// instruction is divergent +bool isTemporalDivergenceCopy(Register Reg, MachineRegisterInfo &MRI) { + MachineInstr *MI = MRI.getVRegDef(Reg); + if (MI->getOpcode() == AMDGPU::COPY) { +for (auto Op : MI->implicit_operands()) { + if (!Op.isReg()) +continue; + Register Reg = Op.getReg(); + if (Reg == AMDGPU::EXEC) { +return true; + } +} + } + + return false; +} + +Register getVReg(MachineOperand &Op) { + if (!Op.isReg()) +return 0; + + Register Reg = Op.getReg(); + if (!Reg.isVirtual()) +return 0; + + return Reg; +} + +bool AMDGPURBSelect::runOnMachineFunction(MachineFunction &MF) { + MachineUniformityInfo &MUI = + getAnalysis().getUniformityInfo(); + AMDGPU::IntrinsicLaneMaskAnalyzer ILMA(MF); + MachineRegisterInfo &MRI = MF.getRegInfo(); + const RegisterBankInfo &RBI = *MF.getSubtarget().getRegBankInfo(); + + MachineIRBuilder B(MF); + + // Assign register banks to ALL def registers on G_ instructions. + // Same for copies if they have no register bank or class on def. + for (MachineBasicBlock &MBB : MF) { +for (MachineInstr &MI : MBB) { + if (!shouldRBSelect(MI)) +continue; + + for (MachineOperand &DefOP : MI.defs()) { +Register DefReg = getVReg(DefOP); +if (!DefReg) + continue; + +// Copies can have register class on def registers. +if (MI.isCopy() && MRI.getRegClassOrNull(DefReg)) { + continue; +} + +if (MUI.isUniform(DefReg) || ILMA.isS32S64LaneMask(DefReg)) { + setRB(MI, DefOP, B, MRI, RBI.getRegBank(AMDGPU::SGPRRegBankID)); +} else { + if (MRI.getType(DefReg) == LLT::scalar(1)) +setRB(MI, DefOP, B, MRI, RBI.getRegBank(AMDGPU::VCCRegBankID)); + else +setRB(MI, DefOP, B, MRI, RBI.getRegBank(A
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RBSelect (PR #112863)
@@ -63,4 +70,189 @@ char &llvm::AMDGPURBSelectID = AMDGPURBSelect::ID; FunctionPass *llvm::createAMDGPURBSelectPass() { return new AMDGPURBSelect(); } -bool AMDGPURBSelect::runOnMachineFunction(MachineFunction &MF) { return true; } +bool shouldRBSelect(MachineInstr &MI) { + if (isTargetSpecificOpcode(MI.getOpcode()) && !MI.isPreISelOpcode()) +return false; + + if (MI.getOpcode() == AMDGPU::PHI || MI.getOpcode() == AMDGPU::IMPLICIT_DEF) +return false; + + if (MI.isInlineAsm()) +return false; + + return true; +} + +void setRB(MachineInstr &MI, MachineOperand &DefOP, MachineIRBuilder B, + MachineRegisterInfo &MRI, const RegisterBank &RB) { + Register Reg = DefOP.getReg(); + // Register that already has Register class got it during pre-inst selection + // of another instruction. Maybe cross bank copy was required so we insert a + // copy trat can be removed later. This simplifies post-rb-legalize artifact + // combiner and avoids need to special case some patterns. + if (MRI.getRegClassOrNull(Reg)) { +LLT Ty = MRI.getType(Reg); +Register NewReg = MRI.createVirtualRegister({&RB, Ty}); +DefOP.setReg(NewReg); + +auto &MBB = *MI.getParent(); +B.setInsertPt(MBB, MI.isPHI() ? MBB.getFirstNonPHI() + : std::next(MI.getIterator())); +B.buildCopy(Reg, NewReg); + +// The problem was discoverd for uniform S1 that was used as both +// lane mask(vcc) and regular sgpr S1. +// - lane-mask(vcc) use was by si_if, this use is divergent and requires +// non-trivial sgpr-S1-to-vcc copy. But pre-inst-selection of si_if sets +// sreg_64_xexec(S1) on def of uniform S1 making it lane-mask. +// - the regular regular sgpr S1(uniform) instruction is now broken since +// it uses sreg_64_xexec(S1) which is divergent. + +// "Clear" reg classes from uses on generic instructions and but register +// banks instead. +for (auto &UseMI : MRI.use_instructions(Reg)) { + if (shouldRBSelect(UseMI)) { +for (MachineOperand &Op : UseMI.operands()) { + if (Op.isReg() && Op.isUse() && Op.getReg() == Reg) +Op.setReg(NewReg); +} + } +} + + } else { +MRI.setRegBank(Reg, RB); + } +} + +void setRBUse(MachineInstr &MI, MachineOperand &UseOP, MachineIRBuilder B, + MachineRegisterInfo &MRI, const RegisterBank &RB) { + Register Reg = UseOP.getReg(); + + LLT Ty = MRI.getType(Reg); + Register NewReg = MRI.createVirtualRegister({&RB, Ty}); + UseOP.setReg(NewReg); + + if (MI.isPHI()) { +auto DefMI = MRI.getVRegDef(Reg)->getIterator(); +MachineBasicBlock *DefMBB = DefMI->getParent(); +B.setInsertPt(*DefMBB, DefMBB->SkipPHIsAndLabels(std::next(DefMI))); + } else { +B.setInstr(MI); + } + + B.buildCopy(NewReg, Reg); +} + +// Temporal divergence copy: COPY to vgpr with implicit use of $exec inside of +// the cycle +// Note: uniformity analysis does not consider that registers with vgpr def are +// divergent (you can have uniform value in vgpr). +// - TODO: implicit use of $exec could be implemented as indicator that +// instruction is divergent +bool isTemporalDivergenceCopy(Register Reg, MachineRegisterInfo &MRI) { + MachineInstr *MI = MRI.getVRegDef(Reg); + if (MI->getOpcode() == AMDGPU::COPY) { +for (auto Op : MI->implicit_operands()) { + if (!Op.isReg()) +continue; + Register Reg = Op.getReg(); + if (Reg == AMDGPU::EXEC) { petar-avramovic wrote: I used exec regardless of wave32/64 should I switch to TRI.getExec()? 
```
B.buildInstr(AMDGPU::COPY, {VgprReg}, {Reg})
    .addUse(AMDGPU::EXEC, RegState::Implicit);
```

https://github.com/llvm/llvm-project/pull/112863
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RBLegalize rules for load (PR #112882)
https://github.com/petar-avramovic created https://github.com/llvm/llvm-project/pull/112882 Add IDs for bit width that cover multiple LLTs: B32 B64 etc. "Predicate" wrapper class for bool predicate functions used to write pretty rules. Predicates can be combined using &&, || and !. Lowering for splitting and widening loads. Write rules for loads to not change existing mir tests from old regbankselect. >From 5cc5acd4754d7a78fa7df9ae45542e3c1561f13b Mon Sep 17 00:00:00 2001 From: Petar Avramovic Date: Thu, 17 Oct 2024 16:39:55 +0200 Subject: [PATCH] AMDGPU/GlobalISel: RBLegalize rules for load Add IDs for bit width that cover multiple LLTs: B32 B64 etc. "Predicate" wrapper class for bool predicate functions used to write pretty rules. Predicates can be combined using &&, || and !. Lowering for splitting and widening loads. Write rules for loads to not change existing mir tests from old regbankselect. --- .../Target/AMDGPU/AMDGPURBLegalizeHelper.cpp | 302 - .../Target/AMDGPU/AMDGPURBLegalizeHelper.h| 7 +- .../Target/AMDGPU/AMDGPURBLegalizeRules.cpp | 307 - .../lib/Target/AMDGPU/AMDGPURBLegalizeRules.h | 65 +++- .../AMDGPU/GlobalISel/regbankselect-load.mir | 320 +++--- .../GlobalISel/regbankselect-zextload.mir | 9 +- 6 files changed, 942 insertions(+), 68 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPURBLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURBLegalizeHelper.cpp index a0f6ecedab7a83..f58f0a315096d2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURBLegalizeHelper.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURBLegalizeHelper.cpp @@ -37,6 +37,97 @@ bool RegBankLegalizeHelper::findRuleAndApplyMapping(MachineInstr &MI) { return true; } +void RegBankLegalizeHelper::splitLoad(MachineInstr &MI, + ArrayRef LLTBreakdown, LLT MergeTy) { + MachineFunction &MF = B.getMF(); + assert(MI.getNumMemOperands() == 1); + MachineMemOperand &BaseMMO = **MI.memoperands_begin(); + Register Dst = MI.getOperand(0).getReg(); + const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst); + Register BasePtrReg = MI.getOperand(1).getReg(); + LLT PtrTy = MRI.getType(BasePtrReg); + const RegisterBank *PtrRB = MRI.getRegBankOrNull(BasePtrReg); + LLT OffsetTy = LLT::scalar(PtrTy.getSizeInBits()); + SmallVector LoadPartRegs; + + unsigned ByteOffset = 0; + for (LLT PartTy : LLTBreakdown) { +Register BasePtrPlusOffsetReg; +if (ByteOffset == 0) { + BasePtrPlusOffsetReg = BasePtrReg; +} else { + BasePtrPlusOffsetReg = MRI.createVirtualRegister({PtrRB, PtrTy}); + Register OffsetReg = MRI.createVirtualRegister({PtrRB, OffsetTy}); + B.buildConstant(OffsetReg, ByteOffset); + B.buildPtrAdd(BasePtrPlusOffsetReg, BasePtrReg, OffsetReg); +} +MachineMemOperand *BasePtrPlusOffsetMMO = +MF.getMachineMemOperand(&BaseMMO, ByteOffset, PartTy); +Register PartLoad = MRI.createVirtualRegister({DstRB, PartTy}); +B.buildLoad(PartLoad, BasePtrPlusOffsetReg, *BasePtrPlusOffsetMMO); +LoadPartRegs.push_back(PartLoad); +ByteOffset += PartTy.getSizeInBytes(); + } + + if (!MergeTy.isValid()) { +// Loads are of same size, concat or merge them together. +B.buildMergeLikeInstr(Dst, LoadPartRegs); + } else { +// Load(s) are not all of same size, need to unmerge them to smaller pieces +// of MergeTy type, then merge them all together in Dst. 
+SmallVector MergeTyParts; +for (Register Reg : LoadPartRegs) { + if (MRI.getType(Reg) == MergeTy) { +MergeTyParts.push_back(Reg); + } else { +auto Unmerge = B.buildUnmerge(MergeTy, Reg); +for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i) { + Register UnmergeReg = Unmerge->getOperand(i).getReg(); + MRI.setRegBank(UnmergeReg, *DstRB); + MergeTyParts.push_back(UnmergeReg); +} + } +} +B.buildMergeLikeInstr(Dst, MergeTyParts); + } + MI.eraseFromParent(); +} + +void RegBankLegalizeHelper::widenLoad(MachineInstr &MI, LLT WideTy, + LLT MergeTy) { + MachineFunction &MF = B.getMF(); + assert(MI.getNumMemOperands() == 1); + MachineMemOperand &BaseMMO = **MI.memoperands_begin(); + Register Dst = MI.getOperand(0).getReg(); + const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst); + Register BasePtrReg = MI.getOperand(1).getReg(); + + Register BasePtrPlusOffsetReg; + BasePtrPlusOffsetReg = BasePtrReg; + + MachineMemOperand *BasePtrPlusOffsetMMO = + MF.getMachineMemOperand(&BaseMMO, 0, WideTy); + Register WideLoad = MRI.createVirtualRegister({DstRB, WideTy}); + B.buildLoad(WideLoad, BasePtrPlusOffsetReg, *BasePtrPlusOffsetMMO); + + if (WideTy.isScalar()) { +B.buildTrunc(Dst, WideLoad); + } else { +SmallVector MergeTyParts; +unsigned NumEltsMerge = +MRI.getType(Dst).getSizeInBits() / MergeTy.getSizeInBits(); +auto Unmerge = B.buildUnmerge(MergeTy, WideLoad); +
[llvm-branch-commits] [llvm] MachineUniformityAnalysis: Improve isConstantOrUndefValuePhi (PR #112866)
https://github.com/petar-avramovic edited https://github.com/llvm/llvm-project/pull/112866
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RBLegalize rules for load (PR #112882)
petar-avramovic wrote:

> [!WARNING]
> This pull request is not mergeable via GitHub because a downstack PR is
> open. Once all requirements are satisfied, merge this PR as a stack on
> Graphite (https://app.graphite.dev/github/pr/llvm/llvm-project/112882).
> Learn more: https://graphite.dev/docs/merge-pull-requests

* **#112866**
* **#112882** 👈 (this PR)
* **#112864**: 1 other dependent PR (#112865)
* **#112863**
* **#112862**
* `main`

This stack of pull requests is managed by Graphite. Learn more about stacking: https://stacking.dev/

https://github.com/llvm/llvm-project/pull/112882
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RBLegalize rules for load (PR #112882)
https://github.com/petar-avramovic updated https://github.com/llvm/llvm-project/pull/112882 >From 0be714e28b9a7d2c0ee070827efc1587524bbc28 Mon Sep 17 00:00:00 2001 From: Petar Avramovic Date: Tue, 22 Oct 2024 17:46:04 +0200 Subject: [PATCH] AMDGPU/GlobalISel: RegBankLegalize rules for load Add IDs for bit width that cover multiple LLTs: B32 B64 etc. "Predicate" wrapper class for bool predicate functions used to write pretty rules. Predicates can be combined using &&, || and !. Lowering for splitting and widening loads. Write rules for loads to not change existing mir tests from old regbankselect. --- .../Target/AMDGPU/AMDGPURegBankLegalize.cpp | 3 + .../AMDGPU/AMDGPURegBankLegalizeHelper.cpp| 299 +++- .../AMDGPU/AMDGPURegBankLegalizeHelper.h | 7 +- .../AMDGPU/AMDGPURegBankLegalizeRules.cpp | 309 - .../AMDGPU/AMDGPURegBankLegalizeRules.h | 65 +++- .../AMDGPU/GlobalISel/regbankselect-load.mir | 320 +++--- .../GlobalISel/regbankselect-zextload.mir | 9 +- 7 files changed, 943 insertions(+), 69 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp index f8c2abc0049d9c..1a53f7ee55e42e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp @@ -23,7 +23,10 @@ #include "GCNSubtarget.h" #include "llvm/CodeGen/GlobalISel/CSEInfo.h" #include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h" +#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/InitializePasses.h" diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp index 7220773b94792d..3996167633a221 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp @@ -37,6 +37,97 @@ bool RegBankLegalizeHelper::findRuleAndApplyMapping(MachineInstr &MI) { return true; } +void RegBankLegalizeHelper::splitLoad(MachineInstr &MI, + ArrayRef LLTBreakdown, LLT MergeTy) { + MachineFunction &MF = B.getMF(); + assert(MI.getNumMemOperands() == 1); + MachineMemOperand &BaseMMO = **MI.memoperands_begin(); + Register Dst = MI.getOperand(0).getReg(); + const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst); + Register BasePtrReg = MI.getOperand(1).getReg(); + LLT PtrTy = MRI.getType(BasePtrReg); + const RegisterBank *PtrRB = MRI.getRegBankOrNull(BasePtrReg); + LLT OffsetTy = LLT::scalar(PtrTy.getSizeInBits()); + SmallVector LoadPartRegs; + + unsigned ByteOffset = 0; + for (LLT PartTy : LLTBreakdown) { +Register BasePtrPlusOffsetReg; +if (ByteOffset == 0) { + BasePtrPlusOffsetReg = BasePtrReg; +} else { + BasePtrPlusOffsetReg = MRI.createVirtualRegister({PtrRB, PtrTy}); + Register OffsetReg = MRI.createVirtualRegister({PtrRB, OffsetTy}); + B.buildConstant(OffsetReg, ByteOffset); + B.buildPtrAdd(BasePtrPlusOffsetReg, BasePtrReg, OffsetReg); +} +MachineMemOperand *BasePtrPlusOffsetMMO = +MF.getMachineMemOperand(&BaseMMO, ByteOffset, PartTy); +Register PartLoad = MRI.createVirtualRegister({DstRB, PartTy}); +B.buildLoad(PartLoad, BasePtrPlusOffsetReg, *BasePtrPlusOffsetMMO); +LoadPartRegs.push_back(PartLoad); +ByteOffset += PartTy.getSizeInBytes(); + } + + if (!MergeTy.isValid()) { +// Loads are of same size, concat or merge them together. 
+B.buildMergeLikeInstr(Dst, LoadPartRegs); + } else { +// Load(s) are not all of same size, need to unmerge them to smaller pieces +// of MergeTy type, then merge them all together in Dst. +SmallVector MergeTyParts; +for (Register Reg : LoadPartRegs) { + if (MRI.getType(Reg) == MergeTy) { +MergeTyParts.push_back(Reg); + } else { +auto Unmerge = B.buildUnmerge(MergeTy, Reg); +for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i) { + Register UnmergeReg = Unmerge->getOperand(i).getReg(); + MRI.setRegBank(UnmergeReg, *DstRB); + MergeTyParts.push_back(UnmergeReg); +} + } +} +B.buildMergeLikeInstr(Dst, MergeTyParts); + } + MI.eraseFromParent(); +} + +void RegBankLegalizeHelper::widenLoad(MachineInstr &MI, LLT WideTy, + LLT MergeTy) { + MachineFunction &MF = B.getMF(); + assert(MI.getNumMemOperands() == 1); + MachineMemOperand &BaseMMO = **MI.memoperands_begin(); + Register Dst = MI.getOperand(0).getReg(); + const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst); + Register BasePtrReg = MI.getOperand(1).getReg(); + + Register BasePtrPlusOffsetReg; + BasePtrPlusOffsetReg = BasePtrReg; + + MachineMemOperand *BasePtrPlusOffsetMMO = + MF.
[llvm-branch-commits] [llvm] MachineUniformityAnalysis: Improve isConstantOrUndefValuePhi (PR #112866)
https://github.com/petar-avramovic updated https://github.com/llvm/llvm-project/pull/112866 >From 1b1d2f95a8e6fb552d1853b93e15a12775a7b01b Mon Sep 17 00:00:00 2001 From: Petar Avramovic Date: Tue, 22 Oct 2024 18:06:55 +0200 Subject: [PATCH] MachineUniformityAnalysis: Improve isConstantOrUndefValuePhi Change existing code for G_PHI to match what LLVM-IR version is doing via PHINode::hasConstantOrUndefValue. This is not safe for regular PHI since it may appear with an undef operand and getVRegDef can fail. Most notably this improves number of values that can be allocated to sgpr in AMDGPU's RBSelect. Common case here are phis that appear in structurize-cfg lowering for cycles with multiple exits: Undef incoming value is coming from block that reached cycle exit condition, if other incoming is uniform keep the phi uniform despite the fact it is joining values from pair of blocks that are entered via divergent condition branch. --- llvm/lib/CodeGen/MachineSSAContext.cpp| 28 ++- .../AMDGPU/MIR/hidden-diverge-gmir.mir| 28 +++ .../AMDGPU/MIR/hidden-loop-diverge.mir| 4 +- .../AMDGPU/MIR/uses-value-from-cycle.mir | 8 +- .../GlobalISel/divergence-structurizer.mir| 80 -- .../regbankselect-mui-rb-legalize.mir | 70 .../regbankselect-mui-rb-select.mir | 18 ++--- .../AMDGPU/GlobalISel/regbankselect-mui.ll| 81 ++- .../AMDGPU/GlobalISel/regbankselect-mui.mir | 52 ++-- 9 files changed, 194 insertions(+), 175 deletions(-) diff --git a/llvm/lib/CodeGen/MachineSSAContext.cpp b/llvm/lib/CodeGen/MachineSSAContext.cpp index e384187b6e8593..359d4a11a69060 100644 --- a/llvm/lib/CodeGen/MachineSSAContext.cpp +++ b/llvm/lib/CodeGen/MachineSSAContext.cpp @@ -18,6 +18,7 @@ #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -54,9 +55,34 @@ const MachineBasicBlock *MachineSSAContext::getDefBlock(Register value) const { return F->getRegInfo().getVRegDef(value)->getParent(); } +static bool isUndef(const MachineInstr &MI) { + return MI.getOpcode() == TargetOpcode::G_IMPLICIT_DEF || + MI.getOpcode() == TargetOpcode::IMPLICIT_DEF; +} + +/// MachineInstr equivalent of PHINode::hasConstantOrUndefValue() for G_PHI. template <> bool MachineSSAContext::isConstantOrUndefValuePhi(const MachineInstr &Phi) { - return Phi.isConstantValuePHI(); + if (!Phi.isPHI()) +return false; + + // In later passes PHI may appear with an undef operand, getVRegDef can fail. + if (Phi.getOpcode() == TargetOpcode::PHI) +return Phi.isConstantValuePHI(); + + // For G_PHI we do equivalent of PHINode::hasConstantOrUndefValue(). 
+ const MachineRegisterInfo &MRI = Phi.getMF()->getRegInfo(); + Register This = Phi.getOperand(0).getReg(); + Register ConstantValue; + for (unsigned i = 1, e = Phi.getNumOperands(); i < e; i += 2) { +Register Incoming = Phi.getOperand(i).getReg(); +if (Incoming != This && !isUndef(*MRI.getVRegDef(Incoming))) { + if (ConstantValue && ConstantValue != Incoming) +return false; + ConstantValue = Incoming; +} + } + return true; } template <> diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir index ce00edf3363f77..9694a340b5e906 100644 --- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir +++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir @@ -1,24 +1,24 @@ # RUN: llc -mtriple=amdgcn-- -run-pass=print-machine-uniformity -o - %s 2>&1 | FileCheck %s # CHECK-LABEL: MachineUniformityInfo for function: hidden_diverge # CHECK-LABEL: BLOCK bb.0 -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.x) -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_ICMP intpred(slt) -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_XOR %{{[0-9]*}}:_, %{{[0-9]*}}:_ -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if) -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if) -# CHECK: DIVERGENT: G_BRCOND %{{[0-9]*}}:_(s1), %bb.1 -# CHECK: DIVERGENT: G_BR %bb.2 +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.x) +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_ICMP intpred(slt) +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_XOR %{{[0-9]*}}:_, %{{[0-9]*}}:_ +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if) +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RegBankLegalize rules for load (PR #112882)
https://github.com/petar-avramovic updated https://github.com/llvm/llvm-project/pull/112882 >From 74a1970b576cb6943271614cfe616459cd73956c Mon Sep 17 00:00:00 2001 From: Petar Avramovic Date: Wed, 30 Oct 2024 15:37:59 +0100 Subject: [PATCH] AMDGPU/GlobalISel: RegBankLegalize rules for load Add IDs for bit width that cover multiple LLTs: B32 B64 etc. "Predicate" wrapper class for bool predicate functions used to write pretty rules. Predicates can be combined using &&, || and !. Lowering for splitting and widening loads. Write rules for loads to not change existing mir tests from old regbankselect. --- .../AMDGPU/AMDGPURegBankLegalizeHelper.cpp| 284 +++- .../AMDGPU/AMDGPURegBankLegalizeHelper.h | 5 + .../AMDGPU/AMDGPURegBankLegalizeRules.cpp | 309 - .../AMDGPU/AMDGPURegBankLegalizeRules.h | 65 +++- .../AMDGPU/GlobalISel/regbankselect-load.mir | 320 +++--- .../GlobalISel/regbankselect-zextload.mir | 9 +- 6 files changed, 927 insertions(+), 65 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp index 916140e2bbcd68..86be855d730305 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp @@ -38,6 +38,83 @@ void RegBankLegalizeHelper::findRuleAndApplyMapping(MachineInstr &MI) { lower(MI, Mapping, WaterfallSgprs); } +void RegBankLegalizeHelper::splitLoad(MachineInstr &MI, + ArrayRef LLTBreakdown, LLT MergeTy) { + MachineFunction &MF = B.getMF(); + assert(MI.getNumMemOperands() == 1); + MachineMemOperand &BaseMMO = **MI.memoperands_begin(); + Register Dst = MI.getOperand(0).getReg(); + const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst); + Register Base = MI.getOperand(1).getReg(); + LLT PtrTy = MRI.getType(Base); + const RegisterBank *PtrRB = MRI.getRegBankOrNull(Base); + LLT OffsetTy = LLT::scalar(PtrTy.getSizeInBits()); + SmallVector LoadPartRegs; + + unsigned ByteOffset = 0; + for (LLT PartTy : LLTBreakdown) { +Register BasePlusOffset; +if (ByteOffset == 0) { + BasePlusOffset = Base; +} else { + auto Offset = B.buildConstant({PtrRB, OffsetTy}, ByteOffset); + BasePlusOffset = B.buildPtrAdd({PtrRB, PtrTy}, Base, Offset).getReg(0); +} +auto *OffsetMMO = MF.getMachineMemOperand(&BaseMMO, ByteOffset, PartTy); +auto LoadPart = B.buildLoad({DstRB, PartTy}, BasePlusOffset, *OffsetMMO); +LoadPartRegs.push_back(LoadPart.getReg(0)); +ByteOffset += PartTy.getSizeInBytes(); + } + + if (!MergeTy.isValid()) { +// Loads are of same size, concat or merge them together. +B.buildMergeLikeInstr(Dst, LoadPartRegs); + } else { +// Loads are not all of same size, need to unmerge them to smaller pieces +// of MergeTy type, then merge pieces to Dst. 
+SmallVector MergeTyParts; +for (Register Reg : LoadPartRegs) { + if (MRI.getType(Reg) == MergeTy) { +MergeTyParts.push_back(Reg); + } else { +auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, Reg); +for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i) + MergeTyParts.push_back(Unmerge.getReg(i)); + } +} +B.buildMergeLikeInstr(Dst, MergeTyParts); + } + MI.eraseFromParent(); +} + +void RegBankLegalizeHelper::widenLoad(MachineInstr &MI, LLT WideTy, + LLT MergeTy) { + MachineFunction &MF = B.getMF(); + assert(MI.getNumMemOperands() == 1); + MachineMemOperand &BaseMMO = **MI.memoperands_begin(); + Register Dst = MI.getOperand(0).getReg(); + const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst); + Register Base = MI.getOperand(1).getReg(); + + MachineMemOperand *WideMMO = MF.getMachineMemOperand(&BaseMMO, 0, WideTy); + auto WideLoad = B.buildLoad({DstRB, WideTy}, Base, *WideMMO); + + if (WideTy.isScalar()) { +B.buildTrunc(Dst, WideLoad); + } else { +SmallVector MergeTyParts; +auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, WideLoad); + +LLT DstTy = MRI.getType(Dst); +unsigned NumElts = DstTy.getSizeInBits() / MergeTy.getSizeInBits(); +for (unsigned i = 0; i < NumElts; ++i) { + MergeTyParts.push_back(Unmerge.getReg(i)); +} +B.buildMergeLikeInstr(Dst, MergeTyParts); + } + MI.eraseFromParent(); +} + void RegBankLegalizeHelper::lower(MachineInstr &MI, const RegBankLLTMapping &Mapping, SmallSet &WaterfallSgprs) { @@ -116,6 +193,50 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI, MI.eraseFromParent(); break; } + case SplitLoad: { +LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); +unsigned Size = DstTy.getSizeInBits(); +// Even split to 128-bit loads +if (Size > 128) { + LLT B128; + if (DstTy.isVector()) { +LLT EltTy = DstTy.getElementType(); +B128 = LLT:
[llvm-branch-commits] [llvm] MachineUniformityAnalysis: Improve isConstantOrUndefValuePhi (PR #112866)
https://github.com/petar-avramovic updated https://github.com/llvm/llvm-project/pull/112866 >From a8d15f3f4854a364fc0b905544840112283b41a3 Mon Sep 17 00:00:00 2001 From: Petar Avramovic Date: Thu, 31 Oct 2024 14:10:57 +0100 Subject: [PATCH] MachineUniformityAnalysis: Improve isConstantOrUndefValuePhi Change existing code for G_PHI to match what LLVM-IR version is doing via PHINode::hasConstantOrUndefValue. This is not safe for regular PHI since it may appear with an undef operand and getVRegDef can fail. Most notably this improves number of values that can be allocated to sgpr register bank in AMDGPURegBankSelect. Common case here are phis that appear in structurize-cfg lowering for cycles with multiple exits: Undef incoming value is coming from block that reached cycle exit condition, if other incoming is uniform keep the phi uniform despite the fact it is joining values from pair of blocks that are entered via divergent condition branch. --- llvm/lib/CodeGen/MachineSSAContext.cpp| 27 +- .../AMDGPU/MIR/hidden-diverge-gmir.mir| 28 +++ .../AMDGPU/MIR/hidden-loop-diverge.mir| 4 +- .../AMDGPU/MIR/uses-value-from-cycle.mir | 8 +- .../GlobalISel/divergence-structurizer.mir| 80 -- .../regbankselect-mui-regbanklegalize.mir | 69 --- .../regbankselect-mui-regbankselect.mir | 18 ++-- .../AMDGPU/GlobalISel/regbankselect-mui.ll| 84 ++- .../AMDGPU/GlobalISel/regbankselect-mui.mir | 51 ++- 9 files changed, 191 insertions(+), 178 deletions(-) diff --git a/llvm/lib/CodeGen/MachineSSAContext.cpp b/llvm/lib/CodeGen/MachineSSAContext.cpp index e384187b6e8593..8e13c0916dd9e1 100644 --- a/llvm/lib/CodeGen/MachineSSAContext.cpp +++ b/llvm/lib/CodeGen/MachineSSAContext.cpp @@ -54,9 +54,34 @@ const MachineBasicBlock *MachineSSAContext::getDefBlock(Register value) const { return F->getRegInfo().getVRegDef(value)->getParent(); } +static bool isUndef(const MachineInstr &MI) { + return MI.getOpcode() == TargetOpcode::G_IMPLICIT_DEF || + MI.getOpcode() == TargetOpcode::IMPLICIT_DEF; +} + +/// MachineInstr equivalent of PHINode::hasConstantOrUndefValue() for G_PHI. template <> bool MachineSSAContext::isConstantOrUndefValuePhi(const MachineInstr &Phi) { - return Phi.isConstantValuePHI(); + if (!Phi.isPHI()) +return false; + + // In later passes PHI may appear with an undef operand, getVRegDef can fail. + if (Phi.getOpcode() == TargetOpcode::PHI) +return Phi.isConstantValuePHI(); + + // For G_PHI we do equivalent of PHINode::hasConstantOrUndefValue(). 
+ const MachineRegisterInfo &MRI = Phi.getMF()->getRegInfo(); + Register This = Phi.getOperand(0).getReg(); + Register ConstantValue; + for (unsigned i = 1, e = Phi.getNumOperands(); i < e; i += 2) { +Register Incoming = Phi.getOperand(i).getReg(); +if (Incoming != This && !isUndef(*MRI.getVRegDef(Incoming))) { + if (ConstantValue && ConstantValue != Incoming) +return false; + ConstantValue = Incoming; +} + } + return true; } template <> diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir index ce00edf3363f77..9694a340b5e906 100644 --- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir +++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir @@ -1,24 +1,24 @@ # RUN: llc -mtriple=amdgcn-- -run-pass=print-machine-uniformity -o - %s 2>&1 | FileCheck %s # CHECK-LABEL: MachineUniformityInfo for function: hidden_diverge # CHECK-LABEL: BLOCK bb.0 -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.x) -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_ICMP intpred(slt) -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_XOR %{{[0-9]*}}:_, %{{[0-9]*}}:_ -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if) -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if) -# CHECK: DIVERGENT: G_BRCOND %{{[0-9]*}}:_(s1), %bb.1 -# CHECK: DIVERGENT: G_BR %bb.2 +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.x) +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_ICMP intpred(slt) +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_XOR %{{[0-9]*}}:_, %{{[0-9]*}}:_ +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if) +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if) +# CHECK: DIVERGENT: G_BRCOND %{{[0-9]*}}:_(s1), %bb.1 +# CHECK: DIVERGENT: G_BR %bb.2 # CHECK-LABEL: BLOCK bb.1 # CHECK-LABEL: BLOCK bb.2 -# CHECK: D
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: AMDGPURegBankSelect (PR #112863)
@@ -66,9 +81,232 @@ FunctionPass *llvm::createAMDGPURegBankSelectPass() { return new AMDGPURegBankSelect(); } +class RegBankSelectHelper { + MachineIRBuilder &B; + MachineRegisterInfo &MRI; + AMDGPU::IntrinsicLaneMaskAnalyzer &ILMA; + const MachineUniformityInfo &MUI; + const SIRegisterInfo &TRI; + const RegisterBank *SgprRB; + const RegisterBank *VgprRB; + const RegisterBank *VccRB; + +public: + RegBankSelectHelper(MachineIRBuilder &B, + AMDGPU::IntrinsicLaneMaskAnalyzer &ILMA, + const MachineUniformityInfo &MUI, + const SIRegisterInfo &TRI, const RegisterBankInfo &RBI) + : B(B), MRI(*B.getMRI()), ILMA(ILMA), MUI(MUI), TRI(TRI), +SgprRB(&RBI.getRegBank(AMDGPU::SGPRRegBankID)), +VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)), +VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {} + + bool shouldRegBankSelect(MachineInstr &MI) { +return MI.isPreISelOpcode() || MI.isCopy(); + } + + // Temporal divergence copy: COPY to vgpr with implicit use of $exec inside of + // the cycle + // Note: uniformity analysis does not consider that registers with vgpr def + // are divergent (you can have uniform value in vgpr). + // - TODO: implicit use of $exec could be implemented as indicator that + // instruction is divergent + bool isTemporalDivergenceCopy(Register Reg) { +MachineInstr *MI = MRI.getVRegDef(Reg); +if (!MI->isCopy()) + return false; + +for (auto Op : MI->implicit_operands()) { petar-avramovic wrote: There is nothing useful in the real operands; it is a simple sgpr to vgpr COPY, which is quite common. What makes that COPY special is the implicit exec, for example: `%21:vgpr_32(s32) = COPY %8:sgpr(s32), implicit $exec_lo`. The implicit exec is there to stop other passes from moving the COPY outside of the loop. https://github.com/llvm/llvm-project/pull/112863 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: AMDGPURegBankLegalize (PR #112864)
@@ -69,11 +82,297 @@ FunctionPass *llvm::createAMDGPURegBankLegalizePass() { return new AMDGPURegBankLegalize(); } -using namespace AMDGPU; +const RegBankLegalizeRules &getRules(const GCNSubtarget &ST, + MachineRegisterInfo &MRI) { + static std::mutex GlobalMutex; + static SmallDenseMap> + CacheForRuleSet; + std::lock_guard Lock(GlobalMutex); + if (!CacheForRuleSet.contains(ST.getGeneration())) { +auto Rules = std::make_unique(ST, MRI); +CacheForRuleSet[ST.getGeneration()] = std::move(Rules); + } else { +CacheForRuleSet[ST.getGeneration()]->refreshRefs(ST, MRI); + } + return *CacheForRuleSet[ST.getGeneration()]; +} + +class AMDGPURegBankLegalizeCombiner { + MachineIRBuilder &B; + MachineRegisterInfo &MRI; + const SIRegisterInfo &TRI; + const RegisterBank *SgprRB; + const RegisterBank *VgprRB; + const RegisterBank *VccRB; + + static constexpr LLT S1 = LLT::scalar(1); + static constexpr LLT S16 = LLT::scalar(16); + static constexpr LLT S32 = LLT::scalar(32); + static constexpr LLT S64 = LLT::scalar(64); + +public: + AMDGPURegBankLegalizeCombiner(MachineIRBuilder &B, const SIRegisterInfo &TRI, +const RegisterBankInfo &RBI) + : B(B), MRI(*B.getMRI()), TRI(TRI), +SgprRB(&RBI.getRegBank(AMDGPU::SGPRRegBankID)), +VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)), +VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {}; + + bool isLaneMask(Register Reg) { +const RegisterBank *RB = MRI.getRegBankOrNull(Reg); +if (RB && RB->getID() == AMDGPU::VCCRegBankID) + return true; + +const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg); +return RC && TRI.isSGPRClass(RC) && MRI.getType(Reg) == LLT::scalar(1); + } + + void cleanUpAfterCombine(MachineInstr &MI, MachineInstr *Optional0) { +MI.eraseFromParent(); +if (Optional0 && isTriviallyDead(*Optional0, MRI)) + Optional0->eraseFromParent(); + } + + std::pair tryMatch(Register Src, unsigned Opcode) { +MachineInstr *MatchMI = MRI.getVRegDef(Src); +if (MatchMI->getOpcode() != Opcode) + return {nullptr, Register()}; +return {MatchMI, MatchMI->getOperand(1).getReg()}; + } + + void tryCombineCopy(MachineInstr &MI) { +using namespace llvm::MIPatternMatch; +Register Dst = MI.getOperand(0).getReg(); +Register Src = MI.getOperand(1).getReg(); +// Skip copies of physical registers. +if (!Dst.isVirtual() || !Src.isVirtual()) + return; + +// This is a cross bank copy, sgpr S1 to lane mask. +// +// %Src:sgpr(s1) = G_TRUNC %TruncS32Src:sgpr(s32) +// %Dst:lane-mask(s1) = COPY %Src:sgpr(s1) +// -> +// %Dst:lane-mask(s1) = G_COPY_VCC_SCC %TruncS32Src:sgpr(s32) +if (isLaneMask(Dst) && MRI.getRegBankOrNull(Src) == SgprRB) { + auto [Trunc, TruncS32Src] = tryMatch(Src, AMDGPU::G_TRUNC); + assert(Trunc && MRI.getType(TruncS32Src) == S32 && + "sgpr S1 must be result of G_TRUNC of sgpr S32"); + + B.setInstr(MI); + // Ensure that truncated bits in BoolSrc are 0. + auto One = B.buildConstant({SgprRB, S32}, 1); + auto BoolSrc = B.buildAnd({SgprRB, S32}, TruncS32Src, One); + B.buildInstr(AMDGPU::G_COPY_VCC_SCC, {Dst}, {BoolSrc}); + cleanUpAfterCombine(MI, Trunc); + return; +} + +// Src = G_READANYLANE RALSrc +// Dst = COPY Src +// -> +// Dst = RALSrc +if (MRI.getRegBankOrNull(Dst) == VgprRB && +MRI.getRegBankOrNull(Src) == SgprRB) { + auto [RAL, RALSrc] = tryMatch(Src, AMDGPU::G_READANYLANE); + if (!RAL) +return; + + assert(MRI.getRegBank(RALSrc) == VgprRB); + MRI.replaceRegWith(Dst, RALSrc); + cleanUpAfterCombine(MI, RAL); + return; +} + } + + void tryCombineS1AnyExt(MachineInstr &MI) { +// %Src:sgpr(S1) = G_TRUNC %TruncSrc +// %Dst = G_ANYEXT %Src:sgpr(S1) +// -> +// %Dst = G_... 
%TruncSrc +Register Dst = MI.getOperand(0).getReg(); +Register Src = MI.getOperand(1).getReg(); +if (MRI.getType(Src) != S1) + return; + +auto [Trunc, TruncSrc] = tryMatch(Src, AMDGPU::G_TRUNC); +if (!Trunc) + return; + +LLT DstTy = MRI.getType(Dst); +LLT TruncSrcTy = MRI.getType(TruncSrc); + +if (DstTy == TruncSrcTy) { + MRI.replaceRegWith(Dst, TruncSrc); + cleanUpAfterCombine(MI, Trunc); + return; +} + +B.setInstr(MI); + +if (DstTy == S32 && TruncSrcTy == S64) { + auto Unmerge = B.buildUnmerge({SgprRB, S32}, TruncSrc); + MRI.replaceRegWith(Dst, Unmerge.getReg(0)); + cleanUpAfterCombine(MI, Trunc); + return; +} + +if (DstTy == S32 && TruncSrcTy == S16) { + B.buildAnyExt(Dst, TruncSrc); + cleanUpAfterCombine(MI, Trunc); + return; +} + +if (DstTy == S16 && TruncSrcTy == S32) { + B.buildTrunc(Dst, TruncSrc); + cleanUpAfterCombine(MI, Trunc); + return; +} + +llvm_unreachable("missing an
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: AMDGPURegBankLegalize (PR #112864)
@@ -697,6 +697,15 @@ MachineInstrBuilder MachineIRBuilder::buildUnmerge(LLT Res, return buildInstr(TargetOpcode::G_UNMERGE_VALUES, TmpVec, Op); } +MachineInstrBuilder +MachineIRBuilder::buildUnmerge(MachineRegisterInfo::VRegAttrs Attrs, + const SrcOp &Op) { + LLT OpTy = Op.getLLTTy(*getMRI()); + unsigned NumRegs = OpTy.getSizeInBits() / Attrs.Ty.getSizeInBits(); petar-avramovic wrote: divideCoefficientBy works only for vectors, and we need to unmerge scalars as well. I don't know about scalable vectors; they seem to be used only to mark something as legal, and I don't know whether any actual lowering is done for them. Other places in the builder don't check for them. https://github.com/llvm/llvm-project/pull/112864 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
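A small illustration of why the plain bit-size division covers both scalar and vector sources (a sketch with example LLTs only, not code from the patch; the header path may differ between LLVM versions):

```cpp
#include "llvm/CodeGenTypes/LowLevelType.h"
using namespace llvm;

// Sketch: the generic bit-size division handles scalars and vectors
// uniformly, while LLT::divideCoefficientBy is defined for vectors only.
void unmergeCountExamples() {
  LLT S32 = LLT::scalar(32);
  LLT S64 = LLT::scalar(64);
  LLT V4S32 = LLT::fixed_vector(4, 32);

  // Scalar source: s64 unmerged into s32 pieces -> 64 / 32 = 2 registers.
  unsigned ScalarParts = S64.getSizeInBits() / S32.getSizeInBits();

  // Vector source: <4 x s32> into s32 pieces -> 128 / 32 = 4 registers.
  unsigned VectorParts = V4S32.getSizeInBits() / S32.getSizeInBits();

  // V4S32.divideCoefficientBy(4) would also yield s32, but there is no
  // scalar counterpart, hence the generic division in buildUnmerge.
  (void)ScalarParts;
  (void)VectorParts;
}
```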
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: AMDGPURegBankLegalize (PR #112864)
@@ -217,6 +217,74 @@ bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const { return true; } +bool AMDGPUInstructionSelector::selectCOPY_SCC_VCC(MachineInstr &I) const { petar-avramovic wrote: This allows more registers to be allocated to sgpr. Avoiding these copies could be done later in some optimization pass, once we can decide whether we want a global-isel equivalent of moveToVALU; we would do that not for correctness but to influence register allocation to sgpr or vgpr. Why would we want to avoid the vcc to scc copy? Question for @nhaehnle also. https://github.com/llvm/llvm-project/pull/112864 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
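For context, a conceptual sketch of why these S1 cross-copies are not plain COPYs (illustrative pseudocode, assuming wave64; the exact opcodes are whatever selectCOPY_SCC_VCC and its counterpart actually emit):

```cpp
// Illustrative only, assuming wave64. SCC is a single condition bit while
// VCC is a 64-bit lane mask, so a cross-copy must be materialized:
//
//   vcc -> scc: compare the lane mask against zero,
//     S_CMP_LG_U64 %vcc_src, 0        ; SCC = (%vcc_src != 0)
//
//   scc -> vcc: select an all-ones or all-zeros lane mask,
//     %vcc_dst = S_CSELECT_B64 -1, 0  ; implicit use of $scc
```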
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: AMDGPURegBankLegalize (PR #112864)
petar-avramovic wrote: Added LLVM_DEBUG around dumps, RB->RegBank rename for methods and class names https://github.com/llvm/llvm-project/pull/112864 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: AMDGPURegBankLegalize (PR #112864)
petar-avramovic wrote: Ping. There were changes because of improvements to the builder accepting regbank+LLT; I think I addressed most of the comments. https://github.com/llvm/llvm-project/pull/112864 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: AMDGPURegBankSelect (PR #112863)
https://github.com/petar-avramovic edited https://github.com/llvm/llvm-project/pull/112863 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] MachineUniformityAnalysis: Improve isConstantOrUndefValuePhi (PR #112866)
https://github.com/petar-avramovic updated https://github.com/llvm/llvm-project/pull/112866 >From 98a00e5a2ed28da3a4608d9c209a04f0cff6fe12 Mon Sep 17 00:00:00 2001 From: Petar Avramovic Date: Wed, 30 Oct 2024 15:41:59 +0100 Subject: [PATCH] MachineUniformityAnalysis: Improve isConstantOrUndefValuePhi Change existing code for G_PHI to match what LLVM-IR version is doing via PHINode::hasConstantOrUndefValue. This is not safe for regular PHI since it may appear with an undef operand and getVRegDef can fail. Most notably this improves number of values that can be allocated to sgpr register bank in AMDGPURegBankSelect. Common case here are phis that appear in structurize-cfg lowering for cycles with multiple exits: Undef incoming value is coming from block that reached cycle exit condition, if other incoming is uniform keep the phi uniform despite the fact it is joining values from pair of blocks that are entered via divergent condition branch. --- llvm/lib/CodeGen/MachineSSAContext.cpp| 27 +- .../AMDGPU/MIR/hidden-diverge-gmir.mir| 28 +++ .../AMDGPU/MIR/hidden-loop-diverge.mir| 4 +- .../AMDGPU/MIR/uses-value-from-cycle.mir | 8 +- .../GlobalISel/divergence-structurizer.mir| 80 -- .../regbankselect-mui-regbanklegalize.mir | 69 --- .../regbankselect-mui-regbankselect.mir | 18 ++-- .../AMDGPU/GlobalISel/regbankselect-mui.ll| 84 ++- .../AMDGPU/GlobalISel/regbankselect-mui.mir | 51 ++- 9 files changed, 191 insertions(+), 178 deletions(-) diff --git a/llvm/lib/CodeGen/MachineSSAContext.cpp b/llvm/lib/CodeGen/MachineSSAContext.cpp index e384187b6e8593..8e13c0916dd9e1 100644 --- a/llvm/lib/CodeGen/MachineSSAContext.cpp +++ b/llvm/lib/CodeGen/MachineSSAContext.cpp @@ -54,9 +54,34 @@ const MachineBasicBlock *MachineSSAContext::getDefBlock(Register value) const { return F->getRegInfo().getVRegDef(value)->getParent(); } +static bool isUndef(const MachineInstr &MI) { + return MI.getOpcode() == TargetOpcode::G_IMPLICIT_DEF || + MI.getOpcode() == TargetOpcode::IMPLICIT_DEF; +} + +/// MachineInstr equivalent of PHINode::hasConstantOrUndefValue() for G_PHI. template <> bool MachineSSAContext::isConstantOrUndefValuePhi(const MachineInstr &Phi) { - return Phi.isConstantValuePHI(); + if (!Phi.isPHI()) +return false; + + // In later passes PHI may appear with an undef operand, getVRegDef can fail. + if (Phi.getOpcode() == TargetOpcode::PHI) +return Phi.isConstantValuePHI(); + + // For G_PHI we do equivalent of PHINode::hasConstantOrUndefValue(). 
+ const MachineRegisterInfo &MRI = Phi.getMF()->getRegInfo(); + Register This = Phi.getOperand(0).getReg(); + Register ConstantValue; + for (unsigned i = 1, e = Phi.getNumOperands(); i < e; i += 2) { +Register Incoming = Phi.getOperand(i).getReg(); +if (Incoming != This && !isUndef(*MRI.getVRegDef(Incoming))) { + if (ConstantValue && ConstantValue != Incoming) +return false; + ConstantValue = Incoming; +} + } + return true; } template <> diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir index ce00edf3363f77..9694a340b5e906 100644 --- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir +++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir @@ -1,24 +1,24 @@ # RUN: llc -mtriple=amdgcn-- -run-pass=print-machine-uniformity -o - %s 2>&1 | FileCheck %s # CHECK-LABEL: MachineUniformityInfo for function: hidden_diverge # CHECK-LABEL: BLOCK bb.0 -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.x) -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_ICMP intpred(slt) -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_XOR %{{[0-9]*}}:_, %{{[0-9]*}}:_ -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if) -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if) -# CHECK: DIVERGENT: G_BRCOND %{{[0-9]*}}:_(s1), %bb.1 -# CHECK: DIVERGENT: G_BR %bb.2 +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.x) +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_ICMP intpred(slt) +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_XOR %{{[0-9]*}}:_, %{{[0-9]*}}:_ +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if) +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if) +# CHECK: DIVERGENT: G_BRCOND %{{[0-9]*}}:_(s1), %bb.1 +# CHECK: DIVERGENT: G_BR %bb.2 # CHECK-LABEL: BLOCK bb.1 # CHECK-LABEL: BLOCK bb.2 -# CHECK: D
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RegBankLegalize rules for load (PR #112882)
https://github.com/petar-avramovic updated https://github.com/llvm/llvm-project/pull/112882 >From 4675f79f28222cef60d1607acb1b682ca3363eb6 Mon Sep 17 00:00:00 2001 From: Petar Avramovic Date: Wed, 30 Oct 2024 15:37:59 +0100 Subject: [PATCH] AMDGPU/GlobalISel: RegBankLegalize rules for load Add IDs for bit width that cover multiple LLTs: B32 B64 etc. "Predicate" wrapper class for bool predicate functions used to write pretty rules. Predicates can be combined using &&, || and !. Lowering for splitting and widening loads. Write rules for loads to not change existing mir tests from old regbankselect. --- .../AMDGPU/AMDGPURegBankLegalizeHelper.cpp| 287 +++- .../AMDGPU/AMDGPURegBankLegalizeHelper.h | 5 + .../AMDGPU/AMDGPURegBankLegalizeRules.cpp | 309 - .../AMDGPU/AMDGPURegBankLegalizeRules.h | 65 +++- .../AMDGPU/GlobalISel/regbankselect-load.mir | 320 +++--- .../GlobalISel/regbankselect-zextload.mir | 9 +- 6 files changed, 929 insertions(+), 66 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp index 15ccf1a38af9a5..19d8d466e3b12e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp @@ -36,6 +36,83 @@ void RegBankLegalizeHelper::findRuleAndApplyMapping(MachineInstr &MI) { lower(MI, Mapping, WaterfallSgprs); } +void RegBankLegalizeHelper::splitLoad(MachineInstr &MI, + ArrayRef LLTBreakdown, LLT MergeTy) { + MachineFunction &MF = B.getMF(); + assert(MI.getNumMemOperands() == 1); + MachineMemOperand &BaseMMO = **MI.memoperands_begin(); + Register Dst = MI.getOperand(0).getReg(); + const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst); + Register Base = MI.getOperand(1).getReg(); + LLT PtrTy = MRI.getType(Base); + const RegisterBank *PtrRB = MRI.getRegBankOrNull(Base); + LLT OffsetTy = LLT::scalar(PtrTy.getSizeInBits()); + SmallVector LoadPartRegs; + + unsigned ByteOffset = 0; + for (LLT PartTy : LLTBreakdown) { +Register BasePlusOffset; +if (ByteOffset == 0) { + BasePlusOffset = Base; +} else { + auto Offset = B.buildConstant({PtrRB, OffsetTy}, ByteOffset); + BasePlusOffset = B.buildPtrAdd({PtrRB, PtrTy}, Base, Offset).getReg(0); +} +auto *OffsetMMO = MF.getMachineMemOperand(&BaseMMO, ByteOffset, PartTy); +auto LoadPart = B.buildLoad({DstRB, PartTy}, BasePlusOffset, *OffsetMMO); +LoadPartRegs.push_back(LoadPart.getReg(0)); +ByteOffset += PartTy.getSizeInBytes(); + } + + if (!MergeTy.isValid()) { +// Loads are of same size, concat or merge them together. +B.buildMergeLikeInstr(Dst, LoadPartRegs); + } else { +// Loads are not all of same size, need to unmerge them to smaller pieces +// of MergeTy type, then merge pieces to Dst. 
+SmallVector MergeTyParts; +for (Register Reg : LoadPartRegs) { + if (MRI.getType(Reg) == MergeTy) { +MergeTyParts.push_back(Reg); + } else { +auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, Reg); +for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i) + MergeTyParts.push_back(Unmerge.getReg(i)); + } +} +B.buildMergeLikeInstr(Dst, MergeTyParts); + } + MI.eraseFromParent(); +} + +void RegBankLegalizeHelper::widenLoad(MachineInstr &MI, LLT WideTy, + LLT MergeTy) { + MachineFunction &MF = B.getMF(); + assert(MI.getNumMemOperands() == 1); + MachineMemOperand &BaseMMO = **MI.memoperands_begin(); + Register Dst = MI.getOperand(0).getReg(); + const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst); + Register Base = MI.getOperand(1).getReg(); + + MachineMemOperand *WideMMO = MF.getMachineMemOperand(&BaseMMO, 0, WideTy); + auto WideLoad = B.buildLoad({DstRB, WideTy}, Base, *WideMMO); + + if (WideTy.isScalar()) { +B.buildTrunc(Dst, WideLoad); + } else { +SmallVector MergeTyParts; +auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, WideLoad); + +LLT DstTy = MRI.getType(Dst); +unsigned NumElts = DstTy.getSizeInBits() / MergeTy.getSizeInBits(); +for (unsigned i = 0; i < NumElts; ++i) { + MergeTyParts.push_back(Unmerge.getReg(i)); +} +B.buildMergeLikeInstr(Dst, MergeTyParts); + } + MI.eraseFromParent(); +} + void RegBankLegalizeHelper::lower(MachineInstr &MI, const RegBankLLTMapping &Mapping, SmallSet &WaterfallSgprs) { @@ -114,6 +191,50 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI, MI.eraseFromParent(); break; } + case SplitLoad: { +LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); +unsigned Size = DstTy.getSizeInBits(); +// Even split to 128-bit loads +if (Size > 128) { + LLT B128; + if (DstTy.isVector()) { +LLT EltTy = DstTy.getElementType(); +B128 = LLT:
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RegBankLegalize (PR #112864)
@@ -0,0 +1,334 @@ +//===-- AMDGPURBLegalizeRules.cpp -===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===--===// +// +/// Definitions of RBLegalize Rules for all opcodes. +/// Implementation of container for all the Rules and search. +/// Fast search for most common case when Rule.Predicate checks LLT and +/// uniformity of register in operand 0. +// +//===--===// + +#include "AMDGPURBLegalizeRules.h" +#include "GCNSubtarget.h" +#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" +#include "llvm/IR/IntrinsicsAMDGPU.h" + +using namespace llvm; +using namespace AMDGPU; + +RegBankLLTMapping::RegBankLLTMapping( +std::initializer_list DstOpMappingList, +std::initializer_list SrcOpMappingList, +LoweringMethodID LoweringMethod) +: DstOpMapping(DstOpMappingList), SrcOpMapping(SrcOpMappingList), + LoweringMethod(LoweringMethod) {} + +PredicateMapping::PredicateMapping( +std::initializer_list OpList, +std::function TestFunc) +: OpUniformityAndTypes(OpList), TestFunc(TestFunc) {} + +bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID, + const MachineUniformityInfo &MUI, + const MachineRegisterInfo &MRI) { + switch (UniID) { + case S1: +return MRI.getType(Reg) == LLT::scalar(1); + case S16: +return MRI.getType(Reg) == LLT::scalar(16); + case S32: +return MRI.getType(Reg) == LLT::scalar(32); + case S64: +return MRI.getType(Reg) == LLT::scalar(64); + case P1: +return MRI.getType(Reg) == LLT::pointer(1, 64); + + case UniS1: +return MRI.getType(Reg) == LLT::scalar(1) && MUI.isUniform(Reg); + case UniS16: +return MRI.getType(Reg) == LLT::scalar(16) && MUI.isUniform(Reg); + case UniS32: +return MRI.getType(Reg) == LLT::scalar(32) && MUI.isUniform(Reg); + case UniS64: +return MRI.getType(Reg) == LLT::scalar(64) && MUI.isUniform(Reg); + + case DivS1: +return MRI.getType(Reg) == LLT::scalar(1) && MUI.isDivergent(Reg); + case DivS32: +return MRI.getType(Reg) == LLT::scalar(32) && MUI.isDivergent(Reg); + case DivS64: +return MRI.getType(Reg) == LLT::scalar(64) && MUI.isDivergent(Reg); + case DivP1: +return MRI.getType(Reg) == LLT::pointer(1, 64) && MUI.isDivergent(Reg); + + case _: +return true; + default: +llvm_unreachable("missing matchUniformityAndLLT\n"); + } +} + +bool PredicateMapping::match(const MachineInstr &MI, + const MachineUniformityInfo &MUI, + const MachineRegisterInfo &MRI) const { + // Check LLT signature. + for (unsigned i = 0; i < OpUniformityAndTypes.size(); ++i) { +if (OpUniformityAndTypes[i] == _) { + if (MI.getOperand(i).isReg() && + MI.getOperand(i).getReg() != AMDGPU::NoRegister) petar-avramovic wrote: Removed NoRegister check since we don't need it at the moment. https://github.com/llvm/llvm-project/pull/112864 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RegBankLegalize (PR #112864)
@@ -69,6 +81,241 @@ FunctionPass *llvm::createAMDGPURBLegalizePass() { using namespace AMDGPU; +const RegBankLegalizeRules &getRules(const GCNSubtarget &ST, + MachineRegisterInfo &MRI) { + static std::mutex GlobalMutex; + static SmallDenseMap> + CacheForRuleSet; + std::lock_guard Lock(GlobalMutex); + if (!CacheForRuleSet.contains(ST.getGeneration())) { +auto Rules = std::make_unique(ST, MRI); +CacheForRuleSet[ST.getGeneration()] = std::move(Rules); + } else { +CacheForRuleSet[ST.getGeneration()]->refreshRefs(ST, MRI); + } + return *CacheForRuleSet[ST.getGeneration()]; +} + bool AMDGPURBLegalize::runOnMachineFunction(MachineFunction &MF) { + + const GCNSubtarget &ST = MF.getSubtarget(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + + // Setup the instruction builder with CSE. + std::unique_ptr MIRBuilder; + const TargetPassConfig &TPC = getAnalysis(); + GISelCSEAnalysisWrapper &Wrapper = + getAnalysis().getCSEWrapper(); + GISelCSEInfo *CSEInfo = nullptr; + GISelObserverWrapper Observer; + + if (TPC.isGISelCSEEnabled()) { +MIRBuilder = std::make_unique(); +CSEInfo = &Wrapper.get(TPC.getCSEConfig()); +MIRBuilder->setCSEInfo(CSEInfo); +Observer.addObserver(CSEInfo); +MIRBuilder->setChangeObserver(Observer); + } else { +MIRBuilder = std::make_unique(); + } + MIRBuilder->setMF(MF); + + RAIIDelegateInstaller DelegateInstaller(MF, &Observer); + RAIIMFObserverInstaller MFObserverInstaller(MF, Observer); + + const MachineUniformityInfo &MUI = + getAnalysis().getUniformityInfo(); + const RegisterBankInfo &RBI = *MF.getSubtarget().getRegBankInfo(); + + // RegBankLegalizeRules is initialized with assigning sets of IDs to opcodes. + const RegBankLegalizeRules &RBLRules = getRules(ST, MRI); + + // Logic that does legalization based on IDs assigned to Opcode. + RegBankLegalizeHelper RBLegalizeHelper(*MIRBuilder, MRI, MUI, RBI, RBLRules); + + SmallVector AllInst; + + for (auto &MBB : MF) { +for (MachineInstr &MI : MBB) { + AllInst.push_back(&MI); +} + } + + for (auto &MI : AllInst) { +if (!MI->isPreISelOpcode()) + continue; + +unsigned Opc = MI->getOpcode(); + +// Insert point for use operands needs some calculation. +if (Opc == G_PHI) { + RBLegalizeHelper.applyMappingPHI(*MI); + continue; +} + +// Opcodes that support pretty much all combinations of reg banks and LLTs +// (except S1). There is no point in writing rules for them. +if (Opc == G_BUILD_VECTOR || Opc == G_UNMERGE_VALUES || +Opc == G_MERGE_VALUES) { + RBLegalizeHelper.applyMappingTrivial(*MI); + continue; +} + +// Opcodes that also support S1. S1 rules are in RegBankLegalizeRules. +// Remaining reg bank and LLT combinations are trivially accepted. +if ((Opc == G_CONSTANT || Opc == G_FCONSTANT || Opc == G_IMPLICIT_DEF) && +!isS1(MI->getOperand(0).getReg(), MRI)) { + assert(isSgprRB(MI->getOperand(0).getReg(), MRI)); + continue; +} + +if (!RBLegalizeHelper.findRuleAndApplyMapping(*MI)) { + MI->dump(); + llvm_unreachable("failed to match any of the rules"); +} + } + + LLT S1 = LLT::scalar(1); + LLT S16 = LLT::scalar(16); + LLT S32 = LLT::scalar(32); + LLT S64 = LLT::scalar(64); + + // SGPR S1 clean up combines: + // - SGPR S1(S32) to SGPR S1(S32) Copy: anyext + trunc combine. + // In RBLegalize 'S1 Dst' are legalized into S32 as'S1Dst = Trunc S32Dst' + // and 'S1 Src' into 'S32Src = Anyext S1Src'. + // S1 Truncs and Anyexts that come from legalizer will also be cleaned up. + // Note: they can have non-S32 types e.g. S16 = Anyext S1 or S1 = Trunc S64. + // - Sgpr S1(S32) to VCC Copy: G_COPY_VCC_SCC combine. 
+ // Divergent instruction uses Sgpr S1 as input that should be lane mask(VCC) + // Legalizing this use creates Sgpr S1(S32) to VCC Copy. + + // Note: Remaining S1 copies, S1s are either SGPR S1(S32) or VCC S1: + // - VCC to VCC Copy: nothing to do here, just a regular copy. + // - VCC to SGPR S1 Copy: Should not exist in a form of COPY instruction(*). + // Note: For 'uniform-in-VCC to SGPR-S1 copy' G_COPY_SCC_VCC is used + // instead. When only available instruction creates VCC result, use of + // UniformInVcc results in creating G_COPY_SCC_VCC. + + // (*)Explanation for 'SGPR S1(uniform) = COPY VCC(divergent)': + // Copy from divergent to uniform register indicates an error in either: + // - Uniformity analysis: Uniform instruction has divergent input. If one of + // the inputs is divergent, instruction should be divergent! + // - RBLegalizer not executing in waterfall loop (missing implementation) + + using namespace MIPatternMatch; + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + + for (auto &MBB : MF) { +for (auto &MI : make_early_inc_range(MBB)) { + + if (MI.getOpcode() == G_TRUNC && isTriviallyDead(MI, MRI)) { +
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RegBankLegalize (PR #112864)
@@ -107,3 +107,183 @@ void IntrinsicLaneMaskAnalyzer::findLCSSAPhi(Register Reg) { S32S64LaneMask.insert(LCSSAPhi.getOperand(0).getReg()); } } + +MachineInstrBuilder AMDGPU::buildReadAnyLaneB32(MachineIRBuilder &B, +const DstOp &SgprDst, +const SrcOp &VgprSrc, +const RegisterBankInfo &RBI) { + auto RFL = B.buildInstr(AMDGPU::G_READANYLANE, {SgprDst}, {VgprSrc}); + Register Dst = RFL->getOperand(0).getReg(); + Register Src = RFL->getOperand(1).getReg(); + MachineRegisterInfo &MRI = *B.getMRI(); + if (!MRI.getRegBankOrNull(Dst)) +MRI.setRegBank(Dst, RBI.getRegBank(SGPRRegBankID)); + if (!MRI.getRegBankOrNull(Src)) +MRI.setRegBank(Src, RBI.getRegBank(VGPRRegBankID)); petar-avramovic wrote: How should it work with regard to the possibility of inserting an illegal sgpr to vgpr copy, and can it fail like the register class version? Or are we looking for something much simpler: no reg bank - set reg bank; same reg bank - do nothing; different reg bank - insert copy. https://github.com/llvm/llvm-project/pull/112864 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
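A minimal sketch of the simpler three-case behavior described above (constrainRegBank is a hypothetical helper name, not an API from this patch; it reuses the createVirtualRegister({&RB, Ty}) overload seen elsewhere in the series, and this variant cannot fail):

```cpp
// Sketch of the "no bank -> set; same bank -> nothing; different bank ->
// copy" policy from the comment above. Hypothetical helper, not from the
// patch.
static Register constrainRegBank(MachineIRBuilder &B, Register Reg,
                                 const RegisterBank &RB) {
  MachineRegisterInfo &MRI = *B.getMRI();
  const RegisterBank *CurRB = MRI.getRegBankOrNull(Reg);
  if (!CurRB) {
    // No bank assigned yet: simply record the requested one.
    MRI.setRegBank(Reg, RB);
    return Reg;
  }
  if (CurRB == &RB)
    return Reg; // Already on the requested bank: nothing to do.
  // Different bank: materialize a cross-bank copy.
  Register NewReg = MRI.createVirtualRegister({&RB, MRI.getType(Reg)});
  B.buildCopy(NewReg, Reg);
  return NewReg;
}
```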
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RegBankLegalize (PR #112864)
@@ -69,6 +81,241 @@ FunctionPass *llvm::createAMDGPURBLegalizePass() { using namespace AMDGPU; +const RegBankLegalizeRules &getRules(const GCNSubtarget &ST, + MachineRegisterInfo &MRI) { + static std::mutex GlobalMutex; + static SmallDenseMap> + CacheForRuleSet; + std::lock_guard Lock(GlobalMutex); petar-avramovic wrote: It never changes; lock_guard+mutex works here. Why switch to call_once? https://github.com/llvm/llvm-project/pull/112864 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
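For reference, a minimal sketch of the two initialization patterns under discussion (RuleSet is a placeholder type; the real code keys the cache by subtarget generation and also refreshes references via refreshRefs on each call):

```cpp
#include <memory>
#include <mutex>

struct RuleSet {}; // Placeholder for RegBankLegalizeRules.

// call_once: fits strict one-time initialization only.
RuleSet &getRulesOnce() {
  static std::once_flag Flag;
  static std::unique_ptr<RuleSet> Rules;
  std::call_once(Flag, [] { Rules = std::make_unique<RuleSet>(); });
  return *Rules;
}

// mutex + lock_guard: also covers the per-call update path (the
// refreshRefs call in the quoted code), which call_once cannot re-run.
RuleSet &getRulesLocked(bool NeedsRefresh) {
  static std::mutex M;
  static std::unique_ptr<RuleSet> Rules;
  std::lock_guard<std::mutex> Lock(M);
  if (!Rules)
    Rules = std::make_unique<RuleSet>();
  else if (NeedsRefresh) {
    // Refresh cached references here (see refreshRefs above).
  }
  return *Rules;
}
```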
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RegBankLegalize (PR #112864)
@@ -0,0 +1,334 @@ +//===-- AMDGPURBLegalizeRules.cpp -===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===--===// +// +/// Definitions of RBLegalize Rules for all opcodes. +/// Implementation of container for all the Rules and search. +/// Fast search for most common case when Rule.Predicate checks LLT and +/// uniformity of register in operand 0. +// +//===--===// + +#include "AMDGPURBLegalizeRules.h" +#include "GCNSubtarget.h" +#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" +#include "llvm/IR/IntrinsicsAMDGPU.h" + +using namespace llvm; +using namespace AMDGPU; + +RegBankLLTMapping::RegBankLLTMapping( +std::initializer_list DstOpMappingList, +std::initializer_list SrcOpMappingList, +LoweringMethodID LoweringMethod) +: DstOpMapping(DstOpMappingList), SrcOpMapping(SrcOpMappingList), + LoweringMethod(LoweringMethod) {} + +PredicateMapping::PredicateMapping( +std::initializer_list OpList, +std::function TestFunc) +: OpUniformityAndTypes(OpList), TestFunc(TestFunc) {} + +bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID, + const MachineUniformityInfo &MUI, + const MachineRegisterInfo &MRI) { + switch (UniID) { + case S1: +return MRI.getType(Reg) == LLT::scalar(1); + case S16: +return MRI.getType(Reg) == LLT::scalar(16); + case S32: +return MRI.getType(Reg) == LLT::scalar(32); + case S64: +return MRI.getType(Reg) == LLT::scalar(64); + case P1: +return MRI.getType(Reg) == LLT::pointer(1, 64); + + case UniS1: +return MRI.getType(Reg) == LLT::scalar(1) && MUI.isUniform(Reg); + case UniS16: +return MRI.getType(Reg) == LLT::scalar(16) && MUI.isUniform(Reg); + case UniS32: +return MRI.getType(Reg) == LLT::scalar(32) && MUI.isUniform(Reg); + case UniS64: +return MRI.getType(Reg) == LLT::scalar(64) && MUI.isUniform(Reg); + + case DivS1: +return MRI.getType(Reg) == LLT::scalar(1) && MUI.isDivergent(Reg); + case DivS32: +return MRI.getType(Reg) == LLT::scalar(32) && MUI.isDivergent(Reg); + case DivS64: +return MRI.getType(Reg) == LLT::scalar(64) && MUI.isDivergent(Reg); + case DivP1: +return MRI.getType(Reg) == LLT::pointer(1, 64) && MUI.isDivergent(Reg); + + case _: +return true; + default: +llvm_unreachable("missing matchUniformityAndLLT\n"); + } +} + +bool PredicateMapping::match(const MachineInstr &MI, + const MachineUniformityInfo &MUI, + const MachineRegisterInfo &MRI) const { + // Check LLT signature. + for (unsigned i = 0; i < OpUniformityAndTypes.size(); ++i) { +if (OpUniformityAndTypes[i] == _) { + if (MI.getOperand(i).isReg() && + MI.getOperand(i).getReg() != AMDGPU::NoRegister) +return false; + continue; +} + +// Remaining IDs check registers. +if (!MI.getOperand(i).isReg()) + return false; + +if (!matchUniformityAndLLT(MI.getOperand(i).getReg(), + OpUniformityAndTypes[i], MUI, MRI)) + return false; + } + + // More complex check. 
+ if (TestFunc) +return TestFunc(MI); + + return true; +} + +SetOfRulesForOpcode::SetOfRulesForOpcode() {} + +SetOfRulesForOpcode::SetOfRulesForOpcode(FastRulesTypes FastTypes) +: FastTypes(FastTypes) {} + +UniformityLLTOpPredicateID LLTToId(LLT Ty) { + if (Ty == LLT::scalar(16)) +return S16; + if (Ty == LLT::scalar(32)) +return S32; + if (Ty == LLT::scalar(64)) +return S64; + if (Ty == LLT::fixed_vector(2, 16)) +return V2S16; + if (Ty == LLT::fixed_vector(2, 32)) +return V2S32; + if (Ty == LLT::fixed_vector(3, 32)) +return V3S32; + if (Ty == LLT::fixed_vector(4, 32)) +return V4S32; + return _; +} + +const RegBankLLTMapping & +SetOfRulesForOpcode::findMappingForMI(const MachineInstr &MI, + const MachineRegisterInfo &MRI, + const MachineUniformityInfo &MUI) const { + // Search in "Fast Rules". + // Note: if fast rules are enabled, RegBankLLTMapping must be added in each + // slot that could "match fast Predicate". If not, Invalid Mapping is + // returned which results in failure, does not search "Slow Rules". + if (FastTypes != No) { petar-avramovic wrote: Renamed to NoFastRules https://github.com/llvm/llvm-project/pull/112864 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
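The commit messages in this thread describe a "Predicate" wrapper class whose instances can be combined with &&, || and ! to write the rules; a minimal sketch of that pattern (simplified, not the exact class from the patch):

```cpp
#include "llvm/CodeGen/MachineInstr.h"
#include <functional>
using namespace llvm;

// Sketch of a composable predicate over MachineInstr, as described in the
// commit message; the class in the patch differs in detail.
class Predicate {
  std::function<bool(const MachineInstr &)> Fn;

public:
  Predicate(std::function<bool(const MachineInstr &)> Fn)
      : Fn(std::move(Fn)) {}

  bool operator()(const MachineInstr &MI) const { return Fn(MI); }

  Predicate operator&&(const Predicate &RHS) const {
    auto L = Fn, R = RHS.Fn;
    return Predicate(
        [L, R](const MachineInstr &MI) { return L(MI) && R(MI); });
  }
  Predicate operator||(const Predicate &RHS) const {
    auto L = Fn, R = RHS.Fn;
    return Predicate(
        [L, R](const MachineInstr &MI) { return L(MI) || R(MI); });
  }
  Predicate operator!() const {
    auto L = Fn;
    return Predicate([L](const MachineInstr &MI) { return !L(MI); });
  }
};

// Usage: rules then read declaratively, e.g. isAlign4 && !isAtomic
// (predicate names here are hypothetical).
```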
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RBLegalize (PR #112864)
https://github.com/petar-avramovic edited https://github.com/llvm/llvm-project/pull/112864 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RegBankLegalize (PR #112864)
https://github.com/petar-avramovic edited https://github.com/llvm/llvm-project/pull/112864 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RegBankLegalize rules for load (PR #112882)
https://github.com/petar-avramovic edited https://github.com/llvm/llvm-project/pull/112882 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RegBankLegalize rules for load (PR #112882)
https://github.com/petar-avramovic edited https://github.com/llvm/llvm-project/pull/112882 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] MachineUniformityAnalysis: Improve isConstantOrUndefValuePhi (PR #112866)
@@ -54,9 +54,28 @@ const MachineBasicBlock *MachineSSAContext::getDefBlock(Register value) const { return F->getRegInfo().getVRegDef(value)->getParent(); } +static bool isUndef(const MachineInstr &MI) { + return MI.getOpcode() == TargetOpcode::G_IMPLICIT_DEF || + MI.getOpcode() == TargetOpcode::IMPLICIT_DEF; +} + +/// MachineInstr equivalent of PHINode::hasConstantOrUndefValue() template <> -bool MachineSSAContext::isConstantOrUndefValuePhi(const MachineInstr &Phi) { - return Phi.isConstantValuePHI(); +bool MachineSSAContext::isConstantOrUndefValuePhi(const MachineInstr &MI) { + if (!MI.isPHI()) +return false; + const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); + Register This = MI.getOperand(0).getReg(); + Register ConstantValue; + for (unsigned i = 1, e = MI.getNumOperands(); i < e; i += 2) { +Register Incoming = MI.getOperand(i).getReg(); +if (Incoming != This && !isUndef(*MRI.getVRegDef(Incoming))) { petar-avramovic wrote: Should we try to do something about PHIs then? Machine uniformity analysis in later stages is not really used. It gives different results than the LLVM-IR and G-MIR versions, mostly because the sgpr register class without LLT stops propagation of divergence/uniformity. https://github.com/llvm/llvm-project/pull/112866 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RegBankLegalize (PR #112864)
@@ -69,6 +81,241 @@ FunctionPass *llvm::createAMDGPURBLegalizePass() { using namespace AMDGPU; +const RegBankLegalizeRules &getRules(const GCNSubtarget &ST, + MachineRegisterInfo &MRI) { + static std::mutex GlobalMutex; + static SmallDenseMap> + CacheForRuleSet; + std::lock_guard Lock(GlobalMutex); + if (!CacheForRuleSet.contains(ST.getGeneration())) { +auto Rules = std::make_unique(ST, MRI); +CacheForRuleSet[ST.getGeneration()] = std::move(Rules); + } else { +CacheForRuleSet[ST.getGeneration()]->refreshRefs(ST, MRI); + } + return *CacheForRuleSet[ST.getGeneration()]; +} + bool AMDGPURBLegalize::runOnMachineFunction(MachineFunction &MF) { + + const GCNSubtarget &ST = MF.getSubtarget(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + + // Setup the instruction builder with CSE. + std::unique_ptr MIRBuilder; petar-avramovic wrote: I guess that we can avoid it if we assume that TPC.isGISelCSEEnabled() is true. Copied from other passes. https://github.com/llvm/llvm-project/pull/112864 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: StandaloneRegBankSelect (PR #112863)
@@ -63,4 +70,189 @@ char &llvm::AMDGPURBSelectID = AMDGPURBSelect::ID; FunctionPass *llvm::createAMDGPURBSelectPass() { return new AMDGPURBSelect(); } -bool AMDGPURBSelect::runOnMachineFunction(MachineFunction &MF) { return true; } +bool shouldRBSelect(MachineInstr &MI) { + if (isTargetSpecificOpcode(MI.getOpcode()) && !MI.isPreISelOpcode()) +return false; + + if (MI.getOpcode() == AMDGPU::PHI || MI.getOpcode() == AMDGPU::IMPLICIT_DEF) +return false; + + if (MI.isInlineAsm()) +return false; + + return true; +} + +void setRB(MachineInstr &MI, MachineOperand &DefOP, MachineIRBuilder B, + MachineRegisterInfo &MRI, const RegisterBank &RB) { + Register Reg = DefOP.getReg(); + // Register that already has Register class got it during pre-inst selection + // of another instruction. Maybe cross bank copy was required so we insert a + // copy trat can be removed later. This simplifies post-rb-legalize artifact + // combiner and avoids need to special case some patterns. + if (MRI.getRegClassOrNull(Reg)) { +LLT Ty = MRI.getType(Reg); +Register NewReg = MRI.createVirtualRegister({&RB, Ty}); +DefOP.setReg(NewReg); + +auto &MBB = *MI.getParent(); +B.setInsertPt(MBB, MI.isPHI() ? MBB.getFirstNonPHI() + : std::next(MI.getIterator())); +B.buildCopy(Reg, NewReg); + +// The problem was discoverd for uniform S1 that was used as both +// lane mask(vcc) and regular sgpr S1. +// - lane-mask(vcc) use was by si_if, this use is divergent and requires +// non-trivial sgpr-S1-to-vcc copy. But pre-inst-selection of si_if sets +// sreg_64_xexec(S1) on def of uniform S1 making it lane-mask. +// - the regular regular sgpr S1(uniform) instruction is now broken since +// it uses sreg_64_xexec(S1) which is divergent. + +// "Clear" reg classes from uses on generic instructions and but register +// banks instead. +for (auto &UseMI : MRI.use_instructions(Reg)) { + if (shouldRBSelect(UseMI)) { +for (MachineOperand &Op : UseMI.operands()) { + if (Op.isReg() && Op.isUse() && Op.getReg() == Reg) +Op.setReg(NewReg); +} + } +} + + } else { +MRI.setRegBank(Reg, RB); + } +} + +void setRBUse(MachineInstr &MI, MachineOperand &UseOP, MachineIRBuilder B, + MachineRegisterInfo &MRI, const RegisterBank &RB) { + Register Reg = UseOP.getReg(); + + LLT Ty = MRI.getType(Reg); + Register NewReg = MRI.createVirtualRegister({&RB, Ty}); + UseOP.setReg(NewReg); + + if (MI.isPHI()) { +auto DefMI = MRI.getVRegDef(Reg)->getIterator(); +MachineBasicBlock *DefMBB = DefMI->getParent(); +B.setInsertPt(*DefMBB, DefMBB->SkipPHIsAndLabels(std::next(DefMI))); + } else { +B.setInstr(MI); + } + + B.buildCopy(NewReg, Reg); +} + +// Temporal divergence copy: COPY to vgpr with implicit use of $exec inside of +// the cycle +// Note: uniformity analysis does not consider that registers with vgpr def are +// divergent (you can have uniform value in vgpr). 
+// - TODO: implicit use of $exec could be implemented as indicator that +// instruction is divergent +bool isTemporalDivergenceCopy(Register Reg, MachineRegisterInfo &MRI) { + MachineInstr *MI = MRI.getVRegDef(Reg); + if (MI->getOpcode() == AMDGPU::COPY) { +for (auto Op : MI->implicit_operands()) { + if (!Op.isReg()) +continue; + Register Reg = Op.getReg(); + if (Reg == AMDGPU::EXEC) { +return true; + } +} + } + + return false; +} + +Register getVReg(MachineOperand &Op) { + if (!Op.isReg()) +return 0; + + Register Reg = Op.getReg(); + if (!Reg.isVirtual()) +return 0; + + return Reg; +} + +bool AMDGPURBSelect::runOnMachineFunction(MachineFunction &MF) { + MachineUniformityInfo &MUI = + getAnalysis().getUniformityInfo(); + AMDGPU::IntrinsicLaneMaskAnalyzer ILMA(MF); + MachineRegisterInfo &MRI = MF.getRegInfo(); + const RegisterBankInfo &RBI = *MF.getSubtarget().getRegBankInfo(); + + MachineIRBuilder B(MF); petar-avramovic wrote: Do we need a CSE builder here? This pass only sets register banks and in some cases builds copies, but I don't think we need CSE for them. https://github.com/llvm/llvm-project/pull/112863 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: StandaloneRegBankSelect (PR #112863)
https://github.com/petar-avramovic edited https://github.com/llvm/llvm-project/pull/112863 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RBSelect (PR #112863)
https://github.com/petar-avramovic edited https://github.com/llvm/llvm-project/pull/112863 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: StandaloneRegBankSelect (PR #112863)
@@ -63,4 +70,189 @@ char &llvm::AMDGPURBSelectID = AMDGPURBSelect::ID;
 FunctionPass *llvm::createAMDGPURBSelectPass() { return new AMDGPURBSelect(); }
-bool AMDGPURBSelect::runOnMachineFunction(MachineFunction &MF) { return true; }
+bool shouldRBSelect(MachineInstr &MI) {
+  if (isTargetSpecificOpcode(MI.getOpcode()) && !MI.isPreISelOpcode())
+    return false;
+
+  if (MI.getOpcode() == AMDGPU::PHI || MI.getOpcode() == AMDGPU::IMPLICIT_DEF)
+    return false;

petar-avramovic wrote:

I copied that from the existing regbankselect. MI.isPreISelOpcode() || MI.isCopy() also works.

https://github.com/llvm/llvm-project/pull/112863
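For reference, that alternative as a complete predicate; this is the form the RegBankSelectHelper later in this thread ends up using:

bool shouldRBSelect(MachineInstr &MI) {
  // Only generic (pre-instruction-selection) instructions and plain copies
  // need a register bank assigned by this pass.
  return MI.isPreISelOpcode() || MI.isCopy();
}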
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RegBankLegalize (PR #112864)
@@ -69,6 +81,241 @@ FunctionPass *llvm::createAMDGPURBLegalizePass() {
 using namespace AMDGPU;
 
+const RegBankLegalizeRules &getRules(const GCNSubtarget &ST,
+                                     MachineRegisterInfo &MRI) {
+  static std::mutex GlobalMutex;
+  static SmallDenseMap<unsigned, std::unique_ptr<RegBankLegalizeRules>>
+      CacheForRuleSet;
+  std::lock_guard Lock(GlobalMutex);
+  if (!CacheForRuleSet.contains(ST.getGeneration())) {
+    auto Rules = std::make_unique<RegBankLegalizeRules>(ST, MRI);
+    CacheForRuleSet[ST.getGeneration()] = std::move(Rules);
+  } else {
+    CacheForRuleSet[ST.getGeneration()]->refreshRefs(ST, MRI);
+  }
+  return *CacheForRuleSet[ST.getGeneration()];
+}
+
 bool AMDGPURBLegalize::runOnMachineFunction(MachineFunction &MF) {
+
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+
+  // Set up the instruction builder with CSE.
+  std::unique_ptr<MachineIRBuilder> MIRBuilder;
+  const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
+  GISelCSEAnalysisWrapper &Wrapper =
+      getAnalysis<GISelCSEAnalysisWrapperPass>().getCSEWrapper();
+  GISelCSEInfo *CSEInfo = nullptr;
+  GISelObserverWrapper Observer;
+
+  if (TPC.isGISelCSEEnabled()) {
+    MIRBuilder = std::make_unique<CSEMIRBuilder>();
+    CSEInfo = &Wrapper.get(TPC.getCSEConfig());
+    MIRBuilder->setCSEInfo(CSEInfo);
+    Observer.addObserver(CSEInfo);
+    MIRBuilder->setChangeObserver(Observer);
+  } else {
+    MIRBuilder = std::make_unique<MachineIRBuilder>();
+  }
+  MIRBuilder->setMF(MF);
+
+  RAIIDelegateInstaller DelegateInstaller(MF, &Observer);
+  RAIIMFObserverInstaller MFObserverInstaller(MF, Observer);
+
+  const MachineUniformityInfo &MUI =
+      getAnalysis<MachineUniformityAnalysisPass>().getUniformityInfo();
+  const RegisterBankInfo &RBI = *MF.getSubtarget().getRegBankInfo();
+
+  // RegBankLegalizeRules is initialized with assigning sets of IDs to opcodes.
+  const RegBankLegalizeRules &RBLRules = getRules(ST, MRI);
+
+  // Logic that does legalization based on IDs assigned to Opcode.
+  RegBankLegalizeHelper RBLegalizeHelper(*MIRBuilder, MRI, MUI, RBI, RBLRules);
+
+  SmallVector<MachineInstr *> AllInst;
+
+  for (auto &MBB : MF) {
+    for (MachineInstr &MI : MBB) {
+      AllInst.push_back(&MI);
+    }
+  }
+
+  for (auto &MI : AllInst) {
+    if (!MI->isPreISelOpcode())
+      continue;
+
+    unsigned Opc = MI->getOpcode();
+
+    // Insert point for use operands needs some calculation.
+    if (Opc == G_PHI) {
+      RBLegalizeHelper.applyMappingPHI(*MI);
+      continue;
+    }
+
+    // Opcodes that support pretty much all combinations of reg banks and LLTs
+    // (except S1). There is no point in writing rules for them.
+    if (Opc == G_BUILD_VECTOR || Opc == G_UNMERGE_VALUES ||
+        Opc == G_MERGE_VALUES) {
+      RBLegalizeHelper.applyMappingTrivial(*MI);
+      continue;
+    }
+
+    // Opcodes that also support S1. S1 rules are in RegBankLegalizeRules.
+    // Remaining reg bank and LLT combinations are trivially accepted.
+    if ((Opc == G_CONSTANT || Opc == G_FCONSTANT || Opc == G_IMPLICIT_DEF) &&
+        !isS1(MI->getOperand(0).getReg(), MRI)) {
+      assert(isSgprRB(MI->getOperand(0).getReg(), MRI));
+      continue;
+    }
+
+    if (!RBLegalizeHelper.findRuleAndApplyMapping(*MI)) {
+      MI->dump();
+      llvm_unreachable("failed to match any of the rules");
+    }
+  }
+
+  LLT S1 = LLT::scalar(1);
+  LLT S16 = LLT::scalar(16);
+  LLT S32 = LLT::scalar(32);
+  LLT S64 = LLT::scalar(64);
+
+  // SGPR S1 clean-up combines:
+  // - SGPR S1(S32) to SGPR S1(S32) Copy: anyext + trunc combine.
+  //   In RBLegalize 'S1 Dst' are legalized into S32 as 'S1Dst = Trunc S32Dst'
+  //   and 'S1 Src' into 'S32Src = Anyext S1Src'.
+  //   S1 Truncs and Anyexts that come from the legalizer will also be cleaned
+  //   up. Note: they can have non-S32 types, e.g. S16 = Anyext S1 or
+  //   S1 = Trunc S64.
+  // - SGPR S1(S32) to VCC Copy: G_COPY_VCC_SCC combine.
+  //   A divergent instruction uses SGPR S1 as an input that should be a lane
+  //   mask (VCC). Legalizing this use creates an SGPR S1(S32) to VCC Copy.
+
+  // Note: for the remaining S1 copies, S1s are either SGPR S1(S32) or VCC S1:
+  // - VCC to VCC Copy: nothing to do here, just a regular copy.
+  // - VCC to SGPR S1 Copy: should not exist in the form of a COPY
+  //   instruction(*).
+  //   Note: for a 'uniform-in-VCC to SGPR-S1 copy', G_COPY_SCC_VCC is used
+  //   instead. When the only available instruction creates a VCC result, use
+  //   of UniformInVcc results in creating G_COPY_SCC_VCC.
+
+  // (*) Explanation for 'SGPR S1(uniform) = COPY VCC(divergent)':
+  // A copy from a divergent to a uniform register indicates an error in
+  // either:
+  // - Uniformity analysis: a uniform instruction has a divergent input. If
+  //   one of the inputs is divergent, the instruction should be divergent!
+  // - The RBLegalizer not executing in a waterfall loop (missing
+  //   implementation).
+
+  using namespace MIPatternMatch;
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+
+  for (auto &MBB : MF) {
+    for (auto &MI : make_early_inc_range(MBB)) {
+
+      if (MI.getOpcode() == G_TRUNC && isTriviallyDead(MI, MRI)) {
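The quoted hunk cuts off inside the clean-up loop. For illustration, a minimal sketch of the anyext + trunc combine the comments describe, using MIPatternMatch; the helper name and the type-equality guard are assumptions, not the committed code:

static bool combineAnyExtTrunc(MachineInstr &MI, MachineRegisterInfo &MRI,
                               MachineIRBuilder &B) {
  // Matches Dst = G_TRUNC (G_ANYEXT Src) and rewrites it as a plain copy;
  // the now-dead G_ANYEXT can be erased afterwards.
  Register Dst = MI.getOperand(0).getReg();
  Register Src;
  if (!mi_match(Dst, MRI, m_GTrunc(m_GAnyExt(m_Reg(Src)))))
    return false;
  // Only fold when the round trip is type-preserving (the S1-as-S32 copies
  // described above); cases like S16 = Anyext S1 still need a real trunc.
  if (MRI.getType(Dst) != MRI.getType(Src))
    return false;
  B.setInstrAndDebugLoc(MI);
  B.buildCopy(Dst, Src);
  MI.eraseFromParent();
  return true;
}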
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: AMDGPURegBankLegalize (PR #112864)
petar-avramovic wrote:

Rebase for new-reg-bank-select taking over AMDGPURegBankSelect.
TODO: helper changes after the DstOp improvements, as it now accepts bank+LLT.

https://github.com/llvm/llvm-project/pull/112864
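The DstOp note refers to letting the builder take a {register bank, LLT} destination directly instead of pre-creating the virtual register. Both shapes appear in the patches below; side by side:

// Before: create the bank-annotated vreg by hand, then build into it.
Register PartLoad = MRI.createVirtualRegister({DstRB, PartTy});
B.buildLoad(PartLoad, BasePlusOffset, *OffsetMMO);

// After: DstOp carries bank + LLT and the builder creates the vreg.
auto LoadPart = B.buildLoad({DstRB, PartTy}, BasePlusOffset, *OffsetMMO);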
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Add skeletons for new register bank select passes (PR #112862)
@@ -39,6 +39,8 @@ FunctionPass *createSIFoldOperandsLegacyPass();
 FunctionPass *createSIPeepholeSDWALegacyPass();
 FunctionPass *createSILowerI1CopiesLegacyPass();
 FunctionPass *createAMDGPUGlobalISelDivergenceLoweringPass();
+FunctionPass *createAMDGPURBSelectPass();

petar-avramovic wrote:

Reverting the default reg-bank-select to RegBankSelect; the new pass will take over AMDGPURegBankSelect.

https://github.com/llvm/llvm-project/pull/112862
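For context, creator functions like this are usually wired into the target's pass configuration. A sketch under the assumption that AMDGPU overrides the GlobalISel hook for this step (the exact wiring is not part of the quoted hunk):

bool GCNPassConfig::addRegBankSelect() {
  // Assumed wiring: run the new pass where register bank selection goes in
  // the GlobalISel pipeline; returning false reports success to the base.
  addPass(createAMDGPURBSelectPass());
  return false;
}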
[llvm-branch-commits] [llvm] MachineUniformityAnalysis: Improve isConstantOrUndefValuePhi (PR #112866)
https://github.com/petar-avramovic updated https://github.com/llvm/llvm-project/pull/112866 >From 610eeeacfb418948bf36a8a0b4eee40c4ada24aa Mon Sep 17 00:00:00 2001 From: Petar Avramovic Date: Mon, 28 Oct 2024 15:15:10 +0100 Subject: [PATCH] MachineUniformityAnalysis: Improve isConstantOrUndefValuePhi Change existing code for G_PHI to match what LLVM-IR version is doing via PHINode::hasConstantOrUndefValue. This is not safe for regular PHI since it may appear with an undef operand and getVRegDef can fail. Most notably this improves number of values that can be allocated to sgpr in AMDGPURegBankSelect. Common case here are phis that appear in structurize-cfg lowering for cycles with multiple exits: Undef incoming value is coming from block that reached cycle exit condition, if other incoming is uniform keep the phi uniform despite the fact it is joining values from pair of blocks that are entered via divergent condition branch. --- llvm/lib/CodeGen/MachineSSAContext.cpp| 28 ++- .../AMDGPU/MIR/hidden-diverge-gmir.mir| 28 +++ .../AMDGPU/MIR/hidden-loop-diverge.mir| 4 +- .../AMDGPU/MIR/uses-value-from-cycle.mir | 8 +- .../GlobalISel/divergence-structurizer.mir| 80 -- .../regbankselect-mui-regbanklegalize.mir | 70 .../regbankselect-mui-regbankselect.mir | 18 ++-- .../AMDGPU/GlobalISel/regbankselect-mui.ll| 84 ++- .../AMDGPU/GlobalISel/regbankselect-mui.mir | 52 ++-- 9 files changed, 194 insertions(+), 178 deletions(-) diff --git a/llvm/lib/CodeGen/MachineSSAContext.cpp b/llvm/lib/CodeGen/MachineSSAContext.cpp index e384187b6e8593..359d4a11a69060 100644 --- a/llvm/lib/CodeGen/MachineSSAContext.cpp +++ b/llvm/lib/CodeGen/MachineSSAContext.cpp @@ -18,6 +18,7 @@ #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -54,9 +55,34 @@ const MachineBasicBlock *MachineSSAContext::getDefBlock(Register value) const { return F->getRegInfo().getVRegDef(value)->getParent(); } +static bool isUndef(const MachineInstr &MI) { + return MI.getOpcode() == TargetOpcode::G_IMPLICIT_DEF || + MI.getOpcode() == TargetOpcode::IMPLICIT_DEF; +} + +/// MachineInstr equivalent of PHINode::hasConstantOrUndefValue() for G_PHI. template <> bool MachineSSAContext::isConstantOrUndefValuePhi(const MachineInstr &Phi) { - return Phi.isConstantValuePHI(); + if (!Phi.isPHI()) +return false; + + // In later passes PHI may appear with an undef operand, getVRegDef can fail. + if (Phi.getOpcode() == TargetOpcode::PHI) +return Phi.isConstantValuePHI(); + + // For G_PHI we do equivalent of PHINode::hasConstantOrUndefValue(). 
+ const MachineRegisterInfo &MRI = Phi.getMF()->getRegInfo(); + Register This = Phi.getOperand(0).getReg(); + Register ConstantValue; + for (unsigned i = 1, e = Phi.getNumOperands(); i < e; i += 2) { +Register Incoming = Phi.getOperand(i).getReg(); +if (Incoming != This && !isUndef(*MRI.getVRegDef(Incoming))) { + if (ConstantValue && ConstantValue != Incoming) +return false; + ConstantValue = Incoming; +} + } + return true; } template <> diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir index ce00edf3363f77..9694a340b5e906 100644 --- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir +++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir @@ -1,24 +1,24 @@ # RUN: llc -mtriple=amdgcn-- -run-pass=print-machine-uniformity -o - %s 2>&1 | FileCheck %s # CHECK-LABEL: MachineUniformityInfo for function: hidden_diverge # CHECK-LABEL: BLOCK bb.0 -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.x) -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_ICMP intpred(slt) -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_XOR %{{[0-9]*}}:_, %{{[0-9]*}}:_ -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if) -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if) -# CHECK: DIVERGENT: G_BRCOND %{{[0-9]*}}:_(s1), %bb.1 -# CHECK: DIVERGENT: G_BR %bb.2 +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.x) +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_ICMP intpred(slt) +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_XOR %{{[0-9]*}}:_, %{{[0-9]*}}:_ +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if) +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_
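Concretely, the phi shape the commit message describes looks like this in MIR (illustrative register numbers; %9 is the undef coming from the block that reached the cycle-exit condition):

  bb.3:
    ; structurize-cfg created %9:_(s32) = G_IMPLICIT_DEF on the exited path
    %7:_(s32) = G_PHI %4(s32), %bb.1, %9(s32), %bb.2

Every non-undef incoming value is the single value %4, so the new check mirrors PHINode::hasConstantOrUndefValue and keeps %7 uniform whenever %4 is uniform.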
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Add skeletons for new register bank select passes (PR #112862)
https://github.com/petar-avramovic edited https://github.com/llvm/llvm-project/pull/112862
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RegBankLegalize rules for load (PR #112882)
https://github.com/petar-avramovic updated https://github.com/llvm/llvm-project/pull/112882 >From 619288bc69a9d1290a2a5c521582b0204e0608b6 Mon Sep 17 00:00:00 2001 From: Petar Avramovic Date: Mon, 28 Oct 2024 15:09:50 +0100 Subject: [PATCH] AMDGPU/GlobalISel: RegBankLegalize rules for load Add IDs for bit width that cover multiple LLTs: B32 B64 etc. "Predicate" wrapper class for bool predicate functions used to write pretty rules. Predicates can be combined using &&, || and !. Lowering for splitting and widening loads. Write rules for loads to not change existing mir tests from old regbankselect. --- .../AMDGPU/AMDGPURegBankLegalizeHelper.cpp| 299 +++- .../AMDGPU/AMDGPURegBankLegalizeHelper.h | 5 + .../AMDGPU/AMDGPURegBankLegalizeRules.cpp | 309 - .../AMDGPU/AMDGPURegBankLegalizeRules.h | 65 +++- .../AMDGPU/GlobalISel/regbankselect-load.mir | 320 +++--- .../GlobalISel/regbankselect-zextload.mir | 9 +- 6 files changed, 941 insertions(+), 66 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp index 7220773b94792d..3996167633a221 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp @@ -37,6 +37,97 @@ bool RegBankLegalizeHelper::findRuleAndApplyMapping(MachineInstr &MI) { return true; } +void RegBankLegalizeHelper::splitLoad(MachineInstr &MI, + ArrayRef LLTBreakdown, LLT MergeTy) { + MachineFunction &MF = B.getMF(); + assert(MI.getNumMemOperands() == 1); + MachineMemOperand &BaseMMO = **MI.memoperands_begin(); + Register Dst = MI.getOperand(0).getReg(); + const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst); + Register BasePtrReg = MI.getOperand(1).getReg(); + LLT PtrTy = MRI.getType(BasePtrReg); + const RegisterBank *PtrRB = MRI.getRegBankOrNull(BasePtrReg); + LLT OffsetTy = LLT::scalar(PtrTy.getSizeInBits()); + SmallVector LoadPartRegs; + + unsigned ByteOffset = 0; + for (LLT PartTy : LLTBreakdown) { +Register BasePtrPlusOffsetReg; +if (ByteOffset == 0) { + BasePtrPlusOffsetReg = BasePtrReg; +} else { + BasePtrPlusOffsetReg = MRI.createVirtualRegister({PtrRB, PtrTy}); + Register OffsetReg = MRI.createVirtualRegister({PtrRB, OffsetTy}); + B.buildConstant(OffsetReg, ByteOffset); + B.buildPtrAdd(BasePtrPlusOffsetReg, BasePtrReg, OffsetReg); +} +MachineMemOperand *BasePtrPlusOffsetMMO = +MF.getMachineMemOperand(&BaseMMO, ByteOffset, PartTy); +Register PartLoad = MRI.createVirtualRegister({DstRB, PartTy}); +B.buildLoad(PartLoad, BasePtrPlusOffsetReg, *BasePtrPlusOffsetMMO); +LoadPartRegs.push_back(PartLoad); +ByteOffset += PartTy.getSizeInBytes(); + } + + if (!MergeTy.isValid()) { +// Loads are of same size, concat or merge them together. +B.buildMergeLikeInstr(Dst, LoadPartRegs); + } else { +// Load(s) are not all of same size, need to unmerge them to smaller pieces +// of MergeTy type, then merge them all together in Dst. 
+SmallVector MergeTyParts; +for (Register Reg : LoadPartRegs) { + if (MRI.getType(Reg) == MergeTy) { +MergeTyParts.push_back(Reg); + } else { +auto Unmerge = B.buildUnmerge(MergeTy, Reg); +for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i) { + Register UnmergeReg = Unmerge->getOperand(i).getReg(); + MRI.setRegBank(UnmergeReg, *DstRB); + MergeTyParts.push_back(UnmergeReg); +} + } +} +B.buildMergeLikeInstr(Dst, MergeTyParts); + } + MI.eraseFromParent(); +} + +void RegBankLegalizeHelper::widenLoad(MachineInstr &MI, LLT WideTy, + LLT MergeTy) { + MachineFunction &MF = B.getMF(); + assert(MI.getNumMemOperands() == 1); + MachineMemOperand &BaseMMO = **MI.memoperands_begin(); + Register Dst = MI.getOperand(0).getReg(); + const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst); + Register BasePtrReg = MI.getOperand(1).getReg(); + + Register BasePtrPlusOffsetReg; + BasePtrPlusOffsetReg = BasePtrReg; + + MachineMemOperand *BasePtrPlusOffsetMMO = + MF.getMachineMemOperand(&BaseMMO, 0, WideTy); + Register WideLoad = MRI.createVirtualRegister({DstRB, WideTy}); + B.buildLoad(WideLoad, BasePtrPlusOffsetReg, *BasePtrPlusOffsetMMO); + + if (WideTy.isScalar()) { +B.buildTrunc(Dst, WideLoad); + } else { +SmallVector MergeTyParts; +unsigned NumEltsMerge = +MRI.getType(Dst).getSizeInBits() / MergeTy.getSizeInBits(); +auto Unmerge = B.buildUnmerge(MergeTy, WideLoad); +for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i) { + Register UnmergeReg = Unmerge->getOperand(i).getReg(); + MRI.setRegBank(UnmergeReg, *DstRB); + if (i < NumEltsMerge) +MergeTyParts.push_back(UnmergeReg); +} +B.buildMergeLikeInstr(Dst, MergeTyP
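The quoted diff shows the load lowerings but not the "Predicate" wrapper itself. A minimal sketch of the idea the commit message describes, assuming rules are keyed by callables over MachineInstr (names illustrative, not the in-tree class):

class Predicate {
  std::function<bool(const MachineInstr &)> Fn;

public:
  Predicate(std::function<bool(const MachineInstr &)> Fn) : Fn(std::move(Fn)) {}
  bool operator()(const MachineInstr &MI) const { return Fn(MI); }

  // Combining predicates keeps the rule tables readable, e.g. writing
  // isAlign4 && isUniformLoad or !isAtomic directly in a rule.
  Predicate operator&&(const Predicate &RHS) const {
    return Predicate([LHS = *this, RHS](const MachineInstr &MI) {
      return LHS(MI) && RHS(MI);
    });
  }
  Predicate operator||(const Predicate &RHS) const {
    return Predicate([LHS = *this, RHS](const MachineInstr &MI) {
      return LHS(MI) || RHS(MI);
    });
  }
  Predicate operator!() const {
    return Predicate([P = *this](const MachineInstr &MI) { return !P(MI); });
  }
};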
[llvm-branch-commits] [llvm] MachineUniformityAnalysis: Improve isConstantOrUndefValuePhi (PR #112866)
https://github.com/petar-avramovic edited https://github.com/llvm/llvm-project/pull/112866
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RegBankLegalize rules for load (PR #112882)
https://github.com/petar-avramovic updated https://github.com/llvm/llvm-project/pull/112882
[llvm-branch-commits] [llvm] MachineUniformityAnalysis: Improve isConstantOrUndefValuePhi (PR #112866)
https://github.com/petar-avramovic updated https://github.com/llvm/llvm-project/pull/112866
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: AMDGPURegBankLegalize (PR #112864)
https://github.com/petar-avramovic edited https://github.com/llvm/llvm-project/pull/112864
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: AMDGPURegBankSelect (PR #112863)
https://github.com/petar-avramovic edited https://github.com/llvm/llvm-project/pull/112863
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Add skeletons for new register bank select passes (PR #112862)
https://github.com/petar-avramovic edited https://github.com/llvm/llvm-project/pull/112862
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: AMDGPURegBankLegalize (PR #112864)
https://github.com/petar-avramovic edited https://github.com/llvm/llvm-project/pull/112864
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: AMDGPURegBankSelect (PR #112863)
https://github.com/petar-avramovic edited https://github.com/llvm/llvm-project/pull/112863
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: AMDGPURegBankLegalize (PR #112864)
@@ -69,6 +81,241 @@ FunctionPass *llvm::createAMDGPURBLegalizePass() {
 using namespace AMDGPU;
 
+const RegBankLegalizeRules &getRules(const GCNSubtarget &ST,
+                                     MachineRegisterInfo &MRI) {
+  static std::mutex GlobalMutex;
+  static SmallDenseMap<unsigned, std::unique_ptr<RegBankLegalizeRules>>
+      CacheForRuleSet;
+  std::lock_guard Lock(GlobalMutex);
+  if (!CacheForRuleSet.contains(ST.getGeneration())) {
+    auto Rules = std::make_unique<RegBankLegalizeRules>(ST, MRI);
+    CacheForRuleSet[ST.getGeneration()] = std::move(Rules);
+  } else {
+    CacheForRuleSet[ST.getGeneration()]->refreshRefs(ST, MRI);
+  }
+  return *CacheForRuleSet[ST.getGeneration()];
+}
+
 bool AMDGPURBLegalize::runOnMachineFunction(MachineFunction &MF) {
+
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+
+  // Set up the instruction builder with CSE.
+  std::unique_ptr<MachineIRBuilder> MIRBuilder;

petar-avramovic wrote:

Simplified to always use the CSE builder.

https://github.com/llvm/llvm-project/pull/112864
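A sketch of that simplification, assuming the Wrapper, TPC, and Observer setup from the hunk above stays the same (illustrative, not the committed diff):

  // Construct the CSE builder unconditionally instead of branching on
  // TPC.isGISelCSEEnabled().
  auto MIRBuilder = std::make_unique<CSEMIRBuilder>();
  GISelCSEInfo *CSEInfo = &Wrapper.get(TPC.getCSEConfig());
  MIRBuilder->setCSEInfo(CSEInfo);
  Observer.addObserver(CSEInfo);
  MIRBuilder->setChangeObserver(Observer);
  MIRBuilder->setMF(MF);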
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RegBankLegalize rules for load (PR #112882)
https://github.com/petar-avramovic updated https://github.com/llvm/llvm-project/pull/112882
[llvm-branch-commits] [llvm] MachineUniformityAnalysis: Improve isConstantOrUndefValuePhi (PR #112866)
https://github.com/petar-avramovic updated https://github.com/llvm/llvm-project/pull/112866
[llvm-branch-commits] [llvm] MachineUniformityAnalysis: Improve isConstantOrUndefValuePhi (PR #112866)
https://github.com/petar-avramovic updated https://github.com/llvm/llvm-project/pull/112866 >From a269e8da1b872b3f2390037a594757940cf8369b Mon Sep 17 00:00:00 2001 From: Petar Avramovic Date: Thu, 31 Oct 2024 14:10:57 +0100 Subject: [PATCH] MachineUniformityAnalysis: Improve isConstantOrUndefValuePhi Change existing code for G_PHI to match what LLVM-IR version is doing via PHINode::hasConstantOrUndefValue. This is not safe for regular PHI since it may appear with an undef operand and getVRegDef can fail. Most notably this improves number of values that can be allocated to sgpr register bank in AMDGPURegBankSelect. Common case here are phis that appear in structurize-cfg lowering for cycles with multiple exits: Undef incoming value is coming from block that reached cycle exit condition, if other incoming is uniform keep the phi uniform despite the fact it is joining values from pair of blocks that are entered via divergent condition branch. --- llvm/lib/CodeGen/MachineSSAContext.cpp| 27 +- .../AMDGPU/MIR/hidden-diverge-gmir.mir| 28 +++ .../AMDGPU/MIR/hidden-loop-diverge.mir| 4 +- .../AMDGPU/MIR/uses-value-from-cycle.mir | 8 +- .../GlobalISel/divergence-structurizer.mir| 80 -- .../regbankselect-mui-regbanklegalize.mir | 69 --- .../regbankselect-mui-regbankselect.mir | 18 ++-- .../AMDGPU/GlobalISel/regbankselect-mui.ll| 84 ++- .../AMDGPU/GlobalISel/regbankselect-mui.mir | 51 ++- 9 files changed, 191 insertions(+), 178 deletions(-) diff --git a/llvm/lib/CodeGen/MachineSSAContext.cpp b/llvm/lib/CodeGen/MachineSSAContext.cpp index e384187b6e8593..8e13c0916dd9e1 100644 --- a/llvm/lib/CodeGen/MachineSSAContext.cpp +++ b/llvm/lib/CodeGen/MachineSSAContext.cpp @@ -54,9 +54,34 @@ const MachineBasicBlock *MachineSSAContext::getDefBlock(Register value) const { return F->getRegInfo().getVRegDef(value)->getParent(); } +static bool isUndef(const MachineInstr &MI) { + return MI.getOpcode() == TargetOpcode::G_IMPLICIT_DEF || + MI.getOpcode() == TargetOpcode::IMPLICIT_DEF; +} + +/// MachineInstr equivalent of PHINode::hasConstantOrUndefValue() for G_PHI. template <> bool MachineSSAContext::isConstantOrUndefValuePhi(const MachineInstr &Phi) { - return Phi.isConstantValuePHI(); + if (!Phi.isPHI()) +return false; + + // In later passes PHI may appear with an undef operand, getVRegDef can fail. + if (Phi.getOpcode() == TargetOpcode::PHI) +return Phi.isConstantValuePHI(); + + // For G_PHI we do equivalent of PHINode::hasConstantOrUndefValue(). 
+ const MachineRegisterInfo &MRI = Phi.getMF()->getRegInfo(); + Register This = Phi.getOperand(0).getReg(); + Register ConstantValue; + for (unsigned i = 1, e = Phi.getNumOperands(); i < e; i += 2) { +Register Incoming = Phi.getOperand(i).getReg(); +if (Incoming != This && !isUndef(*MRI.getVRegDef(Incoming))) { + if (ConstantValue && ConstantValue != Incoming) +return false; + ConstantValue = Incoming; +} + } + return true; } template <> diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir index ce00edf3363f77..9694a340b5e906 100644 --- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir +++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir @@ -1,24 +1,24 @@ # RUN: llc -mtriple=amdgcn-- -run-pass=print-machine-uniformity -o - %s 2>&1 | FileCheck %s # CHECK-LABEL: MachineUniformityInfo for function: hidden_diverge # CHECK-LABEL: BLOCK bb.0 -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.x) -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_ICMP intpred(slt) -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_XOR %{{[0-9]*}}:_, %{{[0-9]*}}:_ -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if) -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if) -# CHECK: DIVERGENT: G_BRCOND %{{[0-9]*}}:_(s1), %bb.1 -# CHECK: DIVERGENT: G_BR %bb.2 +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.x) +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_ICMP intpred(slt) +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_XOR %{{[0-9]*}}:_, %{{[0-9]*}}:_ +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if) +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if) +# CHECK: DIVERGENT: G_BRCOND %{{[0-9]*}}:_(s1), %bb.1 +# CHECK: DIVERGENT: G_BR %bb.2 # CHECK-LABEL: BLOCK bb.1 # CHECK-LABEL: BLOCK bb.2 -# CHECK: D
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: AMDGPURegBankSelect (PR #112863)
@@ -66,9 +73,215 @@ FunctionPass *llvm::createAMDGPURegBankSelectPass() {
   return new AMDGPURegBankSelect();
 }
 
+class RegBankSelectHelper {
+  MachineIRBuilder &B;
+  MachineRegisterInfo &MRI;
+  AMDGPU::IntrinsicLaneMaskAnalyzer &ILMA;
+  const MachineUniformityInfo &MUI;
+  const SIRegisterInfo &TRI;
+  const RegisterBank *SgprRB;
+  const RegisterBank *VgprRB;
+  const RegisterBank *VccRB;
+
+public:
+  RegBankSelectHelper(MachineIRBuilder &B,
+                      AMDGPU::IntrinsicLaneMaskAnalyzer &ILMA,
+                      const MachineUniformityInfo &MUI,
+                      const SIRegisterInfo &TRI, const RegisterBankInfo &RBI)
+      : B(B), MRI(*B.getMRI()), ILMA(ILMA), MUI(MUI), TRI(TRI),
+        SgprRB(&RBI.getRegBank(AMDGPU::SGPRRegBankID)),
+        VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)),
+        VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {}
+
+  bool shouldRegBankSelect(MachineInstr &MI) {
+    return MI.isPreISelOpcode() || MI.isCopy();
+  }
+
+  // Temporal divergence copy: COPY to vgpr with implicit use of $exec inside
+  // of the cycle.
+  // Note: uniformity analysis does not consider that registers with vgpr def
+  // are divergent (you can have a uniform value in vgpr).
+  // - TODO: implicit use of $exec could be implemented as an indicator that
+  //   the instruction is divergent
+  bool isTemporalDivergenceCopy(Register Reg) {
+    MachineInstr *MI = MRI.getVRegDef(Reg);
+    if (!MI->isCopy())
+      return false;
+
+    for (auto Op : MI->implicit_operands()) {
+      if (!Op.isReg())
+        continue;
+
+      if (Op.getReg() == TRI.getExec()) {
+        return true;
+      }
+    }
+
+    return false;
+  }
+
+  void setRBDef(MachineInstr &MI, MachineOperand &DefOP,
+                const RegisterBank *RB) {
+    Register Reg = DefOP.getReg();
+    // A register that already has a register class got it during pre-inst
+    // selection of another instruction. Maybe a cross-bank copy was required,
+    // so we insert a copy that can be removed later. This simplifies the post
+    // regbanklegalize combiner and avoids the need to special case some
+    // patterns.
+    if (MRI.getRegClassOrNull(Reg)) {
+      LLT Ty = MRI.getType(Reg);
+      Register NewReg = MRI.createVirtualRegister({RB, Ty});
+      DefOP.setReg(NewReg);
+
+      auto &MBB = *MI.getParent();
+      B.setInsertPt(MBB, MBB.SkipPHIsAndLabels(std::next(MI.getIterator())));
+      B.buildCopy(Reg, NewReg);
+
+      // The problem was discovered for a uniform S1 that was used as both a
+      // lane mask (vcc) and a regular sgpr S1.
+      // - The lane-mask (vcc) use was by si_if; this use is divergent and
+      //   requires a non-trivial sgpr-S1-to-vcc copy. But pre-inst-selection
+      //   of si_if sets sreg_64_xexec(S1) on the def of the uniform S1,
+      //   making it a lane mask.
+      // - The regular sgpr S1 (uniform) instruction is now broken since it
+      //   uses sreg_64_xexec(S1), which is divergent.
+
+      // On generic instruction uses, replace virtual registers that have a
+      // register class with virtual registers that have a register bank.
+      for (auto &UseMI : MRI.use_instructions(Reg)) {
+        if (shouldRegBankSelect(UseMI)) {
+          for (MachineOperand &Op : UseMI.operands()) {
+            if (Op.isReg() && Op.getReg() == Reg)
+              Op.setReg(NewReg);
+          }
+        }
+      }
+
+    } else {
+      MRI.setRegBank(Reg, *RB);
+    }
+  }
+
+  std::optional<Register> tryGetVReg(MachineOperand &Op) {
+    if (!Op.isReg())
+      return std::nullopt;
+
+    Register Reg = Op.getReg();
+    if (!Reg.isVirtual())
+      return std::nullopt;
+
+    return Reg;
+  }
+
+  void assignBanksOnDefs(MachineInstr &MI) {
+    if (!shouldRegBankSelect(MI))
+      return;
+
+    for (MachineOperand &DefOP : MI.defs()) {
+      auto MaybeDefReg = tryGetVReg(DefOP);
+      if (!MaybeDefReg)
+        continue;
+      Register DefReg = *MaybeDefReg;
+
+      // Copies can have a register class on def registers.
+      if (MI.isCopy() && MRI.getRegClassOrNull(DefReg)) {
+        continue;
+      }
+
+      if (MUI.isUniform(DefReg) || ILMA.isS32S64LaneMask(DefReg)) {
+        setRBDef(MI, DefOP, SgprRB);
+      } else {
+        if (MRI.getType(DefReg) == LLT::scalar(1))
+          setRBDef(MI, DefOP, VccRB);
+        else
+          setRBDef(MI, DefOP, VgprRB);
+      }
+    }
+  }
+
+  void constrainRBUse(MachineInstr &MI, MachineOperand &UseOP,
+                      const RegisterBank *RB) {
+    Register Reg = UseOP.getReg();
+
+    LLT Ty = MRI.getType(Reg);
+    Register NewReg = MRI.createVirtualRegister({RB, Ty});
+    UseOP.setReg(NewReg);
+
+    if (MI.isPHI()) {
+      auto DefMI = MRI.getVRegDef(Reg)->getIterator();
+      MachineBasicBlock *DefMBB = DefMI->getParent();
+      B.setInsertPt(*DefMBB, DefMBB->SkipPHIsAndLabels(std::next(DefMI)));
+    } else {
+      B.setInstr(MI);
+    }
+
+    B.buildCopy(NewReg, Reg);
+  }
+
+  void constrainBanksOnUses(MachineInstr &MI) {
+    if (!shouldRegBankSelect(
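The hunk is cut off before the pass body that drives the helper. One plausible driver shape, under the assumption that defs get banks before uses are constrained (an illustrative sketch, not the committed code):

  RegBankSelectHelper RBSHelper(B, ILMA, MUI, *TRI, RBI);
  for (MachineBasicBlock &MBB : MF) {
    for (MachineInstr &MI : make_early_inc_range(MBB)) {
      // Defs first, so constrainBanksOnUses only has to insert cross-bank
      // copies where a use disagrees with the bank already on the def.
      RBSHelper.assignBanksOnDefs(MI);
      RBSHelper.constrainBanksOnUses(MI);
    }
  }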
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RegBankLegalize rules for load (PR #112882)
https://github.com/petar-avramovic updated https://github.com/llvm/llvm-project/pull/112882 >From e6285ef8415e03337a080fa13456a2495023a8e6 Mon Sep 17 00:00:00 2001 From: Petar Avramovic Date: Wed, 30 Oct 2024 15:37:59 +0100 Subject: [PATCH] AMDGPU/GlobalISel: RegBankLegalize rules for load Add IDs for bit width that cover multiple LLTs: B32 B64 etc. "Predicate" wrapper class for bool predicate functions used to write pretty rules. Predicates can be combined using &&, || and !. Lowering for splitting and widening loads. Write rules for loads to not change existing mir tests from old regbankselect. --- .../AMDGPU/AMDGPURegBankLegalizeHelper.cpp| 287 +++- .../AMDGPU/AMDGPURegBankLegalizeHelper.h | 5 + .../AMDGPU/AMDGPURegBankLegalizeRules.cpp | 309 - .../AMDGPU/AMDGPURegBankLegalizeRules.h | 65 +++- .../AMDGPU/GlobalISel/regbankselect-load.mir | 320 +++--- .../GlobalISel/regbankselect-zextload.mir | 9 +- 6 files changed, 929 insertions(+), 66 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp index 15ccf1a38af9a5..19d8d466e3b12e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp @@ -36,6 +36,83 @@ void RegBankLegalizeHelper::findRuleAndApplyMapping(MachineInstr &MI) { lower(MI, Mapping, WaterfallSgprs); } +void RegBankLegalizeHelper::splitLoad(MachineInstr &MI, + ArrayRef LLTBreakdown, LLT MergeTy) { + MachineFunction &MF = B.getMF(); + assert(MI.getNumMemOperands() == 1); + MachineMemOperand &BaseMMO = **MI.memoperands_begin(); + Register Dst = MI.getOperand(0).getReg(); + const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst); + Register Base = MI.getOperand(1).getReg(); + LLT PtrTy = MRI.getType(Base); + const RegisterBank *PtrRB = MRI.getRegBankOrNull(Base); + LLT OffsetTy = LLT::scalar(PtrTy.getSizeInBits()); + SmallVector LoadPartRegs; + + unsigned ByteOffset = 0; + for (LLT PartTy : LLTBreakdown) { +Register BasePlusOffset; +if (ByteOffset == 0) { + BasePlusOffset = Base; +} else { + auto Offset = B.buildConstant({PtrRB, OffsetTy}, ByteOffset); + BasePlusOffset = B.buildPtrAdd({PtrRB, PtrTy}, Base, Offset).getReg(0); +} +auto *OffsetMMO = MF.getMachineMemOperand(&BaseMMO, ByteOffset, PartTy); +auto LoadPart = B.buildLoad({DstRB, PartTy}, BasePlusOffset, *OffsetMMO); +LoadPartRegs.push_back(LoadPart.getReg(0)); +ByteOffset += PartTy.getSizeInBytes(); + } + + if (!MergeTy.isValid()) { +// Loads are of same size, concat or merge them together. +B.buildMergeLikeInstr(Dst, LoadPartRegs); + } else { +// Loads are not all of same size, need to unmerge them to smaller pieces +// of MergeTy type, then merge pieces to Dst. 
+SmallVector MergeTyParts; +for (Register Reg : LoadPartRegs) { + if (MRI.getType(Reg) == MergeTy) { +MergeTyParts.push_back(Reg); + } else { +auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, Reg); +for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i) + MergeTyParts.push_back(Unmerge.getReg(i)); + } +} +B.buildMergeLikeInstr(Dst, MergeTyParts); + } + MI.eraseFromParent(); +} + +void RegBankLegalizeHelper::widenLoad(MachineInstr &MI, LLT WideTy, + LLT MergeTy) { + MachineFunction &MF = B.getMF(); + assert(MI.getNumMemOperands() == 1); + MachineMemOperand &BaseMMO = **MI.memoperands_begin(); + Register Dst = MI.getOperand(0).getReg(); + const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst); + Register Base = MI.getOperand(1).getReg(); + + MachineMemOperand *WideMMO = MF.getMachineMemOperand(&BaseMMO, 0, WideTy); + auto WideLoad = B.buildLoad({DstRB, WideTy}, Base, *WideMMO); + + if (WideTy.isScalar()) { +B.buildTrunc(Dst, WideLoad); + } else { +SmallVector MergeTyParts; +auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, WideLoad); + +LLT DstTy = MRI.getType(Dst); +unsigned NumElts = DstTy.getSizeInBits() / MergeTy.getSizeInBits(); +for (unsigned i = 0; i < NumElts; ++i) { + MergeTyParts.push_back(Unmerge.getReg(i)); +} +B.buildMergeLikeInstr(Dst, MergeTyParts); + } + MI.eraseFromParent(); +} + void RegBankLegalizeHelper::lower(MachineInstr &MI, const RegBankLLTMapping &Mapping, SmallSet &WaterfallSgprs) { @@ -114,6 +191,50 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI, MI.eraseFromParent(); break; } + case SplitLoad: { +LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); +unsigned Size = DstTy.getSizeInBits(); +// Even split to 128-bit loads +if (Size > 128) { + LLT B128; + if (DstTy.isVector()) { +LLT EltTy = DstTy.getElementType(); +B128 = LLT:
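As a worked example of splitLoad, a 256-bit vector load on VGPRs splits into two 128-bit parts at byte offsets 0 and 16, and a concat rebuilds the result (illustrative MIR with simplified memory operands):

  %ptr:vgpr(p1) = COPY $vgpr0_vgpr1
  %lo:vgpr(<4 x s32>) = G_LOAD %ptr(p1) :: (load (<4 x s32>), addrspace 1)
  %c16:vgpr(s64) = G_CONSTANT i64 16
  %hiptr:vgpr(p1) = G_PTR_ADD %ptr, %c16(s64)
  %hi:vgpr(<4 x s32>) = G_LOAD %hiptr(p1) :: (load (<4 x s32>) from unknown-address + 16, addrspace 1)
  %val:vgpr(<8 x s32>) = G_CONCAT_VECTORS %lo(<4 x s32>), %hi(<4 x s32>)

Both parts have the same type, so MergeTy stays invalid and buildMergeLikeInstr emits the G_CONCAT_VECTORS directly.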
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RegBankLegalize rules for load (PR #112882)
https://github.com/petar-avramovic updated https://github.com/llvm/llvm-project/pull/112882
[llvm-branch-commits] [llvm] MachineUniformityAnalysis: Improve isConstantOrUndefValuePhi (PR #112866)
https://github.com/petar-avramovic updated https://github.com/llvm/llvm-project/pull/112866 >From 6ec049db2a5572c4cb0514b9ca44c7ff215b461f Mon Sep 17 00:00:00 2001 From: Petar Avramovic Date: Thu, 31 Oct 2024 14:10:57 +0100 Subject: [PATCH] MachineUniformityAnalysis: Improve isConstantOrUndefValuePhi Change existing code for G_PHI to match what LLVM-IR version is doing via PHINode::hasConstantOrUndefValue. This is not safe for regular PHI since it may appear with an undef operand and getVRegDef can fail. Most notably this improves number of values that can be allocated to sgpr register bank in AMDGPURegBankSelect. Common case here are phis that appear in structurize-cfg lowering for cycles with multiple exits: Undef incoming value is coming from block that reached cycle exit condition, if other incoming is uniform keep the phi uniform despite the fact it is joining values from pair of blocks that are entered via divergent condition branch. --- llvm/lib/CodeGen/MachineSSAContext.cpp| 27 +- .../AMDGPU/MIR/hidden-diverge-gmir.mir| 28 +++ .../AMDGPU/MIR/hidden-loop-diverge.mir| 4 +- .../AMDGPU/MIR/uses-value-from-cycle.mir | 8 +- .../GlobalISel/divergence-structurizer.mir| 80 -- .../regbankselect-mui-regbanklegalize.mir | 69 --- .../regbankselect-mui-regbankselect.mir | 18 ++-- .../AMDGPU/GlobalISel/regbankselect-mui.ll| 84 ++- .../AMDGPU/GlobalISel/regbankselect-mui.mir | 51 ++- 9 files changed, 191 insertions(+), 178 deletions(-) diff --git a/llvm/lib/CodeGen/MachineSSAContext.cpp b/llvm/lib/CodeGen/MachineSSAContext.cpp index e384187b6e8593..8e13c0916dd9e1 100644 --- a/llvm/lib/CodeGen/MachineSSAContext.cpp +++ b/llvm/lib/CodeGen/MachineSSAContext.cpp @@ -54,9 +54,34 @@ const MachineBasicBlock *MachineSSAContext::getDefBlock(Register value) const { return F->getRegInfo().getVRegDef(value)->getParent(); } +static bool isUndef(const MachineInstr &MI) { + return MI.getOpcode() == TargetOpcode::G_IMPLICIT_DEF || + MI.getOpcode() == TargetOpcode::IMPLICIT_DEF; +} + +/// MachineInstr equivalent of PHINode::hasConstantOrUndefValue() for G_PHI. template <> bool MachineSSAContext::isConstantOrUndefValuePhi(const MachineInstr &Phi) { - return Phi.isConstantValuePHI(); + if (!Phi.isPHI()) +return false; + + // In later passes PHI may appear with an undef operand, getVRegDef can fail. + if (Phi.getOpcode() == TargetOpcode::PHI) +return Phi.isConstantValuePHI(); + + // For G_PHI we do equivalent of PHINode::hasConstantOrUndefValue(). 
+ const MachineRegisterInfo &MRI = Phi.getMF()->getRegInfo(); + Register This = Phi.getOperand(0).getReg(); + Register ConstantValue; + for (unsigned i = 1, e = Phi.getNumOperands(); i < e; i += 2) { +Register Incoming = Phi.getOperand(i).getReg(); +if (Incoming != This && !isUndef(*MRI.getVRegDef(Incoming))) { + if (ConstantValue && ConstantValue != Incoming) +return false; + ConstantValue = Incoming; +} + } + return true; } template <> diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir index ce00edf3363f77..9694a340b5e906 100644 --- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir +++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir @@ -1,24 +1,24 @@ # RUN: llc -mtriple=amdgcn-- -run-pass=print-machine-uniformity -o - %s 2>&1 | FileCheck %s # CHECK-LABEL: MachineUniformityInfo for function: hidden_diverge # CHECK-LABEL: BLOCK bb.0 -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.x) -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_ICMP intpred(slt) -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_XOR %{{[0-9]*}}:_, %{{[0-9]*}}:_ -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if) -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if) -# CHECK: DIVERGENT: G_BRCOND %{{[0-9]*}}:_(s1), %bb.1 -# CHECK: DIVERGENT: G_BR %bb.2 +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.x) +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_ICMP intpred(slt) +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_XOR %{{[0-9]*}}:_, %{{[0-9]*}}:_ +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if) +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if) +# CHECK: DIVERGENT: G_BRCOND %{{[0-9]*}}:_(s1), %bb.1 +# CHECK: DIVERGENT: G_BR %bb.2 # CHECK-LABEL: BLOCK bb.1 # CHECK-LABEL: BLOCK bb.2 -# CHECK: D
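As a standalone illustration of the classification rule this patch adds for G_PHI (the MachineInstr equivalent of PHINode::hasConstantOrUndefValue), here is a toy model that compiles and runs on its own; the Incoming struct is invented for the example and is not LLVM API.

#include <cassert>
#include <vector>

// Each incoming is either undef or a concrete value id. The rule: skip
// undefs and self-references; every remaining incoming must be one value.
struct Incoming {
  bool IsUndef;
  int ValueId; // meaningful only when !IsUndef
};

static bool hasConstantOrUndefValue(int PhiId,
                                    const std::vector<Incoming> &Ins) {
  int ConstantValue = 0; // 0 plays the role of a null Register
  for (const Incoming &In : Ins) {
    if (In.IsUndef || In.ValueId == PhiId)
      continue;
    if (ConstantValue && ConstantValue != In.ValueId)
      return false;
    ConstantValue = In.ValueId;
  }
  return true;
}

int main() {
  // The structurize-cfg multi-exit pattern from the commit message: one
  // incoming is undef (the cycle exit condition fired), the other is a
  // uniform value, so the phi can stay uniform.
  assert(hasConstantOrUndefValue(/*PhiId=*/1, {{true, 0}, {false, 42}}));
  // Two different concrete incomings: not constant-or-undef.
  assert(!hasConstantOrUndefValue(/*PhiId=*/1, {{false, 7}, {false, 42}}));
}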
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: AMDGPURegBankSelect (PR #112863)
@@ -66,9 +73,215 @@ FunctionPass *llvm::createAMDGPURegBankSelectPass() { return new AMDGPURegBankSelect(); } +class RegBankSelectHelper { + MachineIRBuilder &B; + MachineRegisterInfo &MRI; + AMDGPU::IntrinsicLaneMaskAnalyzer &ILMA; + const MachineUniformityInfo &MUI; + const SIRegisterInfo &TRI; + const RegisterBank *SgprRB; + const RegisterBank *VgprRB; + const RegisterBank *VccRB; + +public: + RegBankSelectHelper(MachineIRBuilder &B, + AMDGPU::IntrinsicLaneMaskAnalyzer &ILMA, + const MachineUniformityInfo &MUI, + const SIRegisterInfo &TRI, const RegisterBankInfo &RBI) + : B(B), MRI(*B.getMRI()), ILMA(ILMA), MUI(MUI), TRI(TRI), +SgprRB(&RBI.getRegBank(AMDGPU::SGPRRegBankID)), +VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)), +VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {} + + bool shouldRegBankSelect(MachineInstr &MI) { +return MI.isPreISelOpcode() || MI.isCopy(); + } + + // Temporal divergence copy: COPY to vgpr with implicit use of $exec inside of + // the cycle + // Note: uniformity analysis does not consider that registers with vgpr def + // are divergent (you can have uniform value in vgpr). + // - TODO: implicit use of $exec could be implemented as indicator that + // instruction is divergent + bool isTemporalDivergenceCopy(Register Reg) { +MachineInstr *MI = MRI.getVRegDef(Reg); +if (!MI->isCopy()) + return false; + +for (auto Op : MI->implicit_operands()) { + if (!Op.isReg()) +continue; + + if (Op.getReg() == TRI.getExec()) { +return true; + } +} + +return false; + } + + void setRBDef(MachineInstr &MI, MachineOperand &DefOP, +const RegisterBank *RB) { +Register Reg = DefOP.getReg(); +// Register that already has Register class got it during pre-inst selection +// of another instruction. Maybe cross bank copy was required so we insert a +// copy that can be removed later. This simplifies post regbanklegalize +// combiner and avoids need to special case some patterns. +if (MRI.getRegClassOrNull(Reg)) { + LLT Ty = MRI.getType(Reg); + Register NewReg = MRI.createVirtualRegister({RB, Ty}); + DefOP.setReg(NewReg); + + auto &MBB = *MI.getParent(); + B.setInsertPt(MBB, MBB.SkipPHIsAndLabels(std::next(MI.getIterator(; + B.buildCopy(Reg, NewReg); + + // The problem was discovered for uniform S1 that was used as both + // lane mask(vcc) and regular sgpr S1. + // - lane-mask(vcc) use was by si_if, this use is divergent and requires + // non-trivial sgpr-S1-to-vcc copy. But pre-inst-selection of si_if sets + // sreg_64_xexec(S1) on def of uniform S1 making it lane-mask. + // - the regular sgpr S1(uniform) instruction is now broken since + // it uses sreg_64_xexec(S1) which is divergent. + + // Replace virtual registers with register class on generic instructions + // uses with virtual registers with register bank. + for (auto &UseMI : MRI.use_instructions(Reg)) { +if (shouldRegBankSelect(UseMI)) { + for (MachineOperand &Op : UseMI.operands()) { +if (Op.isReg() && Op.getReg() == Reg) + Op.setReg(NewReg); petar-avramovic wrote: Yes, this was a bug. make_early_inc_range also works but it might be assuming how MRI keeps track of use instructions internally https://github.com/llvm/llvm-project/pull/112863 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
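For readers following this thread: the bug under discussion is mutating a register's use list while iterating it. Below is a self-contained model of the snapshot-then-rewrite pattern the reply prefers over make_early_inc_range; the types are invented stand-ins, since MRI's real use lists are intrusive and re-link an operand the moment setReg runs.

#include <algorithm>
#include <list>
#include <vector>

struct Operand { int Reg; }; // stand-in for MachineOperand

// Snapshot the matching uses first, then mutate. In MRI, Op.setReg(NewReg)
// moves the operand onto NewReg's use chain, which is what makes writing
// inside a use_instructions() walk unsafe.
static void replaceUses(std::list<Operand> &UseList, int OldReg, int NewReg) {
  std::vector<Operand *> Snapshot;
  for (Operand &Op : UseList)
    if (Op.Reg == OldReg)
      Snapshot.push_back(&Op);
  for (Operand *Op : Snapshot)
    Op->Reg = NewReg;
}

int main() {
  std::list<Operand> Uses{{1}, {1}, {2}};
  replaceUses(Uses, /*OldReg=*/1, /*NewReg=*/5);
  return std::count_if(Uses.begin(), Uses.end(),
                       [](const Operand &O) { return O.Reg == 5; }) == 2
             ? 0
             : 1;
}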
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: AMDGPURegBankSelect (PR #112863)
@@ -66,9 +73,215 @@ FunctionPass *llvm::createAMDGPURegBankSelectPass() { return new AMDGPURegBankSelect(); } +class RegBankSelectHelper { + MachineIRBuilder &B; + MachineRegisterInfo &MRI; + AMDGPU::IntrinsicLaneMaskAnalyzer &ILMA; + const MachineUniformityInfo &MUI; + const SIRegisterInfo &TRI; + const RegisterBank *SgprRB; + const RegisterBank *VgprRB; + const RegisterBank *VccRB; + +public: + RegBankSelectHelper(MachineIRBuilder &B, + AMDGPU::IntrinsicLaneMaskAnalyzer &ILMA, + const MachineUniformityInfo &MUI, + const SIRegisterInfo &TRI, const RegisterBankInfo &RBI) + : B(B), MRI(*B.getMRI()), ILMA(ILMA), MUI(MUI), TRI(TRI), +SgprRB(&RBI.getRegBank(AMDGPU::SGPRRegBankID)), +VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)), +VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {} + + bool shouldRegBankSelect(MachineInstr &MI) { +return MI.isPreISelOpcode() || MI.isCopy(); + } + + // Temporal divergence copy: COPY to vgpr with implicit use of $exec inside of + // the cycle + // Note: uniformity analysis does not consider that registers with vgpr def + // are divergent (you can have uniform value in vgpr). + // - TODO: implicit use of $exec could be implemented as indicator that + // instruction is divergent + bool isTemporalDivergenceCopy(Register Reg) { +MachineInstr *MI = MRI.getVRegDef(Reg); +if (!MI->isCopy()) + return false; + +for (auto Op : MI->implicit_operands()) { + if (!Op.isReg()) +continue; + + if (Op.getReg() == TRI.getExec()) { +return true; + } +} + +return false; + } + + void setRBDef(MachineInstr &MI, MachineOperand &DefOP, +const RegisterBank *RB) { +Register Reg = DefOP.getReg(); +// Register that already has Register class got it during pre-inst selection +// of another instruction. Maybe cross bank copy was required so we insert a +// copy that can be removed later. This simplifies post regbanklegalize +// combiner and avoids need to special case some patterns. +if (MRI.getRegClassOrNull(Reg)) { + LLT Ty = MRI.getType(Reg); + Register NewReg = MRI.createVirtualRegister({RB, Ty}); + DefOP.setReg(NewReg); petar-avramovic wrote: Why? I intend for new regbankselect be simple and not use observers. https://github.com/llvm/llvm-project/pull/112863 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: AMDGPURegBankSelect (PR #112863)
@@ -69,3 +72,37 @@ AMDGPU::getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg, return std::pair(Reg, 0); } + +IntrinsicLaneMaskAnalyzer::IntrinsicLaneMaskAnalyzer(MachineFunction &MF) +: MRI(MF.getRegInfo()) { + initLaneMaskIntrinsics(MF); +} + +bool IntrinsicLaneMaskAnalyzer::isS32S64LaneMask(Register Reg) const { + return S32S64LaneMask.contains(Reg); +} + +void IntrinsicLaneMaskAnalyzer::initLaneMaskIntrinsics(MachineFunction &MF) { + for (auto &MBB : MF) { +for (auto &MI : MBB) { + GIntrinsic *GI = dyn_cast(&MI); + if (GI && GI->is(Intrinsic::amdgcn_if_break)) { +S32S64LaneMask.insert(MI.getOperand(3).getReg()); +findLCSSAPhi(MI.getOperand(0).getReg()); + } + + if (MI.getOpcode() == AMDGPU::SI_IF || + MI.getOpcode() == AMDGPU::SI_ELSE) { +findLCSSAPhi(MI.getOperand(0).getReg()); + } petar-avramovic wrote: Consequence of what legalizer does, si.if and si.else are inst-selected to SI_IF and SI_ELSE in AMDGPULegalizerInfo::legalizeIntrinsic, if.break is still intrinsic for reg bank selection https://github.com/llvm/llvm-project/pull/112863 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RegBankLegalize rules for load (PR #112882)
https://github.com/petar-avramovic updated https://github.com/llvm/llvm-project/pull/112882 >From 3854308d10edc1329086faf26542ca469b26c589 Mon Sep 17 00:00:00 2001 From: Petar Avramovic Date: Wed, 30 Oct 2024 15:37:59 +0100 Subject: [PATCH] AMDGPU/GlobalISel: RegBankLegalize rules for load Add IDs for bit width that cover multiple LLTs: B32 B64 etc. "Predicate" wrapper class for bool predicate functions used to write pretty rules. Predicates can be combined using &&, || and !. Lowering for splitting and widening loads. Write rules for loads to not change existing mir tests from old regbankselect. --- .../AMDGPU/AMDGPURegBankLegalizeHelper.cpp| 284 +++- .../AMDGPU/AMDGPURegBankLegalizeHelper.h | 5 + .../AMDGPU/AMDGPURegBankLegalizeRules.cpp | 309 - .../AMDGPU/AMDGPURegBankLegalizeRules.h | 65 +++- .../AMDGPU/GlobalISel/regbankselect-load.mir | 320 +++--- .../GlobalISel/regbankselect-zextload.mir | 9 +- 6 files changed, 927 insertions(+), 65 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp index 916140e2bbcd68..5c4195cb15fb2c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp @@ -38,6 +38,83 @@ void RegBankLegalizeHelper::findRuleAndApplyMapping(MachineInstr &MI) { lower(MI, Mapping, WaterfallSgprs); } +void RegBankLegalizeHelper::splitLoad(MachineInstr &MI, + ArrayRef LLTBreakdown, LLT MergeTy) { + MachineFunction &MF = B.getMF(); + assert(MI.getNumMemOperands() == 1); + MachineMemOperand &BaseMMO = **MI.memoperands_begin(); + Register Dst = MI.getOperand(0).getReg(); + const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst); + Register Base = MI.getOperand(1).getReg(); + LLT PtrTy = MRI.getType(Base); + const RegisterBank *PtrRB = MRI.getRegBankOrNull(Base); + LLT OffsetTy = LLT::scalar(PtrTy.getSizeInBits()); + SmallVector LoadPartRegs; + + unsigned ByteOffset = 0; + for (LLT PartTy : LLTBreakdown) { +Register BasePlusOffset; +if (ByteOffset == 0) { + BasePlusOffset = Base; +} else { + auto Offset = B.buildConstant({PtrRB, OffsetTy}, ByteOffset); + BasePlusOffset = B.buildPtrAdd({PtrRB, PtrTy}, Base, Offset).getReg(0); +} +auto *OffsetMMO = MF.getMachineMemOperand(&BaseMMO, ByteOffset, PartTy); +auto LoadPart = B.buildLoad({DstRB, PartTy}, BasePlusOffset, *OffsetMMO); +LoadPartRegs.push_back(LoadPart.getReg(0)); +ByteOffset += PartTy.getSizeInBytes(); + } + + if (!MergeTy.isValid()) { +// Loads are of same size, concat or merge them together. +B.buildMergeLikeInstr(Dst, LoadPartRegs); + } else { +// Loads are not all of same size, need to unmerge them to smaller pieces +// of MergeTy type, then merge pieces to Dst. 
+SmallVector MergeTyParts; +for (Register Reg : LoadPartRegs) { + if (MRI.getType(Reg) == MergeTy) { +MergeTyParts.push_back(Reg); + } else { +auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, Reg); +for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i) + MergeTyParts.push_back(Unmerge.getReg(i)); + } +} +B.buildMergeLikeInstr(Dst, MergeTyParts); + } + MI.eraseFromParent(); +} + +void RegBankLegalizeHelper::widenLoad(MachineInstr &MI, LLT WideTy, + LLT MergeTy) { + MachineFunction &MF = B.getMF(); + assert(MI.getNumMemOperands() == 1); + MachineMemOperand &BaseMMO = **MI.memoperands_begin(); + Register Dst = MI.getOperand(0).getReg(); + const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst); + Register Base = MI.getOperand(1).getReg(); + + MachineMemOperand *WideMMO = MF.getMachineMemOperand(&BaseMMO, 0, WideTy); + auto WideLoad = B.buildLoad({DstRB, WideTy}, Base, *WideMMO); + + if (WideTy.isScalar()) { +B.buildTrunc(Dst, WideLoad); + } else { +SmallVector MergeTyParts; +auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, WideLoad); + +LLT DstTy = MRI.getType(Dst); +unsigned NumElts = DstTy.getSizeInBits() / MergeTy.getSizeInBits(); +for (unsigned i = 0; i < NumElts; ++i) { + MergeTyParts.push_back(Unmerge.getReg(i)); +} +B.buildMergeLikeInstr(Dst, MergeTyParts); + } + MI.eraseFromParent(); +} + void RegBankLegalizeHelper::lower(MachineInstr &MI, const RegBankLLTMapping &Mapping, SmallSet &WaterfallSgprs) { @@ -116,6 +193,50 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI, MI.eraseFromParent(); break; } + case SplitLoad: { +LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); +unsigned Size = DstTy.getSizeInBits(); +// Even split to 128-bit loads +if (Size > 128) { + LLT B128; + if (DstTy.isVector()) { +LLT EltTy = DstTy.getElementType(); +B128 = LLT:
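To make the splitLoad lowering above concrete: each part is loaded at Base plus a running byte offset, and the parts are then re-joined either directly (all parts the same type) or via unmerge into MergeTy pieces first. A sketch of the offset and piece arithmetic follows; the specific breakdown is an illustrative example, not a value taken from the patch's rule tables.

#include <cstdio>
#include <vector>

int main() {
  // Mixed-size split, e.g. a 96-bit scalar load broken as {s64, s32} with
  // MergeTy = s32: the s64 part is unmerged into two s32 pieces, the s32
  // part is used as-is, and three s32 pieces are merged into the dst.
  std::vector<unsigned> PartBits{64, 32};
  const unsigned MergeTyBits = 32;
  unsigned ByteOffset = 0, NumPieces = 0;
  for (unsigned Bits : PartBits) {
    std::printf("load %u bits at Base+%u\n", Bits, ByteOffset);
    ByteOffset += Bits / 8;          // 0, then 8
    NumPieces += Bits / MergeTyBits; // 2 pieces, then 1 more
  }
  std::printf("merge %u x s%u pieces into dst\n", NumPieces, MergeTyBits);
}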
[llvm-branch-commits] [llvm] MachineUniformityAnalysis: Improve isConstantOrUndefValuePhi (PR #112866)
https://github.com/petar-avramovic updated https://github.com/llvm/llvm-project/pull/112866 >From c59ad6e821a49e48df70edff1fdb044eb0083b1c Mon Sep 17 00:00:00 2001 From: Petar Avramovic Date: Thu, 31 Oct 2024 14:10:57 +0100 Subject: [PATCH] MachineUniformityAnalysis: Improve isConstantOrUndefValuePhi Change existing code for G_PHI to match what LLVM-IR version is doing via PHINode::hasConstantOrUndefValue. This is not safe for regular PHI since it may appear with an undef operand and getVRegDef can fail. Most notably this improves number of values that can be allocated to sgpr register bank in AMDGPURegBankSelect. Common case here are phis that appear in structurize-cfg lowering for cycles with multiple exits: Undef incoming value is coming from block that reached cycle exit condition, if other incoming is uniform keep the phi uniform despite the fact it is joining values from pair of blocks that are entered via divergent condition branch. --- llvm/lib/CodeGen/MachineSSAContext.cpp| 27 +- .../AMDGPU/MIR/hidden-diverge-gmir.mir| 28 +++ .../AMDGPU/MIR/hidden-loop-diverge.mir| 4 +- .../AMDGPU/MIR/uses-value-from-cycle.mir | 8 +- .../GlobalISel/divergence-structurizer.mir| 80 -- .../regbankselect-mui-regbanklegalize.mir | 69 --- .../regbankselect-mui-regbankselect.mir | 18 ++-- .../AMDGPU/GlobalISel/regbankselect-mui.ll| 84 ++- .../AMDGPU/GlobalISel/regbankselect-mui.mir | 51 ++- 9 files changed, 191 insertions(+), 178 deletions(-) diff --git a/llvm/lib/CodeGen/MachineSSAContext.cpp b/llvm/lib/CodeGen/MachineSSAContext.cpp index e384187b6e8593..8e13c0916dd9e1 100644 --- a/llvm/lib/CodeGen/MachineSSAContext.cpp +++ b/llvm/lib/CodeGen/MachineSSAContext.cpp @@ -54,9 +54,34 @@ const MachineBasicBlock *MachineSSAContext::getDefBlock(Register value) const { return F->getRegInfo().getVRegDef(value)->getParent(); } +static bool isUndef(const MachineInstr &MI) { + return MI.getOpcode() == TargetOpcode::G_IMPLICIT_DEF || + MI.getOpcode() == TargetOpcode::IMPLICIT_DEF; +} + +/// MachineInstr equivalent of PHINode::hasConstantOrUndefValue() for G_PHI. template <> bool MachineSSAContext::isConstantOrUndefValuePhi(const MachineInstr &Phi) { - return Phi.isConstantValuePHI(); + if (!Phi.isPHI()) +return false; + + // In later passes PHI may appear with an undef operand, getVRegDef can fail. + if (Phi.getOpcode() == TargetOpcode::PHI) +return Phi.isConstantValuePHI(); + + // For G_PHI we do equivalent of PHINode::hasConstantOrUndefValue(). 
+ const MachineRegisterInfo &MRI = Phi.getMF()->getRegInfo(); + Register This = Phi.getOperand(0).getReg(); + Register ConstantValue; + for (unsigned i = 1, e = Phi.getNumOperands(); i < e; i += 2) { +Register Incoming = Phi.getOperand(i).getReg(); +if (Incoming != This && !isUndef(*MRI.getVRegDef(Incoming))) { + if (ConstantValue && ConstantValue != Incoming) +return false; + ConstantValue = Incoming; +} + } + return true; } template <> diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir index ce00edf3363f77..9694a340b5e906 100644 --- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir +++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir @@ -1,24 +1,24 @@ # RUN: llc -mtriple=amdgcn-- -run-pass=print-machine-uniformity -o - %s 2>&1 | FileCheck %s # CHECK-LABEL: MachineUniformityInfo for function: hidden_diverge # CHECK-LABEL: BLOCK bb.0 -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.x) -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_ICMP intpred(slt) -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_XOR %{{[0-9]*}}:_, %{{[0-9]*}}:_ -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if) -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if) -# CHECK: DIVERGENT: G_BRCOND %{{[0-9]*}}:_(s1), %bb.1 -# CHECK: DIVERGENT: G_BR %bb.2 +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.x) +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_ICMP intpred(slt) +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_XOR %{{[0-9]*}}:_, %{{[0-9]*}}:_ +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if) +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if) +# CHECK: DIVERGENT: G_BRCOND %{{[0-9]*}}:_(s1), %bb.1 +# CHECK: DIVERGENT: G_BR %bb.2 # CHECK-LABEL: BLOCK bb.1 # CHECK-LABEL: BLOCK bb.2 -# CHECK: D
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: AMDGPURegBankSelect (PR #112863)
@@ -66,9 +81,208 @@ FunctionPass *llvm::createAMDGPURegBankSelectPass() { return new AMDGPURegBankSelect(); } +class RegBankSelectHelper { + MachineIRBuilder &B; + MachineRegisterInfo &MRI; + AMDGPU::IntrinsicLaneMaskAnalyzer &ILMA; + const MachineUniformityInfo &MUI; + const SIRegisterInfo &TRI; + const RegisterBank *SgprRB; + const RegisterBank *VgprRB; + const RegisterBank *VccRB; + +public: + RegBankSelectHelper(MachineIRBuilder &B, + AMDGPU::IntrinsicLaneMaskAnalyzer &ILMA, + const MachineUniformityInfo &MUI, + const SIRegisterInfo &TRI, const RegisterBankInfo &RBI) + : B(B), MRI(*B.getMRI()), ILMA(ILMA), MUI(MUI), TRI(TRI), +SgprRB(&RBI.getRegBank(AMDGPU::SGPRRegBankID)), +VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)), +VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {} + + // Temporal divergence copy: COPY to vgpr with implicit use of $exec inside of + // the cycle + // Note: uniformity analysis does not consider that registers with vgpr def + // are divergent (you can have uniform value in vgpr). + // - TODO: implicit use of $exec could be implemented as indicator that + // instruction is divergent + bool isTemporalDivergenceCopy(Register Reg) { petar-avramovic wrote: > Where are these exec operands getting inserted? Should we have a different > pseudo instead? In temporal-divergence lowering, upcoming change in AMDGPUGlobalISelDivergenceLowering, I will just remove this check in this patch. https://github.com/llvm/llvm-project/pull/112863 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU: Fix inst-selection of large scratch offsets with sgpr base (PR #110256)
https://github.com/petar-avramovic updated https://github.com/llvm/llvm-project/pull/110256 >From 2ea25b291ffdae0d3b9b6821199080f133de34c7 Mon Sep 17 00:00:00 2001 From: Petar Avramovic Date: Fri, 27 Sep 2024 13:59:31 +0200 Subject: [PATCH] AMDGPU: Fix inst-selection of large scratch offsets with sgpr base Use i32 for offset instead of i16, this way it does not get interpreted as negative 16 bit offset. --- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 6 +++--- llvm/test/CodeGen/AMDGPU/flat-scratch.ll | 12 ++-- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index d3d5bc924525fc..ff8798edb3cc0f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -1911,7 +1911,7 @@ bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr, 0); } - Offset = CurDAG->getTargetConstant(COffsetVal, DL, MVT::i16); + Offset = CurDAG->getTargetConstant(COffsetVal, DL, MVT::i32); return true; } @@ -1966,7 +1966,7 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr, return false; if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, SplitImmOffset)) return false; -Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i16); +Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i32); return true; } } @@ -1999,7 +1999,7 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr, if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, ImmOffset)) return false; SAddr = SelectSAddrFI(CurDAG, SAddr); - Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16); + Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i32); return true; } diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll index fd67dfc65f9846..ef9590b3fd33fa 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll @@ -4926,7 +4926,7 @@ define amdgpu_gs void @sgpr_base_large_offset(ptr addrspace(1) %out, ptr addrspa ; ; GFX12-LABEL: sgpr_base_large_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT:scratch_load_b32 v2, off, s0 offset:-24 +; GFX12-NEXT:scratch_load_b32 v2, off, s0 offset:65512 ; GFX12-NEXT:s_wait_loadcnt 0x0 ; GFX12-NEXT:global_store_b32 v[0:1], v2, off ; GFX12-NEXT:s_nop 0 @@ -4985,7 +4985,7 @@ define amdgpu_gs void @sgpr_base_large_offset(ptr addrspace(1) %out, ptr addrspa ; ; GFX12-PAL-LABEL: sgpr_base_large_offset: ; GFX12-PAL: ; %bb.0: ; %entry -; GFX12-PAL-NEXT:scratch_load_b32 v2, off, s0 offset:-24 +; GFX12-PAL-NEXT:scratch_load_b32 v2, off, s0 offset:65512 ; GFX12-PAL-NEXT:s_wait_loadcnt 0x0 ; GFX12-PAL-NEXT:global_store_b32 v[0:1], v2, off ; GFX12-PAL-NEXT:s_nop 0 @@ -5038,7 +5038,7 @@ define amdgpu_gs void @sgpr_base_large_offset_split(ptr addrspace(1) %out, ptr a ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT:v_mov_b32_e32 v2, 0x100 ; GFX12-NEXT:s_and_b32 s0, s0, -4 -; GFX12-NEXT:scratch_load_b32 v2, v2, s0 offset:-24 scope:SCOPE_SYS +; GFX12-NEXT:scratch_load_b32 v2, v2, s0 offset:65512 scope:SCOPE_SYS ; GFX12-NEXT:s_wait_loadcnt 0x0 ; GFX12-NEXT:global_store_b32 v[0:1], v2, off ; GFX12-NEXT:s_nop 0 @@ -5103,7 +5103,7 @@ define amdgpu_gs void @sgpr_base_large_offset_split(ptr addrspace(1) %out, ptr a ; GFX12-PAL: ; %bb.0: ; %entry ; GFX12-PAL-NEXT:v_mov_b32_e32 v2, 0x100 ; GFX12-PAL-NEXT:s_and_b32 s0, s0, -4 -; GFX12-PAL-NEXT:scratch_load_b32 v2, v2, s0 offset:-24 scope:SCOPE_SYS +; GFX12-PAL-NEXT:scratch_load_b32 
v2, v2, s0 offset:65512 scope:SCOPE_SYS ; GFX12-PAL-NEXT:s_wait_loadcnt 0x0 ; GFX12-PAL-NEXT:global_store_b32 v[0:1], v2, off ; GFX12-PAL-NEXT:s_nop 0 @@ -5159,7 +5159,7 @@ define amdgpu_gs void @sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset(ptr a ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT:v_mov_b32_e32 v1, 15 ; GFX12-NEXT:s_add_co_i32 s0, s0, s1 -; GFX12-NEXT:scratch_store_b32 v0, v1, s0 offset:-24 scope:SCOPE_SYS +; GFX12-NEXT:scratch_store_b32 v0, v1, s0 offset:65512 scope:SCOPE_SYS ; GFX12-NEXT:s_wait_storecnt 0x0 ; GFX12-NEXT:s_endpgm ; @@ -5221,7 +5221,7 @@ define amdgpu_gs void @sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset(ptr a ; GFX12-PAL: ; %bb.0: ; %bb ; GFX12-PAL-NEXT:v_mov_b32_e32 v1, 15 ; GFX12-PAL-NEXT:s_add_co_i32 s0, s0, s1 -; GFX12-PAL-NEXT:scratch_store_b32 v0, v1, s0 offset:-24 scope:SCOPE_SYS +; GFX12-PAL-NEXT:scratch_store_b32 v0, v1, s0 offset:65512 scope:SCOPE_SYS ; GFX12-PAL-NEXT:s_wait_storecnt 0x0 ; GFX12-PAL-NEXT:s_endpgm bb: ___ llvm-branch-commits mailing list llvm-bran
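The test churn above (offset:-24 becoming offset:65512) is exactly a 16-bit reinterpretation: 65512 is 0xFFE8, which sign-extends to -24 when the offset is carried as an i16 target constant. A minimal demonstration of the arithmetic (behavior shown is that of the usual two's-complement targets):

#include <cassert>
#include <cstdint>

int main() {
  uint32_t COffsetVal = 65512;         // the intended large unsigned offset
  int16_t AsI16 = (int16_t)COffsetVal; // what MVT::i16 turns it into
  int32_t AsI32 = (int32_t)COffsetVal; // MVT::i32 keeps the value intact
  assert(AsI16 == -24);   // 0xFFE8 read as a signed 16-bit value
  assert(AsI32 == 65512); // the offset the patch makes the selector emit
}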
[llvm-branch-commits] [llvm] AMDGPU: Fix inst-selection of large scratch offsets with sgpr base (PR #110256)
@@ -1911,7 +1911,7 @@ bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr, 0); } - Offset = CurDAG->getTargetConstant(COffsetVal, DL, MVT::i16); + Offset = CurDAG->getTargetConstant(COffsetVal, DL, MVT::i32); petar-avramovic wrote: fixed and added tests for those cases https://github.com/llvm/llvm-project/pull/110256 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU: Fix inst-selection of large scratch offsets with sgpr base (PR #110256)
petar-avramovic wrote:

> [!WARNING]
> This pull request is not mergeable via GitHub because a downstack PR is open. Once all requirements are satisfied, merge this PR as a stack on Graphite (https://app.graphite.dev/github/pr/llvm/llvm-project/110256). Learn more: https://graphite.dev/docs/merge-pull-requests

* **#110256** 👈 (this PR)
* **#110255**
* `main`

This stack of pull requests is managed by Graphite. Learn more about stacking: https://stacking.dev/

https://github.com/llvm/llvm-project/pull/110256
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU: Fix inst-selection of large scratch offsets with sgpr base (PR #110256)
https://github.com/petar-avramovic created https://github.com/llvm/llvm-project/pull/110256 Use i32 for offset instead of i16, this way it does not get interpreted as negative 16 bit offset. >From dcec93029eb0126761ed7521511294b9237591db Mon Sep 17 00:00:00 2001 From: Petar Avramovic Date: Fri, 27 Sep 2024 13:59:31 +0200 Subject: [PATCH] AMDGPU: Fix inst-selection of large scratch offsets with sgpr base Use i32 for offset instead of i16, this way it does not get interpreted as negative 16 bit offset. --- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 2 +- llvm/test/CodeGen/AMDGPU/flat-scratch.ll | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index d3d5bc924525fc..48971a6840c779 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -1911,7 +1911,7 @@ bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr, 0); } - Offset = CurDAG->getTargetConstant(COffsetVal, DL, MVT::i16); + Offset = CurDAG->getTargetConstant(COffsetVal, DL, MVT::i32); return true; } diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll index 667a8a38c62ecc..496ac80a3dfbcf 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll @@ -4926,7 +4926,7 @@ define amdgpu_gs void @sgpr_base_large_offset(ptr addrspace(1) %out, ptr addrspa ; ; GFX12-LABEL: sgpr_base_large_offset: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT:scratch_load_b32 v2, off, s0 offset:-24 +; GFX12-NEXT:scratch_load_b32 v2, off, s0 offset:65512 ; GFX12-NEXT:s_wait_loadcnt 0x0 ; GFX12-NEXT:global_store_b32 v[0:1], v2, off ; GFX12-NEXT:s_nop 0 @@ -4985,7 +4985,7 @@ define amdgpu_gs void @sgpr_base_large_offset(ptr addrspace(1) %out, ptr addrspa ; ; GFX12-PAL-LABEL: sgpr_base_large_offset: ; GFX12-PAL: ; %bb.0: ; %entry -; GFX12-PAL-NEXT:scratch_load_b32 v2, off, s0 offset:-24 +; GFX12-PAL-NEXT:scratch_load_b32 v2, off, s0 offset:65512 ; GFX12-PAL-NEXT:s_wait_loadcnt 0x0 ; GFX12-PAL-NEXT:global_store_b32 v[0:1], v2, off ; GFX12-PAL-NEXT:s_nop 0 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU: Fix inst-selection of large scratch offsets with sgpr base (PR #110256)
https://github.com/petar-avramovic ready_for_review https://github.com/llvm/llvm-project/pull/110256 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] MachineUniformityAnalysis: Improve isConstantOrUndefValuePhi (PR #112866)
https://github.com/petar-avramovic updated https://github.com/llvm/llvm-project/pull/112866 >From befab474546b9f44fa2b5d7961df8c0490aed692 Mon Sep 17 00:00:00 2001 From: Petar Avramovic Date: Thu, 31 Oct 2024 14:10:57 +0100 Subject: [PATCH] MachineUniformityAnalysis: Improve isConstantOrUndefValuePhi Change existing code for G_PHI to match what LLVM-IR version is doing via PHINode::hasConstantOrUndefValue. This is not safe for regular PHI since it may appear with an undef operand and getVRegDef can fail. Most notably this improves number of values that can be allocated to sgpr register bank in AMDGPURegBankSelect. Common case here are phis that appear in structurize-cfg lowering for cycles with multiple exits: Undef incoming value is coming from block that reached cycle exit condition, if other incoming is uniform keep the phi uniform despite the fact it is joining values from pair of blocks that are entered via divergent condition branch. --- llvm/lib/CodeGen/MachineSSAContext.cpp| 27 +- .../AMDGPU/MIR/hidden-diverge-gmir.mir| 28 +++ .../AMDGPU/MIR/hidden-loop-diverge.mir| 4 +- .../AMDGPU/MIR/uses-value-from-cycle.mir | 8 +- .../GlobalISel/divergence-structurizer.mir| 80 -- .../regbankselect-mui-regbanklegalize.mir | 69 --- .../regbankselect-mui-regbankselect.mir | 18 ++-- .../AMDGPU/GlobalISel/regbankselect-mui.ll| 84 ++- .../AMDGPU/GlobalISel/regbankselect-mui.mir | 51 ++- 9 files changed, 191 insertions(+), 178 deletions(-) diff --git a/llvm/lib/CodeGen/MachineSSAContext.cpp b/llvm/lib/CodeGen/MachineSSAContext.cpp index e384187b6e8593..8e13c0916dd9e1 100644 --- a/llvm/lib/CodeGen/MachineSSAContext.cpp +++ b/llvm/lib/CodeGen/MachineSSAContext.cpp @@ -54,9 +54,34 @@ const MachineBasicBlock *MachineSSAContext::getDefBlock(Register value) const { return F->getRegInfo().getVRegDef(value)->getParent(); } +static bool isUndef(const MachineInstr &MI) { + return MI.getOpcode() == TargetOpcode::G_IMPLICIT_DEF || + MI.getOpcode() == TargetOpcode::IMPLICIT_DEF; +} + +/// MachineInstr equivalent of PHINode::hasConstantOrUndefValue() for G_PHI. template <> bool MachineSSAContext::isConstantOrUndefValuePhi(const MachineInstr &Phi) { - return Phi.isConstantValuePHI(); + if (!Phi.isPHI()) +return false; + + // In later passes PHI may appear with an undef operand, getVRegDef can fail. + if (Phi.getOpcode() == TargetOpcode::PHI) +return Phi.isConstantValuePHI(); + + // For G_PHI we do equivalent of PHINode::hasConstantOrUndefValue(). 
+ const MachineRegisterInfo &MRI = Phi.getMF()->getRegInfo(); + Register This = Phi.getOperand(0).getReg(); + Register ConstantValue; + for (unsigned i = 1, e = Phi.getNumOperands(); i < e; i += 2) { +Register Incoming = Phi.getOperand(i).getReg(); +if (Incoming != This && !isUndef(*MRI.getVRegDef(Incoming))) { + if (ConstantValue && ConstantValue != Incoming) +return false; + ConstantValue = Incoming; +} + } + return true; } template <> diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir index ce00edf3363f77..9694a340b5e906 100644 --- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir +++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir @@ -1,24 +1,24 @@ # RUN: llc -mtriple=amdgcn-- -run-pass=print-machine-uniformity -o - %s 2>&1 | FileCheck %s # CHECK-LABEL: MachineUniformityInfo for function: hidden_diverge # CHECK-LABEL: BLOCK bb.0 -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.x) -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_ICMP intpred(slt) -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_XOR %{{[0-9]*}}:_, %{{[0-9]*}}:_ -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if) -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if) -# CHECK: DIVERGENT: G_BRCOND %{{[0-9]*}}:_(s1), %bb.1 -# CHECK: DIVERGENT: G_BR %bb.2 +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.x) +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_ICMP intpred(slt) +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_XOR %{{[0-9]*}}:_, %{{[0-9]*}}:_ +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if) +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if) +# CHECK: DIVERGENT: G_BRCOND %{{[0-9]*}}:_(s1), %bb.1 +# CHECK: DIVERGENT: G_BR %bb.2 # CHECK-LABEL: BLOCK bb.1 # CHECK-LABEL: BLOCK bb.2 -# CHECK: D
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RegBankLegalize rules for load (PR #112882)
https://github.com/petar-avramovic updated https://github.com/llvm/llvm-project/pull/112882 >From 75694f85585d7b07c17d68f32632310ba1d939a9 Mon Sep 17 00:00:00 2001 From: Petar Avramovic Date: Wed, 30 Oct 2024 15:37:59 +0100 Subject: [PATCH] AMDGPU/GlobalISel: RegBankLegalize rules for load Add IDs for bit width that cover multiple LLTs: B32 B64 etc. "Predicate" wrapper class for bool predicate functions used to write pretty rules. Predicates can be combined using &&, || and !. Lowering for splitting and widening loads. Write rules for loads to not change existing mir tests from old regbankselect. --- .../AMDGPU/AMDGPURegBankLegalizeHelper.cpp| 288 +++- .../AMDGPU/AMDGPURegBankLegalizeHelper.h | 5 + .../AMDGPU/AMDGPURegBankLegalizeRules.cpp | 278 ++- .../AMDGPU/AMDGPURegBankLegalizeRules.h | 65 +++- .../AMDGPU/GlobalISel/regbankselect-load.mir | 320 +++--- .../GlobalISel/regbankselect-zextload.mir | 9 +- 6 files changed, 900 insertions(+), 65 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp index 916140e2bbcd68..7d3ecde1dc714c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp @@ -38,6 +38,83 @@ void RegBankLegalizeHelper::findRuleAndApplyMapping(MachineInstr &MI) { lower(MI, Mapping, WaterfallSgprs); } +void RegBankLegalizeHelper::splitLoad(MachineInstr &MI, + ArrayRef LLTBreakdown, LLT MergeTy) { + MachineFunction &MF = B.getMF(); + assert(MI.getNumMemOperands() == 1); + MachineMemOperand &BaseMMO = **MI.memoperands_begin(); + Register Dst = MI.getOperand(0).getReg(); + const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst); + Register Base = MI.getOperand(1).getReg(); + LLT PtrTy = MRI.getType(Base); + const RegisterBank *PtrRB = MRI.getRegBankOrNull(Base); + LLT OffsetTy = LLT::scalar(PtrTy.getSizeInBits()); + SmallVector LoadPartRegs; + + unsigned ByteOffset = 0; + for (LLT PartTy : LLTBreakdown) { +Register BasePlusOffset; +if (ByteOffset == 0) { + BasePlusOffset = Base; +} else { + auto Offset = B.buildConstant({PtrRB, OffsetTy}, ByteOffset); + BasePlusOffset = B.buildPtrAdd({PtrRB, PtrTy}, Base, Offset).getReg(0); +} +auto *OffsetMMO = MF.getMachineMemOperand(&BaseMMO, ByteOffset, PartTy); +auto LoadPart = B.buildLoad({DstRB, PartTy}, BasePlusOffset, *OffsetMMO); +LoadPartRegs.push_back(LoadPart.getReg(0)); +ByteOffset += PartTy.getSizeInBytes(); + } + + if (!MergeTy.isValid()) { +// Loads are of same size, concat or merge them together. +B.buildMergeLikeInstr(Dst, LoadPartRegs); + } else { +// Loads are not all of same size, need to unmerge them to smaller pieces +// of MergeTy type, then merge pieces to Dst. 
+SmallVector MergeTyParts; +for (Register Reg : LoadPartRegs) { + if (MRI.getType(Reg) == MergeTy) { +MergeTyParts.push_back(Reg); + } else { +auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, Reg); +for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i) + MergeTyParts.push_back(Unmerge.getReg(i)); + } +} +B.buildMergeLikeInstr(Dst, MergeTyParts); + } + MI.eraseFromParent(); +} + +void RegBankLegalizeHelper::widenLoad(MachineInstr &MI, LLT WideTy, + LLT MergeTy) { + MachineFunction &MF = B.getMF(); + assert(MI.getNumMemOperands() == 1); + MachineMemOperand &BaseMMO = **MI.memoperands_begin(); + Register Dst = MI.getOperand(0).getReg(); + const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst); + Register Base = MI.getOperand(1).getReg(); + + MachineMemOperand *WideMMO = MF.getMachineMemOperand(&BaseMMO, 0, WideTy); + auto WideLoad = B.buildLoad({DstRB, WideTy}, Base, *WideMMO); + + if (WideTy.isScalar()) { +B.buildTrunc(Dst, WideLoad); + } else { +SmallVector MergeTyParts; +auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, WideLoad); + +LLT DstTy = MRI.getType(Dst); +unsigned NumElts = DstTy.getSizeInBits() / MergeTy.getSizeInBits(); +for (unsigned i = 0; i < NumElts; ++i) { + MergeTyParts.push_back(Unmerge.getReg(i)); +} +B.buildMergeLikeInstr(Dst, MergeTyParts); + } + MI.eraseFromParent(); +} + void RegBankLegalizeHelper::lower(MachineInstr &MI, const RegBankLLTMapping &Mapping, SmallSet &WaterfallSgprs) { @@ -116,6 +193,54 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI, MI.eraseFromParent(); break; } + case SplitLoad: { +LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); +unsigned Size = DstTy.getSizeInBits(); +// Even split to 128-bit loads +if (Size > 128) { + LLT B128; + if (DstTy.isVector()) { +LLT EltTy = DstTy.getElementType(); +B128 = LLT::f
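widenLoad, visible in the same hunk, is the complementary lowering: load a wider type once, then either truncate (scalar destination) or unmerge and keep only the pieces that cover the destination (vector destination). A sketch of the piece count it computes, with illustrative sizes only:

#include <cstdio>

int main() {
  // Vector path, e.g. a 96-bit destination widened to a 128-bit load with
  // MergeTy = s32: NumElts = 96 / 32 = 3, so three of the four unmerged
  // pieces are merged into the dst and the extra loaded piece is dropped.
  unsigned DstBits = 96, MergeTyBits = 32;
  unsigned NumElts = DstBits / MergeTyBits;
  std::printf("keep %u of %u pieces\n", NumElts, 128 / MergeTyBits);
  // Scalar path: an s96 dst from an s128 wide load is simply G_TRUNC'd.
}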
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RegBankLegalize rules for load (PR #112882)
@@ -290,7 +504,86 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, .Any({{UniS64, S32}, {{Sgpr64}, {Sgpr32}, Ext32To64}}) .Any({{DivS64, S32}, {{Vgpr64}, {Vgpr32}, Ext32To64}}); - addRulesForGOpcs({G_LOAD}).Any({{DivS32, DivP1}, {{Vgpr32}, {VgprP1}}}); + bool hasUnAlignedLoads = ST->getGeneration() >= AMDGPUSubtarget::GFX12; + bool hasSMRDSmall = ST->hasScalarSubwordLoads(); + + Predicate isAlign16([](const MachineInstr &MI) -> bool { +return (*MI.memoperands_begin())->getAlign() >= Align(16); + }); + + Predicate isAlign4([](const MachineInstr &MI) -> bool { +return (*MI.memoperands_begin())->getAlign() >= Align(4); + }); + + Predicate isAtomicMMO([](const MachineInstr &MI) -> bool { +return (*MI.memoperands_begin())->isAtomic(); + }); + + Predicate isUniMMO([](const MachineInstr &MI) -> bool { +return AMDGPUInstrInfo::isUniformMMO(*MI.memoperands_begin()); + }); + + Predicate isConst([](const MachineInstr &MI) -> bool { +// Address space in MMO be different then address space on pointer. +const MachineMemOperand *MMO = *MI.memoperands_begin(); +const unsigned AS = MMO->getAddrSpace(); +return AS == AMDGPUAS::CONSTANT_ADDRESS || + AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT; + }); + + Predicate isVolatileMMO([](const MachineInstr &MI) -> bool { +return (*MI.memoperands_begin())->isVolatile(); + }); + + Predicate isInvMMO([](const MachineInstr &MI) -> bool { +return (*MI.memoperands_begin())->isInvariant(); + }); + + Predicate isNoClobberMMO([](const MachineInstr &MI) -> bool { +return (*MI.memoperands_begin())->getFlags() & MONoClobber; + }); + + Predicate isNaturalAlignedSmall([](const MachineInstr &MI) -> bool { +const MachineMemOperand *MMO = *MI.memoperands_begin(); +const unsigned MemSize = 8 * MMO->getSize().getValue(); +return (MemSize == 16 && MMO->getAlign() >= Align(2)) || + (MemSize == 8 && MMO->getAlign() >= Align(1)); + }); + + auto isUL = !isAtomicMMO && isUniMMO && (isConst || !isVolatileMMO) && + (isConst || isInvMMO || isNoClobberMMO); petar-avramovic wrote: It is copied from current implementation in AMDGPURegisterBankInfo::isScalarLoadLegal isConst checks for address space in MMO, which can be different that address space of pointer operand https://github.com/llvm/llvm-project/pull/112882 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
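Restating the isUL combination being reviewed as a single boolean function may help; this is a sketch over a simplified memory-operand summary whose field names are invented for the example. In the patch these facts come from the MachineMemOperand, and, as the reply notes, the address space is read from the MMO, which can differ from the pointer operand's.

// Sketch of the isUL rule (mirrors AMDGPURegisterBankInfo::isScalarLoadLegal).
struct MMOSummary {
  bool IsAtomic, IsUniform, IsConstAddrSpace, IsVolatile, IsInvariant,
      IsNoClobber;
};

static bool isUniformScalarLoadLegal(const MMOSummary &M) {
  return !M.IsAtomic && M.IsUniform &&
         (M.IsConstAddrSpace || !M.IsVolatile) &&
         (M.IsConstAddrSpace || M.IsInvariant || M.IsNoClobber);
}

int main() {
  MMOSummary ConstLoad{false, true, true, false, false, false};
  return isUniformScalarLoadLegal(ConstLoad) ? 0 : 1;
}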
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RegBankLegalize rules for load (PR #112882)
https://github.com/petar-avramovic updated https://github.com/llvm/llvm-project/pull/112882 >From b7366209b93a07f286842f31bb625ca321b47df4 Mon Sep 17 00:00:00 2001 From: Petar Avramovic Date: Wed, 30 Oct 2024 15:37:59 +0100 Subject: [PATCH] AMDGPU/GlobalISel: RegBankLegalize rules for load Add IDs for bit width that cover multiple LLTs: B32 B64 etc. "Predicate" wrapper class for bool predicate functions used to write pretty rules. Predicates can be combined using &&, || and !. Lowering for splitting and widening loads. Write rules for loads to not change existing mir tests from old regbankselect. --- .../AMDGPU/AMDGPURegBankLegalizeHelper.cpp| 284 +++- .../AMDGPU/AMDGPURegBankLegalizeHelper.h | 5 + .../AMDGPU/AMDGPURegBankLegalizeRules.cpp | 309 - .../AMDGPU/AMDGPURegBankLegalizeRules.h | 65 +++- .../AMDGPU/GlobalISel/regbankselect-load.mir | 320 +++--- .../GlobalISel/regbankselect-zextload.mir | 9 +- 6 files changed, 927 insertions(+), 65 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp index 916140e2bbcd68..5c4195cb15fb2c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp @@ -38,6 +38,83 @@ void RegBankLegalizeHelper::findRuleAndApplyMapping(MachineInstr &MI) { lower(MI, Mapping, WaterfallSgprs); } +void RegBankLegalizeHelper::splitLoad(MachineInstr &MI, + ArrayRef LLTBreakdown, LLT MergeTy) { + MachineFunction &MF = B.getMF(); + assert(MI.getNumMemOperands() == 1); + MachineMemOperand &BaseMMO = **MI.memoperands_begin(); + Register Dst = MI.getOperand(0).getReg(); + const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst); + Register Base = MI.getOperand(1).getReg(); + LLT PtrTy = MRI.getType(Base); + const RegisterBank *PtrRB = MRI.getRegBankOrNull(Base); + LLT OffsetTy = LLT::scalar(PtrTy.getSizeInBits()); + SmallVector LoadPartRegs; + + unsigned ByteOffset = 0; + for (LLT PartTy : LLTBreakdown) { +Register BasePlusOffset; +if (ByteOffset == 0) { + BasePlusOffset = Base; +} else { + auto Offset = B.buildConstant({PtrRB, OffsetTy}, ByteOffset); + BasePlusOffset = B.buildPtrAdd({PtrRB, PtrTy}, Base, Offset).getReg(0); +} +auto *OffsetMMO = MF.getMachineMemOperand(&BaseMMO, ByteOffset, PartTy); +auto LoadPart = B.buildLoad({DstRB, PartTy}, BasePlusOffset, *OffsetMMO); +LoadPartRegs.push_back(LoadPart.getReg(0)); +ByteOffset += PartTy.getSizeInBytes(); + } + + if (!MergeTy.isValid()) { +// Loads are of same size, concat or merge them together. +B.buildMergeLikeInstr(Dst, LoadPartRegs); + } else { +// Loads are not all of same size, need to unmerge them to smaller pieces +// of MergeTy type, then merge pieces to Dst. 
+SmallVector MergeTyParts; +for (Register Reg : LoadPartRegs) { + if (MRI.getType(Reg) == MergeTy) { +MergeTyParts.push_back(Reg); + } else { +auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, Reg); +for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i) + MergeTyParts.push_back(Unmerge.getReg(i)); + } +} +B.buildMergeLikeInstr(Dst, MergeTyParts); + } + MI.eraseFromParent(); +} + +void RegBankLegalizeHelper::widenLoad(MachineInstr &MI, LLT WideTy, + LLT MergeTy) { + MachineFunction &MF = B.getMF(); + assert(MI.getNumMemOperands() == 1); + MachineMemOperand &BaseMMO = **MI.memoperands_begin(); + Register Dst = MI.getOperand(0).getReg(); + const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst); + Register Base = MI.getOperand(1).getReg(); + + MachineMemOperand *WideMMO = MF.getMachineMemOperand(&BaseMMO, 0, WideTy); + auto WideLoad = B.buildLoad({DstRB, WideTy}, Base, *WideMMO); + + if (WideTy.isScalar()) { +B.buildTrunc(Dst, WideLoad); + } else { +SmallVector MergeTyParts; +auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, WideLoad); + +LLT DstTy = MRI.getType(Dst); +unsigned NumElts = DstTy.getSizeInBits() / MergeTy.getSizeInBits(); +for (unsigned i = 0; i < NumElts; ++i) { + MergeTyParts.push_back(Unmerge.getReg(i)); +} +B.buildMergeLikeInstr(Dst, MergeTyParts); + } + MI.eraseFromParent(); +} + void RegBankLegalizeHelper::lower(MachineInstr &MI, const RegBankLLTMapping &Mapping, SmallSet &WaterfallSgprs) { @@ -116,6 +193,50 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI, MI.eraseFromParent(); break; } + case SplitLoad: { +LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); +unsigned Size = DstTy.getSizeInBits(); +// Even split to 128-bit loads +if (Size > 128) { + LLT B128; + if (DstTy.isVector()) { +LLT EltTy = DstTy.getElementType(); +B128 = LLT:
[llvm-branch-commits] [llvm] MachineUniformityAnalysis: Improve isConstantOrUndefValuePhi (PR #112866)
https://github.com/petar-avramovic updated https://github.com/llvm/llvm-project/pull/112866 >From 686c0699e6653c1a11e7e911ccf4de107d390066 Mon Sep 17 00:00:00 2001 From: Petar Avramovic Date: Thu, 31 Oct 2024 14:10:57 +0100 Subject: [PATCH] MachineUniformityAnalysis: Improve isConstantOrUndefValuePhi Change existing code for G_PHI to match what LLVM-IR version is doing via PHINode::hasConstantOrUndefValue. This is not safe for regular PHI since it may appear with an undef operand and getVRegDef can fail. Most notably this improves number of values that can be allocated to sgpr register bank in AMDGPURegBankSelect. Common case here are phis that appear in structurize-cfg lowering for cycles with multiple exits: Undef incoming value is coming from block that reached cycle exit condition, if other incoming is uniform keep the phi uniform despite the fact it is joining values from pair of blocks that are entered via divergent condition branch. --- llvm/lib/CodeGen/MachineSSAContext.cpp| 27 +- .../AMDGPU/MIR/hidden-diverge-gmir.mir| 28 +++ .../AMDGPU/MIR/hidden-loop-diverge.mir| 4 +- .../AMDGPU/MIR/uses-value-from-cycle.mir | 8 +- .../GlobalISel/divergence-structurizer.mir| 80 -- .../regbankselect-mui-regbanklegalize.mir | 69 --- .../regbankselect-mui-regbankselect.mir | 18 ++-- .../AMDGPU/GlobalISel/regbankselect-mui.ll| 84 ++- .../AMDGPU/GlobalISel/regbankselect-mui.mir | 51 ++- 9 files changed, 191 insertions(+), 178 deletions(-) diff --git a/llvm/lib/CodeGen/MachineSSAContext.cpp b/llvm/lib/CodeGen/MachineSSAContext.cpp index e384187b6e8593..8e13c0916dd9e1 100644 --- a/llvm/lib/CodeGen/MachineSSAContext.cpp +++ b/llvm/lib/CodeGen/MachineSSAContext.cpp @@ -54,9 +54,34 @@ const MachineBasicBlock *MachineSSAContext::getDefBlock(Register value) const { return F->getRegInfo().getVRegDef(value)->getParent(); } +static bool isUndef(const MachineInstr &MI) { + return MI.getOpcode() == TargetOpcode::G_IMPLICIT_DEF || + MI.getOpcode() == TargetOpcode::IMPLICIT_DEF; +} + +/// MachineInstr equivalent of PHINode::hasConstantOrUndefValue() for G_PHI. template <> bool MachineSSAContext::isConstantOrUndefValuePhi(const MachineInstr &Phi) { - return Phi.isConstantValuePHI(); + if (!Phi.isPHI()) +return false; + + // In later passes PHI may appear with an undef operand, getVRegDef can fail. + if (Phi.getOpcode() == TargetOpcode::PHI) +return Phi.isConstantValuePHI(); + + // For G_PHI we do equivalent of PHINode::hasConstantOrUndefValue(). 
+ const MachineRegisterInfo &MRI = Phi.getMF()->getRegInfo(); + Register This = Phi.getOperand(0).getReg(); + Register ConstantValue; + for (unsigned i = 1, e = Phi.getNumOperands(); i < e; i += 2) { +Register Incoming = Phi.getOperand(i).getReg(); +if (Incoming != This && !isUndef(*MRI.getVRegDef(Incoming))) { + if (ConstantValue && ConstantValue != Incoming) +return false; + ConstantValue = Incoming; +} + } + return true; } template <> diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir index ce00edf3363f77..9694a340b5e906 100644 --- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir +++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir @@ -1,24 +1,24 @@ # RUN: llc -mtriple=amdgcn-- -run-pass=print-machine-uniformity -o - %s 2>&1 | FileCheck %s # CHECK-LABEL: MachineUniformityInfo for function: hidden_diverge # CHECK-LABEL: BLOCK bb.0 -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.x) -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_ICMP intpred(slt) -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_XOR %{{[0-9]*}}:_, %{{[0-9]*}}:_ -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if) -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if) -# CHECK: DIVERGENT: G_BRCOND %{{[0-9]*}}:_(s1), %bb.1 -# CHECK: DIVERGENT: G_BR %bb.2 +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.x) +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_ICMP intpred(slt) +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_XOR %{{[0-9]*}}:_, %{{[0-9]*}}:_ +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if) +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if) +# CHECK: DIVERGENT: G_BRCOND %{{[0-9]*}}:_(s1), %bb.1 +# CHECK: DIVERGENT: G_BR %bb.2 # CHECK-LABEL: BLOCK bb.1 # CHECK-LABEL: BLOCK bb.2 -# CHECK: D
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: AMDGPURegBankSelect (PR #112863)
petar-avramovic wrote: ping https://github.com/llvm/llvm-project/pull/112863 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RegBankLegalize rules for load (PR #112882)
petar-avramovic wrote: ping https://github.com/llvm/llvm-project/pull/112882 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] MachineUniformityAnalysis: Improve isConstantOrUndefValuePhi (PR #112866)
https://github.com/petar-avramovic updated https://github.com/llvm/llvm-project/pull/112866 >From 97ce5f3295ed0f795656aed9180901c2299159f8 Mon Sep 17 00:00:00 2001 From: Petar Avramovic Date: Thu, 31 Oct 2024 14:10:57 +0100 Subject: [PATCH] MachineUniformityAnalysis: Improve isConstantOrUndefValuePhi Change existing code for G_PHI to match what LLVM-IR version is doing via PHINode::hasConstantOrUndefValue. This is not safe for regular PHI since it may appear with an undef operand and getVRegDef can fail. Most notably this improves number of values that can be allocated to sgpr register bank in AMDGPURegBankSelect. Common case here are phis that appear in structurize-cfg lowering for cycles with multiple exits: Undef incoming value is coming from block that reached cycle exit condition, if other incoming is uniform keep the phi uniform despite the fact it is joining values from pair of blocks that are entered via divergent condition branch. --- llvm/lib/CodeGen/MachineSSAContext.cpp| 27 +- .../AMDGPU/MIR/hidden-diverge-gmir.mir| 28 +++ .../AMDGPU/MIR/hidden-loop-diverge.mir| 4 +- .../AMDGPU/MIR/uses-value-from-cycle.mir | 8 +- .../GlobalISel/divergence-structurizer.mir| 80 -- .../regbankselect-mui-regbanklegalize.mir | 69 --- .../regbankselect-mui-regbankselect.mir | 18 ++-- .../AMDGPU/GlobalISel/regbankselect-mui.ll| 84 ++- .../AMDGPU/GlobalISel/regbankselect-mui.mir | 51 ++- 9 files changed, 191 insertions(+), 178 deletions(-) diff --git a/llvm/lib/CodeGen/MachineSSAContext.cpp b/llvm/lib/CodeGen/MachineSSAContext.cpp index e384187b6e8593..8e13c0916dd9e1 100644 --- a/llvm/lib/CodeGen/MachineSSAContext.cpp +++ b/llvm/lib/CodeGen/MachineSSAContext.cpp @@ -54,9 +54,34 @@ const MachineBasicBlock *MachineSSAContext::getDefBlock(Register value) const { return F->getRegInfo().getVRegDef(value)->getParent(); } +static bool isUndef(const MachineInstr &MI) { + return MI.getOpcode() == TargetOpcode::G_IMPLICIT_DEF || + MI.getOpcode() == TargetOpcode::IMPLICIT_DEF; +} + +/// MachineInstr equivalent of PHINode::hasConstantOrUndefValue() for G_PHI. template <> bool MachineSSAContext::isConstantOrUndefValuePhi(const MachineInstr &Phi) { - return Phi.isConstantValuePHI(); + if (!Phi.isPHI()) +return false; + + // In later passes PHI may appear with an undef operand, getVRegDef can fail. + if (Phi.getOpcode() == TargetOpcode::PHI) +return Phi.isConstantValuePHI(); + + // For G_PHI we do equivalent of PHINode::hasConstantOrUndefValue(). 
+ const MachineRegisterInfo &MRI = Phi.getMF()->getRegInfo(); + Register This = Phi.getOperand(0).getReg(); + Register ConstantValue; + for (unsigned i = 1, e = Phi.getNumOperands(); i < e; i += 2) { +Register Incoming = Phi.getOperand(i).getReg(); +if (Incoming != This && !isUndef(*MRI.getVRegDef(Incoming))) { + if (ConstantValue && ConstantValue != Incoming) +return false; + ConstantValue = Incoming; +} + } + return true; } template <> diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir index ce00edf3363f77..9694a340b5e906 100644 --- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir +++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir @@ -1,24 +1,24 @@ # RUN: llc -mtriple=amdgcn-- -run-pass=print-machine-uniformity -o - %s 2>&1 | FileCheck %s # CHECK-LABEL: MachineUniformityInfo for function: hidden_diverge # CHECK-LABEL: BLOCK bb.0 -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.x) -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_ICMP intpred(slt) -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_XOR %{{[0-9]*}}:_, %{{[0-9]*}}:_ -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if) -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if) -# CHECK: DIVERGENT: G_BRCOND %{{[0-9]*}}:_(s1), %bb.1 -# CHECK: DIVERGENT: G_BR %bb.2 +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.x) +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_ICMP intpred(slt) +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_XOR %{{[0-9]*}}:_, %{{[0-9]*}}:_ +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if) +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if) +# CHECK: DIVERGENT: G_BRCOND %{{[0-9]*}}:_(s1), %bb.1 +# CHECK: DIVERGENT: G_BR %bb.2 # CHECK-LABEL: BLOCK bb.1 # CHECK-LABEL: BLOCK bb.2 -# CHECK: D
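To make the commit message above concrete, here is a hypothetical GMIR fragment (block and register numbers invented for illustration) of the structurize-cfg pattern it describes, where one incoming value is undef from the block that reached the cycle exit condition and the other incoming value is uniform:

bb.2:
  ; Block that reached the cycle exit condition.
  %4:_(s32) = G_IMPLICIT_DEF
  G_BR %bb.3

bb.3:
  ; %2 from bb.1 is uniform and %4 is undef, so the phi now counts as
  ; constant-or-undef and stays uniform even though bb.1 and bb.2 are
  ; entered via a divergent conditional branch.
  %5:_(s32) = G_PHI %2(s32), %bb.1, %4(s32), %bb.2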
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RegBankLegalize rules for load (PR #112882)
https://github.com/petar-avramovic updated https://github.com/llvm/llvm-project/pull/112882 >From 59e70ef3cb6b1e9183691782b5675a376add3fbd Mon Sep 17 00:00:00 2001 From: Petar Avramovic Date: Wed, 30 Oct 2024 15:37:59 +0100 Subject: [PATCH] AMDGPU/GlobalISel: RegBankLegalize rules for load Add IDs for bit width that cover multiple LLTs: B32 B64 etc. "Predicate" wrapper class for bool predicate functions used to write pretty rules. Predicates can be combined using &&, || and !. Lowering for splitting and widening loads. Write rules for loads to not change existing mir tests from old regbankselect. --- .../AMDGPU/AMDGPURegBankLegalizeHelper.cpp| 284 +++- .../AMDGPU/AMDGPURegBankLegalizeHelper.h | 5 + .../AMDGPU/AMDGPURegBankLegalizeRules.cpp | 309 - .../AMDGPU/AMDGPURegBankLegalizeRules.h | 65 +++- .../AMDGPU/GlobalISel/regbankselect-load.mir | 320 +++--- .../GlobalISel/regbankselect-zextload.mir | 9 +- 6 files changed, 927 insertions(+), 65 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp index 916140e2bbcd68..5c4195cb15fb2c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp @@ -38,6 +38,83 @@ void RegBankLegalizeHelper::findRuleAndApplyMapping(MachineInstr &MI) { lower(MI, Mapping, WaterfallSgprs); } +void RegBankLegalizeHelper::splitLoad(MachineInstr &MI, + ArrayRef LLTBreakdown, LLT MergeTy) { + MachineFunction &MF = B.getMF(); + assert(MI.getNumMemOperands() == 1); + MachineMemOperand &BaseMMO = **MI.memoperands_begin(); + Register Dst = MI.getOperand(0).getReg(); + const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst); + Register Base = MI.getOperand(1).getReg(); + LLT PtrTy = MRI.getType(Base); + const RegisterBank *PtrRB = MRI.getRegBankOrNull(Base); + LLT OffsetTy = LLT::scalar(PtrTy.getSizeInBits()); + SmallVector LoadPartRegs; + + unsigned ByteOffset = 0; + for (LLT PartTy : LLTBreakdown) { +Register BasePlusOffset; +if (ByteOffset == 0) { + BasePlusOffset = Base; +} else { + auto Offset = B.buildConstant({PtrRB, OffsetTy}, ByteOffset); + BasePlusOffset = B.buildPtrAdd({PtrRB, PtrTy}, Base, Offset).getReg(0); +} +auto *OffsetMMO = MF.getMachineMemOperand(&BaseMMO, ByteOffset, PartTy); +auto LoadPart = B.buildLoad({DstRB, PartTy}, BasePlusOffset, *OffsetMMO); +LoadPartRegs.push_back(LoadPart.getReg(0)); +ByteOffset += PartTy.getSizeInBytes(); + } + + if (!MergeTy.isValid()) { +// Loads are of same size, concat or merge them together. +B.buildMergeLikeInstr(Dst, LoadPartRegs); + } else { +// Loads are not all of same size, need to unmerge them to smaller pieces +// of MergeTy type, then merge pieces to Dst. 
+SmallVector MergeTyParts; +for (Register Reg : LoadPartRegs) { + if (MRI.getType(Reg) == MergeTy) { +MergeTyParts.push_back(Reg); + } else { +auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, Reg); +for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i) + MergeTyParts.push_back(Unmerge.getReg(i)); + } +} +B.buildMergeLikeInstr(Dst, MergeTyParts); + } + MI.eraseFromParent(); +} + +void RegBankLegalizeHelper::widenLoad(MachineInstr &MI, LLT WideTy, + LLT MergeTy) { + MachineFunction &MF = B.getMF(); + assert(MI.getNumMemOperands() == 1); + MachineMemOperand &BaseMMO = **MI.memoperands_begin(); + Register Dst = MI.getOperand(0).getReg(); + const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst); + Register Base = MI.getOperand(1).getReg(); + + MachineMemOperand *WideMMO = MF.getMachineMemOperand(&BaseMMO, 0, WideTy); + auto WideLoad = B.buildLoad({DstRB, WideTy}, Base, *WideMMO); + + if (WideTy.isScalar()) { +B.buildTrunc(Dst, WideLoad); + } else { +SmallVector MergeTyParts; +auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, WideLoad); + +LLT DstTy = MRI.getType(Dst); +unsigned NumElts = DstTy.getSizeInBits() / MergeTy.getSizeInBits(); +for (unsigned i = 0; i < NumElts; ++i) { + MergeTyParts.push_back(Unmerge.getReg(i)); +} +B.buildMergeLikeInstr(Dst, MergeTyParts); + } + MI.eraseFromParent(); +} + void RegBankLegalizeHelper::lower(MachineInstr &MI, const RegBankLLTMapping &Mapping, SmallSet &WaterfallSgprs) { @@ -116,6 +193,50 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI, MI.eraseFromParent(); break; } + case SplitLoad: { +LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); +unsigned Size = DstTy.getSizeInBits(); +// Even split to 128-bit loads +if (Size > 128) { + LLT B128; + if (DstTy.isVector()) { +LLT EltTy = DstTy.getElementType(); +B128 = LLT:
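The "Predicate" wrapper named in the commit message is not visible in the quoted hunks; the following is only a sketch of the idea, assuming it wraps a std::function over MachineInstr (the in-tree class in AMDGPURegBankLegalizeRules.h may differ in details):

#include <functional>
// MachineInstr comes from llvm/CodeGen/MachineInstr.h.

class Predicate {
  std::function<bool(const MachineInstr &)> Pred;

public:
  Predicate(std::function<bool(const MachineInstr &)> P) : Pred(std::move(P)) {}

  bool operator()(const MachineInstr &MI) const { return Pred(MI); }

  // Each combining operator returns a new Predicate that evaluates both
  // operands, so rule tables can be written as P0 && (P1 || !P2).
  Predicate operator&&(const Predicate &RHS) const {
    Predicate LHS = *this;
    return Predicate(
        [LHS, RHS](const MachineInstr &MI) { return LHS(MI) && RHS(MI); });
  }

  Predicate operator||(const Predicate &RHS) const {
    Predicate LHS = *this;
    return Predicate(
        [LHS, RHS](const MachineInstr &MI) { return LHS(MI) || RHS(MI); });
  }

  Predicate operator!() const {
    Predicate Self = *this;
    return Predicate([Self](const MachineInstr &MI) { return !Self(MI); });
  }
};

A load rule can then combine small named checks, say isAlign4 && isUniformMMO (names invented here), instead of writing one monolithic predicate per rule.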
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: AMDGPURegBankLegalize (PR #112864)
petar-avramovic wrote: ping https://github.com/llvm/llvm-project/pull/112864
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RegBankLegalize rules for load (PR #112882)
https://github.com/petar-avramovic updated https://github.com/llvm/llvm-project/pull/112882 >From 3f80c887a75708c9cf88283fde991be7221c73d9 Mon Sep 17 00:00:00 2001 From: Petar Avramovic Date: Wed, 30 Oct 2024 15:37:59 +0100 Subject: [PATCH] AMDGPU/GlobalISel: RegBankLegalize rules for load Add IDs for bit width that cover multiple LLTs: B32 B64 etc. "Predicate" wrapper class for bool predicate functions used to write pretty rules. Predicates can be combined using &&, || and !. Lowering for splitting and widening loads. Write rules for loads to not change existing mir tests from old regbankselect. --- .../AMDGPU/AMDGPURegBankLegalizeHelper.cpp| 288 +++- .../AMDGPU/AMDGPURegBankLegalizeHelper.h | 5 + .../AMDGPU/AMDGPURegBankLegalizeRules.cpp | 278 ++- .../AMDGPU/AMDGPURegBankLegalizeRules.h | 65 +++- .../AMDGPU/GlobalISel/regbankselect-load.mir | 320 +++--- .../GlobalISel/regbankselect-zextload.mir | 9 +- 6 files changed, 900 insertions(+), 65 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp index 916140e2bbcd68..7d3ecde1dc714c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp @@ -38,6 +38,83 @@ void RegBankLegalizeHelper::findRuleAndApplyMapping(MachineInstr &MI) { lower(MI, Mapping, WaterfallSgprs); } +void RegBankLegalizeHelper::splitLoad(MachineInstr &MI, + ArrayRef LLTBreakdown, LLT MergeTy) { + MachineFunction &MF = B.getMF(); + assert(MI.getNumMemOperands() == 1); + MachineMemOperand &BaseMMO = **MI.memoperands_begin(); + Register Dst = MI.getOperand(0).getReg(); + const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst); + Register Base = MI.getOperand(1).getReg(); + LLT PtrTy = MRI.getType(Base); + const RegisterBank *PtrRB = MRI.getRegBankOrNull(Base); + LLT OffsetTy = LLT::scalar(PtrTy.getSizeInBits()); + SmallVector LoadPartRegs; + + unsigned ByteOffset = 0; + for (LLT PartTy : LLTBreakdown) { +Register BasePlusOffset; +if (ByteOffset == 0) { + BasePlusOffset = Base; +} else { + auto Offset = B.buildConstant({PtrRB, OffsetTy}, ByteOffset); + BasePlusOffset = B.buildPtrAdd({PtrRB, PtrTy}, Base, Offset).getReg(0); +} +auto *OffsetMMO = MF.getMachineMemOperand(&BaseMMO, ByteOffset, PartTy); +auto LoadPart = B.buildLoad({DstRB, PartTy}, BasePlusOffset, *OffsetMMO); +LoadPartRegs.push_back(LoadPart.getReg(0)); +ByteOffset += PartTy.getSizeInBytes(); + } + + if (!MergeTy.isValid()) { +// Loads are of same size, concat or merge them together. +B.buildMergeLikeInstr(Dst, LoadPartRegs); + } else { +// Loads are not all of same size, need to unmerge them to smaller pieces +// of MergeTy type, then merge pieces to Dst. 
+SmallVector MergeTyParts; +for (Register Reg : LoadPartRegs) { + if (MRI.getType(Reg) == MergeTy) { +MergeTyParts.push_back(Reg); + } else { +auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, Reg); +for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i) + MergeTyParts.push_back(Unmerge.getReg(i)); + } +} +B.buildMergeLikeInstr(Dst, MergeTyParts); + } + MI.eraseFromParent(); +} + +void RegBankLegalizeHelper::widenLoad(MachineInstr &MI, LLT WideTy, + LLT MergeTy) { + MachineFunction &MF = B.getMF(); + assert(MI.getNumMemOperands() == 1); + MachineMemOperand &BaseMMO = **MI.memoperands_begin(); + Register Dst = MI.getOperand(0).getReg(); + const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst); + Register Base = MI.getOperand(1).getReg(); + + MachineMemOperand *WideMMO = MF.getMachineMemOperand(&BaseMMO, 0, WideTy); + auto WideLoad = B.buildLoad({DstRB, WideTy}, Base, *WideMMO); + + if (WideTy.isScalar()) { +B.buildTrunc(Dst, WideLoad); + } else { +SmallVector MergeTyParts; +auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, WideLoad); + +LLT DstTy = MRI.getType(Dst); +unsigned NumElts = DstTy.getSizeInBits() / MergeTy.getSizeInBits(); +for (unsigned i = 0; i < NumElts; ++i) { + MergeTyParts.push_back(Unmerge.getReg(i)); +} +B.buildMergeLikeInstr(Dst, MergeTyParts); + } + MI.eraseFromParent(); +} + void RegBankLegalizeHelper::lower(MachineInstr &MI, const RegBankLLTMapping &Mapping, SmallSet &WaterfallSgprs) { @@ -116,6 +193,54 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI, MI.eraseFromParent(); break; } + case SplitLoad: { +LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); +unsigned Size = DstTy.getSizeInBits(); +// Even split to 128-bit loads +if (Size > 128) { + LLT B128; + if (DstTy.isVector()) { +LLT EltTy = DstTy.getElementType(); +B128 = LLT::f
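As a hypothetical illustration of the SplitLoad action implemented by splitLoad() above (registers, banks, and memory-operand spelling invented), an even split of a 256-bit vector load into two 128-bit parts followed by one merge-like instruction could look like:

; Before:
;   %0:vgpr(<8 x s32>) = G_LOAD %ptr(p1) :: (load (<8 x s32>))
; After:
;   %1:vgpr(<4 x s32>) = G_LOAD %ptr(p1) :: (load (<4 x s32>))
;   %c:vgpr(s64) = G_CONSTANT i64 16
;   %2:vgpr(p1) = G_PTR_ADD %ptr, %c(s64)
;   %3:vgpr(<4 x s32>) = G_LOAD %2(p1) :: (load (<4 x s32>))  ; MMO at offset 16
;   %0:vgpr(<8 x s32>) = G_CONCAT_VECTORS %1(<4 x s32>), %3(<4 x s32>)

Both parts have the same type here, so MergeTy stays invalid and buildMergeLikeInstr produces the concat; the unmerge-to-MergeTy path in the code handles mixed-size breakdowns.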
[llvm-branch-commits] [llvm] MachineUniformityAnalysis: Improve isConstantOrUndefValuePhi (PR #112866)
https://github.com/petar-avramovic updated https://github.com/llvm/llvm-project/pull/112866 >From 18162175daa7f1627f036ecda9cbfb589b58d04a Mon Sep 17 00:00:00 2001 From: Petar Avramovic Date: Thu, 31 Oct 2024 14:10:57 +0100 Subject: [PATCH] MachineUniformityAnalysis: Improve isConstantOrUndefValuePhi Change existing code for G_PHI to match what LLVM-IR version is doing via PHINode::hasConstantOrUndefValue. This is not safe for regular PHI since it may appear with an undef operand and getVRegDef can fail. Most notably this improves number of values that can be allocated to sgpr register bank in AMDGPURegBankSelect. Common case here are phis that appear in structurize-cfg lowering for cycles with multiple exits: Undef incoming value is coming from block that reached cycle exit condition, if other incoming is uniform keep the phi uniform despite the fact it is joining values from pair of blocks that are entered via divergent condition branch. --- llvm/lib/CodeGen/MachineSSAContext.cpp| 27 +- .../AMDGPU/MIR/hidden-diverge-gmir.mir| 28 +++ .../AMDGPU/MIR/hidden-loop-diverge.mir| 4 +- .../AMDGPU/MIR/uses-value-from-cycle.mir | 8 +- .../GlobalISel/divergence-structurizer.mir| 80 -- .../regbankselect-mui-regbanklegalize.mir | 69 --- .../regbankselect-mui-regbankselect.mir | 18 ++-- .../AMDGPU/GlobalISel/regbankselect-mui.ll| 84 ++- .../AMDGPU/GlobalISel/regbankselect-mui.mir | 51 ++- 9 files changed, 191 insertions(+), 178 deletions(-) diff --git a/llvm/lib/CodeGen/MachineSSAContext.cpp b/llvm/lib/CodeGen/MachineSSAContext.cpp index e384187b6e8593..8e13c0916dd9e1 100644 --- a/llvm/lib/CodeGen/MachineSSAContext.cpp +++ b/llvm/lib/CodeGen/MachineSSAContext.cpp @@ -54,9 +54,34 @@ const MachineBasicBlock *MachineSSAContext::getDefBlock(Register value) const { return F->getRegInfo().getVRegDef(value)->getParent(); } +static bool isUndef(const MachineInstr &MI) { + return MI.getOpcode() == TargetOpcode::G_IMPLICIT_DEF || + MI.getOpcode() == TargetOpcode::IMPLICIT_DEF; +} + +/// MachineInstr equivalent of PHINode::hasConstantOrUndefValue() for G_PHI. template <> bool MachineSSAContext::isConstantOrUndefValuePhi(const MachineInstr &Phi) { - return Phi.isConstantValuePHI(); + if (!Phi.isPHI()) +return false; + + // In later passes PHI may appear with an undef operand, getVRegDef can fail. + if (Phi.getOpcode() == TargetOpcode::PHI) +return Phi.isConstantValuePHI(); + + // For G_PHI we do equivalent of PHINode::hasConstantOrUndefValue(). 
+ const MachineRegisterInfo &MRI = Phi.getMF()->getRegInfo(); + Register This = Phi.getOperand(0).getReg(); + Register ConstantValue; + for (unsigned i = 1, e = Phi.getNumOperands(); i < e; i += 2) { +Register Incoming = Phi.getOperand(i).getReg(); +if (Incoming != This && !isUndef(*MRI.getVRegDef(Incoming))) { + if (ConstantValue && ConstantValue != Incoming) +return false; + ConstantValue = Incoming; +} + } + return true; } template <> diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir index ce00edf3363f77..9694a340b5e906 100644 --- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir +++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir @@ -1,24 +1,24 @@ # RUN: llc -mtriple=amdgcn-- -run-pass=print-machine-uniformity -o - %s 2>&1 | FileCheck %s # CHECK-LABEL: MachineUniformityInfo for function: hidden_diverge # CHECK-LABEL: BLOCK bb.0 -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.x) -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_ICMP intpred(slt) -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_XOR %{{[0-9]*}}:_, %{{[0-9]*}}:_ -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if) -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if) -# CHECK: DIVERGENT: G_BRCOND %{{[0-9]*}}:_(s1), %bb.1 -# CHECK: DIVERGENT: G_BR %bb.2 +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.x) +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_ICMP intpred(slt) +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_XOR %{{[0-9]*}}:_, %{{[0-9]*}}:_ +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if) +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if) +# CHECK: DIVERGENT: G_BRCOND %{{[0-9]*}}:_(s1), %bb.1 +# CHECK: DIVERGENT: G_BR %bb.2 # CHECK-LABEL: BLOCK bb.1 # CHECK-LABEL: BLOCK bb.2 -# CHECK: D
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering (non i1) (PR #124298)
@@ -40,6 +40,10 @@ template class GenericUniformityInfo { using CycleInfoT = GenericCycleInfo; using CycleT = typename CycleInfoT::CycleT; + // Use outside cycle with divergent exit + using UOCWDE =

petar-avramovic wrote: My guess is that GenericUniformityAnalysisImpl and GenericUniformityInfo repeat the typedefs because of a terrible line break. This would work: typename GenericUniformityInfo::TemporalDivergenceTuple

https://github.com/llvm/llvm-project/pull/124298
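A minimal sketch of that suggestion, assuming the template parameter is ContextT as elsewhere in GenericUniformityInfo.h (the archive stripped the angle brackets from the quoted mail):

template <typename ContextT> class GenericUniformityAnalysisImpl {
  // Reuse the typedef from GenericUniformityInfo instead of repeating
  // the tuple type here:
  using TemporalDivergenceTuple =
      typename GenericUniformityInfo<ContextT>::TemporalDivergenceTuple;
  // ...
};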
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering (non i1) (PR #124298)
https://github.com/petar-avramovic updated https://github.com/llvm/llvm-project/pull/124298 >From a5c340d0301c3b36fadd352d7ed1c332789cb73b Mon Sep 17 00:00:00 2001 From: Petar Avramovic Date: Fri, 31 Jan 2025 13:04:17 +0100 Subject: [PATCH] AMDGPU/GlobalISel: Temporal divergence lowering (non i1) Record all uses outside cycle with divergent exit during propagateTemporalDivergence in Uniformity analysis. With this list of candidates for temporal divergence lowering, excluding known lane masks from control flow intrinsics, find sources from inside the cycle that are not i1 and uniform. Temporal divergence lowering (non i1): create copy(v_mov) to vgpr, with implicit exec (to stop other passes from moving this copy outside of the cycle) and use this vgpr outside of the cycle instead of original uniform source. --- llvm/include/llvm/ADT/GenericUniformityImpl.h | 33 ++ llvm/include/llvm/ADT/GenericUniformityInfo.h | 5 +++ llvm/lib/Analysis/UniformityAnalysis.cpp | 3 +- .../lib/CodeGen/MachineUniformityAnalysis.cpp | 6 +-- .../AMDGPUGlobalISelDivergenceLowering.cpp| 45 ++- .../lib/Target/AMDGPU/AMDGPURegBankSelect.cpp | 25 +-- llvm/lib/Target/AMDGPU/SILowerI1Copies.h | 6 +++ ...divergent-i1-phis-no-lane-mask-merging.mir | 7 +-- ...ergence-divergent-i1-used-outside-loop.mir | 19 .../divergence-temporal-divergent-reg.ll | 18 .../divergence-temporal-divergent-reg.mir | 3 +- .../AMDGPU/GlobalISel/regbankselect-mui.ll| 17 +++ 12 files changed, 146 insertions(+), 41 deletions(-) diff --git a/llvm/include/llvm/ADT/GenericUniformityImpl.h b/llvm/include/llvm/ADT/GenericUniformityImpl.h index bd09f4fe43e087..d0f7bd14120651 100644 --- a/llvm/include/llvm/ADT/GenericUniformityImpl.h +++ b/llvm/include/llvm/ADT/GenericUniformityImpl.h @@ -342,6 +342,9 @@ template class GenericUniformityAnalysisImpl { typename SyncDependenceAnalysisT::DivergenceDescriptor; using BlockLabelMapT = typename SyncDependenceAnalysisT::BlockLabelMap; + using TemporalDivergenceTuple = + std::tuple; + GenericUniformityAnalysisImpl(const DominatorTreeT &DT, const CycleInfoT &CI, const TargetTransformInfo *TTI) : Context(CI.getSSAContext()), F(*Context.getFunction()), CI(CI), @@ -396,6 +399,11 @@ template class GenericUniformityAnalysisImpl { void print(raw_ostream &out) const; + SmallVector TemporalDivergenceList; + + void recordTemporalDivergence(const InstructionT *, const InstructionT *, +const CycleT *); + protected: /// \brief Value/block pair representing a single phi input. 
struct PhiInput { @@ -1129,6 +1137,13 @@ void GenericUniformityAnalysisImpl::compute() { } } +template +void GenericUniformityAnalysisImpl::recordTemporalDivergence( +const InstructionT *Inst, const InstructionT *User, const CycleT *Cycle) { + TemporalDivergenceList.emplace_back(const_cast(Inst), + const_cast(User), Cycle); +} + template bool GenericUniformityAnalysisImpl::isAlwaysUniform( const InstructionT &Instr) const { @@ -1180,6 +1195,16 @@ void GenericUniformityAnalysisImpl::print(raw_ostream &OS) const { } } + if (!TemporalDivergenceList.empty()) { +OS << "\nTEMPORAL DIVERGENCE LIST:\n"; + +for (auto [Inst, UseInst, Cycle] : TemporalDivergenceList) { + OS << "Inst:" << Context.print(Inst) + << "Used by :" << Context.print(UseInst) + << "Outside cycle :" << Cycle->print(Context) << "\n\n"; +} + } + for (auto &block : F) { OS << "\nBLOCK " << Context.print(&block) << '\n'; @@ -1210,6 +1235,14 @@ void GenericUniformityAnalysisImpl::print(raw_ostream &OS) const { } } +template +iterator_range< +typename GenericUniformityInfo::TemporalDivergenceTuple *> +GenericUniformityInfo::getTemporalDivergenceList() const { + return make_range(DA->TemporalDivergenceList.begin(), +DA->TemporalDivergenceList.end()); +} + template bool GenericUniformityInfo::hasDivergence() const { return DA->hasDivergence(); diff --git a/llvm/include/llvm/ADT/GenericUniformityInfo.h b/llvm/include/llvm/ADT/GenericUniformityInfo.h index e53afccc020b46..8d3b141aaeded7 100644 --- a/llvm/include/llvm/ADT/GenericUniformityInfo.h +++ b/llvm/include/llvm/ADT/GenericUniformityInfo.h @@ -40,6 +40,9 @@ template class GenericUniformityInfo { using CycleInfoT = GenericCycleInfo; using CycleT = typename CycleInfoT::CycleT; + using TemporalDivergenceTuple = + std::tuple; + GenericUniformityInfo(const DominatorTreeT &DT, const CycleInfoT &CI, const TargetTransformInfo *TTI = nullptr); GenericUniformityInfo() = default; @@ -78,6 +81,8 @@ template class GenericUniformityInfo { void print(raw_ostream &Out) const; + iterator_range getTemporalDivergenceList() const; + private: using ImplT =
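A hypothetical before/after sketch of the lowering this commit message describes (names invented): a uniform value defined inside a cycle with a divergent exit gets a VGPR copy inside the cycle, and the use outside the cycle is rewritten to read that copy:

; Inside the cycle:
;   %uni:sgpr(s32) = ...                        ; uniform within the cycle
;   %tmp:vgpr(s32) = COPY %uni, implicit $exec  ; implicit exec keeps other
;                                               ; passes from hoisting the
;                                               ; copy out of the cycle
; Outside the cycle, after the divergent exit:
;   ... = use %tmp                              ; instead of %uni, so each
;                                               ; lane keeps the value from
;                                               ; the iteration in which it
;                                               ; left the cycle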
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering i1 (PR #124299)
petar-avramovic wrote: The insert point of the merging phi is changed to after Inst, not in the exiting block. https://github.com/llvm/llvm-project/pull/124299
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: Temporal divergence lowering (non i1) (PR #124298)
@@ -395,6 +399,14 @@ template class GenericUniformityAnalysisImpl { } void print(raw_ostream &out) const; + SmallVector UsesOutsideCycleWithDivergentExit; + void recordUseOutsideCycleWithDivergentExit(const InstructionT *,

petar-avramovic wrote: I was considering TemporalDivergenceCandidate. I did not find a strict definition of temporal divergence, so I ended up using UseOutsideCycleWithDivergentExit since it is more technical and, I assume, not target dependent. It is not temporal divergence until we check the uniformity of the Src used outside the cycle with divergent exit and it turns out to be uniform, or, in the other case, until we check the type and it is i1. For us, divergent i1 is also technically temporal divergence since it will end up in an sgpr. I am fine with using a different name instead of UseOutsideCycleWithDivergentExit if you think it is more appropriate.

https://github.com/llvm/llvm-project/pull/124298
[llvm-branch-commits] [llvm] AMDGPU/GlobalISel: RegBankLegalize rules for load (PR #112882)
https://github.com/petar-avramovic updated https://github.com/llvm/llvm-project/pull/112882 >From 3fa31aeeb48e3b4a1dcac77d67dc6a1d205c8dce Mon Sep 17 00:00:00 2001 From: Petar Avramovic Date: Wed, 30 Oct 2024 15:37:59 +0100 Subject: [PATCH] AMDGPU/GlobalISel: RegBankLegalize rules for load Add IDs for bit width that cover multiple LLTs: B32 B64 etc. "Predicate" wrapper class for bool predicate functions used to write pretty rules. Predicates can be combined using &&, || and !. Lowering for splitting and widening loads. Write rules for loads to not change existing mir tests from old regbankselect. --- .../AMDGPU/AMDGPURegBankLegalizeHelper.cpp| 288 +++- .../AMDGPU/AMDGPURegBankLegalizeHelper.h | 5 + .../AMDGPU/AMDGPURegBankLegalizeRules.cpp | 278 ++- .../AMDGPU/AMDGPURegBankLegalizeRules.h | 65 +++- .../AMDGPU/GlobalISel/regbankselect-load.mir | 320 +++--- .../GlobalISel/regbankselect-zextload.mir | 9 +- 6 files changed, 900 insertions(+), 65 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp index 6d9cf487c6dd25..6e78e29555ee11 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp @@ -38,6 +38,83 @@ void RegBankLegalizeHelper::findRuleAndApplyMapping(MachineInstr &MI) { lower(MI, Mapping, WaterfallSgprs); } +void RegBankLegalizeHelper::splitLoad(MachineInstr &MI, + ArrayRef LLTBreakdown, LLT MergeTy) { + MachineFunction &MF = B.getMF(); + assert(MI.getNumMemOperands() == 1); + MachineMemOperand &BaseMMO = **MI.memoperands_begin(); + Register Dst = MI.getOperand(0).getReg(); + const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst); + Register Base = MI.getOperand(1).getReg(); + LLT PtrTy = MRI.getType(Base); + const RegisterBank *PtrRB = MRI.getRegBankOrNull(Base); + LLT OffsetTy = LLT::scalar(PtrTy.getSizeInBits()); + SmallVector LoadPartRegs; + + unsigned ByteOffset = 0; + for (LLT PartTy : LLTBreakdown) { +Register BasePlusOffset; +if (ByteOffset == 0) { + BasePlusOffset = Base; +} else { + auto Offset = B.buildConstant({PtrRB, OffsetTy}, ByteOffset); + BasePlusOffset = B.buildPtrAdd({PtrRB, PtrTy}, Base, Offset).getReg(0); +} +auto *OffsetMMO = MF.getMachineMemOperand(&BaseMMO, ByteOffset, PartTy); +auto LoadPart = B.buildLoad({DstRB, PartTy}, BasePlusOffset, *OffsetMMO); +LoadPartRegs.push_back(LoadPart.getReg(0)); +ByteOffset += PartTy.getSizeInBytes(); + } + + if (!MergeTy.isValid()) { +// Loads are of same size, concat or merge them together. +B.buildMergeLikeInstr(Dst, LoadPartRegs); + } else { +// Loads are not all of same size, need to unmerge them to smaller pieces +// of MergeTy type, then merge pieces to Dst. 
+SmallVector MergeTyParts; +for (Register Reg : LoadPartRegs) { + if (MRI.getType(Reg) == MergeTy) { +MergeTyParts.push_back(Reg); + } else { +auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, Reg); +for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i) + MergeTyParts.push_back(Unmerge.getReg(i)); + } +} +B.buildMergeLikeInstr(Dst, MergeTyParts); + } + MI.eraseFromParent(); +} + +void RegBankLegalizeHelper::widenLoad(MachineInstr &MI, LLT WideTy, + LLT MergeTy) { + MachineFunction &MF = B.getMF(); + assert(MI.getNumMemOperands() == 1); + MachineMemOperand &BaseMMO = **MI.memoperands_begin(); + Register Dst = MI.getOperand(0).getReg(); + const RegisterBank *DstRB = MRI.getRegBankOrNull(Dst); + Register Base = MI.getOperand(1).getReg(); + + MachineMemOperand *WideMMO = MF.getMachineMemOperand(&BaseMMO, 0, WideTy); + auto WideLoad = B.buildLoad({DstRB, WideTy}, Base, *WideMMO); + + if (WideTy.isScalar()) { +B.buildTrunc(Dst, WideLoad); + } else { +SmallVector MergeTyParts; +auto Unmerge = B.buildUnmerge({DstRB, MergeTy}, WideLoad); + +LLT DstTy = MRI.getType(Dst); +unsigned NumElts = DstTy.getSizeInBits() / MergeTy.getSizeInBits(); +for (unsigned i = 0; i < NumElts; ++i) { + MergeTyParts.push_back(Unmerge.getReg(i)); +} +B.buildMergeLikeInstr(Dst, MergeTyParts); + } + MI.eraseFromParent(); +} + void RegBankLegalizeHelper::lower(MachineInstr &MI, const RegBankLLTMapping &Mapping, SmallSet &WaterfallSgprs) { @@ -116,6 +193,54 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI, MI.eraseFromParent(); break; } + case SplitLoad: { +LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); +unsigned Size = DstTy.getSizeInBits(); +// Even split to 128-bit loads +if (Size > 128) { + LLT B128; + if (DstTy.isVector()) { +LLT EltTy = DstTy.getElementType(); +B128 = LLT::f
[llvm-branch-commits] [llvm] MachineUniformityAnalysis: Improve isConstantOrUndefValuePhi (PR #112866)
https://github.com/petar-avramovic updated https://github.com/llvm/llvm-project/pull/112866 >From ada22e8d25b8ca05c048b83307edb60fbdf6a774 Mon Sep 17 00:00:00 2001 From: Petar Avramovic Date: Thu, 31 Oct 2024 14:10:57 +0100 Subject: [PATCH] MachineUniformityAnalysis: Improve isConstantOrUndefValuePhi Change existing code for G_PHI to match what LLVM-IR version is doing via PHINode::hasConstantOrUndefValue. This is not safe for regular PHI since it may appear with an undef operand and getVRegDef can fail. Most notably this improves number of values that can be allocated to sgpr register bank in AMDGPURegBankSelect. Common case here are phis that appear in structurize-cfg lowering for cycles with multiple exits: Undef incoming value is coming from block that reached cycle exit condition, if other incoming is uniform keep the phi uniform despite the fact it is joining values from pair of blocks that are entered via divergent condition branch. --- llvm/lib/CodeGen/MachineSSAContext.cpp| 27 +- .../AMDGPU/MIR/hidden-diverge-gmir.mir| 28 +++ .../AMDGPU/MIR/hidden-loop-diverge.mir| 4 +- .../AMDGPU/MIR/uses-value-from-cycle.mir | 8 +- .../GlobalISel/divergence-structurizer.mir| 80 -- .../regbankselect-mui-regbanklegalize.mir | 69 --- .../regbankselect-mui-regbankselect.mir | 18 ++-- .../AMDGPU/GlobalISel/regbankselect-mui.ll| 84 ++- .../AMDGPU/GlobalISel/regbankselect-mui.mir | 51 ++- 9 files changed, 191 insertions(+), 178 deletions(-) diff --git a/llvm/lib/CodeGen/MachineSSAContext.cpp b/llvm/lib/CodeGen/MachineSSAContext.cpp index e384187b6e8593..8e13c0916dd9e1 100644 --- a/llvm/lib/CodeGen/MachineSSAContext.cpp +++ b/llvm/lib/CodeGen/MachineSSAContext.cpp @@ -54,9 +54,34 @@ const MachineBasicBlock *MachineSSAContext::getDefBlock(Register value) const { return F->getRegInfo().getVRegDef(value)->getParent(); } +static bool isUndef(const MachineInstr &MI) { + return MI.getOpcode() == TargetOpcode::G_IMPLICIT_DEF || + MI.getOpcode() == TargetOpcode::IMPLICIT_DEF; +} + +/// MachineInstr equivalent of PHINode::hasConstantOrUndefValue() for G_PHI. template <> bool MachineSSAContext::isConstantOrUndefValuePhi(const MachineInstr &Phi) { - return Phi.isConstantValuePHI(); + if (!Phi.isPHI()) +return false; + + // In later passes PHI may appear with an undef operand, getVRegDef can fail. + if (Phi.getOpcode() == TargetOpcode::PHI) +return Phi.isConstantValuePHI(); + + // For G_PHI we do equivalent of PHINode::hasConstantOrUndefValue(). 
+ const MachineRegisterInfo &MRI = Phi.getMF()->getRegInfo(); + Register This = Phi.getOperand(0).getReg(); + Register ConstantValue; + for (unsigned i = 1, e = Phi.getNumOperands(); i < e; i += 2) { +Register Incoming = Phi.getOperand(i).getReg(); +if (Incoming != This && !isUndef(*MRI.getVRegDef(Incoming))) { + if (ConstantValue && ConstantValue != Incoming) +return false; + ConstantValue = Incoming; +} + } + return true; } template <> diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir index ce00edf3363f77..9694a340b5e906 100644 --- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir +++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir @@ -1,24 +1,24 @@ # RUN: llc -mtriple=amdgcn-- -run-pass=print-machine-uniformity -o - %s 2>&1 | FileCheck %s # CHECK-LABEL: MachineUniformityInfo for function: hidden_diverge # CHECK-LABEL: BLOCK bb.0 -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.x) -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_ICMP intpred(slt) -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_XOR %{{[0-9]*}}:_, %{{[0-9]*}}:_ -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if) -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if) -# CHECK: DIVERGENT: G_BRCOND %{{[0-9]*}}:_(s1), %bb.1 -# CHECK: DIVERGENT: G_BR %bb.2 +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.x) +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_ICMP intpred(slt) +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_XOR %{{[0-9]*}}:_, %{{[0-9]*}}:_ +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if) +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if) +# CHECK: DIVERGENT: G_BRCOND %{{[0-9]*}}:_(s1), %bb.1 +# CHECK: DIVERGENT: G_BR %bb.2 # CHECK-LABEL: BLOCK bb.1 # CHECK-LABEL: BLOCK bb.2 -# CHECK: D
[llvm-branch-commits] [llvm] MachineUniformityAnalysis: Improve isConstantOrUndefValuePhi (PR #112866)
https://github.com/petar-avramovic updated https://github.com/llvm/llvm-project/pull/112866 >From 991bdf1a6de2e91732a3b9c443786243aaa58e91 Mon Sep 17 00:00:00 2001 From: Petar Avramovic Date: Thu, 31 Oct 2024 14:10:57 +0100 Subject: [PATCH] MachineUniformityAnalysis: Improve isConstantOrUndefValuePhi Change existing code for G_PHI to match what LLVM-IR version is doing via PHINode::hasConstantOrUndefValue. This is not safe for regular PHI since it may appear with an undef operand and getVRegDef can fail. Most notably this improves number of values that can be allocated to sgpr register bank in AMDGPURegBankSelect. Common case here are phis that appear in structurize-cfg lowering for cycles with multiple exits: Undef incoming value is coming from block that reached cycle exit condition, if other incoming is uniform keep the phi uniform despite the fact it is joining values from pair of blocks that are entered via divergent condition branch. --- llvm/lib/CodeGen/MachineSSAContext.cpp| 27 +- .../AMDGPU/MIR/hidden-diverge-gmir.mir| 28 +++ .../AMDGPU/MIR/hidden-loop-diverge.mir| 4 +- .../AMDGPU/MIR/uses-value-from-cycle.mir | 8 +- .../GlobalISel/divergence-structurizer.mir| 80 -- .../regbankselect-mui-regbanklegalize.mir | 69 --- .../regbankselect-mui-regbankselect.mir | 18 ++-- .../AMDGPU/GlobalISel/regbankselect-mui.ll| 84 ++- .../AMDGPU/GlobalISel/regbankselect-mui.mir | 51 ++- 9 files changed, 191 insertions(+), 178 deletions(-) diff --git a/llvm/lib/CodeGen/MachineSSAContext.cpp b/llvm/lib/CodeGen/MachineSSAContext.cpp index e384187b6e8593..8e13c0916dd9e1 100644 --- a/llvm/lib/CodeGen/MachineSSAContext.cpp +++ b/llvm/lib/CodeGen/MachineSSAContext.cpp @@ -54,9 +54,34 @@ const MachineBasicBlock *MachineSSAContext::getDefBlock(Register value) const { return F->getRegInfo().getVRegDef(value)->getParent(); } +static bool isUndef(const MachineInstr &MI) { + return MI.getOpcode() == TargetOpcode::G_IMPLICIT_DEF || + MI.getOpcode() == TargetOpcode::IMPLICIT_DEF; +} + +/// MachineInstr equivalent of PHINode::hasConstantOrUndefValue() for G_PHI. template <> bool MachineSSAContext::isConstantOrUndefValuePhi(const MachineInstr &Phi) { - return Phi.isConstantValuePHI(); + if (!Phi.isPHI()) +return false; + + // In later passes PHI may appear with an undef operand, getVRegDef can fail. + if (Phi.getOpcode() == TargetOpcode::PHI) +return Phi.isConstantValuePHI(); + + // For G_PHI we do equivalent of PHINode::hasConstantOrUndefValue(). 
+ const MachineRegisterInfo &MRI = Phi.getMF()->getRegInfo(); + Register This = Phi.getOperand(0).getReg(); + Register ConstantValue; + for (unsigned i = 1, e = Phi.getNumOperands(); i < e; i += 2) { +Register Incoming = Phi.getOperand(i).getReg(); +if (Incoming != This && !isUndef(*MRI.getVRegDef(Incoming))) { + if (ConstantValue && ConstantValue != Incoming) +return false; + ConstantValue = Incoming; +} + } + return true; } template <> diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir index ce00edf3363f77..9694a340b5e906 100644 --- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir +++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/hidden-diverge-gmir.mir @@ -1,24 +1,24 @@ # RUN: llc -mtriple=amdgcn-- -run-pass=print-machine-uniformity -o - %s 2>&1 | FileCheck %s # CHECK-LABEL: MachineUniformityInfo for function: hidden_diverge # CHECK-LABEL: BLOCK bb.0 -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.x) -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_ICMP intpred(slt) -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_XOR %{{[0-9]*}}:_, %{{[0-9]*}}:_ -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if) -# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if) -# CHECK: DIVERGENT: G_BRCOND %{{[0-9]*}}:_(s1), %bb.1 -# CHECK: DIVERGENT: G_BR %bb.2 +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.workitem.id.x) +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_ICMP intpred(slt) +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1) = G_XOR %{{[0-9]*}}:_, %{{[0-9]*}}:_ +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if) +# CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s1), %{{[0-9]*}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if) +# CHECK: DIVERGENT: G_BRCOND %{{[0-9]*}}:_(s1), %bb.1 +# CHECK: DIVERGENT: G_BR %bb.2 # CHECK-LABEL: BLOCK bb.1 # CHECK-LABEL: BLOCK bb.2 -# CHECK: D