================ @@ -66,9 +73,215 @@ FunctionPass *llvm::createAMDGPURegBankSelectPass() { return new AMDGPURegBankSelect(); } +class RegBankSelectHelper { + MachineIRBuilder &B; + MachineRegisterInfo &MRI; + AMDGPU::IntrinsicLaneMaskAnalyzer &ILMA; + const MachineUniformityInfo &MUI; + const SIRegisterInfo &TRI; + const RegisterBank *SgprRB; + const RegisterBank *VgprRB; + const RegisterBank *VccRB; + +public: + RegBankSelectHelper(MachineIRBuilder &B, + AMDGPU::IntrinsicLaneMaskAnalyzer &ILMA, + const MachineUniformityInfo &MUI, + const SIRegisterInfo &TRI, const RegisterBankInfo &RBI) + : B(B), MRI(*B.getMRI()), ILMA(ILMA), MUI(MUI), TRI(TRI), + SgprRB(&RBI.getRegBank(AMDGPU::SGPRRegBankID)), + VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)), + VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {} + + bool shouldRegBankSelect(MachineInstr &MI) { + return MI.isPreISelOpcode() || MI.isCopy(); + } + + // Temporal divergence copy: COPY to vgpr with implicit use of $exec inside of + // the cycle + // Note: uniformity analysis does not consider that registers with vgpr def + // are divergent (you can have uniform value in vgpr). + // - TODO: implicit use of $exec could be implemented as indicator that + // instruction is divergent + bool isTemporalDivergenceCopy(Register Reg) { + MachineInstr *MI = MRI.getVRegDef(Reg); + if (!MI->isCopy()) + return false; + + for (auto Op : MI->implicit_operands()) { + if (!Op.isReg()) + continue; + + if (Op.getReg() == TRI.getExec()) { + return true; + } + } + + return false; + } + + void setRBDef(MachineInstr &MI, MachineOperand &DefOP, + const RegisterBank *RB) { + Register Reg = DefOP.getReg(); + // Register that already has Register class got it during pre-inst selection + // of another instruction. Maybe cross bank copy was required so we insert a + // copy that can be removed later. This simplifies post regbanklegalize + // combiner and avoids need to special case some patterns. 
+ if (MRI.getRegClassOrNull(Reg)) { + LLT Ty = MRI.getType(Reg); + Register NewReg = MRI.createVirtualRegister({RB, Ty}); + DefOP.setReg(NewReg); + + auto &MBB = *MI.getParent(); + B.setInsertPt(MBB, MBB.SkipPHIsAndLabels(std::next(MI.getIterator()))); + B.buildCopy(Reg, NewReg); + + // The problem was discovered for uniform S1 that was used as both + // lane mask(vcc) and regular sgpr S1. + // - lane-mask(vcc) use was by si_if, this use is divergent and requires + // non-trivial sgpr-S1-to-vcc copy. But pre-inst-selection of si_if sets + // sreg_64_xexec(S1) on def of uniform S1 making it lane-mask. + // - the regular sgpr S1(uniform) instruction is now broken since + // it uses sreg_64_xexec(S1) which is divergent. + + // Replace virtual registers with register class on generic instructions + // uses with virtual registers with register bank. + for (auto &UseMI : MRI.use_instructions(Reg)) { + if (shouldRegBankSelect(UseMI)) { + for (MachineOperand &Op : UseMI.operands()) { + if (Op.isReg() && Op.getReg() == Reg) + Op.setReg(NewReg); + } + } + } + + } else { + MRI.setRegBank(Reg, *RB); + } + } + + std::optional<Register> tryGetVReg(MachineOperand &Op) { + if (!Op.isReg()) + return std::nullopt; + + Register Reg = Op.getReg(); + if (!Reg.isVirtual()) + return std::nullopt; + + return Reg; + } + + void assignBanksOnDefs(MachineInstr &MI) { + if (!shouldRegBankSelect(MI)) + return; + + for (MachineOperand &DefOP : MI.defs()) { + auto MaybeDefReg = tryGetVReg(DefOP); + if (!MaybeDefReg) + continue; + Register DefReg = *MaybeDefReg; + + // Copies can have register class on def registers. 
+ if (MI.isCopy() && MRI.getRegClassOrNull(DefReg)) { + continue; + } + + if (MUI.isUniform(DefReg) || ILMA.isS32S64LaneMask(DefReg)) { + setRBDef(MI, DefOP, SgprRB); + } else { + if (MRI.getType(DefReg) == LLT::scalar(1)) + setRBDef(MI, DefOP, VccRB); + else + setRBDef(MI, DefOP, VgprRB); + } + } + } + + void constrainRBUse(MachineInstr &MI, MachineOperand &UseOP, + const RegisterBank *RB) { + Register Reg = UseOP.getReg(); + + LLT Ty = MRI.getType(Reg); + Register NewReg = MRI.createVirtualRegister({RB, Ty}); + UseOP.setReg(NewReg); + + if (MI.isPHI()) { + auto DefMI = MRI.getVRegDef(Reg)->getIterator(); + MachineBasicBlock *DefMBB = DefMI->getParent(); + B.setInsertPt(*DefMBB, DefMBB->SkipPHIsAndLabels(std::next(DefMI))); + } else { + B.setInstr(MI); + } + + B.buildCopy(NewReg, Reg); + } + + void constrainBanksOnUses(MachineInstr &MI) { + if (!shouldRegBankSelect(MI)) + return; + + // Copies can have register class on use registers. + if (MI.isCopy()) + return; + + for (MachineOperand &UseOP : MI.uses()) { + auto MaybeUseReg = tryGetVReg(UseOP); + if (!MaybeUseReg) + continue; + Register UseReg = *MaybeUseReg; + + // UseReg already has register bank. + if (MRI.getRegBankOrNull(UseReg)) + continue; + + if (!isTemporalDivergenceCopy(UseReg) && + (MUI.isUniform(UseReg) || ILMA.isS32S64LaneMask(UseReg))) { + constrainRBUse(MI, UseOP, SgprRB); + } else { + if (MRI.getType(UseReg) == LLT::scalar(1)) + constrainRBUse(MI, UseOP, VccRB); + else + constrainRBUse(MI, UseOP, VgprRB); + } + } + } +}; + bool AMDGPURegBankSelect::runOnMachineFunction(MachineFunction &MF) { if (MF.getProperties().hasProperty( MachineFunctionProperties::Property::FailedISel)) return false; + + MachineIRBuilder B(MF); ---------------- petar-avramovic wrote:
Added, but I don't see a reason for it. Here we only set register banks and insert a few copies, so there is nothing for CSE to find. The legalizer part, on the other hand, actually makes use of CSE often. https://github.com/llvm/llvm-project/pull/112863 _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits