llvmbot wrote:
@llvm/pr-subscribers-backend-aarch64

Author: Benjamin Maxwell (MacDue)

Changes:

This patch uses MachineLoopInfo to give blocks within loops a higher weight when choosing the bundle ZA state. MachineLoopInfo does not provide loop trip counts, so this uses an arbitrary weight (default 10), which can be configured with the `-aarch64-sme-abi-loop-edge-weight` flag. This makes MachineSMEABIPass more likely to pick a bundle state that matches the loop's entry/exit state, which avoids state changes within the loop (changes we assume would otherwise execute more than once). This requires some extra analysis, so it is only enabled at -O1 and above. A small illustrative sketch of the weighting follows the file list below.

---

Full diff: https://github.com/llvm/llvm-project/pull/149065.diff

5 Files Affected:

- (modified) llvm/lib/Target/AArch64/AArch64.h (+1-1)
- (modified) llvm/lib/Target/AArch64/AArch64TargetMachine.cpp (+2-2)
- (modified) llvm/lib/Target/AArch64/MachineSMEABIPass.cpp (+27-4)
- (added) llvm/test/CodeGen/AArch64/sme-lazy-save-in-loop.ll (+115)
- (modified) llvm/test/CodeGen/AArch64/sme-za-control-flow.ll (+14-19)
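To make the weighting concrete, here is a minimal standalone sketch of how weighted voting picks a bundle's ZA state (assumptions: `BlockVote`, `pickBundleState`, and the two-state enum are illustrative stand-ins, not the pass's actual types):

```cpp
// Sketch of the weighted vote: each block on an edge bundle votes for the
// ZA state it wants; blocks inside loops vote with LoopEdgeWeight (default
// 10) instead of 1, so state transitions tend to sink out of loops.
#include <array>
#include <cstdio>

enum ZAState { Active, LocalSaved, NumStates };

struct BlockVote {
  ZAState Desired; // State this block wants at the bundle.
  bool InLoop;     // True if MachineLoopInfo places the block in a loop.
};

ZAState pickBundleState(const std::array<BlockVote, 3> &Votes,
                        int LoopEdgeWeight = 10) {
  std::array<int, NumStates> Counts{};
  for (const BlockVote &V : Votes)
    Counts[V.Desired] += V.InLoop ? LoopEdgeWeight : 1;
  return Counts[LocalSaved] > Counts[Active] ? LocalSaved : Active;
}

int main() {
  // Entry/exit blocks call a shared-ZA function (want ZA active); the loop
  // body calls a private-ZA function (wants the lazy-save state). The loop
  // block's weight of 10 outvotes the two weight-1 blocks, so the save is
  // set up before the loop and the restore is placed after it.
  std::array<BlockVote, 3> Votes{
      {{Active, false}, {LocalSaved, true}, {Active, false}}};
  std::printf("picked: %s\n",
              pickBundleState(Votes) == LocalSaved ? "LOCAL_SAVED" : "ACTIVE");
}
```

This is the behaviour the new `sme-lazy-save-in-loop.ll` test checks for at -O1: the `msr TPIDR2_EL0, x8` setup moves before the loop and the `__arm_tpidr2_restore` call after it.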
``````````diff
diff --git a/llvm/lib/Target/AArch64/AArch64.h b/llvm/lib/Target/AArch64/AArch64.h
index 8d0ff41fc8c08..139684172f1bb 100644
--- a/llvm/lib/Target/AArch64/AArch64.h
+++ b/llvm/lib/Target/AArch64/AArch64.h
@@ -60,7 +60,7 @@ FunctionPass *createAArch64CleanupLocalDynamicTLSPass();
 FunctionPass *createAArch64CollectLOHPass();
 FunctionPass *createSMEABIPass();
 FunctionPass *createSMEPeepholeOptPass();
-FunctionPass *createMachineSMEABIPass();
+FunctionPass *createMachineSMEABIPass(CodeGenOptLevel);
 ModulePass *createSVEIntrinsicOptsPass();
 InstructionSelector *
 createAArch64InstructionSelector(const AArch64TargetMachine &,
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
index 2c1edecd0b48d..b26a137d4e0fb 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -792,7 +792,7 @@ bool AArch64PassConfig::addGlobalInstructionSelect() {
 
 void AArch64PassConfig::addMachineSSAOptimization() {
   if (EnableNewSMEABILowering && TM->getOptLevel() != CodeGenOptLevel::None)
-    addPass(createMachineSMEABIPass());
+    addPass(createMachineSMEABIPass(TM->getOptLevel()));
 
   if (TM->getOptLevel() != CodeGenOptLevel::None && EnableSMEPeepholeOpt)
     addPass(createSMEPeepholeOptPass());
@@ -825,7 +825,7 @@ bool AArch64PassConfig::addILPOpts() {
 
 void AArch64PassConfig::addPreRegAlloc() {
   if (EnableNewSMEABILowering && TM->getOptLevel() == CodeGenOptLevel::None)
-    addPass(createMachineSMEABIPass());
+    addPass(createMachineSMEABIPass(CodeGenOptLevel::None));
 
   // Change dead register definitions to refer to the zero register.
   if (TM->getOptLevel() != CodeGenOptLevel::None &&
diff --git a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
index 7c0cad299cc64..f63a338b4bd23 100644
--- a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
+++ b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
@@ -21,6 +21,7 @@
 #include "llvm/CodeGen/LivePhysRegs.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/TargetRegisterInfo.h"
 
@@ -28,6 +29,12 @@ using namespace llvm;
 
 #define DEBUG_TYPE "aarch64-machine-sme-abi"
 
+static cl::opt<int>
+    LoopEdgeWeight("aarch64-sme-abi-loop-edge-weight", cl::ReallyHidden,
+                   cl::init(10),
+                   cl::desc("Edge weight for basic blocks within loops (used "
+                            "for placing ZA saves/restores)"));
+
 namespace {
 
 enum ZAState {
@@ -112,7 +119,8 @@ getInstNeededZAState(const TargetRegisterInfo &TRI, MachineInstr &MI,
 struct MachineSMEABI : public MachineFunctionPass {
   inline static char ID = 0;
 
-  MachineSMEABI() : MachineFunctionPass(ID) {}
+  MachineSMEABI(CodeGenOptLevel OptLevel = CodeGenOptLevel::Default)
+      : MachineFunctionPass(ID), OptLevel(OptLevel) {}
 
   bool runOnMachineFunction(MachineFunction &MF) override;
 
@@ -121,6 +129,9 @@ struct MachineSMEABI : public MachineFunctionPass {
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
     AU.addRequired<EdgeBundlesWrapperLegacy>();
+    // Only analyse loops at -O1 and above.
+    if (OptLevel != CodeGenOptLevel::None)
+      AU.addRequired<MachineLoopInfoWrapperPass>();
     AU.addPreservedID(MachineLoopInfoID);
     AU.addPreservedID(MachineDominatorsID);
     MachineFunctionPass::getAnalysisUsage(AU);
@@ -197,6 +208,8 @@ struct MachineSMEABI : public MachineFunctionPass {
     LiveRegs PhysLiveRegsAtExit = LiveRegs::None;
   };
 
+  CodeGenOptLevel OptLevel = CodeGenOptLevel::Default;
+
   // All pass state that must be cleared between functions.
   struct PassState {
     SmallVector<BlockInfo> Blocks;
@@ -209,6 +222,7 @@ struct MachineSMEABI : public MachineFunctionPass {
   } State;
 
   EdgeBundles *Bundles = nullptr;
+  MachineLoopInfo *MLI = nullptr;
 };
 
 void MachineSMEABI::collectNeededZAStates(MachineFunction &MF,
@@ -302,18 +316,23 @@ void MachineSMEABI::pickBundleZAStates(MachineFunction &MF) {
       LLVM_DEBUG(dbgs() << " (no state preference)\n");
       continue;
     }
+    bool IsLoop = MLI && MLI->getLoopFor(MF.getBlockNumbered(BlockID));
     bool InEdge = Bundles->getBundle(BlockID, /*Out=*/false) == I;
     bool OutEdge = Bundles->getBundle(BlockID, /*Out=*/true) == I;
+    int EdgeWeight = IsLoop ? LoopEdgeWeight : 1;
+    if (IsLoop)
+      LLVM_DEBUG(dbgs() << " IsLoop");
+    LLVM_DEBUG(dbgs() << " (EdgeWeight: " << EdgeWeight << ')');
     ZAState DesiredIncomingState = Block.Insts.front().NeededState;
     if (InEdge && isLegalEdgeBundleZAState(DesiredIncomingState)) {
-      EdgeStateCounts[DesiredIncomingState]++;
+      EdgeStateCounts[DesiredIncomingState] += EdgeWeight;
       LLVM_DEBUG(dbgs() << " DesiredIncomingState: "
                         << getZAStateString(DesiredIncomingState));
     }
     ZAState DesiredOutgoingState = Block.Insts.back().NeededState;
     if (OutEdge && isLegalEdgeBundleZAState(DesiredOutgoingState)) {
-      EdgeStateCounts[DesiredOutgoingState]++;
+      EdgeStateCounts[DesiredOutgoingState] += EdgeWeight;
       LLVM_DEBUG(dbgs() << " DesiredOutgoingState: "
                         << getZAStateString(DesiredOutgoingState));
     }
 
@@ -771,6 +790,8 @@ bool MachineSMEABI::runOnMachineFunction(MachineFunction &MF) {
 
   // Reset pass state.
   State = PassState{};
   Bundles = &getAnalysis<EdgeBundlesWrapperLegacy>().getEdgeBundles();
+  if (OptLevel != CodeGenOptLevel::None)
+    MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
 
   bool IsAgnosticZA = SMEFnAttrs.hasAgnosticZAInterface();
 
@@ -799,4 +820,6 @@ bool MachineSMEABI::runOnMachineFunction(MachineFunction &MF) {
   return true;
 }
 
-FunctionPass *llvm::createMachineSMEABIPass() { return new MachineSMEABI(); }
+FunctionPass *llvm::createMachineSMEABIPass(CodeGenOptLevel OptLevel) {
+  return new MachineSMEABI(OptLevel);
+}
diff --git a/llvm/test/CodeGen/AArch64/sme-lazy-save-in-loop.ll b/llvm/test/CodeGen/AArch64/sme-lazy-save-in-loop.ll
new file mode 100644
index 0000000000000..200280f52acb0
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme-lazy-save-in-loop.ll
@@ -0,0 +1,115 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -O0 -mtriple=aarch64-linux-gnu -mattr=+sme -aarch64-new-sme-abi < %s | FileCheck %s --check-prefix=CHECK-O0
+; RUN: llc -O1 -mtriple=aarch64-linux-gnu -mattr=+sme -aarch64-new-sme-abi < %s | FileCheck %s --check-prefix=CHECK-O1
+
+declare void @private_za_call()
+declare void @shared_za_call() "aarch64_inout_za"
+
+; This test checks that at -O0 we don't attempt to optimize lazy save state
+; changes in loops, and that at -O1 (and above) we attempt to push state
+; changes out of loops.
+
+define void @private_za_loop_active_entry_and_exit(i32 %n) "aarch64_inout_za" nounwind {
+; CHECK-O0-LABEL: private_za_loop_active_entry_and_exit:
+; CHECK-O0: // %bb.0: // %entry
+; CHECK-O0-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-O0-NEXT: mov x29, sp
+; CHECK-O0-NEXT: sub sp, sp, #32
+; CHECK-O0-NEXT: rdsvl x9, #1
+; CHECK-O0-NEXT: mov x8, sp
+; CHECK-O0-NEXT: msub x8, x9, x9, x8
+; CHECK-O0-NEXT: mov sp, x8
+; CHECK-O0-NEXT: stp x8, x9, [x29, #-16]
+; CHECK-O0-NEXT: stur w0, [x29, #-24] // 4-byte Folded Spill
+; CHECK-O0-NEXT: bl shared_za_call
+; CHECK-O0-NEXT: ldur w0, [x29, #-24] // 4-byte Folded Reload
+; CHECK-O0-NEXT: mov w8, wzr
+; CHECK-O0-NEXT: subs w9, w0, #1
+; CHECK-O0-NEXT: stur w8, [x29, #-20] // 4-byte Folded Spill
+; CHECK-O0-NEXT: b.lt .LBB0_4
+; CHECK-O0-NEXT: b .LBB0_1
+; CHECK-O0-NEXT: .LBB0_1: // %loop
+; CHECK-O0-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-O0-NEXT: ldur w8, [x29, #-20] // 4-byte Folded Reload
+; CHECK-O0-NEXT: stur w8, [x29, #-28] // 4-byte Folded Spill
+; CHECK-O0-NEXT: sub x8, x29, #16
+; CHECK-O0-NEXT: msr TPIDR2_EL0, x8
+; CHECK-O0-NEXT: bl private_za_call
+; CHECK-O0-NEXT: ldur w8, [x29, #-28] // 4-byte Folded Reload
+; CHECK-O0-NEXT: ldur w10, [x29, #-24] // 4-byte Folded Reload
+; CHECK-O0-NEXT: add w9, w8, #1
+; CHECK-O0-NEXT: mov w8, w9
+; CHECK-O0-NEXT: subs w9, w9, w10
+; CHECK-O0-NEXT: mrs x9, NZCV
+; CHECK-O0-NEXT: smstart za
+; CHECK-O0-NEXT: mrs x10, TPIDR2_EL0
+; CHECK-O0-NEXT: sub x0, x29, #16
+; CHECK-O0-NEXT: cbz x10, .LBB0_2
+; CHECK-O0-NEXT: b .LBB0_3
+; CHECK-O0-NEXT: .LBB0_2: // %loop
+; CHECK-O0-NEXT: // in Loop: Header=BB0_1 Depth=1
+; CHECK-O0-NEXT: bl __arm_tpidr2_restore
+; CHECK-O0-NEXT: b .LBB0_3
+; CHECK-O0-NEXT: .LBB0_3: // %loop
+; CHECK-O0-NEXT: // in Loop: Header=BB0_1 Depth=1
+; CHECK-O0-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-O0-NEXT: msr NZCV, x9
+; CHECK-O0-NEXT: stur w8, [x29, #-20] // 4-byte Folded Spill
+; CHECK-O0-NEXT: b.ne .LBB0_1
+; CHECK-O0-NEXT: b .LBB0_4
+; CHECK-O0-NEXT: .LBB0_4: // %exit
+; CHECK-O0-NEXT: mov sp, x29
+; CHECK-O0-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-O0-NEXT: b shared_za_call
+;
+; CHECK-O1-LABEL: private_za_loop_active_entry_and_exit:
+; CHECK-O1: // %bb.0: // %entry
+; CHECK-O1-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-O1-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
+; CHECK-O1-NEXT: mov x29, sp
+; CHECK-O1-NEXT: sub sp, sp, #16
+; CHECK-O1-NEXT: rdsvl x8, #1
+; CHECK-O1-NEXT: mov x9, sp
+; CHECK-O1-NEXT: msub x9, x8, x8, x9
+; CHECK-O1-NEXT: mov sp, x9
+; CHECK-O1-NEXT: mov w19, w0
+; CHECK-O1-NEXT: stp x9, x8, [x29, #-16]
+; CHECK-O1-NEXT: bl shared_za_call
+; CHECK-O1-NEXT: cmp w19, #1
+; CHECK-O1-NEXT: sub x8, x29, #16
+; CHECK-O1-NEXT: msr TPIDR2_EL0, x8
+; CHECK-O1-NEXT: b.lt .LBB0_2
+; CHECK-O1-NEXT: .LBB0_1: // %loop
+; CHECK-O1-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-O1-NEXT: bl private_za_call
+; CHECK-O1-NEXT: subs w19, w19, #1
+; CHECK-O1-NEXT: b.ne .LBB0_1
+; CHECK-O1-NEXT: .LBB0_2: // %exit
+; CHECK-O1-NEXT: smstart za
+; CHECK-O1-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-O1-NEXT: sub x0, x29, #16
+; CHECK-O1-NEXT: cbnz x8, .LBB0_4
+; CHECK-O1-NEXT: // %bb.3: // %exit
+; CHECK-O1-NEXT: bl __arm_tpidr2_restore
+; CHECK-O1-NEXT: .LBB0_4: // %exit
+; CHECK-O1-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-O1-NEXT: mov sp, x29
+; CHECK-O1-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-O1-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-O1-NEXT: b shared_za_call
+entry:
+  %cmpgt = icmp sgt i32 %n, 0
+  tail call void @shared_za_call()
+  br i1 %cmpgt, label %loop, label %exit
+
+loop:
+  %iv = phi i32 [ %next_iv, %loop ], [ 0, %entry ]
+  tail call void @private_za_call()
+  %next_iv = add nuw nsw i32 %iv, 1
+  %cmpeq = icmp eq i32 %next_iv, %n
+  br i1 %cmpeq, label %exit, label %loop
+
+exit:
+  tail call void @shared_za_call()
+  ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/sme-za-control-flow.ll b/llvm/test/CodeGen/AArch64/sme-za-control-flow.ll
index d3d7e953bedfa..e9ef9d22aaba5 100644
--- a/llvm/test/CodeGen/AArch64/sme-za-control-flow.ll
+++ b/llvm/test/CodeGen/AArch64/sme-za-control-flow.ll
@@ -102,7 +102,7 @@ exit:
   ret void
 }
 
-; FIXME: In the new lowering we could weight edges to avoid doing the lazy save in the loop.
+; This tests that with the new lowering we push state changes out of loops (at -O1 and above).
 define void @private_za_loop_active_entry_and_exit(i32 %n) "aarch64_inout_za" nounwind {
 ; CHECK-LABEL: private_za_loop_active_entry_and_exit:
 ; CHECK: // %bb.0: // %entry
@@ -154,7 +154,7 @@ define void @private_za_loop_active_entry_and_exit(i32 %n) "aarch64_inout_za" no
 ; CHECK-NEWLOWERING-LABEL: private_za_loop_active_entry_and_exit:
 ; CHECK-NEWLOWERING: // %bb.0: // %entry
 ; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEWLOWERING-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
 ; CHECK-NEWLOWERING-NEXT: mov x29, sp
 ; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16
 ; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1
@@ -165,30 +165,25 @@ define void @private_za_loop_active_entry_and_exit(i32 %n) "aarch64_inout_za" no
 ; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16]
 ; CHECK-NEWLOWERING-NEXT: bl shared_za_call
 ; CHECK-NEWLOWERING-NEXT: cmp w19, #1
-; CHECK-NEWLOWERING-NEXT: b.lt .LBB1_5
-; CHECK-NEWLOWERING-NEXT: // %bb.1: // %loop.preheader
-; CHECK-NEWLOWERING-NEXT: sub x20, x29, #16
-; CHECK-NEWLOWERING-NEXT: b .LBB1_3
-; CHECK-NEWLOWERING-NEXT: .LBB1_2: // %loop
-; CHECK-NEWLOWERING-NEXT: // in Loop: Header=BB1_3 Depth=1
-; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr
-; CHECK-NEWLOWERING-NEXT: cbz w19, .LBB1_5
-; CHECK-NEWLOWERING-NEXT: .LBB1_3: // %loop
+; CHECK-NEWLOWERING-NEXT: sub x8, x29, #16
+; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x8
+; CHECK-NEWLOWERING-NEXT: b.lt .LBB1_2
+; CHECK-NEWLOWERING-NEXT: .LBB1_1: // %loop
 ; CHECK-NEWLOWERING-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x20
 ; CHECK-NEWLOWERING-NEXT: bl private_za_call
-; CHECK-NEWLOWERING-NEXT: sub w19, w19, #1
+; CHECK-NEWLOWERING-NEXT: subs w19, w19, #1
+; CHECK-NEWLOWERING-NEXT: b.ne .LBB1_1
+; CHECK-NEWLOWERING-NEXT: .LBB1_2: // %exit
 ; CHECK-NEWLOWERING-NEXT: smstart za
 ; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0
 ; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16
-; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB1_2
-; CHECK-NEWLOWERING-NEXT: // %bb.4: // %loop
-; CHECK-NEWLOWERING-NEXT: // in Loop: Header=BB1_3 Depth=1
+; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB1_4
+; CHECK-NEWLOWERING-NEXT: // %bb.3: // %exit
 ; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore
-; CHECK-NEWLOWERING-NEXT: b .LBB1_2
-; CHECK-NEWLOWERING-NEXT: .LBB1_5: // %exit
+; CHECK-NEWLOWERING-NEXT: .LBB1_4: // %exit
+; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr
 ; CHECK-NEWLOWERING-NEXT: mov sp, x29
-; CHECK-NEWLOWERING-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
 ; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
 ; CHECK-NEWLOWERING-NEXT: b shared_za_call
 entry:
``````````

https://github.com/llvm/llvm-project/pull/149065
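A note for experimenting locally: the weight can be overridden on the `llc` command line (hypothetical invocation assembled from the RUN lines in the new test; the flag is `cl::ReallyHidden`, so it does not appear in `--help` but is still accepted):

```
llc -O1 -mtriple=aarch64-linux-gnu -mattr=+sme -aarch64-new-sme-abi \
    -aarch64-sme-abi-loop-edge-weight=20 sme-lazy-save-in-loop.ll -o -
```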