https://github.com/MacDue updated https://github.com/llvm/llvm-project/pull/149510
>From c2d34149b2860cadf03824cc35a724775aaf60f8 Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell <benjamin.maxw...@arm.com> Date: Tue, 15 Jul 2025 17:00:04 +0000 Subject: [PATCH] [AArch64][SME] Propagate desired ZA states in the MachineSMEABIPass This patch adds a propagation step to the MachineSMEABIPass that propagates desired ZA states forwards (from predecessors to successors). The aim of this is to pick better ZA states for edge bundles, as when many (or all) blocks in a bundle do not have a preferred ZA state, the ZA state assigned to a bundle can be less than ideal. An important case is nested loops, where only the inner loop has a preferred ZA state. Here we'd like to propagate the ZA state up from the inner loop to the outer loops (to avoid saves/restores in any loop). Change-Id: I39f9c7d7608e2fa070be2fb88351b4d1d0079041 --- llvm/lib/Target/AArch64/MachineSMEABIPass.cpp | 85 ++++- .../sme-za-function-with-many-blocks.ll | 296 ++++++++++++++++++ 2 files changed, 364 insertions(+), 17 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/sme-za-function-with-many-blocks.ll diff --git a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp index 7f3bb42e5a08e..4bf11a7e9da2c 100644 --- a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp +++ b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp @@ -138,6 +138,7 @@ struct MachineSMEABI : public MachineFunctionPass { } void collectNeededZAStates(MachineFunction &MF, SMEAttrs); + void propagateDesiredStates(MachineFunction &MF); void pickBundleZAStates(MachineFunction &MF); void insertStateChanges(MachineFunction &MF, bool IsAgnosticZA); @@ -202,8 +203,10 @@ struct MachineSMEABI : public MachineFunctionPass { }; struct BlockInfo { - ZAState FixedEntryState{ZAState::ANY}; SmallVector<InstInfo> Insts; + ZAState FixedEntryState{ZAState::ANY}; + ZAState DesiredIncomingState{ZAState::ANY}; + ZAState DesiredOutgoingState{ZAState::ANY}; LiveRegs PhysLiveRegsAtEntry = LiveRegs::None; 
LiveRegs PhysLiveRegsAtExit = LiveRegs::None; }; @@ -294,28 +297,74 @@ void MachineSMEABI::collectNeededZAStates(MachineFunction &MF, // Reverse vector (as we had to iterate backwards for liveness). std::reverse(Block.Insts.begin(), Block.Insts.end()); + + // Record the desired states on entry/exit of this block. These are the + // states that would not incur a state transition. + if (!Block.Insts.empty()) { + Block.DesiredIncomingState = Block.Insts.front().NeededState; + Block.DesiredOutgoingState = Block.Insts.back().NeededState; + } + } +} + +void MachineSMEABI::propagateDesiredStates(MachineFunction &MF) { + // This propagates desired states from predecessors to successors. This + // propagates state up loop nests (as an inner loop is a predecessor + // to its outer loops). + SmallVector<MachineBasicBlock *> Worklist; + for (auto [BlockID, BlockInfo] : enumerate(State.Blocks)) { + if (!isLegalEdgeBundleZAState(BlockInfo.DesiredIncomingState)) + Worklist.push_back(MF.getBlockNumbered(BlockID)); + } + + while (!Worklist.empty()) { + MachineBasicBlock *MBB = Worklist.pop_back_val(); + auto &BlockInfo = State.Blocks[MBB->getNumber()]; + + // Pick a legal edge bundle state that matches the majority of predecessors. + int PredStateCounts[ZAState::NUM_ZA_STATE] = {0}; + for (MachineBasicBlock *Pred : predecessors(MBB)) { + auto &PredBlockInfo = State.Blocks[Pred->getNumber()]; + if (isLegalEdgeBundleZAState(PredBlockInfo.DesiredOutgoingState)) + PredStateCounts[PredBlockInfo.DesiredOutgoingState]++; + } + ZAState PropagatedState = + ZAState(max_element(PredStateCounts) - PredStateCounts); + + if (PropagatedState != BlockInfo.DesiredIncomingState) { + BlockInfo.DesiredIncomingState = PropagatedState; + // Propagate to outgoing state for blocks that don't care about their + // ZA state. + if (BlockInfo.DesiredOutgoingState == ZAState::ANY) + BlockInfo.DesiredOutgoingState = PropagatedState; + + // Push any successors that may need updating to the worklist.
+ for (MachineBasicBlock *Succ : successors(MBB)) { + auto &SuccBlockInfo = State.Blocks[Succ->getNumber()]; + if (!isLegalEdgeBundleZAState(SuccBlockInfo.DesiredIncomingState)) + Worklist.push_back(Succ); + } + } } } void MachineSMEABI::pickBundleZAStates(MachineFunction &MF) { State.BundleStates.resize(Bundles->getNumBundles()); + + if (OptLevel != CodeGenOptLevel::None) + propagateDesiredStates(MF); + for (unsigned I = 0, E = Bundles->getNumBundles(); I != E; ++I) { LLVM_DEBUG(dbgs() << "Picking ZA state for edge bundle: " << I << '\n'); // Attempt to pick a ZA state for this bundle that minimizes state // transitions. Edges within loops are given a higher weight as we assume // they will be executed more than once. - // TODO: We should propagate desired incoming/outgoing states through blocks - // that have the "ANY" state first to make better global decisions. int EdgeStateCounts[ZAState::NUM_ZA_STATE] = {0}; for (unsigned BlockID : Bundles->getBlocks(I)) { LLVM_DEBUG(dbgs() << "- bb." 
<< BlockID); BlockInfo &Block = State.Blocks[BlockID]; - if (Block.Insts.empty()) { - LLVM_DEBUG(dbgs() << " (no state preference)\n"); - continue; - } bool IsLoop = MLI && MLI->getLoopFor(MF.getBlockNumbered(BlockID)); bool InEdge = Bundles->getBundle(BlockID, /*Out=*/false) == I; bool OutEdge = Bundles->getBundle(BlockID, /*Out=*/true) == I; @@ -324,26 +373,28 @@ void MachineSMEABI::pickBundleZAStates(MachineFunction &MF) { LLVM_DEBUG(dbgs() << " IsLoop"); LLVM_DEBUG(dbgs() << " (EdgeWeight: " << EdgeWeight << ')'); - ZAState DesiredIncomingState = Block.Insts.front().NeededState; - if (InEdge && isLegalEdgeBundleZAState(DesiredIncomingState)) { - EdgeStateCounts[DesiredIncomingState] += EdgeWeight; + bool LegalInEdge = + InEdge && isLegalEdgeBundleZAState(Block.DesiredIncomingState); + bool LegalOutEgde = + OutEdge && isLegalEdgeBundleZAState(Block.DesiredOutgoingState); + if (LegalInEdge) { LLVM_DEBUG(dbgs() << " DesiredIncomingState: " - << getZAStateString(DesiredIncomingState)); + << getZAStateString(Block.DesiredIncomingState)); + EdgeStateCounts[Block.DesiredIncomingState] += EdgeWeight; } - ZAState DesiredOutgoingState = Block.Insts.back().NeededState; - if (OutEdge && isLegalEdgeBundleZAState(DesiredOutgoingState)) { - EdgeStateCounts[DesiredOutgoingState] += EdgeWeight; + if (LegalOutEgde) { LLVM_DEBUG(dbgs() << " DesiredOutgoingState: " - << getZAStateString(DesiredOutgoingState)); + << getZAStateString(Block.DesiredOutgoingState)); + EdgeStateCounts[Block.DesiredOutgoingState] += EdgeWeight; } + if (!LegalInEdge && !LegalOutEgde) + LLVM_DEBUG(dbgs() << " (no state preference)"); LLVM_DEBUG(dbgs() << '\n'); } ZAState BundleState = ZAState(max_element(EdgeStateCounts) - EdgeStateCounts); - // Force ZA to be active in bundles that don't have a preferred state. - // TODO: Something better here (to avoid extra mode switches). 
if (BundleState == ZAState::ANY) BundleState = ZAState::ACTIVE; diff --git a/llvm/test/CodeGen/AArch64/sme-za-function-with-many-blocks.ll b/llvm/test/CodeGen/AArch64/sme-za-function-with-many-blocks.ll new file mode 100644 index 0000000000000..0306b27cb17e1 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme-za-function-with-many-blocks.ll @@ -0,0 +1,296 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -aarch64-new-sme-abi < %s | FileCheck %s + +; This test case was generated by lowering mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/matmul.mlir to LLVM IR. +; The actual contents of the function are not that important. The main interesting quality here is that many blocks +; don't directly use ZA. The only blocks that require ZA are the MOPA (and load/stores) in the inner loop, and the +;`printMemrefF32()` call in the exit block. +; +; If ZA states are not propagated in the MachineSMEABIPass block %48 (which is within the outer loop), will +; have an edge to block %226 (the exit block), which requires ZA in the "saved" state, and an edge to block %51 +; (which has no preference on ZA state). This means block %48 will also end up in the locally saved state. +; This is not really what we want, as it means we will save/restore ZA in the outer loop. We can fix this by +; propagating the "active" state from the inner loop through basic blocks with no preference, to ensure the outer +; loop is in the "active" state too. +; +; If done correctly, the only ZA save/restore should be in the exit block (with all other blocks in the active state). + +define void @matmul(ptr %0, ptr %1, i64 %2, i64 %3, i64 %4, i64 %5, i64 %6, ptr %7, ptr %8, i64 %9, i64 %10, i64 %11, i64 %12, i64 %13, ptr %14, ptr %15, i64 %16, i64 %17, i64 %18, i64 %19, i64 %20) #0 { +; Check for a ZA zero in the entry block, then no uses of TPIDR2_EL0 (for ZA saves/restore) +; until the exit block (which contains the call to printMemrefF32). 
+; +; CHECK-LABEL: matmul: +; CHECK: zero {za} +; CHECK-NOT: TPIDR2_EL0 +; CHECK: msr TPIDR2_EL0, x{{.*}} +; CHECK-NOT: .LBB{{.*}} +; CHECK: bl printMemrefF32 + %22 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } poison, ptr %14, 0 + %23 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %22, ptr %15, 1 + %24 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %23, i64 %16, 2 + %25 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %24, i64 %17, 3, 0 + %26 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %25, i64 %19, 4, 0 + %27 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %26, i64 %18, 3, 1 + %28 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %27, i64 %20, 4, 1 + %29 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } poison, ptr %7, 0 + %30 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %29, ptr %8, 1 + %31 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %30, i64 %9, 2 + %32 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %31, i64 %10, 3, 0 + %33 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %32, i64 %12, 4, 0 + %34 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %33, i64 %11, 3, 1 + %35 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %34, i64 %13, 4, 1 + %36 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } poison, ptr %0, 0 + %37 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %36, ptr %1, 1 + %38 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %37, i64 %2, 2 + %39 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %38, i64 %3, 3, 0 + %40 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %39, i64 %5, 4, 0 + %41 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %40, i64 %4, 3, 1 + %42 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %41, i64 %6, 4, 1 + %43 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %42, 3, 0 + %44 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %42, 3, 1 + %45 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %35, 
3, 1 + %46 = call i64 @llvm.vscale.i64() + %47 = mul i64 %46, 4 + br label %48 + +48: ; preds = %224, %21 + %49 = phi i64 [ %225, %224 ], [ 0, %21 ] + %50 = icmp slt i64 %49, %43 + br i1 %50, label %51, label %226 + +51: ; preds = %48 + %52 = sub i64 %43, %49 + %53 = call i64 @llvm.smin.i64(i64 %47, i64 %52) + %54 = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32() + %55 = trunc i64 %53 to i32 + %56 = insertelement <vscale x 4 x i32> poison, i32 %55, i32 0 + %57 = shufflevector <vscale x 4 x i32> %56, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer + %58 = icmp slt <vscale x 4 x i32> %54, %57 + br label %59 + +59: ; preds = %222, %51 + %60 = phi i64 [ %223, %222 ], [ 0, %51 ] + %61 = icmp slt i64 %60, %45 + br i1 %61, label %62, label %224 + +62: ; preds = %59 + %63 = sub i64 %45, %60 + %64 = call i64 @llvm.smin.i64(i64 %47, i64 %63) + %65 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %28, 0 + %66 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %28, 1 + %67 = insertvalue { ptr, ptr, i64 } poison, ptr %65, 0 + %68 = insertvalue { ptr, ptr, i64 } %67, ptr %66, 1 + %69 = insertvalue { ptr, ptr, i64 } %68, i64 0, 2 + %70 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %28, 2 + %71 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %28, 3, 0 + %72 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %28, 3, 1 + %73 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %28, 4, 0 + %74 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %28, 4, 1 + %75 = mul nsw i64 %49, %73 + %76 = add i64 %70, %75 + %77 = mul nsw i64 %60, %74 + %78 = add i64 %76, %77 + %79 = extractvalue { ptr, ptr, i64 } %69, 0 + %80 = extractvalue { ptr, ptr, i64 } %69, 1 + %81 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } poison, ptr %79, 0 + %82 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %81, ptr %80, 1 + %83 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %82, i64 %78, 2 + %84 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x 
i64] } %83, i64 %53, 3, 0 + %85 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %84, i64 %73, 4, 0 + %86 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %85, i64 %64, 3, 1 + %87 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %86, i64 %74, 4, 1 + %88 = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32() + %89 = trunc i64 %64 to i32 + %90 = insertelement <vscale x 4 x i32> poison, i32 %89, i32 0 + %91 = shufflevector <vscale x 4 x i32> %90, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer + %92 = icmp slt <vscale x 4 x i32> %88, %91 + br label %93 + +93: ; preds = %220, %62 + %94 = phi i64 [ %221, %220 ], [ 0, %62 ] + %95 = icmp slt i64 %94, %44 + br i1 %95, label %96, label %222 + +96: ; preds = %93 + %97 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %42, 0 + %98 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %42, 1 + %99 = insertvalue { ptr, ptr, i64 } poison, ptr %97, 0 + %100 = insertvalue { ptr, ptr, i64 } %99, ptr %98, 1 + %101 = insertvalue { ptr, ptr, i64 } %100, i64 0, 2 + %102 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %42, 2 + %103 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %42, 3, 0 + %104 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %42, 3, 1 + %105 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %42, 4, 0 + %106 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %42, 4, 1 + %107 = mul nsw i64 %49, %105 + %108 = add i64 %102, %107 + %109 = mul nsw i64 %94, %106 + %110 = add i64 %108, %109 + %111 = extractvalue { ptr, ptr, i64 } %101, 0 + %112 = extractvalue { ptr, ptr, i64 } %101, 1 + %113 = insertvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } poison, ptr %111, 0 + %114 = insertvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } %113, ptr %112, 1 + %115 = insertvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } %114, i64 %110, 2 + %116 = insertvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } %115, i64 %53, 3, 0 + %117 = insertvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } %116, 
i64 %105, 4, 0 + br label %118 + +118: ; preds = %133, %96 + %119 = phi i64 [ %135, %133 ], [ 0, %96 ] + %120 = phi <vscale x 4 x float> [ %134, %133 ], [ poison, %96 ] + %121 = icmp slt i64 %119, %47 + br i1 %121, label %122, label %136 + +122: ; preds = %118 + %123 = extractelement <vscale x 4 x i1> %58, i64 %119 + br i1 %123, label %124, label %133 + +124: ; preds = %122 + %125 = extractvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } %117, 1 + %126 = extractvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } %117, 2 + %127 = getelementptr float, ptr %125, i64 %126 + %128 = extractvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } %117, 4, 0 + %129 = mul nuw nsw i64 %119, %128 + %130 = getelementptr inbounds nuw float, ptr %127, i64 %129 + %131 = load float, ptr %130, align 4 + %132 = insertelement <vscale x 4 x float> %120, float %131, i64 %119 + br label %133 + +133: ; preds = %124, %122 + %134 = phi <vscale x 4 x float> [ %132, %124 ], [ %120, %122 ] + %135 = add i64 %119, 1 + br label %118 + +136: ; preds = %118 + %137 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %35, 0 + %138 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %35, 1 + %139 = insertvalue { ptr, ptr, i64 } poison, ptr %137, 0 + %140 = insertvalue { ptr, ptr, i64 } %139, ptr %138, 1 + %141 = insertvalue { ptr, ptr, i64 } %140, i64 0, 2 + %142 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %35, 2 + %143 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %35, 3, 0 + %144 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %35, 3, 1 + %145 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %35, 4, 0 + %146 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %35, 4, 1 + %147 = mul nsw i64 %94, %145 + %148 = add i64 %142, %147 + %149 = mul nsw i64 %60, %146 + %150 = add i64 %148, %149 + %151 = extractvalue { ptr, ptr, i64 } %141, 0 + %152 = extractvalue { ptr, ptr, i64 } %141, 1 + %153 = insertvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } poison, ptr %151, 0 + %154 = 
insertvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } %153, ptr %152, 1 + %155 = insertvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } %154, i64 %150, 2 + %156 = insertvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } %155, i64 %64, 3, 0 + %157 = insertvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } %156, i64 %146, 4, 0 + br label %158 + +158: ; preds = %173, %136 + %159 = phi i64 [ %175, %173 ], [ 0, %136 ] + %160 = phi <vscale x 4 x float> [ %174, %173 ], [ poison, %136 ] + %161 = icmp slt i64 %159, %47 + br i1 %161, label %162, label %176 + +162: ; preds = %158 + %163 = extractelement <vscale x 4 x i1> %92, i64 %159 + br i1 %163, label %164, label %173 + +164: ; preds = %162 + %165 = extractvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } %157, 1 + %166 = extractvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } %157, 2 + %167 = getelementptr float, ptr %165, i64 %166 + %168 = extractvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } %157, 4, 0 + %169 = mul nuw nsw i64 %159, %168 + %170 = getelementptr inbounds nuw float, ptr %167, i64 %169 + %171 = load float, ptr %170, align 4 + %172 = insertelement <vscale x 4 x float> %160, float %171, i64 %159 + br label %173 + +173: ; preds = %164, %162 + %174 = phi <vscale x 4 x float> [ %172, %164 ], [ %160, %162 ] + %175 = add i64 %159, 1 + br label %158 + +176: ; preds = %158 + %177 = trunc i64 %64 to i32 + br label %178 + +178: ; preds = %181, %176 + %179 = phi i64 [ %202, %181 ], [ 0, %176 ] + %180 = icmp slt i64 %179, %47 + br i1 %180, label %181, label %203 + +181: ; preds = %178 + %182 = icmp ult i64 %179, %53 + %183 = sext i1 %182 to i32 + %184 = and i32 %183, %177 + %185 = sext i32 %184 to i64 + %186 = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32() + %187 = trunc i64 %185 to i32 + %188 = insertelement <vscale x 4 x i32> poison, i32 %187, i32 0 + %189 = shufflevector <vscale x 4 x i32> %188, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer + %190 = icmp slt <vscale x 4 x i32> %186, %189 + %191 = extractvalue { ptr, 
ptr, i64, [2 x i64], [2 x i64] } %87, 1 + %192 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %87, 2 + %193 = getelementptr float, ptr %191, i64 %192 + %194 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %87, 4, 0 + %195 = mul i64 %179, %194 + %196 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %87, 4, 1 + %197 = mul i64 0, %196 + %198 = add i64 %195, %197 + %199 = getelementptr float, ptr %193, i64 %198 + %200 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %199, i32 4, <vscale x 4 x i1> %190, <vscale x 4 x float> poison) + %201 = trunc i64 %179 to i32 + call void @llvm.aarch64.sme.write.horiz.nxv4f32(i32 0, i32 %201, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x float> %200) + %202 = add i64 %179, 1 + br label %178 + +203: ; preds = %178 + call void @llvm.aarch64.sme.mopa.nxv4f32(i32 0, <vscale x 4 x i1> %58, <vscale x 4 x i1> %92, <vscale x 4 x float> %120, <vscale x 4 x float> %160) + %204 = call i64 @llvm.smin.i64(i64 %53, i64 %47) + br label %205 + +205: ; preds = %208, %203 + %206 = phi i64 [ %219, %208 ], [ 0, %203 ] + %207 = icmp slt i64 %206, %204 + br i1 %207, label %208, label %220 + +208: ; preds = %205 + %209 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %87, 1 + %210 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %87, 2 + %211 = getelementptr float, ptr %209, i64 %210 + %212 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %87, 4, 0 + %213 = mul i64 %206, %212 + %214 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %87, 4, 1 + %215 = mul i64 0, %214 + %216 = add i64 %213, %215 + %217 = getelementptr float, ptr %211, i64 %216 + %218 = trunc i64 %206 to i32 + call void @llvm.aarch64.sme.st1w.horiz(<vscale x 4 x i1> %92, ptr %217, i32 0, i32 %218) + %219 = add i64 %206, 1 + br label %205 + +220: ; preds = %205 + %221 = add i64 %94, 1 + br label %93 + +222: ; preds = %93 + %223 = add i64 %60, %47 + br label %59 + +224: ; preds = %59 + %225 = add i64 %49, %47 + br label %48 + +226: ; 
preds = %48 + %227 = alloca { ptr, ptr, i64, [2 x i64], [2 x i64] }, i64 1, align 8 + store { ptr, ptr, i64, [2 x i64], [2 x i64] } %28, ptr %227, align 8 + %228 = insertvalue { i64, ptr } { i64 2, ptr poison }, ptr %227, 1 + %229 = extractvalue { i64, ptr } %228, 0 + %230 = extractvalue { i64, ptr } %228, 1 + call void @printMemrefF32(i64 %229, ptr %230) + ret void +} + +declare void @printMemrefF32(i64, ptr) + +attributes #0 = { "aarch64_new_za" "aarch64_pstate_sm_body" } _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits