llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-llvm-transforms Author: Sam Tebbs (SamTebbs33) <details> <summary>Changes</summary> Partial reductions can easily be represented by the VPReductionRecipe class by setting their scale factor to something greater than 1. This PR merges the two together and gives VPReductionRecipe a VFScaleFactor so that it can choose to generate the partial reduction intrinsic at execute time. Depends on https://github.com/llvm/llvm-project/pull/144281 . Replaces https://github.com/llvm/llvm-project/pull/146073 . --- Patch is 170.15 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/147513.diff 11 Files Affected: - (modified) llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (+20-14) - (modified) llvm/lib/Transforms/Vectorize/VPlan.h (+45-90) - (modified) llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp (+8-7) - (modified) llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp (+34-114) - (modified) llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp (+28-7) - (modified) llvm/lib/Transforms/Vectorize/VPlanValue.h (-1) - (modified) llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll (+83-163) - (modified) llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll (+243-483) - (modified) llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll (+1-1) - (modified) llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll (+1-1) - (modified) llvm/unittests/Transforms/Vectorize/VPlanTest.cpp (+4-4) ``````````diff diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 0adff8d957e98..570fcf2f71bde 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7045,7 +7045,8 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan, } // The VPlan-based cost model is more accurate for partial reduction and // comparing against the legacy cost isn't desirable. - if (isa<VPPartialReductionRecipe>(&R)) + if (auto *VPR = dyn_cast<VPReductionRecipe>(&R); + VPR && VPR->isPartialReduction()) return true; /// If a VPlan transform folded a recipe to one producing a single-scalar, @@ -8307,11 +8308,14 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(VPSingleDefRecipe *R, Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); // If the PHI is used by a partial reduction, set the scale factor. - unsigned ScaleFactor = - getScalingForReduction(RdxDesc.getLoopExitInstr()).value_or(1); - PhiRecipe = new VPReductionPHIRecipe( - Phi, RdxDesc, *StartV, CM.isInLoopReduction(Phi), - CM.useOrderedReductions(RdxDesc), ScaleFactor); + bool UseInLoopReduction = CM.isInLoopReduction(Phi); + bool UseOrderedReductions = CM.useOrderedReductions(RdxDesc); + auto ScaleFactor = + (UseOrderedReductions || UseInLoopReduction) + ? 0 + : getScalingForReduction(RdxDesc.getLoopExitInstr()).value_or(1); + PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV, + UseOrderedReductions, ScaleFactor); } else { // TODO: Currently fixed-order recurrences are modeled as chains of // first-order recurrences. If there are no users of the intermediate @@ -8375,7 +8379,8 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction, VPValue *Accumulator = Operands[1]; VPRecipeBase *BinOpRecipe = BinOp->getDefiningRecipe(); if (isa<VPReductionPHIRecipe>(BinOpRecipe) || - isa<VPPartialReductionRecipe>(BinOpRecipe)) + (isa<VPReductionRecipe>(BinOpRecipe) && + cast<VPReductionRecipe>(BinOpRecipe)->isPartialReduction())) std::swap(BinOp, Accumulator); unsigned ReductionOpcode = Reduction->getOpcode(); @@ -8396,12 +8401,10 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction, "Expected an ADD or SUB operation for predicated partial " "reductions (because the neutral element in the mask is zero)!"); Cond = getBlockInMask(Builder.getInsertBlock()); - VPValue *Zero = - Plan.getOrAddLiveIn(ConstantInt::get(Reduction->getType(), 0)); - BinOp = Builder.createSelect(Cond, BinOp, Zero, Reduction->getDebugLoc()); } - return new VPPartialReductionRecipe(ReductionOpcode, Accumulator, BinOp, Cond, - ScaleFactor, Reduction); + + return new VPReductionRecipe(RecurKind::Add, FastMathFlags(), Reduction, + Accumulator, BinOp, Cond, false, ScaleFactor); } void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, @@ -9189,9 +9192,11 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( FastMathFlags FMFs = isa<FPMathOperator>(CurrentLinkI) ? RdxDesc.getFastMathFlags() : FastMathFlags(); + bool UseOrderedReductions = CM.useOrderedReductions(RdxDesc); + unsigned VFScaleFactor = !UseOrderedReductions; auto *RedRecipe = new VPReductionRecipe( Kind, FMFs, CurrentLinkI, PreviousLink, VecOp, CondOp, - CM.useOrderedReductions(RdxDesc), CurrentLinkI->getDebugLoc()); + UseOrderedReductions, VFScaleFactor, CurrentLinkI->getDebugLoc()); // Append the recipe to the end of the VPBasicBlock because we need to // ensure that it comes after all of it's inputs, including CondOp. // Delete CurrentLink as it will be invalid if its operand is replaced @@ -9225,8 +9230,9 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( // Don't output selects for partial reductions because they have an output // with fewer lanes than the VF. So the operands of the select would have // different numbers of lanes. Partial reductions mask the input instead. + auto *RR = dyn_cast<VPReductionRecipe>(OrigExitingVPV->getDefiningRecipe()); if (!PhiR->isInLoop() && CM.foldTailByMasking() && - !isa<VPPartialReductionRecipe>(OrigExitingVPV->getDefiningRecipe())) { + (!RR || !RR->isPartialReduction())) { VPValue *Cond = RecipeBuilder.getBlockInMask(PhiR->getParent()); std::optional<FastMathFlags> FMFs = PhiTy->isFloatingPointTy() diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 30f3566332d79..a3cc5f335e82e 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -552,7 +552,6 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue { case VPRecipeBase::VPWidenIntOrFpInductionSC: case VPRecipeBase::VPWidenPointerInductionSC: case VPRecipeBase::VPReductionPHISC: - case VPRecipeBase::VPPartialReductionSC: return true; case VPRecipeBase::VPBranchOnMaskSC: case VPRecipeBase::VPInterleaveSC: @@ -2194,26 +2193,29 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe, /// Descriptor for the reduction. const RecurrenceDescriptor &RdxDesc; - /// The phi is part of an in-loop reduction. - bool IsInLoop; - /// The phi is part of an ordered reduction. Requires IsInLoop to be true. bool IsOrdered; - /// When expanding the reduction PHI, the plan's VF element count is divided - /// by this factor to form the reduction phi's VF. - unsigned VFScaleFactor = 1; + /// The scaling factor, relative to the VF, that this recipe's output is + /// divided by. + /// For outer-loop reductions this is equal to 1. + /// For in-loop reductions this is equal to 0, to specify that this is equal + /// to the VF (which may not be known yet). For partial-reductions this is + /// equal to another scalar value. + unsigned VFScaleFactor; public: /// Create a new VPReductionPHIRecipe for the reduction \p Phi described by \p /// RdxDesc. VPReductionPHIRecipe(PHINode *Phi, const RecurrenceDescriptor &RdxDesc, - VPValue &Start, bool IsInLoop = false, - bool IsOrdered = false, unsigned VFScaleFactor = 1) + VPValue &Start, bool IsOrdered = false, + unsigned VFScaleFactor = 1) : VPHeaderPHIRecipe(VPDef::VPReductionPHISC, Phi, &Start), - RdxDesc(RdxDesc), IsInLoop(IsInLoop), IsOrdered(IsOrdered), - VFScaleFactor(VFScaleFactor) { - assert((!IsOrdered || IsInLoop) && "IsOrdered requires IsInLoop"); + RdxDesc(RdxDesc), IsOrdered(IsOrdered), VFScaleFactor(VFScaleFactor) { + assert((!IsOrdered || isInLoop()) && + "IsOrdered requires the reduction to be in-loop"); + assert(((!isInLoop() && !IsOrdered) || isInLoop()) && + "Invalid VFScaleFactor"); } ~VPReductionPHIRecipe() override = default; @@ -2221,7 +2223,7 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe, VPReductionPHIRecipe *clone() override { auto *R = new VPReductionPHIRecipe( dyn_cast_or_null<PHINode>(getUnderlyingValue()), RdxDesc, - *getOperand(0), IsInLoop, IsOrdered, VFScaleFactor); + *getOperand(0), IsOrdered, VFScaleFactor); R->addOperand(getBackedgeValue()); return R; } @@ -2247,8 +2249,11 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe, /// Returns true, if the phi is part of an ordered reduction. bool isOrdered() const { return IsOrdered; } - /// Returns true, if the phi is part of an in-loop reduction. - bool isInLoop() const { return IsInLoop; } + /// Returns true if the phi is part of an in-loop reduction. + bool isInLoop() const { return VFScaleFactor == 0; } + + /// Returns true if the reduction outputs a vector with a scaled down VF. + bool isPartialReduction() const { return VFScaleFactor > 1; } /// Returns true if the recipe only uses the first lane of operand \p Op. bool onlyFirstLaneUsed(const VPValue *Op) const override { @@ -2421,23 +2426,32 @@ class VPInterleaveRecipe : public VPRecipeBase { Instruction *getInsertPos() const { return IG->getInsertPos(); } }; -/// A recipe to represent inloop reduction operations, performing a reduction on -/// a vector operand into a scalar value, and adding the result to a chain. -/// The Operands are {ChainOp, VecOp, [Condition]}. +/// A recipe to represent inloop, ordered or partial reduction operations. It +/// performs a reduction on a vector operand into a scalar (vector in the case +/// of a partial reduction) value, and adds the result to a chain. The Operands +/// are {ChainOp, VecOp, [Condition]}. class VPReductionRecipe : public VPRecipeWithIRFlags { /// The recurrence kind for the reduction in question. RecurKind RdxKind; bool IsOrdered; /// Whether the reduction is conditional. bool IsConditional = false; + /// The scaling factor, relative to the VF, that this recipe's output is + /// divided by. + /// For outer-loop reductions this is equal to 1. + /// For in-loop reductions this is equal to 0, to specify that this is equal + /// to the VF (which may not be known yet). + /// For partial-reductions this is equal to another scalar value. + unsigned VFScaleFactor; protected: VPReductionRecipe(const unsigned char SC, RecurKind RdxKind, FastMathFlags FMFs, Instruction *I, ArrayRef<VPValue *> Operands, VPValue *CondOp, - bool IsOrdered, DebugLoc DL) + bool IsOrdered, unsigned VFScaleFactor, DebugLoc DL) : VPRecipeWithIRFlags(SC, Operands, FMFs, DL), RdxKind(RdxKind), - IsOrdered(IsOrdered) { + IsOrdered(IsOrdered), VFScaleFactor(VFScaleFactor) { + assert((!IsOrdered || VFScaleFactor == 0) && "Invalid scale factor"); if (CondOp) { IsConditional = true; addOperand(CondOp); @@ -2448,30 +2462,29 @@ class VPReductionRecipe : public VPRecipeWithIRFlags { public: VPReductionRecipe(RecurKind RdxKind, FastMathFlags FMFs, Instruction *I, VPValue *ChainOp, VPValue *VecOp, VPValue *CondOp, - bool IsOrdered, DebugLoc DL = {}) + bool IsOrdered, unsigned VFScaleFactor, DebugLoc DL = {}) : VPReductionRecipe(VPDef::VPReductionSC, RdxKind, FMFs, I, ArrayRef<VPValue *>({ChainOp, VecOp}), CondOp, - IsOrdered, DL) {} + IsOrdered, VFScaleFactor, DL) {} VPReductionRecipe(const RecurKind RdxKind, FastMathFlags FMFs, VPValue *ChainOp, VPValue *VecOp, VPValue *CondOp, - bool IsOrdered, DebugLoc DL = {}) + bool IsOrdered, unsigned VFScaleFactor, DebugLoc DL = {}) : VPReductionRecipe(VPDef::VPReductionSC, RdxKind, FMFs, nullptr, ArrayRef<VPValue *>({ChainOp, VecOp}), CondOp, - IsOrdered, DL) {} + IsOrdered, VFScaleFactor, DL) {} ~VPReductionRecipe() override = default; VPReductionRecipe *clone() override { - return new VPReductionRecipe(RdxKind, getFastMathFlags(), - getUnderlyingInstr(), getChainOp(), getVecOp(), - getCondOp(), IsOrdered, getDebugLoc()); + return new VPReductionRecipe( + RdxKind, getFastMathFlags(), getUnderlyingInstr(), getChainOp(), + getVecOp(), getCondOp(), IsOrdered, VFScaleFactor, getDebugLoc()); } static inline bool classof(const VPRecipeBase *R) { return R->getVPDefID() == VPRecipeBase::VPReductionSC || - R->getVPDefID() == VPRecipeBase::VPReductionEVLSC || - R->getVPDefID() == VPRecipeBase::VPPartialReductionSC; + R->getVPDefID() == VPRecipeBase::VPReductionEVLSC; } static inline bool classof(const VPUser *U) { @@ -2498,6 +2511,8 @@ class VPReductionRecipe : public VPRecipeWithIRFlags { bool isOrdered() const { return IsOrdered; }; /// Return true if the in-loop reduction is conditional. bool isConditional() const { return IsConditional; }; + /// Returns true if the reduction outputs a vector with a scaled down VF. + bool isPartialReduction() const { return VFScaleFactor > 1; } /// The VPValue of the scalar Chain being accumulated. VPValue *getChainOp() const { return getOperand(0); } /// The VPValue of the vector value to be reduced. @@ -2506,68 +2521,8 @@ class VPReductionRecipe : public VPRecipeWithIRFlags { VPValue *getCondOp() const { return isConditional() ? getOperand(getNumOperands() - 1) : nullptr; } -}; - -/// A recipe for forming partial reductions. In the loop, an accumulator and -/// vector operand are added together and passed to the next iteration as the -/// next accumulator. After the loop body, the accumulator is reduced to a -/// scalar value. -class VPPartialReductionRecipe : public VPReductionRecipe { - unsigned Opcode; - - /// The divisor by which the VF of this recipe's output should be divided - /// during execution. - unsigned VFScaleFactor; - -public: - VPPartialReductionRecipe(Instruction *ReductionInst, VPValue *Op0, - VPValue *Op1, VPValue *Cond, unsigned VFScaleFactor) - : VPPartialReductionRecipe(ReductionInst->getOpcode(), Op0, Op1, Cond, - VFScaleFactor, ReductionInst) {} - VPPartialReductionRecipe(unsigned Opcode, VPValue *Op0, VPValue *Op1, - VPValue *Cond, unsigned ScaleFactor, - Instruction *ReductionInst = nullptr) - : VPReductionRecipe(VPDef::VPPartialReductionSC, RecurKind::Add, - FastMathFlags(), ReductionInst, - ArrayRef<VPValue *>({Op0, Op1}), Cond, false, {}), - Opcode(Opcode), VFScaleFactor(ScaleFactor) { - [[maybe_unused]] auto *AccumulatorRecipe = - getChainOp()->getDefiningRecipe(); - // When cloning as part of a VPExpressionRecipe, the chain op could have - // been removed from the plan and so doesn't have a defining recipe. - assert((!AccumulatorRecipe || - isa<VPReductionPHIRecipe>(AccumulatorRecipe) || - isa<VPPartialReductionRecipe>(AccumulatorRecipe)) && - "Unexpected operand order for partial reduction recipe"); - } - ~VPPartialReductionRecipe() override = default; - - VPPartialReductionRecipe *clone() override { - return new VPPartialReductionRecipe(Opcode, getOperand(0), getOperand(1), - getCondOp(), VFScaleFactor, - getUnderlyingInstr()); - } - - VP_CLASSOF_IMPL(VPDef::VPPartialReductionSC) - - /// Generate the reduction in the loop. - void execute(VPTransformState &State) override; - - /// Return the cost of this VPPartialReductionRecipe. - InstructionCost computeCost(ElementCount VF, - VPCostContext &Ctx) const override; - - /// Get the binary op's opcode. - unsigned getOpcode() const { return Opcode; } - /// Get the factor that the VF of this recipe's output should be scaled by. unsigned getVFScaleFactor() const { return VFScaleFactor; } - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - /// Print the recipe. - void print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const override; -#endif }; /// A recipe to represent inloop reduction operations with vector-predication @@ -2583,7 +2538,7 @@ class VPReductionEVLRecipe : public VPReductionRecipe { R.getFastMathFlags(), cast_or_null<Instruction>(R.getUnderlyingValue()), ArrayRef<VPValue *>({R.getChainOp(), R.getVecOp(), &EVL}), CondOp, - R.isOrdered(), DL) {} + R.isOrdered(), 0, DL) {} ~VPReductionEVLRecipe() override = default; diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index 92db9674ef42b..6aa3750897309 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -282,10 +282,10 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) { [](const auto *R) { return R->getScalarType(); }) .Case<VPReductionRecipe, VPPredInstPHIRecipe, VPWidenPHIRecipe, VPScalarIVStepsRecipe, VPWidenGEPRecipe, VPVectorPointerRecipe, - VPVectorEndPointerRecipe, VPWidenCanonicalIVRecipe, - VPPartialReductionRecipe>([this](const VPRecipeBase *R) { - return inferScalarType(R->getOperand(0)); - }) + VPVectorEndPointerRecipe, VPWidenCanonicalIVRecipe>( + [this](const VPRecipeBase *R) { + return inferScalarType(R->getOperand(0)); + }) // VPInstructionWithType must be handled before VPInstruction. .Case<VPInstructionWithType, VPWidenIntrinsicRecipe, VPWidenCastRecipe>( @@ -396,7 +396,7 @@ bool VPDominatorTree::properlyDominates(const VPRecipeBase *A, static unsigned getVFScaleFactor(VPRecipeBase *R) { if (auto *RR = dyn_cast<VPReductionPHIRecipe>(R)) return RR->getVFScaleFactor(); - if (auto *RR = dyn_cast<VPPartialReductionRecipe>(R)) + if (auto *RR = dyn_cast<VPReductionRecipe>(R)) return RR->getVFScaleFactor(); assert( (!isa<VPInstruction>(R) || cast<VPInstruction>(R)->getOpcode() != @@ -566,8 +566,9 @@ SmallVector<VPRegisterUsage, 8> llvm::calculateRegisterUsageForPlan( } else { // The output from scaled phis and scaled reductions actually has // fewer lanes than the VF. - unsigned ScaleFactor = getVFScaleFactor(R); - ElementCount VF = VFs[J].divideCoefficientBy(ScaleFactor); + ElementCount VF = VFs[J]; + if (unsigned ScaleFactor = getVFScaleFactor(R)) + VF = VF.divideCoefficientBy(ScaleFactor); LLVM_DEBUG(if (VF != VFs[J]) { dbgs() << "LV(REG): Scaled down VF from " << VFs[J] << " to " << VF << " for " << *R << "\n"; diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index b485fa3d86e40..9dc3fb1b97447 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -164,7 +164,6 @@ bool VPRecipeBase::mayHaveSideEffects() const { return cast<VPWidenIntrinsicRecipe>(this)->mayHaveSideEffects(); case VPBlendSC: case VPReductionEVLSC: - case VPPartialReductionSC: case VPReductionSC: case VPScalarIVStepsSC: case VPVectorPointerSC: @@ -294,104 +293,6 @@ bool VPRecipeBase::isScalarCast() const { return VPI && Instruction::isCast(VPI->getOpcode()); } -InstructionCost -VPPartialReductionRecipe::computeCost(ElementCount VF, - VPCostContext &Ctx) const { - std::optional<unsigned> Opcode; - VPValue *Op = getOperand(0); - VPRecipeBase *OpR = Op->getDefiningRecipe(); - - // If the partial reduction is predicated, a select will be operand 0 - using namespace llvm::VPlanPatternMatch; - if (match(getOperand(1), m_Select(m_VPValue(), m_VPValue(Op), m_VPValue()))) { - OpR = Op->getDefiningRecipe(); - } - - Type *InputTypeA = nullptr, *InputTypeB = nullptr; - TTI::PartialReductionExtendKind ExtAType = TTI::PR_None, - ExtBType = TTI::PR_None; - - auto GetExtendKind = [](VPRecipeBase *R) { - if (!R) - return TTI::PR_None; - auto *WidenCastR = dyn_cast<VPWidenCastRecipe>(R); - if (!WidenCastR) - return TTI::PR_None; - if (WidenCastR->getOpcode() == Instruction::CastOps::ZExt) - return TTI::PR_ZeroExtend; - if (WidenCastR->getOpcode() == Instruction::CastOps::SExt) - return TTI::PR_SignExtend; - return TTI::PR_None; - }; - - // Pick out opcode, type/ext information and use sub side effects from a widen - // recipe.... [truncated] `````````` </details> https://github.com/llvm/llvm-project/pull/147513 _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits