================ @@ -4825,6 +4827,113 @@ SDValue AArch64TargetLowering::getPStateSM(SelectionDAG &DAG, SDValue Chain, Mask); } +// Lower an SME LDR/STR ZA intrinsic to LDR_ZA_PSEUDO or STR_ZA. +// Case 1: If the vector number (vecnum) is an immediate in range, it gets +// folded into the instruction +// ldr(%tileslice, %ptr, 11) -> ldr [%tileslice, 11], [%ptr, 11] +// Case 2: If the vecnum is not an immediate, then it is used to modify the base +// and tile slice registers +// ldr(%tileslice, %ptr, %vecnum) +// -> +// %svl = rdsvl +// %ptr2 = %ptr + %svl * %vecnum +// %tileslice2 = %tileslice + %vecnum +// ldr [%tileslice2, 0], [%ptr2, 0] +// Case 3: If the vecnum is an immediate out of range, then the same is done as +// case 2, but the base and slice registers are modified by the greatest +// multiple of 15 lower than the vecnum and the remainder is folded into the +// instruction. This means that successive loads and stores that are offset from +// each other can share the same base and slice register updates. +// ldr(%tileslice, %ptr, 22) +// ldr(%tileslice, %ptr, 23) +// -> +// %svl = rdsvl +// %ptr2 = %ptr + %svl * 15 +// %tileslice2 = %tileslice + 15 +// ldr [%tileslice2, 7], [%ptr2, 7] +// ldr [%tileslice2, 8], [%ptr2, 8] +// Case 4: If the vecnum is an add of an immediate, then the non-immediate +// operand and the immediate can be folded into the instruction, like case 2. +// ldr(%tileslice, %ptr, %vecnum + 7) +// ldr(%tileslice, %ptr, %vecnum + 8) +// -> +// %svl = rdsvl +// %ptr2 = %ptr + %svl * %vecnum +// %tileslice2 = %tileslice + %vecnum +// ldr [%tileslice2, 7], [%ptr2, 7] +// ldr [%tileslice2, 8], [%ptr2, 8] +// Case 5: The vecnum being an add of an immediate out of range is also handled, +// in which case the same remainder logic as case 3 is used. 
+SDValue LowerSMELdrStr(SDValue N, SelectionDAG &DAG, bool IsLoad) { + SDLoc DL(N); + + SDValue TileSlice = N->getOperand(2); + SDValue Base = N->getOperand(3); + SDValue VecNum = N->getOperand(4); + int Addend = 0; + + // If the vnum is an add, we can fold that add into the instruction if the + // operand is an immediate. The range check is performed below. + if (VecNum.getOpcode() == ISD::ADD) { + if (auto ImmNode = dyn_cast<ConstantSDNode>(VecNum.getOperand(1))) { + Addend = ImmNode->getSExtValue(); + VecNum = VecNum.getOperand(0); + } + } + + SDValue Remainder = DAG.getTargetConstant(Addend, DL, MVT::i32); + + // true if the base and slice registers need to be modified + bool NeedsAdd = true; + auto ImmNode = dyn_cast<ConstantSDNode>(VecNum); + if (ImmNode || Addend != 0) { + int Imm = ImmNode ? ImmNode->getSExtValue() + Addend : Addend; + Remainder = DAG.getTargetConstant(Imm % 16, DL, MVT::i32); + if (Imm >= 0 && Imm <= 15) { + // If vnum is an immediate in range then we don't need to modify the tile + // slice and base register. We could also get here because Addend != 0 but + // vecnum is not an immediate, in which case we still want the base and + // slice register to be modified + NeedsAdd = !ImmNode; ---------------- sdesmalen-arm wrote:
Maybe it's me, but I find this logic a little tricky to follow. Specifically, the value of NeedsAdd here depends on previous control flow, which in turn depends on whether ImmNode is defined. It might be a bit simpler to follow if you progressively break down VecNum in two successive steps. First, break it down into: * A variable part (e.g. for `i + 17` that would be `i`) * A constant (e.g. for `i + 17` that would be `17`) Second, break down `17` into: * A base constant (for `17` that would be `15`) * An immediate (for `17` that would be `2`) When you then fold the base constant into the variable part, you can avoid the need for `NeedsAdd` because it can be inferred from whether there is a variable part, e.g.: ``` // First split VecNum into a "Variable" and "Constant" part. int32_t ConstAddend = 0; SDValue VariableAddend = VecNum; if (VecNum.getOpcode() == ISD::ADD && isa<ConstantSDNode>(VecNum.getOperand(1))) { ConstAddend = cast<ConstantSDNode>(VecNum.getOperand(1))->getSExtValue(); VariableAddend = VecNum.getOperand(0); } else if (auto ImmNode = dyn_cast<ConstantSDNode>(VecNum)) { ConstAddend = ImmNode->getSExtValue(); VariableAddend = SDValue(); } // Further try to split the constant into an immediate. int32_t ImmAddend = ConstAddend % 16; if (int32_t C = (ConstAddend - ImmAddend)) { SDValue CVal = DAG.getConstant(C, DL, MVT::i32); VariableAddend = VariableAddend ? DAG.getNode(ISD::ADD, DL, MVT::i32, {VariableAddend, CVal}) : CVal; } if (VariableAddend) { // Get the vector length that will be multiplied by VariableAddend auto SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64, ... } ``` https://github.com/llvm/llvm-project/pull/68565 _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits