================ @@ -4850,6 +4852,93 @@ SDValue AArch64TargetLowering::getPStateSM(SelectionDAG &DAG, SDValue Chain, Mask); } +// Lower an SME LDR/STR ZA intrinsic to LDR_ZA_PSEUDO or STR_ZA. +// Case 1: If the vector number (vecnum) is an immediate in range, it gets +// folded into the instruction +// ldr(%tileslice, %ptr, 11) -> ldr [%tileslice, 11], [%ptr, 11] +// Case 2: If the vecnum is not an immediate, then it is used to modify the base +// and tile slice registers +// ldr(%tileslice, %ptr, %vecnum) +// -> +// %svl = rdsvl +// %ptr2 = %ptr + %svl * %vecnum +// %tileslice2 = %tileslice + %vecnum +// ldr [%tileslice2, 0], [%ptr2, 0] +// Case 3: If the vecnum is an immediate out of range, then the same is done as +// case 2, but the base and slice registers are modified by the greatest +// multiple of 16 not greater than the vecnum and the remainder is folded into the +// instruction. This means that successive loads and stores that are offset from +// each other can share the same base and slice register updates. +// ldr(%tileslice, %ptr, 22) +// ldr(%tileslice, %ptr, 23) +// -> +// %svl = rdsvl +// %ptr2 = %ptr + %svl * 16 +// %tileslice2 = %tileslice + 16 +// ldr [%tileslice2, 6], [%ptr2, 6] +// ldr [%tileslice2, 7], [%ptr2, 7] +// Case 4: If the vecnum is an add of an immediate, then the non-immediate +// operand and the immediate can be folded into the instruction, like case 2. +// ldr(%tileslice, %ptr, %vecnum + 7) +// ldr(%tileslice, %ptr, %vecnum + 8) +// -> +// %svl = rdsvl +// %ptr2 = %ptr + %svl * %vecnum +// %tileslice2 = %tileslice + %vecnum +// ldr [%tileslice2, 7], [%ptr2, 7] +// ldr [%tileslice2, 8], [%ptr2, 8] +// Case 5: The vecnum being an add of an immediate out of range is also handled, +// in which case the same remainder logic as case 3 is used. 
+SDValue LowerSMELdrStr(SDValue N, SelectionDAG &DAG, bool IsLoad) { + SDLoc DL(N); + + SDValue TileSlice = N->getOperand(2); + SDValue Base = N->getOperand(3); + SDValue VecNum = N->getOperand(4); + int32_t ConstAddend = 0; + SDValue VarAddend = VecNum; + + // If the vnum is an add of an immediate, we can fold it into the instruction + if (VecNum.getOpcode() == ISD::ADD && + isa<ConstantSDNode>(VecNum.getOperand(1))) { + ConstAddend = cast<ConstantSDNode>(VecNum.getOperand(1))->getSExtValue(); + VarAddend = VecNum.getOperand(0); + } else if (auto ImmNode = dyn_cast<ConstantSDNode>(VecNum)) { + ConstAddend = ImmNode->getSExtValue(); + VarAddend = SDValue(); + } + + int32_t ImmAddend = ConstAddend % 16; + if (int32_t C = (ConstAddend - ImmAddend)) { + SDValue CVal = DAG.getTargetConstant(C, DL, MVT::i32); + VarAddend = VarAddend + ? DAG.getNode(ISD::ADD, DL, MVT::i32, {VarAddend, CVal}) + : CVal; + } + + if (VarAddend) { + // Get the vector length that will be multiplied by vnum + auto SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64, + DAG.getConstant(1, DL, MVT::i32)); + + // Multiply SVL and vnum then add it to the base + SDValue Mul = DAG.getNode( + ISD::MUL, DL, MVT::i64, + {SVL, DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, VarAddend)}); + Base = DAG.getNode(ISD::ADD, DL, MVT::i64, {Base, Mul}); + // Just add vnum to the tileslice + TileSlice = DAG.getNode(ISD::ADD, DL, MVT::i32, {TileSlice, VarAddend}); + } + + SmallVector<SDValue, 4> Ops = { + /*Chain=*/N.getOperand(0), TileSlice, Base, + DAG.getTargetConstant(ImmAddend, DL, MVT::i32)}; ---------------- sdesmalen-arm wrote:
nit: you might as well inline Ops into the expression below. https://github.com/llvm/llvm-project/pull/68565 _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits