Author: Sam Tebbs
Date: 2024-12-17T16:23:26Z
New Revision: 4b79faf786d77e5c5b1271f1efab6967e43f6a17
URL: https://github.com/llvm/llvm-project/commit/4b79faf786d77e5c5b1271f1efab6967e43f6a17
DIFF: https://github.com/llvm/llvm-project/commit/4b79faf786d77e5c5b1271f1efab6967e43f6a17.diff

LOG: Revert "[AArch64] Lower alias mask to a whilewr (#100769)"

This reverts commit e7f9d8e5c3e49e729c69aaa9be3322f7902370b8.

Added: 

Modified: 
    llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Removed: 
    llvm/test/CodeGen/AArch64/whilewr.ll


################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 28f304100326c6..abc00fc86ee455 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1539,7 +1539,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
       setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
       setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
       setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
-      setOperationAction(ISD::OR, VT, Custom);
 
       setOperationAction(ISD::SELECT_CC, VT, Expand);
       setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
@@ -14329,128 +14328,8 @@ static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
   return ResultSLI;
 }
 
-/// Try to lower the construction of a pointer alias mask to a WHILEWR.
-/// The mask's enabled lanes represent the elements that will not overlap across
-/// one loop iteration. This tries to match:
-/// or (splat (setcc_lt (sub ptrA, ptrB), -(element_size - 1))),
-///    (get_active_lane_mask 0, (div (sub ptrA, ptrB), element_size))
-SDValue tryWhileWRFromOR(SDValue Op, SelectionDAG &DAG,
-                         const AArch64Subtarget &Subtarget) {
-  if (!Subtarget.hasSVE2())
-    return SDValue();
-  SDValue LaneMask = Op.getOperand(0);
-  SDValue Splat = Op.getOperand(1);
-
-  if (Splat.getOpcode() != ISD::SPLAT_VECTOR)
-    std::swap(LaneMask, Splat);
-
-  if (LaneMask.getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
-      LaneMask.getConstantOperandVal(0) != Intrinsic::get_active_lane_mask ||
-      Splat.getOpcode() != ISD::SPLAT_VECTOR)
-    return SDValue();
-
-  SDValue Cmp = Splat.getOperand(0);
-  if (Cmp.getOpcode() != ISD::SETCC)
-    return SDValue();
-
-  CondCodeSDNode *Cond = cast<CondCodeSDNode>(Cmp.getOperand(2));
-
-  auto ComparatorConst = dyn_cast<ConstantSDNode>(Cmp.getOperand(1));
-  if (!ComparatorConst || ComparatorConst->getSExtValue() > 0 ||
-      Cond->get() != ISD::CondCode::SETLT)
-    return SDValue();
-  unsigned CompValue = std::abs(ComparatorConst->getSExtValue());
-  unsigned EltSize = CompValue + 1;
-  if (!isPowerOf2_64(EltSize) || EltSize > 8)
-    return SDValue();
-
-  SDValue Diff = Cmp.getOperand(0);
-  if (Diff.getOpcode() != ISD::SUB || Diff.getValueType() != MVT::i64)
-    return SDValue();
-
-  if (!isNullConstant(LaneMask.getOperand(1)) ||
-      (EltSize != 1 && LaneMask.getOperand(2).getOpcode() != ISD::SRA))
-    return SDValue();
-
-  // The number of elements that alias is calculated by dividing the positive
-  // difference between the pointers by the element size. An alias mask for i8
-  // elements omits the division because it would just divide by 1
-  if (EltSize > 1) {
-    SDValue DiffDiv = LaneMask.getOperand(2);
-    auto DiffDivConst = dyn_cast<ConstantSDNode>(DiffDiv.getOperand(1));
-    if (!DiffDivConst || DiffDivConst->getZExtValue() != Log2_64(EltSize))
-      return SDValue();
-    if (EltSize > 2) {
-      // When masking i32 or i64 elements, the positive value of the
-      // possibly-negative difference comes from a select of the difference if
-      // it's positive, otherwise the difference plus the element size if it's
-      // negative: pos_diff = diff < 0 ? (diff + 7) : diff
-      SDValue Select = DiffDiv.getOperand(0);
-      // Make sure the difference is being compared by the select
-      if (Select.getOpcode() != ISD::SELECT_CC || Select.getOperand(3) != Diff)
-        return SDValue();
-      // Make sure it's checking if the difference is less than 0
-      if (!isNullConstant(Select.getOperand(1)) ||
-          cast<CondCodeSDNode>(Select.getOperand(4))->get() !=
-              ISD::CondCode::SETLT)
-        return SDValue();
-      // An add creates a positive value from the negative difference
-      SDValue Add = Select.getOperand(2);
-      if (Add.getOpcode() != ISD::ADD || Add.getOperand(0) != Diff)
-        return SDValue();
-      if (auto *AddConst = dyn_cast<ConstantSDNode>(Add.getOperand(1));
-          !AddConst || AddConst->getZExtValue() != EltSize - 1)
-        return SDValue();
-    } else {
-      // When masking i16 elements, this positive value comes from adding the
-      // difference's sign bit to the difference itself. This is equivalent to
-      // the 32 bit and 64 bit case: pos_diff = diff + sign_bit(diff)
-      SDValue Add = DiffDiv.getOperand(0);
-      if (Add.getOpcode() != ISD::ADD || Add.getOperand(0) != Diff)
-        return SDValue();
-      // A logical right shift by 63 extracts the sign bit from the difference
-      SDValue Shift = Add.getOperand(1);
-      if (Shift.getOpcode() != ISD::SRL || Shift.getOperand(0) != Diff)
-        return SDValue();
-      if (auto *ShiftConst = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
-          !ShiftConst || ShiftConst->getZExtValue() != 63)
-        return SDValue();
-    }
-  } else if (LaneMask.getOperand(2) != Diff)
-    return SDValue();
-
-  SDValue StorePtr = Diff.getOperand(0);
-  SDValue ReadPtr = Diff.getOperand(1);
-
-  unsigned IntrinsicID = 0;
-  switch (EltSize) {
-  case 1:
-    IntrinsicID = Intrinsic::aarch64_sve_whilewr_b;
-    break;
-  case 2:
-    IntrinsicID = Intrinsic::aarch64_sve_whilewr_h;
-    break;
-  case 4:
-    IntrinsicID = Intrinsic::aarch64_sve_whilewr_s;
-    break;
-  case 8:
-    IntrinsicID = Intrinsic::aarch64_sve_whilewr_d;
-    break;
-  default:
-    return SDValue();
-  }
-  SDLoc DL(Op);
-  SDValue ID = DAG.getConstant(IntrinsicID, DL, MVT::i32);
-  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(), ID,
-                     StorePtr, ReadPtr);
-}
-
 SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
                                              SelectionDAG &DAG) const {
-  if (SDValue SV =
-          tryWhileWRFromOR(Op, DAG, DAG.getSubtarget<AArch64Subtarget>()))
-    return SV;
-
   if (useSVEForFixedLengthVectorVT(Op.getValueType(),
                                    !Subtarget->isNeonAvailable()))
     return LowerToScalableOp(Op, DAG);

diff --git a/llvm/test/CodeGen/AArch64/whilewr.ll b/llvm/test/CodeGen/AArch64/whilewr.ll
deleted file mode 100644
index 9f1ea850792384..00000000000000
--- a/llvm/test/CodeGen/AArch64/whilewr.ll
+++ /dev/null
@@ -1,1086 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc %s -mtriple=aarch64-linux-gnu -mattr=+sve2 -o - | FileCheck %s
-; RUN: llc %s -mtriple=aarch64-linux-gnu -mattr=+sve -o - | FileCheck %s --check-prefix=CHECK-NOSVE2
-
-define <vscale x 16 x i1> @whilewr_8(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
-; CHECK-LABEL: whilewr_8:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: whilewr p0.b, x1, x2
-; CHECK-NEXT: ret
-;
-; CHECK-NOSVE2-LABEL: whilewr_8:
-; CHECK-NOSVE2: // %bb.0: // %entry
-; CHECK-NOSVE2-NEXT: sub x8, x1, x2
-; CHECK-NOSVE2-NEXT: cmp x8, #0
-; CHECK-NOSVE2-NEXT: cset w9, lt
-; CHECK-NOSVE2-NEXT: whilelo p0.b, xzr, x8
-; CHECK-NOSVE2-NEXT: sbfx x8, x9, #0, #1
-; CHECK-NOSVE2-NEXT: whilelo p1.b, xzr, x8
-; CHECK-NOSVE2-NEXT: sel p0.b, p0, p0.b, p1.b
-; CHECK-NOSVE2-NEXT: ret
-entry:
-  %c14 = ptrtoint ptr %c to i64
-  %b15 = ptrtoint ptr %b to i64
-  %sub.diff = sub i64 %b15, %c14
-  %neg.compare = icmp slt i64 %sub.diff, 0
-  %.splatinsert = insertelement <vscale x 16 x i1> poison, i1 %neg.compare, i64 0
-  %.splat = shufflevector <vscale x 16 x i1> %.splatinsert, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
-  %ptr.diff.lane.mask = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %sub.diff)
-  %active.lane.mask.alias = or <vscale x 16 x i1> %ptr.diff.lane.mask, %.splat
-  ret <vscale x 16 x i1> %active.lane.mask.alias
-}
-
-define <vscale x 16 x i1> @whilewr_commutative(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
-; CHECK-LABEL: whilewr_commutative:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: whilewr p0.b, x1, x2
-; CHECK-NEXT: ret
-;
-; CHECK-NOSVE2-LABEL: whilewr_commutative:
-; CHECK-NOSVE2: // %bb.0: // %entry
-; CHECK-NOSVE2-NEXT: sub x8, x1, x2
-; CHECK-NOSVE2-NEXT: cmp x8, #0
-; CHECK-NOSVE2-NEXT: cset w9, lt
-; CHECK-NOSVE2-NEXT: whilelo p0.b, xzr, x8
-; CHECK-NOSVE2-NEXT: sbfx x8, x9, #0, #1
-; CHECK-NOSVE2-NEXT: whilelo p1.b, xzr, x8
-; CHECK-NOSVE2-NEXT: mov p0.b, p1/m, p1.b
-; CHECK-NOSVE2-NEXT: ret
-entry:
-  %c14 = ptrtoint ptr %c to i64
-  %b15 = ptrtoint ptr %b to i64
-  %sub.diff = sub i64 %b15, %c14
-  %neg.compare = icmp slt i64 %sub.diff, 0
-  %.splatinsert = insertelement <vscale x 16 x i1> poison, i1 %neg.compare, i64 0
-  %.splat = shufflevector <vscale x 16 x i1> %.splatinsert, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
-  %ptr.diff.lane.mask = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %sub.diff)
-  %active.lane.mask.alias = or <vscale x 16 x i1> %.splat, %ptr.diff.lane.mask
-  ret <vscale x 16 x i1> %active.lane.mask.alias
-}
-
-define <vscale x 8 x i1> @whilewr_16(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
-; CHECK-LABEL: whilewr_16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: whilewr p0.h, x1, x2
-; CHECK-NEXT: ret
-;
-; CHECK-NOSVE2-LABEL: whilewr_16:
-; CHECK-NOSVE2: // %bb.0: // %entry
-; CHECK-NOSVE2-NEXT: sub x8, x1, x2
-; CHECK-NOSVE2-NEXT: cmn x8, #1
-; CHECK-NOSVE2-NEXT: add x8, x8, x8, lsr #63
-; CHECK-NOSVE2-NEXT: cset w9, lt
-; CHECK-NOSVE2-NEXT: sbfx x9, x9, #0, #1
-; CHECK-NOSVE2-NEXT: asr x8, x8, #1
-; CHECK-NOSVE2-NEXT: whilelo p0.h, xzr, x9
-; CHECK-NOSVE2-NEXT: whilelo p1.h, xzr, x8
-; CHECK-NOSVE2-NEXT: mov p0.b, p1/m, p1.b
-; CHECK-NOSVE2-NEXT: ret
-entry:
-  %b14 = ptrtoint ptr %b to i64
-  %c15 = ptrtoint ptr %c to i64
-  %sub.diff = sub i64 %b14, %c15
-  %diff = sdiv i64 %sub.diff, 2
-  %neg.compare = icmp slt i64 %sub.diff, -1
-  %.splatinsert = insertelement <vscale x 8 x i1> poison, i1 %neg.compare, i64 0
-  %.splat = shufflevector <vscale x 8 x i1> %.splatinsert, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
-  %ptr.diff.lane.mask = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %diff)
-  %active.lane.mask.alias = or <vscale x 8 x i1> %ptr.diff.lane.mask, %.splat
-  ret <vscale x 8 x i1> %active.lane.mask.alias
-}
-
-define <vscale x 4 x i1> @whilewr_32(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
-; CHECK-LABEL: whilewr_32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: whilewr p0.s, x1, x2
-; CHECK-NEXT: ret
-;
-; CHECK-NOSVE2-LABEL: whilewr_32:
-; CHECK-NOSVE2: // %bb.0: // %entry
-; CHECK-NOSVE2-NEXT: sub x8, x1, x2
-; CHECK-NOSVE2-NEXT: add x9, x8, #3
-; CHECK-NOSVE2-NEXT: cmp x8, #0
-; CHECK-NOSVE2-NEXT: csel x9, x9, x8, lt
-; CHECK-NOSVE2-NEXT: cmn x8, #3
-; CHECK-NOSVE2-NEXT: cset w8, lt
-; CHECK-NOSVE2-NEXT: asr x9, x9, #2
-; CHECK-NOSVE2-NEXT: sbfx x8, x8, #0, #1
-; CHECK-NOSVE2-NEXT: whilelo p1.s, xzr, x9
-; CHECK-NOSVE2-NEXT: whilelo p0.s, xzr, x8
-; CHECK-NOSVE2-NEXT: mov p0.b, p1/m, p1.b
-; CHECK-NOSVE2-NEXT: ret
-entry:
-  %b12 = ptrtoint ptr %b to i64
-  %c13 = ptrtoint ptr %c to i64
-  %sub.diff = sub i64 %b12, %c13
-  %diff = sdiv i64 %sub.diff, 4
-  %neg.compare = icmp slt i64 %sub.diff, -3
-  %.splatinsert = insertelement <vscale x 4 x i1> poison, i1 %neg.compare, i64 0
-  %.splat = shufflevector <vscale x 4 x i1> %.splatinsert, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
-  %ptr.diff.lane.mask = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %diff)
-  %active.lane.mask.alias = or <vscale x 4 x i1> %ptr.diff.lane.mask, %.splat
-  ret <vscale x 4 x i1> %active.lane.mask.alias
-}
-
-define <vscale x 2 x i1> @whilewr_64(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
-; CHECK-LABEL: whilewr_64:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: whilewr p0.d, x1, x2
-; CHECK-NEXT: ret
-;
-; CHECK-NOSVE2-LABEL: whilewr_64:
-; CHECK-NOSVE2: // %bb.0: // %entry
-; CHECK-NOSVE2-NEXT: sub x8, x1, x2
-; CHECK-NOSVE2-NEXT: add x9, x8, #7
-; CHECK-NOSVE2-NEXT: cmp x8, #0
-; CHECK-NOSVE2-NEXT: csel x9, x9, x8, lt
-; CHECK-NOSVE2-NEXT: cmn x8, #7
-; CHECK-NOSVE2-NEXT: cset w8, lt
-; CHECK-NOSVE2-NEXT: asr x9, x9, #3
-; CHECK-NOSVE2-NEXT: sbfx x8, x8, #0, #1
-; CHECK-NOSVE2-NEXT: whilelo p1.d, xzr, x9
-; CHECK-NOSVE2-NEXT: whilelo p0.d, xzr, x8
-; CHECK-NOSVE2-NEXT: mov p0.b, p1/m, p1.b
-; CHECK-NOSVE2-NEXT: ret
-entry:
-  %b12 = ptrtoint ptr %b to i64
-  %c13 = ptrtoint ptr %c to i64
-  %sub.diff = sub i64 %b12, %c13
-  %diff = sdiv i64 %sub.diff, 8
-  %neg.compare = icmp slt i64 %sub.diff, -7
-  %.splatinsert = insertelement <vscale x 2 x i1> poison, i1 %neg.compare, i64 0
-  %.splat = shufflevector <vscale x 2 x i1> %.splatinsert, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
-  %ptr.diff.lane.mask = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %diff)
-  %active.lane.mask.alias = or <vscale x 2 x i1> %ptr.diff.lane.mask, %.splat
-  ret <vscale x 2 x i1> %active.lane.mask.alias
-}
-
-define <vscale x 1 x i1> @no_whilewr_128(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
-; CHECK-LABEL: no_whilewr_128:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: sub x8, x1, x2
-; CHECK-NEXT: index z0.d, #0, #1
-; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: add x9, x8, #15
-; CHECK-NEXT: cmp x8, #0
-; CHECK-NEXT: csel x9, x9, x8, lt
-; CHECK-NEXT: cmn x8, #15
-; CHECK-NEXT: asr x9, x9, #4
-; CHECK-NEXT: cset w8, lt
-; CHECK-NEXT: sbfx x8, x8, #0, #1
-; CHECK-NEXT: mov z1.d, x9
-; CHECK-NEXT: whilelo p1.d, xzr, x8
-; CHECK-NEXT: cmphi p0.d, p0/z, z1.d, z0.d
-; CHECK-NEXT: punpklo p1.h, p1.b
-; CHECK-NEXT: punpklo p0.h, p0.b
-; CHECK-NEXT: sel p0.b, p0, p0.b, p1.b
-; CHECK-NEXT: ret
-;
-; CHECK-NOSVE2-LABEL: no_whilewr_128:
-; CHECK-NOSVE2: // %bb.0: // %entry
-; CHECK-NOSVE2-NEXT: sub x8, x1, x2
-; CHECK-NOSVE2-NEXT: index z0.d, #0, #1
-; CHECK-NOSVE2-NEXT: ptrue p0.d
-; CHECK-NOSVE2-NEXT: add x9, x8, #15
-; CHECK-NOSVE2-NEXT: cmp x8, #0
-; CHECK-NOSVE2-NEXT: csel x9, x9, x8, lt
-; CHECK-NOSVE2-NEXT: cmn x8, #15
-; CHECK-NOSVE2-NEXT: asr x9, x9, #4
-; CHECK-NOSVE2-NEXT: cset w8, lt
-; CHECK-NOSVE2-NEXT: sbfx x8, x8, #0, #1
-; CHECK-NOSVE2-NEXT: mov z1.d, x9
-; CHECK-NOSVE2-NEXT: whilelo p1.d, xzr, x8
-; CHECK-NOSVE2-NEXT: cmphi p0.d, p0/z, z1.d, z0.d
-; CHECK-NOSVE2-NEXT: punpklo p1.h, p1.b
-; CHECK-NOSVE2-NEXT: punpklo p0.h, p0.b
-; CHECK-NOSVE2-NEXT: sel p0.b, p0, p0.b, p1.b
-; CHECK-NOSVE2-NEXT: ret
-entry:
-  %b12 = ptrtoint ptr %b to i64
-  %c13 = ptrtoint ptr %c to i64
-  %sub.diff = sub i64 %b12, %c13
-  %diff = sdiv i64 %sub.diff, 16
-  %neg.compare = icmp slt i64 %sub.diff, -15
-  %.splatinsert = insertelement <vscale x 1 x i1> poison, i1 %neg.compare, i64 0
-  %.splat = shufflevector <vscale x 1 x i1> %.splatinsert, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
-  %ptr.diff.lane.mask = tail call <vscale x 1 x i1> @llvm.get.active.lane.mask.nxv1i1.i64(i64 0, i64 %diff)
-  %active.lane.mask.alias = or <vscale x 1 x i1> %ptr.diff.lane.mask, %.splat
-  ret <vscale x 1 x i1> %active.lane.mask.alias
-}
-
-define void @whilewr_loop_8(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
-; CHECK-LABEL: whilewr_loop_8:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: cmp w3, #1
-; CHECK-NEXT: b.lt .LBB6_3
-; CHECK-NEXT: // %bb.1: // %for.body.preheader
-; CHECK-NEXT: whilewr p0.b, x1, x2
-; CHECK-NEXT: mov w9, w3
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: whilelo p1.b, xzr, x9
-; CHECK-NEXT: cntp x10, p0, p0.b
-; CHECK-NEXT: and x10, x10, #0xff
-; CHECK-NEXT: .LBB6_2: // %vector.body
-; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: and p1.b, p1/z, p1.b, p0.b
-; CHECK-NEXT: ld1b { z0.b }, p1/z, [x0, x8]
-; CHECK-NEXT: ld1b { z1.b }, p1/z, [x1, x8]
-; CHECK-NEXT: add z0.b, z1.b, z0.b
-; CHECK-NEXT: st1b { z0.b }, p1, [x2, x8]
-; CHECK-NEXT: add x8, x8, x10
-; CHECK-NEXT: whilelo p1.b, x8, x9
-; CHECK-NEXT: b.mi .LBB6_2
-; CHECK-NEXT: .LBB6_3: // %for.cond.cleanup
-; CHECK-NEXT: ret
-;
-; CHECK-NOSVE2-LABEL: whilewr_loop_8:
-; CHECK-NOSVE2: // %bb.0: // %entry
-; CHECK-NOSVE2-NEXT: cmp w3, #1
-; CHECK-NOSVE2-NEXT: b.lt .LBB6_3
-; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader
-; CHECK-NOSVE2-NEXT: sub x9, x1, x2
-; CHECK-NOSVE2-NEXT: mov x8, xzr
-; CHECK-NOSVE2-NEXT: cmp x9, #0
-; CHECK-NOSVE2-NEXT: cset w10, lt
-; CHECK-NOSVE2-NEXT: whilelo p0.b, xzr, x9
-; CHECK-NOSVE2-NEXT: sbfx x9, x10, #0, #1
-; CHECK-NOSVE2-NEXT: whilelo p1.b, xzr, x9
-; CHECK-NOSVE2-NEXT: mov w9, w3
-; CHECK-NOSVE2-NEXT: sel p0.b, p0, p0.b, p1.b
-; CHECK-NOSVE2-NEXT: whilelo p1.b, xzr, x9
-; CHECK-NOSVE2-NEXT: cntp x10, p0, p0.b
-; CHECK-NOSVE2-NEXT: and x10, x10, #0xff
-; CHECK-NOSVE2-NEXT: .LBB6_2: // %vector.body
-; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NOSVE2-NEXT: and p1.b, p1/z, p1.b, p0.b
-; CHECK-NOSVE2-NEXT: ld1b { z0.b }, p1/z, [x0, x8]
-; CHECK-NOSVE2-NEXT: ld1b { z1.b }, p1/z, [x1, x8]
-; CHECK-NOSVE2-NEXT: add z0.b, z1.b, z0.b
-; CHECK-NOSVE2-NEXT: st1b { z0.b }, p1, [x2, x8]
-; CHECK-NOSVE2-NEXT: add x8, x8, x10
-; CHECK-NOSVE2-NEXT: whilelo p1.b, x8, x9
-; CHECK-NOSVE2-NEXT: b.mi .LBB6_2
-; CHECK-NOSVE2-NEXT: .LBB6_3: // %for.cond.cleanup
-; CHECK-NOSVE2-NEXT: ret
-entry:
-  %cmp11 = icmp sgt i32 %n, 0
-  br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader:
-  %c14 = ptrtoint ptr %c to i64
-  %b15 = ptrtoint ptr %b to i64
-  %wide.trip.count = zext nneg i32 %n to i64
-  %sub.diff = sub i64 %b15, %c14
-  %neg.compare = icmp slt i64 %sub.diff, 0
-  %.splatinsert = insertelement <vscale x 16 x i1> poison, i1 %neg.compare, i64 0
-  %.splat = shufflevector <vscale x 16 x i1> %.splatinsert, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
-  %ptr.diff.lane.mask = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %sub.diff)
-  %active.lane.mask.alias = or <vscale x 16 x i1> %ptr.diff.lane.mask, %.splat
-  %active.lane.mask.entry = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %wide.trip.count)
-  %0 = zext <vscale x 16 x i1> %active.lane.mask.alias to <vscale x 16 x i8>
-  %1 = tail call i8 @llvm.vector.reduce.add.nxv16i8(<vscale x 16 x i8> %0)
-  %2 = zext i8 %1 to i64
-  br label %vector.body
-
-vector.body:
-  %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ]
-  %active.lane.mask = phi <vscale x 16 x i1> [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
-  %3 = and <vscale x 16 x i1> %active.lane.mask, %active.lane.mask.alias
-  %4 = getelementptr inbounds i8, ptr %a, i64 %index
-  %wide.masked.load = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr %4, i32 1, <vscale x 16 x i1> %3, <vscale x 16 x i8> poison)
-  %5 = getelementptr inbounds i8, ptr %b, i64 %index
-  %wide.masked.load16 = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr %5, i32 1, <vscale x 16 x i1> %3, <vscale x 16 x i8> poison)
-  %6 = add <vscale x 16 x i8> %wide.masked.load16, %wide.masked.load
-  %7 = getelementptr inbounds i8, ptr %c, i64 %index
-  tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %6, ptr %7, i32 1, <vscale x 16 x i1> %3)
-  %index.next = add i64 %index, %2
-  %active.lane.mask.next = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 %index.next, i64 %wide.trip.count)
-  %8 = extractelement <vscale x 16 x i1> %active.lane.mask.next, i64 0
-  br i1 %8, label %vector.body, label %for.cond.cleanup
-
-for.cond.cleanup:
-  ret void
-}
-
-define void @whilewr_loop_16(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
-; CHECK-LABEL: whilewr_loop_16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: cmp w3, #1
-; CHECK-NEXT: b.lt .LBB7_3
-; CHECK-NEXT: // %bb.1: // %for.body.preheader
-; CHECK-NEXT: mov w8, w3
-; CHECK-NEXT: whilewr p1.h, x1, x2
-; CHECK-NEXT: mov x9, xzr
-; CHECK-NEXT: whilelo p0.h, xzr, x8
-; CHECK-NEXT: and p0.b, p1/z, p1.b, p0.b
-; CHECK-NEXT: .LBB7_2: // %vector.body
-; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x9, lsl #1]
-; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1, x9, lsl #1]
-; CHECK-NEXT: add z0.h, z1.h, z0.h
-; CHECK-NEXT: st1h { z0.h }, p0, [x2, x9, lsl #1]
-; CHECK-NEXT: inch x9
-; CHECK-NEXT: whilelo p0.h, x9, x8
-; CHECK-NEXT: b.mi .LBB7_2
-; CHECK-NEXT: .LBB7_3: // %for.cond.cleanup
-; CHECK-NEXT: ret
-;
-; CHECK-NOSVE2-LABEL: whilewr_loop_16:
-; CHECK-NOSVE2: // %bb.0: // %entry
-; CHECK-NOSVE2-NEXT: cmp w3, #1
-; CHECK-NOSVE2-NEXT: b.lt .LBB7_3
-; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader
-; CHECK-NOSVE2-NEXT: mov w9, w3
-; CHECK-NOSVE2-NEXT: sub x10, x1, x2
-; CHECK-NOSVE2-NEXT: mov x8, xzr
-; CHECK-NOSVE2-NEXT: whilelo p0.h, xzr, x9
-; CHECK-NOSVE2-NEXT: cmn x10, #1
-; CHECK-NOSVE2-NEXT: add x10, x10, x10, lsr #63
-; CHECK-NOSVE2-NEXT: cset w11, lt
-; CHECK-NOSVE2-NEXT: sbfx x11, x11, #0, #1
-; CHECK-NOSVE2-NEXT: asr x10, x10, #1
-; CHECK-NOSVE2-NEXT: whilelo p1.h, xzr, x11
-; CHECK-NOSVE2-NEXT: whilelo p2.h, xzr, x10
-; CHECK-NOSVE2-NEXT: cnth x10
-; CHECK-NOSVE2-NEXT: mov p1.b, p2/m, p2.b
-; CHECK-NOSVE2-NEXT: and p0.b, p1/z, p1.b, p0.b
-; CHECK-NOSVE2-NEXT: .LBB7_2: // %vector.body
-; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NOSVE2-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
-; CHECK-NOSVE2-NEXT: ld1h { z1.h }, p0/z, [x1, x8, lsl #1]
-; CHECK-NOSVE2-NEXT: add z0.h, z1.h, z0.h
-; CHECK-NOSVE2-NEXT: st1h { z0.h }, p0, [x2, x8, lsl #1]
-; CHECK-NOSVE2-NEXT: add x8, x8, x10
-; CHECK-NOSVE2-NEXT: whilelo p0.h, x8, x9
-; CHECK-NOSVE2-NEXT: b.mi .LBB7_2
-; CHECK-NOSVE2-NEXT: .LBB7_3: // %for.cond.cleanup
-; CHECK-NOSVE2-NEXT: ret
-entry:
-  %cmp11 = icmp sgt i32 %n, 0
-  br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader:
-  %b14 = ptrtoint ptr %b to i64
-  %c15 = ptrtoint ptr %c to i64
-  %wide.trip.count = zext nneg i32 %n to i64
-  %0 = tail call i64 @llvm.vscale.i64()
-  %1 = shl nuw nsw i64 %0, 3
-  %active.lane.mask.entry = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %wide.trip.count)
-  %sub.diff = sub i64 %b14, %c15
-  %diff = sdiv i64 %sub.diff, 2
-  %neg.compare = icmp slt i64 %sub.diff, -1
-  %.splatinsert = insertelement <vscale x 8 x i1> poison, i1 %neg.compare, i64 0
-  %.splat = shufflevector <vscale x 8 x i1> %.splatinsert, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
-  %ptr.diff.lane.mask = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %diff)
-  %active.lane.mask.alias = or <vscale x 8 x i1> %ptr.diff.lane.mask, %.splat
-  %2 = and <vscale x 8 x i1> %active.lane.mask.alias, %active.lane.mask.entry
-  br label %vector.body
-
-vector.body:
-  %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ]
-  %active.lane.mask = phi <vscale x 8 x i1> [ %2, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
-  %3 = getelementptr inbounds i16, ptr %a, i64 %index
-  %wide.masked.load = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr %3, i32 2, <vscale x 8 x i1> %active.lane.mask, <vscale x 8 x i16> poison)
-  %4 = getelementptr inbounds i16, ptr %b, i64 %index
-  %wide.masked.load16 = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr %4, i32 2, <vscale x 8 x i1> %active.lane.mask, <vscale x 8 x i16> poison)
-  %5 = add <vscale x 8 x i16> %wide.masked.load16, %wide.masked.load
-  %6 = getelementptr inbounds i16, ptr %c, i64 %index
-  tail call void @llvm.masked.store.nxv8i16.p0(<vscale x 8 x i16> %5, ptr %6, i32 2, <vscale x 8 x i1> %active.lane.mask)
-  %index.next = add i64 %index, %1
-  %active.lane.mask.next = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 %index.next, i64 %wide.trip.count)
-  %7 = extractelement <vscale x 8 x i1> %active.lane.mask.next, i64 0
-  br i1 %7, label %vector.body, label %for.cond.cleanup
-
-for.cond.cleanup:
-  ret void
-}
-
-define void @whilewr_loop_32(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
-; CHECK-LABEL: whilewr_loop_32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: cmp w3, #1
-; CHECK-NEXT: b.lt .LBB8_3
-; CHECK-NEXT: // %bb.1: // %for.body.preheader
-; CHECK-NEXT: mov w8, w3
-; CHECK-NEXT: whilewr p1.s, x1, x2
-; CHECK-NEXT: mov x9, xzr
-; CHECK-NEXT: whilelo p0.s, xzr, x8
-; CHECK-NEXT: and p0.b, p1/z, p1.b, p0.b
-; CHECK-NEXT: .LBB8_2: // %vector.body
-; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2]
-; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1, x9, lsl #2]
-; CHECK-NEXT: add z0.s, z1.s, z0.s
-; CHECK-NEXT: st1w { z0.s }, p0, [x2, x9, lsl #2]
-; CHECK-NEXT: incw x9
-; CHECK-NEXT: whilelo p0.s, x9, x8
-; CHECK-NEXT: b.mi .LBB8_2
-; CHECK-NEXT: .LBB8_3: // %for.cond.cleanup
-; CHECK-NEXT: ret
-;
-; CHECK-NOSVE2-LABEL: whilewr_loop_32:
-; CHECK-NOSVE2: // %bb.0: // %entry
-; CHECK-NOSVE2-NEXT: cmp w3, #1
-; CHECK-NOSVE2-NEXT: b.lt .LBB8_3
-; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader
-; CHECK-NOSVE2-NEXT: mov w9, w3
-; CHECK-NOSVE2-NEXT: sub x10, x1, x2
-; CHECK-NOSVE2-NEXT: mov x8, xzr
-; CHECK-NOSVE2-NEXT: whilelo p0.s, xzr, x9
-; CHECK-NOSVE2-NEXT: add x11, x10, #3
-; CHECK-NOSVE2-NEXT: cmp x10, #0
-; CHECK-NOSVE2-NEXT: csel x11, x11, x10, lt
-; CHECK-NOSVE2-NEXT: cmn x10, #3
-; CHECK-NOSVE2-NEXT: cset w10, lt
-; CHECK-NOSVE2-NEXT: asr x11, x11, #2
-; CHECK-NOSVE2-NEXT: sbfx x10, x10, #0, #1
-; CHECK-NOSVE2-NEXT: whilelo p2.s, xzr, x11
-; CHECK-NOSVE2-NEXT: whilelo p1.s, xzr, x10
-; CHECK-NOSVE2-NEXT: cntw x10
-; CHECK-NOSVE2-NEXT: mov p1.b, p2/m, p2.b
-; CHECK-NOSVE2-NEXT: and p0.b, p1/z, p1.b, p0.b
-; CHECK-NOSVE2-NEXT: .LBB8_2: // %vector.body
-; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NOSVE2-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
-; CHECK-NOSVE2-NEXT: ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
-; CHECK-NOSVE2-NEXT: add z0.s, z1.s, z0.s
-; CHECK-NOSVE2-NEXT: st1w { z0.s }, p0, [x2, x8, lsl #2]
-; CHECK-NOSVE2-NEXT: add x8, x8, x10
-; CHECK-NOSVE2-NEXT: whilelo p0.s, x8, x9
-; CHECK-NOSVE2-NEXT: b.mi .LBB8_2
-; CHECK-NOSVE2-NEXT: .LBB8_3: // %for.cond.cleanup
-; CHECK-NOSVE2-NEXT: ret
-entry:
-  %cmp9 = icmp sgt i32 %n, 0
-  br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader:
-  %b12 = ptrtoint ptr %b to i64
-  %c13 = ptrtoint ptr %c to i64
-  %wide.trip.count = zext nneg i32 %n to i64
-  %0 = tail call i64 @llvm.vscale.i64()
-  %1 = shl nuw nsw i64 %0, 2
-  %active.lane.mask.entry = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %wide.trip.count)
-  %sub.diff = sub i64 %b12, %c13
-  %diff = sdiv i64 %sub.diff, 4
-  %neg.compare = icmp slt i64 %sub.diff, -3
-  %.splatinsert = insertelement <vscale x 4 x i1> poison, i1 %neg.compare, i64 0
-  %.splat = shufflevector <vscale x 4 x i1> %.splatinsert, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
-  %ptr.diff.lane.mask = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %diff)
-  %active.lane.mask.alias = or <vscale x 4 x i1> %ptr.diff.lane.mask, %.splat
-  %2 = and <vscale x 4 x i1> %active.lane.mask.alias, %active.lane.mask.entry
-  br label %vector.body
-
-vector.body:
-  %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ]
-  %active.lane.mask = phi <vscale x 4 x i1> [ %2, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
-  %3 = getelementptr inbounds i32, ptr %a, i64 %index
-  %wide.masked.load = tail call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr %3, i32 4, <vscale x 4 x i1> %active.lane.mask, <vscale x 4 x i32> poison)
-  %4 = getelementptr inbounds i32, ptr %b, i64 %index
-  %wide.masked.load14 = tail call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr %4, i32 4, <vscale x 4 x i1> %active.lane.mask, <vscale x 4 x i32> poison)
-  %5 = add <vscale x 4 x i32> %wide.masked.load14, %wide.masked.load
-  %6 = getelementptr inbounds i32, ptr %c, i64 %index
-  tail call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> %5, ptr %6, i32 4, <vscale x 4 x i1> %active.lane.mask)
-  %index.next = add i64 %index, %1
-  %active.lane.mask.next = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 %index.next, i64 %wide.trip.count)
-  %7 = extractelement <vscale x 4 x i1> %active.lane.mask.next, i64 0
-  br i1 %7, label %vector.body, label %for.cond.cleanup
-
-for.cond.cleanup:
-  ret void
-}
-
-define void @whilewr_loop_64(ptr noalias %a, ptr %b, ptr %c, i32 %n) {
-; CHECK-LABEL: whilewr_loop_64:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: cmp w3, #1
-; CHECK-NEXT: b.lt .LBB9_3
-; CHECK-NEXT: // %bb.1: // %for.body.preheader
-; CHECK-NEXT: mov w8, w3
-; CHECK-NEXT: whilewr p1.d, x1, x2
-; CHECK-NEXT: mov x9, xzr
-; CHECK-NEXT: whilelo p0.d, xzr, x8
-; CHECK-NEXT: and p0.b, p1/z, p1.b, p0.b
-; CHECK-NEXT: .LBB9_2: // %vector.body
-; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x9, lsl #3]
-; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1, x9, lsl #3]
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: st1d { z0.d }, p0, [x2, x9, lsl #3]
-; CHECK-NEXT: incd x9
-; CHECK-NEXT: whilelo p0.d, x9, x8
-; CHECK-NEXT: b.mi .LBB9_2
-; CHECK-NEXT: .LBB9_3: // %for.cond.cleanup
-; CHECK-NEXT: ret
-;
-; CHECK-NOSVE2-LABEL: whilewr_loop_64:
-; CHECK-NOSVE2: // %bb.0: // %entry
-; CHECK-NOSVE2-NEXT: cmp w3, #1
-; CHECK-NOSVE2-NEXT: b.lt .LBB9_3
-; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader
-; CHECK-NOSVE2-NEXT: mov w9, w3
-; CHECK-NOSVE2-NEXT: sub x10, x1, x2
-; CHECK-NOSVE2-NEXT: mov x8, xzr
-; CHECK-NOSVE2-NEXT: whilelo p0.d, xzr, x9
-; CHECK-NOSVE2-NEXT: add x11, x10, #7
-; CHECK-NOSVE2-NEXT: cmp x10, #0
-; CHECK-NOSVE2-NEXT: csel x11, x11, x10, lt
-; CHECK-NOSVE2-NEXT: cmn x10, #7
-; CHECK-NOSVE2-NEXT: cset w10, lt
-; CHECK-NOSVE2-NEXT: asr x11, x11, #3
-; CHECK-NOSVE2-NEXT: sbfx x10, x10, #0, #1
-; CHECK-NOSVE2-NEXT: whilelo p2.d, xzr, x11
-; CHECK-NOSVE2-NEXT: whilelo p1.d, xzr, x10
-; CHECK-NOSVE2-NEXT: cntd x10
-; CHECK-NOSVE2-NEXT: mov p1.b, p2/m, p2.b
-; CHECK-NOSVE2-NEXT: and p0.b, p1/z, p1.b, p0.b
-; CHECK-NOSVE2-NEXT: .LBB9_2: // %vector.body
-; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NOSVE2-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
-; CHECK-NOSVE2-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
-; CHECK-NOSVE2-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NOSVE2-NEXT: st1d { z0.d }, p0, [x2, x8, lsl #3]
-; CHECK-NOSVE2-NEXT: add x8, x8, x10
-; CHECK-NOSVE2-NEXT: whilelo p0.d, x8, x9
-; CHECK-NOSVE2-NEXT: b.mi .LBB9_2
-; CHECK-NOSVE2-NEXT: .LBB9_3: // %for.cond.cleanup
-; CHECK-NOSVE2-NEXT: ret
-entry:
-  %cmp9 = icmp sgt i32 %n, 0
-  br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader:
-  %b12 = ptrtoint ptr %b to i64
-  %c13 = ptrtoint ptr %c to i64
-  %wide.trip.count = zext nneg i32 %n to i64
-  %0 = tail call i64 @llvm.vscale.i64()
-  %1 = shl nuw nsw i64 %0, 1
-  %active.lane.mask.entry = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %wide.trip.count)
-  %sub.diff = sub i64 %b12, %c13
-  %diff = sdiv i64 %sub.diff, 8
-  %neg.compare = icmp slt i64 %sub.diff, -7
-  %.splatinsert = insertelement <vscale x 2 x i1> poison, i1 %neg.compare, i64 0
-  %.splat = shufflevector <vscale x 2 x i1> %.splatinsert, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
-  %ptr.diff.lane.mask = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %diff)
-  %active.lane.mask.alias = or <vscale x 2 x i1> %ptr.diff.lane.mask, %.splat
-  %2 = and <vscale x 2 x i1> %active.lane.mask.alias, %active.lane.mask.entry
-  br label %vector.body
-
-vector.body:
-  %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ]
-  %active.lane.mask = phi <vscale x 2 x i1> [ %2, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
-  %3 = getelementptr inbounds i64, ptr %a, i64 %index
-  %wide.masked.load = tail call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr %3, i32 8, <vscale x 2 x i1> %active.lane.mask, <vscale x 2 x i64> poison)
-  %4 = getelementptr inbounds i64, ptr %b, i64 %index
-  %wide.masked.load14 = tail call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr %4, i32 8, <vscale x 2 x i1> %active.lane.mask, <vscale x 2 x i64> poison)
-  %5 = add <vscale x 2 x i64> %wide.masked.load14, %wide.masked.load
-  %6 = getelementptr inbounds i64, ptr %c, i64 %index
-  tail call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> %5, ptr %6, i32 8, <vscale x 2 x i1> %active.lane.mask)
-  %index.next = add i64 %index, %1
-  %active.lane.mask.next = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 %index.next, i64 %wide.trip.count)
-  %7 = extractelement <vscale x 2 x i1> %active.lane.mask.next, i64 0
-  br i1 %7, label %vector.body, label %for.cond.cleanup
-
-for.cond.cleanup:
-  ret void
-}
-
-define void @whilewr_loop_multiple_8(ptr %a, ptr %b, ptr %c, i32 %n) {
-; CHECK-LABEL: whilewr_loop_multiple_8:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: cmp w3, #1
-; CHECK-NEXT: b.lt .LBB10_3
-; CHECK-NEXT: // %bb.1: // %for.body.preheader
-; CHECK-NEXT: whilewr p0.b, x0, x2
-; CHECK-NEXT: mov w9, w3
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: whilewr p1.b, x1, x2
-; CHECK-NEXT: and p0.b, p0/z, p0.b, p1.b
-; CHECK-NEXT: whilelo p1.b, xzr, x9
-; CHECK-NEXT: cntp x10, p0, p0.b
-; CHECK-NEXT: and x10, x10, #0xff
-; CHECK-NEXT: .LBB10_2: // %vector.body
-; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: and p1.b, p1/z, p1.b, p0.b
-; CHECK-NEXT: ld1b { z0.b }, p1/z, [x0, x8]
-; CHECK-NEXT: ld1b { z1.b }, p1/z, [x1, x8]
-; CHECK-NEXT: add z0.b, z1.b, z0.b
-; CHECK-NEXT: st1b { z0.b }, p1, [x2, x8]
-; CHECK-NEXT: add x8, x8, x10
-; CHECK-NEXT: whilelo p1.b, x8, x9
-; CHECK-NEXT: b.mi .LBB10_2
-; CHECK-NEXT: .LBB10_3: // %for.cond.cleanup
-; CHECK-NEXT: ret
-;
-; CHECK-NOSVE2-LABEL: whilewr_loop_multiple_8:
-; CHECK-NOSVE2: // %bb.0: // %entry
-; CHECK-NOSVE2-NEXT: cmp w3, #1
-; CHECK-NOSVE2-NEXT: b.lt .LBB10_3
-; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader
-; CHECK-NOSVE2-NEXT: sub x9, x0, x2
-; CHECK-NOSVE2-NEXT: mov x8, xzr
-; CHECK-NOSVE2-NEXT: cmp x9, #0
-; CHECK-NOSVE2-NEXT: cset w10, lt
-; CHECK-NOSVE2-NEXT: whilelo p0.b, xzr, x9
-; CHECK-NOSVE2-NEXT: sub x9, x1, x2
-; CHECK-NOSVE2-NEXT: sbfx x10, x10, #0, #1
-; CHECK-NOSVE2-NEXT: whilelo p1.b, xzr, x10
-; CHECK-NOSVE2-NEXT: cmp x9, #0
-; CHECK-NOSVE2-NEXT: cset w10, lt
-; CHECK-NOSVE2-NEXT: whilelo p3.b, xzr, x9
-; CHECK-NOSVE2-NEXT: mov w9, w3
-; CHECK-NOSVE2-NEXT: sbfx x10, x10, #0, #1
-; CHECK-NOSVE2-NEXT: sel p0.b, p0, p0.b, p1.b
-; CHECK-NOSVE2-NEXT: whilelo p2.b, xzr, x10
-; CHECK-NOSVE2-NEXT: sel p1.b, p3, p3.b, p2.b
-; CHECK-NOSVE2-NEXT: and p0.b, p0/z, p0.b, p1.b
-; CHECK-NOSVE2-NEXT: whilelo p1.b, xzr, x9
-; CHECK-NOSVE2-NEXT: cntp x10, p0, p0.b
-; CHECK-NOSVE2-NEXT: and x10, x10, #0xff
-; CHECK-NOSVE2-NEXT: .LBB10_2: // %vector.body
-; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NOSVE2-NEXT: and p1.b, p1/z, p1.b, p0.b
-; CHECK-NOSVE2-NEXT: ld1b { z0.b }, p1/z, [x0, x8]
-; CHECK-NOSVE2-NEXT: ld1b { z1.b }, p1/z, [x1, x8]
-; CHECK-NOSVE2-NEXT: add z0.b, z1.b, z0.b
-; CHECK-NOSVE2-NEXT: st1b { z0.b }, p1, [x2, x8]
-; CHECK-NOSVE2-NEXT: add x8, x8, x10
-; CHECK-NOSVE2-NEXT: whilelo p1.b, x8, x9
-; CHECK-NOSVE2-NEXT: b.mi .LBB10_2
-; CHECK-NOSVE2-NEXT: .LBB10_3: // %for.cond.cleanup
-; CHECK-NOSVE2-NEXT: ret
-entry:
-  %cmp11 = icmp sgt i32 %n, 0
-  br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader:
-  %c14 = ptrtoint ptr %c to i64
-  %a15 = ptrtoint ptr %a to i64
-  %b16 = ptrtoint ptr %b to i64
-  %wide.trip.count = zext nneg i32 %n to i64
-  %sub.diff = sub i64 %a15, %c14
-  %neg.compare = icmp slt i64 %sub.diff, 0
-  %.splatinsert = insertelement <vscale x 16 x i1> poison, i1 %neg.compare, i64 0
-  %.splat = shufflevector <vscale x 16 x i1> %.splatinsert, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
-  %ptr.diff.lane.mask = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %sub.diff)
-  %active.lane.mask.alias = or <vscale x 16 x i1> %ptr.diff.lane.mask, %.splat
-  %sub.diff18 = sub i64 %b16, %c14
-  %neg.compare20 = icmp slt i64 %sub.diff18, 0
-  %.splatinsert21 = insertelement <vscale x 16 x i1> poison, i1 %neg.compare20, i64 0
-  %.splat22 = shufflevector <vscale x 16 x i1> %.splatinsert21, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
-  %ptr.diff.lane.mask23 = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %sub.diff18)
-  %active.lane.mask.alias24 = or <vscale x 16 x i1> %ptr.diff.lane.mask23, %.splat22
-  %0 = and <vscale x 16 x i1> %active.lane.mask.alias, %active.lane.mask.alias24
-  %active.lane.mask.entry = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %wide.trip.count)
-  %1 = zext <vscale x 16 x i1> %0 to <vscale x 16 x i8>
-  %2 = tail call i8 @llvm.vector.reduce.add.nxv16i8(<vscale x 16 x i8> %1)
-  %3 = zext i8 %2 to i64
-  br label %vector.body
-
-vector.body:
-  %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ]
-  %active.lane.mask = phi <vscale x 16 x i1> [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
-  %4 = and <vscale x 16 x i1> %active.lane.mask, %0
-  %5 = getelementptr inbounds i8, ptr %a, i64 %index
-  %wide.masked.load = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr %5, i32 1, <vscale x 16 x i1> %4, <vscale x 16 x i8> poison)
-  %6 = getelementptr inbounds i8, ptr %b, i64 %index
-  %wide.masked.load25 = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr %6, i32 1, <vscale x 16 x i1> %4, <vscale x 16 x i8> poison)
-  %7 = add <vscale x 16 x i8> %wide.masked.load25, %wide.masked.load
-  %8 = getelementptr inbounds i8, ptr %c, i64 %index
-  tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %7, ptr %8, i32 1, <vscale x 16 x i1> %4)
-  %index.next = add i64 %index, %3
-  %active.lane.mask.next = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 %index.next, i64 %wide.trip.count)
-  %9 = extractelement <vscale x 16 x i1> %active.lane.mask.next, i64 0
-  br i1 %9, label %vector.body, label %for.cond.cleanup
-
-for.cond.cleanup:
-  ret void
-}
-
-define void @whilewr_loop_multiple_16(ptr %a, ptr %b, ptr %c, i32 %n) {
-; CHECK-LABEL: whilewr_loop_multiple_16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: cmp w3, #1
-; CHECK-NEXT: b.lt .LBB11_3
-; CHECK-NEXT: // %bb.1: // %for.body.preheader
-; CHECK-NEXT: whilewr p0.h, x0, x2
-; CHECK-NEXT: mov w9, w3
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: whilewr p1.h, x1, x2
-; CHECK-NEXT: and p0.b, p0/z, p0.b, p1.b
-; CHECK-NEXT: whilelo p1.h, xzr, x9
-; CHECK-NEXT: cntp x10, p0, p0.h
-; CHECK-NEXT: and x10, x10, #0xff
-; CHECK-NEXT: .LBB11_2: // %vector.body
-; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: and p1.b, p1/z, p1.b, p0.b
-; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0, x8, lsl #1]
-; CHECK-NEXT: ld1h { z1.h }, p1/z, [x1, x8, lsl #1]
-; CHECK-NEXT: add z0.h, z1.h, z0.h
-; CHECK-NEXT: st1h { z0.h }, p1, [x2, x8, lsl #1]
-; CHECK-NEXT: add x8, x8, x10
-; CHECK-NEXT: whilelo p1.h, x8, x9
-; CHECK-NEXT: b.mi .LBB11_2
-; CHECK-NEXT: .LBB11_3: // %for.cond.cleanup
-; CHECK-NEXT: ret
-;
-; CHECK-NOSVE2-LABEL: whilewr_loop_multiple_16:
-; CHECK-NOSVE2: // %bb.0: // %entry
-; CHECK-NOSVE2-NEXT: cmp w3, #1
-; CHECK-NOSVE2-NEXT: b.lt .LBB11_3
-; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader
-; CHECK-NOSVE2-NEXT: sub x9, x0, x2
-; CHECK-NOSVE2-NEXT: mov x8, xzr
-; CHECK-NOSVE2-NEXT: cmn x9, #1
-; CHECK-NOSVE2-NEXT: add x9, x9, x9, lsr #63
-; CHECK-NOSVE2-NEXT: cset w10, lt
-; CHECK-NOSVE2-NEXT: sbfx x10, x10, #0, #1
-; CHECK-NOSVE2-NEXT: asr x9, x9, #1
-; CHECK-NOSVE2-NEXT: whilelo p0.h, xzr, x10
-; CHECK-NOSVE2-NEXT: sub x10, x1, x2
-; CHECK-NOSVE2-NEXT: whilelo p1.h, xzr, x9
-; CHECK-NOSVE2-NEXT: add x9, x10, x10, lsr #63
-; CHECK-NOSVE2-NEXT: cmn x10, #1
-; CHECK-NOSVE2-NEXT: cset w10, lt
-; CHECK-NOSVE2-NEXT: asr x9, x9, #1
-; CHECK-NOSVE2-NEXT: mov p0.b, p1/m, p1.b
-; CHECK-NOSVE2-NEXT: sbfx x10, x10, #0, #1
-; CHECK-NOSVE2-NEXT: whilelo p3.h, xzr, x9
-; CHECK-NOSVE2-NEXT: mov w9, w3
-; CHECK-NOSVE2-NEXT: whilelo p2.h, xzr, x10
-; CHECK-NOSVE2-NEXT: sel p1.b, p3, p3.b, p2.b
-; CHECK-NOSVE2-NEXT: and p0.b, p0/z, p0.b, p1.b
-; CHECK-NOSVE2-NEXT: whilelo p1.h, xzr, x9
-; CHECK-NOSVE2-NEXT: cntp x10, p0, p0.h
-; CHECK-NOSVE2-NEXT: and x10, x10, #0xff
-; CHECK-NOSVE2-NEXT: .LBB11_2: // %vector.body
-; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NOSVE2-NEXT: and p1.b, p1/z, p1.b, p0.b
-; CHECK-NOSVE2-NEXT: ld1h { z0.h }, p1/z, [x0, x8, lsl #1]
-; CHECK-NOSVE2-NEXT: ld1h { z1.h }, p1/z, [x1, x8, lsl #1]
-; CHECK-NOSVE2-NEXT: add z0.h, z1.h, z0.h
-; CHECK-NOSVE2-NEXT: st1h { z0.h }, p1, [x2, x8, lsl #1]
-; CHECK-NOSVE2-NEXT: add x8, x8, x10
-; CHECK-NOSVE2-NEXT: whilelo p1.h, x8, x9
-; CHECK-NOSVE2-NEXT: b.mi .LBB11_2
-; CHECK-NOSVE2-NEXT: .LBB11_3: // %for.cond.cleanup
-; CHECK-NOSVE2-NEXT: ret
-entry:
-  %cmp11 = icmp sgt i32 %n, 0
-  br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader:
-  %c14 = ptrtoint ptr %c to i64
-  %a15 = ptrtoint ptr %a to i64
-  %b16 = ptrtoint ptr %b to i64
-  %wide.trip.count = zext nneg i32 %n to i64
-  %sub.diff = sub i64 %a15, %c14
-  %diff = sdiv i64 %sub.diff, 2
-  %neg.compare = icmp slt i64 %sub.diff, -1
-  %.splatinsert = insertelement <vscale x 8 x i1> poison, i1 %neg.compare, i64 0
-  %.splat = shufflevector <vscale x 8 x i1> %.splatinsert, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
-  %ptr.diff.lane.mask = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %diff)
-  %active.lane.mask.alias = or <vscale x 8 x i1> %ptr.diff.lane.mask, %.splat
-  %sub.diff18 = sub i64 %b16, %c14
-  %diff19 = sdiv i64 %sub.diff18, 2
-  %neg.compare20 = icmp slt i64 %sub.diff18, -1
-  %.splatinsert21 = insertelement <vscale x 8 x i1> poison, i1 %neg.compare20, i64 0
-  %.splat22 = shufflevector <vscale x 8 x i1> %.splatinsert21, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
-  %ptr.diff.lane.mask23 = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %diff19)
-  %active.lane.mask.alias24 = or <vscale x 8 x i1> %ptr.diff.lane.mask23, %.splat22
-  %0 = and <vscale x 8 x i1> %active.lane.mask.alias, %active.lane.mask.alias24
-  %active.lane.mask.entry = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %wide.trip.count)
-  %1 = zext <vscale x 8 x i1> %0 to <vscale x 8 x i8>
-  %2 = tail call i8 @llvm.vector.reduce.add.nxv8i8(<vscale x 8 x i8> %1)
-  %3 = zext i8 %2 to i64
-  br label %vector.body
-
-vector.body:
-  %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ]
-  %active.lane.mask = phi <vscale x 8 x i1> [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
-  %4 = and <vscale x 8 x i1> %active.lane.mask, %0
-  %5 = getelementptr inbounds i16, ptr %a, i64 %index
-  %wide.masked.load = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr %5, i32 2, <vscale x 8 x i1> %4, <vscale x 8 x i16> poison)
-  %6 = getelementptr inbounds i16, ptr %b, i64 %index
-  %wide.masked.load25 = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr %6, i32 2, <vscale x 8 x i1> %4, <vscale x 8 x i16> poison)
-  %7 = add <vscale x 8 x i16> %wide.masked.load25, %wide.masked.load
-  %8 = getelementptr inbounds i16, ptr %c, i64 %index
-  tail call void @llvm.masked.store.nxv8i16.p0(<vscale x 8 x i16> %7, ptr %8, i32 2, <vscale x 8 x i1> %4)
-  %index.next = add i64 %index, %3
-  %active.lane.mask.next = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 %index.next, i64 %wide.trip.count)
-  %9 = extractelement <vscale x 8 x i1> %active.lane.mask.next, i64 0
-  br i1 %9, label %vector.body, label %for.cond.cleanup
-
-for.cond.cleanup:
-  ret void
-}
-
-define void @whilewr_loop_multiple_32(ptr %a, ptr %b, ptr %c, i32 %n) {
-; CHECK-LABEL: whilewr_loop_multiple_32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: cmp w3, #1
-; CHECK-NEXT: b.lt .LBB12_3
-; CHECK-NEXT: // %bb.1: // %for.body.preheader
-; CHECK-NEXT: whilewr p0.s, x0, x2
-; CHECK-NEXT: mov w9, w3
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: whilewr p1.s, x1, x2
-; CHECK-NEXT: and p0.b, p0/z, p0.b, p1.b
-; CHECK-NEXT: whilelo p1.s, xzr, x9
-; CHECK-NEXT: cntp x10, p0, p0.s
-; CHECK-NEXT: and x10, x10, #0xff
-; CHECK-NEXT: .LBB12_2: // %vector.body
-; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: and p1.b, p1/z, p1.b, p0.b
-; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0, x8, lsl #2]
-; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1, x8, lsl #2]
-; CHECK-NEXT: add z0.s, z1.s, z0.s
-; CHECK-NEXT: st1w { z0.s }, p1, [x2, x8, lsl #2]
-; CHECK-NEXT: add x8, x8, x10
-; CHECK-NEXT: whilelo p1.s, x8, x9
-; CHECK-NEXT: b.mi .LBB12_2
-; CHECK-NEXT: .LBB12_3: // %for.cond.cleanup
-; CHECK-NEXT: ret
-;
-; CHECK-NOSVE2-LABEL: whilewr_loop_multiple_32:
-; CHECK-NOSVE2: // %bb.0: // %entry
-; CHECK-NOSVE2-NEXT: cmp w3, #1
-; CHECK-NOSVE2-NEXT: b.lt .LBB12_3
-; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader
-; CHECK-NOSVE2-NEXT: sub x9, x0, x2
-; CHECK-NOSVE2-NEXT: mov x8, xzr
-; CHECK-NOSVE2-NEXT: add x10, x9, #3
-; CHECK-NOSVE2-NEXT: cmp x9, #0
-; CHECK-NOSVE2-NEXT: csel x10, x10, x9, lt
-; CHECK-NOSVE2-NEXT: cmn x9, #3
-; CHECK-NOSVE2-NEXT: asr x9, x10, #2
-; CHECK-NOSVE2-NEXT: cset w10, lt
-; CHECK-NOSVE2-NEXT: sbfx x10, x10, #0, #1
-; CHECK-NOSVE2-NEXT: whilelo p0.s, xzr, x9
-; CHECK-NOSVE2-NEXT: sub x9, x1, x2
-; CHECK-NOSVE2-NEXT: whilelo p1.s, xzr, x10
-; CHECK-NOSVE2-NEXT: add x10, x9, #3
-; CHECK-NOSVE2-NEXT: cmp x9, #0
-; CHECK-NOSVE2-NEXT: csel x10, x10, x9, lt
-; CHECK-NOSVE2-NEXT: cmn x9, #3
-; CHECK-NOSVE2-NEXT: sel p0.b, p0, p0.b, p1.b
-; CHECK-NOSVE2-NEXT: cset w9, lt
-; CHECK-NOSVE2-NEXT: asr x10, x10, #2
-; CHECK-NOSVE2-NEXT: sbfx x9, x9, #0, #1
-; CHECK-NOSVE2-NEXT: whilelo p3.s, xzr, x10
-; CHECK-NOSVE2-NEXT: whilelo p2.s, xzr, x9
-; CHECK-NOSVE2-NEXT: mov w9, w3
-; CHECK-NOSVE2-NEXT: sel p1.b, p3, p3.b, p2.b
-; CHECK-NOSVE2-NEXT: and p0.b, p0/z, p0.b, p1.b
-; CHECK-NOSVE2-NEXT: whilelo p1.s, xzr, x9
-; CHECK-NOSVE2-NEXT: cntp x10, p0, p0.s
-; CHECK-NOSVE2-NEXT: and x10, x10, #0xff
-; CHECK-NOSVE2-NEXT: .LBB12_2: // %vector.body
-; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NOSVE2-NEXT: and p1.b, p1/z, p1.b, p0.b
-; CHECK-NOSVE2-NEXT: ld1w { z0.s }, p1/z, [x0, x8, lsl #2]
-; CHECK-NOSVE2-NEXT: ld1w { z1.s }, p1/z, [x1, x8, lsl #2]
-; CHECK-NOSVE2-NEXT: add z0.s, z1.s, z0.s
-; CHECK-NOSVE2-NEXT: st1w { z0.s }, p1, [x2, x8, lsl #2]
-; CHECK-NOSVE2-NEXT: add x8, x8, x10
-; CHECK-NOSVE2-NEXT: whilelo p1.s, x8, x9
-; CHECK-NOSVE2-NEXT: b.mi .LBB12_2
-; CHECK-NOSVE2-NEXT: .LBB12_3: // %for.cond.cleanup
-; CHECK-NOSVE2-NEXT: ret
-entry:
-  %cmp9 = icmp sgt i32 %n, 0
-  br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader:
-  %c12 = ptrtoint ptr %c to i64
-  %a13 = ptrtoint ptr %a to i64
-  %b14 = ptrtoint ptr %b to i64
-  %wide.trip.count = zext nneg i32 %n to i64
-  %sub.diff = sub i64 %a13, %c12
-  %diff = sdiv i64 %sub.diff, 4
-  %neg.compare = icmp slt i64 %sub.diff, -3
-  %.splatinsert = insertelement <vscale x 4 x i1> poison, i1 %neg.compare, i64 0
-  %.splat = shufflevector <vscale x 4 x i1> %.splatinsert, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
-  %ptr.diff.lane.mask = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %diff)
-  %active.lane.mask.alias = or <vscale x 4 x i1> %ptr.diff.lane.mask, %.splat
-  %sub.diff16 = sub i64 %b14, %c12
-  %diff17 = sdiv i64 %sub.diff16, 4
-  %neg.compare18 = icmp slt i64 %sub.diff16, -3
-  %.splatinsert19 = insertelement <vscale x 4 x i1> poison, i1 %neg.compare18, i64 0
-  %.splat20 = shufflevector <vscale x 4 x i1> %.splatinsert19, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
-  %ptr.diff.lane.mask21 = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %diff17)
-  %active.lane.mask.alias22 = or <vscale x 4 x i1> %ptr.diff.lane.mask21, %.splat20
-  %0 = and <vscale x 4 x i1> %active.lane.mask.alias, %active.lane.mask.alias22
-  %active.lane.mask.entry = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %wide.trip.count)
-  %1 = zext <vscale x 4 x i1> %0 to <vscale x 4 x i8>
-  %2 = tail call i8 @llvm.vector.reduce.add.nxv4i8(<vscale x 4 x i8> %1)
-  %3 = zext i8 %2 to i64
-  br label %vector.body
-
-vector.body:
-  %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ]
-  %active.lane.mask = phi <vscale x 4 x i1> [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
-  %4 = and <vscale x 4 x i1> %active.lane.mask, %0
-  %5 = getelementptr inbounds i32, ptr %a, i64 %index
-  %wide.masked.load = tail call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr %5, i32 4, <vscale x 4 x i1> %4, <vscale x 4 x i32> poison)
-  %6 = getelementptr inbounds i32, ptr %b, i64 %index
-  %wide.masked.load23 = tail call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr %6, i32 4, <vscale x 4 x i1> %4, <vscale x 4 x i32> poison)
-  %7 = add <vscale x 4 x i32> %wide.masked.load23, %wide.masked.load
-  %8 = getelementptr inbounds i32, ptr %c, i64 %index
-  tail call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> %7, ptr %8, i32 4, <vscale x 4 x i1> %4)
-  %index.next = add i64 %index, %3
-  %active.lane.mask.next = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 %index.next, i64 %wide.trip.count)
-  %9 = extractelement <vscale x 4 x i1> %active.lane.mask.next, i64 0
-  br i1 %9, label %vector.body, label %for.cond.cleanup
-
-for.cond.cleanup:
-  ret void
-}
-
-define void @whilewr_loop_multiple_64(ptr %a, ptr %b, ptr %c, i32 %n) {
-; CHECK-LABEL: whilewr_loop_multiple_64:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: cmp w3, #1
-; CHECK-NEXT: b.lt .LBB13_3
-; CHECK-NEXT: // %bb.1: // %for.body.preheader
-; CHECK-NEXT: whilewr p0.d, x0, x2
-; CHECK-NEXT: mov w9, w3
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: whilewr p1.d, x1, x2
-; CHECK-NEXT: and p0.b, p0/z, p0.b, p1.b
-; CHECK-NEXT: whilelo p1.d, xzr, x9
-; CHECK-NEXT: cntp x10, p0, p0.d
-; CHECK-NEXT: and x10, x10, #0xff
-; CHECK-NEXT: .LBB13_2: // %vector.body
-; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: and p1.b, p1/z, p1.b, p0.b
-; CHECK-NEXT: ld1d { z0.d }, p1/z, [x0, x8, lsl #3]
-; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1, x8, lsl #3]
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: st1d { z0.d }, p1, [x2, x8, lsl #3]
-; CHECK-NEXT: add x8, x8, x10
-; CHECK-NEXT: whilelo p1.d, x8, x9
-; CHECK-NEXT: b.mi .LBB13_2
-; CHECK-NEXT: .LBB13_3: // %for.cond.cleanup
-; CHECK-NEXT: ret
-;
-; CHECK-NOSVE2-LABEL: whilewr_loop_multiple_64:
-; CHECK-NOSVE2: // %bb.0: // %entry
-; CHECK-NOSVE2-NEXT: cmp w3, #1
-; CHECK-NOSVE2-NEXT: b.lt .LBB13_3
-; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader
-; CHECK-NOSVE2-NEXT: sub x9, x0, x2
-; CHECK-NOSVE2-NEXT: mov x8, xzr
-; CHECK-NOSVE2-NEXT: add x10, x9, #7
-; CHECK-NOSVE2-NEXT: cmp x9, #0
-; CHECK-NOSVE2-NEXT: csel x10, x10, x9, lt
-; CHECK-NOSVE2-NEXT: cmn x9, #7
-; CHECK-NOSVE2-NEXT: asr x9, x10, #3
-; CHECK-NOSVE2-NEXT: cset w10, lt
-; CHECK-NOSVE2-NEXT: sbfx x10, x10, #0, #1
-; CHECK-NOSVE2-NEXT: whilelo p0.d, xzr, x9
-; CHECK-NOSVE2-NEXT: sub x9, x1, x2
-; CHECK-NOSVE2-NEXT: whilelo p1.d, xzr, x10
-; CHECK-NOSVE2-NEXT: add x10, x9, #7
-; CHECK-NOSVE2-NEXT: cmp x9, #0
-; CHECK-NOSVE2-NEXT: csel x10, x10, x9, lt
-; CHECK-NOSVE2-NEXT: cmn x9, #7
-; CHECK-NOSVE2-NEXT: sel p0.b, p0, p0.b, p1.b
-; CHECK-NOSVE2-NEXT: cset w9, lt
-; CHECK-NOSVE2-NEXT: asr x10, x10, #3
-; CHECK-NOSVE2-NEXT: sbfx x9, x9, #0, #1
-; CHECK-NOSVE2-NEXT: whilelo p3.d, xzr, x10
-; CHECK-NOSVE2-NEXT: whilelo p2.d, xzr, x9
-; CHECK-NOSVE2-NEXT: mov w9, w3
-; CHECK-NOSVE2-NEXT: sel p1.b, p3, p3.b, p2.b
-; CHECK-NOSVE2-NEXT: and p0.b, p0/z, p0.b, p1.b
-; CHECK-NOSVE2-NEXT: whilelo p1.d, xzr, x9
-; CHECK-NOSVE2-NEXT: cntp x10, p0, p0.d
-; CHECK-NOSVE2-NEXT: and x10, x10, #0xff
-; CHECK-NOSVE2-NEXT: .LBB13_2: // %vector.body
-; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NOSVE2-NEXT: and p1.b, p1/z, p1.b, p0.b
-; CHECK-NOSVE2-NEXT: ld1d { z0.d }, p1/z, [x0, x8, lsl #3]
-; CHECK-NOSVE2-NEXT: ld1d { z1.d }, p1/z, [x1, x8, lsl #3]
-; CHECK-NOSVE2-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NOSVE2-NEXT: st1d { z0.d }, p1, [x2, x8, lsl #3]
-; CHECK-NOSVE2-NEXT: add x8, x8, x10
-; CHECK-NOSVE2-NEXT: whilelo p1.d, x8, x9
-; CHECK-NOSVE2-NEXT: b.mi .LBB13_2
-; CHECK-NOSVE2-NEXT: .LBB13_3: // %for.cond.cleanup
-; CHECK-NOSVE2-NEXT: ret
-entry:
-  %cmp9 = icmp sgt i32 %n, 0
-  br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader:
-  %c12 = ptrtoint ptr %c to i64
-  %a13 = ptrtoint ptr %a to i64
-  %b14 = ptrtoint ptr %b to i64
-  %wide.trip.count = zext nneg i32 %n to i64
-  %sub.diff = sub i64 %a13, %c12
-  %diff = sdiv i64 %sub.diff, 8
-  %neg.compare = icmp slt i64 %sub.diff, -7
-  %.splatinsert = insertelement <vscale x 2 x i1> poison, i1 %neg.compare, i64 0
-  %.splat = shufflevector <vscale x 2 x i1> %.splatinsert, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
-  %ptr.diff.lane.mask = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %diff)
-  %active.lane.mask.alias = or <vscale x 2 x i1> %ptr.diff.lane.mask, %.splat
-  %sub.diff16 = sub i64 %b14, %c12
-  %diff17 = sdiv i64 %sub.diff16, 8
-  %neg.compare18 = icmp slt i64 %sub.diff16, -7
-  %.splatinsert19 = insertelement <vscale x 2 x i1> poison, i1 %neg.compare18, i64 0
-  %.splat20 = shufflevector <vscale x 2 x i1> %.splatinsert19, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
-  %ptr.diff.lane.mask21 = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %diff17)
-  %active.lane.mask.alias22 = or <vscale x 2 x i1> %ptr.diff.lane.mask21, %.splat20
-  %0 = and <vscale x 2 x i1> %active.lane.mask.alias, %active.lane.mask.alias22
-  %active.lane.mask.entry = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %wide.trip.count)
-  %1 = zext <vscale x 2 x i1> %0 to <vscale x 2 x i8>
-  %2 = tail call i8 @llvm.vector.reduce.add.nxv2i8(<vscale x 2 x i8> %1)
-  %3 = zext i8 %2 to i64
-  br label %vector.body
-
-vector.body:
-  %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ]
-  %active.lane.mask = phi <vscale x 2 x i1> [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
-  %4 = and <vscale x 2 x i1> %active.lane.mask, %0
-  %5 = getelementptr inbounds i64, ptr %a, i64 %index
-  %wide.masked.load = tail call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr %5, i32 8, <vscale x 2 x i1> %4, <vscale x 2 x i64> poison)
-  %6 = getelementptr inbounds i64, ptr %b, i64 %index
-  %wide.masked.load23 = tail call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr %6, i32 8, <vscale x 2 x i1> %4, <vscale x 2 x i64> poison)
-  %7 = add <vscale x 2 x i64> %wide.masked.load23, %wide.masked.load
-  %8 = getelementptr inbounds i64, ptr %c, i64 %index
-  tail call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> %7, ptr %8, i32 8, <vscale x 2 x i1> %4)
-  %index.next = add i64 %index, %3
-  %active.lane.mask.next = tail call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 %index.next, i64 %wide.trip.count)
-  %9 = extractelement <vscale x 2 x i1> %active.lane.mask.next, i64 0
-  br i1 %9, label %vector.body, label %for.cond.cleanup
-
-for.cond.cleanup:
-  ret void
-}
-
-declare i64 @llvm.vscale.i64()
-
-declare <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64, i64)
-
-declare <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr nocapture, i32 immarg, <vscale x 16 x i1>, <vscale x 16 x i8>)
-
-declare void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8>, ptr nocapture, i32 immarg, <vscale x 16 x i1>)
-
-declare <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64, i64)
-
-declare <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr nocapture, i32 immarg, <vscale x 8 x i1>, <vscale x 8 x i16>)
-
-declare void @llvm.masked.store.nxv8i16.p0(<vscale x 8 x i16>, ptr nocapture, i32 immarg, <vscale x 8 x i1>)
-
-declare <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64, i64)
-
-declare <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr nocapture, i32 immarg, <vscale x 4 x i1>, <vscale x 4 x i32>)
-
-declare void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32>, ptr nocapture, i32 immarg, <vscale x 4 x i1>)
-
-declare <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64, i64)
-
-declare <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr nocapture, i32 immarg, <vscale x 2 x i1>, <vscale x 2 x i64>)
-
-declare void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64>, ptr nocapture, i32 immarg, <vscale x 2 x i1>)
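
For readers of the revert, the pattern that tryWhileWRFromOR recognized is easiest to see in isolation. Below is a minimal standalone sketch of the i32 case, adapted from the deleted whilewr_32 test above; the function and value names are illustrative, not part of the original patch. The splat covers the case where the pointer difference is less than -(element_size - 1), meaning no element can overlap within one iteration, while the active lane mask enables the first diff/4 lanes otherwise; the OR of the two is the alias mask, and with the reverted combine and -mattr=+sve2 this lowered to a single whilewr p0.s.

define <vscale x 4 x i1> @alias_mask_i32(ptr %b, ptr %c) {
entry:
  ; Signed distance in bytes between the two pointers.
  %b.int = ptrtoint ptr %b to i64
  %c.int = ptrtoint ptr %c to i64
  %sub.diff = sub i64 %b.int, %c.int
  ; Number of non-overlapping i32 elements: the byte difference divided
  ; by the element size.
  %diff = sdiv i64 %sub.diff, 4
  ; Every lane is safe when the difference is below -(element_size - 1).
  %neg.compare = icmp slt i64 %sub.diff, -3
  %splatinsert = insertelement <vscale x 4 x i1> poison, i1 %neg.compare, i64 0
  %splat = shufflevector <vscale x 4 x i1> %splatinsert, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
  ; Otherwise only the first diff/4 lanes are safe.
  %lane.mask = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %diff)
  %alias.mask = or <vscale x 4 x i1> %lane.mask, %splat
  ret <vscale x 4 x i1> %alias.mask
}

declare <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64, i64)

The SELECT_CC/ADD/SRA chain the combine had to walk for i32 and i64 comes from how the signed division is expanded: sdiv rounds toward zero while an arithmetic shift rounds toward negative infinity (-5 sdiv 4 is -1, but -5 asr 2 is -2), so the DAG computes the quotient as (diff < 0 ? diff + 3 : diff) asr 2, which is exactly the shape matched above.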