This revision was automatically updated to reflect the committed changes.
kmclaughlin marked an inline comment as done.
Closed by commit rGf87f23c81cae: [AArch64][SVE] Add the SVE dupq_lane intrinsic (authored by kmclaughlin).
Changed prior to commit:
  https://reviews.llvm.org/D74734?vs=245012&id=246194#toc

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D74734/new/

https://reviews.llvm.org/D74734

Files:
  llvm/include/llvm/IR/IntrinsicsAArch64.td
  llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
  llvm/lib/Target/AArch64/AArch64ISelLowering.h
  llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll
Index: llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll
===================================================================
--- llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll
+++ llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll
@@ -297,6 +297,179 @@
 }
 
 ;
+; DUPQ
+;
+
+define <vscale x 16 x i8> @dupq_i8(<vscale x 16 x i8> %a) {
+; CHECK-LABEL: dupq_i8:
+; CHECK: mov z0.q, q0
+; CHECK-NEXT: ret
+  %out = call <vscale x 16 x i8> @llvm.aarch64.sve.dupq.lane.nxv16i8(<vscale x 16 x i8> %a, i64 0)
+  ret <vscale x 16 x i8> %out
+}
+
+define <vscale x 8 x i16> @dupq_i16(<vscale x 8 x i16> %a) {
+; CHECK-LABEL: dupq_i16:
+; CHECK: mov z0.q, z0.q[1]
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x i16> @llvm.aarch64.sve.dupq.lane.nxv8i16(<vscale x 8 x i16> %a, i64 1)
+  ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @dupq_i32(<vscale x 4 x i32> %a) {
+; CHECK-LABEL: dupq_i32:
+; CHECK: mov z0.q, z0.q[2]
+; CHECK-NEXT: ret
+  %out = call <vscale x 4 x i32> @llvm.aarch64.sve.dupq.lane.nxv4i32(<vscale x 4 x i32> %a, i64 2)
+  ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 2 x i64> @dupq_i64(<vscale x 2 x i64> %a) {
+; CHECK-LABEL: dupq_i64:
+; CHECK: mov z0.q, z0.q[3]
+; CHECK-NEXT: ret
+  %out = call <vscale x 2 x i64> @llvm.aarch64.sve.dupq.lane.nxv2i64(<vscale x 2 x i64> %a, i64 3)
+  ret <vscale x 2 x i64> %out
+}
+
+define <vscale x 8 x half> @dupq_f16(<vscale x 8 x half> %a) {
+; CHECK-LABEL: dupq_f16:
+; CHECK: mov z0.q, q0
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> %a, i64 0)
+  ret <vscale x 8 x half> %out
+}
+
+define <vscale x 4 x float> @dupq_f32(<vscale x 4 x float> %a) {
+; CHECK-LABEL: dupq_f32:
+; CHECK: mov z0.q, z0.q[1]
+; CHECK-NEXT: ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.dupq.lane.nxv4f32(<vscale x 4 x float> %a, i64 1)
+  ret <vscale x 4 x float> %out
+}
+
+define <vscale x 2 x double> @dupq_f64(<vscale x 2 x double> %a) {
+; CHECK-LABEL: dupq_f64:
+; CHECK: mov z0.q, z0.q[2]
+; CHECK-NEXT: ret
+  %out = call <vscale x 2 x double> @llvm.aarch64.sve.dupq.lane.nxv2f64(<vscale x 2 x double> %a, i64 2)
+  ret <vscale x 2 x double> %out
+}
+
+;
+; DUPQ_LANE
+;
+
+define <vscale x 16 x i8> @dupq_lane_i8(<vscale x 16 x i8> %a, i64 %idx) {
+; CHECK-LABEL: dupq_lane_i8:
+; CHECK-DAG: index [[Z1:z[0-9]+]].d, #0, #1
+; CHECK-DAG: and [[Z2:z[0-9]+]].d, [[Z1]].d, #0x1
+; CHECK-DAG: add [[X1:x[0-9]+]], x0, x0
+; CHECK-DAG: mov [[Z3:z[0-9]+]].d, [[X1]]
+; CHECK: add [[Z4:z[0-9]+]].d, [[Z2]].d, [[Z3]].d
+; CHECK-NEXT: tbl z0.d, { z0.d }, [[Z4]].d
+; CHECK-NEXT: ret
+  %out = call <vscale x 16 x i8> @llvm.aarch64.sve.dupq.lane.nxv16i8(<vscale x 16 x i8> %a, i64 %idx)
+  ret <vscale x 16 x i8> %out
+}
+
+; NOTE: Identical operation to dupq_lane_i8 (i.e. element type is irrelevant).
+define <vscale x 8 x i16> @dupq_lane_i16(<vscale x 8 x i16> %a, i64 %idx) {
+; CHECK-LABEL: dupq_lane_i16:
+; CHECK-DAG: index [[Z1:z[0-9]+]].d, #0, #1
+; CHECK-DAG: and [[Z2:z[0-9]+]].d, [[Z1]].d, #0x1
+; CHECK-DAG: add [[X1:x[0-9]+]], x0, x0
+; CHECK-DAG: mov [[Z3:z[0-9]+]].d, [[X1]]
+; CHECK: add [[Z4:z[0-9]+]].d, [[Z2]].d, [[Z3]].d
+; CHECK: tbl z0.d, { z0.d }, [[Z4]].d
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x i16> @llvm.aarch64.sve.dupq.lane.nxv8i16(<vscale x 8 x i16> %a, i64 %idx)
+  ret <vscale x 8 x i16> %out
+}
+
+; NOTE: Identical operation to dupq_lane_i8 (i.e. element type is irrelevant).
+define <vscale x 4 x i32> @dupq_lane_i32(<vscale x 4 x i32> %a, i64 %idx) {
+; CHECK-LABEL: dupq_lane_i32:
+; CHECK-DAG: index [[Z1:z[0-9]+]].d, #0, #1
+; CHECK-DAG: and [[Z2:z[0-9]+]].d, [[Z1]].d, #0x1
+; CHECK-DAG: add [[X1:x[0-9]+]], x0, x0
+; CHECK-DAG: mov [[Z3:z[0-9]+]].d, [[X1]]
+; CHECK: add [[Z4:z[0-9]+]].d, [[Z2]].d, [[Z3]].d
+; CHECK: tbl z0.d, { z0.d }, [[Z4]].d
+; CHECK-NEXT: ret
+  %out = call <vscale x 4 x i32> @llvm.aarch64.sve.dupq.lane.nxv4i32(<vscale x 4 x i32> %a, i64 %idx)
+  ret <vscale x 4 x i32> %out
+}
+
+; NOTE: Identical operation to dupq_lane_i8 (i.e. element type is irrelevant).
+define <vscale x 2 x i64> @dupq_lane_i64(<vscale x 2 x i64> %a, i64 %idx) {
+; CHECK-LABEL: dupq_lane_i64:
+; CHECK-DAG: index [[Z1:z[0-9]+]].d, #0, #1
+; CHECK-DAG: and [[Z2:z[0-9]+]].d, [[Z1]].d, #0x1
+; CHECK-DAG: add [[X1:x[0-9]+]], x0, x0
+; CHECK-DAG: mov [[Z3:z[0-9]+]].d, [[X1]]
+; CHECK: add [[Z4:z[0-9]+]].d, [[Z2]].d, [[Z3]].d
+; CHECK: tbl z0.d, { z0.d }, [[Z4]].d
+; CHECK-NEXT: ret
+  %out = call <vscale x 2 x i64> @llvm.aarch64.sve.dupq.lane.nxv2i64(<vscale x 2 x i64> %a, i64 %idx)
+  ret <vscale x 2 x i64> %out
+}
+
+; NOTE: Identical operation to dupq_lane_i8 (i.e. element type is irrelevant).
+define <vscale x 8 x half> @dupq_lane_f16(<vscale x 8 x half> %a, i64 %idx) {
+; CHECK-LABEL: dupq_lane_f16:
+; CHECK-DAG: index [[Z1:z[0-9]+]].d, #0, #1
+; CHECK-DAG: and [[Z2:z[0-9]+]].d, [[Z1]].d, #0x1
+; CHECK-DAG: add [[X1:x[0-9]+]], x0, x0
+; CHECK-DAG: mov [[Z3:z[0-9]+]].d, [[X1]]
+; CHECK: add [[Z4:z[0-9]+]].d, [[Z2]].d, [[Z3]].d
+; CHECK: tbl z0.d, { z0.d }, [[Z4]].d
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> %a, i64 %idx)
+  ret <vscale x 8 x half> %out
+}
+
+; NOTE: Identical operation to dupq_lane_i8 (i.e. element type is irrelevant).
+define <vscale x 4 x float> @dupq_lane_f32(<vscale x 4 x float> %a, i64 %idx) {
+; CHECK-LABEL: dupq_lane_f32:
+; CHECK-DAG: index [[Z1:z[0-9]+]].d, #0, #1
+; CHECK-DAG: and [[Z2:z[0-9]+]].d, [[Z1]].d, #0x1
+; CHECK-DAG: add [[X1:x[0-9]+]], x0, x0
+; CHECK-DAG: mov [[Z3:z[0-9]+]].d, [[X1]]
+; CHECK: add [[Z4:z[0-9]+]].d, [[Z2]].d, [[Z3]].d
+; CHECK: tbl z0.d, { z0.d }, [[Z4]].d
+; CHECK-NEXT: ret
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.dupq.lane.nxv4f32(<vscale x 4 x float> %a, i64 %idx)
+  ret <vscale x 4 x float> %out
+}
+
+; NOTE: Identical operation to dupq_lane_i8 (i.e. element type is irrelevant).
+define <vscale x 2 x double> @dupq_lane_f64(<vscale x 2 x double> %a, i64 %idx) {
+; CHECK-LABEL: dupq_lane_f64:
+; CHECK-DAG: index [[Z1:z[0-9]+]].d, #0, #1
+; CHECK-DAG: and [[Z2:z[0-9]+]].d, [[Z1]].d, #0x1
+; CHECK-DAG: add [[X1:x[0-9]+]], x0, x0
+; CHECK-DAG: mov [[Z3:z[0-9]+]].d, [[X1]]
+; CHECK: add [[Z4:z[0-9]+]].d, [[Z2]].d, [[Z3]].d
+; CHECK: tbl z0.d, { z0.d }, [[Z4]].d
+; CHECK-NEXT: ret
+  %out = call <vscale x 2 x double> @llvm.aarch64.sve.dupq.lane.nxv2f64(<vscale x 2 x double> %a, i64 %idx)
+  ret <vscale x 2 x double> %out
+}
+
+; NOTE: Index out of range (0-3)
+define <vscale x 2 x i64> @dupq_i64_range(<vscale x 2 x i64> %a) {
+; CHECK-LABEL: dupq_i64_range:
+; CHECK-DAG: index [[Z1:z[0-9]+]].d, #0, #1
+; CHECK-DAG: and [[Z2:z[0-9]+]].d, [[Z1]].d, #0x1
+; CHECK-DAG: add [[Z3:z[0-9]+]].d, [[Z2]].d, #8
+; CHECK: tbl z0.d, { z0.d }, [[Z3]].d
+; CHECK-NEXT: ret
+  %out = call <vscale x 2 x i64> @llvm.aarch64.sve.dupq.lane.nxv2i64(<vscale x 2 x i64> %a, i64 4)
+  ret <vscale x 2 x i64> %out
+}
+
+;
 ; EXT
 ;
 
@@ -1616,6 +1789,14 @@
 declare <vscale x 4 x float> @llvm.aarch64.sve.compact.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>)
 declare <vscale x 2 x double> @llvm.aarch64.sve.compact.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>)
 
+declare <vscale x 16 x i8> @llvm.aarch64.sve.dupq.lane.nxv16i8(<vscale x 16 x i8>, i64)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.dupq.lane.nxv8i16(<vscale x 8 x i16>, i64)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.dupq.lane.nxv4i32(<vscale x 4 x i32>, i64)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.dupq.lane.nxv2i64(<vscale x 2 x i64>, i64)
+declare <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half>, i64)
+declare <vscale x 4 x float> @llvm.aarch64.sve.dupq.lane.nxv4f32(<vscale x 4 x float>, i64)
+declare <vscale x 2 x double> @llvm.aarch64.sve.dupq.lane.nxv2f64(<vscale x 2 x double>, i64)
+
 declare <vscale x 16 x i8> @llvm.aarch64.sve.ext.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, i32)
 declare <vscale x 8 x i16> @llvm.aarch64.sve.ext.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, i32)
 declare <vscale x 4 x i32> @llvm.aarch64.sve.ext.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, i32)
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -745,6 +745,7 @@
   SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSPLAT_VECTOR(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerDUPQLane(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerVectorSRA_SRL_SHL(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const;
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -3036,6 +3036,8 @@
   case Intrinsic::aarch64_sve_ptrue:
     return DAG.getNode(AArch64ISD::PTRUE, dl, Op.getValueType(),
                        Op.getOperand(1));
+  case Intrinsic::aarch64_sve_dupq_lane:
+    return LowerDUPQLane(Op, DAG);
 
   case Intrinsic::aarch64_sve_insr: {
     SDValue Scalar = Op.getOperand(2);
@@ -7512,6 +7514,54 @@
   return DAG.getNode(AArch64ISD::DUP, dl, VT, SplatVal);
 }
 
+SDValue AArch64TargetLowering::LowerDUPQLane(SDValue Op,
+                                             SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+
+  EVT VT = Op.getValueType();
+  if (!isTypeLegal(VT) || !VT.isScalableVector())
+    return SDValue();
+
+  // Current lowering only supports the SVE-ACLE types.
+  if (VT.getSizeInBits().getKnownMinSize() != AArch64::SVEBitsPerBlock)
+    return SDValue();
+
+  // The DUPQ operation is independent of element type so normalise to i64s.
+  SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::nxv2i64, Op.getOperand(1));
+  SDValue Idx128 = Op.getOperand(2);
+
+  // DUPQ can be used when idx is in range.
+  auto *CIdx = dyn_cast<ConstantSDNode>(Idx128);
+  if (CIdx && (CIdx->getZExtValue() <= 3)) {
+    SDValue CI = DAG.getTargetConstant(CIdx->getZExtValue(), DL, MVT::i64);
+    SDNode *DUPQ =
+        DAG.getMachineNode(AArch64::DUP_ZZI_Q, DL, MVT::nxv2i64, V, CI);
+    return DAG.getNode(ISD::BITCAST, DL, VT, SDValue(DUPQ, 0));
+  }
+
+  // The ACLE says this must produce the same result as:
+  //   svtbl(data, svadd_x(svptrue_b64(),
+  //                       svand_x(svptrue_b64(), svindex_u64(0, 1), 1),
+  //                       index * 2))
+  SDValue One = DAG.getConstant(1, DL, MVT::i64);
+  SDValue SplatOne = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, One);
+
+  // create the vector 0,1,0,1,...
+  SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
+  SDValue SV = DAG.getNode(AArch64ISD::INDEX_VECTOR,
+                           DL, MVT::nxv2i64, Zero, One);
+  SV = DAG.getNode(ISD::AND, DL, MVT::nxv2i64, SV, SplatOne);
+
+  // create the vector idx64,idx64+1,idx64,idx64+1,...
+  SDValue Idx64 = DAG.getNode(ISD::ADD, DL, MVT::i64, Idx128, Idx128);
+  SDValue SplatIdx64 = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Idx64);
+  SDValue ShuffleMask = DAG.getNode(ISD::ADD, DL, MVT::nxv2i64, SV, SplatIdx64);
+
+  // create the vector Val[idx64],Val[idx64+1],Val[idx64],Val[idx64+1],...
+  SDValue TBL = DAG.getNode(AArch64ISD::TBL, DL, MVT::nxv2i64, V, ShuffleMask);
+  return DAG.getNode(ISD::BITCAST, DL, VT, TBL);
+}
+
 static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
                                APInt &UndefBits) {
   EVT VT = BVN->getValueType(0);
Index: llvm/include/llvm/IR/IntrinsicsAArch64.td
===================================================================
--- llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -961,6 +961,12 @@
                  LLVMVectorElementType<0>],
                 [IntrNoMem]>;
 
+  class AdvSIMD_SVE_DUPQ_Intrinsic
+    : Intrinsic<[llvm_anyvector_ty],
+                [LLVMMatchType<0>,
+                 llvm_i64_ty],
+                [IntrNoMem]>;
+
   class AdvSIMD_SVE_EXPA_Intrinsic
     : Intrinsic<[llvm_anyvector_ty],
                 [LLVMVectorOfBitcastsToInt<0>],
@@ -1474,6 +1480,7 @@
 def int_aarch64_sve_clastb : AdvSIMD_Pred2VectorArg_Intrinsic;
 def int_aarch64_sve_clastb_n : AdvSIMD_SVE_ReduceWithInit_Intrinsic;
 def int_aarch64_sve_compact : AdvSIMD_Pred1VectorArg_Intrinsic;
+def int_aarch64_sve_dupq_lane : AdvSIMD_SVE_DUPQ_Intrinsic;
 def int_aarch64_sve_ext : AdvSIMD_2VectorArgIndexed_Intrinsic;
 def int_aarch64_sve_lasta : AdvSIMD_SVE_Reduce_Intrinsic;
 def int_aarch64_sve_lastb : AdvSIMD_SVE_Reduce_Intrinsic;
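As a side note for readers of the variable-index path in LowerDUPQLane: the TBL fallback mirrors the expansion quoted from the ACLE in the code comment above. Below is a minimal C sketch of that reference semantics using the arm_sve.h ACLE intrinsics named in the comment; the helper name and the choice of 64-bit elements are illustrative only and are not part of this patch.

  #include <arm_sve.h>

  // Reference expansion for a run-time lane index (hypothetical helper name,
  // shown for 64-bit elements; as the tests note, the element type is
  // irrelevant to the operation).
  svint64_t dupq_lane_ref(svint64_t data, uint64_t idx) {
    // 0,1,0,1,... selects the low/high 64-bit element within each 128-bit lane.
    svuint64_t lo_hi = svand_x(svptrue_b64(), svindex_u64(0, 1), 1);
    // Adding 2*idx makes every element pair read from 128-bit lane 'idx'.
    svuint64_t mask = svadd_x(svptrue_b64(), lo_hi, idx * 2);
    // result = { data[2*idx], data[2*idx+1], data[2*idx], data[2*idx+1], ... }
    return svtbl(data, mask);
  }

For a constant index in the range 0-3 the lowering skips this expansion and emits the single DUP form (mov z0.q, z0.q[n]) instead, as the dupq_i8 through dupq_f64 tests check.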