This revision was automatically updated to reflect the committed changes. Closed by commit rGcdcc4f2a44b5: [AArch64][SVE] Add intrinsic for non-faulting loads (authored by kmclaughlin).
Changed prior to commit:
  https://reviews.llvm.org/D71698?vs=239144&id=239531#toc

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D71698/new/

https://reviews.llvm.org/D71698

Files:
  llvm/include/llvm/IR/IntrinsicsAArch64.td
  llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
  llvm/lib/Target/AArch64/AArch64ISelLowering.h
  llvm/lib/Target/AArch64/AArch64InstrInfo.td
  llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
  llvm/lib/Target/AArch64/SVEInstrFormats.td
  llvm/test/CodeGen/AArch64/sve-intrinsics-loads-nf.ll
Index: llvm/test/CodeGen/AArch64/sve-intrinsics-loads-nf.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/sve-intrinsics-loads-nf.ll @@ -0,0 +1,182 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s + +define <vscale x 16 x i8> @ldnf1b(<vscale x 16 x i1> %pg, i8* %a) { +; CHECK-LABEL: ldnf1b: +; CHECK: ldnf1b { z0.b }, p0/z, [x0] +; CHECK-NEXT: ret + %load = call <vscale x 16 x i8> @llvm.aarch64.sve.ldnf1.nxv16i8(<vscale x 16 x i1> %pg, i8* %a) + ret <vscale x 16 x i8> %load +} + +define <vscale x 8 x i16> @ldnf1b_h(<vscale x 8 x i1> %pg, i8* %a) { +; CHECK-LABEL: ldnf1b_h: +; CHECK: ldnf1b { z0.h }, p0/z, [x0] +; CHECK-NEXT: ret + %load = call <vscale x 8 x i8> @llvm.aarch64.sve.ldnf1.nxv8i8(<vscale x 8 x i1> %pg, i8* %a) + %res = zext <vscale x 8 x i8> %load to <vscale x 8 x i16> + ret <vscale x 8 x i16> %res +} + +define <vscale x 8 x i16> @ldnf1sb_h(<vscale x 8 x i1> %pg, i8* %a) { +; CHECK-LABEL: ldnf1sb_h: +; CHECK: ldnf1sb { z0.h }, p0/z, [x0] +; CHECK-NEXT: ret + %load = call <vscale x 8 x i8> @llvm.aarch64.sve.ldnf1.nxv8i8(<vscale x 8 x i1> %pg, i8* %a) + %res = sext <vscale x 8 x i8> %load to <vscale x 8 x i16> + ret <vscale x 8 x i16> %res +} + +define <vscale x 8 x i16> @ldnf1h(<vscale x 8 x i1> %pg, i16* %a) { +; CHECK-LABEL: ldnf1h: +; CHECK: ldnf1h { z0.h }, p0/z, [x0] +; CHECK-NEXT: ret + %load = call <vscale x 8 x i16> @llvm.aarch64.sve.ldnf1.nxv8i16(<vscale x 8 x i1> %pg, i16* %a) + ret <vscale x 8 x i16> %load +} + +define <vscale x 8 x half> @ldnf1h_f16(<vscale x 8 x i1> %pg, half* %a) { +; CHECK-LABEL: ldnf1h_f16: +; CHECK: ldnf1h { z0.h }, p0/z, [x0] +; CHECK-NEXT: ret + %load = call <vscale x 8 x half> @llvm.aarch64.sve.ldnf1.nxv8f16(<vscale x 8 x i1> %pg, half* %a) + ret <vscale x 8 x half> %load +} + +define <vscale x 4 x i32> @ldnf1b_s(<vscale x 4 x i1> %pg, i8* %a) { +; CHECK-LABEL: ldnf1b_s: +; CHECK: ldnf1b { z0.s }, p0/z, [x0] +; CHECK-NEXT: ret + 
%load = call <vscale x 4 x i8> @llvm.aarch64.sve.ldnf1.nxv4i8(<vscale x 4 x i1> %pg, i8* %a) + %res = zext <vscale x 4 x i8> %load to <vscale x 4 x i32> + ret <vscale x 4 x i32> %res +} + +define <vscale x 4 x i32> @ldnf1sb_s(<vscale x 4 x i1> %pg, i8* %a) { +; CHECK-LABEL: ldnf1sb_s: +; CHECK: ldnf1sb { z0.s }, p0/z, [x0] +; CHECK-NEXT: ret + %load = call <vscale x 4 x i8> @llvm.aarch64.sve.ldnf1.nxv4i8(<vscale x 4 x i1> %pg, i8* %a) + %res = sext <vscale x 4 x i8> %load to <vscale x 4 x i32> + ret <vscale x 4 x i32> %res +} + +define <vscale x 4 x i32> @ldnf1h_s(<vscale x 4 x i1> %pg, i16* %a) { +; CHECK-LABEL: ldnf1h_s: +; CHECK: ldnf1h { z0.s }, p0/z, [x0] +; CHECK-NEXT: ret + %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ldnf1.nxv4i16(<vscale x 4 x i1> %pg, i16* %a) + %res = zext <vscale x 4 x i16> %load to <vscale x 4 x i32> + ret <vscale x 4 x i32> %res +} + +define <vscale x 4 x i32> @ldnf1sh_s(<vscale x 4 x i1> %pg, i16* %a) { +; CHECK-LABEL: ldnf1sh_s: +; CHECK: ldnf1sh { z0.s }, p0/z, [x0] +; CHECK-NEXT: ret + %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ldnf1.nxv4i16(<vscale x 4 x i1> %pg, i16* %a) + %res = sext <vscale x 4 x i16> %load to <vscale x 4 x i32> + ret <vscale x 4 x i32> %res +} + +define <vscale x 4 x i32> @ldnf1w(<vscale x 4 x i1> %pg, i32* %a) { +; CHECK-LABEL: ldnf1w: +; CHECK: ldnf1w { z0.s }, p0/z, [x0] +; CHECK-NEXT: ret + %load = call <vscale x 4 x i32> @llvm.aarch64.sve.ldnf1.nxv4i32(<vscale x 4 x i1> %pg, i32* %a) + ret <vscale x 4 x i32> %load +} + +define <vscale x 4 x float> @ldnf1w_f32(<vscale x 4 x i1> %pg, float* %a) { +; CHECK-LABEL: ldnf1w_f32: +; CHECK: ldnf1w { z0.s }, p0/z, [x0] +; CHECK-NEXT: ret + %load = call <vscale x 4 x float> @llvm.aarch64.sve.ldnf1.nxv4f32(<vscale x 4 x i1> %pg, float* %a) + ret <vscale x 4 x float> %load +} + +define <vscale x 2 x i64> @ldnf1b_d(<vscale x 2 x i1> %pg, i8* %a) { +; CHECK-LABEL: ldnf1b_d: +; CHECK: ldnf1b { z0.d }, p0/z, [x0] +; CHECK-NEXT: ret + %load = call <vscale x 2 
x i8> @llvm.aarch64.sve.ldnf1.nxv2i8(<vscale x 2 x i1> %pg, i8* %a) + %res = zext <vscale x 2 x i8> %load to <vscale x 2 x i64> + ret <vscale x 2 x i64> %res +} + +define <vscale x 2 x i64> @ldnf1sb_d(<vscale x 2 x i1> %pg, i8* %a) { +; CHECK-LABEL: ldnf1sb_d: +; CHECK: ldnf1sb { z0.d }, p0/z, [x0] +; CHECK-NEXT: ret + %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ldnf1.nxv2i8(<vscale x 2 x i1> %pg, i8* %a) + %res = sext <vscale x 2 x i8> %load to <vscale x 2 x i64> + ret <vscale x 2 x i64> %res +} + +define <vscale x 2 x i64> @ldnf1h_d(<vscale x 2 x i1> %pg, i16* %a) { +; CHECK-LABEL: ldnf1h_d: +; CHECK: ldnf1h { z0.d }, p0/z, [x0] +; CHECK-NEXT: ret + %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ldnf1.nxv2i16(<vscale x 2 x i1> %pg, i16* %a) + %res = zext <vscale x 2 x i16> %load to <vscale x 2 x i64> + ret <vscale x 2 x i64> %res +} + +define <vscale x 2 x i64> @ldnf1sh_d(<vscale x 2 x i1> %pg, i16* %a) { +; CHECK-LABEL: ldnf1sh_d: +; CHECK: ldnf1sh { z0.d }, p0/z, [x0] +; CHECK-NEXT: ret + %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ldnf1.nxv2i16(<vscale x 2 x i1> %pg, i16* %a) + %res = sext <vscale x 2 x i16> %load to <vscale x 2 x i64> + ret <vscale x 2 x i64> %res +} + +define <vscale x 2 x i64> @ldnf1w_d(<vscale x 2 x i1> %pg, i32* %a) { +; CHECK-LABEL: ldnf1w_d: +; CHECK: ldnf1w { z0.d }, p0/z, [x0] +; CHECK-NEXT: ret + %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ldnf1.nxv2i32(<vscale x 2 x i1> %pg, i32* %a) + %res = zext <vscale x 2 x i32> %load to <vscale x 2 x i64> + ret <vscale x 2 x i64> %res +} + +define <vscale x 2 x i64> @ldnf1sw_d(<vscale x 2 x i1> %pg, i32* %a) { +; CHECK-LABEL: ldnf1sw_d: +; CHECK: ldnf1sw { z0.d }, p0/z, [x0] +; CHECK-NEXT: ret + %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ldnf1.nxv2i32(<vscale x 2 x i1> %pg, i32* %a) + %res = sext <vscale x 2 x i32> %load to <vscale x 2 x i64> + ret <vscale x 2 x i64> %res +} + +define <vscale x 2 x i64> @ldnf1d(<vscale x 2 x i1> %pg, i64* %a) { +; CHECK-LABEL: ldnf1d: 
+; CHECK: ldnf1d { z0.d }, p0/z, [x0] +; CHECK-NEXT: ret + %load = call <vscale x 2 x i64> @llvm.aarch64.sve.ldnf1.nxv2i64(<vscale x 2 x i1> %pg, i64* %a) + ret <vscale x 2 x i64> %load +} + +define <vscale x 2 x double> @ldnf1d_f64(<vscale x 2 x i1> %pg, double* %a) { +; CHECK-LABEL: ldnf1d_f64: +; CHECK: ldnf1d { z0.d }, p0/z, [x0] +; CHECK-NEXT: ret + %load = call <vscale x 2 x double> @llvm.aarch64.sve.ldnf1.nxv2f64(<vscale x 2 x i1> %pg, double* %a) + ret <vscale x 2 x double> %load +} + +declare <vscale x 16 x i8> @llvm.aarch64.sve.ldnf1.nxv16i8(<vscale x 16 x i1>, i8*) + +declare <vscale x 8 x i8> @llvm.aarch64.sve.ldnf1.nxv8i8(<vscale x 8 x i1>, i8*) +declare <vscale x 8 x i16> @llvm.aarch64.sve.ldnf1.nxv8i16(<vscale x 8 x i1>, i16*) +declare <vscale x 8 x half> @llvm.aarch64.sve.ldnf1.nxv8f16(<vscale x 8 x i1>, half*) + +declare <vscale x 4 x i8> @llvm.aarch64.sve.ldnf1.nxv4i8(<vscale x 4 x i1>, i8*) +declare <vscale x 4 x i16> @llvm.aarch64.sve.ldnf1.nxv4i16(<vscale x 4 x i1>, i16*) +declare <vscale x 4 x i32> @llvm.aarch64.sve.ldnf1.nxv4i32(<vscale x 4 x i1>, i32*) +declare <vscale x 4 x float> @llvm.aarch64.sve.ldnf1.nxv4f32(<vscale x 4 x i1>, float*) + +declare <vscale x 2 x i8> @llvm.aarch64.sve.ldnf1.nxv2i8(<vscale x 2 x i1>, i8*) +declare <vscale x 2 x i16> @llvm.aarch64.sve.ldnf1.nxv2i16(<vscale x 2 x i1>, i16*) +declare <vscale x 2 x i32> @llvm.aarch64.sve.ldnf1.nxv2i32(<vscale x 2 x i1>, i32*) +declare <vscale x 2 x i64> @llvm.aarch64.sve.ldnf1.nxv2i64(<vscale x 2 x i1>, i64*) +declare <vscale x 2 x double> @llvm.aarch64.sve.ldnf1.nxv2f64(<vscale x 2 x i1>, double*) Index: llvm/lib/Target/AArch64/SVEInstrFormats.td =================================================================== --- llvm/lib/Target/AArch64/SVEInstrFormats.td +++ llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -5557,14 +5557,21 @@ multiclass sve_mem_cld_si_base<bits<4> dtype, bit nf, string asm, RegisterOperand listty, ZPRRegOp zprty> { - def "" : sve_mem_cld_si_base<dtype, nf, 
asm, listty>; + def _REAL : sve_mem_cld_si_base<dtype, nf, asm, listty>; def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]", - (!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 0>; + (!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 0>; def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $imm4, mul vl]", - (!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4), 0>; + (!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4), 0>; def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]", - (!cast<Instruction>(NAME) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>; + (!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>; + + // We need a layer of indirection because early machine code passes balk at + // physical register (i.e. FFR) uses that have no previous definition. + let hasSideEffects = 1, hasNoSchedulingInfo = 1, mayLoad = 1 in { + def "" : Pseudo<(outs listty:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4), []>, + PseudoInstExpansion<(!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4)>; + } } multiclass sve_mem_cld_si<bits<4> dtype, string asm, RegisterOperand listty, Index: llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td =================================================================== --- llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -57,6 +57,7 @@ def sve_cntw_imm_neg : ComplexPattern<i32, 1, "SelectRDVLImm<1, 16, -4>">; def sve_cntd_imm_neg : ComplexPattern<i32, 1, "SelectRDVLImm<1, 16, -2>">; +def AArch64ldnf1s : SDNode<"AArch64ISD::LDNF1S", SDT_AArch64_LDNF1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; def AArch64ld1s_gather : SDNode<"AArch64ISD::GLD1S", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; def AArch64ld1s_gather_scaled : SDNode<"AArch64ISD::GLD1S_SCALED", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; def 
AArch64ld1s_gather_uxtw : SDNode<"AArch64ISD::GLD1S_UXTW", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; @@ -1259,6 +1260,40 @@ defm : pred_store<nxv8i16, nxv8i1, non_temporal_store, STNT1H_ZRI>; defm : pred_store<nxv4i32, nxv4i1, non_temporal_store, STNT1W_ZRI>; defm : pred_store<nxv2i64, nxv2i1, non_temporal_store, STNT1D_ZRI>; + + multiclass ldnf1<Instruction I, ValueType Ty, SDPatternOperator Load, ValueType PredTy, ValueType MemVT> { + // base + def : Pat<(Ty (Load (PredTy PPR:$gp), GPR64:$base, MemVT)), + (I PPR:$gp, GPR64sp:$base, (i64 0))>; + } + + // 2-element contiguous non-faulting loads + defm : ldnf1<LDNF1B_D_IMM, nxv2i64, AArch64ldnf1, nxv2i1, nxv2i8>; + defm : ldnf1<LDNF1SB_D_IMM, nxv2i64, AArch64ldnf1s, nxv2i1, nxv2i8>; + defm : ldnf1<LDNF1H_D_IMM, nxv2i64, AArch64ldnf1, nxv2i1, nxv2i16>; + defm : ldnf1<LDNF1SH_D_IMM, nxv2i64, AArch64ldnf1s, nxv2i1, nxv2i16>; + defm : ldnf1<LDNF1W_D_IMM, nxv2i64, AArch64ldnf1, nxv2i1, nxv2i32>; + defm : ldnf1<LDNF1SW_D_IMM, nxv2i64, AArch64ldnf1s, nxv2i1, nxv2i32>; + defm : ldnf1<LDNF1D_IMM, nxv2i64, AArch64ldnf1, nxv2i1, nxv2i64>; + defm : ldnf1<LDNF1D_IMM, nxv2f64, AArch64ldnf1, nxv2i1, nxv2f64>; + + // 4-element contiguous non-faulting loads + defm : ldnf1<LDNF1B_S_IMM, nxv4i32, AArch64ldnf1, nxv4i1, nxv4i8>; + defm : ldnf1<LDNF1SB_S_IMM, nxv4i32, AArch64ldnf1s, nxv4i1, nxv4i8>; + defm : ldnf1<LDNF1H_S_IMM, nxv4i32, AArch64ldnf1, nxv4i1, nxv4i16>; + defm : ldnf1<LDNF1SH_S_IMM, nxv4i32, AArch64ldnf1s, nxv4i1, nxv4i16>; + defm : ldnf1<LDNF1W_IMM, nxv4i32, AArch64ldnf1, nxv4i1, nxv4i32>; + defm : ldnf1<LDNF1W_IMM, nxv4f32, AArch64ldnf1, nxv4i1, nxv4f32>; + + // 8-element contiguous non-faulting loads + defm : ldnf1<LDNF1B_H_IMM, nxv8i16, AArch64ldnf1, nxv8i1, nxv8i8>; + defm : ldnf1<LDNF1SB_H_IMM, nxv8i16, AArch64ldnf1s, nxv8i1, nxv8i8>; + defm : ldnf1<LDNF1H_IMM, nxv8i16, AArch64ldnf1, nxv8i1, nxv8i16>; + defm : ldnf1<LDNF1H_IMM, nxv8f16, AArch64ldnf1, nxv8i1, nxv8f16>; + + // 16-element contiguous 
non-faulting loads + defm : ldnf1<LDNF1B_IMM, nxv16i8, AArch64ldnf1, nxv16i1, nxv16i8>; + } let Predicates = [HasSVE2] in { Index: llvm/lib/Target/AArch64/AArch64InstrInfo.td =================================================================== --- llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -549,6 +549,13 @@ def AArch64tbl : SDNode<"AArch64ISD::TBL", SDT_AArch64TBL>; +def SDT_AArch64_LDNF1 : SDTypeProfile<1, 3, [ + SDTCisVec<0>, SDTCisVec<1>, SDTCisPtrTy<2>, + SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1> +]>; + +def AArch64ldnf1 : SDNode<"AArch64ISD::LDNF1", SDT_AArch64_LDNF1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; + //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// Index: llvm/lib/Target/AArch64/AArch64ISelLowering.h =================================================================== --- llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -215,6 +215,9 @@ PTEST, PTRUE, + LDNF1, + LDNF1S, + // Unsigned gather loads. 
GLD1, GLD1_SCALED, Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1373,6 +1373,8 @@ case AArch64ISD::INSR: return "AArch64ISD::INSR"; case AArch64ISD::PTEST: return "AArch64ISD::PTEST"; case AArch64ISD::PTRUE: return "AArch64ISD::PTRUE"; + case AArch64ISD::LDNF1: return "AArch64ISD::LDNF1"; + case AArch64ISD::LDNF1S: return "AArch64ISD::LDNF1S"; case AArch64ISD::GLD1: return "AArch64ISD::GLD1"; case AArch64ISD::GLD1_SCALED: return "AArch64ISD::GLD1_SCALED"; case AArch64ISD::GLD1_SXTW: return "AArch64ISD::GLD1_SXTW"; @@ -10225,9 +10227,14 @@ if (!Src.hasOneUse()) return SDValue(); - // GLD1* instructions perform an implicit zero-extend, which makes them + EVT MemVT; + + // SVE load instructions perform an implicit zero-extend, which makes them // perfect candidates for combining. switch (Src->getOpcode()) { + case AArch64ISD::LDNF1: + MemVT = cast<VTSDNode>(Src->getOperand(3))->getVT(); + break; case AArch64ISD::GLD1: case AArch64ISD::GLD1_SCALED: case AArch64ISD::GLD1_SXTW: @@ -10235,13 +10242,12 @@ case AArch64ISD::GLD1_UXTW: case AArch64ISD::GLD1_UXTW_SCALED: case AArch64ISD::GLD1_IMM: + MemVT = cast<VTSDNode>(Src->getOperand(4))->getVT(); break; default: return SDValue(); } - EVT MemVT = cast<VTSDNode>(Src->getOperand(4))->getVT(); - if (isConstantSplatVectorMaskForType(Mask.getNode(), MemVT)) return Src; @@ -11217,6 +11223,35 @@ return NewST1; } +// Returns an SVE type that ContentTy can be trivially sign or zero extended +// into. 
+static MVT getSVEContainerType(EVT ContentTy) { + assert(ContentTy.isSimple() && "No SVE containers for extended types"); + + switch (ContentTy.getSimpleVT().SimpleTy) { + default: + llvm_unreachable("No known SVE container for this MVT type"); + case MVT::nxv2i8: + case MVT::nxv2i16: + case MVT::nxv2i32: + case MVT::nxv2i64: + case MVT::nxv2f32: + case MVT::nxv2f64: + return MVT::nxv2i64; + case MVT::nxv4i8: + case MVT::nxv4i16: + case MVT::nxv4i32: + case MVT::nxv4f32: + return MVT::nxv4i32; + case MVT::nxv8i8: + case MVT::nxv8i16: + case MVT::nxv8f16: + return MVT::nxv8i16; + case MVT::nxv16i8: + return MVT::nxv16i8; + } +} + static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG) { SDLoc DL(N); EVT VT = N->getValueType(0); @@ -11259,6 +11294,32 @@ ISD::UNINDEXED, false, false); } +static SDValue performLDNF1Combine(SDNode *N, SelectionDAG &DAG) { + SDLoc DL(N); + EVT VT = N->getValueType(0); + + if (VT.getSizeInBits().getKnownMinSize() > AArch64::SVEBitsPerBlock) + return SDValue(); + + EVT ContainerVT = VT; + if (ContainerVT.isInteger()) + ContainerVT = getSVEContainerType(ContainerVT); + + SDVTList VTs = DAG.getVTList(ContainerVT, MVT::Other); + SDValue Ops[] = { N->getOperand(0), // Chain + N->getOperand(2), // Pg + N->getOperand(3), // Base + DAG.getValueType(VT) }; + + SDValue Load = DAG.getNode(AArch64ISD::LDNF1, DL, VTs, Ops); + SDValue LoadChain = SDValue(Load.getNode(), 1); + + if (ContainerVT.isInteger() && (VT != ContainerVT)) + Load = DAG.getNode(ISD::TRUNCATE, DL, VT, Load.getValue(0)); + + return DAG.getMergeValues({ Load, LoadChain }, DL); +} + /// Replace a splat of zeros to a vector store by scalar stores of WZR/XZR. The /// load store optimizer pass will merge them to store pair stores. This should /// be better than a movi to create the vector zero followed by a vector store @@ -12310,29 +12371,6 @@ DAG.getConstant(MinOffset, DL, MVT::i64)); } -// Returns an SVE type that ContentTy can be trivially sign or zero extended -// into. 
-static MVT getSVEContainerType(EVT ContentTy) { - assert(ContentTy.isSimple() && "No SVE containers for extended types"); - - switch (ContentTy.getSimpleVT().SimpleTy) { - default: - llvm_unreachable("No known SVE container for this MVT type"); - case MVT::nxv2i8: - case MVT::nxv2i16: - case MVT::nxv2i32: - case MVT::nxv2i64: - case MVT::nxv2f32: - case MVT::nxv2f64: - return MVT::nxv2i64; - case MVT::nxv4i8: - case MVT::nxv4i16: - case MVT::nxv4i32: - case MVT::nxv4f32: - return MVT::nxv4i32; - } -} - static SDValue performST1ScatterCombine(SDNode *N, SelectionDAG &DAG, unsigned Opcode, bool OnlyPackedOffsets = true) { @@ -12520,10 +12558,15 @@ SDValue Src = N->getOperand(0); unsigned Opc = Src->getOpcode(); - // Gather load nodes (e.g. AArch64ISD::GLD1) are straightforward candidates + // SVE load nodes (e.g. AArch64ISD::GLD1) are straightforward candidates // for DAG Combine with SIGN_EXTEND_INREG. Bail out for all other nodes. unsigned NewOpc; + unsigned MemVTOpNum = 4; switch (Opc) { + case AArch64ISD::LDNF1: + NewOpc = AArch64ISD::LDNF1S; + MemVTOpNum = 3; + break; case AArch64ISD::GLD1: NewOpc = AArch64ISD::GLD1S; break; @@ -12550,15 +12593,17 @@ } EVT SignExtSrcVT = cast<VTSDNode>(N->getOperand(1))->getVT(); - EVT GLD1SrcMemVT = cast<VTSDNode>(Src->getOperand(4))->getVT(); + EVT SrcMemVT = cast<VTSDNode>(Src->getOperand(MemVTOpNum))->getVT(); - if ((SignExtSrcVT != GLD1SrcMemVT) || !Src.hasOneUse()) + if ((SignExtSrcVT != SrcMemVT) || !Src.hasOneUse()) return SDValue(); EVT DstVT = N->getValueType(0); SDVTList VTs = DAG.getVTList(DstVT, MVT::Other); - SDValue Ops[] = {Src->getOperand(0), Src->getOperand(1), Src->getOperand(2), - Src->getOperand(3), Src->getOperand(4)}; + + SmallVector<SDValue, 5> Ops; + for (unsigned I = 0; I < Src->getNumOperands(); ++I) + Ops.push_back(Src->getOperand(I)); SDValue ExtLoad = DAG.getNode(NewOpc, SDLoc(N), VTs, Ops); DCI.CombineTo(N, ExtLoad); @@ -12656,6 +12701,8 @@ return performNEONPostLDSTCombine(N, DCI, DAG); case 
Intrinsic::aarch64_sve_ldnt1: return performLDNT1Combine(N, DAG); + case Intrinsic::aarch64_sve_ldnf1: + return performLDNF1Combine(N, DAG); case Intrinsic::aarch64_sve_stnt1: return performSTNT1Combine(N, DAG); case Intrinsic::aarch64_sve_ld1_gather: Index: llvm/include/llvm/IR/IntrinsicsAArch64.td =================================================================== --- llvm/include/llvm/IR/IntrinsicsAArch64.td +++ llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -775,6 +775,12 @@ LLVMPointerTo<0>], [IntrReadMem, IntrArgMemOnly]>; + class AdvSIMD_1Vec_PredFaultingLoad_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + LLVMPointerToElt<0>], + [IntrReadMem, IntrArgMemOnly]>; + class AdvSIMD_1Vec_PredStore_Intrinsic : Intrinsic<[], [llvm_anyvector_ty, @@ -1169,6 +1175,8 @@ def int_aarch64_sve_ldnt1 : AdvSIMD_1Vec_PredLoad_Intrinsic; +def int_aarch64_sve_ldnf1 : AdvSIMD_1Vec_PredFaultingLoad_Intrinsic; + // // Stores //
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits