Prathamesh Kulkarni <prathamesh.kulka...@linaro.org> writes:
>> > {
>> >   /* The pattern matching functions above are written to look for a small
>> >      number to begin the sequence (0, 1, N/2).  If we begin with an index
>> > @@ -24084,6 +24112,12 @@ aarch64_expand_vec_perm_const_1 (struct
>> > expand_vec_perm_d *d)
>> > 	  || d->vec_flags == VEC_SVE_PRED)
>> >       && known_gt (nelt, 1))
>> >     {
>> > +      /* If operand and result modes differ, then only check
>> > +	 for dup case.  */
>> > +      if (d->vmode != op_mode)
>> > +	return (d->vec_flags == VEC_SVE_DATA)
>> > +	       ? aarch64_evpc_sve_dup (d, op_mode) : false;
>> > +
>>
>> I think it'd be more future-proof to format this as:
>>
>>   if (d->vmode == d->op_mode)
>>     {
>>       …existing code…
>>     }
>>   else
>>     {
>>       if (aarch64_evpc_sve_dup (d))
>>         return true;
>>     }
>>
>> with the d->vec_flags == VEC_SVE_DATA check being in aarch64_evpc_sve_dup,
>> alongside the op_mode check.  I think we'll be adding more checks here
>> over time.
> Um I was wondering if we should structure it as:
> if (d->vmode == d->op_mode)
>   {
>     ...existing code...
>   }
> if (aarch64_evpc_sve_dup (d))
>   return true;
>
> So we check for dup irrespective of d->vmode == d->op_mode ?

Yeah, I can see the attraction of that.  I think the else is better
though because the fallback TBL handling will (rightly) come at the end
of the existing code.  Without the else, we'd have specific tests like
DUP after generic ones like TBL, so the reader would have to work out
for themselves that DUP and TBL handle disjoint cases.

>> >      if (aarch64_evpc_rev_local (d))
>> > 	return true;
>> >      else if (aarch64_evpc_rev_global (d))
>> > @@ -24105,7 +24139,12 @@ aarch64_expand_vec_perm_const_1 (struct
>> > expand_vec_perm_d *d)
>> >      else if (aarch64_evpc_reencode (d))
>> > 	return true;
>> >      if (d->vec_flags == VEC_SVE_DATA)
>> > -	return aarch64_evpc_sve_tbl (d);
>> > +	{
>> > +	  if (aarch64_evpc_sve_tbl (d))
>> > +	    return true;
>> > +	  else if (aarch64_evpc_sve_dup (d, op_mode))
>> > +	    return true;
>> > +	}
>> >      else if (d->vec_flags == VEC_ADVSIMD)
>> > 	return aarch64_evpc_tbl (d);
>> >    }
>>
>> Is this part still needed, given the above?
>>
>> Thanks,
>> Richard
>>
>> > @@ -24119,9 +24158,6 @@ aarch64_vectorize_vec_perm_const (machine_mode
>> > vmode, machine_mode op_mode,
>> > 				  rtx target, rtx op0, rtx op1,
>> > 				  const vec_perm_indices &sel)
>> >  {
>> > -  if (vmode != op_mode)
>> > -    return false;
>> > -
>> >    struct expand_vec_perm_d d;
>> >
>> >    /* Check whether the mask can be applied to a single vector.  */
>> > @@ -24154,10 +24190,10 @@ aarch64_vectorize_vec_perm_const (machine_mode
>> > vmode, machine_mode op_mode,
>> > 				  rtx target, rtx op0, rtx op1,
>> > 				  const vec_perm_indices &sel)
>> >    d.testing_p = !target;
>> >
>> >    if (!d.testing_p)
>> > -    return aarch64_expand_vec_perm_const_1 (&d);
>> > +    return aarch64_expand_vec_perm_const_1 (&d, op_mode);
>> >
>> >    rtx_insn *last = get_last_insn ();
>> > -  bool ret = aarch64_expand_vec_perm_const_1 (&d);
>> > +  bool ret = aarch64_expand_vec_perm_const_1 (&d, op_mode);
>> >    gcc_assert (last == get_last_insn ());
>> >
>> >    return ret;
>
> diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> index bee410929bd..1a804b1ab73 100644
> --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> @@ -44,6 +44,7 @@
>  #include "aarch64-sve-builtins-shapes.h"
>  #include "aarch64-sve-builtins-base.h"
>  #include "aarch64-sve-builtins-functions.h"
> +#include "ssa.h"
>
>  using namespace aarch64_sve;
>
> @@ -1207,6 +1208,64 @@ public:
>      insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode (0));
>      return e.use_contiguous_load_insn (icode);
>    }
> +
> +  gimple *
> +  fold (gimple_folder &f) const override
> +  {
> +    tree arg0 = gimple_call_arg (f.call, 0);
> +    tree arg1 = gimple_call_arg (f.call, 1);
> +
> +    /* Transform:
> +       lhs = svld1rq ({-1, -1, ... }, arg1)
> +       into:
> +       tmp = mem_ref<vectype> [(int * {ref-all}) arg1]
> +       lhs = vec_perm_expr<tmp, tmp, {0, 1, 2, 3, ...}>.
> +       on little endian target.
> +       vectype is the corresponding ADVSIMD type.  */
> +
> +    if (!BYTES_BIG_ENDIAN
> +	&& integer_all_onesp (arg0))
> +      {
> +	tree lhs = gimple_call_lhs (f.call);
> +	tree lhs_type = TREE_TYPE (lhs);
> +	poly_uint64 lhs_len = TYPE_VECTOR_SUBPARTS (lhs_type);
> +	tree eltype = TREE_TYPE (lhs_type);
> +
> +	scalar_mode elmode = GET_MODE_INNER (TYPE_MODE (lhs_type));
> +	machine_mode vq_mode = aarch64_vq_mode (elmode).require ();
> +	tree vectype = build_vector_type_for_mode (eltype, vq_mode);
> +
> +	tree elt_ptr_type
> +	  = build_pointer_type_for_mode (eltype, VOIDmode, true);
> +	tree zero = build_zero_cst (elt_ptr_type);
> +
> +	/* Use element type alignment.  */
> +	tree access_type
> +	  = build_aligned_type (vectype, TYPE_ALIGN (eltype));
> +
> +	tree mem_ref_lhs = make_ssa_name_fn (cfun, access_type, 0);
> +	tree mem_ref_op = fold_build2 (MEM_REF, access_type, arg1, zero);
> +	gimple *mem_ref_stmt
> +	  = gimple_build_assign (mem_ref_lhs, mem_ref_op);
> +	gsi_insert_before (f.gsi, mem_ref_stmt, GSI_SAME_STMT);
> +
> +	int source_nelts = TYPE_VECTOR_SUBPARTS (access_type).to_constant ();
> +	vec_perm_builder sel (lhs_len, source_nelts, 1);
> +	for (int i = 0; i < source_nelts; i++)
> +	  sel.quick_push (i);
> +
> +	vec_perm_indices indices (sel, 1, source_nelts);
> +	gcc_checking_assert (can_vec_perm_const_p (TYPE_MODE (lhs_type),
> +						   TYPE_MODE (access_type),
> +						   indices));
> +	tree mask_type = build_vector_type (ssizetype, lhs_len);
> +	tree mask = vec_perm_indices_to_tree (mask_type, indices);
> +	return gimple_build_assign (lhs, VEC_PERM_EXPR,
> +				    mem_ref_lhs, mem_ref_lhs, mask);
> +      }
> +
> +    return NULL;
> +  }
>  };
>
>  class svld1ro_impl : public load_replicate
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index d4c575ce976..bb24701b0d2 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -23395,8 +23395,10 @@ struct expand_vec_perm_d
>  {
>    rtx target, op0, op1;
>    vec_perm_indices perm;
> +  machine_mode op_mode;
>    machine_mode vmode;
>    unsigned int vec_flags;
> +  unsigned int op_vec_flags;

Very minor, but it would be good to keep the order consistent: output
mode first or input mode first.  Guess it might as well be output mode
first, to match the hook:

  machine_mode vmode;
  machine_mode op_mode;
  unsigned int vec_flags;
  unsigned int op_vec_flags;

>    bool one_vector_p;
>    bool testing_p;
>  };
> @@ -23945,6 +23947,32 @@ aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
>    return true;
>  }
>
> +/* Try to implement D using SVE dup instruction.  */
> +
> +static bool
> +aarch64_evpc_sve_dup (struct expand_vec_perm_d *d)
> +{
> +  if (BYTES_BIG_ENDIAN
> +      || !d->one_vector_p
> +      || d->vec_flags != VEC_SVE_DATA
> +      || d->op_vec_flags != VEC_ADVSIMD

Sorry, one more: DUPQ only handles 128-bit AdvSIMD modes, so we also need:

  || !known_eq (GET_MODE_BITSIZE (d->op_mode), 128)

This isn't redundant with any of the other tests.  (We can use DUP .D
for 64-bit input vectors, but that's a separate patch.)

OK with those changes (including using "else" :-)), thanks.

Richard

> +      || d->perm.encoding ().nelts_per_pattern () != 1
> +      || !known_eq (d->perm.encoding ().npatterns (),
> +		    GET_MODE_NUNITS (d->op_mode)))
> +    return false;
> +
> +  int npatterns = d->perm.encoding ().npatterns ();
> +  for (int i = 0; i < npatterns; i++)
> +    if (!known_eq (d->perm[i], i))
> +      return false;
> +
> +  if (d->testing_p)
> +    return true;
> +
> +  aarch64_expand_sve_dupq (d->target, GET_MODE (d->target), d->op0);
> +  return true;
> +}
> +
>  /* Try to implement D using SVE SEL instruction.  */
>
>  static bool
> @@ -24084,30 +24112,39 @@ aarch64_expand_vec_perm_const_1 (struct
> expand_vec_perm_d *d)
> 	  || d->vec_flags == VEC_SVE_PRED)
>        && known_gt (nelt, 1))
>      {
> -      if (aarch64_evpc_rev_local (d))
> -	return true;
> -      else if (aarch64_evpc_rev_global (d))
> -	return true;
> -      else if (aarch64_evpc_ext (d))
> -	return true;
> -      else if (aarch64_evpc_dup (d))
> -	return true;
> -      else if (aarch64_evpc_zip (d))
> -	return true;
> -      else if (aarch64_evpc_uzp (d))
> -	return true;
> -      else if (aarch64_evpc_trn (d))
> -	return true;
> -      else if (aarch64_evpc_sel (d))
> -	return true;
> -      else if (aarch64_evpc_ins (d))
> -	return true;
> -      else if (aarch64_evpc_reencode (d))
> +      /* If operand and result modes differ, then only check
> +	 for dup case.  */
> +      if (d->vmode == d->op_mode)
> +	{
> +	  if (aarch64_evpc_rev_local (d))
> +	    return true;
> +	  else if (aarch64_evpc_rev_global (d))
> +	    return true;
> +	  else if (aarch64_evpc_ext (d))
> +	    return true;
> +	  else if (aarch64_evpc_dup (d))
> +	    return true;
> +	  else if (aarch64_evpc_zip (d))
> +	    return true;
> +	  else if (aarch64_evpc_uzp (d))
> +	    return true;
> +	  else if (aarch64_evpc_trn (d))
> +	    return true;
> +	  else if (aarch64_evpc_sel (d))
> +	    return true;
> +	  else if (aarch64_evpc_ins (d))
> +	    return true;
> +	  else if (aarch64_evpc_reencode (d))
> +	    return true;
> +
> +	  if (d->vec_flags == VEC_SVE_DATA)
> +	    return aarch64_evpc_sve_tbl (d);
> +	  else if (d->vec_flags == VEC_ADVSIMD)
> +	    return aarch64_evpc_tbl (d);
> +	}
> +
> +      if (aarch64_evpc_sve_dup (d))
> 	return true;
> -      if (d->vec_flags == VEC_SVE_DATA)
> -	return aarch64_evpc_sve_tbl (d);
> -      else if (d->vec_flags == VEC_ADVSIMD)
> -	return aarch64_evpc_tbl (d);
>      }
>    return false;
>  }
> @@ -24119,9 +24156,6 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode,
> machine_mode op_mode,
> 				  rtx target, rtx op0, rtx op1,
> 				  const vec_perm_indices &sel)
>  {
> -  if (vmode != op_mode)
> -    return false;
> -
>    struct expand_vec_perm_d d;
>
>    /* Check whether the mask can be applied to a single vector.  */
> @@ -24145,6 +24179,8 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode,
> machine_mode op_mode,
> 					     sel.nelts_per_input ());
>    d.vmode = vmode;
>    d.vec_flags = aarch64_classify_vector_mode (d.vmode);
> +  d.op_mode = op_mode;
> +  d.op_vec_flags = aarch64_classify_vector_mode (d.op_mode);
>    d.target = target;
>    d.op0 = op0 ? force_reg (vmode, op0) : NULL_RTX;
>    if (op0 == op1)
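
For reference, the kind of source the svld1rq fold above is aimed at
looks roughly like this (a minimal sketch, assuming little endian and
an all-true predicate; the function name and compile flags are
illustrative, not taken from the patch or its testsuite):

  /* Compile with e.g. -O2 -march=armv8.2-a+sve.  */
  #include <arm_sve.h>

  svint32_t
  foo (int32_t *x)
  {
    /* With an all-true predicate, the fold would lower this call to a
       128-bit vector load followed by a VEC_PERM_EXPR that repeats
       lanes {0, 1, 2, 3} across the whole SVE vector, which
       aarch64_evpc_sve_dup can then match to a single DUPQ.  */
    return svld1rq_s32 (svptrue_b8 (), x);
  }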