On Mon, Oct 13, 2025 at 11:47 AM Robin Dapp <[email protected]> wrote:
>
> Hi,
>
> this is a refactor that moves the setting of slp_perm, checking of basic-block
> SLP gaps, as well as the final check for perm_ok into get_load_store_type.
> Also, slp_perm is moved to ls_data.
Thanks - I wanted to move this as well.
> In order to help legibility, the patch introduces an enum
> vect_gather_scatter_subtype that specifies the gather/scatter
> "sub-type", like strided or grouped_strided. It's only used in one
> place for now, though.
Hmm, how about instead using VMAT_GATHER_SCATTER_STRIDED and
VMAT_GATHER_SCATTER_GROUPED_STRIDED (though I'm not sure why we need
to distinguish the latter via some extra kind at all)? I don't like
splitting up VMAT_* unless it's really necessary.
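
A rough sketch of what I mean - the names are placeholders and the
existing kinds are elided, so this is purely illustrative, not a
concrete proposal:

  enum vect_memory_access_type {
    /* ... existing kinds ... */
    VMAT_GATHER_SCATTER_IFN,
    /* Hypothetical replacements for the separate sub-type enum.  */
    VMAT_GATHER_SCATTER_STRIDED,
    VMAT_GATHER_SCATTER_GROUPED_STRIDED,
    VMAT_GATHER_SCATTER_EMULATED
  };

The new check in get_load_store_type could then test the VMAT_* kind
directly and the local gather_scatter_subtype variable would go away:

  if (SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()
      && !(*memory_access_type == VMAT_ELEMENTWISE
           || *memory_access_type == VMAT_GATHER_SCATTER_STRIDED))
    ...

mat_gather_scatter_p () and the existing checks for
VMAT_GATHER_SCATTER_IFN would have to cover the new kinds, of course.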
> Bootstrapped and regtested on x86 and power10. Regtested on aarch64 and
> rv64gcv_zvl512b.
>
> Regards
> Robin
>
> gcc/ChangeLog:
>
> * tree-vect-stmts.cc (get_load_store_type): Add load-permutation
> checks and setting of slp_perm.
> (vectorizable_store): Remove perm_ok argument.
> (vectorizable_load): Ditto and replace slp_perm by ls.slp_perm.
> * tree-vectorizer.h (enum vect_gather_scatter_subtype): New.
> (struct vect_load_store_data): Add slp_perm.
> ---
> gcc/tree-vect-stmts.cc | 181 ++++++++++++++++++++++-------------------
> gcc/tree-vectorizer.h | 9 ++
> 2 files changed, 105 insertions(+), 85 deletions(-)
>
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index bf5a67bf805..ef33638af74 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -2062,16 +2062,13 @@ vector_vector_composition_type (tree vtype, poly_uint64 nelts, tree *ptype,
> VECTYPE is the vector type that the vectorized statements will use.
>
> If ELSVALS is nonzero the supported else values will be stored in the
> - vector ELSVALS points to.
> -
> - For loads PERM_OK indicates whether we can code generate a
> - SLP_TREE_LOAD_PERMUTATION on the node. */
> + vector ELSVALS points to. */
>
> static bool
> get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
> tree vectype, slp_tree slp_node,
> bool masked_p, vec_load_store_type vls_type,
> - bool perm_ok, vect_load_store_data *ls)
> + vect_load_store_data *ls)
> {
> vect_memory_access_type *memory_access_type = &ls->memory_access_type;
> poly_int64 *poffset = &ls->poffset;
> @@ -2081,6 +2078,8 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
> internal_fn *lanes_ifn = &ls->lanes_ifn;
> vec<int> *elsvals = &ls->elsvals;
> tree *ls_type = &ls->ls_type;
> + bool *slp_perm = &ls->slp_perm;
> + unsigned *n_perms = &ls->n_perms;
> loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
> poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
> class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
> @@ -2093,6 +2092,15 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
> *misalignment = DR_MISALIGNMENT_UNKNOWN;
> *poffset = 0;
> *ls_type = NULL_TREE;
> + *slp_perm = false;
> + *n_perms = -1U;
> +
> + bool perm_ok = true;
> + poly_int64 vf = loop_vinfo ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) : 1;
> +
> + if (SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
> + perm_ok = vect_transform_slp_perm_load (vinfo, slp_node, vNULL, NULL,
> + vf, true, n_perms);
>
> if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
> {
> @@ -2131,6 +2139,10 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
> if (! SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
> first_dr_info = STMT_VINFO_DR_INFO (SLP_TREE_SCALAR_STMTS (slp_node)[0]);
>
> + /* Sub-type of gather/scatter. */
> + vect_gather_scatter_subtype gather_scatter_subtype
> + = GATHER_SCATTER_UNDEFINED;
> +
> if (STMT_VINFO_STRIDED_P (first_stmt_info))
> /* Try to use consecutive accesses of as many elements as possible,
> separated by the stride, until we have a complete vector.
> @@ -2181,6 +2193,8 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
> }
> *memory_access_type = VMAT_GATHER_SCATTER_EMULATED;
> }
> +
> + gather_scatter_subtype = GATHER_SCATTER_REGULAR;
> }
> else
> {
> @@ -2418,6 +2432,7 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
> ls->gs.ifn = gs_info.ifn;
> ls->strided_offset_vectype = gs_info.offset_vectype;
> *memory_access_type = VMAT_GATHER_SCATTER_IFN;
> + gather_scatter_subtype = GATHER_SCATTER_STRIDED;
> }
> else if (SLP_TREE_LANES (slp_node) > 1
> && !masked_p
> @@ -2431,6 +2446,7 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
> SLP_TREE_GS_BASE (slp_node) = error_mark_node;
> grouped_gather_fallback = *memory_access_type;
> *memory_access_type = VMAT_GATHER_SCATTER_IFN;
> + gather_scatter_subtype = GATHER_SCATTER_STRIDED_GROUPED;
> ls->gs.ifn = gs_info.ifn;
> vectype = *ls_type;
> ls->strided_offset_vectype = gs_info.offset_vectype;
> @@ -2534,7 +2550,7 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
> poly_uint64 read_amount
> = vf * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
> if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
> - read_amount *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
> + read_amount *= group_size;
>
> auto target_alignment
> = DR_TARGET_ALIGNMENT (STMT_VINFO_DR_INFO (stmt_info));
> @@ -2627,6 +2643,58 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
> if (!loop_vinfo && *memory_access_type == VMAT_ELEMENTWISE)
> return false;
>
> + /* Some loads need to explicitly permute the loaded data if there
> + is a load permutation. Among those are:
> + - VMAT_ELEMENTWISE.
> + - VMAT_STRIDED_SLP.
> + - VMAT_GATHER_SCATTER:
> + - Strided gather (fallback for VMAT_STRIDED_SLP if #lanes == 1).
> + - Grouped strided gather (ditto but for #lanes > 1).
> +
> + For VMAT_ELEMENTWISE we can fold the load permutation into the
> + individual indices we access directly, eliding the permutation.
> + Strided gather only allows load permutations for the
> + single-element case. */
> +
> + if (SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()
> + && !(*memory_access_type == VMAT_ELEMENTWISE
> + || gather_scatter_subtype == GATHER_SCATTER_STRIDED))
> + {
> + if (!loop_vinfo)
> + {
> + /* In BB vectorization we may not actually use a loaded vector
> + accessing elements in excess of DR_GROUP_SIZE. */
> + stmt_vec_info group_info = SLP_TREE_SCALAR_STMTS (slp_node)[0];
> + group_info = DR_GROUP_FIRST_ELEMENT (group_info);
> + unsigned HOST_WIDE_INT nunits;
> + unsigned j, k, maxk = 0;
> + FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (slp_node), j, k)
> + if (k > maxk)
> + maxk = k;
> + tree vectype = SLP_TREE_VECTYPE (slp_node);
> + if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
> + || maxk >= (DR_GROUP_SIZE (group_info) & ~(nunits - 1)))
> + {
> + if (dump_enabled_p ())
> + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> + "BB vectorization with gaps at the end of "
> + "a load is not supported\n");
> + return false;
> + }
> + }
> +
> + if (!perm_ok)
> + {
> + if (dump_enabled_p ())
> + dump_printf_loc (MSG_MISSED_OPTIMIZATION,
> + vect_location,
> + "unsupported load permutation\n");
> + return false;
> + }
> +
> + *slp_perm = true;
> + }
> +
> return true;
> }
>
> @@ -8002,7 +8070,7 @@ vectorizable_store (vec_info *vinfo,
> vect_load_store_data &ls = slp_node->get_data (_ls_data);
> if (cost_vec
> && !get_load_store_type (vinfo, stmt_info, vectype, slp_node, mask_node,
> - vls_type, false, &_ls_data))
> + vls_type, &_ls_data))
> return false;
> /* Temporary aliases to analysis data, should not be modified through
> these. */
> @@ -9446,7 +9514,6 @@ vectorizable_load (vec_info *vinfo,
> bool compute_in_loop = false;
> class loop *at_loop;
> int vec_num;
> - bool slp_perm = false;
> bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
> poly_uint64 vf;
> tree aggr_type;
> @@ -9584,17 +9651,11 @@ vectorizable_load (vec_info *vinfo,
> else
> group_size = 1;
>
> - bool perm_ok = true;
> - unsigned n_perms = -1U;
> - if (cost_vec && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
> - perm_ok = vect_transform_slp_perm_load (vinfo, slp_node, vNULL, NULL, vf,
> - true, &n_perms);
> -
> vect_load_store_data _ls_data{};
> vect_load_store_data &ls = slp_node->get_data (_ls_data);
> if (cost_vec
> && !get_load_store_type (vinfo, stmt_info, vectype, slp_node, mask_node,
> - VLS_LOAD, perm_ok, &ls))
> + VLS_LOAD, &ls))
> return false;
> /* Temporary aliases to analysis data, should not be modified through
> these. */
> @@ -9615,56 +9676,6 @@ vectorizable_load (vec_info *vinfo,
> bool type_mode_padding_p
> = TYPE_PRECISION (scalar_type) < GET_MODE_PRECISION (GET_MODE_INNER (mode));
>
> - /* ??? The following checks should really be part of
> - get_load_store_type. */
> - if (SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()
> - && !(memory_access_type == VMAT_ELEMENTWISE
> - || (mat_gather_scatter_p (memory_access_type)
> - && SLP_TREE_LANES (slp_node) == 1
> - && (!grouped_load
> - || !DR_GROUP_NEXT_ELEMENT (first_stmt_info)))))
> - {
> - slp_perm = true;
> -
> - if (!loop_vinfo && cost_vec)
> - {
> - /* In BB vectorization we may not actually use a loaded vector
> - accessing elements in excess of DR_GROUP_SIZE. */
> - stmt_vec_info group_info = SLP_TREE_SCALAR_STMTS (slp_node)[0];
> - group_info = DR_GROUP_FIRST_ELEMENT (group_info);
> - unsigned HOST_WIDE_INT nunits;
> - unsigned j, k, maxk = 0;
> - FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (slp_node), j, k)
> - if (k > maxk)
> - maxk = k;
> - tree vectype = SLP_TREE_VECTYPE (slp_node);
> - if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
> - || maxk >= (DR_GROUP_SIZE (group_info) & ~(nunits - 1)))
> - {
> - if (dump_enabled_p ())
> - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> - "BB vectorization with gaps at the end of "
> - "a load is not supported\n");
> - return false;
> - }
> - }
> -
> - if (cost_vec)
> - {
> - if (!perm_ok)
> - {
> - if (dump_enabled_p ())
> - dump_printf_loc (MSG_MISSED_OPTIMIZATION,
> - vect_location,
> - "unsupported load permutation\n");
> - return false;
> - }
> - ls.n_perms = n_perms;
> - }
> - else
> - n_perms = ls.n_perms;
> - }
> -
> if (slp_node->ldst_lanes
> && memory_access_type != VMAT_LOAD_STORE_LANES)
> {
> @@ -10019,7 +10030,7 @@ vectorizable_load (vec_info *vinfo,
> not only the number of vector stmts the permutation result
> fits in. */
> int ncopies;
> - if (slp_perm)
> + if (ls.slp_perm)
> {
> gcc_assert (memory_access_type != VMAT_ELEMENTWISE);
> /* We don't yet generate SLP_TREE_LOAD_PERMUTATIONs for
> @@ -10127,18 +10138,18 @@ vectorizable_load (vec_info *vinfo,
>
> if (!costing_p)
> {
> - if (slp_perm)
> + if (ls.slp_perm)
> dr_chain.quick_push (gimple_assign_lhs (new_stmt));
> else
> slp_node->push_vec_def (new_stmt);
> }
> }
> - if (slp_perm)
> + if (ls.slp_perm)
> {
> if (costing_p)
> {
> - gcc_assert (n_perms != -1U);
> - inside_cost += record_stmt_cost (cost_vec, n_perms, vec_perm,
> + gcc_assert (ls.n_perms != -1U);
> + inside_cost += record_stmt_cost (cost_vec, ls.n_perms, vec_perm,
> slp_node, 0, vect_body);
> }
> else
> @@ -10146,7 +10157,7 @@ vectorizable_load (vec_info *vinfo,
> unsigned n_perms2;
> vect_transform_slp_perm_load (vinfo, slp_node, dr_chain, gsi, vf,
> false, &n_perms2);
> - gcc_assert (n_perms == n_perms2);
> + gcc_assert (ls.n_perms == n_perms2);
> }
> }
>
> @@ -10211,7 +10222,7 @@ vectorizable_load (vec_info *vinfo,
> instead the access is contiguous but it might be
> permuted. No gap adjustment is needed though. */
> ;
> - else if (slp_perm
> + else if (ls.slp_perm
> && (group_size != scalar_lanes
> || !multiple_p (nunits, group_size)))
> {
> @@ -10560,7 +10571,7 @@ vectorizable_load (vec_info *vinfo,
>
> if (mat_gather_scatter_p (memory_access_type))
> {
> - gcc_assert ((!grouped_load && !slp_perm) || ls.ls_type);
> + gcc_assert ((!grouped_load && !ls.slp_perm) || ls.ls_type);
>
> /* If we pun the original vectype the loads as well as costing, length,
> etc. is performed with the new type. After loading we VIEW_CONVERT
> @@ -10922,14 +10933,14 @@ vectorizable_load (vec_info *vinfo,
> /* Store vector loads in the corresponding SLP_NODE. */
> if (!costing_p)
> {
> - if (slp_perm)
> + if (ls.slp_perm)
> dr_chain.quick_push (gimple_assign_lhs (new_stmt));
> else
> slp_node->push_vec_def (new_stmt);
> }
> }
>
> - if (slp_perm)
> + if (ls.slp_perm)
> {
> if (costing_p)
> {
> @@ -11026,7 +11037,7 @@ vectorizable_load (vec_info *vinfo,
> stmt_info, bump);
> }
>
> - if (grouped_load || slp_perm)
> + if (grouped_load || ls.slp_perm)
> dr_chain.create (vec_num);
>
> gimple *new_stmt = NULL;
> @@ -11523,11 +11534,11 @@ vectorizable_load (vec_info *vinfo,
>
> /* Collect vector loads and later create their permutation in
> vect_transform_slp_perm_load. */
> - if (!costing_p && (grouped_load || slp_perm))
> + if (!costing_p && (grouped_load || ls.slp_perm))
> dr_chain.quick_push (new_temp);
>
> /* Store vector loads in the corresponding SLP_NODE. */
> - if (!costing_p && !slp_perm)
> + if (!costing_p && !ls.slp_perm)
> slp_node->push_vec_def (new_stmt);
>
> /* With SLP permutation we load the gaps as well, without
> @@ -11536,7 +11547,7 @@ vectorizable_load (vec_info *vinfo,
> group_elt += nunits;
> if (!costing_p
> && maybe_ne (group_gap_adj, 0U)
> - && !slp_perm
> + && !ls.slp_perm
> && known_eq (group_elt, group_size - group_gap_adj))
> {
> poly_wide_int bump_val
> @@ -11553,7 +11564,7 @@ vectorizable_load (vec_info *vinfo,
> elements loaded for a permuted SLP load. */
> if (!costing_p
> && maybe_ne (group_gap_adj, 0U)
> - && slp_perm)
> + && ls.slp_perm)
> {
> poly_wide_int bump_val
> = (wi::to_wide (TYPE_SIZE_UNIT (elem_type)) * group_gap_adj);
> @@ -11564,7 +11575,7 @@ vectorizable_load (vec_info *vinfo,
> stmt_info, bump);
> }
>
> - if (slp_perm)
> + if (ls.slp_perm)
> {
> /* For SLP we know we've seen all possible uses of dr_chain so
> direct vect_transform_slp_perm_load to DCE the unused parts.
> @@ -11572,9 +11583,9 @@ vectorizable_load (vec_info *vinfo,
> in PR101120 and friends. */
> if (costing_p)
> {
> - gcc_assert (n_perms != -1U);
> - if (n_perms != 0)
> - inside_cost = record_stmt_cost (cost_vec, n_perms, vec_perm,
> + gcc_assert (ls.n_perms != -1U);
> + if (ls.n_perms != 0)
> + inside_cost = record_stmt_cost (cost_vec, ls.n_perms, vec_perm,
> slp_node, 0, vect_body);
> }
> else
> @@ -11583,7 +11594,7 @@ vectorizable_load (vec_info *vinfo,
> bool ok = vect_transform_slp_perm_load (vinfo, slp_node, dr_chain,
> gsi, vf, false, &n_perms2,
> nullptr, true);
> - gcc_assert (ok && n_perms == n_perms2);
> + gcc_assert (ok && ls.n_perms == n_perms2);
> }
> dr_chain.release ();
> }
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> index 52bc0d672bf..d2c5f2ba51f 100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -209,6 +209,13 @@ enum vect_memory_access_type {
> VMAT_GATHER_SCATTER_EMULATED
> };
>
> +enum vect_gather_scatter_subtype {
> + GATHER_SCATTER_UNDEFINED,
> + GATHER_SCATTER_REGULAR,
> + GATHER_SCATTER_STRIDED,
> + GATHER_SCATTER_STRIDED_GROUPED
> +};
> +
> /* Returns whether MAT is any of the VMAT_GATHER_SCATTER_* kinds. */
>
> inline bool
> @@ -290,6 +297,8 @@ struct vect_load_store_data : vect_data {
> tree strided_offset_vectype; // VMAT_GATHER_SCATTER_IFN, originally strided
> tree ls_type; // VMAT_GATHER_SCATTER_IFN
> auto_vec<int> elsvals;
> + /* True if the load requires a load permutation. */
> + bool slp_perm; // SLP_TREE_LOAD_PERMUTATION
> unsigned n_perms; // SLP_TREE_LOAD_PERMUTATION
> };
>
> --
> 2.51.0
>