On Mon, Oct 13, 2025 at 11:47 AM Robin Dapp <[email protected]> wrote:
>
> Hi,
>
> this is a refactor that moves the setting of slp_perm, checking of basic-block
> SLP gaps, as well as the final check for perm_ok into get_load_store_type.
> Also, slp_perm is moved to ls_data.

Thanks - I wanted to move this as well.

> In order to help legibility, the patch introduces an enum
> vect_gather_scatter_subtype that specifies the gather/scatter
> "sub-type", such as strided or strided_grouped.  It's only used in one
> place for now, though.

Hmm, how about instead using VMAT_GATHER_SCATTER_STRIDED and
VMAT_GATHER_SCATTER_GROUPED_STRIDED (though I'm not sure why we need to
distinguish the latter via a separate sub-kind)?  I don't like splitting
up VMAT_* unless really necessary.
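
Something like the following is what I have in mind.  Note this is just
a sketch of the naming on top of the existing kinds; where exactly the
new values slot in is up for debate and I haven't verified all users:

  enum vect_memory_access_type {
    /* ... existing kinds ... */
    VMAT_GATHER_SCATTER_IFN,
    /* Strided access implemented as an IFN gather/scatter.  */
    VMAT_GATHER_SCATTER_STRIDED,
    /* Grouped strided access implemented as an IFN gather/scatter.  */
    VMAT_GATHER_SCATTER_GROUPED_STRIDED,
    VMAT_GATHER_SCATTER_EMULATED
  };

mat_gather_scatter_p should then cover the new kinds as well, and the
places that now set gather_scatter_subtype could check
*memory_access_type directly.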

> Bootstrapped and regtested on x86 and power10.  Regtested on aarch64 and
> rv64gcv_zvl512b.
>
> Regards
>  Robin
>
> gcc/ChangeLog:
>
>         * tree-vect-stmts.cc (get_load_store_type): Add load-permutation
>         checks and setting of slp_perm.
>         (vectorizable_store): Remove perm_ok argument.
>         (vectorizable_load): Ditto and replace slp_perm by ls.slp_perm.
>         * tree-vectorizer.h (enum vect_gather_scatter_subtype): New.
>         (struct vect_load_store_data): Add slp_perm.
> ---
>  gcc/tree-vect-stmts.cc | 181 ++++++++++++++++++++++-------------------
>  gcc/tree-vectorizer.h  |   9 ++
>  2 files changed, 105 insertions(+), 85 deletions(-)
>
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index bf5a67bf805..ef33638af74 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -2062,16 +2062,13 @@ vector_vector_composition_type (tree vtype, poly_uint64 nelts, tree *ptype,
>     VECTYPE is the vector type that the vectorized statements will use.
>
>     If ELSVALS is nonzero the supported else values will be stored in the
> -   vector ELSVALS points to.
> -
> -   For loads PERM_OK indicates whether we can code generate a
> -   SLP_TREE_LOAD_PERMUTATION on the node.  */
> +   vector ELSVALS points to.  */
>
>  static bool
>  get_load_store_type (vec_info  *vinfo, stmt_vec_info stmt_info,
>                      tree vectype, slp_tree slp_node,
>                      bool masked_p, vec_load_store_type vls_type,
> -                    bool perm_ok, vect_load_store_data *ls)
> +                    vect_load_store_data *ls)
>  {
>    vect_memory_access_type *memory_access_type = &ls->memory_access_type;
>    poly_int64 *poffset = &ls->poffset;
> @@ -2081,6 +2078,8 @@ get_load_store_type (vec_info  *vinfo, stmt_vec_info stmt_info,
>    internal_fn *lanes_ifn = &ls->lanes_ifn;
>    vec<int> *elsvals = &ls->elsvals;
>    tree *ls_type = &ls->ls_type;
> +  bool *slp_perm = &ls->slp_perm;
> +  unsigned *n_perms = &ls->n_perms;
>    loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
>    poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
>    class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
> @@ -2093,6 +2092,15 @@ get_load_store_type (vec_info  *vinfo, stmt_vec_info stmt_info,
>    *misalignment = DR_MISALIGNMENT_UNKNOWN;
>    *poffset = 0;
>    *ls_type = NULL_TREE;
> +  *slp_perm = false;
> +  *n_perms = -1U;
> +
> +  bool perm_ok = true;
> +  poly_int64 vf = loop_vinfo ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) : 1;
> +
> +  if (SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
> +    perm_ok = vect_transform_slp_perm_load (vinfo, slp_node, vNULL, NULL,
> +                                           vf, true, n_perms);
>
>    if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
>      {
> @@ -2131,6 +2139,10 @@ get_load_store_type (vec_info  *vinfo, stmt_vec_info stmt_info,
>    if (! SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
>      first_dr_info = STMT_VINFO_DR_INFO (SLP_TREE_SCALAR_STMTS (slp_node)[0]);
>
> +  /* Sub-type of gather/scatter.  */
> +  vect_gather_scatter_subtype gather_scatter_subtype
> +    = GATHER_SCATTER_UNDEFINED;
> +
>    if (STMT_VINFO_STRIDED_P (first_stmt_info))
>      /* Try to use consecutive accesses of as many elements as possible,
>         separated by the stride, until we have a complete vector.
> @@ -2181,6 +2193,8 @@ get_load_store_type (vec_info  *vinfo, stmt_vec_info stmt_info,
>             }
>           *memory_access_type = VMAT_GATHER_SCATTER_EMULATED;
>         }
> +
> +      gather_scatter_subtype = GATHER_SCATTER_REGULAR;
>      }
>    else
>      {
> @@ -2418,6 +2432,7 @@ get_load_store_type (vec_info  *vinfo, stmt_vec_info stmt_info,
>           ls->gs.ifn = gs_info.ifn;
>           ls->strided_offset_vectype = gs_info.offset_vectype;
>           *memory_access_type = VMAT_GATHER_SCATTER_IFN;
> +         gather_scatter_subtype = GATHER_SCATTER_STRIDED;
>         }
>        else if (SLP_TREE_LANES (slp_node) > 1
>                && !masked_p
> @@ -2431,6 +2446,7 @@ get_load_store_type (vec_info  *vinfo, stmt_vec_info stmt_info,
>           SLP_TREE_GS_BASE (slp_node) = error_mark_node;
>           grouped_gather_fallback = *memory_access_type;
>           *memory_access_type = VMAT_GATHER_SCATTER_IFN;
> +         gather_scatter_subtype = GATHER_SCATTER_STRIDED_GROUPED;
>           ls->gs.ifn = gs_info.ifn;
>           vectype = *ls_type;
>           ls->strided_offset_vectype = gs_info.offset_vectype;
> @@ -2534,7 +2550,7 @@ get_load_store_type (vec_info  *vinfo, stmt_vec_info stmt_info,
>        poly_uint64 read_amount
>         = vf * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
>        if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
> -       read_amount *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
> +       read_amount *= group_size;
>
>        auto target_alignment
>         = DR_TARGET_ALIGNMENT (STMT_VINFO_DR_INFO (stmt_info));
> @@ -2627,6 +2643,58 @@ get_load_store_type (vec_info  *vinfo, stmt_vec_info stmt_info,
>    if (!loop_vinfo && *memory_access_type == VMAT_ELEMENTWISE)
>      return false;
>
> +  /* Some loads need to explicitly permute the loaded data if there
> +     is a load permutation.  Among those are:
> +      - VMAT_ELEMENTWISE.
> +      - VMAT_STRIDED_SLP.
> +      - VMAT_GATHER_SCATTER:
> +       - Strided gather (fallback for VMAT_STRIDED_SLP if #lanes == 1).
> +       - Grouped strided gather (ditto but for #lanes > 1).
> +
> +     For VMAT_ELEMENTWISE we can fold the load permutation into the
> +     individual indices we access directly, eliding the permutation.
> +     Strided gather only allows load permutations for the
> +     single-element case.  */
> +
> +  if (SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()
> +      && !(*memory_access_type == VMAT_ELEMENTWISE
> +          || gather_scatter_subtype == GATHER_SCATTER_STRIDED))
> +    {
> +      if (!loop_vinfo)
> +       {
> +         /* In BB vectorization we may not actually use a loaded vector
> +            accessing elements in excess of DR_GROUP_SIZE.  */
> +         stmt_vec_info group_info = SLP_TREE_SCALAR_STMTS (slp_node)[0];
> +         group_info = DR_GROUP_FIRST_ELEMENT (group_info);
> +         unsigned HOST_WIDE_INT nunits;
> +         unsigned j, k, maxk = 0;
> +         FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (slp_node), j, k)
> +           if (k > maxk)
> +             maxk = k;
> +         tree vectype = SLP_TREE_VECTYPE (slp_node);
> +         if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
> +             || maxk >= (DR_GROUP_SIZE (group_info) & ~(nunits - 1)))
> +           {
> +             if (dump_enabled_p ())
> +               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +                                "BB vectorization with gaps at the end of "
> +                                "a load is not supported\n");
> +             return false;
> +           }
> +       }
> +
> +      if (!perm_ok)
> +       {
> +         if (dump_enabled_p ())
> +           dump_printf_loc (MSG_MISSED_OPTIMIZATION,
> +                            vect_location,
> +                            "unsupported load permutation\n");
> +         return false;
> +       }
> +
> +      *slp_perm = true;
> +    }
> +
>    return true;
>  }
>
> @@ -8002,7 +8070,7 @@ vectorizable_store (vec_info *vinfo,
>    vect_load_store_data &ls = slp_node->get_data (_ls_data);
>    if (cost_vec
>        && !get_load_store_type (vinfo, stmt_info, vectype, slp_node, mask_node,
> -                              vls_type, false, &_ls_data))
> +                              vls_type, &_ls_data))
>      return false;
>    /* Temporary aliases to analysis data, should not be modified through
>       these.  */
> @@ -9446,7 +9514,6 @@ vectorizable_load (vec_info *vinfo,
>    bool compute_in_loop = false;
>    class loop *at_loop;
>    int vec_num;
> -  bool slp_perm = false;
>    bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
>    poly_uint64 vf;
>    tree aggr_type;
> @@ -9584,17 +9651,11 @@ vectorizable_load (vec_info *vinfo,
>    else
>      group_size = 1;
>
> -  bool perm_ok = true;
> -  unsigned n_perms = -1U;
> -  if (cost_vec && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
> -    perm_ok = vect_transform_slp_perm_load (vinfo, slp_node, vNULL, NULL, vf,
> -                                           true, &n_perms);
> -
>    vect_load_store_data _ls_data{};
>    vect_load_store_data &ls = slp_node->get_data (_ls_data);
>    if (cost_vec
>        && !get_load_store_type (vinfo, stmt_info, vectype, slp_node, mask_node,
> -                              VLS_LOAD, perm_ok, &ls))
> +                              VLS_LOAD, &ls))
>      return false;
>    /* Temporary aliases to analysis data, should not be modified through
>       these.  */
> @@ -9615,56 +9676,6 @@ vectorizable_load (vec_info *vinfo,
>    bool type_mode_padding_p
>      = TYPE_PRECISION (scalar_type) < GET_MODE_PRECISION (GET_MODE_INNER (mode));
>
> -  /* ???  The following checks should really be part of
> -     get_load_store_type.  */
> -  if (SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()
> -      && !(memory_access_type == VMAT_ELEMENTWISE
> -          || (mat_gather_scatter_p (memory_access_type)
> -              && SLP_TREE_LANES (slp_node) == 1
> -              && (!grouped_load
> -                  || !DR_GROUP_NEXT_ELEMENT (first_stmt_info)))))
> -    {
> -      slp_perm = true;
> -
> -      if (!loop_vinfo && cost_vec)
> -       {
> -         /* In BB vectorization we may not actually use a loaded vector
> -            accessing elements in excess of DR_GROUP_SIZE.  */
> -         stmt_vec_info group_info = SLP_TREE_SCALAR_STMTS (slp_node)[0];
> -         group_info = DR_GROUP_FIRST_ELEMENT (group_info);
> -         unsigned HOST_WIDE_INT nunits;
> -         unsigned j, k, maxk = 0;
> -         FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (slp_node), j, k)
> -             if (k > maxk)
> -               maxk = k;
> -         tree vectype = SLP_TREE_VECTYPE (slp_node);
> -         if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
> -             || maxk >= (DR_GROUP_SIZE (group_info) & ~(nunits - 1)))
> -           {
> -             if (dump_enabled_p ())
> -               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> -                                "BB vectorization with gaps at the end of "
> -                                "a load is not supported\n");
> -             return false;
> -           }
> -       }
> -
> -      if (cost_vec)
> -       {
> -         if (!perm_ok)
> -           {
> -             if (dump_enabled_p ())
> -               dump_printf_loc (MSG_MISSED_OPTIMIZATION,
> -                                vect_location,
> -                                "unsupported load permutation\n");
> -             return false;
> -           }
> -         ls.n_perms = n_perms;
> -       }
> -      else
> -       n_perms = ls.n_perms;
> -    }
> -
>    if (slp_node->ldst_lanes
>        && memory_access_type != VMAT_LOAD_STORE_LANES)
>      {
> @@ -10019,7 +10030,7 @@ vectorizable_load (vec_info *vinfo,
>          not only the number of vector stmts the permutation result
>          fits in.  */
>        int ncopies;
> -      if (slp_perm)
> +      if (ls.slp_perm)
>         {
>           gcc_assert (memory_access_type != VMAT_ELEMENTWISE);
>           /* We don't yet generate SLP_TREE_LOAD_PERMUTATIONs for
> @@ -10127,18 +10138,18 @@ vectorizable_load (vec_info *vinfo,
>
>           if (!costing_p)
>             {
> -             if (slp_perm)
> +             if (ls.slp_perm)
>                 dr_chain.quick_push (gimple_assign_lhs (new_stmt));
>               else
>                 slp_node->push_vec_def (new_stmt);
>             }
>         }
> -      if (slp_perm)
> +      if (ls.slp_perm)
>         {
>           if (costing_p)
>             {
> -             gcc_assert (n_perms != -1U);
> -             inside_cost += record_stmt_cost (cost_vec, n_perms, vec_perm,
> +             gcc_assert (ls.n_perms != -1U);
> +             inside_cost += record_stmt_cost (cost_vec, ls.n_perms, vec_perm,
>                                                slp_node, 0, vect_body);
>             }
>           else
> @@ -10146,7 +10157,7 @@ vectorizable_load (vec_info *vinfo,
>               unsigned n_perms2;
>               vect_transform_slp_perm_load (vinfo, slp_node, dr_chain, gsi, vf,
>                                             false, &n_perms2);
> -             gcc_assert (n_perms == n_perms2);
> +             gcc_assert (ls.n_perms == n_perms2);
>             }
>         }
>
> @@ -10211,7 +10222,7 @@ vectorizable_load (vec_info *vinfo,
>            instead the access is contiguous but it might be
>            permuted.  No gap adjustment is needed though.  */
>         ;
> -      else if (slp_perm
> +      else if (ls.slp_perm
>                && (group_size != scalar_lanes
>                    || !multiple_p (nunits, group_size)))
>         {
> @@ -10560,7 +10571,7 @@ vectorizable_load (vec_info *vinfo,
>
>    if (mat_gather_scatter_p (memory_access_type))
>      {
> -      gcc_assert ((!grouped_load && !slp_perm) || ls.ls_type);
> +      gcc_assert ((!grouped_load && !ls.slp_perm) || ls.ls_type);
>
>        /* If we pun the original vectype the loads as well as costing, length,
>          etc. is performed with the new type.  After loading we VIEW_CONVERT
> @@ -10922,14 +10933,14 @@ vectorizable_load (vec_info *vinfo,
>           /* Store vector loads in the corresponding SLP_NODE.  */
>           if (!costing_p)
>             {
> -             if (slp_perm)
> +             if (ls.slp_perm)
>                 dr_chain.quick_push (gimple_assign_lhs (new_stmt));
>               else
>                 slp_node->push_vec_def (new_stmt);
>             }
>         }
>
> -      if (slp_perm)
> +      if (ls.slp_perm)
>         {
>           if (costing_p)
>             {
> @@ -11026,7 +11037,7 @@ vectorizable_load (vec_info *vinfo,
>                                        stmt_info, bump);
>      }
>
> -  if (grouped_load || slp_perm)
> +  if (grouped_load || ls.slp_perm)
>      dr_chain.create (vec_num);
>
>    gimple *new_stmt = NULL;
> @@ -11523,11 +11534,11 @@ vectorizable_load (vec_info *vinfo,
>
>        /* Collect vector loads and later create their permutation in
>          vect_transform_slp_perm_load.  */
> -      if (!costing_p && (grouped_load || slp_perm))
> +      if (!costing_p && (grouped_load || ls.slp_perm))
>         dr_chain.quick_push (new_temp);
>
>        /* Store vector loads in the corresponding SLP_NODE.  */
> -      if (!costing_p && !slp_perm)
> +      if (!costing_p && !ls.slp_perm)
>         slp_node->push_vec_def (new_stmt);
>
>        /* With SLP permutation we load the gaps as well, without
> @@ -11536,7 +11547,7 @@ vectorizable_load (vec_info *vinfo,
>        group_elt += nunits;
>        if (!costing_p
>           && maybe_ne (group_gap_adj, 0U)
> -         && !slp_perm
> +         && !ls.slp_perm
>           && known_eq (group_elt, group_size - group_gap_adj))
>         {
>           poly_wide_int bump_val
> @@ -11553,7 +11564,7 @@ vectorizable_load (vec_info *vinfo,
>       elements loaded for a permuted SLP load.  */
>    if (!costing_p
>        && maybe_ne (group_gap_adj, 0U)
> -      && slp_perm)
> +      && ls.slp_perm)
>      {
>        poly_wide_int bump_val
>         = (wi::to_wide (TYPE_SIZE_UNIT (elem_type)) * group_gap_adj);
> @@ -11564,7 +11575,7 @@ vectorizable_load (vec_info *vinfo,
>                                      stmt_info, bump);
>      }
>
> -  if (slp_perm)
> +  if (ls.slp_perm)
>      {
>        /* For SLP we know we've seen all possible uses of dr_chain so
>          direct vect_transform_slp_perm_load to DCE the unused parts.
> @@ -11572,9 +11583,9 @@ vectorizable_load (vec_info *vinfo,
>          in PR101120 and friends.  */
>        if (costing_p)
>         {
> -         gcc_assert (n_perms != -1U);
> -         if (n_perms != 0)
> -           inside_cost = record_stmt_cost (cost_vec, n_perms, vec_perm,
> +         gcc_assert (ls.n_perms != -1U);
> +         if (ls.n_perms != 0)
> +           inside_cost = record_stmt_cost (cost_vec, ls.n_perms, vec_perm,
>                                             slp_node, 0, vect_body);
>         }
>        else
> @@ -11583,7 +11594,7 @@ vectorizable_load (vec_info *vinfo,
>           bool ok = vect_transform_slp_perm_load (vinfo, slp_node, dr_chain,
>                                                   gsi, vf, false, &n_perms2,
>                                                   nullptr, true);
> -         gcc_assert (ok && n_perms == n_perms2);
> +         gcc_assert (ok && ls.n_perms == n_perms2);
>         }
>        dr_chain.release ();
>      }
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> index 52bc0d672bf..d2c5f2ba51f 100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -209,6 +209,13 @@ enum vect_memory_access_type {
>    VMAT_GATHER_SCATTER_EMULATED
>  };
>
> +enum vect_gather_scatter_subtype {
> +    GATHER_SCATTER_UNDEFINED,
> +    GATHER_SCATTER_REGULAR,
> +    GATHER_SCATTER_STRIDED,
> +    GATHER_SCATTER_STRIDED_GROUPED
> +};
> +
>  /* Returns whether MAT is any of the VMAT_GATHER_SCATTER_* kinds.  */
>
>  inline bool
> @@ -290,6 +297,8 @@ struct vect_load_store_data : vect_data {
>    tree strided_offset_vectype; // VMAT_GATHER_SCATTER_IFN, originally strided
>    tree ls_type; // VMAT_GATHER_SCATTER_IFN
>    auto_vec<int> elsvals;
> +  /* True if the load requires a load permutation.  */
> +  bool slp_perm;    // SLP_TREE_LOAD_PERMUTATION
>    unsigned n_perms; // SLP_TREE_LOAD_PERMUTATION
>  };
>
> --
> 2.51.0
>
