Hi,
this is a refactor that moves the setting of slp_perm, the basic-block
SLP gap check, and the final perm_ok check from vectorizable_load into
get_load_store_type. slp_perm itself is moved into ls_data.
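For reference, a condensed sketch of the caller side in vectorizable_load
after the change (assembled from the hunks below, not additional code):
the permutation analysis now happens inside get_load_store_type, and the
transform/costing code only reads ls.slp_perm and ls.n_perms.

  vect_load_store_data _ls_data{};
  vect_load_store_data &ls = slp_node->get_data (_ls_data);
  if (cost_vec
      && !get_load_store_type (vinfo, stmt_info, vectype, slp_node, mask_node,
                               VLS_LOAD, &ls))
    return false;
  ...
  if (ls.slp_perm && costing_p)
    {
      /* The permutation count was already determined during analysis.  */
      gcc_assert (ls.n_perms != -1U);
      inside_cost += record_stmt_cost (cost_vec, ls.n_perms, vec_perm,
                                       slp_node, 0, vect_body);
    }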
To help legibility, the patch introduces an enum
vect_gather_scatter_subtype that specifies the gather/scatter
"sub-type", e.g. strided or grouped strided. It is only used in one
place for now, though.
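The enum and the single condition that consults it so far look like this
(copied in condensed form from the hunks below):

  enum vect_gather_scatter_subtype {
    GATHER_SCATTER_UNDEFINED,
    GATHER_SCATTER_REGULAR,
    GATHER_SCATTER_STRIDED,
    GATHER_SCATTER_STRIDED_GROUPED
  };

  /* Only loads that are neither elementwise nor strided gathers
     (the single-lane fallback) need an explicit load permutation.  */
  if (SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()
      && !(*memory_access_type == VMAT_ELEMENTWISE
           || gather_scatter_subtype == GATHER_SCATTER_STRIDED))
    ...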
Bootstrapped and regtested on x86 and power10. Regtested on aarch64 and
rv64gcv_zvl512b.
Regards
Robin
gcc/ChangeLog:
* tree-vect-stmts.cc (get_load_store_type): Remove perm_ok parameter.
Add load-permutation checks and set slp_perm.
(vectorizable_store): Don't pass perm_ok.
(vectorizable_load): Ditto and replace slp_perm with ls.slp_perm.
* tree-vectorizer.h (enum vect_gather_scatter_subtype): New.
(struct vect_load_store_data): Add slp_perm.
---
gcc/tree-vect-stmts.cc | 181 ++++++++++++++++++++++-------------------
gcc/tree-vectorizer.h | 9 ++
2 files changed, 105 insertions(+), 85 deletions(-)
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index bf5a67bf805..ef33638af74 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -2062,16 +2062,13 @@ vector_vector_composition_type (tree vtype, poly_uint64 nelts, tree *ptype,
VECTYPE is the vector type that the vectorized statements will use.
If ELSVALS is nonzero the supported else values will be stored in the
- vector ELSVALS points to.
-
- For loads PERM_OK indicates whether we can code generate a
- SLP_TREE_LOAD_PERMUTATION on the node. */
+ vector ELSVALS points to. */
static bool
get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
tree vectype, slp_tree slp_node,
bool masked_p, vec_load_store_type vls_type,
- bool perm_ok, vect_load_store_data *ls)
+ vect_load_store_data *ls)
{
vect_memory_access_type *memory_access_type = &ls->memory_access_type;
poly_int64 *poffset = &ls->poffset;
@@ -2081,6 +2078,8 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
internal_fn *lanes_ifn = &ls->lanes_ifn;
vec<int> *elsvals = &ls->elsvals;
tree *ls_type = &ls->ls_type;
+ bool *slp_perm = &ls->slp_perm;
+ unsigned *n_perms = &ls->n_perms;
loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
@@ -2093,6 +2092,15 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
*misalignment = DR_MISALIGNMENT_UNKNOWN;
*poffset = 0;
*ls_type = NULL_TREE;
+ *slp_perm = false;
+ *n_perms = -1U;
+
+ bool perm_ok = true;
+ poly_int64 vf = loop_vinfo ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) : 1;
+
+ if (SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
+ perm_ok = vect_transform_slp_perm_load (vinfo, slp_node, vNULL, NULL,
+ vf, true, n_perms);
if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
{
@@ -2131,6 +2139,10 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
if (! SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
first_dr_info = STMT_VINFO_DR_INFO (SLP_TREE_SCALAR_STMTS (slp_node)[0]);
+ /* Sub-type of gather/scatter. */
+ vect_gather_scatter_subtype gather_scatter_subtype
+ = GATHER_SCATTER_UNDEFINED;
+
if (STMT_VINFO_STRIDED_P (first_stmt_info))
/* Try to use consecutive accesses of as many elements as possible,
separated by the stride, until we have a complete vector.
@@ -2181,6 +2193,8 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
}
*memory_access_type = VMAT_GATHER_SCATTER_EMULATED;
}
+
+ gather_scatter_subtype = GATHER_SCATTER_REGULAR;
}
else
{
@@ -2418,6 +2432,7 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
ls->gs.ifn = gs_info.ifn;
ls->strided_offset_vectype = gs_info.offset_vectype;
*memory_access_type = VMAT_GATHER_SCATTER_IFN;
+ gather_scatter_subtype = GATHER_SCATTER_STRIDED;
}
else if (SLP_TREE_LANES (slp_node) > 1
&& !masked_p
@@ -2431,6 +2446,7 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
SLP_TREE_GS_BASE (slp_node) = error_mark_node;
grouped_gather_fallback = *memory_access_type;
*memory_access_type = VMAT_GATHER_SCATTER_IFN;
+ gather_scatter_subtype = GATHER_SCATTER_STRIDED_GROUPED;
ls->gs.ifn = gs_info.ifn;
vectype = *ls_type;
ls->strided_offset_vectype = gs_info.offset_vectype;
@@ -2534,7 +2550,7 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
poly_uint64 read_amount
= vf * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
- read_amount *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
+ read_amount *= group_size;
auto target_alignment
= DR_TARGET_ALIGNMENT (STMT_VINFO_DR_INFO (stmt_info));
@@ -2627,6 +2643,58 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
if (!loop_vinfo && *memory_access_type == VMAT_ELEMENTWISE)
return false;
+ /* Some loads need to explicitly permute the loaded data if there
+ is a load permutation. Among those are:
+ - VMAT_ELEMENTWISE.
+ - VMAT_STRIDED_SLP.
+ - VMAT_GATHER_SCATTER:
+ - Strided gather (fallback for VMAT_STRIDED_SLP if #lanes == 1).
+ - Grouped strided gather (ditto but for #lanes > 1).
+
+ For VMAT_ELEMENTWISE we can fold the load permutation into the
+ individual indices we access directly, eliding the permutation.
+ Strided gather only allows load permutations for the
+ single-element case. */
+
+ if (SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()
+ && !(*memory_access_type == VMAT_ELEMENTWISE
+ || gather_scatter_subtype == GATHER_SCATTER_STRIDED))
+ {
+ if (!loop_vinfo)
+ {
+ /* In BB vectorization we may not actually use a loaded vector
+ accessing elements in excess of DR_GROUP_SIZE. */
+ stmt_vec_info group_info = SLP_TREE_SCALAR_STMTS (slp_node)[0];
+ group_info = DR_GROUP_FIRST_ELEMENT (group_info);
+ unsigned HOST_WIDE_INT nunits;
+ unsigned j, k, maxk = 0;
+ FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (slp_node), j, k)
+ if (k > maxk)
+ maxk = k;
+ tree vectype = SLP_TREE_VECTYPE (slp_node);
+ if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
+ || maxk >= (DR_GROUP_SIZE (group_info) & ~(nunits - 1)))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "BB vectorization with gaps at the end of "
+ "a load is not supported\n");
+ return false;
+ }
+ }
+
+ if (!perm_ok)
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION,
+ vect_location,
+ "unsupported load permutation\n");
+ return false;
+ }
+
+ *slp_perm = true;
+ }
+
return true;
}
@@ -8002,7 +8070,7 @@ vectorizable_store (vec_info *vinfo,
vect_load_store_data &ls = slp_node->get_data (_ls_data);
if (cost_vec
&& !get_load_store_type (vinfo, stmt_info, vectype, slp_node, mask_node,
- vls_type, false, &_ls_data))
+ vls_type, &_ls_data))
return false;
/* Temporary aliases to analysis data, should not be modified through
these. */
@@ -9446,7 +9514,6 @@ vectorizable_load (vec_info *vinfo,
bool compute_in_loop = false;
class loop *at_loop;
int vec_num;
- bool slp_perm = false;
bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
poly_uint64 vf;
tree aggr_type;
@@ -9584,17 +9651,11 @@ vectorizable_load (vec_info *vinfo,
else
group_size = 1;
- bool perm_ok = true;
- unsigned n_perms = -1U;
- if (cost_vec && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
- perm_ok = vect_transform_slp_perm_load (vinfo, slp_node, vNULL, NULL, vf,
- true, &n_perms);
-
vect_load_store_data _ls_data{};
vect_load_store_data &ls = slp_node->get_data (_ls_data);
if (cost_vec
&& !get_load_store_type (vinfo, stmt_info, vectype, slp_node, mask_node,
- VLS_LOAD, perm_ok, &ls))
+ VLS_LOAD, &ls))
return false;
/* Temporary aliases to analysis data, should not be modified through
these. */
@@ -9615,56 +9676,6 @@ vectorizable_load (vec_info *vinfo,
bool type_mode_padding_p
= TYPE_PRECISION (scalar_type) < GET_MODE_PRECISION (GET_MODE_INNER (mode));
- /* ??? The following checks should really be part of
- get_load_store_type. */
- if (SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()
- && !(memory_access_type == VMAT_ELEMENTWISE
- || (mat_gather_scatter_p (memory_access_type)
- && SLP_TREE_LANES (slp_node) == 1
- && (!grouped_load
- || !DR_GROUP_NEXT_ELEMENT (first_stmt_info)))))
- {
- slp_perm = true;
-
- if (!loop_vinfo && cost_vec)
- {
- /* In BB vectorization we may not actually use a loaded vector
- accessing elements in excess of DR_GROUP_SIZE. */
- stmt_vec_info group_info = SLP_TREE_SCALAR_STMTS (slp_node)[0];
- group_info = DR_GROUP_FIRST_ELEMENT (group_info);
- unsigned HOST_WIDE_INT nunits;
- unsigned j, k, maxk = 0;
- FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (slp_node), j, k)
- if (k > maxk)
- maxk = k;
- tree vectype = SLP_TREE_VECTYPE (slp_node);
- if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
- || maxk >= (DR_GROUP_SIZE (group_info) & ~(nunits - 1)))
- {
- if (dump_enabled_p ())
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- "BB vectorization with gaps at the end of "
- "a load is not supported\n");
- return false;
- }
- }
-
- if (cost_vec)
- {
- if (!perm_ok)
- {
- if (dump_enabled_p ())
- dump_printf_loc (MSG_MISSED_OPTIMIZATION,
- vect_location,
- "unsupported load permutation\n");
- return false;
- }
- ls.n_perms = n_perms;
- }
- else
- n_perms = ls.n_perms;
- }
-
if (slp_node->ldst_lanes
&& memory_access_type != VMAT_LOAD_STORE_LANES)
{
@@ -10019,7 +10030,7 @@ vectorizable_load (vec_info *vinfo,
not only the number of vector stmts the permutation result
fits in. */
int ncopies;
- if (slp_perm)
+ if (ls.slp_perm)
{
gcc_assert (memory_access_type != VMAT_ELEMENTWISE);
/* We don't yet generate SLP_TREE_LOAD_PERMUTATIONs for
@@ -10127,18 +10138,18 @@ vectorizable_load (vec_info *vinfo,
if (!costing_p)
{
- if (slp_perm)
+ if (ls.slp_perm)
dr_chain.quick_push (gimple_assign_lhs (new_stmt));
else
slp_node->push_vec_def (new_stmt);
}
}
- if (slp_perm)
+ if (ls.slp_perm)
{
if (costing_p)
{
- gcc_assert (n_perms != -1U);
- inside_cost += record_stmt_cost (cost_vec, n_perms, vec_perm,
+ gcc_assert (ls.n_perms != -1U);
+ inside_cost += record_stmt_cost (cost_vec, ls.n_perms, vec_perm,
slp_node, 0, vect_body);
}
else
@@ -10146,7 +10157,7 @@ vectorizable_load (vec_info *vinfo,
unsigned n_perms2;
vect_transform_slp_perm_load (vinfo, slp_node, dr_chain, gsi, vf,
false, &n_perms2);
- gcc_assert (n_perms == n_perms2);
+ gcc_assert (ls.n_perms == n_perms2);
}
}
@@ -10211,7 +10222,7 @@ vectorizable_load (vec_info *vinfo,
instead the access is contiguous but it might be
permuted. No gap adjustment is needed though. */
;
- else if (slp_perm
+ else if (ls.slp_perm
&& (group_size != scalar_lanes
|| !multiple_p (nunits, group_size)))
{
@@ -10560,7 +10571,7 @@ vectorizable_load (vec_info *vinfo,
if (mat_gather_scatter_p (memory_access_type))
{
- gcc_assert ((!grouped_load && !slp_perm) || ls.ls_type);
+ gcc_assert ((!grouped_load && !ls.slp_perm) || ls.ls_type);
/* If we pun the original vectype the loads as well as costing, length,
etc. is performed with the new type. After loading we VIEW_CONVERT
@@ -10922,14 +10933,14 @@ vectorizable_load (vec_info *vinfo,
/* Store vector loads in the corresponding SLP_NODE. */
if (!costing_p)
{
- if (slp_perm)
+ if (ls.slp_perm)
dr_chain.quick_push (gimple_assign_lhs (new_stmt));
else
slp_node->push_vec_def (new_stmt);
}
}
- if (slp_perm)
+ if (ls.slp_perm)
{
if (costing_p)
{
@@ -11026,7 +11037,7 @@ vectorizable_load (vec_info *vinfo,
stmt_info, bump);
}
- if (grouped_load || slp_perm)
+ if (grouped_load || ls.slp_perm)
dr_chain.create (vec_num);
gimple *new_stmt = NULL;
@@ -11523,11 +11534,11 @@ vectorizable_load (vec_info *vinfo,
/* Collect vector loads and later create their permutation in
vect_transform_slp_perm_load. */
- if (!costing_p && (grouped_load || slp_perm))
+ if (!costing_p && (grouped_load || ls.slp_perm))
dr_chain.quick_push (new_temp);
/* Store vector loads in the corresponding SLP_NODE. */
- if (!costing_p && !slp_perm)
+ if (!costing_p && !ls.slp_perm)
slp_node->push_vec_def (new_stmt);
/* With SLP permutation we load the gaps as well, without
@@ -11536,7 +11547,7 @@ vectorizable_load (vec_info *vinfo,
group_elt += nunits;
if (!costing_p
&& maybe_ne (group_gap_adj, 0U)
- && !slp_perm
+ && !ls.slp_perm
&& known_eq (group_elt, group_size - group_gap_adj))
{
poly_wide_int bump_val
@@ -11553,7 +11564,7 @@ vectorizable_load (vec_info *vinfo,
elements loaded for a permuted SLP load. */
if (!costing_p
&& maybe_ne (group_gap_adj, 0U)
- && slp_perm)
+ && ls.slp_perm)
{
poly_wide_int bump_val
= (wi::to_wide (TYPE_SIZE_UNIT (elem_type)) * group_gap_adj);
@@ -11564,7 +11575,7 @@ vectorizable_load (vec_info *vinfo,
stmt_info, bump);
}
- if (slp_perm)
+ if (ls.slp_perm)
{
/* For SLP we know we've seen all possible uses of dr_chain so
direct vect_transform_slp_perm_load to DCE the unused parts.
@@ -11572,9 +11583,9 @@ vectorizable_load (vec_info *vinfo,
in PR101120 and friends. */
if (costing_p)
{
- gcc_assert (n_perms != -1U);
- if (n_perms != 0)
- inside_cost = record_stmt_cost (cost_vec, n_perms, vec_perm,
+ gcc_assert (ls.n_perms != -1U);
+ if (ls.n_perms != 0)
+ inside_cost = record_stmt_cost (cost_vec, ls.n_perms, vec_perm,
slp_node, 0, vect_body);
}
else
@@ -11583,7 +11594,7 @@ vectorizable_load (vec_info *vinfo,
bool ok = vect_transform_slp_perm_load (vinfo, slp_node, dr_chain,
gsi, vf, false, &n_perms2,
nullptr, true);
- gcc_assert (ok && n_perms == n_perms2);
+ gcc_assert (ok && ls.n_perms == n_perms2);
}
dr_chain.release ();
}
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 52bc0d672bf..d2c5f2ba51f 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -209,6 +209,13 @@ enum vect_memory_access_type {
VMAT_GATHER_SCATTER_EMULATED
};
+enum vect_gather_scatter_subtype {
+ GATHER_SCATTER_UNDEFINED,
+ GATHER_SCATTER_REGULAR,
+ GATHER_SCATTER_STRIDED,
+ GATHER_SCATTER_STRIDED_GROUPED
+};
+
/* Returns whether MAT is any of the VMAT_GATHER_SCATTER_* kinds. */
inline bool
@@ -290,6 +297,8 @@ struct vect_load_store_data : vect_data {
tree strided_offset_vectype; // VMAT_GATHER_SCATTER_IFN, originally strided
tree ls_type; // VMAT_GATHER_SCATTER_IFN
auto_vec<int> elsvals;
+ /* True if the load requires a load permutation. */
+ bool slp_perm; // SLP_TREE_LOAD_PERMUTATION
unsigned n_perms; // SLP_TREE_LOAD_PERMUTATION
};
--
2.51.0