Hi,

Changes from v3:

 - Remove the load-perm optimization and try to pun the whole group.
 - Add slp_perm handling to VMAT_GATHER_SCATTER when using "grouped gather".

The generated code is obviously worse than with v2 for my example because we 
have 2x the number of strided loads and the (non-single-source) load-perm 
gathers throw away half of the loaded data afterwards.  But we can still 
improve this in a follow-up.

Bootstrapped and regtested on x86 and power10.  Regtested on rv64gcv_zvl512b; 
the aarch64 regtest is still running.

This patch adds gather/scatter handling for grouped accesses.  The idea is
to replace e.g. an access (of uint8_t elements) like
  arr[0]
  arr[1]
  arr[2]
  arr[3]
  arr[0 + step]
  arr[1 + step]
  ...
by a gather load of uint32_t
  arr[0..3]
  arr[0 + step * 1..3 + step * 1]
  arr[0 + step * 2..3 + step * 2]
  ...
where the offset vector is a simple series with step STEP.
If supported, such a gather can be implemented as a strided load.
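
For illustration, a loop of the following shape produces exactly this access
pattern for its loads (this is only a sketch, not taken from the patch; the
function and parameter names are made up).  Each iteration reads four
consecutive uint8_t elements and then advances the pointer by a runtime
stride, so with the punning above the four byte loads per group become a
single uint32_t element that a strided load can fetch, provided the target
supports the (possibly unaligned) wider access:

  /* Grouped uint8_t access with a runtime stride: the loads src[0..3]
     form a group of four lanes whose start advances by STRIDE bytes per
     iteration.  After punning, each group is one uint32_t lane of a
     strided load.  */
  void
  widen_groups (unsigned char *restrict src, int stride,
                unsigned int *restrict dst, int n)
  {
    for (int i = 0; i < n; i++, src += stride)
      {
        dst[4 * i + 0] = src[0];
        dst[4 * i + 1] = src[1];
        dst[4 * i + 2] = src[2];
        dst[4 * i + 3] = src[3];
      }
  }

On RVV with -mno-vector-strict-align this is the kind of loop where one would
hope to see a vlse32/vlse64 strided load instead of elementwise accesses,
similar to the pr118019-2.c test below.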

If we have a masked access, the transformation is not performed.  This
could still be done after converting the data back to the original
vectype, but it does not seem worth it for now.

Regards
 Robin

        PR target/118019

gcc/ChangeLog:

        * internal-fn.cc (get_supported_else_vals): Exit at invalid
        index.
        (internal_strided_fn_supported_p): New function.
        * internal-fn.h (internal_strided_fn_supported_p): Declare.
        * tree-vect-stmts.cc (vector_vector_composition_type):
        Add scalar_ptype_only argument and adjust docs.
        (vect_use_grouped_gather): New function.
        (get_load_store_type): Try grouped gather.
        (vectorizable_store): Use punned vectype.
        (vectorizable_load): Ditto.
        * tree-vectorizer.h (struct vect_load_store_data): Add ls_type
        member for the punned vectype.

gcc/testsuite/ChangeLog:

        * gcc.target/riscv/rvv/autovec/pr118019-2.c: New test.
---
 gcc/internal-fn.cc                            |  22 +-
 gcc/internal-fn.h                             |   2 +
 .../gcc.target/riscv/rvv/autovec/pr118019-2.c |  50 ++++
 gcc/tree-vect-stmts.cc                        | 240 ++++++++++++++++--
 gcc/tree-vectorizer.h                         |   1 +
 5 files changed, 289 insertions(+), 26 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/pr118019-2.c

diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
index bf2fac81807..db396c69ec5 100644
--- a/gcc/internal-fn.cc
+++ b/gcc/internal-fn.cc
@@ -5234,7 +5234,7 @@ get_supported_else_vals (enum insn_code icode, unsigned else_index,
                         vec<int> &else_vals)
 {
   const struct insn_data_d *data = &insn_data[icode];
-  if ((char)else_index >= data->n_operands)
+  if ((int)else_index >= data->n_operands || (int)else_index == -1)
     return;
 
   machine_mode else_mode = data->operand[else_index].mode;
@@ -5309,6 +5309,26 @@ internal_gather_scatter_fn_supported_p (internal_fn ifn, tree vector_type,
   return ok;
 }
 
+/* Return true if the target supports a strided load/store function IFN
+   with VECTOR_TYPE.  If supported and ELSVALS is nonzero the supported else
+   values will be added to the vector ELSVALS points to.  */
+
+bool
+internal_strided_fn_supported_p (internal_fn ifn, tree vector_type,
+                                vec<int> *elsvals)
+{
+  machine_mode mode = TYPE_MODE (vector_type);
+  optab optab = direct_internal_fn_optab (ifn);
+  insn_code icode = direct_optab_handler (optab, mode);
+
+  bool ok = icode != CODE_FOR_nothing;
+
+  if (ok && elsvals)
+    get_supported_else_vals (icode, internal_fn_else_index (ifn), *elsvals);
+
+  return ok;
+}
+
 /* Return true if the target supports IFN_CHECK_{RAW,WAR}_PTRS function IFN
    for pointers of type TYPE when the accesses have LENGTH bytes and their
    common byte alignment is ALIGN.  */
diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
index fd21694dfeb..dcb707251f8 100644
--- a/gcc/internal-fn.h
+++ b/gcc/internal-fn.h
@@ -246,6 +246,8 @@ extern int internal_fn_alias_ptr_index (internal_fn fn);
 extern bool internal_gather_scatter_fn_supported_p (internal_fn, tree,
                                                    tree, tree, int,
                                                    vec<int> * = nullptr);
+extern bool internal_strided_fn_supported_p (internal_fn, tree,
+                                             vec<int> * = nullptr);
 extern bool internal_check_ptrs_fn_supported_p (internal_fn, tree,
                                                poly_uint64, unsigned int);
 
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr118019-2.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr118019-2.c
new file mode 100644
index 00000000000..c8c1a7291fb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr118019-2.c
@@ -0,0 +1,50 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=rv64gcv_zvl512b -mabi=lp64d -mno-vector-strict-align" } */
+
+/* Ensure we use strided loads.  */
+
+typedef unsigned char uint8_t;
+typedef unsigned short uint16_t;
+typedef unsigned int uint32_t;
+
+#define HADAMARD4(d0, d1, d2, d3, s0, s1, s2, s3) {\
+    int t0 = s0 + s1;\
+    int t1 = s0 - s1;\
+    int t2 = s2 + s3;\
+    int t3 = s2 - s3;\
+    d0 = t0 + t2;\
+    d2 = t0 - t2;\
+    d1 = t1 + t3;\
+    d3 = t1 - t3;\
+}
+
+uint32_t
+abs2 (uint32_t a)
+{
+  uint32_t s = ((a >> 15) & 0x10001) * 0xffff;
+  return (a + s) ^ s;
+}
+
+int
+x264_pixel_satd_8x4 (uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2)
+{
+  uint32_t tmp[4][4];
+  uint32_t a0, a1, a2, a3;
+  int sum = 0;
+  for (int i = 0; i < 4; i++, pix1 += i_pix1, pix2 += i_pix2)
+    {
+      a0 = (pix1[0] - pix2[0]) + ((pix1[4] - pix2[4]) << 16);
+      a1 = (pix1[1] - pix2[1]) + ((pix1[5] - pix2[5]) << 16);
+      a2 = (pix1[2] - pix2[2]) + ((pix1[6] - pix2[6]) << 16);
+      a3 = (pix1[3] - pix2[3]) + ((pix1[7] - pix2[7]) << 16);
+      HADAMARD4 (tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], a0, a1, a2, a3);
+    }
+  for (int i = 0; i < 4; i++)
+    {
+      HADAMARD4 (a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i]);
+      sum += abs2 (a0) + abs2 (a1) + abs2 (a2) + abs2 (a3);
+    }
+  return (((uint16_t) sum) + ((uint32_t) sum >> 16)) >> 1;
+}
+
+/* { dg-final { scan-assembler-times "vlse64" 8 } } */
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 3af78f3af84..747659d3fda 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -62,6 +62,9 @@ along with GCC; see the file COPYING3.  If not see
 /* For lang_hooks.types.type_for_mode.  */
 #include "langhooks.h"
 
+static tree vector_vector_composition_type (tree, poly_uint64, tree *,
+                                           bool = false);
+
 /* Return TRUE iff the given statement is in an inner loop relative to
    the loop being vectorized.  */
 bool
@@ -1723,6 +1726,95 @@ vect_truncate_gather_scatter_offset (stmt_vec_info stmt_info, tree vectype,
   return false;
 }
 
+/* Return true if we can use gather/scatter or strided internal functions
+   to vectorize the access described by DR_INFO, which is a grouped or
+   strided load or store with multiple lanes and will be implemented by a
+   type-punned access of a vector whose element size matches the lane count.
+
+   MASKED_P is true if the load or store is conditional.
+   When returning true, fill in INFO with the information required to
+   perform the operation.  Also, store the punning type in PUN_VECTYPE.
+
+   If successful and ELSVALS is nonzero the supported
+   else values will be stored in the vector ELSVALS points to.  */
+
+static bool
+vect_use_grouped_gather (dr_vec_info *dr_info, tree vectype,
+                        loop_vec_info loop_vinfo, bool masked_p,
+                        unsigned int nelts,
+                        gather_scatter_info *info, vec<int> *elsvals,
+                        tree *pun_vectype)
+{
+  data_reference *dr = dr_info->dr;
+
+  /* TODO: We can support nelts > BITS_PER_UNIT or non-power-of-two by
+     multiple gathers/scatters.  */
+  if (nelts > BITS_PER_UNIT || !pow2p_hwi (nelts))
+    return false;
+
+  /* Pun the vectype with one of the same size but an element spanning
+     NELTS elements of VECTYPE.
+     The punned type of a V16QI with NELTS = 4 would be V4SI.
+     */
+  tree tmp;
+  unsigned int pieces;
+  if (!can_div_trunc_p (TYPE_VECTOR_SUBPARTS (vectype), nelts, &pieces)
+      || !pieces)
+    return false;
+
+  *pun_vectype = vector_vector_composition_type (vectype, pieces, &tmp, true);
+
+  if (!*pun_vectype || !VECTOR_TYPE_P (*pun_vectype))
+    return false;
+
+  internal_fn ifn;
+  tree offset_vectype = *pun_vectype;
+
+  internal_fn strided_ifn = DR_IS_READ (dr)
+    ? IFN_MASK_LEN_STRIDED_LOAD : IFN_MASK_LEN_STRIDED_STORE;
+
+  /* Check if we have a gather/scatter with the new type.  We're just trying
+     with the type itself as offset for now.  If not, check if we have a
+     strided load/store.  These have fewer constraints (for example no offset
+     type need exist) so it is possible that even though a gather/scatter is
+     not available we still have a strided load/store.  */
+  bool ok = false;
+  if (vect_gather_scatter_fn_p
+      (loop_vinfo, DR_IS_READ (dr), masked_p, *pun_vectype,
+       TREE_TYPE (*pun_vectype), *pun_vectype, 1, &ifn,
+       &offset_vectype, elsvals))
+    ok = true;
+  else if (internal_strided_fn_supported_p (strided_ifn, *pun_vectype,
+                                           elsvals))
+    {
+      /* Use gather/scatter IFNs, vect_get_strided_load_store_ops
+        will switch back to the strided variants.  */
+      ifn = DR_IS_READ (dr) ? IFN_MASK_LEN_GATHER_LOAD :
+       IFN_MASK_LEN_SCATTER_STORE;
+      ok = true;
+    }
+
+  if (ok)
+    {
+      info->ifn = ifn;
+      info->decl = NULL_TREE;
+      info->base = dr->ref;
+      info->alias_ptr = build_int_cst
+       (reference_alias_ptr_type (DR_REF (dr)),
+        get_object_alignment (DR_REF (dr)));
+      info->element_type = TREE_TYPE (*pun_vectype);
+      info->offset_vectype = offset_vectype;
+      /* No need to set the offset, vect_get_strided_load_store_ops
+        will do that.  */
+      info->scale = 1;
+      info->memory_type = TREE_TYPE (DR_REF (dr));
+      return true;
+    }
+
+  return false;
+}
+
+
 /* Return true if we can use gather/scatter internal functions to
    vectorize STMT_INFO, which is a grouped or strided load or store.
    MASKED_P is true if load or store is conditional.  When returning
@@ -1888,12 +1980,14 @@ vect_get_store_rhs (stmt_vec_info stmt_info)
 
 /* Function VECTOR_VECTOR_COMPOSITION_TYPE
 
-   This function returns a vector type which can be composed with NETLS pieces,
+   This function returns a vector type which can be composed with NELTS pieces,
    whose type is recorded in PTYPE.  VTYPE should be a vector type, and has the
    same vector size as the return vector.  It checks target whether supports
    pieces-size vector mode for construction firstly, if target fails to, check
    pieces-size scalar mode for construction further.  It returns NULL_TREE if
-   fails to find the available composition.
+   fails to find the available composition.  If the caller only wants scalar
+   pieces, e.g. when PTYPE is to be used as a gather/scatter element type,
+   SCALAR_PTYPE_ONLY must be true.
 
    For example, for (vtype=V16QI, nelts=4), we can probably get:
      - V16QI with PTYPE V4QI.
@@ -1901,7 +1995,8 @@ vect_get_store_rhs (stmt_vec_info stmt_info)
      - NULL_TREE.  */
 
 static tree
-vector_vector_composition_type (tree vtype, poly_uint64 nelts, tree *ptype)
+vector_vector_composition_type (tree vtype, poly_uint64 nelts, tree *ptype,
+                               bool scalar_ptype_only)
 {
   gcc_assert (VECTOR_TYPE_P (vtype));
   gcc_assert (known_gt (nelts, 0U));
@@ -1927,7 +2022,8 @@ vector_vector_composition_type (tree vtype, poly_uint64 nelts, tree *ptype)
       scalar_mode elmode = SCALAR_TYPE_MODE (TREE_TYPE (vtype));
       poly_uint64 inelts = pbsize / GET_MODE_BITSIZE (elmode);
       machine_mode rmode;
-      if (related_vector_mode (vmode, elmode, inelts).exists (&rmode)
+      if (!scalar_ptype_only
+         && related_vector_mode (vmode, elmode, inelts).exists (&rmode)
          && (convert_optab_handler (vec_init_optab, vmode, rmode)
              != CODE_FOR_nothing))
        {
@@ -1938,12 +2034,15 @@ vector_vector_composition_type (tree vtype, poly_uint64 nelts, tree *ptype)
       /* Otherwise check if exists an integer type of the same piece size and
         if vec_init optab supports construction from it directly.  */
       if (int_mode_for_size (pbsize, 0).exists (&elmode)
-         && related_vector_mode (vmode, elmode, nelts).exists (&rmode)
-         && (convert_optab_handler (vec_init_optab, rmode, elmode)
-             != CODE_FOR_nothing))
+         && related_vector_mode (vmode, elmode, nelts).exists (&rmode))
        {
-         *ptype = build_nonstandard_integer_type (pbsize, 1);
-         return build_vector_type (*ptype, nelts);
+         if (scalar_ptype_only
+             || convert_optab_handler (vec_init_optab, rmode, elmode)
+             != CODE_FOR_nothing)
+           {
+             *ptype = build_nonstandard_integer_type (pbsize, 1);
+             return build_vector_type (*ptype, nelts);
+           }
        }
     }
 
@@ -1981,6 +2080,7 @@ get_load_store_type (vec_info  *vinfo, stmt_vec_info stmt_info,
   int *misalignment = &ls->misalignment;
   internal_fn *lanes_ifn = &ls->lanes_ifn;
   vec<int> *elsvals = &ls->elsvals;
+  tree *ls_type = &ls->ls_type;
   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
   class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
@@ -1994,6 +2094,7 @@ get_load_store_type (vec_info  *vinfo, stmt_vec_info stmt_info,
 
   *misalignment = DR_MISALIGNMENT_UNKNOWN;
   *poffset = 0;
+  *ls_type = NULL_TREE;
 
   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
     {
@@ -2318,18 +2419,20 @@ get_load_store_type (vec_info  *vinfo, stmt_vec_info stmt_info,
      on nearby locations.  Or, even if it's a win over scalar code,
      it might not be a win over vectorizing at a lower VF, if that
      allows us to use contiguous accesses.  */
+  vect_memory_access_type grouped_gather_fallback = VMAT_UNINITIALIZED;
   if (loop_vinfo
       && (*memory_access_type == VMAT_ELEMENTWISE
          || *memory_access_type == VMAT_STRIDED_SLP)
       && !STMT_VINFO_GATHER_SCATTER_P (stmt_info)
-      && SLP_TREE_LANES (slp_node) == 1
-      && (!SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()
-         || single_element_p))
+      && SLP_TREE_LANES (slp_node) >= 1)
     {
       gather_scatter_info gs_info;
-      if (vect_use_strided_gather_scatters_p (stmt_info, vectype, loop_vinfo,
-                                             masked_p, &gs_info, elsvals,
-                                             group_size, single_element_p))
+      if (SLP_TREE_LANES (slp_node) == 1
+         && (!SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()
+             || single_element_p)
+         && vect_use_strided_gather_scatters_p (stmt_info, vectype, loop_vinfo,
+                                                masked_p, &gs_info, elsvals,
+                                                group_size, single_element_p))
        {
          SLP_TREE_GS_SCALE (slp_node) = gs_info.scale;
          SLP_TREE_GS_BASE (slp_node) = error_mark_node;
@@ -2337,6 +2440,20 @@ get_load_store_type (vec_info  *vinfo, stmt_vec_info stmt_info,
          ls->strided_offset_vectype = gs_info.offset_vectype;
          *memory_access_type = VMAT_GATHER_SCATTER_IFN;
        }
+      else if (SLP_TREE_LANES (slp_node) > 1
+              && !masked_p
+              && !single_element_p
+              && vect_use_grouped_gather (STMT_VINFO_DR_INFO (stmt_info),
+                                          vectype, loop_vinfo,
+                                          masked_p, group_size,
+                                          &gs_info, elsvals, ls_type))
+       {
+         grouped_gather_fallback = *memory_access_type;
+         *memory_access_type = VMAT_GATHER_SCATTER_IFN;
+         ls->gs.ifn = gs_info.ifn;
+         vectype = *ls_type;
+         ls->strided_offset_vectype = gs_info.offset_vectype;
+       }
     }
 
   if (*memory_access_type == VMAT_CONTIGUOUS_DOWN
@@ -2362,6 +2479,18 @@ get_load_store_type (vec_info  *vinfo, stmt_vec_info stmt_info,
        = vect_supportable_dr_alignment
           (vinfo, first_dr_info, vectype, *misalignment,
            mat_gather_scatter_p (*memory_access_type));
+      if (grouped_gather_fallback != VMAT_UNINITIALIZED
+         && *alignment_support_scheme != dr_aligned
+         && *alignment_support_scheme != dr_unaligned_supported)
+       {
+         /* No supportable alignment for a grouped gather, fall back to the
+            original memory access type.  Even though VMAT_STRIDED_SLP might
+            also try aligned vector loads it can still choose vector
+            construction from scalars.  */
+         *memory_access_type = grouped_gather_fallback;
+         *alignment_support_scheme = dr_unaligned_supported;
+         *misalignment = DR_MISALIGNMENT_UNKNOWN;
+       }
     }
 
   if (overrun_p)
@@ -8368,10 +8497,13 @@ vectorizable_store (vec_info *vinfo,
     {
       aggr_type = elem_type;
       if (!costing_p)
-       vect_get_strided_load_store_ops (stmt_info, slp_node, vectype,
-                                        ls.strided_offset_vectype,
-                                        loop_vinfo, gsi,
-                                        &bump, &vec_offset, loop_lens);
+       {
+         tree vtype = ls.ls_type ? ls.ls_type : vectype;
+         vect_get_strided_load_store_ops (stmt_info, slp_node, vtype,
+                                          ls.strided_offset_vectype,
+                                          loop_vinfo, gsi,
+                                          &bump, &vec_offset, loop_lens);
+       }
     }
   else
     {
@@ -8557,7 +8689,9 @@ vectorizable_store (vec_info *vinfo,
 
   if (mat_gather_scatter_p (memory_access_type))
     {
-      gcc_assert (!grouped_store);
+      gcc_assert (!grouped_store || ls.ls_type);
+      if (ls.ls_type)
+       vectype = ls.ls_type;
       auto_vec<tree> vec_offsets;
       unsigned int inside_cost = 0, prologue_cost = 0;
       int num_stmts = vec_num;
@@ -8604,8 +8738,9 @@ vectorizable_store (vec_info *vinfo,
              if (mask_node)
                vec_mask = vec_masks[j];
              /* We should have catched mismatched types earlier.  */
-             gcc_assert (useless_type_conversion_p (vectype,
-                                                    TREE_TYPE (vec_oprnd)));
+             gcc_assert (ls.ls_type
+                         || useless_type_conversion_p
+                         (vectype, TREE_TYPE (vec_oprnd)));
            }
          tree final_mask = NULL_TREE;
          tree final_len = NULL_TREE;
@@ -8658,6 +8793,18 @@ vectorizable_store (vec_info *vinfo,
                    }
                }
 
+             if (ls.ls_type)
+               {
+                 gimple *conv_stmt
+                   = gimple_build_assign (make_ssa_name (vectype),
+                                          VIEW_CONVERT_EXPR,
+                                          build1 (VIEW_CONVERT_EXPR, vectype,
+                                                  vec_oprnd));
+                 vect_finish_stmt_generation (vinfo, stmt_info, conv_stmt,
+                                              gsi);
+                 vec_oprnd = gimple_get_lhs (conv_stmt);
+               }
+
              gcall *call;
              if (final_len && final_mask)
                {
@@ -10029,7 +10176,8 @@ vectorizable_load (vec_info *vinfo,
       return true;
     }
 
-  if (mat_gather_scatter_p (memory_access_type))
+  if (mat_gather_scatter_p (memory_access_type)
+      && !ls.ls_type)
     grouped_load = false;
 
   if (grouped_load
@@ -10088,6 +10236,7 @@ vectorizable_load (vec_info *vinfo,
          vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
          group_gap_adj = group_size - scalar_lanes;
        }
+      dr_chain.create (vec_num);
 
       ref_type = get_group_alias_ptr_type (first_stmt_info);
     }
@@ -10422,7 +10571,14 @@ vectorizable_load (vec_info *vinfo,
 
   if (mat_gather_scatter_p (memory_access_type))
     {
-      gcc_assert (!grouped_load && !slp_perm);
+      gcc_assert ((!grouped_load && !slp_perm) || ls.ls_type);
+
+      /* If we pun the original vectype, the loads as well as costing, length,
+        etc. are performed with the new type.  After loading we VIEW_CONVERT
+        the data to the original vectype.  */
+      tree original_vectype = vectype;
+      if (ls.ls_type)
+       vectype = ls.ls_type;
 
       /* 1. Create the vector or array pointer update chain.  */
       if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
@@ -10763,8 +10919,42 @@ vectorizable_load (vec_info *vinfo,
              new_temp = new_temp2;
            }
 
+         if (ls.ls_type)
+           {
+             new_stmt = gimple_build_assign (make_ssa_name
+                                             (original_vectype),
+                                             VIEW_CONVERT_EXPR,
+                                             build1 (VIEW_CONVERT_EXPR,
+                                                     original_vectype,
+                                                     new_temp));
+             vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
+           }
+
          /* Store vector loads in the corresponding SLP_NODE.  */
-         slp_node->push_vec_def (new_stmt);
+         if (!costing_p)
+           {
+             if (slp_perm)
+               dr_chain.quick_push (gimple_assign_lhs (new_stmt));
+             else
+               slp_node->push_vec_def (new_stmt);
+           }
+       }
+
+      if (slp_perm)
+       {
+         if (costing_p)
+           {
+             gcc_assert (ls.n_perms != -1U);
+             inside_cost += record_stmt_cost (cost_vec, ls.n_perms, vec_perm,
+                                              slp_node, 0, vect_body);
+           }
+         else
+           {
+             unsigned n_perms2;
+             vect_transform_slp_perm_load (vinfo, slp_node, dr_chain, gsi, vf,
+                                           false, &n_perms2);
+             gcc_assert (ls.n_perms == n_perms2);
+           }
        }
 
       if (costing_p && dump_enabled_p ())
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index faa72841d3c..f3357d1d1b1 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -288,6 +288,7 @@ struct vect_load_store_data : vect_data {
       tree decl;       // VMAT_GATHER_SCATTER_DECL
   } gs;
   tree strided_offset_vectype; // VMAT_GATHER_SCATTER_IFN, originally strided
+  tree ls_type; // VMAT_GATHER_SCATTER_IFN
   auto_vec<int> elsvals;
   unsigned n_perms; // SLP_TREE_LOAD_PERMUTATION
 };
-- 
2.51.0
