Hi,

new try even though Richi didn't like the previous attempt ;)
I played with vect_transform_slp_perm_load without really getting
anywhere and figured maybe a now clearer version is acceptable.

Consecutive load permutations like {0, 1, 2, 3} or {4, 5, 6, 7} in a
group of 8 only read a part of the group, leaving a gap.

For strided accesses we can elide the permutation and, instead of
accessing the whole group, use the number of SLP lanes.  This
effectively increases the vector size as we don't load gaps.  On top of
that, we do not need to emit the permutes at all.

Bootstrapped and regtested on x86 and power10.  Regtested on aarch64 and 
rv64gcv_zvl512b.

Regards
 Robin

gcc/ChangeLog:

        * tree-vect-stmts.cc (has_consecutive_load_permutation): New.
        (get_load_store_type): Elide load permutation if possible and
        decrease group size.
        (vectorizable_load): Ditto.
        * tree-vectorizer.h (struct vect_load_store_data): Add
        consecutive_load_perm.

gcc/testsuite/ChangeLog:

        * gcc.target/riscv/rvv/autovec/pr118019-2.c: Expect vlse32.
---
 .../gcc.target/riscv/rvv/autovec/pr118019-2.c |  2 +-
 gcc/tree-vect-stmts.cc                        | 54 ++++++++++++++++++-
 gcc/tree-vectorizer.h                         |  3 ++
 3 files changed, 56 insertions(+), 3 deletions(-)

diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr118019-2.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr118019-2.c
index c8c1a7291fb..d3436b78377 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr118019-2.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr118019-2.c
@@ -47,4 +47,4 @@ x264_pixel_satd_8x4 (uint8_t *pix1, int i_pix1, uint8_t 
*pix2, int i_pix2)
   return (((uint16_t) sum) + ((uint32_t) sum >> 16)) >> 1;
 }
 
-/* { dg-final { scan-assembler-times "vlse64" 8 } } */
+/* { dg-final { scan-assembler-times "vlse32" 4 } } */
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 83acbb3ff67..c86ba4f5752 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -2049,6 +2049,31 @@ vector_vector_composition_type (tree vtype, poly_uint64 
nelts, tree *ptype,
   return NULL_TREE;
 }
 
+/* Check if the load permutation of NODE only refers to a consecutive
+   subset of the group indices where GROUP_SIZE is the size of the
+   dataref's group.  We also assert that the length of the permutation
+   divides the group size and is a power of two.
+   Such load permutations can be elided in strided access schemes as
+   we can "jump over" the gap they leave.  */
+
+static bool
+has_consecutive_load_permutation (slp_tree node, unsigned group_size)
+{
+  load_permutation_t perm = SLP_TREE_LOAD_PERMUTATION (node);
+  if (!perm.exists ()
+      || perm.length () <= 1
+      || !pow2p_hwi (perm.length ())
+      || group_size % perm.length ())
+    return false;
+
+  unsigned int start = perm[0];
+  for (unsigned int i = 1; i < perm.length (); i++)
+    if (perm[i] != start + (unsigned int)i)
+      return false;
+
+  return true;
+}
+
 /* Analyze load or store SLP_NODE of type VLS_TYPE.  Return true
    if there is a memory access type that the vectorized form can use,
    storing it in *MEMORY_ACCESS_TYPE if so.  If we decide to use gathers
@@ -2094,6 +2119,7 @@ get_load_store_type (vec_info  *vinfo, stmt_vec_info 
stmt_info,
   *ls_type = NULL_TREE;
   *slp_perm = false;
   *n_perms = -1U;
+  ls->consecutive_load_perm = false;
 
   bool perm_ok = true;
   poly_int64 vf = loop_vinfo ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) : 1;
@@ -2106,6 +2132,16 @@ get_load_store_type (vec_info  *vinfo, stmt_vec_info 
stmt_info,
     {
       first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
       group_size = DR_GROUP_SIZE (first_stmt_info);
+      /* If we have a strided access and the load permutation is consecutive
+        we can reduce the group to the elements the permutation accesses.
+        Then we release the permutation.  */
+      if (STMT_VINFO_STRIDED_P (stmt_info)
+         && has_consecutive_load_permutation (slp_node, group_size))
+       {
+         ls->consecutive_load_perm = true;
+         group_size = SLP_TREE_LANES (slp_node);
+         SLP_TREE_LOAD_PERMUTATION (slp_node).release ();
+       }
       gap = DR_GROUP_GAP (first_stmt_info);
       single_element_p = (stmt_info == first_stmt_info
                          && !DR_GROUP_NEXT_ELEMENT (stmt_info));
@@ -9876,7 +9912,14 @@ vectorizable_load (vec_info *vinfo,
 
       if (grouped_load)
        {
-         first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
+         /* If we elided a consecutive load permutation, don't
+            use the original first statement (which could be elided)
+            but the one the load permutation starts with.
+            This ensures the stride_base below is correct.  */
+         if (!ls.consecutive_load_perm)
+           first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
+         else
+           first_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[0];
          first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
          ref_type = get_group_alias_ptr_type (first_stmt_info);
        }
@@ -9890,7 +9933,14 @@ vectorizable_load (vec_info *vinfo,
       if (grouped_load)
        {
          if (memory_access_type == VMAT_STRIDED_SLP)
-           group_size = DR_GROUP_SIZE (first_stmt_info);
+           {
+             /* If we elided a consecutive load permutation, adjust
+                the group size here.  */
+             if (!ls.consecutive_load_perm)
+               group_size = DR_GROUP_SIZE (first_stmt_info);
+             else
+               group_size = SLP_TREE_LANES (slp_node);
+           }
          else /* VMAT_ELEMENTWISE */
            group_size = SLP_TREE_LANES (slp_node);
        }
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 359c994139b..25706913778 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -288,11 +288,14 @@ struct vect_load_store_data : vect_data {
       tree decl;       // VMAT_GATHER_SCATTER_DECL
   } gs;
   tree strided_offset_vectype; // VMAT_GATHER_SCATTER_IFN, originally strided
+  /* Larger load/store type used for punning the vectype.  */
   tree ls_type; // VMAT_GATHER_SCATTER_IFN
   auto_vec<int> elsvals;
   /* True if the load requires a load permutation.  */
   bool slp_perm;    // SLP_TREE_LOAD_PERMUTATION
   unsigned n_perms; // SLP_TREE_LOAD_PERMUTATION
+  /* Whether the load permutation is consecutive and simple.  */
+  bool consecutive_load_perm; // VMAT_STRIDED_SLP
 };
 
 /* A computation tree of an SLP instance.  Each node corresponds to a group of
-- 
2.51.0

Reply via email to