This patch enables strided loads for VMAT_STRIDED_SLP.  Instead of
building vectors from scalars or other vectors, we can use strided loads
directly when applicable.
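
To illustrate the kind of access this targets, consider the following
hypothetical sketch (not taken from the patch or the testsuite), where a
contiguous group of four uint16_t is read at a strided base:

  #include <stdint.h>

  void
  foo (uint16_t *restrict dst, uint16_t *src, long stride, int n)
  {
    for (int i = 0; i < n; i++)
      {
        /* One SLP group of four contiguous elements per iteration,
           with the group base advancing by a runtime stride.  */
        dst[4 * i + 0] = src[i * stride + 0];
        dst[4 * i + 1] = src[i * stride + 1];
        dst[4 * i + 2] = src[i * stride + 2];
        dst[4 * i + 3] = src[i * stride + 3];
      }
  }

Each group spans 64 bits, so on RVV it can (in principle) be loaded as a
single element of a 64-bit-element vector with vlse64 instead of being
assembled element by element.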

The current implementation limits strided loads to cases where we can
load entire groups rather than subsets of them.  A future improvement
would be, e.g., to load a group of three uint8_t

  g0 g1  g2,     g0 + stride g1 + stride g2 + stride, ...

by

  vlse16 vlse8

and permute those into place (after re-interpreting them as a vector of
uint8_t).
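
For instance (again a hypothetical sketch, with dst, src, stride and n as
in the example above but with uint8_t elements):

  for (int i = 0; i < n; i++)
    {
      dst[3 * i + 0] = src[i * stride + 0];
      dst[3 * i + 1] = src[i * stride + 1];
      dst[3 * i + 2] = src[i * stride + 2];
    }

Three bytes do not form a power-of-two-sized integer mode, so the whole
group cannot be re-interpreted as a single wider element; splitting it
into a 16-bit and an 8-bit strided load and permuting the pieces together
would still avoid building the vector from scalars.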

For satd_8x4 in particular, we can do even better by eliding the strided
SLP load permutations, essentially turning

  vlse64 v0, (a0)
  vlse64 v1, (a1)
  VEC_PERM_EXPR <v0, v1, { 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19,
                           24, 25, 26, 27 }>;
  VEC_PERM_EXPR <v0, v1, { 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23,
                           28, 29, 30, 31 }>;

into

  vlse32 v0, (a0)
  vlse32 v1, (a1)
  vlse32 v0, 4(a0)
  vlse32 v1, 4(a1)

but that is going to be a follow up.

Bootstrapped and regtested on x86, aarch64, and power10.
Regtested on rv64gcv_zvl512b.  I'm seeing one additional failure in
gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-7.c
where we use a larger LMUL than we should, but IMHO this can wait.

        PR target/118109

gcc/ChangeLog:

        * internal-fn.cc (internal_strided_fn_supported_p): New
        function.
        * internal-fn.h (internal_strided_fn_supported_p): Declare.
        * tree-vect-stmts.cc (vect_supportable_strided_type): New
        function.
        (vectorizable_load): Add strided-load support for strided
        groups.

gcc/testsuite/ChangeLog:

        * gcc.target/riscv/rvv/autovec/pr118019-2.c: New test.
---
 gcc/internal-fn.cc                            |  21 ++
 gcc/internal-fn.h                             |   2 +
 .../gcc.target/riscv/rvv/autovec/pr118019-2.c |  51 +++++
 gcc/tree-vect-stmts.cc                        | 203 +++++++++++++++---
 4 files changed, 247 insertions(+), 30 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/pr118019-2.c

diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
index 6b04443f7cd..aec90ef87cc 100644
--- a/gcc/internal-fn.cc
+++ b/gcc/internal-fn.cc
@@ -5203,6 +5203,27 @@ internal_gather_scatter_fn_supported_p (internal_fn ifn, tree vector_type,
   return ok;
 }
 
+/* Return true if the target supports a strided load/store function IFN
+   with VECTOR_TYPE.  If supported and ELSVALS is nonzero the supported else
+   values will be added to the vector ELSVALS points to.  */
+
+bool
+internal_strided_fn_supported_p (internal_fn ifn, tree vector_type,
+                                vec<int> *elsvals)
+{
+  machine_mode mode = TYPE_MODE (vector_type);
+  optab optab = direct_internal_fn_optab (ifn);
+  insn_code icode = direct_optab_handler (optab, mode);
+
+  bool ok = icode != CODE_FOR_nothing;
+
+  if (ok && elsvals)
+    get_supported_else_vals
+      (icode, internal_fn_else_index (IFN_MASK_LEN_STRIDED_LOAD), *elsvals);
+
+  return ok;
+}
+
 /* Return true if the target supports IFN_CHECK_{RAW,WAR}_PTRS function IFN
    for pointers of type TYPE when the accesses have LENGTH bytes and their
    common byte alignment is ALIGN.  */
diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
index afd4f8e64c7..7d386246a42 100644
--- a/gcc/internal-fn.h
+++ b/gcc/internal-fn.h
@@ -242,6 +242,8 @@ extern int internal_fn_stored_value_index (internal_fn);
 extern bool internal_gather_scatter_fn_supported_p (internal_fn, tree,
                                                    tree, tree, int,
                                                    vec<int> * = nullptr);
+extern bool internal_strided_fn_supported_p (internal_fn ifn, tree vector_type,
+                                            vec<int> *elsvals);
 extern bool internal_check_ptrs_fn_supported_p (internal_fn, tree,
                                                poly_uint64, unsigned int);
 
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr118019-2.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr118019-2.c
new file mode 100644
index 00000000000..9918d4d7f52
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr118019-2.c
@@ -0,0 +1,51 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=rv64gcv_zvl512b -mabi=lp64d -mno-vector-strict-align" } */
+
+/* Ensure we use strided loads.  */
+
+typedef unsigned char uint8_t;
+typedef unsigned short uint16_t;
+typedef unsigned int uint32_t;
+
+#define HADAMARD4(d0, d1, d2, d3, s0, s1, s2, s3)                             \
+  {                                                                           \
+    int t0 = s0 + s1;                                                         \
+    int t1 = s0 - s1;                                                         \
+    int t2 = s2 + s3;                                                         \
+    int t3 = s2 - s3;                                                         \
+    d0 = t0 + t2;                                                             \
+    d2 = t0 - t2;                                                             \
+    d1 = t1 + t3;                                                             \
+    d3 = t1 - t3;                                                             \
+  }
+
+uint32_t
+abs2 (uint32_t a)
+{
+  uint32_t s = ((a >> 15) & 0x10001) * 0xffff;
+  return (a + s) ^ s;
+}
+
+int
+x264_pixel_satd_8x4 (uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2)
+{
+  uint32_t tmp[4][4];
+  uint32_t a0, a1, a2, a3;
+  int sum = 0;
+  for (int i = 0; i < 4; i++, pix1 += i_pix1, pix2 += i_pix2)
+    {
+      a0 = (pix1[0] - pix2[0]) + ((pix1[4] - pix2[4]) << 16);
+      a1 = (pix1[1] - pix2[1]) + ((pix1[5] - pix2[5]) << 16);
+      a2 = (pix1[2] - pix2[2]) + ((pix1[6] - pix2[6]) << 16);
+      a3 = (pix1[3] - pix2[3]) + ((pix1[7] - pix2[7]) << 16);
+      HADAMARD4 (tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], a0, a1, a2, a3);
+    }
+  for (int i = 0; i < 4; i++)
+    {
+      HADAMARD4 (a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i]);
+      sum += abs2 (a0) + abs2 (a1) + abs2 (a2) + abs2 (a3);
+    }
+  return (((uint16_t) sum) + ((uint32_t) sum >> 16)) >> 1;
+}
+
+/* { dg-final { scan-assembler-times "vlse64" 8 } } */
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index c1998a22448..0183bfe7166 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -10165,6 +10165,46 @@ hoist_defs_of_uses (gimple *stmt, class loop *loop, bool hoist_p)
   return true;
 }
 
+/* Check if there is a vector type with the same size as VECTYPE but
+   NELTS elements.  If so, set *VTYPE to that type and return true if
+   the target supports a strided load/store for its mode.  Return false
+   otherwise.  */
+static bool
+vect_supportable_strided_type (vec_load_store_type vls_type,
+                              tree vectype, int nelts,
+                              tree *vtype,
+                              vec<int> *elsvals)
+{
+  gcc_assert (VECTOR_TYPE_P (vectype));
+  gcc_assert (TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
+
+  machine_mode vmode = TYPE_MODE (vectype);
+  if (!VECTOR_MODE_P (vmode))
+    return false;
+
+  internal_fn ifn = vls_type == VLS_LOAD
+    ? IFN_MASK_LEN_STRIDED_LOAD
+    : IFN_MASK_LEN_STRIDED_STORE;
+
+  poly_uint64 vbsize = GET_MODE_BITSIZE (vmode);
+  unsigned int pbsize;
+  if (constant_multiple_p (vbsize, nelts, &pbsize))
+    {
+      scalar_mode elmode = SCALAR_TYPE_MODE (TREE_TYPE (vectype));
+      machine_mode rmode;
+      if (int_mode_for_size (pbsize, 0).exists (&elmode)
+         && related_vector_mode (vmode, elmode, nelts).exists (&rmode))
+       {
+         tree ptype = build_nonstandard_integer_type (pbsize, 1);
+         *vtype = build_vector_type (ptype, nelts);
+         if (internal_strided_fn_supported_p (ifn, *vtype, elsvals))
+           return true;
+       }
+    }
+
+  return false;
+}
+
 /* vectorizable_load.
 
    Check if STMT_INFO reads a non scalar data-ref (array/pointer/structure)
@@ -10669,6 +10709,8 @@ vectorizable_load (vec_info *vinfo,
       tree running_off;
       vec<constructor_elt, va_gc> *v = NULL;
       tree stride_base, stride_step, alias_off;
+      bool strided_load_ok_p = false;
+      tree stride_step_signed = NULL_TREE;
       /* Checked by get_load_store_type.  */
       unsigned int const_nunits = nunits.to_constant ();
       unsigned HOST_WIDE_INT cst_offset = 0;
@@ -10745,15 +10787,37 @@ vectorizable_load (vec_info *vinfo,
          stride_step = cse_and_gimplify_to_preheader (loop_vinfo, stride_step);
        }
 
+      tree stride_step_full = NULL_TREE;
+      auto_vec<tree> dr_chain;
+
+      if (1)
+       {
+         /* For SLP permutation support we need to load the whole group,
+            not only the number of vector stmts the permutation result
+            fits in.  */
+         if (slp_perm)
+           {
+             /* We don't yet generate SLP_TREE_LOAD_PERMUTATIONs for
+                variable VF.  */
+             unsigned int const_vf = vf.to_constant ();
+             ncopies = CEIL (group_size * const_vf, const_nunits);
+             dr_chain.create (ncopies);
+           }
+         else
+           ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
+       }
+
+      /* Initialize for VMAT_ELEMENTWISE i.e. when we must load each element
+        separately.  */
       running_off = offvar;
       alias_off = build_int_cst (ref_type, 0);
       int nloads = const_nunits;
       int lnel = 1;
       tree ltype = TREE_TYPE (vectype);
       tree lvectype = vectype;
-      auto_vec<tree> dr_chain;
       if (memory_access_type == VMAT_STRIDED_SLP)
        {
+         tree strided_vtype;
          HOST_WIDE_INT n = gcd (group_size, const_nunits);
          /* Use the target vector type if the group size is a multiple
             of it.  */
@@ -10782,9 +10846,42 @@ vectorizable_load (vec_info *vinfo,
                  misalignment = mis_align;
                }
            }
+         /* Instead of loading individual vector elements and
+            constructing a larger vector from them we can use
+            a strided load directly.
+            ??? For non-power-of-two groups we could build the
+            group from smaller element sizes and permute them
+            into place afterwards instead of relying on a more
+            rigid vec_init.  */
+         else if (n > 1 && n == group_size
+                  && vect_supportable_strided_type (VLS_LOAD, vectype,
+                                                    const_nunits / n,
+                                                    &strided_vtype, &elsvals))
+           {
+             dr_alignment_support dr_align = dr_aligned;
+             int mis_align = 0;
+             mis_align = dr_misalignment (first_dr_info,
+                                          strided_vtype);
+             dr_align
+               = vect_supportable_dr_alignment (vinfo, dr_info,
+                                                strided_vtype,
+                                                mis_align);
+             if (dr_align == dr_aligned
+                 || dr_align == dr_unaligned_supported)
+               {
+                 nloads = 1;
+                 lnel = const_nunits;
+                 lvectype = strided_vtype;
+                 ltype = TREE_TYPE (strided_vtype);
+                 alignment_support_scheme = dr_align;
+                 misalignment = mis_align;
+
+                 strided_load_ok_p = true;
+               }
+           }
          /* Else use the biggest vector we can load the group without
             accessing excess elements.  */
-         else if (n > 1)
+         else if (!strided_load_ok_p && n > 1)
            {
              tree ptype;
              tree vtype
@@ -10830,22 +10927,36 @@ vectorizable_load (vec_info *vinfo,
          ltype = build_aligned_type (ltype, align * BITS_PER_UNIT);
        }
 
-      if (1)
+      /* We don't use masking here so just use any else value and don't
+        perform any zeroing.  */
+      tree vec_els = NULL_TREE;
+      if (strided_load_ok_p && !costing_p)
        {
-         /* For SLP permutation support we need to load the whole group,
-            not only the number of vector stmts the permutation result
-            fits in.  */
-         if (slp_perm)
-           {
-             /* We don't yet generate SLP_TREE_LOAD_PERMUTATIONs for
-                variable VF.  */
-             unsigned int const_vf = vf.to_constant ();
-             ncopies = CEIL (group_size * const_vf, const_nunits);
-             dr_chain.create (ncopies);
-           }
-         else
-           ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
+         gcc_assert (elsvals.length ());
+         maskload_elsval = *elsvals.begin ();
+         vec_els = vect_get_mask_load_else (maskload_elsval, lvectype);
+
+         stride_step_full
+           = fold_build2 (MULT_EXPR, TREE_TYPE (stride_step),
+                          stride_step,
+                          build_int_cst (TREE_TYPE (stride_step),
+                                         TYPE_VECTOR_SUBPARTS (lvectype)));
+         stride_step_full
+           = cse_and_gimplify_to_preheader (loop_vinfo, stride_step_full);
+
+         tree cst_off = build_int_cst (ref_type, cst_offset);
+         dataref_ptr
+           = vect_create_data_ref_ptr (vinfo, first_stmt_info, lvectype,
+                                       loop, cst_off, &dummy, gsi, &ptr_incr,
+                                       false, stride_step_full);
+
+         stride_step_signed
+           = fold_build1 (NOP_EXPR, signed_type_for (TREE_TYPE (stride_step)),
+                          stride_step);
+         stride_step_signed
+           = cse_and_gimplify_to_preheader (loop_vinfo, stride_step_signed);
        }
+
       unsigned int group_el = 0;
       unsigned HOST_WIDE_INT
        elsz = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
@@ -10855,38 +10966,66 @@ vectorizable_load (vec_info *vinfo,
       unsigned int n_adjacent_loads = 0;
       for (j = 0; j < ncopies; j++)
        {
-         if (nloads > 1 && !costing_p)
+         if (nloads > 1 && !strided_load_ok_p && !costing_p)
            vec_alloc (v, nloads);
          gimple *new_stmt = NULL;
          for (i = 0; i < nloads; i++)
            {
              if (costing_p)
                {
+                 /* TODO: The vectype in stmt_info/slp_node is potentially
+                    wrong as we could be using a much smaller vectype
+                    as determined by vector_vector_composition_type.  */
+                 if (strided_load_ok_p)
+                   inside_cost += record_stmt_cost (cost_vec, 1,
+                                                    vector_gather_load,
+                                                    slp_node, 0,
+                                                    vect_body);
                  /* For VMAT_ELEMENTWISE, just cost it as scalar_load to
                     avoid ICE, see PR110776.  */
-                 if (VECTOR_TYPE_P (ltype)
-                     && memory_access_type != VMAT_ELEMENTWISE)
+                 else if (VECTOR_TYPE_P (ltype)
+                          && memory_access_type != VMAT_ELEMENTWISE)
                    n_adjacent_loads++;
                  else
                    inside_cost += record_stmt_cost (cost_vec, 1, scalar_load,
                                                     slp_node, 0, vect_body);
                  continue;
                }
-             tree this_off = build_int_cst (TREE_TYPE (alias_off),
-                                            group_el * elsz + cst_offset);
-             tree data_ref = build2 (MEM_REF, ltype, running_off, this_off);
-             vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr));
-             new_temp = make_ssa_name (ltype);
-             new_stmt = gimple_build_assign (new_temp, data_ref);
+
+             if (!strided_load_ok_p)
+               {
+
+                 tree this_off = build_int_cst (TREE_TYPE (alias_off),
+                                                group_el * elsz + cst_offset);
+                 tree data_ref = build2 (MEM_REF, ltype, running_off,
+                                         this_off);
+                 vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr));
+                 new_temp = make_ssa_name (ltype);
+                 new_stmt = gimple_build_assign (new_temp, data_ref);
+                 if (nloads > 1)
+                   CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_temp);
+               }
+             else
+               {
+                 mask_vectype = truth_type_for (lvectype);
+                 tree final_mask = build_minus_one_cst (mask_vectype);
+                 tree bias = build_int_cst (intQI_type_node, 0);
+                 tree len = size_int (TYPE_VECTOR_SUBPARTS (lvectype));
+                 tree zero = build_zero_cst (lvectype);
+                 new_stmt
+                   = gimple_build_call_internal
+                   (IFN_MASK_LEN_STRIDED_LOAD, 7, dataref_ptr,
+                    stride_step_signed, zero, final_mask, vec_els, len, bias);
+                 new_temp = make_ssa_name (lvectype);
+                 gimple_set_lhs (new_stmt, new_temp);
+               }
              vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
-             if (nloads > 1)
-               CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_temp);
 
              group_el += lnel;
              if (0
-                 || group_el == group_size)
+                 || group_el >= group_size)
                {
-                 n_groups++;
+                 n_groups += (group_el / group_size);
                  /* When doing SLP make sure to not load elements from
                     the next vector iteration, those will not be accessed
                     so just use the last element again.  See PR107451.  */
@@ -10898,6 +11037,10 @@ vectorizable_load (vec_info *vinfo,
                                               running_off, stride_step);
                      vect_finish_stmt_generation (vinfo, stmt_info, incr, gsi);
                      running_off = newoff;
+                     if (strided_load_ok_p && !costing_p)
+                       dataref_ptr
+                         = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, gsi,
+                                            stmt_info, stride_step_full);
                    }
                  group_el = 0;
                }
@@ -10941,7 +11084,7 @@ vectorizable_load (vec_info *vinfo,
              if (1)
                {
                  if (slp_perm)
-                   dr_chain.quick_push (gimple_assign_lhs (new_stmt));
+                   dr_chain.quick_push (gimple_get_lhs (new_stmt));
                  else
                    slp_node->push_vec_def (new_stmt);
                }
-- 
2.49.0
