On Tue, May 27, 2025 at 6:04 PM Robin Dapp <rdapp....@gmail.com> wrote:
>
> From: Robin Dapp <rdapp....@gmail.com>
>
> This patch enables strided loads for VMAT_STRIDED_SLP.  Instead of
> building vectors from scalars or other vectors we can use strided loads
> directly when applicable.
>
> The current implementation limits strided loads to cases where we can
> load entire groups and not subsets of them.  A future improvement would
> be to e.g. load a group of three uint8_t
>
>   g0 g1  g2,     g0 + stride g1 + stride g2 + stride, ...
>
> by
>
>   vlse16 + vlse8
>
> and permute those into place (after re-interpreting the vlse16 result as a
> vector of uint8_t).
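
For illustration, that could be something like the following (the offsets,
stride register, and permute are made up, untested):

  vlse16.v  v0, (a0), t0     # g0/g1 pairs, loaded as 16-bit elements
  addi      a1, a0, 2
  vlse8.v   v1, (a1), t0     # the remaining g2 elements
  # view v0 as a vector of uint8_t again and interleave v0/v1 back into
  # g0 g1 g2, g0 g1 g2, ... with a vrgather/VEC_PERM.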
>
> For satd_8x4 in particular we can do even better by eliding the strided
> SLP load permutations, essentially turning
>
>   vlse64 v0, (a0)
>   vlse64 v1, (a1)
>   VEC_PERM_EXPR <v0, v1, { 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25,
>   26, 27 }>;
>   VEC_PERM_EXPR <v0, v1, { 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29,
>   30, 31 }>;
>
> into
>
>   vlse32 v0, (a0)
>   vlse32 v1, (a1)
>   vlse32 v0, 4(a0)
>   vlse32 v1, 4(a1)
>
> but that is going to be a follow up.
>
> Bootstrapped and regtested on x86, aarch64, and power10.
> Regtested on rv64gcv_zvl512b.  I'm seeing one additional failure in
> gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-7.c
> where we use a larger LMUL than we should, but IMHO this can wait.
>
>         PR target/118109
>
> gcc/ChangeLog:
>
>         * internal-fn.cc (internal_strided_fn_supported_p): New
>         function.
>         * internal-fn.h (internal_strided_fn_supported_p): Declare.
>         * tree-vect-stmts.cc (vect_supportable_strided_type): New
>         function.
>         (vectorizable_load): Add strided-load support for strided
>         groups.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/riscv/rvv/autovec/pr118019-2.c: New test.
> ---
>  gcc/internal-fn.cc                            |  21 ++
>  gcc/internal-fn.h                             |   2 +
>  .../gcc.target/riscv/rvv/autovec/pr118019-2.c |  51 +++++
>  gcc/tree-vect-stmts.cc                        | 196 +++++++++++++++---
>  4 files changed, 243 insertions(+), 27 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/pr118019-2.c
>
> diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> index 6b04443f7cd..aec90ef87cc 100644
> --- a/gcc/internal-fn.cc
> +++ b/gcc/internal-fn.cc
> @@ -5203,6 +5203,27 @@ internal_gather_scatter_fn_supported_p (internal_fn ifn, tree vector_type,
>    return ok;
>  }
>
> +/* Return true if the target supports a strided load/store function IFN
> +   with VECTOR_TYPE.  If supported and ELSVALS is nonzero the supported else
> +   values will be added to the vector ELSVALS points to.  */
> +
> +bool
> +internal_strided_fn_supported_p (internal_fn ifn, tree vector_type,
> +                                vec<int> *elsvals)
> +{

Can we add an assert here for the 'ifn's we expect?
MASK_LEN_STRIDED_LOAD and MASK_LEN_STRIDED_STORE, I guess?
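
Something along these lines is what I have in mind (untested sketch, just
spelling out the two IFNs mentioned above):

  gcc_assert (ifn == IFN_MASK_LEN_STRIDED_LOAD
              || ifn == IFN_MASK_LEN_STRIDED_STORE);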

> +  machine_mode mode = TYPE_MODE (vector_type);
> +  optab optab = direct_internal_fn_optab (ifn);
> +  insn_code icode = direct_optab_handler (optab, mode);
> +
> +  bool ok = icode != CODE_FOR_nothing;
> +
> +  if (ok && elsvals)
> +    get_supported_else_vals
> +      (icode, internal_fn_else_index (IFN_MASK_LEN_STRIDED_LOAD), *elsvals);

Shouldn't you use 'ifn' here instead of IFN_MASK_LEN_STRIDED_LOAD?
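
I.e., something like (untested):

  if (ok && elsvals)
    get_supported_else_vals
      (icode, internal_fn_else_index (ifn), *elsvals);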

> +
> +  return ok;
> +}
> +
>  /* Return true if the target supports IFN_CHECK_{RAW,WAR}_PTRS function IFN
>     for pointers of type TYPE when the accesses have LENGTH bytes and their
>     common byte alignment is ALIGN.  */
> diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
> index afd4f8e64c7..7d386246a42 100644
> --- a/gcc/internal-fn.h
> +++ b/gcc/internal-fn.h
> @@ -242,6 +242,8 @@ extern int internal_fn_stored_value_index (internal_fn);
>  extern bool internal_gather_scatter_fn_supported_p (internal_fn, tree,
>                                                     tree, tree, int,
>                                                     vec<int> * = nullptr);
> +extern bool internal_strided_fn_supported_p (internal_fn ifn, tree vector_type,
> +                                            vec<int> *elsvals);
>  extern bool internal_check_ptrs_fn_supported_p (internal_fn, tree,
>                                                 poly_uint64, unsigned int);
>
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr118019-2.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr118019-2.c
> new file mode 100644
> index 00000000000..9918d4d7f52
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr118019-2.c
> @@ -0,0 +1,51 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -march=rv64gcv_zvl512b -mabi=lp64d -mno-vector-strict-align" } */
> +
> +/* Ensure we use strided loads.  */
> +
> +typedef unsigned char uint8_t;
> +typedef unsigned short uint16_t;
> +typedef unsigned int uint32_t;
> +
> +#define HADAMARD4(d0, d1, d2, d3, s0, s1, s2, s3)                             \
> +  {                                                                           \
> +    int t0 = s0 + s1;                                                         \
> +    int t1 = s0 - s1;                                                         \
> +    int t2 = s2 + s3;                                                         \
> +    int t3 = s2 - s3;                                                         \
> +    d0 = t0 + t2;                                                             \
> +    d2 = t0 - t2;                                                             \
> +    d1 = t1 + t3;                                                             \
> +    d3 = t1 - t3;                                                             \
> +  }
> +
> +uint32_t
> +abs2 (uint32_t a)
> +{
> +  uint32_t s = ((a >> 15) & 0x10001) * 0xffff;
> +  return (a + s) ^ s;
> +}
> +
> +int
> +x264_pixel_satd_8x4 (uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2)
> +{
> +  uint32_t tmp[4][4];
> +  uint32_t a0, a1, a2, a3;
> +  int sum = 0;
> +  for (int i = 0; i < 4; i++, pix1 += i_pix1, pix2 += i_pix2)
> +    {
> +      a0 = (pix1[0] - pix2[0]) + ((pix1[4] - pix2[4]) << 16);
> +      a1 = (pix1[1] - pix2[1]) + ((pix1[5] - pix2[5]) << 16);
> +      a2 = (pix1[2] - pix2[2]) + ((pix1[6] - pix2[6]) << 16);
> +      a3 = (pix1[3] - pix2[3]) + ((pix1[7] - pix2[7]) << 16);
> +      HADAMARD4 (tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], a0, a1, a2, a3);
> +    }
> +  for (int i = 0; i < 4; i++)
> +    {
> +      HADAMARD4 (a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i]);
> +      sum += abs2 (a0) + abs2 (a1) + abs2 (a2) + abs2 (a3);
> +    }
> +  return (((uint16_t) sum) + ((uint32_t) sum >> 16)) >> 1;
> +}
> +
> +/* { dg-final { scan-assembler-times "vlse64" 8 } } */
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index 3710694ac75..090169d9cb5 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -10165,6 +10165,46 @@ hoist_defs_of_uses (gimple *stmt, class loop *loop, bool hoist_p)
>    return true;
>  }
>
> +/* Check if there is a vector type with the same number of elements as
> +   VECTYPE but NELTS units.

It looks like the function looks for a vector type with the same element
type but a _different_ number of elements, namely NELTS?

> +   If so, set VTYPE to the appropriate type and
> +   return true if the target supports a strided load/store for the respective
> +   mode.  Return false otherwise.  */

So it combines two things.  I think
get_related_vectype_for_scalar_type (TYPE_MODE (vectype), TREE_TYPE (vectype),
nelts) does exactly the first part?  Alternatively you can use
get_vectype_for_scalar_type (vinfo, TREE_TYPE (vectype), nelts).
I'd hate to add a third variant.
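
Roughly (untested, and whether the arguments are exactly what the helper
expects would need checking):

  /* Or get_vectype_for_scalar_type (vinfo, TREE_TYPE (vectype), nelts).  */
  tree vtype = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
                                                    TREE_TYPE (vectype), nelts);
  if (!vtype || !internal_strided_fn_supported_p (ifn, vtype, elsvals))
    return false;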

> +static bool
> +vect_supportable_strided_type (vec_load_store_type vls_type,
> +                              tree vectype, int nelts,
> +                              tree *vtype,
> +                              vec<int> *elsvals)
> +{
> +  gcc_assert (VECTOR_TYPE_P (vectype));
> +  gcc_assert (TYPE_VECTOR_SUBPARTS (vectype).is_constant ());

why this?

> +
> +  machine_mode vmode = TYPE_MODE (vectype);
> +  if (!VECTOR_MODE_P (vmode))
> +    return NULL_TREE;
> +
> +  internal_fn ifn = vls_type == VLS_LOAD
> +    ? IFN_MASK_LEN_STRIDED_LOAD
> +    : IFN_MASK_LEN_STRIDED_STORE;
> +
> +  poly_uint64 vbsize = GET_MODE_BITSIZE (vmode);
> +  unsigned int pbsize;
> +  if (constant_multiple_p (vbsize, nelts, &pbsize))
> +    {
> +      scalar_mode elmode = SCALAR_TYPE_MODE (TREE_TYPE (vectype));
> +      machine_mode rmode;
> +      if (int_mode_for_size (pbsize, 0).exists (&elmode)
> +         && related_vector_mode (vmode, elmode, nelts).exists (&rmode))
> +       {
> +         tree ptype = build_nonstandard_integer_type (pbsize, 1);
> +         *vtype = build_vector_type (ptype, nelts);
> +         if (internal_strided_fn_supported_p (ifn, *vtype, elsvals))
> +           return true;
> +       }
> +    }
> +
> +  return false;
> +}
> +
>  /* vectorizable_load.
>
>     Check if STMT_INFO reads a non scalar data-ref (array/pointer/structure)
> @@ -10669,6 +10709,8 @@ vectorizable_load (vec_info *vinfo,
>        tree running_off;
>        vec<constructor_elt, va_gc> *v = NULL;
>        tree stride_base, stride_step, alias_off;
> +      bool strided_load_ok_p = false;
> +      tree stride_step_signed = NULL_TREE;
>        /* Checked by get_load_store_type.  */
>        unsigned int const_nunits = nunits.to_constant ();
>        unsigned HOST_WIDE_INT cst_offset = 0;
> @@ -10744,15 +10786,34 @@ vectorizable_load (vec_info *vinfo,
>           stride_step = cse_and_gimplify_to_preheader (loop_vinfo, stride_step);
>         }
>
> +      tree stride_step_full = NULL_TREE;
> +      auto_vec<tree> dr_chain;
> +
> +      /* For SLP permutation support we need to load the whole group,
> +        not only the number of vector stmts the permutation result
> +        fits in.  */
> +      if (slp_perm)
> +       {
> +         /* We don't yet generate SLP_TREE_LOAD_PERMUTATIONs for
> +            variable VF.  */
> +         unsigned int const_vf = vf.to_constant ();
> +         ncopies = CEIL (group_size * const_vf, const_nunits);
> +         dr_chain.create (ncopies);
> +       }
> +      else
> +       ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
> +
> +      /* Initialize for VMAT_ELEMENTWISE i.e. when we must load each element
> +        separately.  */
>        running_off = offvar;
>        alias_off = build_int_cst (ref_type, 0);
>        int nloads = const_nunits;
>        int lnel = 1;
>        tree ltype = TREE_TYPE (vectype);
>        tree lvectype = vectype;
> -      auto_vec<tree> dr_chain;
>        if (memory_access_type == VMAT_STRIDED_SLP)
>         {
> +         tree strided_vtype;
>           HOST_WIDE_INT n = gcd (group_size, const_nunits);
>           /* Use the target vector type if the group size is a multiple
>              of it.  */
> @@ -10781,9 +10842,42 @@ vectorizable_load (vec_info *vinfo,
>                   misalignment = mis_align;
>                 }
>             }
> +         /* Instead of loading individual vector elements and
> +            constructing a larger vector from them we can use
> +            a strided load directly.
> +            ??? For non-power-of-two groups we could build the
> +            group from smaller element sizes and permute them
> +            into place afterwards instead of relying on a more
> +            rigid vec_init.  */
> +         else if (n > 1 && n == group_size
> +             && vect_supportable_strided_type
> +             (VLS_LOAD, vectype, const_nunits / n,
> +              &strided_vtype, &elsvals))

So I do wonder how this interacts with vector_vector_composition_type,
in fact the difference is that for strided_load we know the composition
happens as part of a load, so how about instead extending
this function, pass it VLS_LOAD/STORE and also consider
strided_loads as composition kind there?  This would avoid duplication
and I think at least some cases of non-power-of-two groups would
be handled this way already (permuting out gaps).
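
Concretely I'm thinking of something along these lines (just a sketch, the
extra parameters are made up and would need wiring up inside the function):

  /* Existing helper in tree-vect-stmts.cc, extended rather than duplicated.  */
  static tree
  vector_vector_composition_type (tree vtype, poly_uint64 nelts, tree *ptype,
                                  vec_load_store_type vls_type,
                                  vec<int> *elsvals = nullptr);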

> +           {
> +             dr_alignment_support dr_align = dr_aligned;
> +             int mis_align = 0;
> +             mis_align = dr_misalignment (first_dr_info,
> +                                          strided_vtype);
> +             dr_align
> +               = vect_supportable_dr_alignment (vinfo, dr_info,
> +                                                strided_vtype,
> +                                                mis_align);
> +             if (dr_align == dr_aligned
> +                 || dr_align == dr_unaligned_supported)
> +               {
> +                 nloads = 1;
> +                 lnel = const_nunits;
> +                 lvectype = strided_vtype;
> +                 ltype = TREE_TYPE (strided_vtype);
> +                 alignment_support_scheme = dr_align;
> +                 misalignment = mis_align;
> +
> +                 strided_load_ok_p = true;
> +               }
> +           }
>           /* Else use the biggest vector we can load the group without
>              accessing excess elements.  */
> -         else if (n > 1)
> +         else if (!strided_load_ok_p && n > 1)
>             {
>               tree ptype;
>               tree vtype
> @@ -10829,19 +10923,35 @@ vectorizable_load (vec_info *vinfo,
>           ltype = build_aligned_type (ltype, align * BITS_PER_UNIT);
>         }
>
> -      /* For SLP permutation support we need to load the whole group,
> -        not only the number of vector stmts the permutation result
> -        fits in.  */
> -      if (slp_perm)
> +      /* We don't use masking here so just use any else value and don't
> +        perform any zeroing.  */
> +      tree vec_els = NULL_TREE;
> +      if (strided_load_ok_p && !costing_p)
>         {
> -         /* We don't yet generate SLP_TREE_LOAD_PERMUTATIONs for
> -            variable VF.  */
> -         unsigned int const_vf = vf.to_constant ();
> -         ncopies = CEIL (group_size * const_vf, const_nunits);
> -         dr_chain.create (ncopies);
> +         gcc_assert (elsvals.length ());
> +         maskload_elsval = *elsvals.begin ();
> +         vec_els = vect_get_mask_load_else (maskload_elsval, lvectype);
> +
> +         stride_step_full
> +           = fold_build2 (MULT_EXPR, TREE_TYPE (stride_step),
> +                          stride_step,
> +                          build_int_cst (TREE_TYPE (stride_step),
> +                                         TYPE_VECTOR_SUBPARTS (lvectype)));
> +         stride_step_full
> +           = cse_and_gimplify_to_preheader (loop_vinfo, stride_step_full);
> +
> +         tree cst_off = build_int_cst (ref_type, cst_offset);
> +         dataref_ptr
> +           = vect_create_data_ref_ptr (vinfo, first_stmt_info, lvectype,
> +                                       loop, cst_off, &dummy, gsi, &ptr_incr,
> +                                       false, stride_step_full);
> +
> +         stride_step_signed
> +           = fold_build1 (NOP_EXPR, signed_type_for (TREE_TYPE (stride_step)),
> +                          stride_step);
> +         stride_step_signed
> +           = cse_and_gimplify_to_preheader (loop_vinfo, stride_step_signed);
>         }
> -      else
> -       ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
>
>        unsigned int group_el = 0;
>        unsigned HOST_WIDE_INT
> @@ -10852,37 +10962,65 @@ vectorizable_load (vec_info *vinfo,
>        unsigned int n_adjacent_loads = 0;
>        for (j = 0; j < ncopies; j++)
>         {
> -         if (nloads > 1 && !costing_p)
> +         if (nloads > 1 && !strided_load_ok_p && !costing_p)
>             vec_alloc (v, nloads);
>           gimple *new_stmt = NULL;
>           for (i = 0; i < nloads; i++)
>             {
>               if (costing_p)
>                 {
> +                 /* TODO: The vectype in stmt_info/slp_node is potentially
> +                    wrong as we could be using a much smaller vectype
> +                    as determined by vector_vector_composition_type.  */
> +                 if (strided_load_ok_p)
> +                   inside_cost += record_stmt_cost (cost_vec, 1,
> +                                                    vector_gather_load,
> +                                                    slp_node, 0,
> +                                                    vect_body);
>                   /* For VMAT_ELEMENTWISE, just cost it as scalar_load to
>                      avoid ICE, see PR110776.  */
> -                 if (VECTOR_TYPE_P (ltype)
> -                     && memory_access_type != VMAT_ELEMENTWISE)
> +                 else if (VECTOR_TYPE_P (ltype)
> +                          && memory_access_type != VMAT_ELEMENTWISE)
>                     n_adjacent_loads++;
>                   else
>                     inside_cost += record_stmt_cost (cost_vec, 1, scalar_load,
>                                                      slp_node, 0, vect_body);
>                   continue;
>                 }
> -             tree this_off = build_int_cst (TREE_TYPE (alias_off),
> -                                            group_el * elsz + cst_offset);
> -             tree data_ref = build2 (MEM_REF, ltype, running_off, this_off);
> -             vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr));
> -             new_temp = make_ssa_name (ltype);
> -             new_stmt = gimple_build_assign (new_temp, data_ref);
> +
> +             if (!strided_load_ok_p)
> +               {
> +
> +                 tree this_off = build_int_cst (TREE_TYPE (alias_off),
> +                                                group_el * elsz + cst_offset);
> +                 tree data_ref = build2 (MEM_REF, ltype, running_off,
> +                                         this_off);
> +                 vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr));
> +                 new_temp = make_ssa_name (ltype);
> +                 new_stmt = gimple_build_assign (new_temp, data_ref);
> +                 if (nloads > 1)
> +                   CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_temp);
> +               }
> +             else
> +               {
> +                 mask_vectype = truth_type_for (lvectype);
> +                 tree final_mask = build_minus_one_cst (mask_vectype);
> +                 tree bias = build_int_cst (intQI_type_node, 0);
> +                 tree len = size_int (TYPE_VECTOR_SUBPARTS (lvectype));
> +                 tree zero = build_zero_cst (lvectype);
> +                 new_stmt
> +                   = gimple_build_call_internal
> +                   (IFN_MASK_LEN_STRIDED_LOAD, 7, dataref_ptr,
> +                    stride_step_signed, zero, final_mask, vec_els, len, bias);
> +                 new_temp = make_ssa_name (lvectype);
> +                 gimple_set_lhs (new_stmt, new_temp);
> +               }
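
So each group is loaded by a single call of the form (schematic, names as in
the code above):

  new_temp = .MASK_LEN_STRIDED_LOAD (dataref_ptr, stride_step_signed, zero,
                                     final_mask /* all-ones */, vec_els,
                                     len /* TYPE_VECTOR_SUBPARTS */, bias);

i.e. an all-true mask and the full vector length, matching the "We don't use
masking here" comment earlier in the patch.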
>               vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
> -             if (nloads > 1)
> -               CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_temp);
>
>               group_el += lnel;
> -             if (group_el == group_size)
> +             if (group_el >= group_size)
>                 {
> -                 n_groups++;
> +                 n_groups += (group_el / group_size);
>                   /* When doing SLP make sure to not load elements from
>                      the next vector iteration, those will not be accessed
>                      so just use the last element again.  See PR107451.  */
> @@ -10894,6 +11032,10 @@ vectorizable_load (vec_info *vinfo,
>                                                running_off, stride_step);
>                       vect_finish_stmt_generation (vinfo, stmt_info, incr, gsi);
>                       running_off = newoff;
> +                     if (strided_load_ok_p && !costing_p)
> +                       dataref_ptr
> +                         = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, gsi,
> +                                            stmt_info, stride_step_full);
>                     }
>                   group_el = 0;
>                 }
> @@ -10935,7 +11077,7 @@ vectorizable_load (vec_info *vinfo,
>           if (!costing_p)
>             {
>               if (slp_perm)
> -               dr_chain.quick_push (gimple_assign_lhs (new_stmt));
> +               dr_chain.quick_push (gimple_get_lhs (new_stmt));
>               else
>                 slp_node->push_vec_def (new_stmt);
>             }
> --
> 2.49.0
>
