On Fri, Sep 19, 2025 at 11:57 AM Robin Dapp <[email protected]> wrote:
>
> Hi,
>
> Changes from v3:
>
>  - Remove load-perm optimization and try to pun whole group.
>  - Add slp_perm handling to VMAT_GATHER_SCATTER when using "grouped gather".
>
> The generated code is obviously worse than with v2 for my example because we
> have 2x the number of strided loads and the (non-single-source) load-perm
> gathers throw away half of the loaded data afterwards.  But we can still
> improve this in a follow-up.
>
> Bootstrapped and regtested on x86 and power10.  Regtested on rv64gcv_zvl512b;
> aarch64 regtest still running.
>
> This patch adds gather/scatter handling for grouped access.  The idea is
> to e.g. replace an access (for uint8_t elements) like
>   arr[0]
>   arr[1]
>   arr[2]
>   arr[3]
>   arr[0 + step]
>   arr[1 + step]
>   ...
> by a gather load of uint32_t
>   arr[0..3]
>   arr[0 + step * 1..3 + step * 1]
>   arr[0 + step * 2..3 + step * 2]
>   ...
> where the offset vector is a simple series with step STEP.
> If supported, such a gather can be implemented as a strided load.
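>
> As a rough illustration (a hypothetical function, not taken from the patch),
> the kind of scalar access pattern targeted here, with a group size of 4 and a
> runtime stride, looks like:
>
>   #include <stdint.h>
>
>   void
>   f (uint32_t *restrict dst, uint8_t *restrict arr, int step, int n)
>   {
>     for (int i = 0; i < n; i++)
>       /* Four adjacent uint8_t lanes per iteration; the whole group can be
>          punned to a single uint32_t element and fetched by a strided load
>          whose byte stride is STEP.  */
>       dst[i] = arr[i * step] + arr[i * step + 1]
>                + arr[i * step + 2] + arr[i * step + 3];
>   }
>
> On RISC-V a gather of such punned elements can end up as a vlse strided load,
> which is what the new testcase below scans for.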
>
> If we have a masked access the transformation is not performed.  This
> could still be done after converting the data back to the original
> vectype but it does not seem worth it for now.

This patch is OK now; I spotted one minor issue:

> -      && SLP_TREE_LANES (slp_node) == 1
> -      && (!SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()
> -         || single_element_p))
> +      && SLP_TREE_LANES (slp_node) >= 1)

I think SLP_TREE_LANES is always >= 1, so OK with removing this.

Thanks,
Richard.

> Regards
>  Robin
>
>         PR target/118019
>
> gcc/ChangeLog:
>
>         * internal-fn.cc (get_supported_else_vals): Exit at invalid
>         index.
>         (internal_strided_fn_supported_p): New function.
>         * internal-fn.h (internal_strided_fn_supported_p): Declare.
>         * tree-vect-stmts.cc (vector_vector_composition_type):
>         Add vector_only argument.
>         (vect_use_grouped_gather): New function.
>         (vect_get_store_rhs): Adjust docs of
>         vector_vector_composition_type.
>         (get_load_store_type): Try grouped gather.
>         (vectorizable_store): Use punned vectype.
>         (vectorizable_load): Ditto.
>         * tree-vectorizer.h (struct vect_load_store_data): Add punned
>         vectype.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/riscv/rvv/autovec/pr118019-2.c: New test.
> ---
>  gcc/internal-fn.cc                            |  22 +-
>  gcc/internal-fn.h                             |   2 +
>  .../gcc.target/riscv/rvv/autovec/pr118019-2.c |  50 ++++
>  gcc/tree-vect-stmts.cc                        | 240 ++++++++++++++++--
>  gcc/tree-vectorizer.h                         |   1 +
>  5 files changed, 289 insertions(+), 26 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/pr118019-2.c
>
> diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> index bf2fac81807..db396c69ec5 100644
> --- a/gcc/internal-fn.cc
> +++ b/gcc/internal-fn.cc
> @@ -5234,7 +5234,7 @@ get_supported_else_vals (enum insn_code icode, unsigned else_index,
>                          vec<int> &else_vals)
>  {
>    const struct insn_data_d *data = &insn_data[icode];
> -  if ((char)else_index >= data->n_operands)
> +  if ((int)else_index >= data->n_operands || (int)else_index == -1)
>      return;
>
>    machine_mode else_mode = data->operand[else_index].mode;
> @@ -5309,6 +5309,26 @@ internal_gather_scatter_fn_supported_p (internal_fn ifn, tree vector_type,
>    return ok;
>  }
>
> +/* Return true if the target supports a strided load/store function IFN
> +   with VECTOR_TYPE.  If supported and ELSVALS is nonzero the supported else
> +   values will be added to the vector ELSVALS points to.  */
> +
> +bool
> +internal_strided_fn_supported_p (internal_fn ifn, tree vector_type,
> +                                vec<int> *elsvals)
> +{
> +  machine_mode mode = TYPE_MODE (vector_type);
> +  optab optab = direct_internal_fn_optab (ifn);
> +  insn_code icode = direct_optab_handler (optab, mode);
> +
> +  bool ok = icode != CODE_FOR_nothing;
> +
> +  if (ok && elsvals)
> +    get_supported_else_vals (icode, internal_fn_else_index (ifn), *elsvals);
> +
> +  return ok;
> +}
> +
>  /* Return true if the target supports IFN_CHECK_{RAW,WAR}_PTRS function IFN
>     for pointers of type TYPE when the accesses have LENGTH bytes and their
>     common byte alignment is ALIGN.  */
> diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
> index fd21694dfeb..dcb707251f8 100644
> --- a/gcc/internal-fn.h
> +++ b/gcc/internal-fn.h
> @@ -246,6 +246,8 @@ extern int internal_fn_alias_ptr_index (internal_fn fn);
>  extern bool internal_gather_scatter_fn_supported_p (internal_fn, tree,
>                                                     tree, tree, int,
>                                                     vec<int> * = nullptr);
> +extern bool internal_strided_fn_supported_p (internal_fn, tree,
> +                                             vec<int> * = nullptr);
>  extern bool internal_check_ptrs_fn_supported_p (internal_fn, tree,
>                                                 poly_uint64, unsigned int);
>
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr118019-2.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr118019-2.c
> new file mode 100644
> index 00000000000..c8c1a7291fb
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr118019-2.c
> @@ -0,0 +1,50 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -march=rv64gcv_zvl512b -mabi=lp64d -mno-vector-strict-align" } */
> +
> +/* Ensure we use strided loads.  */
> +
> +typedef unsigned char uint8_t;
> +typedef unsigned short uint16_t;
> +typedef unsigned int uint32_t;
> +
> +#define HADAMARD4(d0, d1, d2, d3, s0, s1, s2, s3) {\
> +    int t0 = s0 + s1;\
> +    int t1 = s0 - s1;\
> +    int t2 = s2 + s3;\
> +    int t3 = s2 - s3;\
> +    d0 = t0 + t2;\
> +    d2 = t0 - t2;\
> +    d1 = t1 + t3;\
> +    d3 = t1 - t3;\
> +}
> +
> +uint32_t
> +abs2 (uint32_t a)
> +{
> +  uint32_t s = ((a >> 15) & 0x10001) * 0xffff;
> +  return (a + s) ^ s;
> +}
> +
> +int
> +x264_pixel_satd_8x4 (uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2)
> +{
> +  uint32_t tmp[4][4];
> +  uint32_t a0, a1, a2, a3;
> +  int sum = 0;
> +  for (int i = 0; i < 4; i++, pix1 += i_pix1, pix2 += i_pix2)
> +    {
> +      a0 = (pix1[0] - pix2[0]) + ((pix1[4] - pix2[4]) << 16);
> +      a1 = (pix1[1] - pix2[1]) + ((pix1[5] - pix2[5]) << 16);
> +      a2 = (pix1[2] - pix2[2]) + ((pix1[6] - pix2[6]) << 16);
> +      a3 = (pix1[3] - pix2[3]) + ((pix1[7] - pix2[7]) << 16);
> +      HADAMARD4 (tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], a0, a1, a2, a3);
> +    }
> +  for (int i = 0; i < 4; i++)
> +    {
> +      HADAMARD4 (a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i]);
> +      sum += abs2 (a0) + abs2 (a1) + abs2 (a2) + abs2 (a3);
> +    }
> +  return (((uint16_t) sum) + ((uint32_t) sum >> 16)) >> 1;
> +}
> +
> +/* { dg-final { scan-assembler-times "vlse64" 8 } } */
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index 3af78f3af84..747659d3fda 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -62,6 +62,9 @@ along with GCC; see the file COPYING3.  If not see
>  /* For lang_hooks.types.type_for_mode.  */
>  #include "langhooks.h"
>
> +static tree vector_vector_composition_type (tree, poly_uint64, tree *,
> +                                           bool = false);
> +
>  /* Return TRUE iff the given statement is in an inner loop relative to
>     the loop being vectorized.  */
>  bool
> @@ -1723,6 +1726,95 @@ vect_truncate_gather_scatter_offset (stmt_vec_info stmt_info, tree vectype,
>    return false;
>  }
>
> +/* Return true if we can use gather/scatter or strided internal functions
> +   to vectorize STMT_INFO, which is a grouped or strided load or store
> +   with multiple lanes and will be implemented by a type-punned access
> +   of a vector with element size that matches the number of lanes.
> +
> +   MASKED_P is true if load or store is conditional.
> +   When returning true, fill in GS_INFO with the information required to
> +   perform the operation.  Also, store the punning type in PUNNED_VECTYPE.
> +
> +   If successful and ELSVALS is nonzero the supported
> +   else values will be stored in the vector ELSVALS points to.  */
> +
> +static bool
> +vect_use_grouped_gather (dr_vec_info *dr_info, tree vectype,
> +                        loop_vec_info loop_vinfo, bool masked_p,
> +                        unsigned int nelts,
> +                        gather_scatter_info *info, vec<int> *elsvals,
> +                        tree *pun_vectype)
> +{
> +  data_reference *dr = dr_info->dr;
> +
> +  /* TODO: We can support nelts > BITS_PER_UNIT or non-power-of-two by
> +     multiple gathers/scatter.  */
> +  if (nelts > BITS_PER_UNIT || !pow2p_hwi (nelts))
> +    return false;
> +
> +  /* Pun the vectype with one of the same size but an element spanning
> +     NELTS elements of VECTYPE.
> +     The punned type of a V16QI with NELTS = 4 would be V4SI.
> +     */
> +  tree tmp;
> +  unsigned int pieces;
> +  if (!can_div_trunc_p (TYPE_VECTOR_SUBPARTS (vectype), nelts, &pieces)
> +      || !pieces)
> +    return false;
> +
> +  *pun_vectype = vector_vector_composition_type (vectype, pieces, &tmp, true);
> +
> +  if (!*pun_vectype || !VECTOR_TYPE_P (*pun_vectype))
> +    return false;
> +
> +  internal_fn ifn;
> +  tree offset_vectype = *pun_vectype;
> +
> +  internal_fn strided_ifn = DR_IS_READ (dr)
> +    ? IFN_MASK_LEN_STRIDED_LOAD : IFN_MASK_LEN_STRIDED_STORE;
> +
> +  /* Check if we have a gather/scatter with the new type.  We're just trying
> +     with the type itself as offset for now.  If not, check if we have a
> +     strided load/store.  These have fewer constraints (for example no offset
> +     type must exist) so it is possible that even though a gather/scatter is
> +     not available we still have a strided load/store.  */
> +  bool ok = false;
> +  if (vect_gather_scatter_fn_p
> +      (loop_vinfo, DR_IS_READ (dr), masked_p, *pun_vectype,
> +       TREE_TYPE (*pun_vectype), *pun_vectype, 1, &ifn,
> +       &offset_vectype, elsvals))
> +    ok = true;
> +  else if (internal_strided_fn_supported_p (strided_ifn, *pun_vectype,
> +                                           elsvals))
> +    {
> +      /* Use gather/scatter IFNs, vect_get_strided_load_store_ops
> +        will switch back to the strided variants.  */
> +      ifn = DR_IS_READ (dr) ? IFN_MASK_LEN_GATHER_LOAD :
> +       IFN_MASK_LEN_SCATTER_STORE;
> +      ok = true;
> +    }
> +
> +  if (ok)
> +    {
> +      info->ifn = ifn;
> +      info->decl = NULL_TREE;
> +      info->base = dr->ref;
> +      info->alias_ptr = build_int_cst
> +       (reference_alias_ptr_type (DR_REF (dr)),
> +        get_object_alignment (DR_REF (dr)));
> +      info->element_type = TREE_TYPE (*pun_vectype);
> +      info->offset_vectype = offset_vectype;
> +      /* No need to set the offset, vect_get_strided_load_store_ops
> +        will do that.  */
> +      info->scale = 1;
> +      info->memory_type = TREE_TYPE (DR_REF (dr));
> +      return true;
> +    }
> +
> +  return false;
> +}
> +
> +
>  /* Return true if we can use gather/scatter internal functions to
>     vectorize STMT_INFO, which is a grouped or strided load or store.
>     MASKED_P is true if load or store is conditional.  When returning
> @@ -1888,12 +1980,14 @@ vect_get_store_rhs (stmt_vec_info stmt_info)
>
>  /* Function VECTOR_VECTOR_COMPOSITION_TYPE
>
> -   This function returns a vector type which can be composed with NETLS pieces,
> +   This function returns a vector type which can be composed with NELTS pieces,
>     whose type is recorded in PTYPE.  VTYPE should be a vector type, and has the
>     same vector size as the return vector.  It checks target whether supports
>     pieces-size vector mode for construction firstly, if target fails to, check
>     pieces-size scalar mode for construction further.  It returns NULL_TREE if
> -   fails to find the available composition.
> +   fails to find the available composition.  If the caller only wants scalar
> +   pieces where PTYPE e.g. is a possible gather/scatter element type
> +   SCALAR_PTYPE_ONLY must be true.
>
>     For example, for (vtype=V16QI, nelts=4), we can probably get:
>       - V16QI with PTYPE V4QI.
> @@ -1901,7 +1995,8 @@ vect_get_store_rhs (stmt_vec_info stmt_info)
>       - NULL_TREE.  */
>
>  static tree
> -vector_vector_composition_type (tree vtype, poly_uint64 nelts, tree *ptype)
> +vector_vector_composition_type (tree vtype, poly_uint64 nelts, tree *ptype,
> +                               bool scalar_ptype_only)
>  {
>    gcc_assert (VECTOR_TYPE_P (vtype));
>    gcc_assert (known_gt (nelts, 0U));
> @@ -1927,7 +2022,8 @@ vector_vector_composition_type (tree vtype, poly_uint64 nelts, tree *ptype)
>        scalar_mode elmode = SCALAR_TYPE_MODE (TREE_TYPE (vtype));
>        poly_uint64 inelts = pbsize / GET_MODE_BITSIZE (elmode);
>        machine_mode rmode;
> -      if (related_vector_mode (vmode, elmode, inelts).exists (&rmode)
> +      if (!scalar_ptype_only
> +         && related_vector_mode (vmode, elmode, inelts).exists (&rmode)
>           && (convert_optab_handler (vec_init_optab, vmode, rmode)
>               != CODE_FOR_nothing))
>         {
> @@ -1938,12 +2034,15 @@ vector_vector_composition_type (tree vtype, poly_uint64 nelts, tree *ptype)
>        /* Otherwise check if exists an integer type of the same piece size and
>          if vec_init optab supports construction from it directly.  */
>        if (int_mode_for_size (pbsize, 0).exists (&elmode)
> -         && related_vector_mode (vmode, elmode, nelts).exists (&rmode)
> -         && (convert_optab_handler (vec_init_optab, rmode, elmode)
> -             != CODE_FOR_nothing))
> +         && related_vector_mode (vmode, elmode, nelts).exists (&rmode))
>         {
> -         *ptype = build_nonstandard_integer_type (pbsize, 1);
> -         return build_vector_type (*ptype, nelts);
> +         if (scalar_ptype_only
> +             || convert_optab_handler (vec_init_optab, rmode, elmode)
> +             != CODE_FOR_nothing)
> +           {
> +             *ptype = build_nonstandard_integer_type (pbsize, 1);
> +             return build_vector_type (*ptype, nelts);
> +           }
>         }
>      }
>
> @@ -1981,6 +2080,7 @@ get_load_store_type (vec_info  *vinfo, stmt_vec_info stmt_info,
>    int *misalignment = &ls->misalignment;
>    internal_fn *lanes_ifn = &ls->lanes_ifn;
>    vec<int> *elsvals = &ls->elsvals;
> +  tree *ls_type = &ls->ls_type;
>    loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
>    poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
>    class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
> @@ -1994,6 +2094,7 @@ get_load_store_type (vec_info  *vinfo, stmt_vec_info stmt_info,
>
>    *misalignment = DR_MISALIGNMENT_UNKNOWN;
>    *poffset = 0;
> +  *ls_type = NULL_TREE;
>
>    if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
>      {
> @@ -2318,18 +2419,20 @@ get_load_store_type (vec_info  *vinfo, stmt_vec_info stmt_info,
>       on nearby locations.  Or, even if it's a win over scalar code,
>       it might not be a win over vectorizing at a lower VF, if that
>       allows us to use contiguous accesses.  */
> +  vect_memory_access_type grouped_gather_fallback = VMAT_UNINITIALIZED;
>    if (loop_vinfo
>        && (*memory_access_type == VMAT_ELEMENTWISE
>           || *memory_access_type == VMAT_STRIDED_SLP)
>        && !STMT_VINFO_GATHER_SCATTER_P (stmt_info)
> -      && SLP_TREE_LANES (slp_node) == 1
> -      && (!SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()
> -         || single_element_p))
> +      && SLP_TREE_LANES (slp_node) >= 1)
>      {
>        gather_scatter_info gs_info;
> -      if (vect_use_strided_gather_scatters_p (stmt_info, vectype, loop_vinfo,
> -                                             masked_p, &gs_info, elsvals,
> -                                             group_size, single_element_p))
> +      if (SLP_TREE_LANES (slp_node) == 1
> +         && (!SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()
> +             || single_element_p)
> +         && vect_use_strided_gather_scatters_p (stmt_info, vectype, loop_vinfo,
> +                                                masked_p, &gs_info, elsvals,
> +                                                group_size, single_element_p))
>         {
>           SLP_TREE_GS_SCALE (slp_node) = gs_info.scale;
>           SLP_TREE_GS_BASE (slp_node) = error_mark_node;
> @@ -2337,6 +2440,20 @@ get_load_store_type (vec_info  *vinfo, stmt_vec_info stmt_info,
>           ls->strided_offset_vectype = gs_info.offset_vectype;
>           *memory_access_type = VMAT_GATHER_SCATTER_IFN;
>         }
> +      else if (SLP_TREE_LANES (slp_node) > 1
> +              && !masked_p
> +              && !single_element_p
> +              && vect_use_grouped_gather (STMT_VINFO_DR_INFO (stmt_info),
> +                                          vectype, loop_vinfo,
> +                                          masked_p, group_size,
> +                                          &gs_info, elsvals, ls_type))
> +       {
> +         grouped_gather_fallback = *memory_access_type;
> +         *memory_access_type = VMAT_GATHER_SCATTER_IFN;
> +         ls->gs.ifn = gs_info.ifn;
> +         vectype = *ls_type;
> +         ls->strided_offset_vectype = gs_info.offset_vectype;
> +       }
>      }
>
>    if (*memory_access_type == VMAT_CONTIGUOUS_DOWN
> @@ -2362,6 +2479,18 @@ get_load_store_type (vec_info  *vinfo, stmt_vec_info stmt_info,
>         = vect_supportable_dr_alignment
>            (vinfo, first_dr_info, vectype, *misalignment,
>             mat_gather_scatter_p (*memory_access_type));
> +      if (grouped_gather_fallback != VMAT_UNINITIALIZED
> +         && *alignment_support_scheme != dr_aligned
> +         && *alignment_support_scheme != dr_unaligned_supported)
> +       {
> +         /* No supportable alignment for a grouped gather, fall back to the
> +            original memory access type.  Even though VMAT_STRIDED_SLP might
> +            also try aligned vector loads it can still choose vector
> +            construction from scalars.  */
> +         *memory_access_type = grouped_gather_fallback;
> +         *alignment_support_scheme = dr_unaligned_supported;
> +         *misalignment = DR_MISALIGNMENT_UNKNOWN;
> +       }
>      }
>
>    if (overrun_p)
> @@ -8368,10 +8497,13 @@ vectorizable_store (vec_info *vinfo,
>      {
>        aggr_type = elem_type;
>        if (!costing_p)
> -       vect_get_strided_load_store_ops (stmt_info, slp_node, vectype,
> -                                        ls.strided_offset_vectype,
> -                                        loop_vinfo, gsi,
> -                                        &bump, &vec_offset, loop_lens);
> +       {
> +         tree vtype = ls.ls_type ? ls.ls_type : vectype;
> +         vect_get_strided_load_store_ops (stmt_info, slp_node, vtype,
> +                                          ls.strided_offset_vectype,
> +                                          loop_vinfo, gsi,
> +                                          &bump, &vec_offset, loop_lens);
> +       }
>      }
>    else
>      {
> @@ -8557,7 +8689,9 @@ vectorizable_store (vec_info *vinfo,
>
>    if (mat_gather_scatter_p (memory_access_type))
>      {
> -      gcc_assert (!grouped_store);
> +      gcc_assert (!grouped_store || ls.ls_type);
> +      if (ls.ls_type)
> +       vectype = ls.ls_type;
>        auto_vec<tree> vec_offsets;
>        unsigned int inside_cost = 0, prologue_cost = 0;
>        int num_stmts = vec_num;
> @@ -8604,8 +8738,9 @@ vectorizable_store (vec_info *vinfo,
>               if (mask_node)
>                 vec_mask = vec_masks[j];
>               /* We should have catched mismatched types earlier.  */
> -             gcc_assert (useless_type_conversion_p (vectype,
> -                                                    TREE_TYPE (vec_oprnd)));
> +             gcc_assert (ls.ls_type
> +                         || useless_type_conversion_p
> +                         (vectype, TREE_TYPE (vec_oprnd)));
>             }
>           tree final_mask = NULL_TREE;
>           tree final_len = NULL_TREE;
> @@ -8658,6 +8793,18 @@ vectorizable_store (vec_info *vinfo,
>                     }
>                 }
>
> +             if (ls.ls_type)
> +               {
> +                 gimple *conv_stmt
> +                   = gimple_build_assign (make_ssa_name (vectype),
> +                                          VIEW_CONVERT_EXPR,
> +                                          build1 (VIEW_CONVERT_EXPR, vectype,
> +                                                  vec_oprnd));
> +                 vect_finish_stmt_generation (vinfo, stmt_info, conv_stmt,
> +                                              gsi);
> +                 vec_oprnd = gimple_get_lhs (conv_stmt);
> +               }
> +
>               gcall *call;
>               if (final_len && final_mask)
>                 {
> @@ -10029,7 +10176,8 @@ vectorizable_load (vec_info *vinfo,
>        return true;
>      }
>
> -  if (mat_gather_scatter_p (memory_access_type))
> +  if (mat_gather_scatter_p (memory_access_type)
> +      && !ls.ls_type)
>      grouped_load = false;
>
>    if (grouped_load
> @@ -10088,6 +10236,7 @@ vectorizable_load (vec_info *vinfo,
>           vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
>           group_gap_adj = group_size - scalar_lanes;
>         }
> +      dr_chain.create (vec_num);
>
>        ref_type = get_group_alias_ptr_type (first_stmt_info);
>      }
> @@ -10422,7 +10571,14 @@ vectorizable_load (vec_info *vinfo,
>
>    if (mat_gather_scatter_p (memory_access_type))
>      {
> -      gcc_assert (!grouped_load && !slp_perm);
> +      gcc_assert ((!grouped_load && !slp_perm) || ls.ls_type);
> +
> +      /* If we pun the original vectype the loads as well as costing, length,
> +        etc. is performed with the new type.  After loading we VIEW_CONVERT
> +        the data to the original vectype.  */
> +      tree original_vectype = vectype;
> +      if (ls.ls_type)
> +       vectype = ls.ls_type;
>
>        /* 1. Create the vector or array pointer update chain.  */
>        if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> @@ -10763,8 +10919,42 @@ vectorizable_load (vec_info *vinfo,
>               new_temp = new_temp2;
>             }
>
> +         if (ls.ls_type)
> +           {
> +             new_stmt = gimple_build_assign (make_ssa_name
> +                                             (original_vectype),
> +                                             VIEW_CONVERT_EXPR,
> +                                             build1 (VIEW_CONVERT_EXPR,
> +                                                     original_vectype,
> +                                                     new_temp));
> +             vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
> +           }
> +
>           /* Store vector loads in the corresponding SLP_NODE.  */
> -         slp_node->push_vec_def (new_stmt);
> +         if (!costing_p)
> +           {
> +             if (slp_perm)
> +               dr_chain.quick_push (gimple_assign_lhs (new_stmt));
> +             else
> +               slp_node->push_vec_def (new_stmt);
> +           }
> +       }
> +
> +      if (slp_perm)
> +       {
> +         if (costing_p)
> +           {
> +             gcc_assert (ls.n_perms != -1U);
> +             inside_cost += record_stmt_cost (cost_vec, ls.n_perms, vec_perm,
> +                                              slp_node, 0, vect_body);
> +           }
> +         else
> +           {
> +             unsigned n_perms2;
> +             vect_transform_slp_perm_load (vinfo, slp_node, dr_chain, gsi, vf,
> +                                           false, &n_perms2);
> +             gcc_assert (ls.n_perms == n_perms2);
> +           }
>         }
>
>        if (costing_p && dump_enabled_p ())
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> index faa72841d3c..f3357d1d1b1 100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -288,6 +288,7 @@ struct vect_load_store_data : vect_data {
>        tree decl;       // VMAT_GATHER_SCATTER_DECL
>    } gs;
>    tree strided_offset_vectype; // VMAT_GATHER_SCATTER_IFN, originally strided
> +  tree ls_type; // VMAT_GATHER_SCATTER_IFN
>    auto_vec<int> elsvals;
>    unsigned n_perms; // SLP_TREE_LOAD_PERMUTATION
>  };
> --
> 2.51.0
>
