On Fri, Sep 19, 2025 at 11:57 AM Robin Dapp <[email protected]> wrote:
>
> Hi,
>
> Changes from v3:
>
> - Remove load-perm optimization and try to pun whole group.
> - Add slp_perm handling to VMAT_GATHER_SCATTER when using "grouped gather".
>
> The generated code is obviously worse than with v2 for my example because we
> have 2x the number of strided loads and the (non-single-source) load-perm
> gathers throw away half of the loaded data afterwards. But we can still
> improve this in a follow-up.
>
> Bootstrapped and regtested on x86 and power10. Regtested on rv64gcv_zvl512b;
> aarch64 still running.
>
> This patch adds gather/scatter handling for grouped access. The idea is
> to e.g. replace an access (for uint8_t elements) like
> arr[0]
> arr[1]
> arr[2]
> arr[3]
> arr[0 + step]
> arr[1 + step]
> ...
> by a gather load of uint32_t
> arr[0..3]
> arr[0 + step * 1..3 + step * 1]
> arr[0 + step * 2..3 + step * 2]
> ...
> where the offset vector is a simple series with step STEP.
> If supported, such a gather can be implemented as a strided load.
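>
> For illustration, a (hypothetical) C loop with this access pattern - not
> taken from the patch or its testcase, just a minimal sketch - could look
> like:
>
> #include <stdint.h>
>
> /* Each iteration reads a group of four adjacent bytes; consecutive
>    groups are step bytes apart.  With the punning described above the
>    four byte loads per group become a single uint32_t element of a
>    gather, or a strided load with stride step if the target supports
>    it.  */
> uint32_t
> sum_groups (const uint8_t *arr, int step, int n)
> {
>   uint32_t sum = 0;
>   for (int i = 0; i < n; i++)
>     {
>       const uint8_t *p = arr + (long) i * step;
>       sum += p[0] + p[1] + p[2] + p[3];
>     }
>   return sum;
> }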
>
> If we have a masked access the transformation is not performed. This
> could still be done after converting the data back to the original
> vectype but it does not seem worth it for now.
This patch is OK now; I spotted one minor issue:
> - && SLP_TREE_LANES (slp_node) == 1
> - && (!SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()
> - || single_element_p))
> + && SLP_TREE_LANES (slp_node) >= 1)
I think SLP_TREE_LANES is always >= 1, so OK with removing this.
Thanks,
Richard.
> Regards
> Robin
>
> PR target/118019
>
> gcc/ChangeLog:
>
> * internal-fn.cc (get_supported_else_vals): Exit at invalid
> index.
> (internal_strided_fn_supported_p): New function.
> * internal-fn.h (internal_strided_fn_supported_p): Declare.
> * tree-vect-stmts.cc (vector_vector_composition_type):
> Add scalar_ptype_only argument.
> (vect_use_grouped_gather): New function.
> (vect_get_store_rhs): Adjust docs of
> vector_vector_composition_type.
> (get_load_store_type): Try grouped gather.
> (vectorizable_store): Use punned vectype.
> (vectorizable_load): Ditto.
> * tree-vectorizer.h (struct vect_load_store_data): Add punned
> vectype.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/riscv/rvv/autovec/pr118019-2.c: New test.
> ---
> gcc/internal-fn.cc | 22 +-
> gcc/internal-fn.h | 2 +
> .../gcc.target/riscv/rvv/autovec/pr118019-2.c | 50 ++++
> gcc/tree-vect-stmts.cc | 240 ++++++++++++++++--
> gcc/tree-vectorizer.h | 1 +
> 5 files changed, 289 insertions(+), 26 deletions(-)
> create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/pr118019-2.c
>
> diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> index bf2fac81807..db396c69ec5 100644
> --- a/gcc/internal-fn.cc
> +++ b/gcc/internal-fn.cc
> @@ -5234,7 +5234,7 @@ get_supported_else_vals (enum insn_code icode, unsigned else_index,
> vec<int> &else_vals)
> {
> const struct insn_data_d *data = &insn_data[icode];
> - if ((char)else_index >= data->n_operands)
> + if ((int)else_index >= data->n_operands || (int)else_index == -1)
> return;
>
> machine_mode else_mode = data->operand[else_index].mode;
> @@ -5309,6 +5309,26 @@ internal_gather_scatter_fn_supported_p (internal_fn ifn, tree vector_type,
> return ok;
> }
>
> +/* Return true if the target supports a strided load/store function IFN
> + with VECTOR_TYPE. If supported and ELSVALS is nonzero the supported else
> + values will be added to the vector ELSVALS points to. */
> +
> +bool
> +internal_strided_fn_supported_p (internal_fn ifn, tree vector_type,
> + vec<int> *elsvals)
> +{
> + machine_mode mode = TYPE_MODE (vector_type);
> + optab optab = direct_internal_fn_optab (ifn);
> + insn_code icode = direct_optab_handler (optab, mode);
> +
> + bool ok = icode != CODE_FOR_nothing;
> +
> + if (ok && elsvals)
> + get_supported_else_vals (icode, internal_fn_else_index (ifn), *elsvals);
> +
> + return ok;
> +}
> +
> /* Return true if the target supports IFN_CHECK_{RAW,WAR}_PTRS function IFN
> for pointers of type TYPE when the accesses have LENGTH bytes and their
> common byte alignment is ALIGN. */
> diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
> index fd21694dfeb..dcb707251f8 100644
> --- a/gcc/internal-fn.h
> +++ b/gcc/internal-fn.h
> @@ -246,6 +246,8 @@ extern int internal_fn_alias_ptr_index (internal_fn fn);
> extern bool internal_gather_scatter_fn_supported_p (internal_fn, tree,
> tree, tree, int,
> vec<int> * = nullptr);
> +extern bool internal_strided_fn_supported_p (internal_fn, tree,
> + vec<int> * = nullptr);
> extern bool internal_check_ptrs_fn_supported_p (internal_fn, tree,
> poly_uint64, unsigned int);
>
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr118019-2.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr118019-2.c
> new file mode 100644
> index 00000000000..c8c1a7291fb
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr118019-2.c
> @@ -0,0 +1,50 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -march=rv64gcv_zvl512b -mabi=lp64d -mno-vector-strict-align" } */
> +
> +/* Ensure we use strided loads. */
> +
> +typedef unsigned char uint8_t;
> +typedef unsigned short uint16_t;
> +typedef unsigned int uint32_t;
> +
> +#define HADAMARD4(d0, d1, d2, d3, s0, s1, s2, s3) {\
> + int t0 = s0 + s1;\
> + int t1 = s0 - s1;\
> + int t2 = s2 + s3;\
> + int t3 = s2 - s3;\
> + d0 = t0 + t2;\
> + d2 = t0 - t2;\
> + d1 = t1 + t3;\
> + d3 = t1 - t3;\
> +}
> +
> +uint32_t
> +abs2 (uint32_t a)
> +{
> + uint32_t s = ((a >> 15) & 0x10001) * 0xffff;
> + return (a + s) ^ s;
> +}
> +
> +int
> +x264_pixel_satd_8x4 (uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2)
> +{
> + uint32_t tmp[4][4];
> + uint32_t a0, a1, a2, a3;
> + int sum = 0;
> + for (int i = 0; i < 4; i++, pix1 += i_pix1, pix2 += i_pix2)
> + {
> + a0 = (pix1[0] - pix2[0]) + ((pix1[4] - pix2[4]) << 16);
> + a1 = (pix1[1] - pix2[1]) + ((pix1[5] - pix2[5]) << 16);
> + a2 = (pix1[2] - pix2[2]) + ((pix1[6] - pix2[6]) << 16);
> + a3 = (pix1[3] - pix2[3]) + ((pix1[7] - pix2[7]) << 16);
> + HADAMARD4 (tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], a0, a1, a2, a3);
> + }
> + for (int i = 0; i < 4; i++)
> + {
> + HADAMARD4 (a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i]);
> + sum += abs2 (a0) + abs2 (a1) + abs2 (a2) + abs2 (a3);
> + }
> + return (((uint16_t) sum) + ((uint32_t) sum >> 16)) >> 1;
> +}
> +
> +/* { dg-final { scan-assembler-times "vlse64" 8 } } */
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index 3af78f3af84..747659d3fda 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -62,6 +62,9 @@ along with GCC; see the file COPYING3. If not see
> /* For lang_hooks.types.type_for_mode. */
> #include "langhooks.h"
>
> +static tree vector_vector_composition_type (tree, poly_uint64, tree *,
> + bool = false);
> +
> /* Return TRUE iff the given statement is in an inner loop relative to
> the loop being vectorized. */
> bool
> @@ -1723,6 +1726,95 @@ vect_truncate_gather_scatter_offset (stmt_vec_info stmt_info, tree vectype,
> return false;
> }
>
> +/* Return true if we can use gather/scatter or strided internal functions
> + to vectorize STMT_INFO, which is a grouped or strided load or store
> + with multiple lanes and will be implemented by a type-punned access
> + of a vector with element size that matches the number of lanes.
> +
> + MASKED_P is true if load or store is conditional.
> + When returning true, fill in GS_INFO with the information required to
> + perform the operation. Also, store the punning type in PUNNED_VECTYPE.
> +
> + If successful and ELSVALS is nonzero the supported
> + else values will be stored in the vector ELSVALS points to. */
> +
> +static bool
> +vect_use_grouped_gather (dr_vec_info *dr_info, tree vectype,
> + loop_vec_info loop_vinfo, bool masked_p,
> + unsigned int nelts,
> + gather_scatter_info *info, vec<int> *elsvals,
> + tree *pun_vectype)
> +{
> + data_reference *dr = dr_info->dr;
> +
> + /* TODO: We can support nelts > BITS_PER_UNIT or non-power-of-two by
> + multiple gathers/scatter. */
> + if (nelts > BITS_PER_UNIT || !pow2p_hwi (nelts))
> + return false;
> +
> + /* Pun the vectype with one of the same size but an element spanning
> + NELTS elements of VECTYPE.
> + The punned type of a V16QI with NELTS = 4 would be V4SI.
> + */
> + tree tmp;
> + unsigned int pieces;
> + if (!can_div_trunc_p (TYPE_VECTOR_SUBPARTS (vectype), nelts, &pieces)
> + || !pieces)
> + return false;
> +
> + *pun_vectype = vector_vector_composition_type (vectype, pieces, &tmp, true);
> +
> + if (!*pun_vectype || !VECTOR_TYPE_P (*pun_vectype))
> + return false;
> +
> + internal_fn ifn;
> + tree offset_vectype = *pun_vectype;
> +
> + internal_fn strided_ifn = DR_IS_READ (dr)
> + ? IFN_MASK_LEN_STRIDED_LOAD : IFN_MASK_LEN_STRIDED_STORE;
> +
> + /* Check if we have a gather/scatter with the new type. We're just trying
> + with the type itself as offset for now. If not, check if we have a
> + strided load/store. These have fewer constraints (for example no offset
> + type must exist) so it is possible that even though a gather/scatter is
> + not available we still have a strided load/store. */
> + bool ok = false;
> + if (vect_gather_scatter_fn_p
> + (loop_vinfo, DR_IS_READ (dr), masked_p, *pun_vectype,
> + TREE_TYPE (*pun_vectype), *pun_vectype, 1, &ifn,
> + &offset_vectype, elsvals))
> + ok = true;
> + else if (internal_strided_fn_supported_p (strided_ifn, *pun_vectype,
> + elsvals))
> + {
> + /* Use gather/scatter IFNs, vect_get_strided_load_store_ops
> + will switch back to the strided variants. */
> + ifn = DR_IS_READ (dr) ? IFN_MASK_LEN_GATHER_LOAD :
> + IFN_MASK_LEN_SCATTER_STORE;
> + ok = true;
> + }
> +
> + if (ok)
> + {
> + info->ifn = ifn;
> + info->decl = NULL_TREE;
> + info->base = dr->ref;
> + info->alias_ptr = build_int_cst
> + (reference_alias_ptr_type (DR_REF (dr)),
> + get_object_alignment (DR_REF (dr)));
> + info->element_type = TREE_TYPE (*pun_vectype);
> + info->offset_vectype = offset_vectype;
> + /* No need to set the offset, vect_get_strided_load_store_ops
> + will do that. */
> + info->scale = 1;
> + info->memory_type = TREE_TYPE (DR_REF (dr));
> + return true;
> + }
> +
> + return false;
> +}
> +
> +
> /* Return true if we can use gather/scatter internal functions to
> vectorize STMT_INFO, which is a grouped or strided load or store.
> MASKED_P is true if load or store is conditional. When returning
> @@ -1888,12 +1980,14 @@ vect_get_store_rhs (stmt_vec_info stmt_info)
>
> /* Function VECTOR_VECTOR_COMPOSITION_TYPE
>
> - This function returns a vector type which can be composed with NETLS pieces,
> + This function returns a vector type which can be composed with NELTS pieces,
> whose type is recorded in PTYPE. VTYPE should be a vector type, and has the
> same vector size as the return vector. It checks target whether supports
> pieces-size vector mode for construction firstly, if target fails to, check
> pieces-size scalar mode for construction further. It returns NULL_TREE if
> - fails to find the available composition.
> + fails to find the available composition. If the caller only wants scalar
> + pieces where PTYPE e.g. is a possible gather/scatter element type
> + SCALAR_PTYPE_ONLY must be true.
>
> For example, for (vtype=V16QI, nelts=4), we can probably get:
> - V16QI with PTYPE V4QI.
> @@ -1901,7 +1995,8 @@ vect_get_store_rhs (stmt_vec_info stmt_info)
> - NULL_TREE. */
>
> static tree
> -vector_vector_composition_type (tree vtype, poly_uint64 nelts, tree *ptype)
> +vector_vector_composition_type (tree vtype, poly_uint64 nelts, tree *ptype,
> + bool scalar_ptype_only)
> {
> gcc_assert (VECTOR_TYPE_P (vtype));
> gcc_assert (known_gt (nelts, 0U));
> @@ -1927,7 +2022,8 @@ vector_vector_composition_type (tree vtype, poly_uint64 nelts, tree *ptype)
> scalar_mode elmode = SCALAR_TYPE_MODE (TREE_TYPE (vtype));
> poly_uint64 inelts = pbsize / GET_MODE_BITSIZE (elmode);
> machine_mode rmode;
> - if (related_vector_mode (vmode, elmode, inelts).exists (&rmode)
> + if (!scalar_ptype_only
> + && related_vector_mode (vmode, elmode, inelts).exists (&rmode)
> && (convert_optab_handler (vec_init_optab, vmode, rmode)
> != CODE_FOR_nothing))
> {
> @@ -1938,12 +2034,15 @@ vector_vector_composition_type (tree vtype, poly_uint64 nelts, tree *ptype)
> /* Otherwise check if exists an integer type of the same piece size and
> if vec_init optab supports construction from it directly. */
> if (int_mode_for_size (pbsize, 0).exists (&elmode)
> - && related_vector_mode (vmode, elmode, nelts).exists (&rmode)
> - && (convert_optab_handler (vec_init_optab, rmode, elmode)
> - != CODE_FOR_nothing))
> + && related_vector_mode (vmode, elmode, nelts).exists (&rmode))
> {
> - *ptype = build_nonstandard_integer_type (pbsize, 1);
> - return build_vector_type (*ptype, nelts);
> + if (scalar_ptype_only
> + || convert_optab_handler (vec_init_optab, rmode, elmode)
> + != CODE_FOR_nothing)
> + {
> + *ptype = build_nonstandard_integer_type (pbsize, 1);
> + return build_vector_type (*ptype, nelts);
> + }
> }
> }
>
> @@ -1981,6 +2080,7 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
> int *misalignment = &ls->misalignment;
> internal_fn *lanes_ifn = &ls->lanes_ifn;
> vec<int> *elsvals = &ls->elsvals;
> + tree *ls_type = &ls->ls_type;
> loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
> poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
> class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
> @@ -1994,6 +2094,7 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
>
> *misalignment = DR_MISALIGNMENT_UNKNOWN;
> *poffset = 0;
> + *ls_type = NULL_TREE;
>
> if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
> {
> @@ -2318,18 +2419,20 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
> on nearby locations. Or, even if it's a win over scalar code,
> it might not be a win over vectorizing at a lower VF, if that
> allows us to use contiguous accesses. */
> + vect_memory_access_type grouped_gather_fallback = VMAT_UNINITIALIZED;
> if (loop_vinfo
> && (*memory_access_type == VMAT_ELEMENTWISE
> || *memory_access_type == VMAT_STRIDED_SLP)
> && !STMT_VINFO_GATHER_SCATTER_P (stmt_info)
> - && SLP_TREE_LANES (slp_node) == 1
> - && (!SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()
> - || single_element_p))
> + && SLP_TREE_LANES (slp_node) >= 1)
> {
> gather_scatter_info gs_info;
> - if (vect_use_strided_gather_scatters_p (stmt_info, vectype, loop_vinfo,
> - masked_p, &gs_info, elsvals,
> - group_size, single_element_p))
> + if (SLP_TREE_LANES (slp_node) == 1
> + && (!SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()
> + || single_element_p)
> + && vect_use_strided_gather_scatters_p (stmt_info, vectype, loop_vinfo,
> + masked_p, &gs_info, elsvals,
> + group_size, single_element_p))
> {
> SLP_TREE_GS_SCALE (slp_node) = gs_info.scale;
> SLP_TREE_GS_BASE (slp_node) = error_mark_node;
> @@ -2337,6 +2440,20 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
> ls->strided_offset_vectype = gs_info.offset_vectype;
> *memory_access_type = VMAT_GATHER_SCATTER_IFN;
> }
> + else if (SLP_TREE_LANES (slp_node) > 1
> + && !masked_p
> + && !single_element_p
> + && vect_use_grouped_gather (STMT_VINFO_DR_INFO (stmt_info),
> + vectype, loop_vinfo,
> + masked_p, group_size,
> + &gs_info, elsvals, ls_type))
> + {
> + grouped_gather_fallback = *memory_access_type;
> + *memory_access_type = VMAT_GATHER_SCATTER_IFN;
> + ls->gs.ifn = gs_info.ifn;
> + vectype = *ls_type;
> + ls->strided_offset_vectype = gs_info.offset_vectype;
> + }
> }
>
> if (*memory_access_type == VMAT_CONTIGUOUS_DOWN
> @@ -2362,6 +2479,18 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
> = vect_supportable_dr_alignment
> (vinfo, first_dr_info, vectype, *misalignment,
> mat_gather_scatter_p (*memory_access_type));
> + if (grouped_gather_fallback != VMAT_UNINITIALIZED
> + && *alignment_support_scheme != dr_aligned
> + && *alignment_support_scheme != dr_unaligned_supported)
> + {
> + /* No supportable alignment for a grouped gather, fall back to the
> + original memory access type. Even though VMAT_STRIDED_SLP might
> + also try aligned vector loads it can still choose vector
> + construction from scalars. */
> + *memory_access_type = grouped_gather_fallback;
> + *alignment_support_scheme = dr_unaligned_supported;
> + *misalignment = DR_MISALIGNMENT_UNKNOWN;
> + }
> }
>
> if (overrun_p)
> @@ -8368,10 +8497,13 @@ vectorizable_store (vec_info *vinfo,
> {
> aggr_type = elem_type;
> if (!costing_p)
> - vect_get_strided_load_store_ops (stmt_info, slp_node, vectype,
> - ls.strided_offset_vectype,
> - loop_vinfo, gsi,
> - &bump, &vec_offset, loop_lens);
> + {
> + tree vtype = ls.ls_type ? ls.ls_type : vectype;
> + vect_get_strided_load_store_ops (stmt_info, slp_node, vtype,
> + ls.strided_offset_vectype,
> + loop_vinfo, gsi,
> + &bump, &vec_offset, loop_lens);
> + }
> }
> else
> {
> @@ -8557,7 +8689,9 @@ vectorizable_store (vec_info *vinfo,
>
> if (mat_gather_scatter_p (memory_access_type))
> {
> - gcc_assert (!grouped_store);
> + gcc_assert (!grouped_store || ls.ls_type);
> + if (ls.ls_type)
> + vectype = ls.ls_type;
> auto_vec<tree> vec_offsets;
> unsigned int inside_cost = 0, prologue_cost = 0;
> int num_stmts = vec_num;
> @@ -8604,8 +8738,9 @@ vectorizable_store (vec_info *vinfo,
> if (mask_node)
> vec_mask = vec_masks[j];
> /* We should have catched mismatched types earlier. */
> - gcc_assert (useless_type_conversion_p (vectype,
> - TREE_TYPE (vec_oprnd)));
> + gcc_assert (ls.ls_type
> + || useless_type_conversion_p
> + (vectype, TREE_TYPE (vec_oprnd)));
> }
> tree final_mask = NULL_TREE;
> tree final_len = NULL_TREE;
> @@ -8658,6 +8793,18 @@ vectorizable_store (vec_info *vinfo,
> }
> }
>
> + if (ls.ls_type)
> + {
> + gimple *conv_stmt
> + = gimple_build_assign (make_ssa_name (vectype),
> + VIEW_CONVERT_EXPR,
> + build1 (VIEW_CONVERT_EXPR, vectype,
> + vec_oprnd));
> + vect_finish_stmt_generation (vinfo, stmt_info, conv_stmt,
> + gsi);
> + vec_oprnd = gimple_get_lhs (conv_stmt);
> + }
> +
> gcall *call;
> if (final_len && final_mask)
> {
> @@ -10029,7 +10176,8 @@ vectorizable_load (vec_info *vinfo,
> return true;
> }
>
> - if (mat_gather_scatter_p (memory_access_type))
> + if (mat_gather_scatter_p (memory_access_type)
> + && !ls.ls_type)
> grouped_load = false;
>
> if (grouped_load
> @@ -10088,6 +10236,7 @@ vectorizable_load (vec_info *vinfo,
> vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
> group_gap_adj = group_size - scalar_lanes;
> }
> + dr_chain.create (vec_num);
>
> ref_type = get_group_alias_ptr_type (first_stmt_info);
> }
> @@ -10422,7 +10571,14 @@ vectorizable_load (vec_info *vinfo,
>
> if (mat_gather_scatter_p (memory_access_type))
> {
> - gcc_assert (!grouped_load && !slp_perm);
> + gcc_assert ((!grouped_load && !slp_perm) || ls.ls_type);
> +
> + /* If we pun the original vectype the loads as well as costing, length,
> + etc. is performed with the new type. After loading we VIEW_CONVERT
> + the data to the original vectype. */
> + tree original_vectype = vectype;
> + if (ls.ls_type)
> + vectype = ls.ls_type;
>
> /* 1. Create the vector or array pointer update chain. */
> if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> @@ -10763,8 +10919,42 @@ vectorizable_load (vec_info *vinfo,
> new_temp = new_temp2;
> }
>
> + if (ls.ls_type)
> + {
> + new_stmt = gimple_build_assign (make_ssa_name
> + (original_vectype),
> + VIEW_CONVERT_EXPR,
> + build1 (VIEW_CONVERT_EXPR,
> + original_vectype,
> + new_temp));
> + vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
> + }
> +
> /* Store vector loads in the corresponding SLP_NODE. */
> - slp_node->push_vec_def (new_stmt);
> + if (!costing_p)
> + {
> + if (slp_perm)
> + dr_chain.quick_push (gimple_assign_lhs (new_stmt));
> + else
> + slp_node->push_vec_def (new_stmt);
> + }
> + }
> +
> + if (slp_perm)
> + {
> + if (costing_p)
> + {
> + gcc_assert (ls.n_perms != -1U);
> + inside_cost += record_stmt_cost (cost_vec, ls.n_perms, vec_perm,
> + slp_node, 0, vect_body);
> + }
> + else
> + {
> + unsigned n_perms2;
> + vect_transform_slp_perm_load (vinfo, slp_node, dr_chain, gsi, vf,
> + false, &n_perms2);
> + gcc_assert (ls.n_perms == n_perms2);
> + }
> }
>
> if (costing_p && dump_enabled_p ())
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> index faa72841d3c..f3357d1d1b1 100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -288,6 +288,7 @@ struct vect_load_store_data : vect_data {
> tree decl; // VMAT_GATHER_SCATTER_DECL
> } gs;
> tree strided_offset_vectype; // VMAT_GATHER_SCATTER_IFN, originally strided
> + tree ls_type; // VMAT_GATHER_SCATTER_IFN
> auto_vec<int> elsvals;
> unsigned n_perms; // SLP_TREE_LOAD_PERMUTATION
> };
> --
> 2.51.0
>