On Tue, May 27, 2025 at 6:04 PM Robin Dapp <rdapp....@gmail.com> wrote: > > From: Robin Dapp <rdapp....@gmail.com> > > This patch enables strided loads for VMAT_STRIDED_SLP. Instead of > building vectors from scalars or other vectors we can use strided loads > directly when applicable. > > The current implementation limits strided loads to cases where we can > load entire groups and not subsets of them. A future improvement would > be to e.g. load a group of three uint8_t > > g0 g1 g2, g0 + stride g1 + stride g2 + stride, ... > > by > > vlse16 vlse8 > > and permute those into place (after re-interpreting as vector of > uint8_t). > > For satd_8x4 in particular we can do even better by eliding the strided > SLP load permutations, essentially turning > > vlse64 v0, (a0) > vlse64 v1, (a1) > VEC_PERM_EXPR <v0, v1, { 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, > 26, 27 }>; > VEC_PERM_EXPR <v0, v1, { 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, > 30, 31 }>; > > into > > vlse32 v0, (a0) > vlse32 v1, (a1) > vlse32 v0, 4(a0) > vlse32 v1, 4(a1) > > but that is going to be a follow up. > > Bootstrapped and regtested on x86, aarch64, and power10. > Regtested on rv64gcv_zvl512b. I'm seeing one additional failure in > gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-7.c > where we use a larger LMUL than we should but IMHO this can wait. > > PR target/118109 > > gcc/ChangeLog: > > * internal-fn.cc (internal_strided_fn_supported_p): New > function. > * internal-fn.h (internal_strided_fn_supported_p): Declare. > * tree-vect-stmts.cc (vect_supportable_strided_type): New > function. > (vectorizable_load): Add strided-load support for strided > groups. > > gcc/testsuite/ChangeLog: > > * gcc.target/riscv/rvv/autovec/pr118019-2.c: New test. > --- > gcc/internal-fn.cc | 21 ++ > gcc/internal-fn.h | 2 + > .../gcc.target/riscv/rvv/autovec/pr118019-2.c | 51 +++++ > gcc/tree-vect-stmts.cc | 196 +++++++++++++++--- > 4 files changed, 243 insertions(+), 27 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/pr118019-2.c > > diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc > index 6b04443f7cd..aec90ef87cc 100644 > --- a/gcc/internal-fn.cc > +++ b/gcc/internal-fn.cc > @@ -5203,6 +5203,27 @@ internal_gather_scatter_fn_supported_p (internal_fn > ifn, tree vector_type, > return ok; > } > > +/* Return true if the target supports a strided load/store function IFN > + with VECTOR_TYPE. If supported and ELSVALS is nonzero the supported else > + values will be added to the vector ELSVALS points to. */ > + > +bool > +internal_strided_fn_supported_p (internal_fn ifn, tree vector_type, > + vec<int> *elsvals) > +{
can we add an assert here as to what 'ifn' we expect here? MASK_LEN_STRIDED_LOAD and MASK_LEN_STRIDED_STORE I guess? > + machine_mode mode = TYPE_MODE (vector_type); > + optab optab = direct_internal_fn_optab (ifn); > + insn_code icode = direct_optab_handler (optab, mode); > + > + bool ok = icode != CODE_FOR_nothing; > + > + if (ok && elsvals) > + get_supported_else_vals > + (icode, internal_fn_else_index (IFN_MASK_LEN_STRIDED_LOAD), *elsvals); shouldn't you use 'ifn' here instead of IFN_MASK_LEN_STRIDED_LOAD? > + > + return ok; > +} > + > /* Return true if the target supports IFN_CHECK_{RAW,WAR}_PTRS function IFN > for pointers of type TYPE when the accesses have LENGTH bytes and their > common byte alignment is ALIGN. */ > diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h > index afd4f8e64c7..7d386246a42 100644 > --- a/gcc/internal-fn.h > +++ b/gcc/internal-fn.h > @@ -242,6 +242,8 @@ extern int internal_fn_stored_value_index (internal_fn); > extern bool internal_gather_scatter_fn_supported_p (internal_fn, tree, > tree, tree, int, > vec<int> * = nullptr); > +extern bool internal_strided_fn_supported_p (internal_fn ifn, tree > vector_type, > + vec<int> *elsvals); > extern bool internal_check_ptrs_fn_supported_p (internal_fn, tree, > poly_uint64, unsigned int); > > diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr118019-2.c > b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr118019-2.c > new file mode 100644 > index 00000000000..9918d4d7f52 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr118019-2.c > @@ -0,0 +1,51 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O3 -march=rv64gcv_zvl512b -mabi=lp64d > -mno-vector-strict-align" } */ > + > +/* Ensure we use strided loads. */ > + > +typedef unsigned char uint8_t; > +typedef unsigned short uint16_t; > +typedef unsigned int uint32_t; > + > +#define HADAMARD4(d0, d1, d2, d3, s0, s1, s2, s3) > \ > + { > \ > + int t0 = s0 + s1; > \ > + int t1 = s0 - s1; > \ > + int t2 = s2 + s3; > \ > + int t3 = s2 - s3; > \ > + d0 = t0 + t2; > \ > + d2 = t0 - t2; > \ > + d1 = t1 + t3; > \ > + d3 = t1 - t3; > \ > + } > + > +uint32_t > +abs2 (uint32_t a) > +{ > + uint32_t s = ((a >> 15) & 0x10001) * 0xffff; > + return (a + s) ^ s; > +} > + > +int > +x264_pixel_satd_8x4 (uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2) > +{ > + uint32_t tmp[4][4]; > + uint32_t a0, a1, a2, a3; > + int sum = 0; > + for (int i = 0; i < 4; i++, pix1 += i_pix1, pix2 += i_pix2) > + { > + a0 = (pix1[0] - pix2[0]) + ((pix1[4] - pix2[4]) << 16); > + a1 = (pix1[1] - pix2[1]) + ((pix1[5] - pix2[5]) << 16); > + a2 = (pix1[2] - pix2[2]) + ((pix1[6] - pix2[6]) << 16); > + a3 = (pix1[3] - pix2[3]) + ((pix1[7] - pix2[7]) << 16); > + HADAMARD4 (tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], a0, a1, a2, a3); > + } > + for (int i = 0; i < 4; i++) > + { > + HADAMARD4 (a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i]); > + sum += abs2 (a0) + abs2 (a1) + abs2 (a2) + abs2 (a3); > + } > + return (((uint16_t) sum) + ((uint32_t) sum >> 16)) >> 1; > +} > + > +/* { dg-final { scan-assembler-times "vlse64" 8 } } */ > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc > index 3710694ac75..090169d9cb5 100644 > --- a/gcc/tree-vect-stmts.cc > +++ b/gcc/tree-vect-stmts.cc > @@ -10165,6 +10165,46 @@ hoist_defs_of_uses (gimple *stmt, class loop *loop, > bool hoist_p) > return true; > } > > +/* Check if there is a vector type with the same number of elements as > + VECTYPE but NELTS units. 
It looks like the function looks for a same element typed vector type with a _different_ number of elements, NELTS number. ? If so, set VTYPE to the appropriate type and > + return true if the target supports a strided load/store for the respective > + mode. Return false otherwise. */ So it combines two things. I think get_related_vectype_for_scalar_type (TYPE_MODE (vector_type), TREE_TYPE (vectype), nelts) does exactly this first part? Alternatively you can use get_vectype_for_scalar_type (vinfo, TREE_TYPE (vectype), nelts. I'd hate adding a third variant? > +static bool > +vect_supportable_strided_type (vec_load_store_type vls_type, > + tree vectype, int nelts, > + tree *vtype, > + vec<int> *elsvals) > +{ > + gcc_assert (VECTOR_TYPE_P (vectype)); > + gcc_assert (TYPE_VECTOR_SUBPARTS (vectype).is_constant ()); why this? > + > + machine_mode vmode = TYPE_MODE (vectype); > + if (!VECTOR_MODE_P (vmode)) > + return NULL_TREE; > + > + internal_fn ifn = vls_type == VLS_LOAD > + ? IFN_MASK_LEN_STRIDED_LOAD > + : IFN_MASK_LEN_STRIDED_STORE; > + > + poly_uint64 vbsize = GET_MODE_BITSIZE (vmode); > + unsigned int pbsize; > + if (constant_multiple_p (vbsize, nelts, &pbsize)) > + { > + scalar_mode elmode = SCALAR_TYPE_MODE (TREE_TYPE (vectype)); > + machine_mode rmode; > + if (int_mode_for_size (pbsize, 0).exists (&elmode) > + && related_vector_mode (vmode, elmode, nelts).exists (&rmode)) > + { > + tree ptype = build_nonstandard_integer_type (pbsize, 1); > + *vtype = build_vector_type (ptype, nelts); > + if (internal_strided_fn_supported_p (ifn, *vtype, elsvals)) > + return true; > + } > + } > + > + return false; > +} > + > /* vectorizable_load. > > Check if STMT_INFO reads a non scalar data-ref (array/pointer/structure) > @@ -10669,6 +10709,8 @@ vectorizable_load (vec_info *vinfo, > tree running_off; > vec<constructor_elt, va_gc> *v = NULL; > tree stride_base, stride_step, alias_off; > + bool strided_load_ok_p = false; > + tree stride_step_signed = NULL_TREE; > /* Checked by get_load_store_type. */ > unsigned int const_nunits = nunits.to_constant (); > unsigned HOST_WIDE_INT cst_offset = 0; > @@ -10744,15 +10786,34 @@ vectorizable_load (vec_info *vinfo, > stride_step = cse_and_gimplify_to_preheader (loop_vinfo, > stride_step); > } > > + tree stride_step_full = NULL_TREE; > + auto_vec<tree> dr_chain; > + > + /* For SLP permutation support we need to load the whole group, > + not only the number of vector stmts the permutation result > + fits in. */ > + if (slp_perm) > + { > + /* We don't yet generate SLP_TREE_LOAD_PERMUTATIONs for > + variable VF. */ > + unsigned int const_vf = vf.to_constant (); > + ncopies = CEIL (group_size * const_vf, const_nunits); > + dr_chain.create (ncopies); > + } > + else > + ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); > + > + /* Initialize for VMAT_ELEMENTWISE i.e. when we must load each element > + separately. */ > running_off = offvar; > alias_off = build_int_cst (ref_type, 0); > int nloads = const_nunits; > int lnel = 1; > tree ltype = TREE_TYPE (vectype); > tree lvectype = vectype; > - auto_vec<tree> dr_chain; > if (memory_access_type == VMAT_STRIDED_SLP) > { > + tree strided_vtype; > HOST_WIDE_INT n = gcd (group_size, const_nunits); > /* Use the target vector type if the group size is a multiple > of it. */ > @@ -10781,9 +10842,42 @@ vectorizable_load (vec_info *vinfo, > misalignment = mis_align; > } > } > + /* Instead of loading individual vector elements and > + constructing a larger vector from them we can use > + a strided load directly. > + ??? 
For non-power-of-two groups we could build the > + group from smaller element sizes and permute them > + into place afterwards instead of relying on a more > + rigid vec_init. */ > + else if (n > 1 && n == group_size > + && vect_supportable_strided_type > + (VLS_LOAD, vectype, const_nunits / n, > + &strided_vtype, &elsvals)) So I do wonder how this interacts with vector_vector_composition_type, in fact the difference is that for strided_load we know the composition happens as part of a load, so how about instead extending this function, pass it VLS_LOAD/STORE and also consider strided_loads as composition kind there? This would avoid duplication and I think at least some cases of non-power-of-two groups would be handled this way already (permuting out gaps). > + { > + dr_alignment_support dr_align = dr_aligned; > + int mis_align = 0; > + mis_align = dr_misalignment (first_dr_info, > + strided_vtype); > + dr_align > + = vect_supportable_dr_alignment (vinfo, dr_info, > + strided_vtype, > + mis_align); > + if (dr_align == dr_aligned > + || dr_align == dr_unaligned_supported) > + { > + nloads = 1; > + lnel = const_nunits; > + lvectype = strided_vtype; > + ltype = TREE_TYPE (strided_vtype); > + alignment_support_scheme = dr_align; > + misalignment = mis_align; > + > + strided_load_ok_p = true; > + } > + } > /* Else use the biggest vector we can load the group without > accessing excess elements. */ > - else if (n > 1) > + else if (!strided_load_ok_p && n > 1) > { > tree ptype; > tree vtype > @@ -10829,19 +10923,35 @@ vectorizable_load (vec_info *vinfo, > ltype = build_aligned_type (ltype, align * BITS_PER_UNIT); > } > > - /* For SLP permutation support we need to load the whole group, > - not only the number of vector stmts the permutation result > - fits in. */ > - if (slp_perm) > + /* We don't use masking here so just use any else value and don't > + perform any zeroing. */ > + tree vec_els = NULL_TREE; > + if (strided_load_ok_p && !costing_p) > { > - /* We don't yet generate SLP_TREE_LOAD_PERMUTATIONs for > - variable VF. 
*/ > - unsigned int const_vf = vf.to_constant (); > - ncopies = CEIL (group_size * const_vf, const_nunits); > - dr_chain.create (ncopies); > + gcc_assert (elsvals.length ()); > + maskload_elsval = *elsvals.begin (); > + vec_els = vect_get_mask_load_else (maskload_elsval, lvectype); > + > + stride_step_full > + = fold_build2 (MULT_EXPR, TREE_TYPE (stride_step), > + stride_step, > + build_int_cst (TREE_TYPE (stride_step), > + TYPE_VECTOR_SUBPARTS (lvectype))); > + stride_step_full > + = cse_and_gimplify_to_preheader (loop_vinfo, stride_step_full); > + > + tree cst_off = build_int_cst (ref_type, cst_offset); > + dataref_ptr > + = vect_create_data_ref_ptr (vinfo, first_stmt_info, lvectype, > + loop, cst_off, &dummy, gsi, &ptr_incr, > + false, stride_step_full); > + > + stride_step_signed > + = fold_build1 (NOP_EXPR, signed_type_for (TREE_TYPE > (stride_step)), > + stride_step); > + stride_step_signed > + = cse_and_gimplify_to_preheader (loop_vinfo, stride_step_signed); > } > - else > - ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); > > unsigned int group_el = 0; > unsigned HOST_WIDE_INT > @@ -10852,37 +10962,65 @@ vectorizable_load (vec_info *vinfo, > unsigned int n_adjacent_loads = 0; > for (j = 0; j < ncopies; j++) > { > - if (nloads > 1 && !costing_p) > + if (nloads > 1 && !strided_load_ok_p && !costing_p) > vec_alloc (v, nloads); > gimple *new_stmt = NULL; > for (i = 0; i < nloads; i++) > { > if (costing_p) > { > + /* TODO: The vectype in stmt_info/slp_node is potentially > + wrong as we could be using a much smaller vectype > + as determined by vector_vector_composition_type. */ > + if (strided_load_ok_p) > + inside_cost += record_stmt_cost (cost_vec, 1, > + vector_gather_load, > + slp_node, 0, > + vect_body); > /* For VMAT_ELEMENTWISE, just cost it as scalar_load to > avoid ICE, see PR110776. 
*/ > - if (VECTOR_TYPE_P (ltype) > - && memory_access_type != VMAT_ELEMENTWISE) > + else if (VECTOR_TYPE_P (ltype) > + && memory_access_type != VMAT_ELEMENTWISE) > n_adjacent_loads++; > else > inside_cost += record_stmt_cost (cost_vec, 1, scalar_load, > slp_node, 0, vect_body); > continue; > } > - tree this_off = build_int_cst (TREE_TYPE (alias_off), > - group_el * elsz + cst_offset); > - tree data_ref = build2 (MEM_REF, ltype, running_off, this_off); > - vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr)); > - new_temp = make_ssa_name (ltype); > - new_stmt = gimple_build_assign (new_temp, data_ref); > + > + if (!strided_load_ok_p) > + { > + > + tree this_off = build_int_cst (TREE_TYPE (alias_off), > + group_el * elsz + > cst_offset); > + tree data_ref = build2 (MEM_REF, ltype, running_off, > + this_off); > + vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr)); > + new_temp = make_ssa_name (ltype); > + new_stmt = gimple_build_assign (new_temp, data_ref); > + if (nloads > 1) > + CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_temp); > + } > + else > + { > + mask_vectype = truth_type_for (lvectype); > + tree final_mask = build_minus_one_cst (mask_vectype); > + tree bias = build_int_cst (intQI_type_node, 0); > + tree len = size_int (TYPE_VECTOR_SUBPARTS (lvectype)); > + tree zero = build_zero_cst (lvectype); > + new_stmt > + = gimple_build_call_internal > + (IFN_MASK_LEN_STRIDED_LOAD, 7, dataref_ptr, > + stride_step_signed, zero, final_mask, vec_els, len, > bias); > + new_temp = make_ssa_name (lvectype); > + gimple_set_lhs (new_stmt, new_temp); > + } > vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi); > - if (nloads > 1) > - CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_temp); > > group_el += lnel; > - if (group_el == group_size) > + if (group_el >= group_size) > { > - n_groups++; > + n_groups += (group_el / group_size); > /* When doing SLP make sure to not load elements from > the next vector iteration, those will not be accessed > so just use the last element again. See PR107451. */ > @@ -10894,6 +11032,10 @@ vectorizable_load (vec_info *vinfo, > running_off, stride_step); > vect_finish_stmt_generation (vinfo, stmt_info, incr, > gsi); > running_off = newoff; > + if (strided_load_ok_p && !costing_p) > + dataref_ptr > + = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, > gsi, > + stmt_info, stride_step_full); > } > group_el = 0; > } > @@ -10935,7 +11077,7 @@ vectorizable_load (vec_info *vinfo, > if (!costing_p) > { > if (slp_perm) > - dr_chain.quick_push (gimple_assign_lhs (new_stmt)); > + dr_chain.quick_push (gimple_get_lhs (new_stmt)); > else > slp_node->push_vec_def (new_stmt); > } > -- > 2.49.0 >
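
To make the first two comments above concrete, here's an untested sketch of
what I had in mind for internal_fn.cc (only the assert and the use of 'ifn'
for the else-value query differ from your version; I haven't double-checked
whether the else query additionally wants a load-only guard for the store
case):

bool
internal_strided_fn_supported_p (internal_fn ifn, tree vector_type,
                                 vec<int> *elsvals)
{
  /* Only the strided load/store internal functions are expected here.  */
  gcc_assert (ifn == IFN_MASK_LEN_STRIDED_LOAD
              || ifn == IFN_MASK_LEN_STRIDED_STORE);

  machine_mode mode = TYPE_MODE (vector_type);
  optab optab = direct_internal_fn_optab (ifn);
  insn_code icode = direct_optab_handler (optab, mode);

  bool ok = icode != CODE_FOR_nothing;

  /* Query the supported else values for IFN instead of hard-coding
     IFN_MASK_LEN_STRIDED_LOAD.  */
  if (ok && elsvals)
    get_supported_else_vals (icode, internal_fn_else_index (ifn), *elsvals);

  return ok;
}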