Richard Biener <rguent...@suse.de> writes:
> This avoids falling back to elementwise accesses for strided SLP
> loads when the group size is not a multiple of the vector element
> size.  Instead we can use a smaller vector or integer type for the load.
>
> For stores we can do the same though restrictions on stores we handle
> and the fact that store-merging covers up makes this mostly effective
> for cost modeling which shows for gcc.target/i386/vect-strided-3.c
> which we now vectorize with V4SI vectors rather than just V2SI ones.
>
> For all of this there's still the opportunity to use non-uniform
> accesses, say for a 6-element group with a VF of two do
> V4SI, { V2SI, V2SI }, V4SI.  But that's for a possible followup.
>
> Bootstrapped and tested on x86_64-unknown-linux-gnu, textually
> this depends on the gap improvement series so I'll push only
> after those.  Target independent testing is difficult, strided
> accesses are difficult for VLA - I suppose they should go
> through gather/scatter but we have to be able to construct the
> offset vector there.
Yeah, agreed.  And I suppose for tests like these, which load
consecutive pairs of 32-bit elements, we'd want to generate a gather
of 64-bit elements.  So there'd be a similar accretion process, but
only if it applies regularly across the whole vector.

Richard

>
> Richard.
>
>	* gcc.target/i386/vect-strided-1.c: New testcase.
>	* gcc.target/i386/vect-strided-2.c: Likewise.
>	* gcc.target/i386/vect-strided-3.c: Likewise.
>	* gcc.target/i386/vect-strided-4.c: Likewise.
> ---
>  .../gcc.target/i386/vect-strided-1.c          |  24 +++++
>  .../gcc.target/i386/vect-strided-2.c          |  17 +++
>  .../gcc.target/i386/vect-strided-3.c          |  20 ++++
>  .../gcc.target/i386/vect-strided-4.c          |  20 ++++
>  gcc/tree-vect-stmts.cc                        | 100 ++++++++----------
>  5 files changed, 127 insertions(+), 54 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/vect-strided-1.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/vect-strided-2.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/vect-strided-3.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/vect-strided-4.c
>
> diff --git a/gcc/testsuite/gcc.target/i386/vect-strided-1.c b/gcc/testsuite/gcc.target/i386/vect-strided-1.c
> new file mode 100644
> index 00000000000..db4a06711f1
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/vect-strided-1.c
> @@ -0,0 +1,24 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -msse2 -mno-avx" } */
> +
> +void foo (int * __restrict a, int *b, int s)
> +{
> +  for (int i = 0; i < 1024; ++i)
> +    {
> +      a[8*i+0] = b[s*i+0];
> +      a[8*i+1] = b[s*i+1];
> +      a[8*i+2] = b[s*i+2];
> +      a[8*i+3] = b[s*i+3];
> +      a[8*i+4] = b[s*i+4];
> +      a[8*i+5] = b[s*i+5];
> +      a[8*i+6] = b[s*i+4];
> +      a[8*i+7] = b[s*i+5];
> +    }
> +}
> +
> +/* Three two-element loads, two four-element stores.  On ia32 we elide
> +   a permute and perform a redundant load.  */
> +/* { dg-final { scan-assembler-times "movq" 2 } } */
> +/* { dg-final { scan-assembler-times "movhps" 2 { target ia32 } } } */
> +/* { dg-final { scan-assembler-times "movhps" 1 { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-times "movups" 2 } } */
> diff --git a/gcc/testsuite/gcc.target/i386/vect-strided-2.c b/gcc/testsuite/gcc.target/i386/vect-strided-2.c
> new file mode 100644
> index 00000000000..6fd64e28cf0
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/vect-strided-2.c
> @@ -0,0 +1,17 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -msse2 -mno-avx" } */
> +
> +void foo (int * __restrict a, int *b, int s)
> +{
> +  for (int i = 0; i < 1024; ++i)
> +    {
> +      a[4*i+0] = b[s*i+0];
> +      a[4*i+1] = b[s*i+1];
> +      a[4*i+2] = b[s*i+0];
> +      a[4*i+3] = b[s*i+1];
> +    }
> +}
> +
> +/* One two-element load, one four-element store.  */
> +/* { dg-final { scan-assembler-times "movq" 1 } } */
> +/* { dg-final { scan-assembler-times "movups" 1 } } */
> diff --git a/gcc/testsuite/gcc.target/i386/vect-strided-3.c b/gcc/testsuite/gcc.target/i386/vect-strided-3.c
> new file mode 100644
> index 00000000000..b462701a0b2
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/vect-strided-3.c
> @@ -0,0 +1,20 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -msse2 -mno-avx -fno-tree-slp-vectorize" } */
> +
> +void foo (int * __restrict a, int *b, int s)
> +{
> +  if (s >= 6)
> +    for (int i = 0; i < 1024; ++i)
> +      {
> +        a[s*i+0] = b[4*i+0];
> +        a[s*i+1] = b[4*i+1];
> +        a[s*i+2] = b[4*i+2];
> +        a[s*i+3] = b[4*i+3];
> +        a[s*i+4] = b[4*i+0];
> +        a[s*i+5] = b[4*i+1];
> +      }
> +}
> +
> +/* While the vectorizer generates 6 uint64 stores.  */
> +/* { dg-final { scan-assembler-times "movq" 4 } } */
> +/* { dg-final { scan-assembler-times "movhps" 2 } } */
> diff --git a/gcc/testsuite/gcc.target/i386/vect-strided-4.c b/gcc/testsuite/gcc.target/i386/vect-strided-4.c
> new file mode 100644
> index 00000000000..dd922926a2a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/vect-strided-4.c
> @@ -0,0 +1,20 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -msse4.2 -mno-avx -fno-tree-slp-vectorize" } */
> +
> +void foo (int * __restrict a, int * __restrict b, int *c, int s)
> +{
> +  if (s >= 2)
> +    for (int i = 0; i < 1024; ++i)
> +      {
> +        a[s*i+0] = c[4*i+0];
> +        a[s*i+1] = c[4*i+1];
> +        b[s*i+0] = c[4*i+2];
> +        b[s*i+1] = c[4*i+3];
> +      }
> +}
> +
> +/* Vectorization factor two, two two-element stores to a using movq
> +   and two two-element stores to b via pextrq/movhps of the high part.  */
> +/* { dg-final { scan-assembler-times "movq" 2 } } */
> +/* { dg-final { scan-assembler-times "pextrq" 2 { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-times "movhps" 2 { target { ia32 } } } } */
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index 8aa41833433..03a5db45976 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -2036,15 +2036,10 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
>        first_dr_info
>          = STMT_VINFO_DR_INFO (SLP_TREE_SCALAR_STMTS (slp_node)[0]);
>        if (STMT_VINFO_STRIDED_P (first_stmt_info))
> -       {
> -         /* Try to use consecutive accesses of DR_GROUP_SIZE elements,
> -            separated by the stride, until we have a complete vector.
> -            Fall back to scalar accesses if that isn't possible.  */
> -         if (multiple_p (nunits, group_size))
> -           *memory_access_type = VMAT_STRIDED_SLP;
> -         else
> -           *memory_access_type = VMAT_ELEMENTWISE;
> -       }
> +       /* Try to use consecutive accesses of as many elements as possible,
> +          separated by the stride, until we have a complete vector.
> +          Fall back to scalar accesses if that isn't possible.  */
> +       *memory_access_type = VMAT_STRIDED_SLP;
>        else
>          {
>            int cmp = compare_step_with_zero (vinfo, stmt_info);
> @@ -8512,12 +8507,29 @@ vectorizable_store (vec_info *vinfo,
>        tree lvectype = vectype;
>        if (slp)
>          {
> -         if (group_size < const_nunits
> -             && const_nunits % group_size == 0)
> +         HOST_WIDE_INT n = gcd (group_size, const_nunits);
> +         if (n == const_nunits)
>            {
> -             nstores = const_nunits / group_size;
> -             lnel = group_size;
> -             ltype = build_vector_type (elem_type, group_size);
> +             int mis_align = dr_misalignment (first_dr_info, vectype);
> +             dr_alignment_support dr_align
> +               = vect_supportable_dr_alignment (vinfo, dr_info, vectype,
> +                                                mis_align);
> +             if (dr_align == dr_aligned
> +                 || dr_align == dr_unaligned_supported)
> +               {
> +                 nstores = 1;
> +                 lnel = const_nunits;
> +                 ltype = vectype;
> +                 lvectype = vectype;
> +                 alignment_support_scheme = dr_align;
> +                 misalignment = mis_align;
> +               }
> +           }
> +         else if (n > 1)
> +           {
> +             nstores = const_nunits / n;
> +             lnel = n;
> +             ltype = build_vector_type (elem_type, n);
>              lvectype = vectype;
>
>              /* First check if vec_extract optab doesn't support extraction
> @@ -8526,7 +8538,7 @@ vectorizable_store (vec_info *vinfo,
>                machine_mode vmode;
>                if (!VECTOR_MODE_P (TYPE_MODE (vectype))
>                    || !related_vector_mode (TYPE_MODE (vectype), elmode,
> -                                           group_size).exists (&vmode)
> +                                           n).exists (&vmode)
>                    || (convert_optab_handler (vec_extract_optab,
>                                               TYPE_MODE (vectype), vmode)
>                        == CODE_FOR_nothing))
> @@ -8537,8 +8549,8 @@ vectorizable_store (vec_info *vinfo,
>                       re-interpreting it as the original vector type if
>                       supported.  */
>                    unsigned lsize
> -                    = group_size * GET_MODE_BITSIZE (elmode);
> -                  unsigned int lnunits = const_nunits / group_size;
> +                    = n * GET_MODE_BITSIZE (elmode);
> +                  unsigned int lnunits = const_nunits / n;
>                    /* If we can't construct such a vector fall back to
>                       element extracts from the original vector type and
>                       element size stores.  */
> @@ -8551,7 +8563,7 @@ vectorizable_store (vec_info *vinfo,
>                        != CODE_FOR_nothing))
>                    {
>                      nstores = lnunits;
> -                    lnel = group_size;
> +                    lnel = n;
>                      ltype = build_nonstandard_integer_type (lsize, 1);
>                      lvectype = build_vector_type (ltype, nstores);
>                    }
> @@ -8562,24 +8574,6 @@ vectorizable_store (vec_info *vinfo,
>                       issue exists here for reasonable archs.  */
>                }
>            }
> -       else if (group_size >= const_nunits
> -                && group_size % const_nunits == 0)
> -         {
> -           int mis_align = dr_misalignment (first_dr_info, vectype);
> -           dr_alignment_support dr_align
> -             = vect_supportable_dr_alignment (vinfo, dr_info, vectype,
> -                                              mis_align);
> -           if (dr_align == dr_aligned
> -               || dr_align == dr_unaligned_supported)
> -             {
> -               nstores = 1;
> -               lnel = const_nunits;
> -               ltype = vectype;
> -               lvectype = vectype;
> -               alignment_support_scheme = dr_align;
> -               misalignment = mis_align;
> -             }
> -         }
>          ltype = build_aligned_type (ltype, TYPE_ALIGN (elem_type));
>          ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
>        }
> @@ -10364,34 +10358,32 @@ vectorizable_load (vec_info *vinfo,
>        auto_vec<tree> dr_chain;
>        if (memory_access_type == VMAT_STRIDED_SLP)
>          {
> -         if (group_size < const_nunits)
> +         HOST_WIDE_INT n = gcd (group_size, const_nunits);
> +         /* Use the target vector type if the group size is a multiple
> +            of it.  */
> +         if (n == const_nunits)
> +           {
> +             nloads = 1;
> +             lnel = const_nunits;
> +             ltype = vectype;
> +           }
> +         /* Else use the biggest vector we can load the group without
> +            accessing excess elements.  */
> +         else if (n > 1)
>            {
> -             /* First check if vec_init optab supports construction from vector
> -                elts directly.  Otherwise avoid emitting a constructor of
> -                vector elements by performing the loads using an integer type
> -                of the same size, constructing a vector of those and then
> -                re-interpreting it as the original vector type.  This avoids a
> -                huge runtime penalty due to the general inability to perform
> -                store forwarding from smaller stores to a larger load.  */
>              tree ptype;
>              tree vtype
> -               = vector_vector_composition_type (vectype,
> -                                                 const_nunits / group_size,
> +               = vector_vector_composition_type (vectype, const_nunits / n,
>                                                  &ptype);
>              if (vtype != NULL_TREE)
>                {
> -                 nloads = const_nunits / group_size;
> -                 lnel = group_size;
> +                 nloads = const_nunits / n;
> +                 lnel = n;
>                  lvectype = vtype;
>                  ltype = ptype;
>                }
>            }
> -         else
> -           {
> -             nloads = 1;
> -             lnel = const_nunits;
> -             ltype = vectype;
> -           }
> +         /* Else fall back to the default element-wise access.  */
>          ltype = build_aligned_type (ltype, TYPE_ALIGN (TREE_TYPE (vectype)));
>        }
>      /* Load vector(1) scalar_type if it's 1 element-wise vectype.  */
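
FWIW, a minimal standalone sketch (not part of the patch; the helper name
piece_size is made up for illustration) of the piece-size choice above:
pieces of gcd (group_size, nunits) elements, so a V4SI access for a
six-element group is assembled from two two-element pieces instead of
falling back to elementwise accesses.

  #include <stdio.h>

  /* Hypothetical helper mirroring the patch's gcd (group_size, const_nunits):
     the largest number of consecutive elements that both divides the vector
     and never steps past the end of a group.  */
  static unsigned
  piece_size (unsigned group_size, unsigned nunits)
  {
    while (nunits)
      {
        unsigned t = group_size % nunits;
        group_size = nunits;
        nunits = t;
      }
    return group_size;
  }

  int
  main (void)
  {
    /* Six-element int group, V4SI vector: gcd (6, 4) == 2, so each vector
       is built from 4 / 2 == 2 two-element pieces.  */
    unsigned group_size = 6, nunits = 4;
    unsigned n = piece_size (group_size, nunits);
    printf ("piece = %u elements, pieces per vector = %u\n", n, nunits / n);
    return 0;
  }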