On Wed, Apr 7, 2021 at 11:10 AM Richard Sandiford <richard.sandif...@arm.com> wrote: > > Richard Biener via Gcc-patches <gcc-patches@gcc.gnu.org> writes: > > On Tue, Apr 6, 2021 at 12:03 PM Richard Sandiford via Gcc-patches > > <gcc-patches@gcc.gnu.org> wrote: > >> > >> As noted in the PR, we were no longer using ST3 for the testcase and > >> instead stored each lane individually. This is because we'd split > >> the store group during SLP and couldn't recover when SLP failed. > >> > >> However, we seem to get better code with ST3 and ST4 even if > >> SLP would have succeeded, such as for vect-complex-5.c. > >> I think the best thing for GCC 11 would therefore be to skip > >> the split entirely if we could use IFN_STORE_LANES. A downside > >> of this is that SLP can handle smaller iteration counts than > >> IFN_STORE_LANES can, but we don't have the infrastructure to > >> choose reliably based on that. > >> > >> Tested on aarch64-linux-gnu (with and without SVE), arm-linux-gnueabihf, > >> armeb-eabi and x86_64-linux-gnu. OK to install? > > > > One of the cases where splitting helps is if you have say V2DFmode > > and a group size of 4 but if you break up the group into sizes of 2 > > then you get two V2DFmode group size 2 SLP subgraphs. So I wonder > > if, since you look for a vector type, want to only disable splitting > > in case the resulting vector type has the same number of lanes > > as the group size? (and if not, instead limit where we consider splitting) > > Hmm, yeah. If we can apply SLP to full vectors within a loop iteration > then I agree it's still better to split. I think the test should favour > ST3/ST4 more though: split only if one of the new group sizes is a > multiple of the vector size. If the vector type has more lanes than the > group size then we should still use ST3/ST4. > > How does this version look? Tested as before.
Looks good. Thanks, Richard. > Thanks, > Richard > > > gcc/ > PR tree-optimization/99873 > * tree-vect-slp.c (vect_slp_prefer_store_lanes_p): New function. > (vect_build_slp_instance): Don't split store groups that could > use IFN_STORE_LANES. > > gcc/testsuite/ > * gcc.dg/vect/slp-21.c: Only expect 2 of the loops to use SLP > if IFN_STORE_LANES is available. > * vect/vect-complex-5.c: Expect no loops to use SLP if > IFN_STORE_LANES is available. > * gcc.target/aarch64/pr99873_1.c: New test. > * gcc.target/aarch64/pr99873_2.c: Likewise. > * gcc.target/aarch64/pr99873_3.c: Likewise. > * gcc.target/aarch64/sve/pr99873_1.c: Likewise. > * gcc.target/aarch64/sve/pr99873_2.c: Likewise. > * gcc.target/aarch64/sve/pr99873_3.c: Likewise. > --- > gcc/testsuite/gcc.dg/vect/slp-21.c | 4 +-- > gcc/testsuite/gcc.dg/vect/vect-complex-5.c | 3 +- > gcc/testsuite/gcc.target/aarch64/pr99873_1.c | 17 ++++++++++ > gcc/testsuite/gcc.target/aarch64/pr99873_2.c | 20 ++++++++++++ > gcc/testsuite/gcc.target/aarch64/pr99873_3.c | 20 ++++++++++++ > .../gcc.target/aarch64/sve/pr99873_1.c | 15 +++++++++ > .../gcc.target/aarch64/sve/pr99873_2.c | 18 +++++++++++ > .../gcc.target/aarch64/sve/pr99873_3.c | 18 +++++++++++ > gcc/tree-vect-slp.c | 31 ++++++++++++++++++- > 9 files changed, 142 insertions(+), 4 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/aarch64/pr99873_1.c > create mode 100644 gcc/testsuite/gcc.target/aarch64/pr99873_2.c > create mode 100644 gcc/testsuite/gcc.target/aarch64/pr99873_3.c > create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/pr99873_1.c > create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/pr99873_2.c > create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/pr99873_3.c > > diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c > index ceec7f5c410..eadbd521966 100644 > --- a/gcc/tree-vect-slp.c > +++ b/gcc/tree-vect-slp.c > @@ -2458,6 +2458,34 @@ vect_match_slp_patterns (slp_instance instance, > vec_info *vinfo, > return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, visited); > } > > +/* STMT_INFO is a store group of size GROUP_SIZE that we are considering > + splitting into two, with the first split group having size NEW_GROUP_SIZE. > + Return true if we could use IFN_STORE_LANES instead and if that appears > + to be the better approach. */ > + > +static bool > +vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info, > + unsigned int group_size, > + unsigned int new_group_size) > +{ > + tree scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info))); > + tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type); > + if (!vectype) > + return false; > + /* Allow the split if one of the two new groups would operate on full > + vectors *within* rather than across one scalar loop iteration. > + This is purely a heuristic, but it should work well for group > + sizes of 3 and 4, where the possible splits are: > + > + 3->2+1: OK if the vector has exactly two elements > + 4->2+2: Likewise > + 4->3+1: Less clear-cut. */ > + if (multiple_p (group_size - new_group_size, TYPE_VECTOR_SUBPARTS > (vectype)) > + || multiple_p (new_group_size, TYPE_VECTOR_SUBPARTS (vectype))) > + return false; > + return vect_store_lanes_supported (vectype, group_size, false); > +} > + > /* Analyze an SLP instance starting from a group of grouped stores. Call > vect_build_slp_tree to build a tree of packed stmts if possible. > Return FALSE if it's impossible to SLP any stmt in the loop. */ > @@ -2693,7 +2721,8 @@ vect_build_slp_instance (vec_info *vinfo, > > /* For loop vectorization split into arbitrary pieces of size > 1. */ > if (is_a <loop_vec_info> (vinfo) > - && (i > 1 && i < group_size)) > + && (i > 1 && i < group_size) > + && !vect_slp_prefer_store_lanes_p (vinfo, stmt_info, group_size, i)) > { > unsigned group1_size = i; > > diff --git a/gcc/testsuite/gcc.dg/vect/slp-21.c > b/gcc/testsuite/gcc.dg/vect/slp-21.c > index bf8f434dd50..85393975b45 100644 > --- a/gcc/testsuite/gcc.dg/vect/slp-21.c > +++ b/gcc/testsuite/gcc.dg/vect/slp-21.c > @@ -210,7 +210,7 @@ int main (void) > > Not all vect_perm targets support that, and it's a bit too specific to > have > its own effective-target selector, so we just test targets directly. */ > -/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 4 "vect" > { target { aarch64*-*-* arm*-*-* powerpc64*-*-* } } } } */ > -/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" > { target { vect_strided4 && { ! { aarch64*-*-* arm*-*-* powerpc64*-*-* } } } > } } } */ > +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 4 "vect" > { target powerpc64*-*-* } } } */ > +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" > { target { vect_strided4 && { ! powerpc64*-*-* } } } } } */ > /* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" > { target { ! { vect_strided4 } } } } } */ > > diff --git a/gcc/testsuite/gcc.dg/vect/vect-complex-5.c > b/gcc/testsuite/gcc.dg/vect/vect-complex-5.c > index 81fdb67ce81..addcf60438c 100644 > --- a/gcc/testsuite/gcc.dg/vect/vect-complex-5.c > +++ b/gcc/testsuite/gcc.dg/vect/vect-complex-5.c > @@ -40,4 +40,5 @@ main (void) > return 0; > } > > -/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" > { xfail { ! vect_hw_misalign } } } } */ > +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" > { target vect_load_lanes } } } */ > +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" > { target { ! vect_load_lanes } xfail { ! vect_hw_misalign } } } } */ > diff --git a/gcc/testsuite/gcc.target/aarch64/pr99873_1.c > b/gcc/testsuite/gcc.target/aarch64/pr99873_1.c > new file mode 100644 > index 00000000000..bc4d81e3ae5 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/pr99873_1.c > @@ -0,0 +1,17 @@ > +/* { dg-options "-O3" } */ > + > +#pragma GCC target "+nosve" > + > +void > +f (int *restrict x, int *restrict y, int *restrict z, int n) > +{ > + for (int i = 0; i < n; i += 3) > + { > + x[i] = y[i] + z[i]; > + x[i + 1] = y[i + 1] - z[i + 1]; > + x[i + 2] = y[i + 2] | z[i + 2]; > + } > +} > + > +/* { dg-final { scan-assembler-times {\tld3\t} 2 } } */ > +/* { dg-final { scan-assembler-times {\tst3\t} 1 } } */ > diff --git a/gcc/testsuite/gcc.target/aarch64/pr99873_2.c > b/gcc/testsuite/gcc.target/aarch64/pr99873_2.c > new file mode 100644 > index 00000000000..b73fbdc0a18 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/pr99873_2.c > @@ -0,0 +1,20 @@ > +/* { dg-options "-O3" } */ > + > +#include <stdint.h> > + > +#pragma GCC target "+nosve" > + > +void __attribute ((noipa)) > +foo (uint64_t *__restrict x, uint64_t *__restrict y, int n) > +{ > + for (int i = 0; i < n; i += 4) > + { > + x[i] += y[i]; > + x[i + 1] += y[i + 1]; > + x[i + 2] |= y[i + 2]; > + x[i + 3] |= y[i + 3]; > + } > +} > + > +/* { dg-final { scan-assembler-not {\tld4\t} } } */ > +/* { dg-final { scan-assembler-not {\tst4\t} } } */ > diff --git a/gcc/testsuite/gcc.target/aarch64/pr99873_3.c > b/gcc/testsuite/gcc.target/aarch64/pr99873_3.c > new file mode 100644 > index 00000000000..ccbab6d74be > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/pr99873_3.c > @@ -0,0 +1,20 @@ > +/* { dg-options "-O3" } */ > + > +#include <stdint.h> > + > +#pragma GCC target "+nosve" > + > +void __attribute ((noipa)) > +foo (uint32_t *__restrict x, uint32_t *__restrict y, int n) > +{ > + for (int i = 0; i < n; i += 4) > + { > + x[i] += y[i]; > + x[i + 1] += y[i + 1]; > + x[i + 2] |= y[i + 2]; > + x[i + 3] |= y[i + 3]; > + } > +} > + > +/* { dg-final { scan-assembler-times {\tld4\t} 2 } } */ > +/* { dg-final { scan-assembler-times {\tst4\t} 1 } } */ > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr99873_1.c > b/gcc/testsuite/gcc.target/aarch64/sve/pr99873_1.c > new file mode 100644 > index 00000000000..f4b95da2afa > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/sve/pr99873_1.c > @@ -0,0 +1,15 @@ > +/* { dg-options "-O3" } */ > + > +void > +f (int *restrict x, int *restrict y, int *restrict z, int n) > +{ > + for (int i = 0; i < n; i += 3) > + { > + x[i] = y[i] + z[i]; > + x[i + 1] = y[i + 1] - z[i + 1]; > + x[i + 2] = y[i + 2] | z[i + 2]; > + } > +} > + > +/* { dg-final { scan-assembler-times {\tld3w\t} 2 } } */ > +/* { dg-final { scan-assembler-times {\tst3w\t} 1 } } */ > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr99873_2.c > b/gcc/testsuite/gcc.target/aarch64/sve/pr99873_2.c > new file mode 100644 > index 00000000000..03dc4ef930d > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/sve/pr99873_2.c > @@ -0,0 +1,18 @@ > +/* { dg-options "-O3" } */ > + > +#include <stdint.h> > + > +void __attribute ((noipa)) > +foo (uint64_t *__restrict x, uint64_t *__restrict y, int n) > +{ > + for (int i = 0; i < n; i += 4) > + { > + x[i] += y[i]; > + x[i + 1] += y[i + 1]; > + x[i + 2] |= y[i + 2]; > + x[i + 3] |= y[i + 3]; > + } > +} > + > +/* { dg-final { scan-assembler-times {\tld4d\t} 2 } } */ > +/* { dg-final { scan-assembler-times {\tst4d\t} 1 } } */ > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr99873_3.c > b/gcc/testsuite/gcc.target/aarch64/sve/pr99873_3.c > new file mode 100644 > index 00000000000..87a0141e508 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/sve/pr99873_3.c > @@ -0,0 +1,18 @@ > +/* { dg-options "-O3" } */ > + > +#include <stdint.h> > + > +void __attribute ((noipa)) > +foo (uint32_t *__restrict x, uint32_t *__restrict y, int n) > +{ > + for (int i = 0; i < n; i += 4) > + { > + x[i] += y[i]; > + x[i + 1] += y[i + 1]; > + x[i + 2] |= y[i + 2]; > + x[i + 3] |= y[i + 3]; > + } > +} > + > +/* { dg-final { scan-assembler-times {\tld4w\t} 2 } } */ > +/* { dg-final { scan-assembler-times {\tst4w\t} 1 } } */