On Fri, Oct 23, 2015 at 5:20 PM, Alan Lawrence <alan.lawre...@arm.com> wrote: > vect_analyze_slp_instance currently only creates an slp_instance if _all_ > stores > in a group fitted the same pattern. This patch splits non-matching groups up > on vector boundaries, allowing only part of the group to be SLP'd, or multiple > subgroups to be SLP'd differently. > > The algorithm could be made more efficient: we have more info available in > the matches vector, and a single match in a vector full of non-matches, means > we will be unable to SLP). But the double-recursion case has at most log(N) > depth and the single-recursion case is at worst N^2 in *the group width*, > which > is generally small. > > This could possibly be extended to hybrid SLP, but I believe that would also > require splitting the load groups, as at present removing the bb_vinfo check > ends up with data being stored to the wrong locations e.g. in slp-11a.c. > Hence, > leaving that extension to a future patch. > > Bootstrapped + check-{gcc,g++,fortran} on aarch64, x86_64 and arm (v7-a neon). > > Thanks, Alan > > gcc/ChangeLog: > > * tree-vect-slp.c (vect_split_slp_group): New. > (vect_analyze_slp_instance): Recurse on subgroup(s) if > vect_build_slp_tree fails during basic block SLP. > > gcc/testsuite/ChangeLog: > > * gcc.dg/vect/bb-slp-7.c: Make that SLP group even more > non-isomorphic. > * gcc.dg/vect/bb-slp-subgroups-1.c: New. > * gcc.dg/vect/bb-slp-subgroups-2.c: New. > --- > gcc/testsuite/gcc.dg/vect/bb-slp-7.c | 8 ++-- > gcc/testsuite/gcc.dg/vect/bb-slp-subgroups-1.c | 44 ++++++++++++++++++ > gcc/testsuite/gcc.dg/vect/bb-slp-subgroups-2.c | 42 +++++++++++++++++ > gcc/tree-vect-slp.c | 63 > +++++++++++++++++++++++++- > 4 files changed, 152 insertions(+), 5 deletions(-) > create mode 100644 gcc/testsuite/gcc.dg/vect/bb-slp-subgroups-1.c > create mode 100644 gcc/testsuite/gcc.dg/vect/bb-slp-subgroups-2.c > > diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-7.c > b/gcc/testsuite/gcc.dg/vect/bb-slp-7.c > index ab54a48..b012d78 100644 > --- a/gcc/testsuite/gcc.dg/vect/bb-slp-7.c > +++ b/gcc/testsuite/gcc.dg/vect/bb-slp-7.c > @@ -16,12 +16,12 @@ main1 (unsigned int x, unsigned int y) > unsigned int *pout = &out[0]; > unsigned int a0, a1, a2, a3; > > - /* Non isomorphic. */ > + /* Non isomorphic, even 64-bit subgroups. */ > a0 = *pin++ + 23; > - a1 = *pin++ + 142; > + a1 = *pin++ * 142; > a2 = *pin++ + 2; > a3 = *pin++ * 31; > - > + > *pout++ = a0 * x; > *pout++ = a1 * y; > *pout++ = a2 * x; > @@ -47,4 +47,4 @@ int main (void) > } > > /* { dg-final { scan-tree-dump-times "basic block vectorized" 0 "slp2" } } */ > - > + > diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-subgroups-1.c > b/gcc/testsuite/gcc.dg/vect/bb-slp-subgroups-1.c > new file mode 100644 > index 0000000..39c23c3 > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/vect/bb-slp-subgroups-1.c > @@ -0,0 +1,44 @@ > +/* { dg-require-effective-target vect_int } */ > +/* PR tree-optimization/67682. */ > + > +#include "tree-vect.h" > + > +int __attribute__((__aligned__(8))) a[8]; > +int __attribute__((__aligned__(8))) b[4]; > + > +__attribute__ ((noinline)) void > +test () > +{ > + a[0] = b[0]; > + a[1] = b[1]; > + a[2] = b[2]; > + a[3] = b[3]; > + a[4] = 0; > + a[5] = 0; > + a[6] = 0; > + a[7] = 0; > +} > + > +int > +main (int argc, char **argv) > +{ > + check_vect (); > + > + for (int i = 0; i < 8; i++) > + a[i] = 1; > + for (int i = 0; i < 4; i++) > + b[i] = i + 4; > + __asm__ volatile ("" : : : "memory"); > + test (a, b); > + __asm__ volatile ("" : : : "memory"); > + for (int i = 0; i < 4; i++) > + if (a[i] != i+4) > + abort (); > + for (int i = 4; i < 8; i++) > + if (a[i] != 0) > + abort (); > + return 0; > +} > + > +/* { dg-final { scan-tree-dump-times "Basic block will be vectorized using > SLP" 1 "slp2" } } */ > +/* { dg-final { scan-tree-dump-times "basic block vectorized" 1 "slp2" } } */ > diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-subgroups-2.c > b/gcc/testsuite/gcc.dg/vect/bb-slp-subgroups-2.c > new file mode 100644 > index 0000000..06099fd > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/vect/bb-slp-subgroups-2.c > @@ -0,0 +1,42 @@ > +/* { dg-require-effective-target vect_int } */ > +/* PR tree-optimization/67682. */ > + > +#include "tree-vect.h" > + > +int __attribute__((__aligned__(8))) a[8]; > +int __attribute__((__aligned__(8))) b[8]; > + > +__attribute__ ((noinline)) void > +test () > +{ > + a[0] = b[0]; > + a[1] = b[1] + 1; > + a[2] = b[2] * 2; > + a[3] = b[3] + 3; > + > + a[4] = b[4] + 4; > + a[5] = b[5] + 4; > + a[6] = b[6] + 4; > + a[7] = b[7] + 4; > +} > + > +int > +main (int argc, char **argv) > +{ > + check_vect (); > + > + for (int i = 0; i < 8; i++) > + b[i] = i + 1; > + __asm__ volatile ("" : : : "memory"); > + test (a, b); > + __asm__ volatile ("" : : : "memory"); > + if ((a[0] != 1) || (a[1] != 3) || (a[2] != 6) || (a[3] != 7)) > + abort (); > + for (int i = 4; i < 8; i++) > + if (a[i] != i + 5) > + abort (); > + return 0; > +} > + > +/* { dg-final { scan-tree-dump-times "Basic block will be vectorized using > SLP" 1 "slp2" } } */ > +/* { dg-final { scan-tree-dump-times "basic block vectorized" 1 "slp2" } } */ > diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c > index 7a2d623..fb8d11e 100644 > --- a/gcc/tree-vect-slp.c > +++ b/gcc/tree-vect-slp.c > @@ -1627,6 +1627,33 @@ vect_analyze_slp_cost (slp_instance instance, void > *data) > body_cost_vec.release (); > } > > +/* Splits a group of statements currently beginning at FIRST_STMT into two > + groups, one (still beginning at FIRST_STMT) containing the first I stmts, > + the second containing the remainder. Return the first stmt in the second > + group. */ > + > +static gimple * > +vect_split_slp_group (gimple *first_stmt, unsigned int i) > +{ > + gcc_assert (GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt)) == > first_stmt); > + gcc_assert (i > 0); > + int group2_size = GROUP_SIZE (vinfo_for_stmt (first_stmt)) - i; > + gcc_assert (group2_size > 0); > + GROUP_SIZE (vinfo_for_stmt (first_stmt)) = i; > + > + gimple *stmt = first_stmt; > + while (i-- > 1) > + stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt)); > + /* STMT is now the last element of the first group. */ > + gimple *first = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt)); > + GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt)) = 0; > + > + GROUP_SIZE (vinfo_for_stmt (first)) = group2_size; > + for (stmt = first; stmt; stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt))) > + GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = first;
apart from the fact that you'll post a new version you need to adjust GROUP_GAP. You also seem to somewhat "confuse" "first I stmts" and "a group of size I", those are not the same when the group has haps. I'd say "a group of size i" makes the most sense here thus I suggest to adjust the function comment accordingly. GROUP_GAP is the gap from the previous group element. The goup first elements GROUP_GAP is the gap at the _end_ of the whole group. Thus for the 2nd group the first stmts GROUP_GAP needs to be GROUP_GAP of the 1st stmt of the original group. For the new 1st group GROUP_GAP of the 1st stmt should be the old GROUP_GAP of the 1st element of the new 2nd group. And of course the counting of 'i' needs to properly factor in GROUP_GAP as well. This is probably what broke the loop-vectorization side but I'm quite sure I added some intermediate-gap BB vectorization support this year as well. Like ... = a[0]; ... = a[2]; ... = a[2]; ... = a[3]; just no gaps at the boundaries of the group (but after being split they might be on a boundary). Richard. > + return first; > +} > + > /* Analyze an SLP instance starting from a group of grouped stores. Call > vect_build_slp_tree to build a tree of packed stmts if possible. > Return FALSE if it's impossible to SLP any stmt in the loop. */ > @@ -1642,7 +1669,7 @@ vect_analyze_slp_instance (loop_vec_info loop_vinfo, > bb_vec_info bb_vinfo, > tree vectype, scalar_type = NULL_TREE; > gimple *next; > unsigned int vectorization_factor = 0; > - int i; > + unsigned int i; > unsigned int max_nunits = 0; > vec<slp_tree> loads; > struct data_reference *dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)); > @@ -1836,6 +1863,40 @@ vect_analyze_slp_instance (loop_vec_info loop_vinfo, > bb_vec_info bb_vinfo, > vect_free_slp_tree (node); > loads.release (); > > + /* For basic block vectorization, try to break the group up into multiples > + of the vectorization factor. */ > + if (bb_vinfo && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))) > + { > + /* We consider breaking the group only on VF boundaries from the > existing > + start. */ > + for (i = 0; i < group_size; i++) > + if (!matches[i]) break; > + > + if (i < vectorization_factor && vectorization_factor < group_size) > + { > + /* First vector is a mix of (non-/)matches, or first element was > + impossible for another reason. Retry without the first vector. > */ > + stmt = vect_split_slp_group (stmt, vectorization_factor); > + return vect_analyze_slp_instance (loop_vinfo, bb_vinfo, > + stmt, max_tree_size); > + } > + else if (i >= vectorization_factor && i < group_size) > + { > + /* Split into two groups at the first vector boundary before i. */ > + gcc_assert ((vectorization_factor & (vectorization_factor - 1)) == > 0); > + i &= ~(vectorization_factor - 1); > + gimple *group2start = vect_split_slp_group (stmt, i); > + return vect_analyze_slp_instance (loop_vinfo, bb_vinfo, > + stmt, max_tree_size) > + | vect_analyze_slp_instance (loop_vinfo, bb_vinfo, > + group2start, max_tree_size); > + } > + } > + else > + { > + /* TODO: handle reduction case? */ > + } > + > return false; > } > > -- > 1.9.1 >