vect_analyze_slp_instance currently only creates an slp_instance if _all_ stores in a group fitted the same pattern. This patch splits non-matching groups up on vector boundaries, allowing only part of the group to be SLP'd, or multiple subgroups to be SLP'd differently.
The algorithm could be made more efficient: we have more info available in the matches vector, and a single match in a vector full of non-matches, means we will be unable to SLP). But the double-recursion case has at most log(N) depth and the single-recursion case is at worst N^2 in *the group width*, which is generally small. This could possibly be extended to hybrid SLP, but I believe that would also require splitting the load groups, as at present removing the bb_vinfo check ends up with data being stored to the wrong locations e.g. in slp-11a.c. Hence, leaving that extension to a future patch. Bootstrapped + check-{gcc,g++,fortran} on aarch64, x86_64 and arm (v7-a neon). Thanks, Alan gcc/ChangeLog: * tree-vect-slp.c (vect_split_slp_group): New. (vect_analyze_slp_instance): Recurse on subgroup(s) if vect_build_slp_tree fails during basic block SLP. gcc/testsuite/ChangeLog: * gcc.dg/vect/bb-slp-7.c: Make that SLP group even more non-isomorphic. * gcc.dg/vect/bb-slp-subgroups-1.c: New. * gcc.dg/vect/bb-slp-subgroups-2.c: New. --- gcc/testsuite/gcc.dg/vect/bb-slp-7.c | 8 ++-- gcc/testsuite/gcc.dg/vect/bb-slp-subgroups-1.c | 44 ++++++++++++++++++ gcc/testsuite/gcc.dg/vect/bb-slp-subgroups-2.c | 42 +++++++++++++++++ gcc/tree-vect-slp.c | 63 +++++++++++++++++++++++++- 4 files changed, 152 insertions(+), 5 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/vect/bb-slp-subgroups-1.c create mode 100644 gcc/testsuite/gcc.dg/vect/bb-slp-subgroups-2.c diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-7.c b/gcc/testsuite/gcc.dg/vect/bb-slp-7.c index ab54a48..b012d78 100644 --- a/gcc/testsuite/gcc.dg/vect/bb-slp-7.c +++ b/gcc/testsuite/gcc.dg/vect/bb-slp-7.c @@ -16,12 +16,12 @@ main1 (unsigned int x, unsigned int y) unsigned int *pout = &out[0]; unsigned int a0, a1, a2, a3; - /* Non isomorphic. */ + /* Non isomorphic, even 64-bit subgroups. */ a0 = *pin++ + 23; - a1 = *pin++ + 142; + a1 = *pin++ * 142; a2 = *pin++ + 2; a3 = *pin++ * 31; - + *pout++ = a0 * x; *pout++ = a1 * y; *pout++ = a2 * x; @@ -47,4 +47,4 @@ int main (void) } /* { dg-final { scan-tree-dump-times "basic block vectorized" 0 "slp2" } } */ - + diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-subgroups-1.c b/gcc/testsuite/gcc.dg/vect/bb-slp-subgroups-1.c new file mode 100644 index 0000000..39c23c3 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/bb-slp-subgroups-1.c @@ -0,0 +1,44 @@ +/* { dg-require-effective-target vect_int } */ +/* PR tree-optimization/67682. */ + +#include "tree-vect.h" + +int __attribute__((__aligned__(8))) a[8]; +int __attribute__((__aligned__(8))) b[4]; + +__attribute__ ((noinline)) void +test () +{ + a[0] = b[0]; + a[1] = b[1]; + a[2] = b[2]; + a[3] = b[3]; + a[4] = 0; + a[5] = 0; + a[6] = 0; + a[7] = 0; +} + +int +main (int argc, char **argv) +{ + check_vect (); + + for (int i = 0; i < 8; i++) + a[i] = 1; + for (int i = 0; i < 4; i++) + b[i] = i + 4; + __asm__ volatile ("" : : : "memory"); + test (a, b); + __asm__ volatile ("" : : : "memory"); + for (int i = 0; i < 4; i++) + if (a[i] != i+4) + abort (); + for (int i = 4; i < 8; i++) + if (a[i] != 0) + abort (); + return 0; +} + +/* { dg-final { scan-tree-dump-times "Basic block will be vectorized using SLP" 1 "slp2" } } */ +/* { dg-final { scan-tree-dump-times "basic block vectorized" 1 "slp2" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-subgroups-2.c b/gcc/testsuite/gcc.dg/vect/bb-slp-subgroups-2.c new file mode 100644 index 0000000..06099fd --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/bb-slp-subgroups-2.c @@ -0,0 +1,42 @@ +/* { dg-require-effective-target vect_int } */ +/* PR tree-optimization/67682. */ + +#include "tree-vect.h" + +int __attribute__((__aligned__(8))) a[8]; +int __attribute__((__aligned__(8))) b[8]; + +__attribute__ ((noinline)) void +test () +{ + a[0] = b[0]; + a[1] = b[1] + 1; + a[2] = b[2] * 2; + a[3] = b[3] + 3; + + a[4] = b[4] + 4; + a[5] = b[5] + 4; + a[6] = b[6] + 4; + a[7] = b[7] + 4; +} + +int +main (int argc, char **argv) +{ + check_vect (); + + for (int i = 0; i < 8; i++) + b[i] = i + 1; + __asm__ volatile ("" : : : "memory"); + test (a, b); + __asm__ volatile ("" : : : "memory"); + if ((a[0] != 1) || (a[1] != 3) || (a[2] != 6) || (a[3] != 7)) + abort (); + for (int i = 4; i < 8; i++) + if (a[i] != i + 5) + abort (); + return 0; +} + +/* { dg-final { scan-tree-dump-times "Basic block will be vectorized using SLP" 1 "slp2" } } */ +/* { dg-final { scan-tree-dump-times "basic block vectorized" 1 "slp2" } } */ diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c index 7a2d623..fb8d11e 100644 --- a/gcc/tree-vect-slp.c +++ b/gcc/tree-vect-slp.c @@ -1627,6 +1627,33 @@ vect_analyze_slp_cost (slp_instance instance, void *data) body_cost_vec.release (); } +/* Splits a group of statements currently beginning at FIRST_STMT into two + groups, one (still beginning at FIRST_STMT) containing the first I stmts, + the second containing the remainder. Return the first stmt in the second + group. */ + +static gimple * +vect_split_slp_group (gimple *first_stmt, unsigned int i) +{ + gcc_assert (GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt)) == first_stmt); + gcc_assert (i > 0); + int group2_size = GROUP_SIZE (vinfo_for_stmt (first_stmt)) - i; + gcc_assert (group2_size > 0); + GROUP_SIZE (vinfo_for_stmt (first_stmt)) = i; + + gimple *stmt = first_stmt; + while (i-- > 1) + stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt)); + /* STMT is now the last element of the first group. */ + gimple *first = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt)); + GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt)) = 0; + + GROUP_SIZE (vinfo_for_stmt (first)) = group2_size; + for (stmt = first; stmt; stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt))) + GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = first; + return first; +} + /* Analyze an SLP instance starting from a group of grouped stores. Call vect_build_slp_tree to build a tree of packed stmts if possible. Return FALSE if it's impossible to SLP any stmt in the loop. */ @@ -1642,7 +1669,7 @@ vect_analyze_slp_instance (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo, tree vectype, scalar_type = NULL_TREE; gimple *next; unsigned int vectorization_factor = 0; - int i; + unsigned int i; unsigned int max_nunits = 0; vec<slp_tree> loads; struct data_reference *dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)); @@ -1836,6 +1863,40 @@ vect_analyze_slp_instance (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo, vect_free_slp_tree (node); loads.release (); + /* For basic block vectorization, try to break the group up into multiples + of the vectorization factor. */ + if (bb_vinfo && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))) + { + /* We consider breaking the group only on VF boundaries from the existing + start. */ + for (i = 0; i < group_size; i++) + if (!matches[i]) break; + + if (i < vectorization_factor && vectorization_factor < group_size) + { + /* First vector is a mix of (non-/)matches, or first element was + impossible for another reason. Retry without the first vector. */ + stmt = vect_split_slp_group (stmt, vectorization_factor); + return vect_analyze_slp_instance (loop_vinfo, bb_vinfo, + stmt, max_tree_size); + } + else if (i >= vectorization_factor && i < group_size) + { + /* Split into two groups at the first vector boundary before i. */ + gcc_assert ((vectorization_factor & (vectorization_factor - 1)) == 0); + i &= ~(vectorization_factor - 1); + gimple *group2start = vect_split_slp_group (stmt, i); + return vect_analyze_slp_instance (loop_vinfo, bb_vinfo, + stmt, max_tree_size) + | vect_analyze_slp_instance (loop_vinfo, bb_vinfo, + group2start, max_tree_size); + } + } + else + { + /* TODO: handle reduction case? */ + } + return false; } -- 1.9.1