The following fixes missed optimizations caused by the strange way we
split store groups in BB vectorization.  The solution is to split at
the failure boundary instead of re-aligning the split point to the
vector size chosen for the initial piece.  Also re-analyze the matching
rest if it is large enough (at least two lanes).

Bootstrapped / tested on x86_64-unknown-linux-gnu, pushed.

2020-10-28  Richard Biener  <rguent...@suse.de>

        * tree-vect-slp.c (vect_build_slp_instance): Split the store
        group at the failure boundary and also re-analyze a large enough
        matching rest.

        * gcc.dg/vect/bb-slp-68.c: New testcase.
---
 gcc/testsuite/gcc.dg/vect/bb-slp-68.c | 22 ++++++++++++++++++++++
 gcc/tree-vect-slp.c                   | 20 +++++++++++++-------
 2 files changed, 35 insertions(+), 7 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/bb-slp-68.c

diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-68.c b/gcc/testsuite/gcc.dg/vect/bb-slp-68.c
new file mode 100644
index 00000000000..8718031cc71
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/bb-slp-68.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_double } */
+/* { dg-additional-options "-mavx" { target avx } } */
+
+double x[10], y[6], z[4];
+
+void foo ()
+{
+  x[0] = y[0];
+  x[1] = y[1];
+  x[2] = y[2];
+  x[3] = y[3];
+  x[4] = y[4];
+  x[5] = y[5];
+  x[6] = z[0] + 1.;
+  x[7] = z[1] + 1.;
+  x[8] = z[2] + 1.;
+  x[9] = z[3] + 1.;
+}
+
+/* We want to have the store group split into 4, 2, 4 when using 32byte vectors.  */
+/* { dg-final { scan-tree-dump-not "from scalars" "slp2" } } */
diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
index 470b67d76b5..50a2d37eb25 100644
--- a/gcc/tree-vect-slp.c
+++ b/gcc/tree-vect-slp.c
@@ -2412,15 +2412,21 @@ vect_build_slp_instance (vec_info *vinfo,
                                                               group1_size);
              bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
                                                    max_tree_size);
-             /* If the first non-match was in the middle of a vector,
-                skip the rest of that vector.  Do not bother to re-analyze
-                single stmt groups.  */
-             if (group1_size < i)
+             /* Split the rest at the failure point and possibly
+                re-analyze the remaining matching part if it has
+                at least two lanes.  */
+             if (group1_size < i
+                 && (i + 1 < group_size
+                     || i - group1_size > 1))
                {
-                 i = group1_size + const_nunits;
-                 if (i + 1 < group_size)
-                   rest = vect_split_slp_store_group (rest, const_nunits);
+                 stmt_vec_info rest2 = rest;
+                 rest = vect_split_slp_store_group (rest, i - group1_size);
+                 if (i - group1_size > 1)
+                   res |= vect_analyze_slp_instance (vinfo, bst_map,
+                                                     rest2, max_tree_size);
                }
+             /* Re-analyze the non-matching tail if it has at least
+                two lanes.  */
              if (i + 1 < group_size)
                res |= vect_analyze_slp_instance (vinfo, bst_map,
                                                  rest, max_tree_size);
-- 
2.26.2

Reply via email to