The following moves an overly conservative check that we do not access
excess elements when vectorizing a BB to a place where we can do
a better job with respect to the elements we actually use.

This means that for the included testcase we are not confused
by the read from c[4] but just do not vectorize the stores to x[0]
and x[1].

Bootstrap and regtest running on x86_64-unknown-linux-gnu.

Richard.

2016-11-07  Richard Biener  <rguent...@suse.de>

        PR tree-optimization/78205
        * tree-vect-stmts.c (vectorizable_load): Move check whether
        we may run into gaps when BB vectorizing SLP permutations ...
        * tree-vect-slp.c (vect_supported_load_permutation_p): ...
        here where we can do a more precise check.

        * gcc.dg/vect/bb-slp-pr78205.c: New testcase.

Index: gcc/tree-vect-stmts.c
===================================================================
--- gcc/tree-vect-stmts.c       (revision 241893)
+++ gcc/tree-vect-stmts.c       (working copy)
@@ -6548,18 +6611,6 @@ vectorizable_load (gimple *stmt, gimple_
       if (slp && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
        slp_perm = true;
 
-      /* ???  The following is overly pessimistic (as well as the loop
-         case above) in the case we can statically determine the excess
-        elements loaded are within the bounds of a decl that is accessed.
-        Likewise for BB vectorizations using masked loads is a possibility.  */
-      if (bb_vinfo && slp_perm && group_size % nunits != 0)
-       {
-         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                          "BB vectorization with gaps at the end of a load "
-                          "is not supported\n");
-         return false;
-       }
-
       /* Invalidate assumptions made by dependence analysis when vectorization
         on the unrolled body effectively re-orders stmts.  */
       if (!PURE_SLP_STMT (stmt_info)
Index: gcc/tree-vect-slp.c
===================================================================
--- gcc/tree-vect-slp.c (revision 241893)
+++ gcc/tree-vect-slp.c (working copy)
@@ -1459,6 +1459,25 @@ vect_supported_load_permutation_p (slp_i
            SLP_TREE_LOAD_PERMUTATION (node).release ();
          else
            {
+             stmt_vec_info group_info
+               = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
+             group_info = vinfo_for_stmt (GROUP_FIRST_ELEMENT (group_info));
+             unsigned nunits
+               = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (group_info));
+             unsigned k, maxk = 0;
+             FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), j, k)
+               if (k > maxk)
+                 maxk = k;
+             /* In BB vectorization we may not actually use a loaded vector
+                accessing elements in excess of GROUP_SIZE.  */
+             if (maxk >= (GROUP_SIZE (group_info) & ~(nunits - 1)))
+               {
+                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                  "BB vectorization with gaps at the end of "
+                                  "a load is not supported\n");
+                 return false;
+               }
+
              /* Verify the permutation can be generated.  */
              vec<tree> tem;
              unsigned n_perms;
Index: gcc/testsuite/gcc.dg/vect/bb-slp-pr78205.c
===================================================================
--- gcc/testsuite/gcc.dg/vect/bb-slp-pr78205.c  (revision 0)
+++ gcc/testsuite/gcc.dg/vect/bb-slp-pr78205.c  (working copy)
@@ -0,0 +1,25 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_double } */
+/* { dg-additional-options "-fdump-tree-optimized" } */
+
+double x[2], a[4], b[4], c[5];
+
+void foo ()
+{
+  a[0] = c[0];
+  a[1] = c[1];
+  a[2] = c[0];
+  a[3] = c[1];
+  b[0] = c[2];
+  b[1] = c[3];
+  b[2] = c[2];
+  b[3] = c[3];
+  x[0] = c[4];
+  x[1] = c[4];
+}
+
+/* We may not vectorize the store to x[] as it accesses c out of bounds
+   but we do want to vectorize the other two store groups.  */
+
+/* { dg-final { scan-tree-dump-times "basic block vectorized" 1 "slp2" } } */
+/* { dg-final { scan-tree-dump-times "x\\\[\[0-1\]\\\] = " 2 "optimized" } } */

Reply via email to