On Mon, Jan 09, 2017 at 11:08:08AM +0100, Richard Biener wrote:
> > > > There is one thing my patch doesn't do but should for efficiency, if 
> > > > loop1
> > > > (outer loop) is not successfully outer-loop vectorized, then we should 
> > > > mark
> > > > loop2 (its inner loop) as dont_vectorize if the outer loop has been
> > > > LOOP_VECTORIZED guarded.  Then the gcc.dg/gomp/pr68128-1.c change
> > > > wouldn't be actually needed.
> 
> (*)
> Ok, I don't have too many spare cycles either so can you fix (*)?  Then
> we can go with the extra versionings for GCC 7 for the moment and if any
> of us has enough time to revisit this soon we can.

Here is untested (except for the affected testcases) patch to do that,
ok if it passes bootstrap/regtest?  I'll file [8 Regression] with details
what we want to undo and do for GCC 8.

2017-01-09  Jakub Jelinek  <ja...@redhat.com>

        PR tree-optimization/78899
        * tree-if-conv.c (version_loop_for_if_conversion): Instead of
        returning bool return struct loop *, NULL for failure and the new
        loop on success.
        (versionable_outer_loop_p): Don't version outer loop if it has
        dont_vectorized bit set.
        (tree_if_conversion): When versioning outer loop, ensure
        tree_if_conversion is performed also on the inner loop of the
        non-vectorizable outer loop copy.
        * tree-vectorizer.c (set_uid_loop_bbs): Formatting fix.  Fold
        LOOP_VECTORIZED in inner loop of the scalar outer loop and
        prevent vectorization of it.
        (vectorize_loops): For outer + inner LOOP_VECTORIZED, ensure
        the outer loop vectorization of the non-scalar version is attempted
        before vectorization of the inner loop in scalar version.  If
        outer LOOP_VECTORIZED guarded loop is not vectorized, prevent
        vectorization of its inner loop.
        * tree-vect-loop-manip.c (rename_variables_in_bb): If outer_loop
        has 2 inner loops, rename also on edges from bb whose single pred
        is outer_loop->header.  Fix typo in function comment.

        * gcc.target/i386/pr78899.c: New test.
        * gcc.dg/pr71077.c: New test.

--- gcc/tree-if-conv.c.jj       2017-01-06 19:34:04.052560851 +0100
+++ gcc/tree-if-conv.c  2017-01-09 11:52:59.154806369 +0100
@@ -2535,7 +2535,7 @@ combine_blocks (struct loop *loop)
    loop to execute.  The vectorizer pass will fold this
    internal call into either true or false.  */
 
-static bool
+static struct loop *
 version_loop_for_if_conversion (struct loop *loop)
 {
   basic_block cond_bb;
@@ -2566,7 +2566,7 @@ version_loop_for_if_conversion (struct l
     ifc_bbs[i]->aux = saved_preds[i];
 
   if (new_loop == NULL)
-    return false;
+    return NULL;
 
   new_loop->dont_vectorize = true;
   new_loop->force_vectorize = false;
@@ -2574,7 +2574,7 @@ version_loop_for_if_conversion (struct l
   gimple_call_set_arg (g, 1, build_int_cst (integer_type_node, new_loop->num));
   gsi_insert_before (&gsi, g, GSI_SAME_STMT);
   update_ssa (TODO_update_ssa);
-  return true;
+  return new_loop;
 }
 
 /* Return true when LOOP satisfies the follow conditions that will
@@ -2594,6 +2594,7 @@ static bool
 versionable_outer_loop_p (struct loop *loop)
 {
   if (!loop_outer (loop)
+      || loop->dont_vectorize
       || !loop->inner
       || loop->inner->next
       || !single_exit (loop)
@@ -2602,7 +2603,7 @@ versionable_outer_loop_p (struct loop *l
       || !single_pred_p (loop->latch)
       || !single_pred_p (loop->inner->latch))
     return false;
-  
+
   basic_block outer_exit = single_pred (loop->latch);
   basic_block inner_exit = single_pred (loop->inner->latch);
 
@@ -2789,7 +2790,10 @@ tree_if_conversion (struct loop *loop)
 {
   unsigned int todo = 0;
   bool aggressive_if_conv;
+  struct loop *rloop;
 
+ again:
+  rloop = NULL;
   ifc_bbs = NULL;
   any_pred_load_store = false;
   any_complicated_phi = false;
@@ -2829,8 +2833,31 @@ tree_if_conversion (struct loop *loop)
       struct loop *vloop
        = (versionable_outer_loop_p (loop_outer (loop))
           ? loop_outer (loop) : loop);
-      if (!version_loop_for_if_conversion (vloop))
+      struct loop *nloop = version_loop_for_if_conversion (vloop);
+      if (nloop == NULL)
        goto cleanup;
+      if (vloop != loop)
+       {
+         /* If versionable_outer_loop_p decided to version the
+            outer loop, version also the inner loop of the non-vectorized
+            loop copy.  So we transform:
+             loop1
+               loop2
+            into:
+             if (LOOP_VECTORIZED (1, 3))
+               {
+                 loop1
+                   loop2
+               }
+             else
+               loop3 (copy of loop1)
+                 if (LOOP_VECTORIZED (4, 5))
+                   loop4 (copy of loop2)
+                 else
+                   loop5 (copy of loop4)  */
+         gcc_assert (nloop->inner && nloop->inner->next == NULL);
+         rloop = nloop->inner;
+       }
     }
 
   /* Now all statements are if-convertible.  Combine all the basic
@@ -2854,6 +2881,11 @@ tree_if_conversion (struct loop *loop)
       free (ifc_bbs);
       ifc_bbs = NULL;
     }
+  if (rloop != NULL)
+    {
+      loop = rloop;
+      goto again;
+    }
 
   return todo;
 }
--- gcc/tree-vectorizer.c.jj    2017-01-06 19:34:04.064560697 +0100
+++ gcc/tree-vectorizer.c       2017-01-09 12:04:08.721053174 +0100
@@ -465,6 +465,7 @@ fold_loop_vectorized_call (gimple *g, tr
       update_stmt (use_stmt);
     }
 }
+
 /* Set the uids of all the statements in basic blocks inside loop
    represented by LOOP_VINFO. LOOP_VECTORIZED_CALL is the internal
    call guarding the loop which has been if converted.  */
@@ -477,9 +478,22 @@ set_uid_loop_bbs (loop_vec_info loop_vin
   struct loop *scalar_loop = get_loop (cfun, tree_to_shwi (arg));
 
   LOOP_VINFO_SCALAR_LOOP (loop_vinfo) = scalar_loop;
-  gcc_checking_assert (vect_loop_vectorized_call
-                      (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
+  gcc_checking_assert (vect_loop_vectorized_call (scalar_loop)
                       == loop_vectorized_call);
+  /* If we are going to vectorize outer loop, prevent vectorization
+     of the inner loop in the scalar loop - either the scalar loop is
+     thrown away, so it is a wasted work, or is used only for
+     a few iterations.  */
+  if (scalar_loop->inner)
+    {
+      gimple *g = vect_loop_vectorized_call (scalar_loop->inner);
+      if (g)
+       {
+         arg = gimple_call_arg (g, 0);
+         get_loop (cfun, tree_to_shwi (arg))->dont_vectorize = true;
+         fold_loop_vectorized_call (g, boolean_false_node);
+       }
+    }
   bbs = get_loop_body (scalar_loop);
   for (i = 0; i < scalar_loop->num_nodes; i++)
     {
@@ -534,14 +548,59 @@ vectorize_loops (void)
      only over initial loops skipping newly generated ones.  */
   FOR_EACH_LOOP (loop, 0)
     if (loop->dont_vectorize)
-      any_ifcvt_loops = true;
-    else if ((flag_tree_loop_vectorize
-             && optimize_loop_nest_for_speed_p (loop))
-            || loop->force_vectorize)
       {
-       loop_vec_info loop_vinfo, orig_loop_vinfo = NULL;
-       gimple *loop_vectorized_call = vect_loop_vectorized_call (loop);
-vectorize_epilogue:
+       any_ifcvt_loops = true;
+       /* If-conversion sometimes versions both the outer loop
+          (for the case when outer loop vectorization might be
+          desirable) as well as the inner loop in the scalar version
+          of the loop.  So we have:
+           if (LOOP_VECTORIZED (1, 3))
+             {
+               loop1
+                 loop2
+             }
+           else
+             loop3 (copy of loop1)
+               if (LOOP_VECTORIZED (4, 5))
+                 loop4 (copy of loop2)
+               else
+                 loop5 (copy of loop4)
+          If FOR_EACH_LOOP gives us loop3 first (which has
+          dont_vectorize set), make sure to process loop1 before loop4;
+          so that we can prevent vectorization of loop4 if loop1
+          is successfully vectorized.  */
+       if (loop->inner)
+         {
+           gimple *loop_vectorized_call
+             = vect_loop_vectorized_call (loop);
+           if (loop_vectorized_call
+               && vect_loop_vectorized_call (loop->inner))
+             {
+               tree arg = gimple_call_arg (loop_vectorized_call, 0);
+               struct loop *vector_loop
+                 = get_loop (cfun, tree_to_shwi (arg));
+               if (vector_loop && vector_loop != loop)
+                 {
+                   loop = vector_loop;
+                   /* Make sure we don't vectorize it twice.  */
+                   loop->dont_vectorize = true;
+                   goto try_vectorize;
+                 }
+             }
+         }
+      }
+    else
+      {
+       loop_vec_info loop_vinfo, orig_loop_vinfo;
+       gimple *loop_vectorized_call;
+       try_vectorize:
+       if (!((flag_tree_loop_vectorize
+              && optimize_loop_nest_for_speed_p (loop))
+             || loop->force_vectorize))
+         continue;
+       orig_loop_vinfo = NULL;
+       loop_vectorized_call = vect_loop_vectorized_call (loop);
+       vectorize_epilogue:
        vect_location = find_loop_location (loop);
         if (LOCATION_LOCUS (vect_location) != UNKNOWN_LOCATION
            && dump_enabled_p ())
@@ -595,6 +654,12 @@ vectorize_epilogue:
                    ret |= TODO_cleanup_cfg;
                  }
              }
+           /* If outer loop vectorization fails for LOOP_VECTORIZED guarded
+              loop, don't vectorize its inner loop; we'll attempt to
+              vectorize LOOP_VECTORIZED guarded inner loop of the scalar
+              loop version.  */
+           if (loop_vectorized_call && loop->inner)
+             loop->inner->dont_vectorize = true;
            continue;
          }
 
--- gcc/tree-vect-loop-manip.c.jj       2017-01-06 19:34:04.040561006 +0100
+++ gcc/tree-vect-loop-manip.c  2017-01-09 11:52:59.155806356 +0100
@@ -71,7 +71,7 @@ rename_use_op (use_operand_p op_p)
 }
 
 
-/* Renames the variables in basic block BB.  Allow renaming  of PHI argumnets
+/* Renames the variables in basic block BB.  Allow renaming  of PHI arguments
    on edges incoming from outer-block header if RENAME_FROM_OUTER_LOOP is
    true.  */
 
@@ -102,9 +102,25 @@ rename_variables_in_bb (basic_block bb,
 
   FOR_EACH_EDGE (e, ei, bb->preds)
     {
-      if (!flow_bb_inside_loop_p (loop, e->src)
-         && (!rename_from_outer_loop || e->src != outer_loop->header))
-       continue;
+      if (!flow_bb_inside_loop_p (loop, e->src))
+       {
+         if (!rename_from_outer_loop)
+           continue;
+         if (e->src != outer_loop->header)
+           {
+             if (outer_loop->inner->next)
+               {
+                 /* If outer_loop has 2 inner loops, allow there to
+                    be an extra basic block which decides which of the
+                    two loops to use using LOOP_VECTORIZED.  */
+                 if (!single_pred_p (e->src)
+                     || single_pred (e->src) != outer_loop->header)
+                   continue;
+               }
+             else
+               continue;
+           }
+       }
       for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
           gsi_next (&gsi))
         rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
--- gcc/testsuite/gcc.target/i386/pr78899.c.jj  2017-01-09 11:52:59.156806343 
+0100
+++ gcc/testsuite/gcc.target/i386/pr78899.c     2017-01-09 11:52:59.156806343 
+0100
@@ -0,0 +1,27 @@
+/* PR tree-optimization/78899 */
+/* { dg-do compile } */
+/* { dg-options "-Ofast -fopenmp-simd -mavx2 -mno-avx512f" } */
+
+#define N 1024
+#define M 4
+int p1[N], p2[N], p3[N], c[N];
+
+void
+foo (int n)
+{
+  int i, k;
+  for (k = 0; k < n / M; k++)
+    {
+    #pragma omp simd
+      for (i = 0; i < M; i++)
+       if (c[k * M + i])
+         {
+           p1[k * M + i] += 1;
+           p2[k * M + i] = p3[k * M + i] + 2;
+         }
+    }
+}
+
+/* Ensure the loop is vectorized.  */
+/* { dg-final { scan-assembler "vpmaskmov" } } */
+/* { dg-final { scan-assembler "vpadd" } } */
--- gcc/testsuite/gcc.dg/pr71077.c.jj   2017-01-09 11:52:59.179806042 +0100
+++ gcc/testsuite/gcc.dg/pr71077.c      2017-01-09 11:52:59.179806042 +0100
@@ -0,0 +1,14 @@
+/* PR c++/71077  */
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+/* { dg-additional-options "-mavx2" { target { i?86-*-* x86_64-*-* } } } */
+
+void
+foo (int *a, int n)
+{
+  int b, c;
+  for (b = 0; b < n; b++)
+    for (c = 0; c < 32; c++)
+      if ((b & 1U) << c)
+       a[b + c] = 0;
+}


        Jakub

Reply via email to