The following removes late deciding to elide vectorized epilogues to
the analysis phase and also avoids altering the epilogues niter.
The costing part from vect_determine_partial_vectors_and_peeling is
moved to vect_analyze_loop_costing where we use the main loop
analysis to constrain the epilogue scalar iterations.

I have not tried to integrate this with vect_known_niters_smaller_than_vf.

It seems the for_epilogue_p parameter in
vect_determine_partial_vectors_and_peeling is largely useless and
we could compute that in the function itself.

Bootstrapped and tested on x86_64-unknown-linux-gnu, OK?

I suppose testing on aarch64 would be nice-to-have - any takers?

Thanks,
Richard.

        PR tree-optimization/110310
        * tree-vect-loop.cc (vect_determine_partial_vectors_and_peeling):
        Move costing part ...
        (vect_analyze_loop_costing): ... here.  Integrate better
        estimate for epilogues from ...
        (vect_analyze_loop_2): Call vect_determine_partial_vectors_and_peeling
        with actual epilogue status.
        * tree-vect-loop-manip.cc (vect_do_peeling): ... here and
        avoid cancelling epilogue vectorization.
        (vect_update_epilogue_niters): Remove.  No longer update
        epilogue LOOP_VINFO_NITERS.

        * gcc.target/i386/pr110310.c: New testcase.
        * gcc.dg/vect/slp-perm-12.c: Disable epilogue vectorization.
---
 gcc/testsuite/gcc.dg/vect/slp-perm-12.c  |   1 +
 gcc/testsuite/gcc.target/i386/pr110310.c |  13 +++
 gcc/tree-vect-loop-manip.cc              | 104 +++++------------------
 gcc/tree-vect-loop.cc                    |  98 ++++++++++++++-------
 4 files changed, 102 insertions(+), 114 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr110310.c

diff --git a/gcc/testsuite/gcc.dg/vect/slp-perm-12.c 
b/gcc/testsuite/gcc.dg/vect/slp-perm-12.c
index 113223ab0f9..635fca54399 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-perm-12.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-perm-12.c
@@ -1,5 +1,6 @@
 /* { dg-require-effective-target vect_int } */
 /* { dg-require-effective-target vect_pack_trunc } */
+/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
 /* { dg-additional-options "-msse4" { target { i?86-*-* x86_64-*-* } } } */
 
 #include "tree-vect.h"
diff --git a/gcc/testsuite/gcc.target/i386/pr110310.c 
b/gcc/testsuite/gcc.target/i386/pr110310.c
new file mode 100644
index 00000000000..dce388aeb20
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr110310.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=znver4 -fdump-tree-vect-optimized" } */
+
+void foo (int * __restrict a, int *b)
+{
+  for (int i = 0; i < 20; ++i)
+    a[i] = b[i] + 42;
+}
+
+/* We should vectorize the main loop with AVX512 and the epilog with SSE.  */
+
+/* { dg-final { scan-tree-dump "optimized: loop vectorized using 64 byte 
vectors" "vect" } } */
+/* { dg-final { scan-tree-dump "optimized: loop vectorized using 16 byte 
vectors" "vect" } } */
diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
index 20f570e4a0d..6c452e07880 100644
--- a/gcc/tree-vect-loop-manip.cc
+++ b/gcc/tree-vect-loop-manip.cc
@@ -2882,34 +2882,6 @@ slpeel_update_phi_nodes_for_lcssa (class loop *epilog)
     rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
 }
 
-/* EPILOGUE_VINFO is an epilogue loop that we now know would need to
-   iterate exactly CONST_NITERS times.  Make a final decision about
-   whether the epilogue loop should be used, returning true if so.  */
-
-static bool
-vect_update_epilogue_niters (loop_vec_info epilogue_vinfo,
-                            unsigned HOST_WIDE_INT const_niters)
-{
-  /* Avoid wrap-around when computing const_niters - 1.  Also reject
-     using an epilogue loop for a single scalar iteration, even if
-     we could in principle implement that using partial vectors.  */
-  unsigned int gap_niters = LOOP_VINFO_PEELING_FOR_GAPS (epilogue_vinfo);
-  if (const_niters <= gap_niters + 1)
-    return false;
-
-  /* Install the number of iterations.  */
-  tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (epilogue_vinfo));
-  tree niters_tree = build_int_cst (niters_type, const_niters);
-  tree nitersm1_tree = build_int_cst (niters_type, const_niters - 1);
-
-  LOOP_VINFO_NITERS (epilogue_vinfo) = niters_tree;
-  LOOP_VINFO_NITERSM1 (epilogue_vinfo) = nitersm1_tree;
-
-  /* Decide what to do if the number of epilogue iterations is not
-     a multiple of the epilogue loop's vectorization factor.  */
-  return vect_determine_partial_vectors_and_peeling (epilogue_vinfo, true);
-}
-
 /* LOOP_VINFO is an epilogue loop whose corresponding main loop can be skipped.
    Return a value that equals:
 
@@ -3039,7 +3011,6 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, 
tree nitersm1,
   int estimated_vf;
   int prolog_peeling = 0;
   bool vect_epilogues = loop_vinfo->epilogue_vinfos.length () > 0;
-  bool vect_epilogues_updated_niters = false;
   /* We currently do not support prolog peeling if the target alignment is not
      known at compile time.  'vect_gen_prolog_loop_niters' depends on the
      target alignment being constant.  */
@@ -3167,36 +3138,6 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, 
tree nitersm1,
   tree before_loop_niters = LOOP_VINFO_NITERS (loop_vinfo);
   edge update_e = NULL, skip_e = NULL;
   unsigned int lowest_vf = constant_lower_bound (vf);
-  /* If we know the number of scalar iterations for the main loop we should
-     check whether after the main loop there are enough iterations left over
-     for the epilogue.  */
-  if (vect_epilogues
-      && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
-      && prolog_peeling >= 0
-      && known_eq (vf, lowest_vf))
-    {
-      unsigned HOST_WIDE_INT eiters
-       = (LOOP_VINFO_INT_NITERS (loop_vinfo)
-          - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
-
-      eiters -= prolog_peeling;
-      eiters
-       = eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
-
-      while (!vect_update_epilogue_niters (epilogue_vinfo, eiters))
-       {
-         delete epilogue_vinfo;
-         epilogue_vinfo = NULL;
-         if (loop_vinfo->epilogue_vinfos.length () == 0)
-           {
-             vect_epilogues = false;
-             break;
-           }
-         epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
-         loop_vinfo->epilogue_vinfos.ordered_remove (0);
-       }
-      vect_epilogues_updated_niters = true;
-    }
   /* Prolog loop may be skipped.  */
   bool skip_prolog = (prolog_peeling != 0);
   /* Skip this loop to epilog when there are not enough iterations to enter 
this
@@ -3473,9 +3414,7 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, 
tree nitersm1,
         skip_e edge.  */
       if (skip_vector)
        {
-         gcc_assert (update_e != NULL
-                     && skip_e != NULL
-                     && !vect_epilogues_updated_niters);
+         gcc_assert (update_e != NULL && skip_e != NULL);
          gphi *new_phi = create_phi_node (make_ssa_name (TREE_TYPE (niters)),
                                           update_e->dest);
          tree new_ssa = make_ssa_name (TREE_TYPE (niters));
@@ -3506,28 +3445,25 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, 
tree nitersm1,
         loop and its prologue.  */
       *advance = niters;
 
-      if (!vect_epilogues_updated_niters)
-       {
-         /* Subtract the number of iterations performed by the vectorized loop
-            from the number of total iterations.  */
-         tree epilogue_niters = fold_build2 (MINUS_EXPR, TREE_TYPE (niters),
-                                             before_loop_niters,
-                                             niters);
-
-         LOOP_VINFO_NITERS (epilogue_vinfo) = epilogue_niters;
-         LOOP_VINFO_NITERSM1 (epilogue_vinfo)
-           = fold_build2 (MINUS_EXPR, TREE_TYPE (epilogue_niters),
-                          epilogue_niters,
-                          build_one_cst (TREE_TYPE (epilogue_niters)));
-
-         /* Decide what to do if the number of epilogue iterations is not
-            a multiple of the epilogue loop's vectorization factor.
-            We should have rejected the loop during the analysis phase
-            if this fails.  */
-         if (!vect_determine_partial_vectors_and_peeling (epilogue_vinfo,
-                                                          true))
-           gcc_unreachable ();
-       }
+      /* Subtract the number of iterations performed by the vectorized loop
+        from the number of total iterations.  */
+      tree epilogue_niters = fold_build2 (MINUS_EXPR, TREE_TYPE (niters),
+                                         before_loop_niters,
+                                         niters);
+
+      LOOP_VINFO_NITERS (epilogue_vinfo) = epilogue_niters;
+      LOOP_VINFO_NITERSM1 (epilogue_vinfo)
+       = fold_build2 (MINUS_EXPR, TREE_TYPE (epilogue_niters),
+                      epilogue_niters,
+                      build_one_cst (TREE_TYPE (epilogue_niters)));
+
+      /* Decide what to do if the number of epilogue iterations is not
+        a multiple of the epilogue loop's vectorization factor.
+        We should have rejected the loop during the analysis phase
+        if this fails.  */
+      bool res = vect_determine_partial_vectors_and_peeling (epilogue_vinfo,
+                                                            true);
+      gcc_assert (res);
     }
 
   adjust_vec.release ();
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 0a03f56aae7..f39a1ecb306 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -2144,14 +2144,76 @@ vect_analyze_loop_costing (loop_vec_info loop_vinfo,
 
   /* Only loops that can handle partially-populated vectors can have iteration
      counts less than the vectorization factor.  */
-  if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
+  if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
+      && vect_known_niters_smaller_than_vf (loop_vinfo))
     {
-      if (vect_known_niters_smaller_than_vf (loop_vinfo))
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "not vectorized: iteration count smaller than "
+                        "vectorization factor.\n");
+      return 0;
+    }
+
+  /* If we know the number of iterations we can do better, for the
+     epilogue we can also decide whether the main loop leaves us
+     with enough iterations, prefering a smaller vector epilog then
+     also possibly used for the case we skip the vector loop.  */
+  if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
+      && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
+    {
+      widest_int scalar_niters
+       = wi::to_widest (LOOP_VINFO_NITERSM1 (loop_vinfo)) + 1;
+      if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
+       {
+         loop_vec_info orig_loop_vinfo
+           = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
+         unsigned lowest_vf
+           = constant_lower_bound (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo));
+         int prolog_peeling = 0;
+         if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
+           prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo);
+         if (prolog_peeling >= 0
+             && known_eq (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
+                          lowest_vf))
+           {
+             unsigned gap
+               = LOOP_VINFO_PEELING_FOR_GAPS (orig_loop_vinfo) ? 1 : 0;
+             scalar_niters = ((scalar_niters - gap - prolog_peeling)
+                              % lowest_vf + gap);
+             if (scalar_niters == 0)
+               {
+                 if (dump_enabled_p ())
+                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                    "not vectorized: loop never entered\n");
+                 return 0;
+               }
+           }
+       }
+
+      /* Check that the loop processes at least one full vector.  */
+      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+      if (known_lt (scalar_niters, vf))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "not vectorized: iteration count smaller than "
-                            "vectorization factor.\n");
+                            "loop does not have enough iterations "
+                            "to support vectorization.\n");
+         return 0;
+       }
+
+      /* If we need to peel an extra epilogue iteration to handle data
+        accesses with gaps, check that there are enough scalar iterations
+        available.
+
+        The check above is redundant with this one when peeling for gaps,
+        but the distinction is useful for diagnostics.  */
+      if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
+         && known_le (scalar_niters, vf))
+       {
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "loop does not have enough iterations "
+                            "to support peeling for gaps.\n");
          return 0;
        }
     }
@@ -2502,31 +2564,6 @@ vect_determine_partial_vectors_and_peeling 
(loop_vec_info loop_vinfo,
                              LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)));
     }
 
-  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
-      && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
-    {
-      /* Check that the loop processes at least one full vector.  */
-      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
-      tree scalar_niters = LOOP_VINFO_NITERS (loop_vinfo);
-      if (known_lt (wi::to_widest (scalar_niters), vf))
-       return opt_result::failure_at (vect_location,
-                                      "loop does not have enough iterations"
-                                      " to support vectorization.\n");
-
-      /* If we need to peel an extra epilogue iteration to handle data
-        accesses with gaps, check that there are enough scalar iterations
-        available.
-
-        The check above is redundant with this one when peeling for gaps,
-        but the distinction is useful for diagnostics.  */
-      tree scalar_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
-      if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
-         && known_lt (wi::to_widest (scalar_nitersm1), vf))
-       return opt_result::failure_at (vect_location,
-                                      "loop does not have enough iterations"
-                                      " to support peeling for gaps.\n");
-    }
-
   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
     = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
        && need_peeling_or_partial_vectors_p);
@@ -3002,7 +3039,8 @@ start_over:
      assuming that the loop will be used as a main loop.  We will redo
      this analysis later if we instead decide to use the loop as an
      epilogue loop.  */
-  ok = vect_determine_partial_vectors_and_peeling (loop_vinfo, false);
+  ok = vect_determine_partial_vectors_and_peeling
+        (loop_vinfo, LOOP_VINFO_EPILOGUE_P (loop_vinfo));
   if (!ok)
     return ok;
 
-- 
2.35.3

Reply via email to