[PATCH] vect: Enable prolog peeling for uncounted loops

Victor Do Nascimento Wed, 12 Nov 2025 05:58:56 -0800

The categorization of uncounted loops as
LOOP_VINFO_EARLY_BREAKS_VECT_PEELED disables prolog peeling by
default.  This is due to the assumption that you have early break
exits following the IV counting main exit.  For such loops, prolog
peeling is indeed problematic.


For enabling prolog peeling in uncounted loops it is sufficient, when
duplicating the loop for the prolog, to convert the prolog loop into a
counted loop, inserting a counting IV exit at the end, thus resulting
in the kind of early-break loop already supported by the compiler.

The pre-existing exits will continue to point to the exit node, while
the new exit will point to the vectorized loop, directing control flow
there once the number of iterations required for alignment are
completed.

In order to achieve this, we note that `vect_set_loop_condition'
replaces the condition in the main exit of a counted loop, all the
while inserting the prolog IV and its update statement.  The design
strategy is thus:

  - Have `slpeel_tree_duplicate_loop_to_edge_cfg' duplicate the final
  exit of the loop.  For the original exit, if the exit condition is
  true, the edge->dest will remain unchanged.  For the false
  condition, we add the the exit condition again, having split the
  single predecessor edge to the latch.
  Ultimately, other than evaluating an exit condition twice, no change
  is made to the control flow of the loop here and, consequently, no
  change is made to the control flow of the wider program to which the
  loop belongs.

  - As this new basic block will contain the IV-counting exit
  condition, its exit edge will be used for the control flow when
  alignment is achieved and thus we mark it as the new `new_exit'.
  This exit is then used in `redirect_edge_and_branch_force (new_exit,
  preheader)' and its basic block passed to `vect_set_loop_condition',
  wherein its condition will be replaced accordingly, correctly
  completing the setting up of our prolog loop.

  - In order to control this new functionality in
  slpeel_tree_duplicate_loop_to_edge_cfg we are, however, required to
  add a new parameter to the function. This is to be set to true when
  we have an uncounted loop AND we're generating its prolog.  This is
  done via the `bool duplicate_main_e' parameter, defaulting to false,
  allowing existing calls to the function to remain unchanged.

No testsuite or performance regressions on AArch64.

gcc/ChangeLog:

        * tree-vect-data-refs.cc (vect_enhance_data_refs_alignment):
        Enable peeling for uncounted loops.
        * tree-vect-loop-manip.cc
        (slpeel_tree_duplicate_loop_to_edge_cfg): Add exit condition
        duplication functionality.
        * tree-vectorizer.h (slpeel_tree_duplicate_loop_to_edge_cfg):
        Modify function signature.
        (vect_do_peeling): Enable uncounted loop peeling.

gcc/testsuite/ChangeLog:

        * gcc.dg/vect/vect-uncounted-prolog-peel-1.c: New.
---
 .../vect/vect-uncounted-prolog-peel-1.c       |  23 ++++
 gcc/tree-vect-data-refs.cc                    |   3 +-
 gcc/tree-vect-loop-manip.cc                   | 116 ++++++++++++++++--
 gcc/tree-vectorizer.h                         |   2 +-
 4 files changed, 131 insertions(+), 13 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-uncounted-prolog-peel-1.c

diff --git a/gcc/testsuite/gcc.dg/vect/vect-uncounted-prolog-peel-1.c 
b/gcc/testsuite/gcc.dg/vect/vect-uncounted-prolog-peel-1.c
new file mode 100644
index 00000000000..fab4ed0f569
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-uncounted-prolog-peel-1.c
@@ -0,0 +1,23 @@
+/* { dg-add-options vect_early_break } */
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_early_break } */
+/* { dg-require-effective-target vect_int } */
+
+int
+foo (int *haystack, int needle)
+{
+  int i = 0;
+  while (1)
+    {
+      if (haystack[i] == needle)
+       return i;
+      i++;
+     }
+}
+
+/* { dg-final { scan-tree-dump {note:\s*Alignment of access forced using 
peeling.} "vect" } } */
+/* { dg-final { scan-tree-dump {if \(prolog_loop_niters.[0-9_]+ == 
0\)\n\s*goto} "vect" } } */
+/* { dg-final { scan-tree-dump {ivtmp_[0-9_]+ = PHI <ivtmp_[0-9_]+\([0-9_]+\), 
0\([0-9_]+\)>} "vect" } } */
+/* { dg-final { scan-tree-dump {ivtmp_[0-9_]+ = ivtmp_[0-9_]+ \+ 1;} "vect" } 
} */
+/* { dg-final { scan-tree-dump {if \(ivtmp_[0-9_]+ >= 
prolog_loop_niters.[0-9_]+\)\n\s*goto} "vect" } } */
+/* { dg-final { scan-tree-dump {vectorized 1 loops in function} "vect" } } */
diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
index 0c87973fc63..13aefddc0c2 100644
--- a/gcc/tree-vect-data-refs.cc
+++ b/gcc/tree-vect-data-refs.cc
@@ -2598,7 +2598,8 @@ vect_enhance_data_refs_alignment (loop_vec_info 
loop_vinfo)
       || loop->inner
       /* We don't currently maintaing the LCSSA for prologue peeled inversed
         loops.  */
-      || LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
+      || (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo)
+         && !LOOP_VINFO_NITERS_UNCOUNTED_P (loop_vinfo)))
     do_peeling = false;
 
   struct _vect_peel_extended_info peel_for_known_alignment;
diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
index 6d623e980f2..b03113f3449 100644
--- a/gcc/tree-vect-loop-manip.cc
+++ b/gcc/tree-vect-loop-manip.cc
@@ -1480,7 +1480,7 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, 
edge loop_exit,
                                        edge scalar_exit, edge e, edge *new_e,
                                        bool flow_loops,
                                        vec<basic_block> *updated_doms,
-                                       bool uncounted_p)
+                                       bool uncounted_p, bool duplicate_main_e)
 {
   class loop *new_loop;
   basic_block *new_bbs, *bbs, *pbbs;
@@ -1938,10 +1938,98 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop 
*loop, edge loop_exit,
       flush_pending_stmts (entry_e);
       set_immediate_dominator (CDI_DOMINATORS, new_preheader, entry_e->src);
 
+
+      /* `vect_set_loop_condition' replaces the condition in the main exit of
+        loop.  For counted loops, this is the IV counting exit, so in the case
+        of the prolog loop, we are replacing the old IV counting exit with
+        limit of total loop niters for the new IV counting exit with limit of
+        the prolog niters, as desired.  For uncounted loops, we don't have an
+        IV-counting exit to replace, so we replicate the last exit to serve as
+        a dummy exit to be consumed by `vect_set_loop_condition' later on.  */
+      if (duplicate_main_e)
+       {
+         edge to_latch_e = single_pred_edge (new_loop->latch);
+         gcond* exit_cond = NULL;
+         for (edge exit_e : get_loop_exit_edges (new_loop))
+           if (exit_e->src == to_latch_e->src)
+             exit_cond = get_loop_exit_condition (exit_e);
+
+         /* Identify edges associated with true and false branches.  */
+         edge_iterator ei;
+         basic_block true_dest = NULL, false_dest = NULL;
+         FOR_EACH_EDGE (e, ei, to_latch_e->src->succs)
+           {
+             if (e->flags & EDGE_TRUE_VALUE)
+                 true_dest = e->dest;
+             else if (e->flags & EDGE_FALSE_VALUE)
+               false_dest = e->dest;
+           }
+
+         /* Add new bb for duplicate exit.  */
+         basic_block bbcond = split_edge (to_latch_e);
+         gimple_stmt_iterator a = gsi_last_bb (bbcond);
+
+         /* Reset flags for if/else edge.  */
+         (single_pred_edge (new_loop->latch))->flags &= ~EDGE_FALLTHRU;
+
+         /* Build the condition.  */
+         gcond *cond_copy = gimple_build_cond (gimple_cond_code (exit_cond),
+                                               gimple_cond_lhs (exit_cond),
+                                               gimple_cond_rhs (exit_cond),
+                                               NULL_TREE,
+                                               NULL_TREE);
+
+         gsi_insert_after (&a, cond_copy, GSI_NEW_STMT);
+
+         edge e_true, e_false;
+         e_true = make_edge (bbcond, true_dest, EDGE_TRUE_VALUE);
+         e_false = make_edge (bbcond, false_dest, EDGE_FALSE_VALUE);
+         if (!e_false)
+           e_false = find_edge (bbcond, false_dest);
+         if (!e_true)
+           e_true = find_edge (bbcond, true_dest);
+
+         /* Identify the new exit edge.  */
+         edge dup_exit, to_latch;
+         if (e_true->dest == new_exit->dest)
+           {
+             dup_exit = e_true;
+             to_latch = e_false;
+           }
+         else
+           {
+             dup_exit = e_false;
+             to_latch = e_true;
+           }
+         dup_exit->probability = new_exit->probability;
+         to_latch->probability = dup_exit->probability.invert ();
+
+         /* Populate new phi node args.  */
+         for (gphi_iterator gsi = gsi_start_phis (new_exit->dest);
+                                  !gsi_end_p (gsi); gsi_next (&gsi))
+           {
+             gphi *phi = gsi.phi ();
+             tree merge_arg = PHI_ARG_DEF_FROM_EDGE (phi, new_exit);
+             location_t merge_loc
+               = gimple_phi_arg_location_from_edge (phi, dup_exit);
+             add_phi_arg (phi, merge_arg, dup_exit, merge_loc);
+           }
+
+         set_immediate_dominator (CDI_DOMINATORS, dup_exit->src,
+                                  new_exit->src);
+         new_exit = dup_exit;
+         *new_e = new_exit;
+       }
+
       redirect_edge_and_branch_force (new_exit, preheader);
       flush_pending_stmts (new_exit);
       set_immediate_dominator (CDI_DOMINATORS, preheader, new_exit->src);
 
+      if (duplicate_main_e)
+       set_immediate_dominator (CDI_DOMINATORS, scalar_exit->dest,
+                                recompute_dominator (CDI_DOMINATORS,
+                                                     scalar_exit->dest));
+
       /* And remove the non-necessary forwarder again.  Keep the other
          one so we have a proper pre-header for the loop at the exit edge.  */
       redirect_edge_pred (single_succ_edge (new_preheader),
@@ -1951,11 +2039,11 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop 
*loop, edge loop_exit,
                               loop_preheader_edge (new_loop)->src);
 
       /* Update dominators for multiple exits.  */
-      if (multiple_exits_p)
+      if (multiple_exits_p || duplicate_main_e)
        {
          for (edge alt_e : loop_exits)
            {
-             if (alt_e == loop_exit)
+             if ((alt_e == loop_exit) && !duplicate_main_e)
                continue;
              basic_block old_dom
                = get_immediate_dominator (CDI_DOMINATORS, alt_e->dest);
@@ -3361,14 +3449,17 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, 
tree nitersm1,
       e = loop_preheader_edge (loop);
       edge exit_e = LOOP_VINFO_IV_EXIT (loop_vinfo);
       gcc_checking_assert (slpeel_can_duplicate_loop_p (loop, exit_e, e)
-                          && !LOOP_VINFO_EARLY_BREAKS_VECT_PEELED 
(loop_vinfo));
+                          && (!LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo)
+                              || uncounted_p));
 
       /* Peel prolog and put it on preheader edge of loop.  */
       edge scalar_e = LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo);
       edge prolog_e = NULL;
       prolog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, exit_e,
                                                       scalar_loop, scalar_e,
-                                                      e, &prolog_e);
+                                                      e, &prolog_e, true, NULL,
+                                                      false, uncounted_p);
+
       gcc_assert (prolog);
       prolog->force_vectorize = false;
 
@@ -3421,12 +3512,15 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, 
tree nitersm1,
 
       /* Update init address of DRs.  */
       vect_update_inits_of_drs (loop_vinfo, niters_prolog, PLUS_EXPR);
-      /* Update niters for vector loop.  */
-      LOOP_VINFO_NITERS (loop_vinfo)
-       = fold_build2 (MINUS_EXPR, type, niters, niters_prolog);
-      LOOP_VINFO_NITERSM1 (loop_vinfo)
-       = fold_build2 (MINUS_EXPR, type,
-                      LOOP_VINFO_NITERSM1 (loop_vinfo), niters_prolog);
+      if (!uncounted_p)
+       {
+         /* Update niters for vector loop.  */
+         LOOP_VINFO_NITERS (loop_vinfo)
+           = fold_build2 (MINUS_EXPR, type, niters, niters_prolog);
+         LOOP_VINFO_NITERSM1 (loop_vinfo)
+           = fold_build2 (MINUS_EXPR, type,
+                          LOOP_VINFO_NITERSM1 (loop_vinfo), niters_prolog);
+       }
       bool new_var_p = false;
       niters = vect_build_loop_niters (loop_vinfo, &new_var_p);
       /* It's guaranteed that vector loop bound before vectorization is at
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index bb761fdc9bb..38cdfe9aee3 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -2462,7 +2462,7 @@ class loop *slpeel_tree_duplicate_loop_to_edge_cfg (class 
loop *, edge,
                                                    class loop *, edge,
                                                    edge, edge *, bool = true,
                                                    vec<basic_block> * = NULL,
-                                                   bool=false);
+                                                   bool=false, bool=false);
 class loop *vect_loop_versioning (loop_vec_info, gimple *);
 extern class loop *vect_do_peeling (loop_vec_info, tree, tree,
                                    tree *, tree *, tree *, int, bool, bool,
-- 
2.43.0

[PATCH] vect: Enable prolog peeling for uncounted loops

Reply via email to