[PATCH] loop2_unroll: split loop exit during unrolling of uncountable loops

Artemiy Volkov Mon, 14 Apr 2025 05:31:12 -0700

Consider the (lightly modified) core_list_reverse () function from Coremark:


struct list_node *
core_list_reverse (struct list_node *list)
{
  struct list_node *next = 0, *tmp;
  #pragma GCC unroll 4
  while (list)
    {
      tmp = list->next;
      list->next = next;
      next = list;
      list = tmp;
    }

  return next;
}

On AArch64, this compiles to the following:

core_list_reverse:
        cbz     x0, .L2
        ldr     x1, [x0]
        mov     x6, 0
        str     x6, [x0]
        mov     x3, x0
        cbz     x1, .L2
.L4:
        ldr     x2, [x1]
        str     x3, [x1]
        mov     x0, x1
        cbz     x2, .L2
        ldr     x4, [x2]
        str     x1, [x2]
        mov     x0, x2
        cbz     x4, .L2
        ...
        mov     x0, x5
        mov     x3, x0
        ldr     x1, [x0]
        str     x6, [x0]
        cbnz    x1, .L4
.L2:
        ret

The 'next' variable lives in the x0 register, which is maintained by the
"mov x0, xR" instruction at every unrolled iteration.  However, this can
be improved by removing the instruction and splitting the loop exit into
multiple exits, each corresponding to the hard register in which the
'next' variable is at a particular iteration, so that the code looks
like:

core_list_reverse:
        cbz     x0, .L2
        mov     x1, 0
.L3:
        ldr     x2, [x0]
        str     x1, [x0]
        cbz     x2, .L2
        ldr     x3, [x2]
        str     x0, [x2]
        cbz     x3, .L13
        ...
        ldr     x0, [x1]
        str     x3, [x1]
        cbnz    x0, .L3
        mov     x0, x1
.L2:
        ret
.L13:
        mov     x0, x2
        ret
.L14:
        mov     x0, x3
        ret

This patch implements this transformation by splitting variables defined
in the loop and live at the (single) exit BB of an uncountable loop,
replacing each of those with a unique temporary pseudo inside the loop
and assigning these pseudos back to the original variables in the newly
split exit BBs (one per unrolled iteration).  (This is the behavior that
the GIMPLE unroller would exhibit were it capable of handling
uncountable loops.)  Afterwards, the cprop pass is able to propagate the
(split) loop variables and carry the move instructions out of the loop.

This change is primarily intended for small in-order cores on which the
latency of the move isn't hidden by the latency of the load.  The
optimization is guarded by the new -fsplit-exit-in-unroller flag that is
on by default.  The flag has been documented in doc/invoke.texi, and the
common.opt.urls file has been regenerated.

The change has been bootstrapped and regtested on i386, x86_64, and
aarch64, and additionally regtested on riscv32.  Two new testcases have
been added to demonstrate operation and interaction with
-fvariable-expansion-in-unroller.  On a Cortex-A53, this patch leads to
an ~0.5% improvement for Coremark and SPECINT2006 geomean (the only
regression being 483.xalancbmk), when compiled with -O2
-funroll-all-loops.  The compile-time increase is ~0.4%.

        PR rtl-optimization/119681

gcc/ChangeLog:

        * common.opt (-fsplit-exit-in-unroller): New flag.
        * common.opt.urls: Regenerate.
        * doc/invoke.texi (-fsplit-exit-in-unroller): Document it.
        * loop-unroll.cc (split_exit): New function.
        (regno_defined_inside_loop_p): New predicate.
        (unroll_loop_stupid): Call split_exit ().
        (has_use_with_multiple_defs_p): New predicate.

gcc/testsuite/ChangeLog:

        * gcc.dg/loop-exit-split-1.c: New test.
        * gcc.dg/loop-exit-split-2.c: New test.

Signed-off-by: Artemiy Volkov <arte...@synopsys.com>
---
 gcc/common.opt                           |   4 +
 gcc/common.opt.urls                      |   3 +
 gcc/doc/invoke.texi                      |  12 +-
 gcc/loop-unroll.cc                       | 201 +++++++++++++++++++++++
 gcc/testsuite/gcc.dg/loop-exit-split-1.c |  31 ++++
 gcc/testsuite/gcc.dg/loop-exit-split-2.c |  33 ++++
 6 files changed, 282 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/loop-exit-split-1.c
 create mode 100644 gcc/testsuite/gcc.dg/loop-exit-split-2.c

diff --git a/gcc/common.opt b/gcc/common.opt
index 2c8fdde54f3..8085fcd0aca 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -1645,6 +1645,10 @@ fdump-unnumbered-links
 Common Var(flag_dump_unnumbered_links)
 Suppress output of previous and next insn numbers in debugging dumps.
 
+fsplit-exit-in-unroller
+Common Var(flag_split_exit_in_unroller) Init(1) Optimization
+Split exit basic blocks of uncountable loops.
+
 fdwarf2-cfi-asm
 Common Var(flag_dwarf2_cfi_asm) Init(HAVE_GAS_CFI_DIRECTIVE)
 Enable CFI tables via GAS assembler directives.
diff --git a/gcc/common.opt.urls b/gcc/common.opt.urls
index a4b14f5241f..800334fd9e2 100644
--- a/gcc/common.opt.urls
+++ b/gcc/common.opt.urls
@@ -654,6 +654,9 @@ UrlSuffix(gcc/Developer-Options.html#index-fdump-unnumbered)
 fdump-unnumbered-links
 UrlSuffix(gcc/Developer-Options.html#index-fdump-unnumbered-links)
 
+fsplit-exit-in-unroller
+UrlSuffix(gcc/Optimize-Options.html#index-fsplit-exit-in-unroller)
+
 fdwarf2-cfi-asm
 UrlSuffix(gcc/Debugging-Options.html#index-fdwarf2-cfi-asm)
 
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 67155eeeda7..6ccdcfd8bc9 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -637,8 +637,8 @@ Objective-C and Objective-C++ Dialects}.
 -fsel-sched-pipelining  -fsel-sched-pipelining-outer-loops
 -fsemantic-interposition  -fshrink-wrap  -fshrink-wrap-separate
 -fsignaling-nans
--fsingle-precision-constant  -fsplit-ivs-in-unroller  -fsplit-loops
--fsplit-paths
+-fsingle-precision-constant -fsplit-exit-in-unroller  -fsplit-ivs-in-unroller
+-fsplit-loops -fsplit-paths
 -fsplit-wide-types  -fsplit-wide-types-early  -fssa-backprop  -fssa-phiopt
 -fstdarg-opt  -fstore-merging  -fstrict-aliasing -fipa-strict-aliasing
 -fthread-jumps  -ftracer  -ftree-bit-ccp
@@ -14459,6 +14459,14 @@ Split paths leading to loop backedges.  This can 
improve dead code
 elimination and common subexpression elimination.  This is enabled by
 default at @option{-O3} and above.
 
+@opindex fsplit-exit-in-unroller
+@item -fsplit-exit-in-unroller
+This option allows converting a single-exit uncountable loop into a
+multiple-exit one while splitting variables defined in the loop to improve
+propagation opportunities, saving move instructions inside the loop body.
+
+This optimization is enabled by default.
+
 @opindex fsplit-ivs-in-unroller
 @item -fsplit-ivs-in-unroller
 Enables expression of values of induction variables in later iterations
diff --git a/gcc/loop-unroll.cc b/gcc/loop-unroll.cc
index b4952055318..078b3fa057c 100644
--- a/gcc/loop-unroll.cc
+++ b/gcc/loop-unroll.cc
@@ -35,6 +35,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "dojump.h"
 #include "expr.h"
 #include "dumpfile.h"
+#include "df.h"
 
 /* This pass performs loop unrolling.  We only perform this
    optimization on innermost loops (with single exception) because
@@ -172,6 +173,9 @@ static void unroll_loop_runtime_iterations (class loop *);
 static struct opt_info *analyze_insns_in_loop (class loop *);
 static void opt_info_start_duplication (struct opt_info *);
 static void apply_opt_in_copies (struct opt_info *, unsigned, bool, bool);
+static void split_exit (class loop *);
+static bool regno_defined_inside_loop_p (unsigned, class loop *);
+static bool has_use_with_multiple_defs_p (df_link *);
 static void free_opt_info (struct opt_info *);
 static struct var_to_expand *analyze_insn_to_expand_var (class loop*, rtx_insn 
*);
 static bool referenced_in_one_insn_in_loop_p (class loop *, rtx, int *);
@@ -1261,6 +1265,10 @@ unroll_loop_stupid (class loop *loop)
       free_opt_info (opt_info);
     }
 
+  /* Create multiple exits to uncover more propagation opportunities.  */
+  if (flag_split_exit_in_unroller)
+    split_exit (loop);
+
   if (desc->simple_p)
     {
       /* We indeed may get here provided that there are nontrivial assumptions
@@ -2121,6 +2129,199 @@ apply_opt_in_copies (struct opt_info *opt_info,
     }
 }
 
+/* Return TRUE if there is an instruction defining REGNO
+   inside loop LOOP.  */
+
+static bool
+regno_defined_inside_loop_p (unsigned regno, class loop *loop)
+{
+  for (df_ref def = DF_REG_DEF_CHAIN (regno); def; def = DF_REF_NEXT_REG (def))
+    {
+      if (DF_REF_IS_ARTIFICIAL (def))
+       continue;
+
+      if (!flow_bb_inside_loop_p (loop, DF_REF_BB (def)))
+       continue;
+
+      if (dump_file)
+       {
+         fprintf (dump_file, ";; Found definition of ");
+         print_simple_rtl (dump_file, regno_reg_rtx[regno]);
+         fprintf (dump_file, "\n;; in BB %d (loop %d)\n",
+               DF_REF_BB (def)->index, loop->num);
+       }
+
+      return true;
+    }
+
+  return false;
+}
+
+static bool
+has_use_with_multiple_defs_p (df_link *def)
+{
+  for (df_link *use = DF_REF_CHAIN (def->ref);
+               use;
+               use = use->next)
+    if (DF_REF_CHAIN (use->ref)->next)
+      return true;
+
+  return false;
+}
+
+/* Determine if LOOP has multiple exits to the same BB (happens after
+   unroll_loop_stupid ()), and if so, try to split DU chains for pseudos
+   defined inside the loop and live at the entry to the exit BB.  */
+static void
+split_exit (class loop *loop)
+{
+  basic_block exit_bb = NULL;
+  basic_block new_bb;
+  edge e;
+  unsigned j;
+  unsigned regno;
+  bitmap_iterator it;
+  auto_bitmap pseudos_to_replace;
+  auto_vec<basic_block> new_exit_bbs;
+  auto_vec<edge> exit_edges;
+  rtx_insn *insn;
+
+  /* Sanity check: after unrolling, the loop should have more than
+     one exit, all to the same basic block.  */
+  exit_edges = get_loop_exit_edges (loop);
+  if (exit_edges.length () <= 1)
+    return;
+
+  FOR_EACH_VEC_ELT (exit_edges, j, e)
+    {
+      if (!exit_bb)
+       exit_bb = e->dest;
+      else if (exit_bb != e->dest)
+       return;
+    }
+
+  if (dump_file)
+    fprintf (dump_file, "\n;; Splitting loop exit for loop %d\n", loop->num);
+
+  /* Construct a bitmap of pseudos that are live in at the exit BB
+     and are defined within the loop.  */
+  if (optimize == 1)
+    df_live_add_problem ();
+
+  df_set_blocks (NULL);
+  df_analyze ();
+
+  EXECUTE_IF_SET_IN_BITMAP (DF_LIVE_IN (exit_bb),
+                           FIRST_PSEUDO_REGISTER, regno, it)
+    if (regno_defined_inside_loop_p (regno, loop))
+      {
+       if (dump_file)
+         fprintf (dump_file,
+                  ";; added %u to list of pseudos to replace\n", regno);
+
+       bitmap_set_bit (pseudos_to_replace, regno);
+      }
+
+  if (bitmap_empty_p (pseudos_to_replace))
+    {
+      if (dump_file)
+       fprintf (dump_file, ";; no pseudos to replace, exiting\n");
+      return;
+    }
+
+  /* Split edge + insert copy instructions for each pseudo in
+     pseudos_to_replace.  */
+  FOR_EACH_VEC_ELT (exit_edges, j, e)
+    {
+      start_sequence ();
+
+      EXECUTE_IF_SET_IN_BITMAP (pseudos_to_replace,
+                               FIRST_PSEUDO_REGISTER, regno, it)
+       emit_insn (gen_move_insn (regno_reg_rtx[regno],
+                                 regno_reg_rtx[regno]));
+
+      insn = get_insns ();
+      end_sequence ();
+
+      new_bb = split_edge_and_insert (e, insn);
+      new_exit_bbs.safe_push (new_bb);
+
+      if (dump_file)
+       fprintf (dump_file, ";; Split exit edge %d->%d\n",
+                e->src->index, exit_bb->index);
+    }
+
+  if (optimize == 1)
+    df_remove_problem (df_live);
+
+  /* Redo chain construction with the new exit BBs.  */
+  df_remove_problem (df_chain);
+
+  df_set_flags (DF_NO_HARD_REGS + DF_DEFER_INSN_RESCAN);
+  df_chain_add_problem (DF_DU_CHAIN + DF_UD_CHAIN);
+  df_set_blocks (NULL);
+
+  df_analyze ();
+
+  /* Iterate over the new exit BBs, which by this point contain only
+     instructions of the form "mov Rx, Rx".  Try to rename the source
+     pseudo together with the whole DU-chain of its definition.  */
+  FOR_EACH_VEC_ELT (new_exit_bbs, j, new_bb)
+    FOR_BB_INSNS (new_bb, insn)
+      {
+       rtx set, old_reg, new_reg;
+       df_ref use;
+       df_link *def, *du;
+
+       if (!INSN_P (insn))
+         continue;
+
+       /* Each insn is a single set of a pseudo to itself.  Check
+          conditions required for renaming the use operand of the insn.  */
+       set = single_set (insn);
+       gcc_assert (set);
+
+       use = df_single_use (DF_INSN_INFO_GET (insn));
+       gcc_assert (use);
+
+       /* There must be exactly one reachable definition.  */
+       def = DF_REF_CHAIN (use);
+       if (!def || !def->ref || DF_REF_IS_ARTIFICIAL (def->ref) || def->next)
+         break;
+
+       /* The definition must be the only one reachable by all its uses.  */
+       if (has_use_with_multiple_defs_p (def))
+         break;
+
+       /* If the conditions above are met, replace the def and all its uses
+          with a newly allocated pseudo.  */
+       old_reg = SET_DEST (set);
+       new_reg = gen_reg_rtx (GET_MODE (old_reg));
+
+       if (dump_file)
+         {
+           fprintf (dump_file, ";; replacing DU-chain for reg %d (def: ",
+                               REGNO (old_reg));
+           df_refs_chain_dump (def->ref, false, dump_file);
+           fprintf (dump_file, ") with reg %d\n", REGNO (new_reg));
+         }
+
+       for (du = DF_REF_CHAIN (def->ref); du; du = du->next)
+           validate_replace_src_group (old_reg, new_reg,
+                           DF_REF_INSN (du->ref));
+
+       validate_replace_rtx_subexp (old_reg, new_reg,
+                         DF_REF_INSN (def->ref), DF_REF_REAL_LOC (def->ref));
+      }
+
+  /* Fix notes.  */
+  df_note_add_problem ();
+
+  df_analyze ();
+
+  df_finish_pass (true);
+}
+
 /* Release OPT_INFO.  */
 
 static void
diff --git a/gcc/testsuite/gcc.dg/loop-exit-split-1.c 
b/gcc/testsuite/gcc.dg/loop-exit-split-1.c
new file mode 100644
index 00000000000..a732ced44ad
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/loop-exit-split-1.c
@@ -0,0 +1,31 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -funroll-all-loops -fno-variable-expansion-in-unroller 
-fsplit-exit-in-unroller -fdump-rtl-loop2_unroll-details" } */
+
+/* Reduced from Coremark's core_list_reverse () function.  */
+
+struct list_node {
+  struct list_node *next;
+};
+
+struct list_node *
+list_reverse (struct list_node *list)
+{
+  struct list_node *next = 0, *tmp;
+  #pragma GCC unroll 4
+  while (list)
+    {
+      tmp = list->next;
+      list->next = next;
+      next = list;
+      list = tmp;
+    }
+  
+  return next;
+}
+
+/* { dg-final { scan-rtl-dump-times "15:.*: optimized: loop unrolled 3 times" 
1 "loop2_unroll" } } */
+/* { dg-final { scan-rtl-dump-times ";; Splitting loop exit for loop 1" 1 
"loop2_unroll" } } */
+/* { dg-final { scan-rtl-dump-times ";; Found definition of \\\(reg \[0-9\]+ 
\\\[ list \\\]\\\)" 1 "loop2_unroll" } } */
+/* { dg-final { scan-rtl-dump-times ";; added \[0-9\]+ to list of pseudos to 
replace" 1 "loop2_unroll" } } */
+/* { dg-final { scan-rtl-dump-times ";; Split exit edge " 4 "loop2_unroll" } } 
*/
+/* { dg-final { scan-rtl-dump-times ";; replacing DU-chain for reg \[0-9\]+" 3 
"loop2_unroll" } } */
diff --git a/gcc/testsuite/gcc.dg/loop-exit-split-2.c 
b/gcc/testsuite/gcc.dg/loop-exit-split-2.c
new file mode 100644
index 00000000000..fe51f70cea5
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/loop-exit-split-2.c
@@ -0,0 +1,33 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -funroll-all-loops -fvariable-expansion-in-unroller 
-fsplit-exit-in-unroller -fdump-rtl-loop2_unroll-details -ffast-math" } */
+/* { dg-require-effective-target hard_float } */
+
+/* This testcase combines loop-exit-split1.c and var-expand1.c.  */
+
+struct list_node {
+  struct list_node *next;
+  float data;
+};
+
+float
+list_reverse (struct list_node *list)
+{
+  struct list_node *next = 0, *tmp;
+  float accum = 0;
+  #pragma GCC unroll 4
+  while (list)
+    {
+      accum += list->data;
+      list = list->next;
+    }
+  
+  return accum;
+}
+
+/* { dg-final { scan-rtl-dump-times "Expanding Accumulator" 1 "loop2_unroll" } 
} */
+/* { dg-final { scan-rtl-dump-times "18:.*: optimized: loop unrolled 3 times" 
1 "loop2_unroll" } } */
+/* { dg-final { scan-rtl-dump-times ";; Splitting loop exit for loop 1" 1 
"loop2_unroll" } } */
+/* { dg-final { scan-rtl-dump-times ";; Found definition of \\\(reg \[0-9\]+" 
2 "loop2_unroll" } } */
+/* { dg-final { scan-rtl-dump-times ";; added \[0-9\]+ to list of pseudos to 
replace" 2 "loop2_unroll" } } */
+/* { dg-final { scan-rtl-dump-times ";; Split exit edge " 4 "loop2_unroll" } } 
*/
+/* { dg-final { scan-rtl-dump-times ";; replacing DU-chain for reg \[0-9\]+" 3 
"loop2_unroll" } } */
-- 
2.44.2

[PATCH] loop2_unroll: split loop exit during unrolling of uncountable loops

Reply via email to