On July 20, 2021 5:56:35 PM GMT+02:00, Richard Sandiford via Gcc-patches 
<gcc-patches@gcc.gnu.org> wrote:
>Unroll and jam can sometimes leave redundancies.  E.g. for:
>
>  for (int j = 0; j < 100; ++j)
>    for (int i = 0; i < 100; ++i)
>      x[i] += y[i] * z[j][i];
>
>the new loop will do the equivalent of:
>
>  for (int j = 0; j < 100; j += 2)
>    for (int i = 0; i < 100; ++i)
>      {
>        x[i] += y[i] * z[j][i];
>        x[i] += y[i] * z[j + 1][i];
>      }
>
>with two reads of y[i] and with a round trip through memory for x[i].
>
>At the moment these redundancies survive till vectorisation, so if
>vectorisation succeeds, we're reliant on being able to remove the
>redundancies from the vector form.  This can be hard to do if
>a vector loop uses predication.  E.g. on SVE we end up with:
>
>.L3:
>        ld1w    z3.s, p0/z, [x3, x0, lsl 2]
>        ld1w    z0.s, p0/z, [x5, x0, lsl 2]
>        ld1w    z1.s, p0/z, [x2, x0, lsl 2]
>        mad     z1.s, p1/m, z0.s, z3.s
>        ld1w    z2.s, p0/z, [x4, x0, lsl 2]
>        st1w    z1.s, p0, [x3, x0, lsl 2]    // store to x[i]
>        ld1w    z1.s, p0/z, [x3, x0, lsl 2]  // load back from x[i]
>        mad     z0.s, p1/m, z2.s, z1.s
>        st1w    z0.s, p0, [x3, x0, lsl 2]
>        add     x0, x0, x6
>        whilelo p0.s, w0, w1
>        b.any   .L3
>
>This patch runs a value-numbering pass on loops after a successful
>unroll-and-jam, which gets rid of the unnecessary load and gives
>a more accurate idea of vector costs.  Unfortunately the redundant
>store still persists without a pre-vect DSE, but that feels like
>a separate issue.
>
>Note that the pass requires the loop to have a single exit,
>hence the simple calculation of exit_bbs.
>
>Tested on aarch64-linux-gnu so far, will test on x86_64-linux-gnu too.
>OK to install if testing passes?

Ok. 

Thanks, 
Richard. 

>Richard
>
>
>gcc/
>       * gimple-loop-jam.c: Include tree-ssa-sccvn.h.
>       (tree_loop_unroll_and_jam): Run value-numbering on a loop that
>       has been successfully unrolled.
>
>gcc/testsuite/
>       * gcc.dg/unroll-10.c: New test.
>---
> gcc/gimple-loop-jam.c            | 14 +++++++++-----
> gcc/testsuite/gcc.dg/unroll-10.c | 13 +++++++++++++
> 2 files changed, 22 insertions(+), 5 deletions(-)
> create mode 100644 gcc/testsuite/gcc.dg/unroll-10.c
>
>diff --git a/gcc/gimple-loop-jam.c b/gcc/gimple-loop-jam.c
>index 4842f0dff80..544ad779dd6 100644
>--- a/gcc/gimple-loop-jam.c
>+++ b/gcc/gimple-loop-jam.c
>@@ -38,6 +38,7 @@ along with GCC; see the file COPYING3.  If not see
> #include "tree-data-ref.h"
> #include "tree-ssa-loop-ivopts.h"
> #include "tree-vectorizer.h"
>+#include "tree-ssa-sccvn.h"
> 
> /* Unroll and Jam transformation
>    
>@@ -487,7 +488,7 @@ static unsigned int
> tree_loop_unroll_and_jam (void)
> {
>   class loop *loop;
>-  bool changed = false;
>+  unsigned int todo = 0;
> 
>   gcc_assert (scev_initialized_p ());
> 
>@@ -591,7 +592,11 @@ tree_loop_unroll_and_jam (void)
>                           &desc);
>         free_original_copy_tables ();
>         fuse_loops (outer->inner);
>-        changed = true;
>+        todo |= TODO_cleanup_cfg;
>+
>+        auto_bitmap exit_bbs;
>+        bitmap_set_bit (exit_bbs, single_dom_exit (outer)->dest->index);
>+        todo |= do_rpo_vn (cfun, loop_preheader_edge (outer), exit_bbs);
>       }
> 
>       loop_nest.release ();
>@@ -599,13 +604,12 @@ tree_loop_unroll_and_jam (void)
>       free_data_refs (datarefs);
>     }
> 
>-  if (changed)
>+  if (todo)
>     {
>       scev_reset ();
>       free_dominance_info (CDI_DOMINATORS);
>-      return TODO_cleanup_cfg;
>     }
>-  return 0;
>+  return todo;
> }
> 
> /* Pass boilerplate */
>diff --git a/gcc/testsuite/gcc.dg/unroll-10.c
>b/gcc/testsuite/gcc.dg/unroll-10.c
>new file mode 100644
>index 00000000000..0559915f2fc
>--- /dev/null
>+++ b/gcc/testsuite/gcc.dg/unroll-10.c
>@@ -0,0 +1,13 @@
>+/* { dg-options "-O3 -fdump-tree-unrolljam" } */
>+
>+void
>+f (int *restrict x, int *restrict y, int z[restrict 100][100])
>+{
>+  for (int j = 0; j < 100; ++j)
>+    for (int i = 0; i < 100; ++i)
>+      x[i] += y[i] * z[j][i];
>+}
>+
>+/* The loop should be unrolled 2 times, leaving one load from x,
>+   one load from y and 2 loads from z.  */
>+/* { dg-final { scan-tree-dump-times { = \(*\*} 4 "unrolljam" } } */

Reply via email to