In this PR, we used to generate:

     .L6:
          mov     v30.16b, v31.16b
          fadd    v31.4s, v31.4s, v27.4s
          fadd    v29.4s, v30.4s, v28.4s
          stp     q30, q29, [x0]
          add     x0, x0, 32
          cmp     x1, x0
          bne     .L6

for an unrolled induction in:

  for (int i = 0; i < 1024; i++)
    {
      arr[i] = freq;
      freq += step;
    }

with the problem being the unnecessary MOV.

The main induction IV was incremented by VF * step == 2 * nunits * step,
and then nunits * step was added for the second store to arr.

The original patch for the PR (r14-2367-g224fd59b2dc8) avoided the MOV
by incrementing the IV by nunits * step twice.  The problem with that
approach is that it doubles the loop-carried latency.  This change was
deliberately not preserved when moving from loop-vect to SLP and so
the test started failing again after r15-3509-gd34cda720988.

I think the main problem is that we put the IV increment in the wrong
place.  Normal IVs created by create_iv are placed before the exit
condition where possible, but vectorizable_induction instead always
inserted them at the start of the loop body.  The only use of the
incremented IV is by the phi node, so the effect is to make both
the old and new IV values live for the whole loop body, which is
why we need the MOV.

The simplest fix therefore seems to be to reuse the create_iv logic.

Bootstrapped & regression-tested on aarch64-linus-gnu and
x86_64-linux-gnu.  As Richi suggested, I also tested vect.exp
using unix/-mavx512f/--param=vect-partial-vector-usage=2.
OK to install?

Richard


gcc/
        PR tree-optimization/110449
        * tree-ssa-loop-manip.h (insert_iv_increment): Declare.
        * tree-ssa-loop-manip.cc (insert_iv_increment): New function,
        split out from...
        (create_iv): ...here and generalized to gimple_seqs.
        * tree-vect-loop.cc (vectorizable_induction): Use
        standard_iv_increment_position and insert_iv_increment
        to insert the IV increment.

gcc/testsuite/
        PR tree-optimization/110449
        * gcc.target/aarch64/pr110449.c: Expect an increment by 8.0,
        but test that there is no MOV.
---
 gcc/testsuite/gcc.target/aarch64/pr110449.c | 25 +++++----
 gcc/tree-ssa-loop-manip.cc                  | 62 ++++++++++++---------
 gcc/tree-ssa-loop-manip.h                   |  1 +
 gcc/tree-vect-loop.cc                       |  6 +-
 4 files changed, 57 insertions(+), 37 deletions(-)

diff --git a/gcc/testsuite/gcc.target/aarch64/pr110449.c 
b/gcc/testsuite/gcc.target/aarch64/pr110449.c
index bb3b6dcfe08..51ca3f4b816 100644
--- a/gcc/testsuite/gcc.target/aarch64/pr110449.c
+++ b/gcc/testsuite/gcc.target/aarch64/pr110449.c
@@ -1,8 +1,10 @@
 /* { dg-do compile } */
 /* { dg-options "-Ofast -mcpu=neoverse-n2 --param aarch64-vect-unroll-limit=2" 
} */
-/* { dg-final { scan-assembler-not "8.0e\\+0" } } */
+/* { dg-final { scan-assembler {, #?8.0e\+0} } } */
+/* { dg-final { scan-assembler-not {\tmov\tv} } } */
 
-/* Calcualte the vectorized induction with smaller step for an unrolled loop.
+/* Insert the induction IV updates before the exit condition, rather than
+   at the start of the loop body.
 
    before (suggested_unroll_factor=2):
          fmov    s30, 8.0e+0
@@ -19,15 +21,16 @@
          bne     .L6
 
    after:
-         fmov    s31, 4.0e+0
-         dup     v29.4s, v31.s[0]
-     .L6:
-         fadd    v30.4s, v31.4s, v29.4s
-         stp     q31, q30, [x0]
-         add     x0, x0, 32
-         fadd    v31.4s, v29.4s, v30.4s
-         cmp     x0, x1
-         bne     .L6  */
+         fmov    s31, 8.0e+0
+         fmov    s29, 4.0e+0
+         dup     v31.4s, v31.s[0]
+         dup     v29.4s, v29.s[0]
+     .L2:
+         fadd    v30.4s, v0.4s, v29.4s
+         stp     q0, q30, [x0], 32
+         fadd    v0.4s, v0.4s, v31.4s
+         cmp     x1, x0
+         bne     .L2  */
 
 void
 foo2 (float *arr, float freq, float step)
diff --git a/gcc/tree-ssa-loop-manip.cc b/gcc/tree-ssa-loop-manip.cc
index 6ceb9df370b..2907fa6532d 100644
--- a/gcc/tree-ssa-loop-manip.cc
+++ b/gcc/tree-ssa-loop-manip.cc
@@ -47,6 +47,39 @@ along with GCC; see the file COPYING3.  If not see
    so that we can free them all at once.  */
 static bitmap_obstack loop_renamer_obstack;
 
+/* Insert IV increment statements STMTS before or after INCR_POS;
+   AFTER selects which.  INCR_POS and AFTER can be computed using
+   standard_iv_increment_position.  */
+
+void
+insert_iv_increment (gimple_stmt_iterator *incr_pos, bool after,
+                    gimple_seq stmts)
+{
+  /* Prevent the increment from inheriting a bogus location if it is not put
+     immediately after a statement whose location is known.  */
+  if (after)
+    {
+      gimple_stmt_iterator gsi = *incr_pos;
+      if (!gsi_end_p (gsi))
+       gsi_next_nondebug (&gsi);
+      if (gsi_end_p (gsi))
+       {
+         edge e = single_succ_edge (gsi_bb (*incr_pos));
+         gimple_seq_set_location (stmts, e->goto_locus);
+       }
+      gsi_insert_seq_after (incr_pos, stmts, GSI_NEW_STMT);
+    }
+  else
+    {
+      gimple_stmt_iterator gsi = *incr_pos;
+      if (!gsi_end_p (gsi) && is_gimple_debug (gsi_stmt (gsi)))
+       gsi_next_nondebug (&gsi);
+      if (!gsi_end_p (gsi))
+       gimple_seq_set_location (stmts, gimple_location (gsi_stmt (gsi)));
+      gsi_insert_seq_before (incr_pos, stmts, GSI_NEW_STMT);
+    }
+}
+
 /* Creates an induction variable with value BASE (+/-) STEP * iteration in 
LOOP.
    If INCR_OP is PLUS_EXPR, the induction variable is BASE + STEP * iteration.
    If INCR_OP is MINUS_EXPR, the induction variable is BASE - STEP * iteration.
@@ -63,7 +96,6 @@ create_iv (tree base, tree_code incr_op, tree step, tree var, 
class loop *loop,
           gimple_stmt_iterator *incr_pos, bool after, tree *var_before,
           tree *var_after)
 {
-  gassign *stmt;
   gphi *phi;
   tree initial, step1;
   gimple_seq stmts;
@@ -126,30 +158,10 @@ create_iv (tree base, tree_code incr_op, tree step, tree 
var, class loop *loop,
   if (stmts)
     gsi_insert_seq_on_edge_immediate (pe, stmts);
 
-  stmt = gimple_build_assign (va, incr_op, vb, step);
-  /* Prevent the increment from inheriting a bogus location if it is not put
-     immediately after a statement whose location is known.  */
-  if (after)
-    {
-      gimple_stmt_iterator gsi = *incr_pos;
-      if (!gsi_end_p (gsi))
-       gsi_next_nondebug (&gsi);
-      if (gsi_end_p (gsi))
-       {
-         edge e = single_succ_edge (gsi_bb (*incr_pos));
-         gimple_set_location (stmt, e->goto_locus);
-       }
-      gsi_insert_after (incr_pos, stmt, GSI_NEW_STMT);
-    }
-  else
-    {
-      gimple_stmt_iterator gsi = *incr_pos;
-      if (!gsi_end_p (gsi) && is_gimple_debug (gsi_stmt (gsi)))
-       gsi_next_nondebug (&gsi);
-      if (!gsi_end_p (gsi))
-       gimple_set_location (stmt, gimple_location (gsi_stmt (gsi)));
-      gsi_insert_before (incr_pos, stmt, GSI_NEW_STMT);
-    }
+  gimple_seq incr_stmts = nullptr;
+  gimple_seq_add_stmt (&incr_stmts,
+                      gimple_build_assign (va, incr_op, vb, step));
+  insert_iv_increment (incr_pos, after, incr_stmts);
 
   initial = force_gimple_operand (base, &stmts, true, var);
   if (stmts)
diff --git a/gcc/tree-ssa-loop-manip.h b/gcc/tree-ssa-loop-manip.h
index b1f65e3c071..80f680565c0 100644
--- a/gcc/tree-ssa-loop-manip.h
+++ b/gcc/tree-ssa-loop-manip.h
@@ -22,6 +22,7 @@ along with GCC; see the file COPYING3.  If not see
 
 typedef void (*transform_callback)(class loop *, void *);
 
+extern void insert_iv_increment (gimple_stmt_iterator *, bool, gimple_seq);
 extern void create_iv (tree, tree_code, tree, tree, class loop *,
                       gimple_stmt_iterator *, bool, tree *, tree *);
 extern void rewrite_into_loop_closed_ssa (bitmap, unsigned);
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 03426207879..eea0b89db69 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -10580,6 +10580,10 @@ vectorizable_induction (loop_vec_info loop_vinfo,
        [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2].  */
   if (slp_node)
     {
+      gimple_stmt_iterator incr_si;
+      bool insert_after;
+      standard_iv_increment_position (iv_loop, &incr_si, &insert_after);
+
       /* The initial values are vectorized, but any lanes > group_size
         need adjustment.  */
       slp_tree init_node
@@ -10810,7 +10814,7 @@ vectorizable_induction (loop_vec_info loop_vinfo,
          vec_def = gimple_build (&stmts,
                                  PLUS_EXPR, step_vectype, vec_def, up);
          vec_def = gimple_convert (&stmts, vectype, vec_def);
-         gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
+         insert_iv_increment (&incr_si, insert_after, stmts);
          add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
                       UNKNOWN_LOCATION);
 
-- 
2.25.1

Reply via email to