On Thu, 6 Feb 2025, Richard Sandiford wrote:

> In this PR, we used to generate:
> 
>      .L6:
>         mov     v30.16b, v31.16b
>         fadd    v31.4s, v31.4s, v27.4s
>         fadd    v29.4s, v30.4s, v28.4s
>         stp     q30, q29, [x0]
>         add     x0, x0, 32
>         cmp     x1, x0
>         bne     .L6
> 
> for an unrolled induction in:
> 
>   for (int i = 0; i < 1024; i++)
>     {
>       arr[i] = freq;
>       freq += step;
>     }
> 
> with the problem being the unnecessary MOV.
> 
> The main induction IV was incremented by VF * step == 2 * nunits * step,
> and then nunits * step was added for the second store to arr.
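> 
> For concreteness, a rough scalar model of that unrolled body (a sketch
> only: nunits == 4 and suggested_unroll_factor == 2 are assumed, so
> VF == 8, and the function name is made up):
> 
>   void
>   foo2_model (float *arr, float freq, float step, int n)
>   {
>     const int nunits = 4;       /* lanes per vector (assumed)   */
>     const int vf = 2 * nunits;  /* unroll factor 2, so VF == 8  */
>     for (int i = 0; i < n; i += vf)
>       {
>         /* First store: the current value of the vector IV.  */
>         for (int lane = 0; lane < nunits; lane++)
>           arr[i + lane] = freq + lane * step;
>         /* Second store: the IV plus nunits * step.  */
>         for (int lane = 0; lane < nunits; lane++)
>           arr[i + nunits + lane] = freq + (nunits + lane) * step;
>         /* Single loop-carried increment by VF * step.  */
>         freq += vf * step;
>       }
>   }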
> 
> The original patch for the PR (r14-2367-g224fd59b2dc8) avoided the MOV
> by incrementing the IV by nunits * step twice.  The problem with that
> approach is that it doubles the loop-carried latency.  This change was
> deliberately not preserved when moving from loop-vect to SLP and so
> the test started failing again after r15-3509-gd34cda720988.
> 
> I think the main problem is that we put the IV increment in the wrong
> place.  Normal IVs created by create_iv are placed before the exit
> condition where possible, but vectorizable_induction instead always
> inserted them at the start of the loop body.  The only use of the
> incremented IV is by the phi node, so the effect is to make both
> the old and new IV values live for the whole loop body, which is
> why we need the MOV.
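> 
> Schematically (pseudo-GIMPLE, lane and conversion details elided; an
> illustration rather than an actual dump):
> 
>   # increment at the start of the body (old placement):
>     iv_0   = PHI <init(preheader), iv_1(latch)>
>     iv_1   = iv_0 + vf_step;        <-- iv_0 and iv_1 both live from here
>     second = iv_0 + half_step;
>     ... stores of iv_0 and second ...  <-- last use of iv_0, hence the MOV
>     if (...) goto loop;
> 
>   # increment just before the exit condition (create_iv placement):
>     iv_0   = PHI <init(preheader), iv_1(latch)>
>     second = iv_0 + half_step;
>     ... stores of iv_0 and second ...
>     iv_1   = iv_0 + vf_step;        <-- last use of iv_0; no overlap, no MOV
>     if (...) goto loop;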
> 
> The simplest fix therefore seems to be to reuse the create_iv logic.
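> 
> Concretely, in vectorizable_induction that just means asking
> standard_iv_increment_position for the insertion point and going through
> a new insert_iv_increment helper (condensed from the hunks below):
> 
>   gimple_stmt_iterator incr_si;
>   bool insert_after;
>   standard_iv_increment_position (iv_loop, &incr_si, &insert_after);
>   ...
>   /* Previously: gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);  */
>   insert_iv_increment (&incr_si, insert_after, stmts);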
> 
> Bootstrapped & regression-tested on aarch64-linux-gnu and
> x86_64-linux-gnu.  As Richi suggested, I also tested vect.exp
> using unix/-mavx512f/--param=vect-partial-vector-usage=2.
> OK to install?

OK.

Thanks,
Richard.

> Richard
> 
> 
> gcc/
>       PR tree-optimization/110449
>       * tree-ssa-loop-manip.h (insert_iv_increment): Declare.
>       * tree-ssa-loop-manip.cc (insert_iv_increment): New function,
>       split out from...
>       (create_iv): ...here and generalized to gimple_seqs.
>       * tree-vect-loop.cc (vectorizable_induction): Use
>       standard_iv_increment_position and insert_iv_increment
>       to insert the IV increment.
> 
> gcc/testsuite/
>       PR tree-optimization/110449
>       * gcc.target/aarch64/pr110449.c: Expect an increment by 8.0,
>       but test that there is no MOV.
> ---
>  gcc/testsuite/gcc.target/aarch64/pr110449.c | 25 +++++----
>  gcc/tree-ssa-loop-manip.cc                  | 62 ++++++++++++---------
>  gcc/tree-ssa-loop-manip.h                   |  1 +
>  gcc/tree-vect-loop.cc                       |  6 +-
>  4 files changed, 57 insertions(+), 37 deletions(-)
> 
> diff --git a/gcc/testsuite/gcc.target/aarch64/pr110449.c b/gcc/testsuite/gcc.target/aarch64/pr110449.c
> index bb3b6dcfe08..51ca3f4b816 100644
> --- a/gcc/testsuite/gcc.target/aarch64/pr110449.c
> +++ b/gcc/testsuite/gcc.target/aarch64/pr110449.c
> @@ -1,8 +1,10 @@
>  /* { dg-do compile } */
>  /* { dg-options "-Ofast -mcpu=neoverse-n2 --param aarch64-vect-unroll-limit=2" } */
> -/* { dg-final { scan-assembler-not "8.0e\\+0" } } */
> +/* { dg-final { scan-assembler {, #?8.0e\+0} } } */
> +/* { dg-final { scan-assembler-not {\tmov\tv} } } */
>  
> -/* Calcualte the vectorized induction with smaller step for an unrolled loop.
> +/* Insert the induction IV updates before the exit condition, rather than
> +   at the start of the loop body.
>  
>     before (suggested_unroll_factor=2):
>         fmov    s30, 8.0e+0
> @@ -19,15 +21,16 @@
>         bne     .L6
>  
>     after:
> -       fmov    s31, 4.0e+0
> -       dup     v29.4s, v31.s[0]
> -     .L6:
> -       fadd    v30.4s, v31.4s, v29.4s
> -       stp     q31, q30, [x0]
> -       add     x0, x0, 32
> -       fadd    v31.4s, v29.4s, v30.4s
> -       cmp     x0, x1
> -       bne     .L6  */
> +       fmov    s31, 8.0e+0
> +       fmov    s29, 4.0e+0
> +       dup     v31.4s, v31.s[0]
> +       dup     v29.4s, v29.s[0]
> +     .L2:
> +       fadd    v30.4s, v0.4s, v29.4s
> +       stp     q0, q30, [x0], 32
> +       fadd    v0.4s, v0.4s, v31.4s
> +       cmp     x1, x0
> +       bne     .L2  */
>  
>  void
>  foo2 (float *arr, float freq, float step)
> diff --git a/gcc/tree-ssa-loop-manip.cc b/gcc/tree-ssa-loop-manip.cc
> index 6ceb9df370b..2907fa6532d 100644
> --- a/gcc/tree-ssa-loop-manip.cc
> +++ b/gcc/tree-ssa-loop-manip.cc
> @@ -47,6 +47,39 @@ along with GCC; see the file COPYING3.  If not see
>     so that we can free them all at once.  */
>  static bitmap_obstack loop_renamer_obstack;
>  
> +/* Insert IV increment statements STMTS before or after INCR_POS;
> +   AFTER selects which.  INCR_POS and AFTER can be computed using
> +   standard_iv_increment_position.  */
> +
> +void
> +insert_iv_increment (gimple_stmt_iterator *incr_pos, bool after,
> +                  gimple_seq stmts)
> +{
> +  /* Prevent the increment from inheriting a bogus location if it is not put
> +     immediately after a statement whose location is known.  */
> +  if (after)
> +    {
> +      gimple_stmt_iterator gsi = *incr_pos;
> +      if (!gsi_end_p (gsi))
> +     gsi_next_nondebug (&gsi);
> +      if (gsi_end_p (gsi))
> +     {
> +       edge e = single_succ_edge (gsi_bb (*incr_pos));
> +       gimple_seq_set_location (stmts, e->goto_locus);
> +     }
> +      gsi_insert_seq_after (incr_pos, stmts, GSI_NEW_STMT);
> +    }
> +  else
> +    {
> +      gimple_stmt_iterator gsi = *incr_pos;
> +      if (!gsi_end_p (gsi) && is_gimple_debug (gsi_stmt (gsi)))
> +     gsi_next_nondebug (&gsi);
> +      if (!gsi_end_p (gsi))
> +     gimple_seq_set_location (stmts, gimple_location (gsi_stmt (gsi)));
> +      gsi_insert_seq_before (incr_pos, stmts, GSI_NEW_STMT);
> +    }
> +}
> +
>  /* Creates an induction variable with value BASE (+/-) STEP * iteration in LOOP.
>     If INCR_OP is PLUS_EXPR, the induction variable is BASE + STEP * iteration.
>     If INCR_OP is MINUS_EXPR, the induction variable is BASE - STEP * iteration.
> @@ -63,7 +96,6 @@ create_iv (tree base, tree_code incr_op, tree step, tree var, class loop *loop,
>          gimple_stmt_iterator *incr_pos, bool after, tree *var_before,
>          tree *var_after)
>  {
> -  gassign *stmt;
>    gphi *phi;
>    tree initial, step1;
>    gimple_seq stmts;
> @@ -126,30 +158,10 @@ create_iv (tree base, tree_code incr_op, tree step, tree var, class loop *loop,
>    if (stmts)
>      gsi_insert_seq_on_edge_immediate (pe, stmts);
>  
> -  stmt = gimple_build_assign (va, incr_op, vb, step);
> -  /* Prevent the increment from inheriting a bogus location if it is not put
> -     immediately after a statement whose location is known.  */
> -  if (after)
> -    {
> -      gimple_stmt_iterator gsi = *incr_pos;
> -      if (!gsi_end_p (gsi))
> -     gsi_next_nondebug (&gsi);
> -      if (gsi_end_p (gsi))
> -     {
> -       edge e = single_succ_edge (gsi_bb (*incr_pos));
> -       gimple_set_location (stmt, e->goto_locus);
> -     }
> -      gsi_insert_after (incr_pos, stmt, GSI_NEW_STMT);
> -    }
> -  else
> -    {
> -      gimple_stmt_iterator gsi = *incr_pos;
> -      if (!gsi_end_p (gsi) && is_gimple_debug (gsi_stmt (gsi)))
> -     gsi_next_nondebug (&gsi);
> -      if (!gsi_end_p (gsi))
> -     gimple_set_location (stmt, gimple_location (gsi_stmt (gsi)));
> -      gsi_insert_before (incr_pos, stmt, GSI_NEW_STMT);
> -    }
> +  gimple_seq incr_stmts = nullptr;
> +  gimple_seq_add_stmt (&incr_stmts,
> +                    gimple_build_assign (va, incr_op, vb, step));
> +  insert_iv_increment (incr_pos, after, incr_stmts);
>  
>    initial = force_gimple_operand (base, &stmts, true, var);
>    if (stmts)
> diff --git a/gcc/tree-ssa-loop-manip.h b/gcc/tree-ssa-loop-manip.h
> index b1f65e3c071..80f680565c0 100644
> --- a/gcc/tree-ssa-loop-manip.h
> +++ b/gcc/tree-ssa-loop-manip.h
> @@ -22,6 +22,7 @@ along with GCC; see the file COPYING3.  If not see
>  
>  typedef void (*transform_callback)(class loop *, void *);
>  
> +extern void insert_iv_increment (gimple_stmt_iterator *, bool, gimple_seq);
>  extern void create_iv (tree, tree_code, tree, tree, class loop *,
>                      gimple_stmt_iterator *, bool, tree *, tree *);
>  extern void rewrite_into_loop_closed_ssa (bitmap, unsigned);
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index 03426207879..eea0b89db69 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -10580,6 +10580,10 @@ vectorizable_induction (loop_vec_info loop_vinfo,
>         [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2].  */
>    if (slp_node)
>      {
> +      gimple_stmt_iterator incr_si;
> +      bool insert_after;
> +      standard_iv_increment_position (iv_loop, &incr_si, &insert_after);
> +
>        /* The initial values are vectorized, but any lanes > group_size
>        need adjustment.  */
>        slp_tree init_node
> @@ -10810,7 +10814,7 @@ vectorizable_induction (loop_vec_info loop_vinfo,
>         vec_def = gimple_build (&stmts,
>                                 PLUS_EXPR, step_vectype, vec_def, up);
>         vec_def = gimple_convert (&stmts, vectype, vec_def);
> -       gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
> +       insert_iv_increment (&incr_si, insert_after, stmts);
>         add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
>                      UNKNOWN_LOCATION);
>  
> 

-- 
Richard Biener <rguent...@suse.de>
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)
