On Thu, 6 Feb 2025, Richard Sandiford wrote:

> In this PR, we used to generate:
>
> .L6:
>         mov     v30.16b, v31.16b
>         fadd    v31.4s, v31.4s, v27.4s
>         fadd    v29.4s, v30.4s, v28.4s
>         stp     q30, q29, [x0]
>         add     x0, x0, 32
>         cmp     x1, x0
>         bne     .L6
>
> for an unrolled induction in:
>
>   for (int i = 0; i < 1024; i++)
>     {
>       arr[i] = freq;
>       freq += step;
>     }
>
> with the problem being the unnecessary MOV.
>
> The main induction IV was incremented by VF * step == 2 * nunits * step,
> and then nunits * step was added for the second store to arr.
>
> The original patch for the PR (r14-2367-g224fd59b2dc8) avoided the MOV
> by incrementing the IV by nunits * step twice.  The problem with that
> approach is that it doubles the loop-carried latency.  This change was
> deliberately not preserved when moving from loop-vect to SLP and so
> the test started failing again after r15-3509-gd34cda720988.
>
> I think the main problem is that we put the IV increment in the wrong
> place.  Normal IVs created by create_iv are placed before the exit
> condition where possible, but vectorizable_induction instead always
> inserted them at the start of the loop body.  The only use of the
> incremented IV is by the phi node, so the effect is to make both
> the old and new IV values live for the whole loop body, which is
> why we need the MOV.
>
> The simplest fix therefore seems to be to reuse the create_iv logic.
>
> Bootstrapped & regression-tested on aarch64-linux-gnu and
> x86_64-linux-gnu.  As Richi suggested, I also tested vect.exp
> using unix/-mavx512f/--param=vect-partial-vector-usage=2.
> OK to install?
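To make the latency trade-off concrete, here is a rough scalar analogue
(an illustration only, not code from the patch; the function names and
the step4/step8 parameters are made up, with step4 standing in for
nunits * step and step8 for VF * step == 2 * nunits * step):

  /* r14-2367-g224fd59b2dc8 approach: both additions sit on the
     loop-carried chain, so each iteration waits for two dependent adds.  */
  void
  unrolled_two_deps (float *arr, float iv, float step4)
  {
    for (int i = 0; i < 1024; i += 8)
      {
        arr[i] = iv;
        float iv2 = iv + step4;   /* depends on iv */
        arr[i + 4] = iv2;
        iv = iv2 + step4;         /* depends on iv2 */
      }
  }

  /* This patch: only one addition feeds the next iteration; the value
     for the second store is an independent add off the current iv.  */
  void
  unrolled_one_dep (float *arr, float iv, float step4, float step8)
  {
    for (int i = 0; i < 1024; i += 8)
      {
        arr[i] = iv;
        arr[i + 4] = iv + step4;  /* off the loop-carried chain */
        iv += step8;              /* single add carried to the next iteration */
      }
  }

(Each scalar store here stands in for a full vector store of four floats.)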
OK.

Thanks,
Richard.

> Richard
>
>
> gcc/
>         PR tree-optimization/110449
>         * tree-ssa-loop-manip.h (insert_iv_increment): Declare.
>         * tree-ssa-loop-manip.cc (insert_iv_increment): New function,
>         split out from...
>         (create_iv): ...here and generalized to gimple_seqs.
>         * tree-vect-loop.cc (vectorizable_induction): Use
>         standard_iv_increment_position and insert_iv_increment
>         to insert the IV increment.
>
> gcc/testsuite/
>         PR tree-optimization/110449
>         * gcc.target/aarch64/pr110449.c: Expect an increment by 8.0,
>         but test that there is no MOV.
> ---
>  gcc/testsuite/gcc.target/aarch64/pr110449.c | 25 +++++----
>  gcc/tree-ssa-loop-manip.cc                  | 62 ++++++++++++---------
>  gcc/tree-ssa-loop-manip.h                   |  1 +
>  gcc/tree-vect-loop.cc                       |  6 +-
>  4 files changed, 57 insertions(+), 37 deletions(-)
>
> diff --git a/gcc/testsuite/gcc.target/aarch64/pr110449.c b/gcc/testsuite/gcc.target/aarch64/pr110449.c
> index bb3b6dcfe08..51ca3f4b816 100644
> --- a/gcc/testsuite/gcc.target/aarch64/pr110449.c
> +++ b/gcc/testsuite/gcc.target/aarch64/pr110449.c
> @@ -1,8 +1,10 @@
>  /* { dg-do compile } */
>  /* { dg-options "-Ofast -mcpu=neoverse-n2 --param aarch64-vect-unroll-limit=2" } */
> -/* { dg-final { scan-assembler-not "8.0e\\+0" } } */
> +/* { dg-final { scan-assembler {, #?8.0e\+0} } } */
> +/* { dg-final { scan-assembler-not {\tmov\tv} } } */
>  
> -/* Calcualte the vectorized induction with smaller step for an unrolled loop.
> +/* Insert the induction IV updates before the exit condition, rather than
> +   at the start of the loop body.
>  
>     before (suggested_unroll_factor=2):
>         fmov    s30, 8.0e+0
> @@ -19,15 +21,16 @@
>         bne     .L6
>  
>     after:
> -       fmov    s31, 4.0e+0
> -       dup     v29.4s, v31.s[0]
> -       .L6:
> -       fadd    v30.4s, v31.4s, v29.4s
> -       stp     q31, q30, [x0]
> -       add     x0, x0, 32
> -       fadd    v31.4s, v29.4s, v30.4s
> -       cmp     x0, x1
> -       bne     .L6 */
> +       fmov    s31, 8.0e+0
> +       fmov    s29, 4.0e+0
> +       dup     v31.4s, v31.s[0]
> +       dup     v29.4s, v29.s[0]
> +       .L2:
> +       fadd    v30.4s, v0.4s, v29.4s
> +       stp     q0, q30, [x0], 32
> +       fadd    v0.4s, v0.4s, v31.4s
> +       cmp     x1, x0
> +       bne     .L2 */
>  
>  void
>  foo2 (float *arr, float freq, float step)
> diff --git a/gcc/tree-ssa-loop-manip.cc b/gcc/tree-ssa-loop-manip.cc
> index 6ceb9df370b..2907fa6532d 100644
> --- a/gcc/tree-ssa-loop-manip.cc
> +++ b/gcc/tree-ssa-loop-manip.cc
> @@ -47,6 +47,39 @@ along with GCC; see the file COPYING3.  If not see
>     so that we can free them all at once.  */
>  static bitmap_obstack loop_renamer_obstack;
>  
> +/* Insert IV increment statements STMTS before or after INCR_POS;
> +   AFTER selects which.  INCR_POS and AFTER can be computed using
> +   standard_iv_increment_position.  */
> +
> +void
> +insert_iv_increment (gimple_stmt_iterator *incr_pos, bool after,
> +                     gimple_seq stmts)
> +{
> +  /* Prevent the increment from inheriting a bogus location if it is not put
> +     immediately after a statement whose location is known.  */
> +  if (after)
> +    {
> +      gimple_stmt_iterator gsi = *incr_pos;
> +      if (!gsi_end_p (gsi))
> +        gsi_next_nondebug (&gsi);
> +      if (gsi_end_p (gsi))
> +        {
> +          edge e = single_succ_edge (gsi_bb (*incr_pos));
> +          gimple_seq_set_location (stmts, e->goto_locus);
> +        }
> +      gsi_insert_seq_after (incr_pos, stmts, GSI_NEW_STMT);
> +    }
> +  else
> +    {
> +      gimple_stmt_iterator gsi = *incr_pos;
> +      if (!gsi_end_p (gsi) && is_gimple_debug (gsi_stmt (gsi)))
> +        gsi_next_nondebug (&gsi);
> +      if (!gsi_end_p (gsi))
> +        gimple_seq_set_location (stmts, gimple_location (gsi_stmt (gsi)));
> +      gsi_insert_seq_before (incr_pos, stmts, GSI_NEW_STMT);
> +    }
> +}
> +
>  /* Creates an induction variable with value BASE (+/-) STEP * iteration in LOOP.
>     If INCR_OP is PLUS_EXPR, the induction variable is BASE + STEP * iteration.
>     If INCR_OP is MINUS_EXPR, the induction variable is BASE - STEP * iteration.
> @@ -63,7 +96,6 @@ create_iv (tree base, tree_code incr_op, tree step, tree var, class loop *loop,
>             gimple_stmt_iterator *incr_pos, bool after, tree *var_before,
>             tree *var_after)
>  {
> -  gassign *stmt;
>    gphi *phi;
>    tree initial, step1;
>    gimple_seq stmts;
> @@ -126,30 +158,10 @@ create_iv (tree base, tree_code incr_op, tree step, tree var, class loop *loop,
>    if (stmts)
>      gsi_insert_seq_on_edge_immediate (pe, stmts);
>  
> -  stmt = gimple_build_assign (va, incr_op, vb, step);
> -  /* Prevent the increment from inheriting a bogus location if it is not put
> -     immediately after a statement whose location is known.  */
> -  if (after)
> -    {
> -      gimple_stmt_iterator gsi = *incr_pos;
> -      if (!gsi_end_p (gsi))
> -        gsi_next_nondebug (&gsi);
> -      if (gsi_end_p (gsi))
> -        {
> -          edge e = single_succ_edge (gsi_bb (*incr_pos));
> -          gimple_set_location (stmt, e->goto_locus);
> -        }
> -      gsi_insert_after (incr_pos, stmt, GSI_NEW_STMT);
> -    }
> -  else
> -    {
> -      gimple_stmt_iterator gsi = *incr_pos;
> -      if (!gsi_end_p (gsi) && is_gimple_debug (gsi_stmt (gsi)))
> -        gsi_next_nondebug (&gsi);
> -      if (!gsi_end_p (gsi))
> -        gimple_set_location (stmt, gimple_location (gsi_stmt (gsi)));
> -      gsi_insert_before (incr_pos, stmt, GSI_NEW_STMT);
> -    }
> +  gimple_seq incr_stmts = nullptr;
> +  gimple_seq_add_stmt (&incr_stmts,
> +                       gimple_build_assign (va, incr_op, vb, step));
> +  insert_iv_increment (incr_pos, after, incr_stmts);
>  
>    initial = force_gimple_operand (base, &stmts, true, var);
>    if (stmts)
> diff --git a/gcc/tree-ssa-loop-manip.h b/gcc/tree-ssa-loop-manip.h
> index b1f65e3c071..80f680565c0 100644
> --- a/gcc/tree-ssa-loop-manip.h
> +++ b/gcc/tree-ssa-loop-manip.h
> @@ -22,6 +22,7 @@ along with GCC; see the file COPYING3.  If not see
>  
>  typedef void (*transform_callback)(class loop *, void *);
>  
> +extern void insert_iv_increment (gimple_stmt_iterator *, bool, gimple_seq);
>  extern void create_iv (tree, tree_code, tree, tree, class loop *,
>                         gimple_stmt_iterator *, bool, tree *, tree *);
>  extern void rewrite_into_loop_closed_ssa (bitmap, unsigned);
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index 03426207879..eea0b89db69 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -10580,6 +10580,10 @@ vectorizable_induction (loop_vec_info loop_vinfo,
>          [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2].  */
>        if (slp_node)
>          {
> +          gimple_stmt_iterator incr_si;
> +          bool insert_after;
> +          standard_iv_increment_position (iv_loop, &incr_si, &insert_after);
> +
>            /* The initial values are vectorized, but any lanes > group_size
>               need adjustment.  */
>            slp_tree init_node
> @@ -10810,7 +10814,7 @@ vectorizable_induction (loop_vec_info loop_vinfo,
>            vec_def = gimple_build (&stmts,
>                                    PLUS_EXPR, step_vectype, vec_def, up);
>            vec_def = gimple_convert (&stmts, vectype, vec_def);
> -          gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
> +          insert_iv_increment (&incr_si, insert_after, stmts);
>            add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
>                         UNKNOWN_LOCATION);
>  
>

-- 
Richard Biener <rguent...@suse.de>
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)
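For reference, the usage pattern the patch establishes (a sketch
distilled from the two call sites above, not additional code in the
tree; `loop', `va', `vb' and `step' are placeholders for whatever the
caller has in hand) is to ask standard_iv_increment_position for the
preferred spot and then hand a gimple_seq to the new helper:

  gimple_stmt_iterator incr_pos;
  bool insert_after;
  /* Preferred increment position: before the exit condition where
     possible.  */
  standard_iv_increment_position (loop, &incr_pos, &insert_after);

  /* Build the increment as a statement sequence ...  */
  gimple_seq incr_stmts = nullptr;
  gimple_seq_add_stmt (&incr_stmts,
                       gimple_build_assign (va, PLUS_EXPR, vb, step));

  /* ... and let the helper insert it before or after INCR_POS,
     giving the statements a sensible location, as create_iv used
     to do inline.  */
  insert_iv_increment (&incr_pos, insert_after, incr_stmts);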