> -----Original Message-----
> From: Richard Biener <rguent...@suse.de>
> Sent: Tuesday, April 15, 2025 12:49 PM
> To: Tamar Christina <tamar.christ...@arm.com>
> Cc: gcc-patches@gcc.gnu.org; nd <n...@arm.com>
> Subject: Re: [PATCH]middle-end: Fix incorrect codegen with PFA and VLS [PR119351]
>
> On Tue, 15 Apr 2025, Tamar Christina wrote:
>
> > Hi All,
> >
> > The following example:
> >
> > #define N 512
> > #define START 2
> > #define END 505
> >
> > int x[N] __attribute__((aligned(32)));
> >
> > int __attribute__((noipa))
> > foo (void)
> > {
> >   for (signed int i = START; i < END; ++i)
> >     {
> >       if (x[i] == 0)
> >         return i;
> >     }
> >   return -1;
> > }
> >
> > generates incorrect code with fixed-length SVE because for early break we
> > need to know which value to start the scalar loop with if we take an early
> > exit.
> >
> > Historically this means that we take the first element of every induction.
> > This is because there's an assumption in place that, even with masked loops,
> > the masks come from a whilel* instruction.
> >
> > As such we reduce using a BIT_FIELD_REF <, 0>.
> >
> > When PFA was added this assumption was correct for non-masked loops; however,
> > we assumed that PFA for VLA wouldn't work for now, and disabled it using the
> > alignment requirement checks.  We also expected VLS to PFA using scalar loops.
> >
> > However, as this PR shows, for VLS the vectorizer can, and in some
> > circumstances does, choose to peel using masks by masking the first iteration
> > of the loop with an additional alignment mask.
> >
> > When this is done, the first elements of the predicate can be inactive.  In
> > this example element 1 is inactive based on the calculated misalignment,
> > hence the -1 value in the first vector IV element.
> >
> > When we reduce using BIT_FIELD_REF we get the wrong value.
> >
> > This patch updates it by creating a new scalar PHI that keeps track of
> > whether we are in the first iteration of the loop (with the additional
> > masking) or whether we have taken a loop iteration already.
> >
> > The generated sequence:
> >
> > pre-header:
> >   bb1:
> >     i_1 = <number of leading inactive elements>
> >
> > header:
> >   bb2:
> >     i_2 = PHI <i_1(bb1), 0(latch)>
> >     …
> >
> > early-exit:
> >   bb3:
> >     i_3 = iv_step * i_2 + PHI<vector-iv>
> >
> > This eliminates the need to do an expensive mask-based reduction.
> >
> > This fixes gromacs with one OpenMP thread.  But with > 1 there is still an
> > issue.
> >
> > Bootstrapped and regtested on aarch64-none-linux-gnu,
> > arm-none-linux-gnueabihf, x86_64-pc-linux-gnu -m32, -m64 and no issues.
> >
> > Ok for master?
> >
> > Thanks,
> > Tamar
> >
> > gcc/ChangeLog:
> >
> > PR tree-optimization/119351
> > * tree-vect-loop-manip.cc (vect_can_peel_nonlinear_iv_p): Reject PFA
> > with masking with non-linear IVs.
> > * tree-vect-loop.cc (vectorizable_induction): Support PFA for masking.
> >
> > gcc/testsuite/ChangeLog:
> >
> > PR tree-optimization/119351
> > * gcc.target/aarch64/sve/peel_ind_10.c: New test.
> > * gcc.target/aarch64/sve/peel_ind_10_run.c: New test.
> > * gcc.target/aarch64/sve/peel_ind_5.c: New test.
> > * gcc.target/aarch64/sve/peel_ind_5_run.c: New test.
> > * gcc.target/aarch64/sve/peel_ind_6.c: New test.
> > * gcc.target/aarch64/sve/peel_ind_6_run.c: New test.
> > * gcc.target/aarch64/sve/peel_ind_7.c: New test.
> > * gcc.target/aarch64/sve/peel_ind_7_run.c: New test.
> > * gcc.target/aarch64/sve/peel_ind_8.c: New test.
> > * gcc.target/aarch64/sve/peel_ind_8_run.c: New test.
> > * gcc.target/aarch64/sve/peel_ind_9.c: New test. > > * gcc.target/aarch64/sve/peel_ind_9_run.c: New test. > > > > --- > > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_10.c > b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_10.c > > new file mode 100644 > > index > 0000000000000000000000000000000000000000..b7a7bc5cb0cfdfdb74adb12 > 0c54ba15019832cf1 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_10.c > > @@ -0,0 +1,24 @@ > > +/* Fix for PR119351 alignment peeling with vectors and VLS. */ > > +/* { dg-do compile } */ > > +/* { dg-options "-Ofast -msve-vector-bits=256 --param aarch64-autovec- > preference=sve-only -fdump-tree-vect-details" } */ > > + > > +#define N 512 > > +#define START 0 > > +#define END 505 > > + > > +int x[N] __attribute__((aligned(32))); > > + > > +int __attribute__((noipa)) > > +foo (int start) > > +{ > > + for (unsigned int i = start; i < END; ++i) > > + { > > + if (x[i] == 0) > > + return i; > > + } > > + return -1; > > +} > > + > > +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */ > > +/* { dg-final { scan-tree-dump "pfa_iv_offset" "vect" } } */ > > +/* { dg-final { scan-tree-dump "Alignment of access forced using peeling" > > "vect" > } } */ > > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_10_run.c > b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_10_run.c > > new file mode 100644 > > index > 0000000000000000000000000000000000000000..6169aebcc40cc1553f30c1a > f61ccec91b51cdb42 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_10_run.c > > @@ -0,0 +1,17 @@ > > +/* Fix for PR119351 alignment peeling with vectors and VLS. */ > > +/* { dg-do run { target aarch64_sve_hw } } */ > > +/* { dg-options "-Ofast --param aarch64-autovec-preference=sve-only" } */ > > +/* { dg-additional-options "-msve-vector-bits=256" { target > aarch64_sve256_hw } } */ > > +/* { dg-additional-options "-msve-vector-bits=128" { target > aarch64_sve128_hw } } */ > > + > > +#include "peel_ind_10.c" > > + > > +int __attribute__ ((optimize (1))) > > +main (void) > > +{ > > + int res = foo (START); > > + asm volatile (""); > > + if (res != START) > > + __builtin_abort (); > > + return 0; > > +} > > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_5.c > b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_5.c > > new file mode 100644 > > index > 0000000000000000000000000000000000000000..a03bb1dec21ef75aa0cbfb2 > 2c8bb02b99644239e > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_5.c > > @@ -0,0 +1,24 @@ > > +/* Fix for PR119351 alignment peeling with vectors and VLS. 
*/ > > +/* { dg-do compile } */ > > +/* { dg-options "-Ofast -msve-vector-bits=256 --param aarch64-autovec- > preference=sve-only -fdump-tree-vect-details" } */ > > + > > +#define N 512 > > +#define START 2 > > +#define END 505 > > + > > +int x[N] __attribute__((aligned(32))); > > + > > +int __attribute__((noipa)) > > +foo (void) > > +{ > > + for (signed int i = START; i < END; ++i) > > + { > > + if (x[i] == 0) > > + return i; > > + } > > + return -1; > > +} > > + > > +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */ > > +/* { dg-final { scan-tree-dump "pfa_iv_offset" "vect" } } */ > > +/* { dg-final { scan-tree-dump "Alignment of access forced using peeling" > > "vect" > } } */ > > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_5_run.c > b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_5_run.c > > new file mode 100644 > > index > 0000000000000000000000000000000000000000..f26befeab7e53561f84b03 > 7aec857b44cf018456 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_5_run.c > > @@ -0,0 +1,17 @@ > > +/* Fix for PR119351 alignment peeling with vectors and VLS. */ > > +/* { dg-do run { target aarch64_sve_hw } } */ > > +/* { dg-options "-Ofast --param aarch64-autovec-preference=sve-only" } */ > > +/* { dg-additional-options "-msve-vector-bits=256" { target > aarch64_sve256_hw } } */ > > +/* { dg-additional-options "-msve-vector-bits=128" { target > aarch64_sve128_hw } } */ > > + > > +#include "peel_ind_5.c" > > + > > +int __attribute__ ((optimize (1))) > > +main (void) > > +{ > > + int res = foo (); > > + asm volatile (""); > > + if (res != START) > > + __builtin_abort (); > > + return 0; > > +} > > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_6.c > b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_6.c > > new file mode 100644 > > index > 0000000000000000000000000000000000000000..9bfd1a65c4feb0c140d4abf > 98508fc8af08042ba > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_6.c > > @@ -0,0 +1,24 @@ > > +/* Fix for PR119351 alignment peeling with vectors and VLS. */ > > +/* { dg-do compile } */ > > +/* { dg-options "-Ofast -msve-vector-bits=256 --param aarch64-autovec- > preference=sve-only -fdump-tree-vect-details" } */ > > + > > +#define N 512 > > +#define START 1 > > +#define END 505 > > + > > +int x[N] __attribute__((aligned(32))); > > + > > +int __attribute__((noipa)) > > +foo (int start) > > +{ > > + for (unsigned int i = start; i < END; ++i) > > + { > > + if (x[i] == 0) > > + return i; > > + } > > + return -1; > > +} > > + > > +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */ > > +/* { dg-final { scan-tree-dump "pfa_iv_offset" "vect" } } */ > > +/* { dg-final { scan-tree-dump "Alignment of access forced using peeling" > > "vect" > } } */ > > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_6_run.c > b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_6_run.c > > new file mode 100644 > > index > 0000000000000000000000000000000000000000..4fdf3e4e7cac70dc48bad48 > 7db37e1e5838b87ab > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_6_run.c > > @@ -0,0 +1,17 @@ > > +/* Fix for PR119351 alignment peeling with vectors and VLS. 
*/ > > +/* { dg-do run { target aarch64_sve_hw } } */ > > +/* { dg-options "-Ofast --param aarch64-autovec-preference=sve-only" } */ > > +/* { dg-additional-options "-msve-vector-bits=256" { target > aarch64_sve256_hw } } */ > > +/* { dg-additional-options "-msve-vector-bits=128" { target > aarch64_sve128_hw } } */ > > + > > +#include "peel_ind_6.c" > > + > > +int __attribute__ ((optimize (1))) > > +main (void) > > +{ > > + int res = foo (START); > > + asm volatile (""); > > + if (res != START) > > + __builtin_abort (); > > + return 0; > > +} > > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_7.c > b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_7.c > > new file mode 100644 > > index > 0000000000000000000000000000000000000000..0182e131a173b7b05e88c > 3393ba854b2da25c6b2 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_7.c > > @@ -0,0 +1,24 @@ > > +/* Fix for PR119351 alignment peeling with vectors and VLS. */ > > +/* { dg-do compile } */ > > +/* { dg-options "-Ofast -msve-vector-bits=256 --param aarch64-autovec- > preference=sve-only -fdump-tree-vect-details" } */ > > + > > +#define N 512 > > +#define START 1 > > +#define END 505 > > + > > +int x[N] __attribute__((aligned(32))); > > + > > +int __attribute__((noipa)) > > +foo (void) > > +{ > > + for (unsigned int i = START; i < END; ++i) > > + { > > + if (x[i] == 0) > > + return i; > > + } > > + return -1; > > +} > > + > > +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */ > > +/* { dg-final { scan-tree-dump "pfa_iv_offset" "vect" } } */ > > +/* { dg-final { scan-tree-dump "Alignment of access forced using peeling" > > "vect" > } } */ > > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_7_run.c > b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_7_run.c > > new file mode 100644 > > index > 0000000000000000000000000000000000000000..05608dd85f13912f8555ac > 3f39284f6894875998 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_7_run.c > > @@ -0,0 +1,17 @@ > > +/* Fix for PR119351 alignment peeling with vectors and VLS. */ > > +/* { dg-do run { target aarch64_sve_hw } } */ > > +/* { dg-options "-Ofast --param aarch64-autovec-preference=sve-only" } */ > > +/* { dg-additional-options "-msve-vector-bits=256" { target > aarch64_sve256_hw } } */ > > +/* { dg-additional-options "-msve-vector-bits=128" { target > aarch64_sve128_hw } } */ > > + > > +#include "peel_ind_7.c" > > + > > +int __attribute__ ((optimize (1))) > > +main (void) > > +{ > > + int res = foo (); > > + asm volatile (""); > > + if (res != START) > > + __builtin_abort (); > > + return 0; > > +} > > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_8.c > b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_8.c > > new file mode 100644 > > index > 0000000000000000000000000000000000000000..043348b55d0e8e5e5a5c4 > 61b4a4f22b45dfba8e8 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_8.c > > @@ -0,0 +1,24 @@ > > +/* Fix for PR119351 alignment peeling with vectors and VLS. 
*/ > > +/* { dg-do compile } */ > > +/* { dg-options "-Ofast -msve-vector-bits=256 --param aarch64-autovec- > preference=sve-only -fdump-tree-vect-details" } */ > > + > > +#define N 512 > > +#define START 1 > > +#define END 505 > > + > > +int x[N] __attribute__((aligned(32))); > > + > > +int __attribute__((noipa)) > > +foo (void) > > +{ > > + for (unsigned int i = START; i < END; i*=2) > > + { > > + if (x[i] == 0) > > + return i; > > + } > > + return -1; > > +} > > + > > +/* { dg-final { scan-tree-dump-not "LOOP VECTORIZED" "vect" } } */ > > +/* { dg-final { scan-tree-dump-not "pfa_iv_offset" "vect" } } */ > > +/* { dg-final { scan-tree-dump-not "Alignment of access forced using > > peeling" > "vect" } } */ > > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_8_run.c > b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_8_run.c > > new file mode 100644 > > index > 0000000000000000000000000000000000000000..aa8612248bffdc9f4367b8f > 6699d395ab2726dec > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_8_run.c > > @@ -0,0 +1,17 @@ > > +/* Fix for PR119351 alignment peeling with vectors and VLS. */ > > +/* { dg-do run { target aarch64_sve_hw } } */ > > +/* { dg-options "-Ofast --param aarch64-autovec-preference=sve-only" } */ > > +/* { dg-additional-options "-msve-vector-bits=256" { target > aarch64_sve256_hw } } */ > > +/* { dg-additional-options "-msve-vector-bits=128" { target > aarch64_sve128_hw } } */ > > + > > +#include "peel_ind_8.c" > > + > > +int __attribute__ ((optimize (1))) > > +main (void) > > +{ > > + int res = foo (); > > + asm volatile (""); > > + if (res != START) > > + __builtin_abort (); > > + return 0; > > +} > > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_9.c > b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_9.c > > new file mode 100644 > > index > 0000000000000000000000000000000000000000..cc904e88170f072e1d3c6b > e86643d99a7cd5cb12 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_9.c > > @@ -0,0 +1,25 @@ > > +/* Fix for PR119351 alignment peeling with vectors and VLS. */ > > +/* { dg-do compile } */ > > +/* { dg-options "-Ofast -msve-vector-bits=256 --param aarch64-autovec- > preference=sve-only -fdump-tree-vect-details" } */ > > + > > +#define N 512 > > +#define START 1 > > +#define END 505 > > + > > +int x[N] __attribute__((aligned(32))); > > + > > +int __attribute__((noipa)) > > +foo (void) > > +{ > > + for (int *p = x + START; p < x + END; p++) > > + { > > + if (*p == 0) > > + return START; > > + } > > + return -1; > > +} > > + > > +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */ > > +/* Peels using a scalar loop. */ > > +/* { dg-final { scan-tree-dump-not "pfa_iv_offset" "vect" } } */ > > +/* { dg-final { scan-tree-dump "Alignment of access forced using peeling" > > "vect" > } } */ > > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_9_run.c > b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_9_run.c > > new file mode 100644 > > index > 0000000000000000000000000000000000000000..767f8bd284ca7c3b9f595c > 5428c20175ed176d96 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_9_run.c > > @@ -0,0 +1,17 @@ > > +/* Fix for PR119351 alignment peeling with vectors and VLS. 
*/ > > +/* { dg-do run { target aarch64_sve_hw } } */ > > +/* { dg-options "-Ofast --param aarch64-autovec-preference=sve-only" } */ > > +/* { dg-additional-options "-msve-vector-bits=256" { target > aarch64_sve256_hw } } */ > > +/* { dg-additional-options "-msve-vector-bits=128" { target > aarch64_sve128_hw } } */ > > + > > +#include "peel_ind_9.c" > > + > > +int __attribute__ ((optimize (1))) > > +main (void) > > +{ > > + int res = foo (); > > + asm volatile (""); > > + if (res != START) > > + __builtin_abort (); > > + return 0; > > +} > > diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc > > index > 56a4e9a8b63f3cae0bf596bf5d22893887dc80e8..0722679d6e66e5dd5af4ec1c > e591f7c38b76d07f 100644 > > --- a/gcc/tree-vect-loop-manip.cc > > +++ b/gcc/tree-vect-loop-manip.cc > > @@ -2195,6 +2195,22 @@ vect_can_peel_nonlinear_iv_p (loop_vec_info > loop_vinfo, > > return false; > > } > > > > + /* With early break vectorization we don't know whether the accesses > > will stay > > + inside the loop or not. TODO: The early break adjustment code can be > > + implemented the same way for vectorizable_linear_induction. However > > we > > + can't test this today so reject it. */ > > + if (niters_skip != NULL_TREE > > + && vect_use_loop_mask_for_alignment_p (loop_vinfo) > > + && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > > + && LOOP_VINFO_EARLY_BREAKS (loop_vinfo)) > > + { > > + if (dump_enabled_p ()) > > + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > > + "Peeling for alignement using masking is not supported" > > + " for nonlinear induction when using early breaks.\n"); > > + return false; > > + } > > + > > return true; > > } > > > > diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc > > index > 9413dcef702597ab27165e676546b190e2bd36ba..6dcdee19bb250993d8cc6b0 > 057d2fa46245d04d9 100644 > > --- a/gcc/tree-vect-loop.cc > > +++ b/gcc/tree-vect-loop.cc > > @@ -10678,6 +10678,104 @@ vectorizable_induction (loop_vec_info > loop_vinfo, > > LOOP_VINFO_MASK_SKIP_NITERS > (loop_vinfo)); > > peel_mul = gimple_build_vector_from_val (&init_stmts, > > step_vectype, peel_mul); > > + > > + /* If early break then we have to create a new PHI which we can use as > > + an offset to adjust the induction reduction in early exits. */ > > + if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)) > > + { > > + auto skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo); > > + tree ty_skip_niters = TREE_TYPE (skip_niters); > > + tree break_lhs_phi = NULL_TREE; > > + break_lhs_phi = vect_get_new_vect_var (ty_skip_niters, > > + vect_scalar_var, > > + "pfa_iv_offset"); > > + gphi *nphi = create_phi_node (break_lhs_phi, bb); > > + add_phi_arg (nphi, skip_niters, pe, UNKNOWN_LOCATION); > > + add_phi_arg (nphi, build_zero_cst (ty_skip_niters), > > + loop_latch_edge (iv_loop), UNKNOWN_LOCATION); > > + > > + /* Rewrite all the early exit usages. */ > > I don't like that you fixup exit uses here, that IMO makes it a quite ugly > workaround. Creating an extra IV is OK (but you only need one per > function?) - on x86 I suppose making the mask live instead of using > a scalar IV would be cheaper - but then the actual adjustment should > be done in vectorizable_live_operation where we already do >
Ack.  FTR I had considered this and didn't do it because I thought you'd
consider that one a hack.  So I flipped the coin wrong :)

>       /* For early exit where the exit is not in the BB that leads
>          to the latch then we're restarting the iteration in the
>          scalar loop.  So get the first live value.  */
>       if ((all_exits_as_early_p || !main_exit_edge)
>           && STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
>         {
>           tmp_vec_lhs = vec_lhs0;
>           tmp_bitstart = build_zero_cst (TREE_TYPE (bitstart));
>         }
>
> so we could emit the compensation for vec_lhs0 there, and
> vectorizable_induction would store the extra
> early-break-induction-IV PHI in loop_vinfo.
>
> Besides this (something I'd at least want to see considered for a stage1
> cleanup), some more review below:
>
>
> I understand that below you rewrite the scalar code for the relevant
> exits, but while the commit message possibly documents the details,
> the actual code below lacks them.  Can you add a comment on what you
> insert between the LC PHI result of the induction and its uses?
>
> > +	  tree phi_lhs = PHI_RESULT (phi);
> > +	  imm_use_iterator iter;
> > +	  use_operand_p use_p;
> > +	  gimple *use_stmt;
> > +
> > +	  FOR_EACH_IMM_USE_FAST (use_p, iter, phi_lhs)
> > +	    {
> > +	      use_stmt = USE_STMT (use_p);
> > +	      if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt))
> > +		  && is_a <gphi *> (use_stmt))
> > +		{
> > +		  auto gsi = gsi_last_bb (use_stmt->bb);
> > +		  for (auto exit : get_loop_exit_edges (iv_loop))
> > +		    if (exit != LOOP_VINFO_IV_EXIT (loop_vinfo)
>
> what about LOOP_VINFO_EARLY_BREAKS_VECT_PEELED?
>

That's fine.  This does not check the destination block, only the source.
For PEELED it'll still find the exit block through the
!LOOP_VINFO_IV_EXIT (loop_vinfo) exits.

Note that this loop is only needed because we store only
LOOP_VINFO_IV_EXIT (loop_vinfo), as we initially thought storing the merge
BB of the early exit wouldn't be needed as you can find it through the
exits.  But this is the 4th or 5th time where having this block stored
would result in simpler code.  Perhaps we should revisit this, since for a
lot of cases all we really need is this BB.  The downside is that any
splitting of the exit edges potentially needs this to be updated.

> I think you are double-visiting uses x exits here.  Instead when
> iterating FOR_EACH_IMM_USE_FAST you should get the exit edge
> via
>

I do break on the first early exit below to avoid this.  But...

> edge e = gimple_phi_arg_edge (use_stmt, phi_arg_index_from_use (use_p));

Is indeed simpler.  I forgot about phi_arg_index_from_use.

Thanks,
Tamar

>
> > +		      && bb == exit->src)
> > +		    {
> > +		      /* Now create the PHI for the outside loop usage to
> > +			 retrieve the value for the offset counter.  */
> > +		      tree rphi_lhs = make_ssa_name (ty_skip_niters);
> > +		      gphi *rphi
> > +			= create_phi_node (rphi_lhs, use_stmt->bb);
> > +		      for (unsigned i = 0; i < gimple_phi_num_args (rphi);
> > +			   i++)
> > +			SET_PHI_ARG_DEF (rphi, i, PHI_RESULT (nphi));
> > +
> > +		      tree tmp = make_ssa_name (TREE_TYPE (phi_lhs));
> > +		      tree stmt_lhs = PHI_RESULT (use_stmt);
> > +		      imm_use_iterator iter2;
> > +		      gimple *use_stmt2;
> > +		      use_operand_p use2_p;
> > +
> > +		      /* Now rewrite all the usages first.  */
> > +		      FOR_EACH_IMM_USE_STMT (use_stmt2, iter2, stmt_lhs)
> > +			FOR_EACH_IMM_USE_ON_STMT (use2_p, iter2)
> > +			  SET_USE (use2_p, tmp);
> > +
> > +		      /* And then generate the adjustment to avoid the
> > +			 update code from updating this new usage.  The
> > +			 multiplication is to get the original IV and the
> > +			 downwards counting IV correct.  */
> > +		      gimple_seq iv_stmts = NULL;
> > +		      tree rphi_step
> > +			= gimple_convert (&iv_stmts, ty_skip_niters,
> > +					  step_expr);
> > +		      tree tmp2
> > +			= gimple_build (&iv_stmts, MULT_EXPR,
> > +					ty_skip_niters, rphi_step,
> > +					PHI_RESULT (rphi));
> > +
> > +		      if (POINTER_TYPE_P (TREE_TYPE (stmt_lhs)))
> > +			tmp2
> > +			  = gimple_build (&iv_stmts, POINTER_PLUS_EXPR,
> > +					  TREE_TYPE (stmt_lhs), stmt_lhs,
> > +					  tmp2);
> > +		      else
> > +			{
> > +			  tmp2
> > +			    = gimple_convert (&iv_stmts,
> > +					      TREE_TYPE (stmt_lhs),
> > +					      tmp2);
> > +			  tmp2
> > +			    = gimple_build (&iv_stmts, PLUS_EXPR,
> > +					    TREE_TYPE (stmt_lhs), stmt_lhs,
> > +					    tmp2);
> > +			}
> > +
> > +		      gsi_insert_seq_before (&gsi, iv_stmts,
> > +					     GSI_SAME_STMT);
> > +		      gimple *cvt_stmt =
> > +			gimple_build_assign (tmp, VIEW_CONVERT_EXPR,
> > +					     build1 (VIEW_CONVERT_EXPR,
> > +						     TREE_TYPE (phi_lhs),
> > +						     tmp2));
>
> You can use gimple_build (... VIEW_CONVERT_EXPR, ..) just fine; we
> special-case this and properly build the build1.
>
> > +		      gsi_insert_before (&gsi, cvt_stmt, GSI_SAME_STMT);
> > +		    }
> > +		  /* All early exits point to the same common block, so we
> > +		     only have to find the first one.  */
> > +		  break;
> > +		}
> > +	    }
> > +	}
> >      }
> >    tree step_mul = NULL_TREE;
> >    unsigned ivn;
> >
> >
>
> --
> Richard Biener <rguent...@suse.de>
> SUSE Software Solutions Germany GmbH,
> Frankenstrasse 146, 90461 Nuernberg, Germany;
> GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)
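
P.S. For reference, a rough, untested sketch of what the
FOR_EACH_IMM_USE_FAST loop could look like when getting the exit edge via
phi_arg_index_from_use instead of walking all loop exits.  exit_e is just
an illustrative name here; the other variables are as in the hunk above:

  FOR_EACH_IMM_USE_FAST (use_p, iter, phi_lhs)
    {
      gimple *use_stmt = USE_STMT (use_p);
      if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt))
	  && is_a <gphi *> (use_stmt))
	{
	  /* The LC PHI argument tells us directly which exit edge this
	     use comes in on, so there is no need to iterate over
	     get_loop_exit_edges.  */
	  edge exit_e
	    = gimple_phi_arg_edge (as_a <gphi *> (use_stmt),
				   phi_arg_index_from_use (use_p));
	  if (exit_e == LOOP_VINFO_IV_EXIT (loop_vinfo))
	    continue;
	  /* ... create rphi in exit_e->dest and adjust the LC PHI
	     result as before ...  */
	}
    }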