On Tue, Jun 18, 2019 at 10:57 AM Richard Sandiford
<richard.sandif...@arm.com> wrote:
>
> r272233 changed the handling of fully-masked loops so that the IV type
> can be wider than the comparison type.  However, it also hard-coded the
> IV step to VF, whereas for SLP groups it needs to be VF * group size.
> This introduced execution failures for gcc.target/aarch64/sve/slp_*_run.c
> (and over 100 other execution regressions, which at least gives some
> confidence that this has good coverage in the testsuite :-)).
>
> Also, iv_precision had type widest_int but only needs to be unsigned int.
> (This was an early review comment but I hadn't noticed that it was still
> widest_int in later versions.)
>
> Tested on aarch64-linux-gnu (with and without SVE) and x86_64-linux-gnu.
> OK to install?

OK.

Richard.

> Richard
>
>
> 2019-06-18  Richard Sandiford  <richard.sandif...@arm.com>
>
> gcc/
>         * tree-vect-loop-manip.c (vect_set_loop_masks_directly): Remove
>         vf parameter.  Restore the previous iv step of nscalars_step,
>         but give it iv_type rather than compare_type.  Tweak code order
>         to match the comments.
>         (vect_set_loop_condition_masked): Update accordingly.
>         * tree-vect-loop.c (vect_verify_full_masking): Use "unsigned int"
>         for iv_precision.  Tweak comment formatting.
>
> Index: gcc/tree-vect-loop-manip.c
> ===================================================================
> --- gcc/tree-vect-loop-manip.c  2019-06-18 09:35:59.473831854 +0100
> +++ gcc/tree-vect-loop-manip.c  2019-06-18 09:36:03.301800224 +0100
> @@ -382,8 +382,7 @@ vect_maybe_permute_loop_masks (gimple_se
>     Use LOOP_COND_GSI to insert code before the exit gcond.
>
>     RGM belongs to loop LOOP.  The loop originally iterated NITERS
> -   times and has been vectorized according to LOOP_VINFO.  Each iteration
> -   of the vectorized loop handles VF iterations of the scalar loop.
> +   times and has been vectorized according to LOOP_VINFO.
>
>     If NITERS_SKIP is nonnull, the first iteration of the vectorized loop
>     starts with NITERS_SKIP dummy iterations of the scalar loop before
> @@ -410,8 +409,7 @@ vect_maybe_permute_loop_masks (gimple_se
>  vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,
>                                gimple_seq *preheader_seq,
>                                gimple_stmt_iterator loop_cond_gsi,
> -                              rgroup_masks *rgm, tree vf,
> -                              tree niters, tree niters_skip,
> +                              rgroup_masks *rgm, tree niters, tree niters_skip,
>                                bool might_wrap_p)
>  {
>    tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
> @@ -419,26 +417,28 @@ vect_set_loop_masks_directly (struct loo
>    tree mask_type = rgm->mask_type;
>    unsigned int nscalars_per_iter = rgm->max_nscalars_per_iter;
>    poly_uint64 nscalars_per_mask = TYPE_VECTOR_SUBPARTS (mask_type);
> +  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
>
>    /* Calculate the maximum number of scalar values that the rgroup
>       handles in total, the number that it handles for each iteration
>       of the vector loop, and the number that it should skip during the
>       first iteration of the vector loop.  */
>    tree nscalars_total = niters;
> -  tree nscalars_step = vf;
> +  tree nscalars_step = build_int_cst (iv_type, vf);
>    tree nscalars_skip = niters_skip;
>    if (nscalars_per_iter != 1)
>      {
>        /* We checked before choosing to use a fully-masked loop that these
>           multiplications don't overflow.  */
> -      tree factor = build_int_cst (compare_type, nscalars_per_iter);
> +      tree compare_factor = build_int_cst (compare_type, nscalars_per_iter);
> +      tree iv_factor = build_int_cst (iv_type, nscalars_per_iter);
>        nscalars_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
> -                                     nscalars_total, factor);
> -      nscalars_step = gimple_build (preheader_seq, MULT_EXPR, compare_type,
> -                                    nscalars_step, factor);
> +                                     nscalars_total, compare_factor);
> +      nscalars_step = gimple_build (preheader_seq, MULT_EXPR, iv_type,
> +                                    nscalars_step, iv_factor);
>        if (nscalars_skip)
>          nscalars_skip = gimple_build (preheader_seq, MULT_EXPR, compare_type,
> -                                      nscalars_skip, factor);
> +                                      nscalars_skip, compare_factor);
>      }
>
>    /* Create an induction variable that counts the number of scalars
> @@ -447,15 +447,10 @@ vect_set_loop_masks_directly (struct loo
>    gimple_stmt_iterator incr_gsi;
>    bool insert_after;
>    standard_iv_increment_position (loop, &incr_gsi, &insert_after);
> +  create_iv (build_int_cst (iv_type, 0), nscalars_step, NULL_TREE, loop,
> +             &incr_gsi, insert_after, &index_before_incr, &index_after_incr);
>
> -  tree zero_index = build_int_cst (iv_type, 0);
> -  tree step = build_int_cst (iv_type,
> -                             LOOP_VINFO_VECT_FACTOR (loop_vinfo));
> -  /* Create IV of iv_type.  */
> -  create_iv (zero_index, step, NULL_TREE, loop, &incr_gsi,
> -             insert_after, &index_before_incr, &index_after_incr);
> -
> -  zero_index = build_int_cst (compare_type, 0);
> +  tree zero_index = build_int_cst (compare_type, 0);
>    tree test_index, test_limit, first_limit;
>    gimple_stmt_iterator *test_gsi;
>    if (might_wrap_p)
> @@ -487,7 +482,8 @@ vect_set_loop_masks_directly (struct loo
>          where the rightmost subtraction can be done directly in
>          COMPARE_TYPE.  */
>        test_index = index_before_incr;
> -      tree adjust = nscalars_step;
> +      tree adjust = gimple_convert (preheader_seq, compare_type,
> +                                    nscalars_step);
>        if (nscalars_skip)
>         adjust = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
>                                adjust, nscalars_skip);
> @@ -531,14 +527,16 @@ vect_set_loop_masks_directly (struct loo
>        first_limit = test_limit;
>      }
>
> -  /* Provide a definition of each mask in the group.  */
> -  tree next_mask = NULL_TREE;
> -  tree mask;
> -  unsigned int i;
> +  /* Convert the IV value to the comparison type (either a no-op or
> +     a demotion).  */
>    gimple_seq test_seq = NULL;
>    test_index = gimple_convert (&test_seq, compare_type, test_index);
>    gsi_insert_seq_before (test_gsi, test_seq, GSI_SAME_STMT);
>
> +  /* Provide a definition of each mask in the group.  */
> +  tree next_mask = NULL_TREE;
> +  tree mask;
> +  unsigned int i;
>    FOR_EACH_VEC_ELT_REVERSE (rgm->masks, i, mask)
>      {
>        /* Previous masks will cover BIAS scalars.  This mask covers the
> @@ -672,9 +670,6 @@ vect_set_loop_condition_masked (struct l
>    niters = gimple_convert (&preheader_seq, compare_type, niters);
>
>    widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo);
> -  /* Get the vectorization factor in tree form.  */
> -  tree vf = build_int_cst (compare_type,
> -                           LOOP_VINFO_VECT_FACTOR (loop_vinfo));
>
>    /* Iterate over all the rgroups and fill in their masks.  We could use
>       the first mask from any rgroup for the loop condition; here we
> @@ -709,7 +704,7 @@ vect_set_loop_condition_masked (struct l
>           /* Set up all masks for this group.  */
>           test_mask = vect_set_loop_masks_directly (loop, loop_vinfo,
>                                                     &preheader_seq,
> -                                                   loop_cond_gsi, rgm, vf,
> +                                                   loop_cond_gsi, rgm,
>                                                     niters, niters_skip,
>                                                     might_wrap_p);
>         }
> Index: gcc/tree-vect-loop.c
> ===================================================================
> --- gcc/tree-vect-loop.c  2019-06-18 09:35:54.921869466 +0100
> +++ gcc/tree-vect-loop.c  2019-06-18 09:36:03.305800190 +0100
> @@ -1062,7 +1062,7 @@ vect_verify_full_masking (loop_vec_info
>    tree cmp_type = NULL_TREE;
>    tree iv_type = NULL_TREE;
>    widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo);
> -  widest_int iv_precision = UINT_MAX;
> +  unsigned int iv_precision = UINT_MAX;
>
>    if (iv_limit != -1)
>      iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
> @@ -1083,12 +1083,12 @@ vect_verify_full_masking (loop_vec_info
>         best choice:
>
>         - An IV that's Pmode or wider is more likely to be reusable
> -        in address calculations than an IV that's narrower than
> -        Pmode.
> +          in address calculations than an IV that's narrower than
> +          Pmode.
>
>         - Doing the comparison in IV_PRECISION or wider allows
> -        a natural 0-based IV, whereas using a narrower comparison
> -        type requires mitigations against wrap-around.
> +          a natural 0-based IV, whereas using a narrower comparison
> +          type requires mitigations against wrap-around.
>
>         Conversely, if the IV limit is variable, doing the comparison
>         in a wider type than the original type can introduce
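
For readers less familiar with the rgroup bookkeeping, here is a minimal standalone sketch of the arithmetic the patch restores; it is illustrative only, not GCC code and not part of the patch. The names vf, group_size, niters and nscalars_step are chosen for the example, vf is treated as a fixed integer (in GCC it is a poly_uint64), and the real code builds the step as a tree constant in IV_TYPE rather than COMPARE_TYPE.

```cpp
// Illustrative sketch: why the IV step for a fully-masked SLP loop must be
// VF * group size rather than VF alone.  All values are plain integers here.
#include <cstdio>
#include <cstdint>

int main ()
{
  const uint64_t vf = 4;          // vectorization factor (assumed fixed for illustration)
  const uint64_t group_size = 3;  // scalars per scalar iteration for this rgroup
  const uint64_t niters = 20;     // scalar loop iterations

  // Total scalars the rgroup handles, and how many each vector iteration
  // consumes.  Using vf alone as the step (the bug) would undercount for
  // SLP groups; the correct step is vf * group_size.
  const uint64_t nscalars_total = niters * group_size;
  const uint64_t nscalars_step = vf * group_size;

  for (uint64_t iv = 0; iv < nscalars_total; iv += nscalars_step)
    {
      const uint64_t end = iv + nscalars_step < nscalars_total
                           ? iv + nscalars_step : nscalars_total;
      std::printf ("mask covers scalars [%llu, %llu)\n",
                   (unsigned long long) iv, (unsigned long long) end);
    }
  return 0;
}
```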