Kugan Vivekanandarajah <kugan.vivekanandara...@linaro.org> writes:
> [...]
> diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c
> index b3fae5b..c15b8a2 100644
> --- a/gcc/tree-vect-loop-manip.c
> +++ b/gcc/tree-vect-loop-manip.c
> @@ -415,10 +415,16 @@ vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,
>  			       bool might_wrap_p)
>  {
>    tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
> +  tree iv_type = LOOP_VINFO_MASK_IV_TYPE (loop_vinfo);
>    tree mask_type = rgm->mask_type;
>    unsigned int nscalars_per_iter = rgm->max_nscalars_per_iter;
>    poly_uint64 nscalars_per_mask = TYPE_VECTOR_SUBPARTS (mask_type);
> +  bool convert = false;
>
> +  /* If the compare_type is not iv_type, we will create an IV with
> +     iv_type with truncated use (i.e. converted to the correct type).  */
> +  if (compare_type != iv_type)
> +    convert = true;
>    /* Calculate the maximum number of scalar values that the rgroup
>       handles in total, the number that it handles for each iteration
>       of the vector loop, and the number that it should skip during the
> @@ -444,12 +450,43 @@ vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,
>       processed.  */
>    tree index_before_incr, index_after_incr;
>    gimple_stmt_iterator incr_gsi;
> +  gimple_stmt_iterator incr_gsi2;
>    bool insert_after;
> -  tree zero_index = build_int_cst (compare_type, 0);
> +  tree zero_index;
>    standard_iv_increment_position (loop, &incr_gsi, &insert_after);
> -  create_iv (zero_index, nscalars_step, NULL_TREE, loop, &incr_gsi,
> -	     insert_after, &index_before_incr, &index_after_incr);
>
> +  if (convert)
> +    {
> +      /* If we are creating IV of iv_type and then converting.  */
> +      zero_index = build_int_cst (iv_type, 0);
> +      tree step = build_int_cst (iv_type,
> +				 LOOP_VINFO_VECT_FACTOR (loop_vinfo));
> +      /* Creating IV of iv_type.  */
> +      create_iv (zero_index, step, NULL_TREE, loop, &incr_gsi,
> +		 insert_after, &index_before_incr, &index_after_incr);
> +      /* Create truncated index_before and after increament.  */
> +      tree index_before_incr_trunc = make_ssa_name (compare_type);
> +      tree index_after_incr_trunc = make_ssa_name (compare_type);
> +      gimple *incr_before_stmt = gimple_build_assign (index_before_incr_trunc,
> +						      NOP_EXPR,
> +						      index_before_incr);
> +      gimple *incr_after_stmt = gimple_build_assign (index_after_incr_trunc,
> +						     NOP_EXPR,
> +						     index_after_incr);
> +      incr_gsi2 = incr_gsi;
> +      gsi_insert_before (&incr_gsi2, incr_before_stmt, GSI_NEW_STMT);
> +      gsi_insert_after (&incr_gsi, incr_after_stmt, GSI_NEW_STMT);
> +      index_before_incr = index_before_incr_trunc;
> +      index_after_incr = index_after_incr_trunc;
> +      zero_index = build_int_cst (compare_type, 0);
> +    }
> +  else
> +    {
> +      /* If the IV is of compare_type, no convertion needed.  */
> +      zero_index = build_int_cst (compare_type, 0);
> +      create_iv (zero_index, nscalars_step, NULL_TREE, loop, &incr_gsi,
> +		 insert_after, &index_before_incr, &index_after_incr);
> +    }
>    tree test_index, test_limit, first_limit;
>    gimple_stmt_iterator *test_gsi;
>    if (might_wrap_p)
Now that we have an explicit iv_type, there shouldn't be any need to
treat this as two special cases.  I think we should just convert the IV
to the comparison type before passing it to the WHILE.

> @@ -617,6 +654,41 @@ vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,
>    return next_mask;
>  }
>
> +/* Return the iv_limit for fully masked loop LOOP with LOOP_VINFO.
> +   If it is not possible to calcilate iv_limit, return -1.  */

Maybe:

/* Decide whether it is possible to use a zero-based induction variable
   when vectorizing LOOP_VINFO with a fully-masked loop.  If it is,
   return the value that the induction variable must be able to hold
   in order to ensure that the loop ends with an all-false mask.
   Return -1 otherwise.  */

I think the function should go in tree-vect-loop.c instead.

> +widest_int
> +vect_get_loop_iv_limit (struct loop *loop, loop_vec_info loop_vinfo)

Maybe: vect_iv_limit_for_full_masking

Probably worth dropping the "loop" parameter and getting it from
LOOP_VINFO.

> +{
> +  /* Convert skip_niters to the right type.  */

Comment no longer applies.

> +  tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
> +  unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
> +
> +  /* Now calculate the value that the induction variable must be able
> +     to hit in order to ensure that we end the loop with an all-false mask.
> +     This involves adding the maximum number of inactive trailing scalar
> +     iterations.  */
> +  widest_int iv_limit = -1;
> +  bool known_max_iters = max_loop_iterations (loop, &iv_limit);
> +  if (known_max_iters)

No need for this temporary variable.

> +    {
> +      if (niters_skip)
> +	{
> +	  /* Add the maximum number of skipped iterations to the
> +	     maximum iteration count.  */
> +	  if (TREE_CODE (niters_skip) == INTEGER_CST)
> +	    iv_limit += wi::to_widest (niters_skip);
> +	  else
> +	    iv_limit += max_vf - 1;
> +	}

Note that MASK_SKIP_NITERS isn't set at the point you call it for
vect_set_loop_condition_masked.  I think we should have:

	else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
	  /* Make a conservatively-correct assumption.  */
	  iv_limit += max_vf - 1;

> +      /* IV_LIMIT is the maximum number of latch iterations, which is also
> +	 the maximum in-range IV value.  Round this value down to the previous
> +	 vector alignment boundary and then add an extra full iteration.  */
> +      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
> +      iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
> +    }
> +  return iv_limit;
> +}
> +
>  /* Make LOOP iterate NITERS times using masking and WHILE_ULT calls.
>     LOOP_VINFO describes the vectorization of LOOP.  NITERS is the
>     number of iterations of the original scalar loop that should be
> [...]
> diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
> index e1229a5..431025b 100644
> --- a/gcc/tree-vect-loop.c
> +++ b/gcc/tree-vect-loop.c
> @@ -1056,6 +1056,16 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
>    /* Find a scalar mode for which WHILE_ULT is supported.  */
>    opt_scalar_int_mode cmp_mode_iter;
>    tree cmp_type = NULL_TREE;
> +  tree iv_type = NULL_TREE;
> +  widest_int iv_limit = vect_get_loop_iv_limit (loop, loop_vinfo);
> +  widest_int iv_precision = -1;

iv_precision should be unsigned int.  Setting it to UINT_MAX would
simplify the later code.

> +
> +  if (iv_limit != -1)
> +    iv_precision
> +      = wi::min_precision (iv_limit
> +			   * vect_get_max_nscalars_per_iter (loop_vinfo),
> +			   UNSIGNED);
> +

Would be good to avoid the duplicated call to
vect_get_max_nscalars_per_iter (also called for min_ni_width).
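For illustration, one possible shape of this, completely untested (the
function name and the LOOP_VINFO-only interface follow the suggestions
above, and the existing min_ni_width/max_ni code would reuse the hoisted
max_nscalars_per_iter):

  unsigned int max_nscalars_per_iter
    = vect_get_max_nscalars_per_iter (loop_vinfo);
  ...
  tree iv_type = NULL_TREE;
  widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo);
  unsigned int iv_precision = UINT_MAX;

  if (iv_limit != -1)
    iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
				      UNSIGNED);

The mode loop below can then compare iv_precision against cmp_bits
directly, without a separate iv_limit check.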
>    FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
>      {
>        unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
> @@ -1066,13 +1076,25 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
>  	  if (this_type
>  	      && can_produce_all_loop_masks_p (loop_vinfo, this_type))
>  	    {
> -	      /* Although we could stop as soon as we find a valid mode,
> -		 it's often better to continue until we hit Pmode, since the
> +	      /* See whether zero-based IV would ever generate all-false masks
> +		 before wrapping around.  */
> +	      bool might_wrap_p = (iv_limit == -1 || (iv_precision > cmp_bits));

With the above change, the iv_limit check would no longer be needed.

> +	      /* Stop as soon as we find a valid mode.  If we decided to use
> +		 cmp_type which is less than Pmode precision, it is often better
> +		 to use iv_type corresponding to Pmode, since the
>  		 operands to the WHILE are more likely to be reusable in
> -		 address calculations.  */
> +		 address calculations in this case.  */
>  	      cmp_type = this_type;
> +	      iv_type = this_type;
>  	      if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
>  		break;
> +	      if (!might_wrap_p)
> +		{
> +		  iv_type
> +		    = build_nonstandard_integer_type (GET_MODE_BITSIZE (Pmode),
> +						      true);
> +		  break;
> +		}

I think the loop should break in the same place as before, with the
iv_type being what used to be the cmp_type.  The new behaviour is that
(for the new meaning of cmp_type) we keep the current cmp_type if its
precision is already >= iv_precision.

Thanks,
Richard
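p.s. For illustration only, a rough and untested sketch of the loop body
structure described above, keeping the Pmode break where it was and
assuming iv_precision is the unsigned int suggested earlier:

	  if (this_type
	      && can_produce_all_loop_masks_p (loop_vinfo, this_type))
	    {
	      /* Prefer the widest convenient IV type (up to Pmode), since
		 a wider IV is more likely to be reusable in address
		 calculations.  Only widen cmp_type while it cannot yet
		 represent the IV limit.  */
	      if (!cmp_type || TYPE_PRECISION (cmp_type) < iv_precision)
		cmp_type = this_type;
	      iv_type = this_type;
	      if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
		break;
	    }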