Kugan Vivekanandarajah <kugan.vivekanandara...@linaro.org> writes: > @@ -609,8 +615,14 @@ vect_set_loop_masks_directly (struct loop *loop, > loop_vec_info loop_vinfo, > > /* Get the mask value for the next iteration of the loop. */ > next_mask = make_temp_ssa_name (mask_type, NULL, "next_mask"); > - gcall *call = vect_gen_while (next_mask, test_index, this_test_limit); > - gsi_insert_before (test_gsi, call, GSI_SAME_STMT); > + tree test_index_cmp_type = make_ssa_name (compare_type); > + gimple *conv_stmt = gimple_build_assign (test_index_cmp_type, > + NOP_EXPR, > + test_index); > + gsi_insert_before (test_gsi, conv_stmt, GSI_NEW_STMT);
We only need to convert once, so this should happen before the FOR_EACH_VEC_ELT_REVERSE loop. Would be better as: gimple_seq test_seq = NULL; test_index = gimple_convert (&test_seq, compare_type, test_index); gsi_insert_seq_before (test_gsi, test_seq, GSI_SAME_STMT); so that we don't generate unnecessary converts. > + gcall *call = vect_gen_while (next_mask, test_index_cmp_type, > + this_test_limit); > + gsi_insert_after (test_gsi, call, GSI_SAME_STMT); > > vect_set_loop_mask (loop, mask, init_mask, next_mask); > } > @@ -637,12 +649,12 @@ vect_set_loop_condition_masked (struct loop *loop, > loop_vec_info loop_vinfo, > > tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo); > unsigned int compare_precision = TYPE_PRECISION (compare_type); > - unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo); > tree orig_niters = niters; > > /* Type of the initial value of NITERS. */ > tree ni_actual_type = TREE_TYPE (niters); > unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type); > + tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo); > > /* Convert NITERS to the same size as the compare. */ > if (compare_precision > ni_actual_precision > @@ -661,33 +673,7 @@ vect_set_loop_condition_masked (struct loop *loop, > loop_vec_info loop_vinfo, > else > niters = gimple_convert (&preheader_seq, compare_type, niters); > > - /* Convert skip_niters to the right type. */ > - tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo); > - > - /* Now calculate the value that the induction variable must be able > - to hit in order to ensure that we end the loop with an all-false mask. > - This involves adding the maximum number of inactive trailing scalar > - iterations. */ > - widest_int iv_limit; > - bool known_max_iters = max_loop_iterations (loop, &iv_limit); > - if (known_max_iters) > - { > - if (niters_skip) > - { > - /* Add the maximum number of skipped iterations to the > - maximum iteration count. 
*/ > - if (TREE_CODE (niters_skip) == INTEGER_CST) > - iv_limit += wi::to_widest (niters_skip); > - else > - iv_limit += max_vf - 1; > - } > - /* IV_LIMIT is the maximum number of latch iterations, which is also > - the maximum in-range IV value. Round this value down to the previous > - vector alignment boundary and then add an extra full iteration. */ > - poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); > - iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf; > - } > - > + widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo); > /* Get the vectorization factor in tree form. */ > tree vf = build_int_cst (compare_type, > LOOP_VINFO_VECT_FACTOR (loop_vinfo)); > @@ -717,7 +703,7 @@ vect_set_loop_condition_masked (struct loop *loop, > loop_vec_info loop_vinfo, > /* See whether zero-based IV would ever generate all-false masks > before wrapping around. */ > bool might_wrap_p > - = (!known_max_iters > + = (iv_limit == UINT_MAX Shouldn't this be == -1? > || (wi::min_precision (iv_limit * rgm->max_nscalars_per_iter, > UNSIGNED) > > compare_precision)); > diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c > index 4942c69..1240037 100644 > --- a/gcc/tree-vect-loop.c > +++ b/gcc/tree-vect-loop.c > @@ -1029,7 +1029,10 @@ static bool > vect_verify_full_masking (loop_vec_info loop_vinfo) > { > struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); > + tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)); > unsigned int min_ni_width; > + unsigned int max_nscalars_per_iter > + = vect_get_max_nscalars_per_iter (loop_vinfo); > > /* Use a normal loop if there are no statements that need masking. > This only happens in rare degenerate cases: it means that the loop > @@ -1048,7 +1051,7 @@ vect_verify_full_masking (loop_vec_info loop_vinfo) > max_ni = wi::smin (max_ni, max_back_edges + 1); > > /* Account for rgroup masks, in which each bit is replicated N times. 
*/ > - max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo); > + max_ni *= max_nscalars_per_iter; > > /* Work out how many bits we need to represent the limit. */ > min_ni_width = wi::min_precision (max_ni, UNSIGNED); > @@ -1056,6 +1059,14 @@ vect_verify_full_masking (loop_vec_info loop_vinfo) > /* Find a scalar mode for which WHILE_ULT is supported. */ > opt_scalar_int_mode cmp_mode_iter; > tree cmp_type = NULL_TREE; > + tree iv_type = NULL_TREE; > + widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo); > + widest_int iv_precision = UINT_MAX; > + > + if (iv_limit != UINT_MAX) Same for != -1 here. > + iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter, > + UNSIGNED); > + > FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT) > { > unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ()); > @@ -1066,11 +1077,18 @@ vect_verify_full_masking (loop_vec_info loop_vinfo) > if (this_type > && can_produce_all_loop_masks_p (loop_vinfo, this_type)) > { > - /* Although we could stop as soon as we find a valid mode, > - it's often better to continue until we hit Pmode, since the > + /* See whether zero-based IV would ever generate all-false masks > + before wrapping around. */ > + bool might_wrap_p = (iv_precision > cmp_bits); > + /* Stop as soon as we find a valid mode. If we decided to use > + cmp_type which is less than Pmode precision, it is often better > + to use iv_type corresponding to Pmode, since the > operands to the WHILE are more likely to be reusable in > - address calculations. */ > - cmp_type = this_type; > + address calculations in this case. */ > + iv_type = this_type; > + if (might_wrap_p > + || (cmp_bits <= TYPE_PRECISION (niters_type))) > + cmp_type = this_type; It's not obvious that this always sets cmp_type when it could be set. I think instead we should have: if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type)) cmp_type = this_type; Thanks, Richard