Kugan Vivekanandarajah <kugan.vivekanandara...@linaro.org> writes:
> @@ -609,8 +615,14 @@ vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,
>  
>        /* Get the mask value for the next iteration of the loop.  */
>        next_mask = make_temp_ssa_name (mask_type, NULL, "next_mask");
> -      gcall *call = vect_gen_while (next_mask, test_index, this_test_limit);
> -      gsi_insert_before (test_gsi, call, GSI_SAME_STMT);
> +      tree test_index_cmp_type = make_ssa_name (compare_type);
> +      gimple *conv_stmt = gimple_build_assign (test_index_cmp_type,
> +                                            NOP_EXPR,
> +                                            test_index);
> +      gsi_insert_before (test_gsi, conv_stmt, GSI_NEW_STMT);

We only need to convert once, so this should happen before the
FOR_EACH_VEC_ELT_REVERSE loop.  Would be better as:

  gimple_seq test_seq = NULL;
  test_index = gimple_convert (&test_seq, compare_type, test_index);
  gsi_insert_seq_before (test_gsi, test_seq, GSI_SAME_STMT);

so that we don't generate unnecessary converts.
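
With the conversion hoisted like that, the code inside the loop can keep
passing the (already converted) test_index straight to vect_gen_while,
e.g. a rough sketch:

      gcall *call = vect_gen_while (next_mask, test_index, this_test_limit);
      gsi_insert_before (test_gsi, call, GSI_SAME_STMT);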

> +      gcall *call = vect_gen_while (next_mask, test_index_cmp_type,
> +                                 this_test_limit);
> +      gsi_insert_after (test_gsi, call, GSI_SAME_STMT);
>  
>        vect_set_loop_mask (loop, mask, init_mask, next_mask);
>      }
> @@ -637,12 +649,12 @@ vect_set_loop_condition_masked (struct loop *loop, loop_vec_info loop_vinfo,
>  
>    tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
>    unsigned int compare_precision = TYPE_PRECISION (compare_type);
> -  unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
>    tree orig_niters = niters;
>  
>    /* Type of the initial value of NITERS.  */
>    tree ni_actual_type = TREE_TYPE (niters);
>    unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type);
> +  tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
>  
>    /* Convert NITERS to the same size as the compare.  */
>    if (compare_precision > ni_actual_precision
> @@ -661,33 +673,7 @@ vect_set_loop_condition_masked (struct loop *loop, loop_vec_info loop_vinfo,
>    else
>      niters = gimple_convert (&preheader_seq, compare_type, niters);
>  
> -  /* Convert skip_niters to the right type.  */
> -  tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
> -
> -  /* Now calculate the value that the induction variable must be able
> -     to hit in order to ensure that we end the loop with an all-false mask.
> -     This involves adding the maximum number of inactive trailing scalar
> -     iterations.  */
> -  widest_int iv_limit;
> -  bool known_max_iters = max_loop_iterations (loop, &iv_limit);
> -  if (known_max_iters)
> -    {
> -      if (niters_skip)
> -     {
> -       /* Add the maximum number of skipped iterations to the
> -          maximum iteration count.  */
> -       if (TREE_CODE (niters_skip) == INTEGER_CST)
> -         iv_limit += wi::to_widest (niters_skip);
> -       else
> -         iv_limit += max_vf - 1;
> -     }
> -      /* IV_LIMIT is the maximum number of latch iterations, which is also
> -      the maximum in-range IV value.  Round this value down to the previous
> -      vector alignment boundary and then add an extra full iteration.  */
> -      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
> -      iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
> -    }
> -
> +  widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo);
>    /* Get the vectorization factor in tree form.  */
>    tree vf = build_int_cst (compare_type,
>                          LOOP_VINFO_VECT_FACTOR (loop_vinfo));
> @@ -717,7 +703,7 @@ vect_set_loop_condition_masked (struct loop *loop, loop_vec_info loop_vinfo,
>       /* See whether zero-based IV would ever generate all-false masks
>          before wrapping around.  */
>       bool might_wrap_p
> -       = (!known_max_iters
> +       = (iv_limit == UINT_MAX

Shouldn't this be == -1?
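
I.e., assuming vect_iv_limit_for_full_masking returns -1 when the number
of iterations is unbounded, something like:

       bool might_wrap_p
         = (iv_limit == -1
            || (wi::min_precision (iv_limit * rgm->max_nscalars_per_iter,
                                   UNSIGNED)
                > compare_precision));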

>            || (wi::min_precision (iv_limit * rgm->max_nscalars_per_iter,
>                                   UNSIGNED)
>                > compare_precision));
> diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
> index 4942c69..1240037 100644
> --- a/gcc/tree-vect-loop.c
> +++ b/gcc/tree-vect-loop.c
> @@ -1029,7 +1029,10 @@ static bool
>  vect_verify_full_masking (loop_vec_info loop_vinfo)
>  {
>    struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
> +  tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
>    unsigned int min_ni_width;
> +  unsigned int max_nscalars_per_iter
> +    = vect_get_max_nscalars_per_iter (loop_vinfo);
>  
>    /* Use a normal loop if there are no statements that need masking.
>       This only happens in rare degenerate cases: it means that the loop
> @@ -1048,7 +1051,7 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
>      max_ni = wi::smin (max_ni, max_back_edges + 1);
>  
>    /* Account for rgroup masks, in which each bit is replicated N times.  */
> -  max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);
> +  max_ni *= max_nscalars_per_iter;
>  
>    /* Work out how many bits we need to represent the limit.  */
>    min_ni_width = wi::min_precision (max_ni, UNSIGNED);
> @@ -1056,6 +1059,14 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
>    /* Find a scalar mode for which WHILE_ULT is supported.  */
>    opt_scalar_int_mode cmp_mode_iter;
>    tree cmp_type = NULL_TREE;
> +  tree iv_type = NULL_TREE;
> +  widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo);
> +  widest_int iv_precision = UINT_MAX;
> +
> +  if (iv_limit != UINT_MAX)

Same for != -1 here.
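
E.g., with the same -1 sentinel:

    if (iv_limit != -1)
      iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
                                        UNSIGNED);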

> +    iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
> +                                   UNSIGNED);
> +
>    FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
>      {
>        unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
> @@ -1066,11 +1077,18 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
>         if (this_type
>             && can_produce_all_loop_masks_p (loop_vinfo, this_type))
>           {
> -           /* Although we could stop as soon as we find a valid mode,
> -              it's often better to continue until we hit Pmode, since the
> +           /* See whether zero-based IV would ever generate all-false masks
> +              before wrapping around.  */
> +           bool might_wrap_p = (iv_precision > cmp_bits);
> +           /* Stop as soon as we find a valid mode.  If we decided to use
> +              cmp_type which is less than Pmode precision, it is often better
> +              to use iv_type corresponding to Pmode, since the
>                operands to the WHILE are more likely to be reusable in
> -              address calculations.  */
> -           cmp_type = this_type;
> +              address calculations in this case.  */
> +           iv_type = this_type;
> +           if (might_wrap_p
> +               || (cmp_bits <= TYPE_PRECISION (niters_type)))
> +             cmp_type = this_type;

It's not obvious that this always sets cmp_type when it could be set.
I think instead we should have:

              if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
                cmp_type = this_type;
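
i.e. a rough sketch of how the whole selection would then read (untested):

            if (this_type
                && can_produce_all_loop_masks_p (loop_vinfo, this_type))
              {
                /* Track the widest type we can use for the IV, and only
                   widen cmp_type while its precision is still too small
                   for the IV range.  */
                iv_type = this_type;
                if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
                  cmp_type = this_type;
              }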

Thanks,
Richard
