Thanks for trying it.  I'm still surprised that no multiplication
is needed though.  Does the patch work for:

short x[100];
int y[200];

void f() {
  for (int i = 0, j = 0; i < 100; i += 2, j += 4) {
    x[i + 0] += 1;
    x[i + 1] += 2;
    y[j + 0] += 1;
    y[j + 1] += 2;
    y[j + 2] += 3;
    y[j + 3] += 4;
  }
}

?  Here, there should be a single-control rgroup for x, counting
2 units per scalar iteration.  I'd expect the IV to use this scale.

There should also be a 4-control rgroup for y, counting 4 units per
scalar iteration.  So I think the IV would need to be multiplied by 2
before being used for the y rgroup.

Thanks,
Richard

juzhe.zh...@rivai.ai writes:
> From: Ju-Zhe Zhong <juzhe.zh...@rivai.ai>
>
> This patch is supporting decrement IV by following the flow designed by 
> Richard:
>
> (1) In vect_set_loop_condition_partial_vectors, for the first iteration of:
>     call vect_set_loop_controls_directly.
>
> (2) vect_set_loop_controls_directly calculates "step" as in your patch.
> If rgc has 1 control, this step is the SSA name created for that control.
> Otherwise the step is a fresh SSA name, as in your patch.
>
> (3) vect_set_loop_controls_directly stores this step somewhere for later
> use, probably in LOOP_VINFO.  Let's use "S" to refer to this stored step.
>
> (4) After the vect_set_loop_controls_directly call above, and outside
> the "if" statement that now contains vect_set_loop_controls_directly,
> check whether rgc->controls.length () > 1.  If so, use
> vect_adjust_loop_lens_control to set the controls based on S.
>
> Then the only caller of vect_adjust_loop_lens_control is
> vect_set_loop_condition_partial_vectors.  And the starting
> step for vect_adjust_loop_lens_control is always S.
>
> This patch has well tested for single-rgroup and multiple-rgroup (SLP) and
> passed all testcase in RISC-V port.
>
> Also, pass tests for multiple-rgroup (non-SLP) tested on vec_pack_trunk.
>
> ---
>  gcc/tree-vect-loop-manip.cc | 178 +++++++++++++++++++++++++++++++++---
>  gcc/tree-vect-loop.cc       |  13 +++
>  gcc/tree-vectorizer.h       |  12 +++
>  3 files changed, 192 insertions(+), 11 deletions(-)
>
> diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
> index ff6159e08d5..578ac5b783e 100644
> --- a/gcc/tree-vect-loop-manip.cc
> +++ b/gcc/tree-vect-loop-manip.cc
> @@ -468,6 +468,38 @@ vect_set_loop_controls_directly (class loop *loop, 
> loop_vec_info loop_vinfo,
>    gimple_stmt_iterator incr_gsi;
>    bool insert_after;
>    standard_iv_increment_position (loop, &incr_gsi, &insert_after);
> +  if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
> +    {
> +      /* single rgroup:
> +      ...
> +      _10 = (unsigned long) count_12(D);
> +      ...
> +      # ivtmp_9 = PHI <ivtmp_35(6), _10(5)>
> +      _36 = MIN_EXPR <ivtmp_9, POLY_INT_CST [4, 4]>;
> +      ...
> +      vect__4.8_28 = .LEN_LOAD (_17, 32B, _36, 0);
> +      ...
> +      ivtmp_35 = ivtmp_9 - _36;
> +      ...
> +      if (ivtmp_35 != 0)
> +        goto <bb 4>; [83.33%]
> +      else
> +        goto <bb 5>; [16.67%]
> +      */
> +      nitems_total = gimple_convert (preheader_seq, iv_type, nitems_total);
> +      tree step = rgc->controls.length () == 1 ? rgc->controls[0]
> +                                            : make_ssa_name (iv_type);
> +      /* Create decrement IV.  */
> +      create_iv (nitems_total, MINUS_EXPR, step, NULL_TREE, loop, &incr_gsi,
> +              insert_after, &index_before_incr, &index_after_incr);
> +      gimple_seq_add_stmt (header_seq, gimple_build_assign (step, MIN_EXPR,
> +                                                         index_before_incr,
> +                                                         nitems_step));
> +      LOOP_VINFO_DECREMENTING_IV_STEP (loop_vinfo) = step;
> +      return index_after_incr;
> +    }
> +
> +  /* Create increment IV.  */
>    create_iv (build_int_cst (iv_type, 0), PLUS_EXPR, nitems_step, NULL_TREE,
>            loop, &incr_gsi, insert_after, &index_before_incr,
>            &index_after_incr);
> @@ -683,6 +715,63 @@ vect_set_loop_controls_directly (class loop *loop, 
> loop_vec_info loop_vinfo,
>    return next_ctrl;
>  }
>  
> +/* Try to use adjust loop lens for multiple-rgroups.
> +
> +     _36 = MIN_EXPR <ivtmp_34, VF>;
> +
> +     First length (MIN (X, VF/N)):
> +       loop_len_15 = MIN_EXPR <_36, VF/N>;
> +
> +     Second length:
> +       tmp = _36 - loop_len_15;
> +       loop_len_16 = MIN (tmp, VF/N);
> +
> +     Third length:
> +       tmp2 = tmp - loop_len_16;
> +       loop_len_17 = MIN (tmp2, VF/N);
> +
> +     Last length:
> +       loop_len_18 = tmp2 - loop_len_17;
> +*/
> +
> +static void
> +vect_adjust_loop_lens_control (tree iv_type, gimple_seq *seq,
> +                            rgroup_controls *dest_rgm, tree step)
> +{
> +  tree ctrl_type = dest_rgm->type;
> +  poly_uint64 nitems_per_ctrl
> +    = TYPE_VECTOR_SUBPARTS (ctrl_type) * dest_rgm->factor;
> +  tree length_limit = build_int_cst (iv_type, nitems_per_ctrl);
> +
> +  for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
> +    {
> +      tree ctrl = dest_rgm->controls[i];
> +      if (i == 0)
> +     {
> +       /* First iteration: MIN (X, VF/N) capped to the range [0, VF/N].  */
> +       gassign *assign
> +         = gimple_build_assign (ctrl, MIN_EXPR, step, length_limit);
> +       gimple_seq_add_stmt (seq, assign);
> +     }
> +      else if (i == dest_rgm->controls.length () - 1)
> +     {
> +       /* Last iteration: Remain capped to the range [0, VF/N].  */
> +       gassign *assign = gimple_build_assign (ctrl, MINUS_EXPR, step,
> +                                              dest_rgm->controls[i - 1]);
> +       gimple_seq_add_stmt (seq, assign);
> +     }
> +      else
> +     {
> +       /* (MIN (remain, VF*I/N)) capped to the range [0, VF/N].  */
> +       step = gimple_build (seq, MINUS_EXPR, iv_type, step,
> +                            dest_rgm->controls[i - 1]);
> +       gassign *assign
> +         = gimple_build_assign (ctrl, MIN_EXPR, step, length_limit);
> +       gimple_seq_add_stmt (seq, assign);
> +     }
> +    }
> +}
> +
>  /* Set up the iteration condition and rgroup controls for LOOP, given
>     that LOOP_VINFO_USING_PARTIAL_VECTORS_P is true for the vectorized
>     loop.  LOOP_VINFO describes the vectorization of LOOP.  NITERS is
> @@ -753,17 +842,84 @@ vect_set_loop_condition_partial_vectors (class loop 
> *loop,
>             continue;
>         }
>  
> -     /* See whether zero-based IV would ever generate all-false masks
> -        or zero length before wrapping around.  */
> -     bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);
> -
> -     /* Set up all controls for this group.  */
> -     test_ctrl = vect_set_loop_controls_directly (loop, loop_vinfo,
> -                                                  &preheader_seq,
> -                                                  &header_seq,
> -                                                  loop_cond_gsi, rgc,
> -                                                  niters, niters_skip,
> -                                                  might_wrap_p);
> +     if (!LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)
> +         || !LOOP_VINFO_DECREMENTING_IV_STEP (loop_vinfo))
> +       {
> +         /* See whether zero-based IV would ever generate all-false masks
> +            or zero length before wrapping around.  */
> +         bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);
> +
> +         /* Set up all controls for this group.  */
> +         test_ctrl
> +           = vect_set_loop_controls_directly (loop, loop_vinfo,
> +                                              &preheader_seq, &header_seq,
> +                                              loop_cond_gsi, rgc, niters,
> +                                              niters_skip, might_wrap_p);
> +       }
> +
> +     /* Decrement IV only run vect_set_loop_controls_directly once.  */
> +     if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)
> +         && rgc->controls.length () > 1)
> +       {
> +         /*
> +            - Multiple rgroup (SLP):
> +              ...
> +              _38 = (unsigned long) bnd.7_29;
> +              _39 = _38 * 2;
> +              ...
> +              # ivtmp_41 = PHI <ivtmp_42(6), _39(5)>
> +              ...
> +              _43 = MIN_EXPR <ivtmp_41, 32>;
> +              loop_len_26 = MIN_EXPR <_43, 16>;
> +              loop_len_25 = _43 - loop_len_26;
> +              ...
> +              .LEN_STORE (_6, 8B, loop_len_26, ...);
> +              ...
> +              .LEN_STORE (_25, 8B, loop_len_25, ...);
> +              _33 = loop_len_26 / 2;
> +              ...
> +              .LEN_STORE (_8, 16B, _33, ...);
> +              _36 = loop_len_25 / 2;
> +              ...
> +              .LEN_STORE (_15, 16B, _36, ...);
> +              ivtmp_42 = ivtmp_41 - _43;
> +              ...
> +
> +            - Multiple rgroup (non-SLP):
> +              ...
> +              _38 = (unsigned long) n_12(D);
> +              ...
> +              # ivtmp_38 = PHI <ivtmp_39(3), 100(2)>
> +              ...
> +              _40 = MIN_EXPR <ivtmp_38, POLY_INT_CST [8, 8]>;
> +              loop_len_21 = MIN_EXPR <_40, POLY_INT_CST [2, 2]>;
> +              _41 = _40 - loop_len_21;
> +              loop_len_20 = MIN_EXPR <_41, POLY_INT_CST [2, 2]>;
> +              _42 = _40 - loop_len_20;
> +              loop_len_19 = MIN_EXPR <_42, POLY_INT_CST [2, 2]>;
> +              _43 = _40 - loop_len_19;
> +              loop_len_16 = MIN_EXPR <_43, POLY_INT_CST [2, 2]>;
> +              ...
> +              vect__4.8_15 = .LEN_LOAD (_6, 64B, loop_len_21, 0);
> +              ...
> +              vect__4.9_8 = .LEN_LOAD (_13, 64B, loop_len_20, 0);
> +              ...
> +              vect__4.10_28 = .LEN_LOAD (_46, 64B, loop_len_19, 0);
> +              ...
> +              vect__4.11_30 = .LEN_LOAD (_49, 64B, loop_len_16, 0);
> +              vect__7.13_31 = VEC_PACK_TRUNC_EXPR <...>,
> +              vect__7.13_32 = VEC_PACK_TRUNC_EXPR <...>;
> +              vect__7.12_33 = VEC_PACK_TRUNC_EXPR <...>;
> +              ...
> +              .LEN_STORE (_14, 16B, _40, vect__7.12_33, 0);
> +              ivtmp_39 = ivtmp_38 - _40;
> +              ...
> +         */
> +         tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
> +         tree step = LOOP_VINFO_DECREMENTING_IV_STEP (loop_vinfo);
> +         gcc_assert (step);
> +         vect_adjust_loop_lens_control (iv_type, &header_seq, rgc, step);
> +       }
>        }
>  
>    /* Emit all accumulated statements.  */
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index cf10132b0bf..456f50fa7cc 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -973,6 +973,8 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, 
> vec_info_shared *shared)
>      vectorizable (false),
>      can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
>      using_partial_vectors_p (false),
> +    using_decrementing_iv_p (false),
> +    decrementing_iv_step (NULL_TREE),
>      epil_using_partial_vectors_p (false),
>      partial_load_store_bias (0),
>      peeling_for_gaps (false),
> @@ -2725,6 +2727,17 @@ start_over:
>        && !vect_verify_loop_lens (loop_vinfo))
>      LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
>  
> +  /* If we're vectorizing an loop that uses length "controls" and
> +     can iterate more than once, we apply decrementing IV approach
> +     in loop control.  */
> +  if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
> +      && !LOOP_VINFO_LENS (loop_vinfo).is_empty ()
> +      && LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) == 0
> +      && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
> +        && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo),
> +                     LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
> +    LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true;
> +
>    /* If we're vectorizing an epilogue loop, the vectorized loop either needs
>       to be able to handle fewer than VF scalars, or needs to have a lower VF
>       than the main loop.  */
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> index 02d2ad6fba1..7ed079f543a 100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -818,6 +818,16 @@ public:
>       the vector loop can handle fewer than VF scalars.  */
>    bool using_partial_vectors_p;
>  
> +  /* True if we've decided to use a decrementing loop control IV that counts
> +     scalars. This can be done for any loop that:
> +
> +     (a) uses length "controls"; and
> +     (b) can iterate more than once.  */
> +  bool using_decrementing_iv_p;
> +
> +  /* The variable amount step for decrement IV.  */
> +  tree decrementing_iv_step;
> +
>    /* True if we've decided to use partially-populated vectors for the
>       epilogue of loop.  */
>    bool epil_using_partial_vectors_p;
> @@ -890,6 +900,8 @@ public:
>  #define LOOP_VINFO_VECTORIZABLE_P(L)       (L)->vectorizable
>  #define LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P(L) 
> (L)->can_use_partial_vectors_p
>  #define LOOP_VINFO_USING_PARTIAL_VECTORS_P(L) (L)->using_partial_vectors_p
> +#define LOOP_VINFO_USING_DECREMENTING_IV_P(L) (L)->using_decrementing_iv_p
> +#define LOOP_VINFO_DECREMENTING_IV_STEP(L) (L)->decrementing_iv_step
>  #define LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P(L)                           
>   \
>    (L)->epil_using_partial_vectors_p
>  #define LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS(L) (L)->partial_load_store_bias

Reply via email to