From: Ju-Zhe Zhong <juzhe.zh...@rivai.ai>

1. Fix document description according to Jeff && Richard.
2. Add LOOP_VINFO_USING_SELECT_VL_P for single rgroup.
3. Add LOOP_VINFO_USING_SLP_ADJUSTED_LEN_P for SLP multiple rgroup.

Fix bugs for V5 after testing:
https://gcc.gnu.org/pipermail/gcc-patches/2023-May/618209.html

gcc/ChangeLog:

        * doc/md.texi: Add select_vl pattern.
        * internal-fn.def (SELECT_VL): New ifn.
        * optabs.def (OPTAB_D): New optab.
        * tree-vect-loop-manip.cc (vect_adjust_loop_lens): New function.
        (vect_set_loop_controls_by_select_vl): Ditto.
        (vect_set_loop_condition_partial_vectors): Add loop control for
        decrement IV.
        * tree-vect-loop.cc (vect_get_loop_len): Adjust loop len for SLP.
        * tree-vect-stmts.cc (get_select_vl_data_ref_ptr): New function.
        (vectorizable_store): Support data reference IV added by outcome of
        SELECT_VL.
        (vectorizable_load): Ditto.
        * tree-vectorizer.h (LOOP_VINFO_USING_SELECT_VL_P): New macro.
        (LOOP_VINFO_USING_SLP_ADJUSTED_LEN_P): Ditto.
        (vect_get_loop_len): Adjust loop len for SLP.

---
 gcc/doc/md.texi             |  36 ++++
 gcc/internal-fn.def         |   1 +
 gcc/optabs.def              |   1 +
 gcc/tree-vect-loop-manip.cc | 380 +++++++++++++++++++++++++++++++++++-
 gcc/tree-vect-loop.cc       |  31 ++-
 gcc/tree-vect-stmts.cc      |  79 +++++++-
 gcc/tree-vectorizer.h       |  12 +-
 7 files changed, 526 insertions(+), 14 deletions(-)

diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
index 8ebce31ba78..a94ffc4456d 100644
--- a/gcc/doc/md.texi
+++ b/gcc/doc/md.texi
@@ -4974,6 +4974,42 @@ for (i = 1; i < operand3; i++)
   operand0[i] = operand0[i - 1] && (operand1 + i < operand2);
 @end smallexample
 
+@cindex @code{select_vl@var{m}} instruction pattern
+@item @code{select_vl@var{m}}
+Set operand 0 to the number of active elements in a vector to be updated 
+in a loop iteration based on the total number of elements to be updated, 
+the vectorization factor and vector properties of the target.
+Operand 1 is the total number of elements in the vector to be updated.
+Operand 2 is the vectorization factor.
+The value of operand 0 is target dependent and flexible in each iteration.
+The operation of this pattern can be:
+
+@smallexample
+Case 1:
+operand0 = MIN (operand1, operand2);
+operand2 can be const_poly_int or poly_int related to vector mode size.
+Some targets like RISC-V have a standalone instruction to get MIN (n, MODE SIZE) so
+that we can reduce the use of a general purpose register.
+
+In this case, only the last iteration of the loop is partial iteration.
+@end smallexample
+
+@smallexample
+Case 2:
+if (operand1 <= operand2)
+  operand0 = operand1;
+else if (operand1 < 2 * operand2)
+  operand0 = ceil (operand1 / 2);
+else
+  operand0 = operand2;
+
+This case will evenly distribute work over the last 2 iterations of a stripmine loop.
+@end smallexample
+
+The output of this pattern is not only used as the IV of the loop control counter, but
+is also used as the IV of address calculation with a multiply/shift operation.  This allows
+dynamic adjustment of the number of elements processed in each loop iteration.
+
 @cindex @code{check_raw_ptrs@var{m}} instruction pattern
 @item @samp{check_raw_ptrs@var{m}}
 Check whether, given two pointers @var{a} and @var{b} and a length @var{len},
diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index 7fe742c2ae7..6f6fa7d37f9 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -153,6 +153,7 @@ DEF_INTERNAL_OPTAB_FN (VEC_SET, 0, vec_set, vec_set)
 DEF_INTERNAL_OPTAB_FN (LEN_STORE, 0, len_store, len_store)
 
 DEF_INTERNAL_OPTAB_FN (WHILE_ULT, ECF_CONST | ECF_NOTHROW, while_ult, while)
+DEF_INTERNAL_OPTAB_FN (SELECT_VL, ECF_CONST | ECF_NOTHROW, select_vl, binary)
 DEF_INTERNAL_OPTAB_FN (CHECK_RAW_PTRS, ECF_CONST | ECF_NOTHROW,
                       check_raw_ptrs, check_ptrs)
 DEF_INTERNAL_OPTAB_FN (CHECK_WAR_PTRS, ECF_CONST | ECF_NOTHROW,
diff --git a/gcc/optabs.def b/gcc/optabs.def
index 695f5911b30..b637471b76e 100644
--- a/gcc/optabs.def
+++ b/gcc/optabs.def
@@ -476,3 +476,4 @@ OPTAB_DC (vec_series_optab, "vec_series$a", VEC_SERIES)
 OPTAB_D (vec_shl_insert_optab, "vec_shl_insert_$a")
 OPTAB_D (len_load_optab, "len_load_$a")
 OPTAB_D (len_store_optab, "len_store_$a")
+OPTAB_D (select_vl_optab, "select_vl$a")
diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
index ff6159e08d5..81334f4f171 100644
--- a/gcc/tree-vect-loop-manip.cc
+++ b/gcc/tree-vect-loop-manip.cc
@@ -385,6 +385,353 @@ vect_maybe_permute_loop_masks (gimple_seq *seq, 
rgroup_controls *dest_rgm,
   return false;
 }
 
+/* Derive DEST_RGM's loop lens from SRC_RGM for non-SLP multiple-rgroups.
+
+     _36 = MIN_EXPR <ivtmp_34, POLY_INT_CST [8, 8]>;
+
+     First length (MIN (X, VF/N)):
+       loop_len_15 = MIN_EXPR <_36, POLY_INT_CST [2, 2]>;
+
+     Second length (X - MIN (X, 1 * VF/N)):
+       loop_len_16 = _36 - loop_len_15;
+
+     Third length (X - MIN (X, 2 * VF/N)):
+       _38 = MIN_EXPR <_36, POLY_INT_CST [4, 4]>;
+       loop_len_17 = _36 - _38;
+
+     Fourth length (X - MIN (X, 3 * VF/N)):
+       _39 = MIN_EXPR <_36, POLY_INT_CST [6, 6]>;
+       loop_len_18 = _36 - _39;  */
+
+static void
+vect_adjust_loop_lens (tree iv_type, gimple_seq *seq, rgroup_controls 
*dest_rgm,
+                      rgroup_controls *src_rgm)
+{
+  tree ctrl_type = dest_rgm->type;
+  poly_uint64 nitems_per_ctrl
+    = TYPE_VECTOR_SUBPARTS (ctrl_type) * dest_rgm->factor;
+
+  for (unsigned int i = 0; i < dest_rgm->controls.length (); ++i)
+    {
+      tree src = src_rgm->controls[i / dest_rgm->controls.length ()];
+      tree dest = dest_rgm->controls[i];
+      gassign *stmt;
+      if (i == 0)
+       {
+         /* MIN (X, VF*I/N) capped to the range [0, VF/N].  */
+         tree factor = build_int_cst (iv_type, nitems_per_ctrl);
+         stmt = gimple_build_assign (dest, MIN_EXPR, src, factor);
+         gimple_seq_add_stmt (seq, stmt);
+       }
+      else
+       {
+         /* (X - MIN (X, VF*I/N)) capped to the range [0, VF/N].  */
+         tree factor = build_int_cst (iv_type, nitems_per_ctrl * i);
+         tree temp = make_ssa_name (iv_type);
+         stmt = gimple_build_assign (temp, MIN_EXPR, src, factor);
+         gimple_seq_add_stmt (seq, stmt);
+         stmt = gimple_build_assign (dest, MINUS_EXPR, src, temp);
+         gimple_seq_add_stmt (seq, stmt);
+       }
+    }
+}
+
+/* Helper for vect_set_loop_condition_partial_vectors.  Generate definitions
+   for all the rgroup controls in RGC and return a control that is nonzero
+   when the loop needs to iterate.  Add any new preheader statements to
+   PREHEADER_SEQ.  Use LOOP_COND_GSI to insert code before the exit gcond.
+
+   RGC belongs to loop LOOP.  The loop originally iterated NITERS
+   times and has been vectorized according to LOOP_VINFO.
+
+   This differs from vect_set_loop_controls_directly, which iterates from a
+   0-based IV up to TEST_LIMIT - bias.
+
+   In vect_set_loop_controls_by_select_vl, we start iterating at
+   IV = TEST_LIMIT - bias and keep subtracting from IV the length calculated
+   by the IFN_SELECT_VL pattern.
+
+   1. Single rgroup, the Gimple IR should be:
+
+       # vectp_B.6_8 = PHI <vectp_B.6_13(6), &B(5)>
+       # vectp_B.8_16 = PHI <vectp_B.8_17(6), &B(5)>
+       # vectp_A.11_19 = PHI <vectp_A.11_20(6), &A(5)>
+       # vectp_A.13_22 = PHI <vectp_A.13_23(6), &A(5)>
+       # ivtmp_26 = PHI <ivtmp_27(6), _25(5)>
+       _28 = .SELECT_VL (ivtmp_26, POLY_INT_CST [4, 4]);
+       ivtmp_15 = _28 * 4;
+       vect__1.10_18 = .LEN_LOAD (vectp_B.8_16, 128B, _28, 0);
+       _1 = B[i_10];
+       .LEN_STORE (vectp_A.13_22, 128B, _28, vect__1.10_18, 0);
+       i_7 = i_10 + 1;
+       vectp_B.8_17 = vectp_B.8_16 + ivtmp_15;
+       vectp_A.13_23 = vectp_A.13_22 + ivtmp_15;
+       ivtmp_27 = ivtmp_26 - _28;
+       if (ivtmp_27 != 0)
+         goto <bb 6>; [83.33%]
+       else
+         goto <bb 7>; [16.67%]
+
+   Note: We use the outcome of .SELECT_VL to adjust both loop control IV and
+   data reference pointer IV.
+
+   1). The result of .SELECT_VL:
+       _28 = .SELECT_VL (ivtmp_26, POLY_INT_CST [4, 4]);
+       _28 need not be VF in every iteration; instead, we allow _28 to be
+       any value as long as _28 <= VF.  Such a flexible SELECT_VL pattern
+       allows targets to apply various optimizations in vector loop
+       iterations.  A target like RISC-V has a special application vector
+       length calculation instruction which can distribute the workload
+       evenly over the last 2 iterations.
+
+       Another example is that we can even allow generating _28 <= VF / 2
+       so that some machines can run vector code in low power mode.
+
+   2). Loop control IV:
+       ivtmp_27 = ivtmp_26 - _28;
+       if (ivtmp_27 != 0)
+        goto <bb 6>; [83.33%]
+       else
+        goto <bb 7>; [16.67%]
+
+       This is a saturating subtraction towards zero: the outcome of
+       .SELECT_VL guarantees that ivtmp_27 never underflows zero.
+
+   3). Data reference pointer IV:
+       ivtmp_15 = _28 * 4;
+       vectp_B.8_17 = vectp_B.8_16 + ivtmp_15;
+       vectp_A.13_23 = vectp_A.13_22 + ivtmp_15;
+
+       The pointer IV is adjusted accurately according to the .SELECT_VL.
+
+   2. Multiple rgroup, the Gimple IR should be:
+
+       # i_23 = PHI <i_20(6), 0(11)>
+       # vectp_f.8_51 = PHI <vectp_f.8_52(6), f_15(D)(11)>
+       # vectp_d.10_59 = PHI <vectp_d.10_60(6), d_18(D)(11)>
+       # ivtmp_70 = PHI <ivtmp_71(6), _69(11)>
+       # ivtmp_73 = PHI <ivtmp_74(6), _67(11)>
+       _72 = MIN_EXPR <ivtmp_70, 16>;
+       _75 = MIN_EXPR <ivtmp_73, 16>;
+       _1 = i_23 * 2;
+       _2 = (long unsigned int) _1;
+       _3 = _2 * 2;
+       _4 = f_15(D) + _3;
+       _5 = _2 + 1;
+       _6 = _5 * 2;
+       _7 = f_15(D) + _6;
+       .LEN_STORE (vectp_f.8_51, 128B, _75, { 1, 2, 1, 2, 1, 2, 1, 2 }, 0);
+       vectp_f.8_56 = vectp_f.8_51 + 16;
+       .LEN_STORE (vectp_f.8_56, 128B, _72, { 1, 2, 1, 2, 1, 2, 1, 2 }, 0);
+       _8 = (long unsigned int) i_23;
+       _9 = _8 * 4;
+       _10 = d_18(D) + _9;
+       _61 = _75 / 2;
+       .LEN_STORE (vectp_d.10_59, 128B, _61, { 3, 3, 3, 3 }, 0);
+       vectp_d.10_63 = vectp_d.10_59 + 16;
+       _64 = _72 / 2;
+       .LEN_STORE (vectp_d.10_63, 128B, _64, { 3, 3, 3, 3 }, 0);
+       i_20 = i_23 + 1;
+       vectp_f.8_52 = vectp_f.8_56 + 16;
+       vectp_d.10_60 = vectp_d.10_63 + 16;
+       ivtmp_74 = ivtmp_73 - _75;
+       ivtmp_71 = ivtmp_70 - _72;
+       if (ivtmp_74 != 0)
+         goto <bb 6>; [83.33%]
+       else
+         goto <bb 13>; [16.67%]
+
+   Note: We DO NOT use .SELECT_VL in SLP auto-vectorization for multiple
+   rgroups.  Instead, we use MIN_EXPR to guarantee we always use VF as the
+   iteration amount for multiple rgroups.
+
+   The analysis of the flow of multiple rgroups:
+       _72 = MIN_EXPR <ivtmp_70, 16>;
+       _75 = MIN_EXPR <ivtmp_73, 16>;
+       ...
+       .LEN_STORE (vectp_f.8_51, 128B, _75, { 1, 2, 1, 2, 1, 2, 1, 2 }, 0);
+       vectp_f.8_56 = vectp_f.8_51 + 16;
+       .LEN_STORE (vectp_f.8_56, 128B, _72, { 1, 2, 1, 2, 1, 2, 1, 2 }, 0);
+       ...
+       _61 = _75 / 2;
+       .LEN_STORE (vectp_d.10_59, 128B, _61, { 3, 3, 3, 3 }, 0);
+       vectp_d.10_63 = vectp_d.10_59 + 16;
+       _64 = _72 / 2;
+       .LEN_STORE (vectp_d.10_63, 128B, _64, { 3, 3, 3, 3 }, 0);
+
+  We use _72 = MIN_EXPR <ivtmp_70, 16>; to generate the number of the elements
+  to be processed in each iteration.
+
+  The related STOREs:
+    _72 = MIN_EXPR <ivtmp_70, 16>;
+    .LEN_STORE (vectp_f.8_56, 128B, _72, { 1, 2, 1, 2, 1, 2, 1, 2 }, 0);
+    _64 = _72 / 2;
+    .LEN_STORE (vectp_d.10_63, 128B, _64, { 3, 3, 3, 3 }, 0);
+  These 2 STOREs store 2 vectors, where the second vector has half as many
+  elements as the first vector, so the length of the second STORE is
+  _64 = _72 / 2.  This is similar to the VIEW_CONVERT handling of masks in SLP.
+
+  3. Multiple rgroups for non-SLP auto-vectorization.
+
+     # ivtmp_26 = PHI <ivtmp_27(4), _25(3)>
+     # ivtmp.35_10 = PHI <ivtmp.35_11(4), ivtmp.35_1(3)>
+     # ivtmp.36_2 = PHI <ivtmp.36_8(4), ivtmp.36_23(3)>
+     _28 = MIN_EXPR <ivtmp_26, POLY_INT_CST [8, 8]>;
+     loop_len_15 = MIN_EXPR <_28, POLY_INT_CST [4, 4]>;
+     loop_len_16 = _28 - loop_len_15;
+     _29 = (void *) ivtmp.35_10;
+     _7 = &MEM <vector([4,4]) int> [(int *)_29];
+     vect__1.25_17 = .LEN_LOAD (_7, 128B, loop_len_15, 0);
+     _33 = _29 + POLY_INT_CST [16, 16];
+     _34 = &MEM <vector([4,4]) int> [(int *)_33];
+     vect__1.26_19 = .LEN_LOAD (_34, 128B, loop_len_16, 0);
+     vect__2.27_20 = VEC_PACK_TRUNC_EXPR <vect__1.25_17, vect__1.26_19>;
+     _30 = (void *) ivtmp.36_2;
+     _31 = &MEM <vector([8,8]) short int> [(short int *)_30];
+     .LEN_STORE (_31, 128B, _28, vect__2.27_20, 0);
+     ivtmp_27 = ivtmp_26 - _28;
+     ivtmp.35_11 = ivtmp.35_10 + POLY_INT_CST [32, 32];
+     ivtmp.36_8 = ivtmp.36_2 + POLY_INT_CST [16, 16];
+     if (ivtmp_27 != 0)
+       goto <bb 4>; [83.33%]
+     else
+       goto <bb 5>; [16.67%]
+
+     The total length: _28 = MIN_EXPR <ivtmp_26, POLY_INT_CST [8, 8]>;
+
+     The length of first half vector:
+       loop_len_15 = MIN_EXPR <_28, POLY_INT_CST [4, 4]>;
+
+     The length of second half vector:
+       loop_len_15 = MIN_EXPR <_28, POLY_INT_CST [4, 4]>;
+       loop_len_16 = _28 - loop_len_15;
+
+     1). _28 always <= POLY_INT_CST [8, 8].
+     2). When _28 <= POLY_INT_CST [4, 4], second half vector is not processed.
+     3). When _28 > POLY_INT_CST [4, 4], second half vector is processed.
+*/
+
+static tree
+vect_set_loop_controls_by_select_vl (class loop *loop, loop_vec_info 
loop_vinfo,
+                                    gimple_seq *preheader_seq,
+                                    gimple_seq *header_seq,
+                                    rgroup_controls *rgc, tree niters)
+{
+  tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
+  tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
+  /* The masked approach is not allowed with SELECT_VL.  */
+  gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
+
+  tree ctrl_type = rgc->type;
+  unsigned int nitems_per_iter = rgc->max_nscalars_per_iter * rgc->factor;
+  poly_uint64 nitems_per_ctrl = TYPE_VECTOR_SUBPARTS (ctrl_type) * rgc->factor;
+  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+
+  /* Calculate the maximum number of item values that the rgroup
+     handles in total, the number that it handles for each iteration
+     of the vector loop.  */
+  tree nitems_total = niters;
+  if (nitems_per_iter != 1)
+    {
+      /* We checked before setting LOOP_VINFO_USING_PARTIAL_VECTORS_P that
+        these multiplications don't overflow.  */
+      tree compare_factor = build_int_cst (compare_type, nitems_per_iter);
+      nitems_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
+                                  nitems_total, compare_factor);
+    }
+
+  /* Convert the comparison value to the IV type (either a no-op or
+     a promotion).  */
+  nitems_total = gimple_convert (preheader_seq, iv_type, nitems_total);
+
+  /* Create an induction variable that counts the number of items
+     processed.  */
+  tree index_before_incr, index_after_incr;
+  gimple_stmt_iterator incr_gsi;
+  bool insert_after;
+  standard_iv_increment_position (loop, &incr_gsi, &insert_after);
+
+  /* Test the decremented IV, which will never underflow 0 since we have
+     IFN_SELECT_VL to guarantee that.  */
+  tree test_limit = nitems_total;
+
+  /* Provide a definition of each control in the group.  */
+  tree ctrl;
+  unsigned int i;
+  FOR_EACH_VEC_ELT_REVERSE (rgc->controls, i, ctrl)
+    {
+      /* Previous controls will cover BIAS items.  This control covers the
+        next batch.  */
+      poly_uint64 bias = nitems_per_ctrl * i;
+      tree bias_tree = build_int_cst (iv_type, bias);
+
+      /* Rather than have a new IV that starts at TEST_LIMIT and goes down to
+        BIAS, prefer to use the same TEST_LIMIT - BIAS based IV for each
+        control and adjust the bound down by BIAS.  */
+      tree this_test_limit = test_limit;
+      if (i != 0)
+       {
+         this_test_limit = gimple_build (preheader_seq, MAX_EXPR, iv_type,
+                                         this_test_limit, bias_tree);
+         this_test_limit = gimple_build (preheader_seq, MINUS_EXPR, iv_type,
+                                         this_test_limit, bias_tree);
+       }
+
+      /* Create the decrementing IV.  */
+      create_iv (this_test_limit, MINUS_EXPR, ctrl, NULL_TREE, loop, &incr_gsi,
+                insert_after, &index_before_incr, &index_after_incr);
+
+      poly_uint64 final_vf = vf * nitems_per_iter;
+      tree vf_step = build_int_cst (iv_type, final_vf);
+      tree res_len;
+      if (LOOP_VINFO_LENS (loop_vinfo).length () == 1)
+       {
+         res_len = gimple_build (header_seq, IFN_SELECT_VL, iv_type,
+                                 index_before_incr, vf_step);
+         LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = true;
+       }
+      else
+       {
+         /* For SLP, we can't allow a non-VF number of elements to be
+            processed in a non-final iteration.  We force the number of
+            elements processed in each non-final iteration to be VF.
+            Allowing non-VF element counts in non-final iterations would make
+            SLP too complicated and produce inferior codegen.
+
+              For example:
+
+               If non-final iterations process VF elements:
+
+                 ...
+                 .LEN_STORE (vectp_f.8_51, 128B, _71, { 1, 2, 1, 2 }, 0);
+                 .LEN_STORE (vectp_f.8_56, 128B, _72, { 1, 2, 1, 2 }, 0);
+                 ...
+
+               If non-final iterations process non-VF elements:
+
+                 ...
+                 .LEN_STORE (vectp_f.8_51, 128B, _71, { 1, 2, 1, 2 }, 0);
+                 if (_71 % 2 == 0)
+                  .LEN_STORE (vectp_f.8_56, 128B, _72, { 1, 2, 1, 2 }, 0);
+                 else
+                  .LEN_STORE (vectp_f.8_56, 128B, _72, { 2, 1, 2, 1 }, 0);
+                 ...
+
+            This is the simple case of 2-element interleaved vector SLP.  For
+            other interleaved vectors, the situation becomes even more
+            complicated.  */
+         res_len = gimple_build (header_seq, MIN_EXPR, iv_type,
+                                 index_before_incr, vf_step);
+         if (rgc->max_nscalars_per_iter != 1)
+           LOOP_VINFO_USING_SLP_ADJUSTED_LEN_P (loop_vinfo) = true;
+       }
+      gassign *assign = gimple_build_assign (ctrl, res_len);
+      gimple_seq_add_stmt (header_seq, assign);
+    }
+
+  return index_after_incr;
+}
+
 /* Helper for vect_set_loop_condition_partial_vectors.  Generate definitions
    for all the rgroup controls in RGC and return a control that is nonzero
    when the loop needs to iterate.  Add any new preheader statements to
@@ -704,6 +1051,10 @@ vect_set_loop_condition_partial_vectors (class loop *loop,
 
   bool use_masks_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
   tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
+  tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
+  bool use_vl_p = !use_masks_p
+                 && direct_internal_fn_supported_p (IFN_SELECT_VL, iv_type,
+                                                    OPTIMIZE_FOR_SPEED);
   unsigned int compare_precision = TYPE_PRECISION (compare_type);
   tree orig_niters = niters;
 
@@ -753,17 +1104,34 @@ vect_set_loop_condition_partial_vectors (class loop 
*loop,
              continue;
          }
 
+       if (use_vl_p && rgc->max_nscalars_per_iter == 1
+           && rgc != &LOOP_VINFO_LENS (loop_vinfo)[0])
+         {
+           rgroup_controls *sub_rgc
+             = &(*controls)[nmasks / rgc->controls.length () - 1];
+           if (!sub_rgc->controls.is_empty ())
+             {
+               vect_adjust_loop_lens (iv_type, &header_seq, rgc, sub_rgc);
+               continue;
+             }
+         }
+
        /* See whether zero-based IV would ever generate all-false masks
           or zero length before wrapping around.  */
        bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc);
 
        /* Set up all controls for this group.  */
-       test_ctrl = vect_set_loop_controls_directly (loop, loop_vinfo,
-                                                    &preheader_seq,
-                                                    &header_seq,
-                                                    loop_cond_gsi, rgc,
-                                                    niters, niters_skip,
-                                                    might_wrap_p);
+       if (use_vl_p)
+         test_ctrl
+           = vect_set_loop_controls_by_select_vl (loop, loop_vinfo,
+                                                  &preheader_seq, &header_seq,
+                                                  rgc, niters);
+       else
+         test_ctrl
+           = vect_set_loop_controls_directly (loop, loop_vinfo, &preheader_seq,
+                                              &header_seq, loop_cond_gsi, rgc,
+                                              niters, niters_skip,
+                                              might_wrap_p);
       }
 
   /* Emit all accumulated statements.  */
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index ed0166fedab..fe6af4286bf 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -973,6 +973,8 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, 
vec_info_shared *shared)
     vectorizable (false),
     can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
     using_partial_vectors_p (false),
+    using_select_vl_p (false),
+    using_slp_adjusted_len_p (false),
     epil_using_partial_vectors_p (false),
     partial_load_store_bias (0),
     peeling_for_gaps (false),
@@ -10361,15 +10363,18 @@ vect_record_loop_len (loop_vec_info loop_vinfo, 
vec_loop_lens *lens,
 }
 
 /* Given a complete set of length LENS, extract length number INDEX for an
-   rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS.  */
+   rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS.
+   Insert any set-up statements before GSI.  */
 
 tree
-vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
-                  unsigned int nvectors, unsigned int index)
+vect_get_loop_len (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
+                  vec_loop_lens *lens, unsigned int nvectors, tree vectype,
+                  unsigned int index)
 {
   rgroup_controls *rgl = &(*lens)[nvectors - 1];
   bool use_bias_adjusted_len =
     LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
+  tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
 
   /* Populate the rgroup's len array, if this is the first time we've
      used it.  */
@@ -10400,6 +10405,26 @@ vect_get_loop_len (loop_vec_info loop_vinfo, 
vec_loop_lens *lens,
 
   if (use_bias_adjusted_len)
     return rgl->bias_adjusted_ctrl;
+  else if (LOOP_VINFO_USING_SLP_ADJUSTED_LEN_P (loop_vinfo))
+    {
+      tree loop_len = rgl->controls[index];
+      poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
+      poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
+      if (maybe_ne (nunits1, nunits2))
+       {
+         /* A loop len for data type X can be reused for data type Y
+            if X has N times more elements than Y and if Y's elements
+            are N times bigger than X's.  */
+         gcc_assert (multiple_p (nunits1, nunits2));
+         unsigned int factor = exact_div (nunits1, nunits2).to_constant ();
+         gimple_seq seq = NULL;
+         loop_len = gimple_build (&seq, RDIV_EXPR, iv_type, loop_len,
+                                  build_int_cst (iv_type, factor));
+         if (seq)
+           gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
+       }
+      return loop_len;
+    }
   else
     return rgl->controls[index];
 }
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 7313191b0db..15b22132bd6 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -3147,6 +3147,61 @@ vect_get_data_ptr_increment (vec_info *vinfo,
   return iv_step;
 }
 
+/* Prepare the pointer IVs which need to be updated by a variable amount.
+   That variable amount is the outcome of .SELECT_VL.  In this case, each
+   iteration may process a flexible number of elements as long as that
+   number is <= VF elements.
+
+   Return the data reference pointer according to SELECT_VL.
+   If new statements are needed, insert them before GSI.  */
+
+static tree
+get_select_vl_data_ref_ptr (vec_info *vinfo, stmt_vec_info stmt_info,
+                           tree aggr_type, class loop *at_loop, tree offset,
+                           tree *dummy, gimple_stmt_iterator *gsi,
+                           bool simd_lane_access_p, vec_loop_lens *loop_lens,
+                           dr_vec_info *dr_info,
+                           vect_memory_access_type memory_access_type)
+{
+  loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo);
+  tree step = vect_dr_behavior (vinfo, dr_info)->step;
+
+  /* TODO: We don't support gather/scatter or load_lanes/store_lanes when
+     pointer IVs are updated by a variable amount, but we will support them
+     in the future.  */
+  gcc_assert (memory_access_type != VMAT_GATHER_SCATTER
+             && memory_access_type != VMAT_LOAD_STORE_LANES);
+
+  /* When the SELECT_VL pattern is used, we dynamically adjust
+     the memory address by the .SELECT_VL result.
+
+     The result of .SELECT_VL is the number of elements to
+     be processed in each iteration.  So the memory address
+     adjustment operation should be:
+
+     bytesize = GET_MODE_SIZE (element_mode (aggr_type));
+     addr = addr + .SELECT_VL (ARG..) * bytesize;
+  */
+  gimple *ptr_incr;
+  tree loop_len
+    = vect_get_loop_len (loop_vinfo, gsi, loop_lens, 1, aggr_type, 0);
+  tree len_type = TREE_TYPE (loop_len);
+  poly_uint64 bytesize = GET_MODE_SIZE (element_mode (aggr_type));
+  /* Since the outcome of .SELECT_VL is a number of elements, we scale
+     it by the element byte size so that it can be used as the variable
+     amount by which the address pointer IVs are adjusted.  */
+  tree tmp = fold_build2 (MULT_EXPR, len_type, loop_len,
+                         build_int_cst (len_type, bytesize));
+  if (tree_int_cst_sgn (step) == -1)
+    tmp = fold_build1 (NEGATE_EXPR, len_type, tmp);
+  tree bump = make_temp_ssa_name (len_type, NULL, "ivtmp");
+  gassign *assign = gimple_build_assign (bump, tmp);
+  gsi_insert_before (gsi, assign, GSI_SAME_STMT);
+  return vect_create_data_ref_ptr (vinfo, stmt_info, aggr_type, at_loop, 
offset,
+                                  dummy, gsi, &ptr_incr, simd_lane_access_p,
+                                  bump);
+}
+
 /* Check and perform vectorization of BUILT_IN_BSWAP{16,32,64,128}.  */
 
 static bool
@@ -8547,6 +8602,14 @@ vectorizable_store (vec_info *vinfo,
            vect_get_gather_scatter_ops (loop_vinfo, loop, stmt_info,
                                         slp_node, &gs_info, &dataref_ptr,
                                         &vec_offsets);
+         else if (loop_vinfo && LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo)
+                  && memory_access_type != VMAT_INVARIANT)
+           dataref_ptr
+             = get_select_vl_data_ref_ptr (vinfo, stmt_info, aggr_type,
+                                           simd_lane_access_p ? loop : NULL,
+                                           offset, &dummy, gsi,
+                                           simd_lane_access_p, loop_lens,
+                                           dr_info, memory_access_type);
          else
            dataref_ptr
              = vect_create_data_ref_ptr (vinfo, first_stmt_info, aggr_type,
@@ -8795,8 +8858,9 @@ vectorizable_store (vec_info *vinfo,
              else if (loop_lens)
                {
                  tree final_len
-                   = vect_get_loop_len (loop_vinfo, loop_lens,
-                                        vec_num * ncopies, vec_num * j + i);
+                   = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
+                                        vec_num * ncopies, vectype,
+                                        vec_num * j + i);
                  tree ptr = build_int_cst (ref_type, align * BITS_PER_UNIT);
                  machine_mode vmode = TYPE_MODE (vectype);
                  opt_machine_mode new_ovmode
@@ -9935,6 +9999,13 @@ vectorizable_load (vec_info *vinfo,
                                           slp_node, &gs_info, &dataref_ptr,
                                           &vec_offsets);
            }
+         else if (loop_vinfo && LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo)
+                  && memory_access_type != VMAT_INVARIANT)
+           dataref_ptr
+             = get_select_vl_data_ref_ptr (vinfo, stmt_info, aggr_type,
+                                           at_loop, offset, &dummy, gsi,
+                                           simd_lane_access_p, loop_lens,
+                                           dr_info, memory_access_type);
          else
            dataref_ptr
              = vect_create_data_ref_ptr (vinfo, first_stmt_info, aggr_type,
@@ -10151,8 +10222,8 @@ vectorizable_load (vec_info *vinfo,
                    else if (loop_lens && memory_access_type != VMAT_INVARIANT)
                      {
                        tree final_len
-                         = vect_get_loop_len (loop_vinfo, loop_lens,
-                                              vec_num * ncopies,
+                         = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
+                                              vec_num * ncopies, vectype,
                                               vec_num * j + i);
                        tree ptr = build_int_cst (ref_type,
                                                  align * BITS_PER_UNIT);
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 9cf2fb23fe3..3d21e23513d 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -818,6 +818,13 @@ public:
      the vector loop can handle fewer than VF scalars.  */
   bool using_partial_vectors_p;
 
+  /* True if we've decided to use SELECT_VL to get the number of active
+     elements in a vector loop to be updated.  */
+  bool using_select_vl_p;
+
+  /* True if we use the adjusted loop length for SLP.  */
+  bool using_slp_adjusted_len_p;
+
   /* True if we've decided to use partially-populated vectors for the
      epilogue of loop.  */
   bool epil_using_partial_vectors_p;
@@ -890,6 +897,8 @@ public:
 #define LOOP_VINFO_VECTORIZABLE_P(L)       (L)->vectorizable
 #define LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P(L) (L)->can_use_partial_vectors_p
 #define LOOP_VINFO_USING_PARTIAL_VECTORS_P(L) (L)->using_partial_vectors_p
+#define LOOP_VINFO_USING_SELECT_VL_P(L) (L)->using_select_vl_p
+#define LOOP_VINFO_USING_SLP_ADJUSTED_LEN_P(L) (L)->using_slp_adjusted_len_p
 #define LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P(L)                             
\
   (L)->epil_using_partial_vectors_p
 #define LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS(L) (L)->partial_load_store_bias
@@ -2293,7 +2302,8 @@ extern tree vect_get_loop_mask (gimple_stmt_iterator *, 
vec_loop_masks *,
                                unsigned int, tree, unsigned int);
 extern void vect_record_loop_len (loop_vec_info, vec_loop_lens *, unsigned int,
                                  tree, unsigned int);
-extern tree vect_get_loop_len (loop_vec_info, vec_loop_lens *, unsigned int,
+extern tree vect_get_loop_len (loop_vec_info, gimple_stmt_iterator *,
+                              vec_loop_lens *, unsigned int, tree,
                               unsigned int);
 extern gimple_seq vect_gen_len (tree, tree, tree, tree);
 extern stmt_vec_info info_for_reduction (vec_info *, stmt_vec_info);
-- 
2.36.3

Reply via email to