Hi.

This patch applies LEN_FOLD_EXTRACT_LAST in the loop vectorizer.

Consider the following case:
#include <stdint.h>

#define N 32

/* Conditional "last match" reduction: scan the first N elements of A
   and return the index of the final element that is strictly less than
   MIN_V.  If no element qualifies, the initial value 66 is returned.  */

int __attribute__ ((noinline, noclone))
condition_reduction (int *a, int min_v)
{
  int last = 66; /* Start value, kept when nothing matches.  */
  int idx;

  for (idx = 0; idx != N; ++idx)
    {
      /* Elements that do not satisfy the condition leave LAST alone.  */
      if (a[idx] >= min_v)
	continue;

      last = idx;
    }

  return last;
}

With this patch, we can generate the following IR:

  _44 = .SELECT_VL (ivtmp_42, POLY_INT_CST [4, 4]);
  _34 = vect_vec_iv_.5_33 + { POLY_INT_CST [4, 4], ... };
  ivtmp_36 = _44 * 4;
  vect__4.8_39 = .MASK_LEN_LOAD (vectp_a.6_37, 32B, { -1, ... }, _44, 0);

  mask__11.9_41 = vect__4.8_39 < vect_cst__40;
  last_5 = .LEN_FOLD_EXTRACT_LAST (last_14, mask__11.9_41, vect_vec_iv_.5_33, _44, 0);
  ...

gcc/ChangeLog:

        * tree-vect-loop.cc (vectorizable_reduction): Apply
        LEN_FOLD_EXTRACT_LAST.
        * tree-vect-stmts.cc (vectorizable_condition): Ditto.

---
 gcc/tree-vect-loop.cc  |  7 ++++--
 gcc/tree-vect-stmts.cc | 52 ++++++++++++++++++++++++++++++++++++------
 2 files changed, 50 insertions(+), 9 deletions(-)

diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 1cd6c291377..ebee8037e02 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -7494,8 +7494,11 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
        }
 
       if (reduc_chain_length == 1
-         && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
-                                            vectype_in, OPTIMIZE_FOR_SPEED))
+         && (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST, vectype_in,
+                                             OPTIMIZE_FOR_SPEED)
+             || direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST,
+                                                vectype_in,
+                                                OPTIMIZE_FOR_SPEED)))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 413a88750d6..be9f3a280bd 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -11740,8 +11740,17 @@ vectorizable_condition (vec_info *vinfo,
          && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
        {
          if (reduction_type == EXTRACT_LAST_REDUCTION)
-           vect_record_loop_mask (loop_vinfo, &LOOP_VINFO_MASKS (loop_vinfo),
-                                  ncopies * vec_num, vectype, NULL);
+           {
+             if (direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST,
+                                                 vectype, OPTIMIZE_FOR_SPEED))
+               vect_record_loop_len (loop_vinfo,
+                                     &LOOP_VINFO_LENS (loop_vinfo),
+                                     ncopies * vec_num, vectype, 1);
+             else
+               vect_record_loop_mask (loop_vinfo,
+                                      &LOOP_VINFO_MASKS (loop_vinfo),
+                                      ncopies * vec_num, vectype, NULL);
+           }
          /* Extra inactive lanes should be safe for vect_nested_cycle.  */
          else if (STMT_VINFO_DEF_TYPE (reduc_info) != vect_nested_cycle)
            {
@@ -11772,7 +11781,13 @@ vectorizable_condition (vec_info *vinfo,
      mask to the condition, or to its inverse.  */
 
   vec_loop_masks *masks = NULL;
-  if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+  vec_loop_lens *lens = NULL;
+  if (loop_vinfo && LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
+    {
+      if (reduction_type == EXTRACT_LAST_REDUCTION)
+       lens = &LOOP_VINFO_LENS (loop_vinfo);
+    }
+  else if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
     {
       if (reduction_type == EXTRACT_LAST_REDUCTION)
        masks = &LOOP_VINFO_MASKS (loop_vinfo);
@@ -11910,7 +11925,8 @@ vectorizable_condition (vec_info *vinfo,
       /* Force vec_compare to be an SSA_NAME rather than a comparison,
         in cases where that's necessary.  */
 
-      if (masks || reduction_type == EXTRACT_LAST_REDUCTION)
+      tree len = NULL_TREE, bias = NULL_TREE;
+      if (masks || lens || reduction_type == EXTRACT_LAST_REDUCTION)
        {
          if (!is_gimple_val (vec_compare))
            {
@@ -11931,6 +11947,23 @@ vectorizable_condition (vec_info *vinfo,
              vec_compare = vec_compare_name;
            }
 
+         if (direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST,
+                                             vectype, OPTIMIZE_FOR_SPEED))
+           {
+             if (lens)
+               {
+                 len = vect_get_loop_len (loop_vinfo, gsi, lens,
+                                          vec_num * ncopies, vectype, i, 1);
+                 signed char biasval
+                   = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
+                 bias = build_int_cst (intQI_type_node, biasval);
+               }
+             else
+               {
+                 len = size_int (TYPE_VECTOR_SUBPARTS (vectype));
+                 bias = build_int_cst (intQI_type_node, 0);
+               }
+           }
          if (masks)
            {
              tree loop_mask
@@ -11950,9 +11983,14 @@ vectorizable_condition (vec_info *vinfo,
        {
          gimple *old_stmt = vect_orig_stmt (stmt_info)->stmt;
          tree lhs = gimple_get_lhs (old_stmt);
-         new_stmt = gimple_build_call_internal
-             (IFN_FOLD_EXTRACT_LAST, 3, else_clause, vec_compare,
-              vec_then_clause);
+         if (len)
+           new_stmt = gimple_build_call_internal
+               (IFN_LEN_FOLD_EXTRACT_LAST, 5, else_clause, vec_compare,
+                vec_then_clause, len, bias);
+         else
+           new_stmt = gimple_build_call_internal
+               (IFN_FOLD_EXTRACT_LAST, 3, else_clause, vec_compare,
+                vec_then_clause);
          gimple_call_set_lhs (new_stmt, lhs);
          SSA_NAME_DEF_STMT (lhs) = new_stmt;
          if (old_stmt == gsi_stmt (*gsi))
-- 
2.36.3

Reply via email to