The following adds vectorizer support for reduc_sbool_{and,ior,xor}_scal
in the epilogue of bool reductions.

        * config/i386/sse.md (reduc_sbool_and_scal_qi): Dummy for testing.
        * tree-vectorizer.h (reduction_fn_for_scalar_code): Add
        optional vector type argument.
        * tree-vect-loop.cc (reduction_fn_for_scalar_code): When a
        mask vector type is specified, return the corresponding
        IFN_REDUC_SBOOL_* functions for AND, IOR and XOR.
        (vect_create_epilog_for_reduction): Pun masks to an
        integer vector type only when we do not support direct mask
        reduction.
        (vectorizable_reduction): Prefer direct mask reduction over
        integer vector reduction.
---
 gcc/config/i386/sse.md | 11 ++++++
 gcc/tree-vect-loop.cc  | 90 +++++++++++++++++++++++++-----------------
 gcc/tree-vectorizer.h  |  3 +-
 3 files changed, 66 insertions(+), 38 deletions(-)

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 8b28c8edb19..7f3361f8781 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -4013,6 +4013,17 @@
   DONE;
 })
 
+(define_expand "reduc_sbool_and_scal_qi"
+ [(and:QI
+    (match_operand:QI 0 "register_operand")
+    (match_operand:QI 1 "register_operand")
+    (match_operand:SI 2 "const_0_to_255_operand"))]
+ "TARGET_AVX512F"
+{
+  emit_move_insn (operands[0], operands[1]);
+  DONE;
+})
+
 (define_insn "<mask_codefor>reducep<mode><mask_name><round_saeonly_name>"
   [(set (match_operand:VFH_AVX512VL 0 "register_operand" "=v")
        (unspec:VFH_AVX512VL
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index f656437ea5c..f523b264dfc 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -3244,7 +3244,8 @@ fold_left_reduction_fn (code_helper code, internal_fn 
*reduc_fn)
    Return FALSE if CODE currently cannot be vectorized as reduction.  */
 
 bool
-reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
+reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn,
+                             tree vectype)
 {
   if (code.is_tree_code ())
     switch (tree_code (code))
@@ -3262,15 +3263,18 @@ reduction_fn_for_scalar_code (code_helper code, 
internal_fn *reduc_fn)
        return true;
 
       case BIT_AND_EXPR:
-       *reduc_fn = IFN_REDUC_AND;
+       *reduc_fn = ((vectype && VECTOR_BOOLEAN_TYPE_P (vectype))
+                    ? IFN_REDUC_SBOOL_AND : IFN_REDUC_AND);
        return true;
 
       case BIT_IOR_EXPR:
-       *reduc_fn = IFN_REDUC_IOR;
+       *reduc_fn = ((vectype && VECTOR_BOOLEAN_TYPE_P (vectype))
+                    ? IFN_REDUC_SBOOL_IOR : IFN_REDUC_IOR);
        return true;
 
       case BIT_XOR_EXPR:
-       *reduc_fn = IFN_REDUC_XOR;
+       *reduc_fn = ((vectype && VECTOR_BOOLEAN_TYPE_P (vectype))
+                    ? IFN_REDUC_SBOOL_XOR : IFN_REDUC_XOR);
        return true;
 
       case MULT_EXPR:
@@ -5559,9 +5563,12 @@ vect_create_epilog_for_reduction (loop_vec_info 
loop_vinfo,
   /* Shouldn't be used beyond this point.  */
   exit_bb = nullptr;
 
-  /* For the actual reduction work on a bool data vector instead of a
-     mask vector.  */
-  if (VECTOR_BOOLEAN_TYPE_P (vectype))
+  /* If we are operating on a mask vector and do not support direct mask
+     reduction, work on a bool data vector instead of a mask vector.  */
+  if (VECTOR_BOOLEAN_TYPE_P (vectype)
+      && reduc_fn != IFN_REDUC_SBOOL_AND
+      && reduc_fn != IFN_REDUC_SBOOL_IOR
+      && reduc_fn != IFN_REDUC_SBOOL_XOR)
     {
       gcc_assert (reduc_inputs.length () == 1);
       vectype = get_related_vectype_for_scalar_type (loop_vinfo->vector_mode,
@@ -7295,29 +7302,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
 
   VECT_REDUC_INFO_CODE (reduc_info) = orig_code;
 
-  /* For now see to implement the epilogue reduction on a bool data,
-     not the mask type.  */
-  tree orig_vectype_out = vectype_out;
-  if (VECTOR_BOOLEAN_TYPE_P (vectype_out))
-    {
-      vectype_out
-       = get_related_vectype_for_scalar_type (loop_vinfo->vector_mode,
-                                              TREE_TYPE (vectype_out),
-                                              TYPE_VECTOR_SUBPARTS
-                                                (orig_vectype_out));
-      if (!vectype_out
-         || maybe_ne (TYPE_VECTOR_SUBPARTS (vectype_out),
-                      TYPE_VECTOR_SUBPARTS (orig_vectype_out))
-         || !expand_vec_cond_expr_p (vectype_out, orig_vectype_out))
-       {
-         if (dump_enabled_p ())
-           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "cannot turn mask into bool data vector for "
-                            "reduction epilogue.\n");
-         return false;
-       }
-    }
-
   reduction_type = VECT_REDUC_INFO_TYPE (reduc_info);
   if (reduction_type == TREE_CODE_REDUCTION)
     {
@@ -7383,6 +7367,29 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
       return false;
     }
 
+  /* See if we can convert a mask vector to a corresponding bool data vector
+     to perform the epilogue reduction.  */
+  tree alt_vectype_out = NULL_TREE;
+  if (VECTOR_BOOLEAN_TYPE_P (vectype_out))
+    {
+      alt_vectype_out
+       = get_related_vectype_for_scalar_type (loop_vinfo->vector_mode,
+                                              TREE_TYPE (vectype_out),
+                                              TYPE_VECTOR_SUBPARTS
+                                                (vectype_out));
+      if (!alt_vectype_out
+         || maybe_ne (TYPE_VECTOR_SUBPARTS (alt_vectype_out),
+                      TYPE_VECTOR_SUBPARTS (vectype_out))
+         || !expand_vec_cond_expr_p (alt_vectype_out, vectype_out))
+       {
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "cannot turn mask into bool data vector for "
+                            "reduction epilogue.\n");
+         alt_vectype_out = NULL_TREE;
+       }
+    }
+
   internal_fn reduc_fn = IFN_LAST;
   if (reduction_type == TREE_CODE_REDUCTION
       || reduction_type == FOLD_LEFT_REDUCTION
@@ -7391,17 +7398,28 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
     {
       if (reduction_type == FOLD_LEFT_REDUCTION
          ? fold_left_reduction_fn (orig_code, &reduc_fn)
-         : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
+         : reduction_fn_for_scalar_code (orig_code, &reduc_fn, vectype_out))
        {
          if (reduc_fn != IFN_LAST
              && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
                                                  OPTIMIZE_FOR_SPEED))
            {
-             if (dump_enabled_p ())
-               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                                "reduc op not supported by target.\n");
+             if (reduction_type != FOLD_LEFT_REDUCTION
+                 && alt_vectype_out
+                 && reduction_fn_for_scalar_code (orig_code, &reduc_fn,
+                                                  alt_vectype_out)
+                 && reduc_fn != IFN_LAST
+                 && direct_internal_fn_supported_p (reduc_fn, alt_vectype_out,
+                                                    OPTIMIZE_FOR_SPEED))
+               ;
+             else
+               {
+                 if (dump_enabled_p ())
+                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                    "reduc op not supported by target.\n");
 
-             reduc_fn = IFN_LAST;
+                 reduc_fn = IFN_LAST;
+               }
            }
        }
       else
@@ -7438,8 +7456,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
       return false;
     }
 
-  vectype_out = orig_vectype_out;
-
   /* For SLP reductions, see if there is a neutral value we can use.  */
   tree neutral_op = NULL_TREE;
   tree initial_value = NULL_TREE;
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 4785cbdd61d..f5827fd26f5 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -2658,7 +2658,8 @@ extern tree vect_gen_loop_len_mask (loop_vec_info, 
gimple_stmt_iterator *,
                                    unsigned int);
 extern gimple_seq vect_gen_len (tree, tree, tree, tree);
 extern vect_reduc_info info_for_reduction (loop_vec_info, slp_tree);
-extern bool reduction_fn_for_scalar_code (code_helper, internal_fn *);
+extern bool reduction_fn_for_scalar_code (code_helper, internal_fn *,
+                                         tree = NULL_TREE);
 
 /* Drive for loop transformation stage.  */
 extern class loop *vect_transform_loop (loop_vec_info, gimple *);
-- 
2.51.0

Reply via email to