Currently we mess up here in two places.  One is pattern recognition
which computes a mask-precision for a bool reduction PHI that's
inconsistent with that of the latch definition.  This is solved by
iterating the mask-precision computation.  The second is that the
reduction epilogue generation and the code querying support for it
aren't ready for mask inputs.  The following fixes this by falling
back to doing all the epilogue processing on a data type again, if
the target does not support a direct mask reduction.  For that we
utilize the newly added reduc_sbool_{and,ior,xor}_scal optabs
so we can go the direct IFN path on masks if the target supports
that.  In the future we can also implement an additional fallback
for IOR and AND reductions using a scalar cond-expr like
mask != 0 ? true : false, but the new optabs provide more information
to the target.

Bootstrapped and tested on x86_64-unknown-linux-gnu.  Also
bootstrapped and tested with x86 patterns for
reduc_sbool_{and,ior,xor}_scal; those were mostly done by
Hongtao, and I fixed them up for the latest changes.

I'll wait for the RISC-V CI result and comments and plan to push
on Monday if nothing irregular pops up.

Richard.

        PR tree-optimization/101639
        PR tree-optimization/103495
        * tree-vectorizer.h (vect_reduc_info_s): Add reduc_vectype_for_mask.
        (VECT_REDUC_INFO_VECTYPE_FOR_MASK): New.
        * tree-vect-patterns.cc (vect_determine_mask_precision):
        Return whether the mask precision changed.
        (vect_determine_precisions): Iterate mask precision computation
        for loop vectorization.
        * tree-vect-loop.cc (get_initial_defs_for_reduction): Properly
        convert non-mask initial values to a mask initial def for
        the reduction.
        (sbool_reduction_fn_for_fn): New function.
        (vect_create_epilog_for_reduction): For a mask input convert
        it to the vector type analysis decided to use.  Use a regular
        conversion for the final convert to the scalar code type.
        (vectorizable_reduction): Support mask reductions.  Verify
        we can compute a data vector from the mask result or that a direct
        mask reduction is provided by the target.

        * gcc.dg/vect/vect-reduc-bool-1.c: New testcase.
        * gcc.dg/vect/vect-reduc-bool-2.c: Likewise.
        * gcc.dg/vect/vect-reduc-bool-3.c: Likewise.
        * gcc.dg/vect/vect-reduc-bool-4.c: Likewise.
        * gcc.dg/vect/vect-reduc-bool-5.c: Likewise.
        * gcc.dg/vect/vect-reduc-bool-6.c: Likewise.
        * gcc.dg/vect/vect-reduc-bool-7.c: Likewise.
        * gcc.dg/vect/vect-reduc-bool-8.c: Likewise.
---
 gcc/testsuite/gcc.dg/vect/vect-reduc-bool-1.c |  52 +++++++
 gcc/testsuite/gcc.dg/vect/vect-reduc-bool-2.c |  52 +++++++
 gcc/testsuite/gcc.dg/vect/vect-reduc-bool-3.c |  52 +++++++
 gcc/testsuite/gcc.dg/vect/vect-reduc-bool-4.c |  52 +++++++
 gcc/testsuite/gcc.dg/vect/vect-reduc-bool-5.c |  50 +++++++
 gcc/testsuite/gcc.dg/vect/vect-reduc-bool-6.c |  50 +++++++
 gcc/testsuite/gcc.dg/vect/vect-reduc-bool-7.c |  50 +++++++
 gcc/testsuite/gcc.dg/vect/vect-reduc-bool-8.c |  50 +++++++
 gcc/tree-vect-loop.cc                         | 129 +++++++++++++++---
 gcc/tree-vect-patterns.cc                     |  82 +++++++----
 gcc/tree-vectorizer.h                         |   5 +
 11 files changed, 574 insertions(+), 50 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-bool-1.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-bool-2.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-bool-3.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-bool-4.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-bool-5.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-bool-6.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-bool-7.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-bool-8.c

diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-bool-1.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-bool-1.c
new file mode 100644
index 00000000000..38aead8a1c7
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-bool-1.c
@@ -0,0 +1,52 @@
+#include "tree-vect.h"
+
+char p[128];
+
+bool __attribute__((noipa))
+fand (int n)
+{
+  bool r = true;
+  for (int i = 0; i < n; ++i)
+    r &= (p[i] != 0);
+  return r;
+}
+
+bool __attribute__((noipa))
+fior (int n)
+{
+  bool r = false;
+  for (int i = 0; i < n; ++i)
+    r |= (p[i] != 0);
+  return r;
+}
+
+int main()
+{
+  check_vect ();
+
+  __builtin_memset (p, 1, sizeof(p));
+
+  for (int n = 0; n < 77; ++n)
+    if (!fand (n))
+      abort ();
+
+  p[0] = 0;
+  for (int n = 1; n < 77; ++n)
+    if (fand (n))
+      abort ();
+
+  __builtin_memset (p, 0, sizeof(p));
+
+  for (int n = 0; n < 77; ++n)
+    if (fior (n))
+      abort ();
+
+  p[0] = 1;
+  for (int n = 1; n < 77; ++n)
+    if (!fior (n))
+      abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 2 "vect" { target { vect_int && vect_condition } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-bool-2.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-bool-2.c
new file mode 100644
index 00000000000..2949b8308a2
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-bool-2.c
@@ -0,0 +1,52 @@
+#include "tree-vect.h"
+
+short p[128];
+
+bool __attribute__((noipa))
+fand (int n)
+{
+  bool r = true;
+  for (int i = 0; i < n; ++i)
+    r &= (p[i] != 0);
+  return r;
+}
+
+bool __attribute__((noipa))
+fior (int n)
+{
+  bool r = false;
+  for (int i = 0; i < n; ++i)
+    r |= (p[i] != 0);
+  return r;
+}
+
+int main()
+{
+  check_vect ();
+
+  __builtin_memset (p, 1, sizeof(p));
+
+  for (int n = 0; n < 77; ++n)
+    if (!fand (n))
+      abort ();
+
+  p[0] = 0;
+  for (int n = 1; n < 77; ++n)
+    if (fand (n))
+      abort ();
+
+  __builtin_memset (p, 0, sizeof(p));
+
+  for (int n = 0; n < 77; ++n)
+    if (fior (n))
+      abort ();
+
+  p[0] = 1;
+  for (int n = 1; n < 77; ++n)
+    if (!fior (n))
+      abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 2 "vect" { target { vect_int && vect_condition } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-bool-3.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-bool-3.c
new file mode 100644
index 00000000000..893aa4bb290
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-bool-3.c
@@ -0,0 +1,52 @@
+#include "tree-vect.h"
+
+int p[128];
+
+bool __attribute__((noipa))
+fand (int n)
+{
+  bool r = true;
+  for (int i = 0; i < n; ++i)
+    r &= (p[i] != 0);
+  return r;
+}
+
+bool __attribute__((noipa))
+fior (int n)
+{
+  bool r = false;
+  for (int i = 0; i < n; ++i)
+    r |= (p[i] != 0);
+  return r;
+}
+
+int main()
+{
+  check_vect ();
+
+  __builtin_memset (p, 1, sizeof(p));
+
+  for (int n = 0; n < 77; ++n)
+    if (!fand (n))
+      abort ();
+
+  p[0] = 0;
+  for (int n = 1; n < 77; ++n)
+    if (fand (n))
+      abort ();
+
+  __builtin_memset (p, 0, sizeof(p));
+
+  for (int n = 0; n < 77; ++n)
+    if (fior (n))
+      abort ();
+
+  p[0] = 1;
+  for (int n = 1; n < 77; ++n)
+    if (!fior (n))
+      abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 2 "vect" { target { vect_int && vect_condition } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-bool-4.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-bool-4.c
new file mode 100644
index 00000000000..dc37e06133b
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-bool-4.c
@@ -0,0 +1,52 @@
+#include "tree-vect.h"
+
+long long p[128];
+
+bool __attribute__((noipa))
+fand (int n)
+{
+  bool r = true;
+  for (int i = 0; i < n; ++i)
+    r &= (p[i] != 0);
+  return r;
+}
+
+bool __attribute__((noipa))
+fior (int n)
+{
+  bool r = false;
+  for (int i = 0; i < n; ++i)
+    r |= (p[i] != 0);
+  return r;
+}
+
+int main()
+{
+  check_vect ();
+
+  __builtin_memset (p, 1, sizeof(p));
+
+  for (int n = 0; n < 77; ++n)
+    if (!fand (n))
+      abort ();
+
+  p[0] = 0;
+  for (int n = 1; n < 77; ++n)
+    if (fand (n))
+      abort ();
+
+  __builtin_memset (p, 0, sizeof(p));
+
+  for (int n = 0; n < 77; ++n)
+    if (fior (n))
+      abort ();
+
+  p[0] = 1;
+  for (int n = 1; n < 77; ++n)
+    if (!fior (n))
+      abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 2 "vect" { target { vect_int && vect_condition } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-bool-5.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-bool-5.c
new file mode 100644
index 00000000000..9bafc09927c
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-bool-5.c
@@ -0,0 +1,50 @@
+#include "tree-vect.h"
+
+char p[128];
+
+bool __attribute__((noipa))
+fxort (int n)
+{
+  bool r = true;
+  for (int i = 0; i < n; ++i)
+    r ^= (p[i] != 0);
+  return r;
+}
+
+bool __attribute__((noipa))
+fxorf (int n)
+{
+  bool r = false;
+  for (int i = 0; i < n; ++i)
+    r ^= (p[i] != 0);
+  return r;
+}
+
+int main()
+{
+  check_vect ();
+
+  __builtin_memset (p, 1, sizeof(p));
+
+  for (int n = 0; n < 77; ++n)
+    if (fxort (n) != !(n & 1))
+      abort ();
+
+  for (int n = 0; n < 77; ++n)
+    if (fxorf (n) != (n & 1))
+      abort ();
+
+  __builtin_memset (p, 0, sizeof(p));
+
+  for (int n = 0; n < 77; ++n)
+    if (!fxort (n))
+      abort ();
+
+  for (int n = 0; n < 77; ++n)
+    if (fxorf (n))
+      abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 2 "vect" { target { vect_int && vect_condition } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-bool-6.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-bool-6.c
new file mode 100644
index 00000000000..ee1b9649e55
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-bool-6.c
@@ -0,0 +1,50 @@
+#include "tree-vect.h"
+
+short p[128];
+
+bool __attribute__((noipa))
+fxort (int n)
+{
+  bool r = true;
+  for (int i = 0; i < n; ++i)
+    r ^= (p[i] != 0);
+  return r;
+}
+
+bool __attribute__((noipa))
+fxorf (int n)
+{
+  bool r = false;
+  for (int i = 0; i < n; ++i)
+    r ^= (p[i] != 0);
+  return r;
+}
+
+int main()
+{
+  check_vect ();
+
+  __builtin_memset (p, 1, sizeof(p));
+
+  for (int n = 0; n < 77; ++n)
+    if (fxort (n) != !(n & 1))
+      abort ();
+
+  for (int n = 0; n < 77; ++n)
+    if (fxorf (n) != (n & 1))
+      abort ();
+
+  __builtin_memset (p, 0, sizeof(p));
+
+  for (int n = 0; n < 77; ++n)
+    if (!fxort (n))
+      abort ();
+
+  for (int n = 0; n < 77; ++n)
+    if (fxorf (n))
+      abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 2 "vect" { target { vect_int && vect_condition } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-bool-7.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-bool-7.c
new file mode 100644
index 00000000000..ab5f3ae89b5
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-bool-7.c
@@ -0,0 +1,50 @@
+#include "tree-vect.h"
+
+int p[128];
+
+bool __attribute__((noipa))
+fxort (int n)
+{
+  bool r = true;
+  for (int i = 0; i < n; ++i)
+    r ^= (p[i] != 0);
+  return r;
+}
+
+bool __attribute__((noipa))
+fxorf (int n)
+{
+  bool r = false;
+  for (int i = 0; i < n; ++i)
+    r ^= (p[i] != 0);
+  return r;
+}
+
+int main()
+{
+  check_vect ();
+
+  __builtin_memset (p, 1, sizeof(p));
+
+  for (int n = 0; n < 77; ++n)
+    if (fxort (n) != !(n & 1))
+      abort ();
+
+  for (int n = 0; n < 77; ++n)
+    if (fxorf (n) != (n & 1))
+      abort ();
+
+  __builtin_memset (p, 0, sizeof(p));
+
+  for (int n = 0; n < 77; ++n)
+    if (!fxort (n))
+      abort ();
+
+  for (int n = 0; n < 77; ++n)
+    if (fxorf (n))
+      abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 2 "vect" { target { vect_int && vect_condition } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-bool-8.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-bool-8.c
new file mode 100644
index 00000000000..6b0a6565951
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-bool-8.c
@@ -0,0 +1,50 @@
+#include "tree-vect.h"
+
+long long p[128];
+
+bool __attribute__((noipa))
+fxort (int n)
+{
+  bool r = true;
+  for (int i = 0; i < n; ++i)
+    r ^= (p[i] != 0);
+  return r;
+}
+
+bool __attribute__((noipa))
+fxorf (int n)
+{
+  bool r = false;
+  for (int i = 0; i < n; ++i)
+    r ^= (p[i] != 0);
+  return r;
+}
+
+int main()
+{
+  check_vect ();
+
+  __builtin_memset (p, 1, sizeof(p));
+
+  for (int n = 0; n < 77; ++n)
+    if (fxort (n) != !(n & 1))
+      abort ();
+
+  for (int n = 0; n < 77; ++n)
+    if (fxorf (n) != (n & 1))
+      abort ();
+
+  __builtin_memset (p, 0, sizeof(p));
+
+  for (int n = 0; n < 77; ++n)
+    if (!fxort (n))
+      abort ();
+
+  for (int n = 0; n < 77; ++n)
+    if (fxorf (n))
+      abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 2 "vect" { target { vect_int && vect_condition } } } } */
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 4a6b029af9b..55d1b91a22b 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -3303,6 +3303,28 @@ reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
       }
 }
 
+/* Set *SBOOL_FN to the corresponding function working on vector masks
+   for REDUC_FN.  Return true if that exists, false otherwise.  */
+
+static bool
+sbool_reduction_fn_for_fn (internal_fn reduc_fn, internal_fn *sbool_fn)
+{
+  switch (reduc_fn)
+    {
+    case IFN_REDUC_AND:
+      *sbool_fn = IFN_REDUC_SBOOL_AND;
+      return true;
+    case IFN_REDUC_IOR:
+      *sbool_fn = IFN_REDUC_SBOOL_IOR;
+      return true;
+    case IFN_REDUC_XOR:
+      *sbool_fn = IFN_REDUC_SBOOL_XOR;
+      return true;
+    default:
+      return false;
+    }
+}
+
 /* If there is a neutral value X such that a reduction would not be affected
    by the introduction of additional X elements, return that X, otherwise
    return null.  CODE is the code of the reduction and SCALAR_TYPE is type
@@ -4908,17 +4930,16 @@ get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
   if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
     nunits = group_size;
 
+  tree vector_elt_type = TREE_TYPE (vector_type);
   number_of_places_left_in_vector = nunits;
   bool constant_p = true;
   tree_vector_builder elts (vector_type, nunits, 1);
   elts.quick_grow (nunits);
   gimple_seq ctor_seq = NULL;
   if (neutral_op
-      && !useless_type_conversion_p (TREE_TYPE (vector_type),
+      && !useless_type_conversion_p (vector_elt_type,
                                     TREE_TYPE (neutral_op)))
-    neutral_op = gimple_convert (&ctor_seq,
-                                TREE_TYPE (vector_type),
-                                neutral_op);
+    neutral_op = gimple_convert (&ctor_seq, vector_elt_type, neutral_op);
   for (j = 0; j < nunits * number_of_vectors; ++j)
     {
       tree op;
@@ -4930,11 +4951,22 @@ get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
        op = neutral_op;
       else
        {
-         if (!useless_type_conversion_p (TREE_TYPE (vector_type),
+         if (!useless_type_conversion_p (vector_elt_type,
                                          TREE_TYPE (initial_values[i])))
-           initial_values[i] = gimple_convert (&ctor_seq,
-                                               TREE_TYPE (vector_type),
-                                               initial_values[i]);
+           {
+             if (VECTOR_BOOLEAN_TYPE_P (vector_type))
+               initial_values[i] = gimple_build (&ctor_seq, COND_EXPR,
+                                                 vector_elt_type,
+                                                 initial_values[i],
+                                                 build_all_ones_cst
+                                                   (vector_elt_type),
+                                                 build_zero_cst
+                                                   (vector_elt_type));
+             else
+               initial_values[i] = gimple_convert (&ctor_seq,
+                                                   vector_elt_type,
+                                                   initial_values[i]);
+           }
          op = initial_values[i];
        }
 
@@ -5555,6 +5587,22 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
   /* Shouldn't be used beyond this point.  */
   exit_bb = nullptr;
 
+  /* If we are operating on a mask vector and do not support direct mask
+     reduction, work on a bool data vector instead of a mask vector.  */
+  if (VECTOR_BOOLEAN_TYPE_P (vectype)
+      && VECT_REDUC_INFO_VECTYPE_FOR_MASK (reduc_info)
+      && vectype != VECT_REDUC_INFO_VECTYPE_FOR_MASK (reduc_info))
+    {
+      gcc_assert (reduc_inputs.length () == 1);
+      vectype = VECT_REDUC_INFO_VECTYPE_FOR_MASK (reduc_info);
+      gimple_seq stmts = NULL;
+      reduc_inputs[0] = gimple_build (&stmts, VEC_COND_EXPR, vectype,
+                                     reduc_inputs[0],
+                                     build_one_cst (vectype),
+                                     build_zero_cst (vectype));
+      gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
+    }
+
   if (VECT_REDUC_INFO_TYPE (reduc_info) == COND_REDUCTION
       && reduc_fn != IFN_LAST)
     {
@@ -5949,8 +5997,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
 
          new_temp = gimple_build (&stmts, BIT_FIELD_REF, TREE_TYPE (vectype1),
                                   new_temp, bitsize, bitsize_zero_node);
-         new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR,
-                                  scalar_type, new_temp);
+         new_temp = gimple_convert (&stmts, scalar_type, new_temp);
          gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
          scalar_results.safe_push (new_temp);
         }
@@ -7023,15 +7070,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
   tree vectype_out = SLP_TREE_VECTYPE (slp_for_stmt_info);
   VECT_REDUC_INFO_VECTYPE (reduc_info) = vectype_out;
 
-  /* We do not handle mask reductions correctly in the epilogue.  */
-  if (VECTOR_BOOLEAN_TYPE_P (vectype_out))
-    {
-      if (dump_enabled_p ())
-       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                        "mask reduction not supported.\n");
-      return false;
-    }
-
   gimple_match_op op;
   if (!gimple_extract_op (stmt_info->stmt, &op))
     gcc_unreachable ();
@@ -7349,6 +7387,23 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
       return false;
     }
 
+  /* See if we can convert a mask vector to a corresponding bool data vector
+     to perform the epilogue reduction.  */
+  tree alt_vectype_out = NULL_TREE;
+  if (VECTOR_BOOLEAN_TYPE_P (vectype_out))
+    {
+      alt_vectype_out
+       = get_related_vectype_for_scalar_type (loop_vinfo->vector_mode,
+                                              TREE_TYPE (vectype_out),
+                                              TYPE_VECTOR_SUBPARTS
+                                                (vectype_out));
+      if (!alt_vectype_out
+         || maybe_ne (TYPE_VECTOR_SUBPARTS (alt_vectype_out),
+                      TYPE_VECTOR_SUBPARTS (vectype_out))
+         || !expand_vec_cond_expr_p (alt_vectype_out, vectype_out))
+       alt_vectype_out = NULL_TREE;
+    }
+
   internal_fn reduc_fn = IFN_LAST;
   if (reduction_type == TREE_CODE_REDUCTION
       || reduction_type == FOLD_LEFT_REDUCTION
@@ -7359,9 +7414,26 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
          ? fold_left_reduction_fn (orig_code, &reduc_fn)
          : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
        {
-         if (reduc_fn != IFN_LAST
-             && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
-                                                 OPTIMIZE_FOR_SPEED))
+         internal_fn sbool_fn = IFN_LAST;
+         if (reduc_fn == IFN_LAST)
+           ;
+         else if ((!VECTOR_BOOLEAN_TYPE_P (vectype_out)
+                   || (GET_MODE_CLASS (TYPE_MODE (vectype_out))
+                       == MODE_VECTOR_BOOL))
+                  && direct_internal_fn_supported_p (reduc_fn, vectype_out,
+                                                     OPTIMIZE_FOR_SPEED))
+           ;
+         else if (VECTOR_BOOLEAN_TYPE_P (vectype_out)
+                  && sbool_reduction_fn_for_fn (reduc_fn, &sbool_fn)
+                  && direct_internal_fn_supported_p (sbool_fn, vectype_out,
+                                                     OPTIMIZE_FOR_SPEED))
+           reduc_fn = sbool_fn;
+         else if (reduction_type != FOLD_LEFT_REDUCTION
+                  && alt_vectype_out
+                  && direct_internal_fn_supported_p (reduc_fn, alt_vectype_out,
+                                                     OPTIMIZE_FOR_SPEED))
+           VECT_REDUC_INFO_VECTYPE_FOR_MASK (reduc_info) = alt_vectype_out;
+         else
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -7378,6 +7450,19 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
 
          return false;
        }
+      if (reduc_fn == IFN_LAST
+         && VECTOR_BOOLEAN_TYPE_P (vectype_out))
+       {
+         if (!alt_vectype_out)
+           {
+             if (dump_enabled_p ())
+               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                "cannot turn mask into bool data vector for "
+                                "reduction epilogue.\n");
+             return false;
+           }
+         VECT_REDUC_INFO_VECTYPE_FOR_MASK (reduc_info) = alt_vectype_out;
+       }
     }
   else if (reduction_type == COND_REDUCTION)
     {
diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
index a5c721e6153..e845e5d15e2 100644
--- a/gcc/tree-vect-patterns.cc
+++ b/gcc/tree-vect-patterns.cc
@@ -7157,13 +7157,14 @@ possible_vector_mask_operation_p (stmt_vec_info stmt_info)
 
 /* If STMT_INFO sets a boolean SSA_NAME, see whether we should use
    a vector mask type instead of a normal vector type.  Record the
-   result in STMT_INFO->mask_precision.  */
+   result in STMT_INFO->mask_precision.  Returns true when the
+   precision changed.  */
 
-static void
+static bool
 vect_determine_mask_precision (vec_info *vinfo, stmt_vec_info stmt_info)
 {
   if (!possible_vector_mask_operation_p (stmt_info))
-    return;
+    return false;
 
   /* If at least one boolean input uses a vector mask type,
      pick the mask type with the narrowest elements.
@@ -7245,8 +7246,11 @@ vect_determine_mask_precision (vec_info *vinfo, stmt_vec_info stmt_info)
          scalar_mode mode;
          tree vectype, mask_type;
          if (is_a <scalar_mode> (TYPE_MODE (op0_type), &mode)
-             && (vectype = get_vectype_for_scalar_type (vinfo, op0_type))
-             && (mask_type = get_mask_type_for_scalar_type (vinfo, op0_type))
+             /* Do not allow this to set vinfo->vector_mode, this might
+                disrupt the result for the next iteration.  */
+             && (vectype = get_related_vectype_for_scalar_type
+                                               (vinfo->vector_mode, op0_type))
+             && (mask_type = truth_type_for (vectype))
              && expand_vec_cmp_expr_p (vectype, mask_type, code))
            precision = GET_MODE_BITSIZE (mode);
        }
@@ -7272,19 +7276,30 @@ vect_determine_mask_precision (vec_info *vinfo, stmt_vec_info stmt_info)
        }
     }
 
-  if (dump_enabled_p ())
+  if (stmt_info->mask_precision != precision)
     {
-      if (precision == ~0U)
-       dump_printf_loc (MSG_NOTE, vect_location,
-                        "using normal nonmask vectors for %G",
-                        stmt_info->stmt);
-      else
-       dump_printf_loc (MSG_NOTE, vect_location,
-                        "using boolean precision %d for %G",
-                        precision, stmt_info->stmt);
-    }
+      if (dump_enabled_p ())
+       {
+         if (precision == ~0U)
+           dump_printf_loc (MSG_NOTE, vect_location,
+                            "using normal nonmask vectors for %G",
+                            stmt_info->stmt);
+         else
+           dump_printf_loc (MSG_NOTE, vect_location,
+                            "using boolean precision %d for %G",
+                            precision, stmt_info->stmt);
+       }
 
-  stmt_info->mask_precision = precision;
+      /* ???  We'd like to assert stmt_info->mask_precision == 0
+        || stmt_info->mask_precision > precision, thus that we only
+        decrease mask precisions throughout iteration, but the
+        tcc_comparison handling above means for comparisons of bools
+        we start with 8 but might increase in case the bools get mask
+        precision on their own.  */
+      stmt_info->mask_precision = precision;
+      return true;
+    }
+  return false;
 }
 
 /* Handle vect_determine_precisions for STMT_INFO, given that we
@@ -7317,22 +7332,33 @@ vect_determine_precisions (vec_info *vinfo)
 
   DUMP_VECT_SCOPE ("vect_determine_precisions");
 
-  for (unsigned int i = 0; i < nbbs; i++)
+  /* For mask precisions we have to iterate since otherwise we do not
+     get reduction PHI precision correct.  For now do this only for
+     loop vectorization.  */
+  bool changed;
+  do
     {
-      basic_block bb = bbs[i];
-      for (auto gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
-       {
-         stmt_vec_info stmt_info = vinfo->lookup_stmt (gsi.phi ());
-         if (stmt_info && STMT_VINFO_VECTORIZABLE (stmt_info))
-           vect_determine_mask_precision (vinfo, stmt_info);
-       }
-      for (auto gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
+      changed = false;
+      for (unsigned int i = 0; i < nbbs; i++)
        {
-         stmt_vec_info stmt_info = vinfo->lookup_stmt (gsi_stmt (gsi));
-         if (stmt_info && STMT_VINFO_VECTORIZABLE (stmt_info))
-           vect_determine_mask_precision (vinfo, stmt_info);
+         basic_block bb = bbs[i];
+         for (auto gsi = gsi_start_phis (bb);
+              !gsi_end_p (gsi); gsi_next (&gsi))
+           {
+             stmt_vec_info stmt_info = vinfo->lookup_stmt (gsi.phi ());
+             if (stmt_info && STMT_VINFO_VECTORIZABLE (stmt_info))
+               changed |= vect_determine_mask_precision (vinfo, stmt_info);
+           }
+         for (auto gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
+           {
+             stmt_vec_info stmt_info = vinfo->lookup_stmt (gsi_stmt (gsi));
+             if (stmt_info && STMT_VINFO_VECTORIZABLE (stmt_info))
+               changed |= vect_determine_mask_precision (vinfo, stmt_info);
+           }
        }
     }
+  while (changed && is_a <loop_vec_info> (vinfo));
+
   for (unsigned int i = 0; i < nbbs; i++)
     {
       basic_block bb = bbs[nbbs - i - 1];
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index b7b6daf81b3..0ea238ee4bf 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -855,6 +855,10 @@ public:
   /* The vector type for performing the actual reduction operation.  */
   tree reduc_vectype;
 
+  /* The vector type we should use for the final reduction in the epilogue
+     when we reduce a mask.  */
+  tree reduc_vectype_for_mask;
+
   /* For INTEGER_INDUC_COND_REDUCTION, the initial value to be used.  */
   tree induc_cond_initial_val;
 
@@ -888,6 +892,7 @@ typedef class vect_reduc_info_s *vect_reduc_info;
 #define VECT_REDUC_INFO_INDUC_COND_INITIAL_VAL(I) ((I)->induc_cond_initial_val)
 #define VECT_REDUC_INFO_EPILOGUE_ADJUSTMENT(I) ((I)->reduc_epilogue_adjustment)
 #define VECT_REDUC_INFO_VECTYPE(I) ((I)->reduc_vectype)
+#define VECT_REDUC_INFO_VECTYPE_FOR_MASK(I) ((I)->reduc_vectype_for_mask)
 #define VECT_REDUC_INFO_FORCE_SINGLE_CYCLE(I) ((I)->force_single_cycle)
 #define VECT_REDUC_INFO_RESULT_POS(I) ((I)->reduc_result_pos)
 
-- 
2.51.0

Reply via email to