The following supports vectorizing BB reductions involving a
constant or an invariant.

Bootstrapped and tested on x86_64-unknown-linux-gnu, pushed.

        * tree-vectorizer.h (_slp_instance::remain_stmts): Change
        to ...
        (_slp_instance::remain_defs): ... this.
        (SLP_INSTANCE_REMAIN_STMTS): Rename to ...
        (SLP_INSTANCE_REMAIN_DEFS): ... this.
        (slp_root::remain): New.
        (slp_root::slp_root): Adjust.
        * tree-vect-slp.cc (vect_free_slp_instance): Adjust.
        (vect_build_slp_instance): Get extra remain parameter,
        adjust former handling of a cut off stmt.
        (vect_analyze_slp_instance): Adjust.
        (vect_analyze_slp): Likewise.
        (_bb_vec_info::~_bb_vec_info): Likewise.
        (vectorizable_bb_reduc_epilogue): Dump something if we fail.
        (vect_slp_check_for_constructors): Handle non-internal
        defs as remain defs of a reduction.
        (vectorize_slp_instance_root_stmt): Adjust.

        * gcc.dg/vect/bb-slp-75.c: New testcase.
---
 gcc/testsuite/gcc.dg/vect/bb-slp-75.c | 25 +++++++++++
 gcc/tree-vect-slp.cc                  | 60 ++++++++++++++++++---------
 gcc/tree-vectorizer.h                 |  9 ++--
 3 files changed, 71 insertions(+), 23 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/bb-slp-75.c

diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-75.c b/gcc/testsuite/gcc.dg/vect/bb-slp-75.c
new file mode 100644
index 00000000000..1abac136f72
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/bb-slp-75.c
@@ -0,0 +1,25 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_float } */
+/* { dg-additional-options "-ffast-math" } */
+/* { dg-additional-options "-msse2 -mfpmath=sse" { target { x86_64-*-* i?86-*-* } } } */
+
+float x[4];
+
+float test1 (float a)
+{
+  return x[0] + x[2] + x[1] + x[3] + a;
+}
+
+float test2 (void)
+{
+  return x[3] + x[2] + x[1] + 1.f + x[0];
+}
+
+float test3 (float a)
+{
+  return x[0] + a + x[2] + x[1] + x[3] + 1.f;
+}
+
+/* We currently require a .REDUC_PLUS direct internal function but do not
+   have a dejagnu target for this.  */
+/* { dg-final { scan-tree-dump-times "Basic block will be vectorized using SLP" 3 "slp2" { target { x86_64-*-* i?86-*-* } } } } */
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index 41997d5a546..cf91b21cf7d 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -209,7 +209,7 @@ vect_free_slp_instance (slp_instance instance)
   vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
   SLP_INSTANCE_LOADS (instance).release ();
   SLP_INSTANCE_ROOT_STMTS (instance).release ();
-  SLP_INSTANCE_REMAIN_STMTS (instance).release ();
+  SLP_INSTANCE_REMAIN_DEFS (instance).release ();
   instance->subgraph_entries.release ();
   instance->cost_vec.release ();
   free (instance);
@@ -3115,6 +3115,7 @@ vect_build_slp_instance (vec_info *vinfo,
                         slp_instance_kind kind,
                         vec<stmt_vec_info> &scalar_stmts,
                         vec<stmt_vec_info> &root_stmt_infos,
+                        vec<tree> &remain,
                         unsigned max_tree_size, unsigned *limit,
                         scalar_stmts_to_slp_tree_map_t *bst_map,
                         /* ???  We need stmt_info for group splitting.  */
@@ -3134,10 +3135,9 @@ vect_build_slp_instance (vec_info *vinfo,
      ???  Selecting the optimal set of lanes to vectorize would be nice
      but SLP build for all lanes will fail quickly because we think
      we're going to need unrolling.  */
-  auto_vec<stmt_vec_info> remain;
   if (kind == slp_inst_kind_bb_reduc
       && (scalar_stmts.length () & 1))
-    remain.safe_push (scalar_stmts.pop ());
+    remain.safe_insert (0, gimple_get_lhs (scalar_stmts.pop ()->stmt));
 
   /* Build the tree for the SLP instance.  */
   unsigned int group_size = scalar_stmts.length ();
@@ -3186,10 +3186,7 @@ vect_build_slp_instance (vec_info *vinfo,
          SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
          SLP_INSTANCE_LOADS (new_instance) = vNULL;
          SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
-         if (!remain.is_empty ())
-           SLP_INSTANCE_REMAIN_STMTS (new_instance) = remain.copy ();
-         else
-           SLP_INSTANCE_REMAIN_STMTS (new_instance) = vNULL;
+         SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
          SLP_INSTANCE_KIND (new_instance) = kind;
          new_instance->reduc_phis = NULL;
          new_instance->cost_vec = vNULL;
@@ -3469,6 +3466,7 @@ vect_analyze_slp_instance (vec_info *vinfo,
     gcc_unreachable ();
 
   vec<stmt_vec_info> roots = vNULL;
+  vec<tree> remain = vNULL;
   if (kind == slp_inst_kind_ctor)
     {
       roots.create (1);
@@ -3476,7 +3474,7 @@ vect_analyze_slp_instance (vec_info *vinfo,
     }
   /* Build the tree for the SLP instance.  */
   bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
-                                     roots,
+                                     roots, remain,
                                      max_tree_size, limit, bst_map,
                                      kind == slp_inst_kind_store
                                      ? stmt_info : NULL);
@@ -3521,10 +3519,12 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
          if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
                                       bb_vinfo->roots[i].stmts,
                                       bb_vinfo->roots[i].roots,
+                                      bb_vinfo->roots[i].remain,
                                       max_tree_size, &limit, bst_map, NULL))
            {
              bb_vinfo->roots[i].stmts = vNULL;
              bb_vinfo->roots[i].roots = vNULL;
+             bb_vinfo->roots[i].remain = vNULL;
            }
        }
     }
@@ -5955,6 +5955,7 @@ _bb_vec_info::~_bb_vec_info ()
     {
       roots[i].stmts.release ();
       roots[i].roots.release ();
+      roots[i].remain.release ();
     }
   roots.release ();
 }
@@ -6405,7 +6406,13 @@ vectorizable_bb_reduc_epilogue (slp_instance instance,
       || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH)
       || !useless_type_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)),
                                     TREE_TYPE (vectype)))
-    return false;
+    {
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "not vectorized: basic block reduction epilogue "
+                        "operation unsupported.\n");
+      return false;
+    }
 
   /* There's no way to cost a horizontal vector reduction via REDUC_FN so
      cost log2 vector operations plus shuffles and one extraction.  */
@@ -7262,22 +7269,37 @@ vect_slp_check_for_constructors (bb_vec_info bb_vinfo)
                 but record those to be handled in the epilogue.  */
              /* ???  For now do not allow mixing ops or externs/constants.  */
              bool invalid = false;
+             unsigned remain_cnt = 0;
              for (unsigned i = 0; i < chain.length (); ++i)
-               if (chain[i].dt != vect_internal_def
-                   || chain[i].code != code)
-                 invalid = true;
-             if (!invalid)
+               {
+                 if (chain[i].code != code)
+                   {
+                     invalid = true;
+                     break;
+                   }
+                 if (chain[i].dt != vect_internal_def)
+                   remain_cnt++;
+               }
+             if (!invalid && chain.length () - remain_cnt > 1)
                {
                  vec<stmt_vec_info> stmts;
+                 vec<tree> remain = vNULL;
                  stmts.create (chain.length ());
+                 if (remain_cnt > 0)
+                   remain.create (remain_cnt);
                  for (unsigned i = 0; i < chain.length (); ++i)
-                   stmts.quick_push (bb_vinfo->lookup_def (chain[i].op));
+                   {
+                     if (chain[i].dt == vect_internal_def)
+                       stmts.quick_push (bb_vinfo->lookup_def (chain[i].op));
+                     else
+                       remain.quick_push (chain[i].op);
+                   }
                  vec<stmt_vec_info> roots;
                  roots.create (chain_stmts.length ());
                  for (unsigned i = 0; i < chain_stmts.length (); ++i)
                    roots.quick_push (bb_vinfo->lookup_stmt (chain_stmts[i]));
                  bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_bb_reduc,
-                                                      stmts, roots));
+                                                      stmts, roots, remain));
                }
            }
        }
@@ -9160,16 +9182,16 @@ vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
        gcc_unreachable ();
       tree scalar_def = gimple_build (&epilogue, as_combined_fn (reduc_fn),
                                      TREE_TYPE (TREE_TYPE (vec_def)), vec_def);
-      if (!SLP_INSTANCE_REMAIN_STMTS (instance).is_empty ())
+      if (!SLP_INSTANCE_REMAIN_DEFS (instance).is_empty ())
        {
          tree rem_def = NULL_TREE;
-         for (auto rem : SLP_INSTANCE_REMAIN_STMTS (instance))
+         for (auto def : SLP_INSTANCE_REMAIN_DEFS (instance))
            if (!rem_def)
-             rem_def = gimple_get_lhs (rem->stmt);
+             rem_def = def;
            else
              rem_def = gimple_build (&epilogue, reduc_code,
                                      TREE_TYPE (scalar_def),
-                                     rem_def, gimple_get_lhs (rem->stmt));
+                                     rem_def, def);
          scalar_def = gimple_build (&epilogue, reduc_code,
                                     TREE_TYPE (scalar_def),
                                     scalar_def, rem_def);
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 5987a327332..1de144988c8 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -259,7 +259,7 @@ public:
 
   /* For slp_inst_kind_bb_reduc the defs that were not vectorized, NULL
      otherwise.  */
-  vec<stmt_vec_info> remain_stmts;
+  vec<tree> remain_defs;
 
   /* The unrolling factor required to vectorized this SLP instance.  */
   poly_uint64 unrolling_factor;
@@ -289,7 +289,7 @@ public:
 #define SLP_INSTANCE_UNROLLING_FACTOR(S)         (S)->unrolling_factor
 #define SLP_INSTANCE_LOADS(S)                    (S)->loads
 #define SLP_INSTANCE_ROOT_STMTS(S)               (S)->root_stmts
-#define SLP_INSTANCE_REMAIN_STMTS(S)             (S)->remain_stmts
+#define SLP_INSTANCE_REMAIN_DEFS(S)              (S)->remain_defs
 #define SLP_INSTANCE_KIND(S)                     (S)->kind
 
 #define SLP_TREE_CHILDREN(S)                     (S)->children
@@ -1027,11 +1027,12 @@ loop_vec_info_for_loop (class loop *loop)
 struct slp_root
 {
   slp_root (slp_instance_kind kind_, vec<stmt_vec_info> stmts_,
-           vec<stmt_vec_info> roots_)
-    : kind(kind_), stmts(stmts_), roots(roots_) {}
+           vec<stmt_vec_info> roots_, vec<tree> remain_ = vNULL)
+    : kind(kind_), stmts(stmts_), roots(roots_), remain(remain_) {}
   slp_instance_kind kind;
   vec<stmt_vec_info> stmts;
   vec<stmt_vec_info> roots;
+  vec<tree> remain;
 };
 
 typedef class _bb_vec_info : public vec_info
-- 
2.35.3

Reply via email to