This makes the BB vectorizer cost independent SLP subgraphs
separately.  While on pristine trunk and for x86_64 I failed to
distill a testcase where the vectorizer would think _any_
basic-block vectorization opportunity is not profitable I do
have pending work that would make the cost savings of a
profitable opportunity make another independently not
profitable opportunity vectorized.

Bootstrapped on x86_64-unknown-linux-gnu, testing in progress.

CCing some people to double-check my graph partitioning algorithm.

Richard.

2020-09-08  Richard Biener  <rguent...@suse.de>

        PR tree-optimization/96043
        * tree-vectorizer.h (_slp_instance::cost_vec): New.
        (_slp_instance::subgraph_entries): Likewise.
        (BB_VINFO_TARGET_COST_DATA): Remove.
        * tree-vect-slp.c (vect_free_slp_instance): Free
        cost_vec and subgraph_entries.
        (vect_analyze_slp_instance): Initialize them.
        (vect_slp_analyze_operations): Defer passing costs to
        the target, instead record them in the SLP graph entry.
        (get_ultimate_leader): New helper for graph partitioning.
        (vect_bb_partition_graph_r): Likewise.
        (vect_bb_partition_graph): New function to partition the
        SLP graph into independently costable parts.
        (vect_bb_vectorization_profitable_p): Adjust to work on
        a subgraph.
        (vect_bb_vectorization_profitable_p): New wrapper,
        discarding non-profitable vectorization of subgraphs.
        (vect_slp_analyze_bb_1): Call vect_bb_partition_graph before
        costing.
---
 gcc/tree-vect-slp.c   | 190 ++++++++++++++++++++++++++++++++++++++----
 gcc/tree-vectorizer.h |   8 +-
 2 files changed, 180 insertions(+), 18 deletions(-)

diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
index 2b7fd685ef9..35e8985d159 100644
--- a/gcc/tree-vect-slp.c
+++ b/gcc/tree-vect-slp.c
@@ -126,6 +126,8 @@ vect_free_slp_instance (slp_instance instance, bool final_p)
 {
   vect_free_slp_tree (SLP_INSTANCE_TREE (instance), final_p);
   SLP_INSTANCE_LOADS (instance).release ();
+  instance->subgraph_entries.release ();
+  instance->cost_vec.release ();
   free (instance);
 }
 
@@ -2269,6 +2271,8 @@ vect_analyze_slp_instance (vec_info *vinfo,
          SLP_INSTANCE_LOADS (new_instance) = vNULL;
          SLP_INSTANCE_ROOT_STMT (new_instance) = constructor ? stmt_info : 
NULL;
          new_instance->reduc_phis = NULL;
+         new_instance->cost_vec = vNULL;
+         new_instance->subgraph_entries = vNULL;
 
          vect_gather_slp_loads (new_instance, node);
          if (dump_enabled_p ())
@@ -3153,8 +3157,8 @@ vect_slp_analyze_operations (vec_info *vinfo)
            visited.add (*x);
          i++;
 
-         add_stmt_costs (vinfo, vinfo->target_cost_data, &cost_vec);
-         cost_vec.release ();
+         /* Remember the SLP graph entry cost for later.  */
+         instance->cost_vec = cost_vec;
        }
     }
 
@@ -3162,18 +3166,113 @@ vect_slp_analyze_operations (vec_info *vinfo)
   if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
     {
       hash_set<stmt_vec_info> svisited;
-      stmt_vector_for_cost cost_vec;
-      cost_vec.create (2);
       for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
        vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
-                                    instance, &cost_vec, svisited);
-      add_stmt_costs (vinfo, vinfo->target_cost_data, &cost_vec);
-      cost_vec.release ();
+                                    instance, &instance->cost_vec, svisited);
     }
 
   return !vinfo->slp_instances.is_empty ();
 }
 
+/* Get the SLP instance leader from INSTANCE_LEADER thereby transitively
+   closing the eventual chain.  */
+
+static slp_instance
+get_ultimate_leader (slp_instance instance,
+                    hash_map<slp_instance, slp_instance> &instance_leader)
+{
+  auto_vec<slp_instance *, 8> chain;
+  slp_instance *tem;
+  while (*(tem = instance_leader.get (instance)) != instance)
+    {
+      chain.safe_push (tem);
+      instance = *tem;
+    }
+  while (!chain.is_empty ())
+    *chain.pop () = instance;
+  return instance;
+}
+
+/* Worker of vect_bb_partition_graph, recurse on NODE.  */
+
+static void
+vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
+                          slp_instance instance, slp_tree node,
+                          hash_map<stmt_vec_info, slp_instance> 
&stmt_to_instance,
+                          hash_map<slp_instance, slp_instance> 
&instance_leader)
+{
+  stmt_vec_info stmt_info;
+  unsigned i;
+  bool all = true;
+  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
+    {
+      bool existed_p;
+      slp_instance &stmt_instance
+       = stmt_to_instance.get_or_insert (stmt_info, &existed_p);
+      if (!existed_p)
+       {
+         all = false;
+       }
+      else if (stmt_instance != instance)
+       {
+         /* If we're running into a previously marked stmt make us the
+            leader of the current ultimate leader.  This keeps the
+            leader chain acyclic and works even when the current instance
+            connects two previously independent graph parts.  */
+         stmt_instance = get_ultimate_leader (stmt_instance, instance_leader);
+         if (stmt_instance != instance)
+           instance_leader.put (stmt_instance, instance);
+       }
+      stmt_instance = instance;
+    }
+  /* If not all stmts had been visited we have to recurse on children.  */
+  if (all)
+    return;
+
+  slp_tree child;
+  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
+    if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
+      vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
+                                instance_leader);
+}
+
+/* Partition the SLP graph into pieces that can be costed independently.  */
+
+static void
+vect_bb_partition_graph (bb_vec_info bb_vinfo)
+{
+  DUMP_VECT_SCOPE ("vect_bb_partition_graph");
+
+  /* First walk the SLP graph assigning each involved scalar stmt a
+     corresponding SLP graph entry and upon visiting a previously
+     marked stmt, make the stmts leader the current SLP graph entry.  */
+  hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
+  hash_map<slp_instance, slp_instance> instance_leader;
+  slp_instance instance;
+  for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
+    {
+      instance_leader.put (instance, instance);
+      vect_bb_partition_graph_r (bb_vinfo,
+                                instance, SLP_INSTANCE_TREE (instance),
+                                stmt_to_instance, instance_leader);
+    }
+
+  /* Then collect entries to each independent subgraphs.  */
+  for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
+    {
+      slp_instance leader = get_ultimate_leader (instance, instance_leader);
+      if (leader == instance)
+       leader->subgraph_entries.safe_push (leader);
+      else
+       {
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_NOTE, vect_location,
+                            "instance %p is leader of %p\n",
+                            leader, instance);
+         leader->subgraph_entries.safe_push (instance);
+       }
+    }
+}
 
 /* Compute the scalar cost of the SLP node NODE and its children
    and return it.  Do not account defs that are marked in LIFE and
@@ -3281,18 +3380,22 @@ vect_bb_slp_scalar_cost (vec_info *vinfo,
     }
 }
 
-/* Check if vectorization of the basic block is profitable.  */
+/* Check if vectorization of the basic block is profitable for the
+   subgraph denoted by SLP_INSTANCES.  */
 
 static bool
-vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
+vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
+                                   vec<slp_instance> slp_instances)
 {
-  vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
   slp_instance instance;
   int i;
   unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
   unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
 
-  /* Calculate scalar cost.  */
+  void *vect_target_cost_data = init_cost (NULL);
+
+  /* Calculate scalar cost and sum the cost for the vector stmts
+     previously collected.  */
   stmt_vector_for_cost scalar_costs;
   scalar_costs.create (0);
   hash_set<slp_tree> visited;
@@ -3304,22 +3407,25 @@ vect_bb_vectorization_profitable_p (bb_vec_info 
bb_vinfo)
       vect_bb_slp_scalar_cost (bb_vinfo,
                               SLP_INSTANCE_TREE (instance),
                               &life, &scalar_costs, visited);
+      add_stmt_costs (bb_vinfo, vect_target_cost_data, &instance->cost_vec);
+      instance->cost_vec.release ();
     }
   /* Unset visited flag.  */
   stmt_info_for_cost *si;
   FOR_EACH_VEC_ELT (scalar_costs, i, si)
     gimple_set_visited  (si->stmt_info->stmt, false);
 
-  void *target_cost_data = init_cost (NULL);
-  add_stmt_costs (bb_vinfo, target_cost_data, &scalar_costs);
+  void *scalar_target_cost_data = init_cost (NULL);
+  add_stmt_costs (bb_vinfo, scalar_target_cost_data, &scalar_costs);
   scalar_costs.release ();
   unsigned dummy;
-  finish_cost (target_cost_data, &dummy, &scalar_cost, &dummy);
-  destroy_cost_data (target_cost_data);
+  finish_cost (scalar_target_cost_data, &dummy, &scalar_cost, &dummy);
+  destroy_cost_data (scalar_target_cost_data);
 
-  /* Complete the target-specific cost calculation.  */
-  finish_cost (BB_VINFO_TARGET_COST_DATA (bb_vinfo), &vec_prologue_cost,
+  /* Complete the target-specific vector cost calculation.  */
+  finish_cost (vect_target_cost_data, &vec_prologue_cost,
               &vec_inside_cost, &vec_epilogue_cost);
+  destroy_cost_data (vect_target_cost_data);
 
   vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
 
@@ -3344,6 +3450,54 @@ vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
   return true;
 }
 
+/* For each SLP subgraph determine profitability and remove parts not so.
+   Returns true if any profitable to vectorize subgraph remains.  */
+
+static bool
+vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
+{
+  slp_instance instance;
+  unsigned i;
+
+  auto_vec<slp_instance> subgraphs (BB_VINFO_SLP_INSTANCES (bb_vinfo).length 
());
+  FOR_EACH_VEC_ELT (BB_VINFO_SLP_INSTANCES (bb_vinfo), i, instance)
+    if (!instance->subgraph_entries.is_empty ())
+      subgraphs.quick_push (instance);
+  BB_VINFO_SLP_INSTANCES (bb_vinfo).truncate (0);
+  for (i = 0; i < subgraphs.length ();)
+    {
+      instance = subgraphs[i];
+      if (!vect_bb_vectorization_profitable_p (bb_vinfo,
+                                              instance->subgraph_entries))
+       {
+         /* ???  We need to think of providing better dump/opt-report
+            locations here.  */
+         if (dump_enabled_p ())
+           {
+             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                              "not vectorized: vectorization is not "
+                              "profitable.\n");
+           }
+         slp_instance entry;
+         unsigned j;
+         FOR_EACH_VEC_ELT (instance->subgraph_entries, j, entry)
+           if (entry != instance)
+             vect_free_slp_instance (entry, false);
+         vect_free_slp_instance (instance, false);
+         subgraphs.ordered_remove (i);
+       }
+      else
+       {
+         slp_instance entry;
+         unsigned j;
+         FOR_EACH_VEC_ELT (instance->subgraph_entries, j, entry)
+           BB_VINFO_SLP_INSTANCES (bb_vinfo).safe_push (entry);
+         ++i;
+       }
+    }
+  return !BB_VINFO_SLP_INSTANCES (bb_vinfo).is_empty ();
+}
+
 /* Find any vectorizable constructors and add them to the grouped_store
    array.  */
 
@@ -3721,6 +3875,8 @@ vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, 
bool &fatal,
       return false;
     }
 
+  vect_bb_partition_graph (bb_vinfo);
+
   /* Cost model: check if the vectorization is worthwhile.  */
   if (!unlimited_cost_model (NULL)
       && !vect_bb_vectorization_profitable_p (bb_vinfo))
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index e555f465421..41a5cdbb26b 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -183,6 +183,13 @@ public:
 
   /* The SLP node containing the reduction PHIs.  */
   slp_tree reduc_phis;
+
+  /* Vector cost of this entry to the SLP graph.  */
+  stmt_vector_for_cost cost_vec;
+
+  /* If this instance is the main entry of a subgraph the set of
+     entries into the same subgraph, including itself.  */
+  vec<_slp_instance *> subgraph_entries;
 } *slp_instance;
 
 
@@ -915,7 +922,6 @@ public:
 #define BB_VINFO_SLP_INSTANCES(B)    (B)->slp_instances
 #define BB_VINFO_DATAREFS(B)         (B)->shared->datarefs
 #define BB_VINFO_DDRS(B)             (B)->shared->ddrs
-#define BB_VINFO_TARGET_COST_DATA(B) (B)->target_cost_data
 
 static inline bb_vec_info
 vec_info_for_bb (basic_block bb)
-- 
2.26.2

Reply via email to