The following attempts to account for the fact that BB vectorization
regions now can span multiple loop levels and that an unprofitable
inner loop vectorization shouldn't be offset by a profitable
outer loop vectorization to make it overall profitable.

For now I've implemented a heuristic based on the premise that
vectorization should be profitable even if loops may not be entered
or if they iterate any number of times.  The first assumption in
particular requires that stmts directly belonging to one loop be
costed separately from stmts belonging to another loop, which
also simplifies the implementation.

On x86 the added testcase has in the outer loop

t.c:38:20: note: Cost model analysis for part in loop 1:
  Vector cost: 56
  Scalar cost: 192

and the inner loop

t.c:38:20: note: Cost model analysis for part in loop 2:
  Vector cost: 132
  Scalar cost: 48

and thus the vectorization is considered not profitable
(note the same would happen if the 2nd cost were for a loop
outer to the one of the 1st costing).

Future enhancements may consider static knowledge of whether
a loop is always entered which would allow some inefficiency
in the vectorization of its loop header.  Likewise stmts only
reachable from a loop exit can be treated this way.

Bootstrapped and tested on x86_64-unknown-linux-gnu and it fixes
the regression reported in the PR.

Does this look sensible and good enough for GCC 11?

Thanks,
Richard.

2021-02-05  Richard Biener  <rguent...@suse.de>

        PR tree-optimization/98855
        * tree-vectorizer.h (add_stmt_cost): New overload.
        * tree-vect-slp.c (li_cost_vec_cmp): New.
        (vect_bb_slp_scalar_cost): Cost individual loop regions
        separately.  Account for the scalar instance root stmt.

        * g++.dg/vect/slp-pr98855.cc: New testcase.
---
 gcc/testsuite/g++.dg/vect/slp-pr98855.cc |  84 +++++++++++
 gcc/tree-vect-slp.c                      | 171 ++++++++++++++++++-----
 gcc/tree-vectorizer.h                    |   9 ++
 3 files changed, 230 insertions(+), 34 deletions(-)
 create mode 100644 gcc/testsuite/g++.dg/vect/slp-pr98855.cc

diff --git a/gcc/testsuite/g++.dg/vect/slp-pr98855.cc b/gcc/testsuite/g++.dg/vect/slp-pr98855.cc
new file mode 100644
index 00000000000..0b4e479b513
--- /dev/null
+++ b/gcc/testsuite/g++.dg/vect/slp-pr98855.cc
@@ -0,0 +1,84 @@
+// { dg-do compile }
+// { dg-additional-options "-fvect-cost-model=cheap" }
+// { dg-additional-options "-mavx2" { target x86_64-*-* i?86-*-* } }
+
+#include <stdint.h>
+#include <stdlib.h>
+
+inline uint32_t make_uint32(uint8_t i0, uint8_t i1, uint8_t i2, uint8_t i3)
+{
+  return ((static_cast<uint32_t>(i0) << 24) |
+         (static_cast<uint32_t>(i1) << 16) |
+         (static_cast<uint32_t>(i2) <<  8) |
+         (static_cast<uint32_t>(i3)));
+}
+
+inline uint32_t load_be(const uint8_t in[], size_t off)
+{
+  in += off * sizeof(uint32_t);
+  return make_uint32(in[0], in[1], in[2], in[3]);
+}
+
+template<typename T>
+inline void load_be(const uint8_t in[],
+                   T& x0, T& x1, T& x2, T& x3,
+                   T& x4, T& x5, T& x6, T& x7)
+{
+  x0 = load_be(in, 0);
+  x1 = load_be(in, 1);
+  x2 = load_be(in, 2);
+  x3 = load_be(in, 3);
+  x4 = load_be(in, 4);
+  x5 = load_be(in, 5);
+  x6 = load_be(in, 6);
+  x7 = load_be(in, 7);
+}
+
+inline void store_be(uint32_t in, uint8_t out[4])
+{
+  uint32_t o = __builtin_bswap32 (in);
+  __builtin_memcpy (out, &o, sizeof (uint32_t));
+}
+
+template<typename T>
+inline void store_be(uint8_t out[], T x0, T x1, T x2, T x3,
+                    T x4, T x5, T x6, T x7)
+{
+  store_be(x0, out + (0 * sizeof(T)));
+  store_be(x1, out + (1 * sizeof(T)));
+  store_be(x2, out + (2 * sizeof(T)));
+  store_be(x3, out + (3 * sizeof(T)));
+  store_be(x4, out + (4 * sizeof(T)));
+  store_be(x5, out + (5 * sizeof(T)));
+  store_be(x6, out + (6 * sizeof(T)));
+  store_be(x7, out + (7 * sizeof(T)));
+}
+
+#define BLOCK_SIZE 8
+void encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks, uint32_t *EK)
+{
+  const size_t blocks4 = blocks / 4;
+
+  for (size_t i = 0; i < blocks4; i++)
+    {
+      uint32_t L0, R0, L1, R1, L2, R2, L3, R3;
+      load_be(in + 4*BLOCK_SIZE*i, L0, R0, L1, R1, L2, R2, L3, R3);
+
+      for(size_t r = 0; r != 32; ++r)
+       {
+         L0 += (((R0 << 4) ^ (R0 >> 5)) + R0) ^ EK[2*r];
+         L1 += (((R1 << 4) ^ (R1 >> 5)) + R1) ^ EK[2*r];
+         L2 += (((R2 << 4) ^ (R2 >> 5)) + R2) ^ EK[2*r];
+         L3 += (((R3 << 4) ^ (R3 >> 5)) + R3) ^ EK[2*r];
+
+         R0 += (((L0 << 4) ^ (L0 >> 5)) + L0) ^ EK[2*r+1];
+         R1 += (((L1 << 4) ^ (L1 >> 5)) + L1) ^ EK[2*r+1];
+         R2 += (((L2 << 4) ^ (L2 >> 5)) + L2) ^ EK[2*r+1];
+         R3 += (((L3 << 4) ^ (L3 >> 5)) + L3) ^ EK[2*r+1];
+       }
+
+      store_be(out + 4*BLOCK_SIZE*i, L0, R0, L1, R1, L2, R2, L3, R3);
+    }
+}
+
+// { dg-final { scan-tree-dump-times "not vectorized: vectorization is not profitable" 2 "slp1" { target x86_64-*-* i?86-*-* } } }
diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
index 2305bbdec3a..25f58af3a43 100644
--- a/gcc/tree-vect-slp.c
+++ b/gcc/tree-vect-slp.c
@@ -4340,6 +4340,20 @@ vect_bb_slp_scalar_cost (vec_info *vinfo,
     }
 }
 
+/* Comparator for the loop-index sorted cost vectors.  */
+
+static int
+li_cost_vec_cmp (const void *a_, const void *b_)
+{
+  auto *a = (const std::pair<unsigned, stmt_info_for_cost *> *)a_;
+  auto *b = (const std::pair<unsigned, stmt_info_for_cost *> *)b_;
+  if (a->first < b->first)
+    return -1;
+  else if (a->first == b->first)
+    return 0;
+  return 1;
+}
+
 /* Check if vectorization of the basic block is profitable for the
    subgraph denoted by SLP_INSTANCES.  */
 
@@ -4352,61 +4366,150 @@ vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
   unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
   unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
 
-  void *vect_target_cost_data = init_cost (NULL);
-
   /* Calculate scalar cost and sum the cost for the vector stmts
      previously collected.  */
-  stmt_vector_for_cost scalar_costs;
-  scalar_costs.create (0);
+  stmt_vector_for_cost scalar_costs = vNULL;
+  stmt_vector_for_cost vector_costs = vNULL;
   hash_set<slp_tree> visited;
   FOR_EACH_VEC_ELT (slp_instances, i, instance)
     {
       auto_vec<bool, 20> life;
       life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
                              true);
+      if (SLP_INSTANCE_ROOT_STMT (instance))
+       record_stmt_cost (&scalar_costs, 1, scalar_stmt,
+                         SLP_INSTANCE_ROOT_STMT (instance), 0, vect_body);
       vect_bb_slp_scalar_cost (bb_vinfo,
                               SLP_INSTANCE_TREE (instance),
                               &life, &scalar_costs, visited);
-      add_stmt_costs (bb_vinfo, vect_target_cost_data, &instance->cost_vec);
+      vector_costs.safe_splice (instance->cost_vec);
       instance->cost_vec.release ();
     }
   /* Unset visited flag.  */
-  stmt_info_for_cost *si;
-  FOR_EACH_VEC_ELT (scalar_costs, i, si)
-    gimple_set_visited  (si->stmt_info->stmt, false);
+  stmt_info_for_cost *cost;
+  FOR_EACH_VEC_ELT (scalar_costs, i, cost)
+    gimple_set_visited  (cost->stmt_info->stmt, false);
 
-  void *scalar_target_cost_data = init_cost (NULL);
-  add_stmt_costs (bb_vinfo, scalar_target_cost_data, &scalar_costs);
-  scalar_costs.release ();
-  unsigned dummy;
-  finish_cost (scalar_target_cost_data, &dummy, &scalar_cost, &dummy);
-  destroy_cost_data (scalar_target_cost_data);
+  if (dump_enabled_p ())
+    dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
+
+  /* When costing non-loop vectorization we need to consider each covered
+     loop independently and make sure vectorization is profitable.  For
+     now we assume a loop may be not entered or executed an arbitrary
+     number of iterations (???  static information can provide more
+     precise info here) which means we can simply cost each containing
+     loops stmts separately.  */
+
+  /* First produce cost vectors sorted by loop index.  */
+  auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
+    li_scalar_costs (scalar_costs.length ());
+  auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
+    li_vector_costs (vector_costs.length ());
+  FOR_EACH_VEC_ELT (scalar_costs, i, cost)
+    {
+      unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
+      li_scalar_costs.quick_push (std::make_pair (l, cost));
+    }
+  unsigned l = li_scalar_costs[0].first;
+  FOR_EACH_VEC_ELT (vector_costs, i, cost)
+    {
+      /* We inherit from the previous SI, invariants, externals and
+        extracts immediately follow the cost for the related stmt.  */
+      if (cost->stmt_info)
+       l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
+      li_vector_costs.quick_push (std::make_pair (l, cost));
+    }
+  li_scalar_costs.qsort (li_cost_vec_cmp);
+  li_vector_costs.qsort (li_cost_vec_cmp);
+
+  /* Now cost the portions individually.  */
+  unsigned vi = 0;
+  unsigned si = 0;
+  do
+    {
+      unsigned sl = li_scalar_costs[si].first;
+      unsigned vl = li_vector_costs[vi].first;
+      if (sl != vl)
+       {
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_NOTE, vect_location,
+                            "Scalar %d and vector %d loop part do not "
+                            "match up, skipping scalar part\n", sl, vl);
+         /* Skip the scalar part, assuming zero cost on the vector side.  */
+         do
+           {
+             si++;
+           }
+         while (si < li_scalar_costs.length ()
+                && li_scalar_costs[si].first == sl);
+         continue;
+       }
 
-  /* Complete the target-specific vector cost calculation.  */
-  finish_cost (vect_target_cost_data, &vec_prologue_cost,
-              &vec_inside_cost, &vec_epilogue_cost);
-  destroy_cost_data (vect_target_cost_data);
+      void *scalar_target_cost_data = init_cost (NULL);
+      do
+       {
+         add_stmt_cost (bb_vinfo, scalar_target_cost_data,
+                        li_scalar_costs[si].second);
+         si++;
+       }
+      while (si < li_scalar_costs.length ()
+            && li_scalar_costs[si].first == sl);
+      unsigned dummy;
+      finish_cost (scalar_target_cost_data, &dummy, &scalar_cost, &dummy);
+      destroy_cost_data (scalar_target_cost_data);
+
+      /* Complete the target-specific vector cost calculation.  */
+      void *vect_target_cost_data = init_cost (NULL);
+      do
+       {
+         add_stmt_cost (bb_vinfo, vect_target_cost_data,
+                        li_vector_costs[vi].second);
+         vi++;
+       }
+      while (vi < li_vector_costs.length ()
+            && li_vector_costs[vi].first == vl);
+      finish_cost (vect_target_cost_data, &vec_prologue_cost,
+                  &vec_inside_cost, &vec_epilogue_cost);
+      destroy_cost_data (vect_target_cost_data);
 
-  vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
+      vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
 
-  if (dump_enabled_p ())
+      if (dump_enabled_p ())
+       {
+         dump_printf_loc (MSG_NOTE, vect_location,
+                          "Cost model analysis for part in loop %d:\n", sl);
+         dump_printf (MSG_NOTE, "  Vector cost: %d\n",
+                      vec_inside_cost + vec_outside_cost);
+         dump_printf (MSG_NOTE, "  Scalar cost: %d\n", scalar_cost);
+       }
+
+      /* Vectorization is profitable if its cost is more than the cost of scalar
+        version.  Note that we err on the vector side for equal cost because
+        the cost estimate is otherwise quite pessimistic (constant uses are
+        free on the scalar side but cost a load on the vector side for
+        example).  */
+      if (vec_outside_cost + vec_inside_cost > scalar_cost)
+       {
+         scalar_costs.release ();
+         vector_costs.release ();
+         return false;
+       }
+    }
+  while (si < li_scalar_costs.length ()
+        && vi < li_vector_costs.length ());
+  if (vi < li_vector_costs.length ())
     {
-      dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
-      dump_printf (MSG_NOTE, "  Vector inside of basic block cost: %d\n",
-                  vec_inside_cost);
-      dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n", vec_prologue_cost);
-      dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n", vec_epilogue_cost);
-      dump_printf (MSG_NOTE, "  Scalar cost of basic block: %d\n", scalar_cost);
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_NOTE, vect_location,
+                        "Excess vector cost for part in loop %d:\n",
+                        li_vector_costs[vi].first);
+      scalar_costs.release ();
+      vector_costs.release ();
+      return false;
     }
 
-  /* Vectorization is profitable if its cost is more than the cost of scalar
-     version.  Note that we err on the vector side for equal cost because
-     the cost estimate is otherwise quite pessimistic (constant uses are
-     free on the scalar side but cost a load on the vector side for
-     example).  */
-  if (vec_outside_cost + vec_inside_cost > scalar_cost)
-    return false;
-
+  scalar_costs.release ();
+  vector_costs.release ();
   return true;
 }
 
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index e564fcf835a..b861c97ab3a 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -1481,6 +1481,15 @@ add_stmt_cost (vec_info *vinfo, void *data, int count,
   return cost;
 }
 
+/* Alias targetm.vectorize.add_stmt_cost.  */
+
+static inline unsigned
+add_stmt_cost (vec_info *vinfo, void *data, stmt_info_for_cost *i)
+{
+  return add_stmt_cost (vinfo, data, i->count, i->kind, i->stmt_info,
+                       i->vectype, i->misalign, i->where);
+}
+
 /* Alias targetm.vectorize.finish_cost.  */
 
 static inline void
-- 
2.26.2

Reply via email to