This change adds a function that checks for SLP nodes with multiple occurrences
of the same statement (e.g. {A, B, A, B, ...}) and tries to rearrange the node
so that there are no duplicates. A vec_perm is then introduced to recreate the
original ordering. These duplicates can appear due to how two_operators nodes
are handled, and they prevent vectorization in some cases.

This targets the vectorization of the SPEC2017 x264 pixel_satd functions.
In some processors a larger than 10% improvement on x264 has been observed.

See also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98138

gcc/ChangeLog:

        * tree-vect-slp.cc (enum slp_oprnd_pattern): new enum for rearrangement
        patterns.
        (try_rearrange_oprnd_info): Detect if a node corresponds to one of the
        patterns.

gcc/testsuite/ChangeLog:

        * gcc.target/aarch64/vect-slp-two-operator.c: New test.

Signed-off-by: Manolis Tsamis <manolis.tsa...@vrull.eu>
---

 .../aarch64/vect-slp-two-operator.c           |  42 ++++
 gcc/tree-vect-slp.cc                          | 234 ++++++++++++++++++
 2 files changed, 276 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/vect-slp-two-operator.c

diff --git a/gcc/testsuite/gcc.target/aarch64/vect-slp-two-operator.c 
b/gcc/testsuite/gcc.target/aarch64/vect-slp-two-operator.c
new file mode 100644
index 00000000000..2db066a0b6e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vect-slp-two-operator.c
@@ -0,0 +1,42 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect 
-fdump-tree-vect-details" } */
+
+typedef unsigned char uint8_t;
+typedef unsigned int uint32_t;
+
+#define HADAMARD4(d0, d1, d2, d3, s0, s1, s2, s3) {\
+    int t0 = s0 + s1;\
+    int t1 = s0 - s1;\
+    int t2 = s2 + s3;\
+    int t3 = s2 - s3;\
+    d0 = t0 + t2;\
+    d1 = t1 + t3;\
+    d2 = t0 - t2;\
+    d3 = t1 - t3;\
+}
+
+static uint32_t abs2( uint32_t a )
+{
+    uint32_t s = ((a>>15)&0x10001)*0xffff;
+    return (a+s)^s;
+}
+
+void sink(uint32_t tmp[4][4]);
+
+int x264_pixel_satd_8x4( uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
+{
+    uint32_t tmp[4][4];
+    int sum = 0;
+    for( int i = 0; i < 4; i++, pix1 += i_pix1, pix2 += i_pix2 )
+    {
+        uint32_t a0 = (pix1[0] - pix2[0]) + ((pix1[4] - pix2[4]) << 16);
+        uint32_t a1 = (pix1[1] - pix2[1]) + ((pix1[5] - pix2[5]) << 16);
+        uint32_t a2 = (pix1[2] - pix2[2]) + ((pix1[6] - pix2[6]) << 16);
+        uint32_t a3 = (pix1[3] - pix2[3]) + ((pix1[7] - pix2[7]) << 16);
+        HADAMARD4( tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], a0,a1,a2,a3 );
+    }
+    sink(tmp);
+}
+
+/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index bf1f467f53f..e395db0e185 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -40,6 +40,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "tree-vectorizer.h"
 #include "langhooks.h"
 #include "gimple-walk.h"
+#include "gimple-pretty-print.h"
 #include "dbgcnt.h"
 #include "tree-vector-builder.h"
 #include "vec-perm-indices.h"
@@ -1829,6 +1830,141 @@ vect_slp_build_two_operator_nodes (slp_tree perm, tree 
vectype,
   SLP_TREE_CHILDREN (perm).quick_push (child2);
 }
 
+enum slp_oprnd_pattern
+{
+  SLP_OPRND_PATTERN_NONE,
+  SLP_OPRND_PATTERN_ABAB,
+  SLP_OPRND_PATTERN_AABB,
+  SLP_OPRND_PATTERN_ABBA
+};
+
+/* Check if OPRNDS_INFO has duplicated nodes that correspond to a predefined
+   pattern described by SLP_OPRND_PATTERN and return it.  */
+
+static int
+try_rearrange_oprnd_info (vec<slp_oprnd_info> &oprnds_info, unsigned 
group_size)
+{
+  unsigned i;
+  slp_oprnd_info info;
+
+  if (oprnds_info.length () != 2 || group_size % 4 != 0)
+    return SLP_OPRND_PATTERN_NONE;
+
+  if (!oprnds_info[0]->def_stmts[0]
+      || !is_a<gassign *> (oprnds_info[0]->def_stmts[0]->stmt))
+    return SLP_OPRND_PATTERN_NONE;
+
+  enum tree_code code
+    = gimple_assign_rhs_code (oprnds_info[0]->def_stmts[0]->stmt);
+  FOR_EACH_VEC_ELT (oprnds_info, i, info)
+    for (unsigned int j = 0; j < group_size; j += 1)
+      {
+       if (!info->def_stmts[j]
+           || !is_a<gassign *> (info->def_stmts[j]->stmt)
+           || STMT_VINFO_DATA_REF (info->def_stmts[j]))
+         return SLP_OPRND_PATTERN_NONE;
+       /* Don't mix different operations.  */
+       if (gimple_assign_rhs_code (info->def_stmts[j]->stmt) != code)
+         return SLP_OPRND_PATTERN_NONE;
+      }
+
+  if (gimple_assign_rhs_code (oprnds_info[0]->def_stmts[0]->stmt)
+      != gimple_assign_rhs_code (oprnds_info[1]->def_stmts[0]->stmt))
+    return SLP_OPRND_PATTERN_NONE;
+
+  int pattern = SLP_OPRND_PATTERN_NONE;
+  FOR_EACH_VEC_ELT (oprnds_info, i, info)
+    for (unsigned int j = 0; j < group_size; j += 4)
+      {
+       int cur_pattern = SLP_OPRND_PATTERN_NONE;
+       /* Check for an ABAB... pattern.  */
+       if ((info->def_stmts[j] == info->def_stmts[j + 2])
+           && (info->def_stmts[j + 1] == info->def_stmts[j + 3])
+           && (info->def_stmts[j] != info->def_stmts[j + 1]))
+         cur_pattern = SLP_OPRND_PATTERN_ABAB;
+       /* Check for an AABB... pattern.  */
+       else if ((info->def_stmts[j] == info->def_stmts[j + 1])
+                && (info->def_stmts[j + 2] == info->def_stmts[j + 3])
+                && (info->def_stmts[j] != info->def_stmts[j + 2]))
+         cur_pattern = SLP_OPRND_PATTERN_AABB;
+       /* Check for an ABBA... pattern.  */
+       else if ((info->def_stmts[j] == info->def_stmts[j + 3])
+                && (info->def_stmts[j + 1] == info->def_stmts[j + 2])
+                && (info->def_stmts[j] != info->def_stmts[j + 1]))
+         cur_pattern = SLP_OPRND_PATTERN_ABBA;
+       /* Unrecognised pattern.  */
+       else
+         return SLP_OPRND_PATTERN_NONE;
+
+       if (pattern == SLP_OPRND_PATTERN_NONE)
+         pattern = cur_pattern;
+       /* Multiple patterns detected.  */
+       else if (cur_pattern != pattern)
+         return SLP_OPRND_PATTERN_NONE;
+      }
+
+  gcc_checking_assert (pattern != SLP_OPRND_PATTERN_NONE);
+
+  if (dump_enabled_p ())
+    {
+      if (pattern == SLP_OPRND_PATTERN_ABAB)
+       dump_printf (MSG_NOTE, "ABAB");
+      else if (pattern == SLP_OPRND_PATTERN_AABB)
+       dump_printf (MSG_NOTE, "AABB");
+      else if (pattern == SLP_OPRND_PATTERN_ABBA)
+       dump_printf (MSG_NOTE, "ABBA");
+      dump_printf (MSG_NOTE, " pattern detected.\n");
+    }
+
+  if (pattern == SLP_OPRND_PATTERN_ABAB || pattern == SLP_OPRND_PATTERN_ABBA)
+    for (unsigned int j = 0; j < group_size; j += 4)
+      {
+       /* Given oprnd[0] -> A1, B1, A1, B1, A2, B2, A2, B2, ...
+          Given oprnd[1] -> C1, D1, C1, D1, C2, D2, C2, D2, ...
+          Create a single node -> A1, B1, C1, D1, A2, B2, C2, D2, ...  */
+       oprnds_info[0]->def_stmts[j+2] = oprnds_info[1]->def_stmts[j];
+       oprnds_info[0]->ops[j+2] = oprnds_info[1]->ops[j];
+       oprnds_info[0]->def_stmts[j+3] = oprnds_info[1]->def_stmts[j+1];
+       oprnds_info[0]->ops[j+3] = oprnds_info[1]->ops[j+1];
+      }
+  else if (pattern == SLP_OPRND_PATTERN_AABB)
+    for (unsigned int j = 0; j < group_size; j += 4)
+      {
+       /* Given oprnd[0] -> A1, A1, B1, B1, A2, A2, B2, B2, ...
+          Given oprnd[1] -> C1, C1, D1, D1, C2, C2, D2, D2, ...
+          Create a single node -> A1, C1, B1, D1, A2, C2, B2, D2, ...  */
+
+       /* The ordering here is at least to some extent arbitrary.
+          A generilized version needs to use some explicit ordering.  */
+       oprnds_info[0]->def_stmts[j+1] = oprnds_info[1]->def_stmts[j];
+       oprnds_info[0]->ops[j+1] = oprnds_info[1]->ops[j];
+       oprnds_info[0]->def_stmts[j+2] = oprnds_info[0]->def_stmts[j+2];
+       oprnds_info[0]->ops[j+2] = oprnds_info[0]->ops[j+2];
+       oprnds_info[0]->def_stmts[j+3] = oprnds_info[1]->def_stmts[j+2];
+       oprnds_info[0]->ops[j+3] = oprnds_info[1]->ops[j+2];
+      }
+
+  if (dump_enabled_p ())
+    {
+      dump_printf (MSG_NOTE, "Recurse with:\n");
+      for (unsigned int j = 0; j < group_size; j++)
+       {
+         dump_printf (MSG_NOTE, "  ");
+         print_gimple_stmt (dump_file, oprnds_info[0]->def_stmts[j]->stmt, 0);
+       }
+    }
+
+  /* Since we've merged the two nodes in one, make the second one a copy of
+     the first.  */
+  for (unsigned int j = 0; j < group_size; j++)
+    {
+      oprnds_info[1]->def_stmts[j] = oprnds_info[0]->def_stmts[j];
+      oprnds_info[1]->ops[j] = oprnds_info[0]->ops[j];
+    }
+
+  return pattern;
+}
+
 /* Recursively build an SLP tree starting from NODE.
    Fail (and return a value not equal to zero) if def-stmts are not
    isomorphic, require data permutation or are of unsupported types of
@@ -2409,6 +2545,10 @@ out:
 
   stmt_info = stmts[0];
 
+  int rearrange_pattern = SLP_OPRND_PATTERN_NONE;
+  if (is_a<gassign *> (stmt_info->stmt))
+    rearrange_pattern = try_rearrange_oprnd_info (oprnds_info, group_size);
+
   /* Create SLP_TREE nodes for the definition node/s.  */
   FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
     {
@@ -2669,6 +2809,100 @@ fail:
   *tree_size += this_tree_size + 1;
   *max_nunits = this_max_nunits;
 
+  /* If we applied any rearrangmenets then we need to reconstruct the original
+     elements with an additional permutation layer.  */
+  if (rearrange_pattern != SLP_OPRND_PATTERN_NONE)
+    {
+      slp_tree one =  new _slp_tree;
+      slp_tree two = new _slp_tree;
+      SLP_TREE_DEF_TYPE (one) = vect_internal_def;
+      SLP_TREE_DEF_TYPE (two) = vect_internal_def;
+      SLP_TREE_VECTYPE (one) = vectype;
+      SLP_TREE_VECTYPE (two) = vectype;
+      SLP_TREE_CHILDREN (one).safe_splice (children);
+      SLP_TREE_CHILDREN (two).safe_splice (children);
+
+      SLP_TREE_CODE (one) = VEC_PERM_EXPR;
+      SLP_TREE_CODE (two) = VEC_PERM_EXPR;
+      SLP_TREE_REPRESENTATIVE (one) = stmts[0];
+      SLP_TREE_REPRESENTATIVE (two) = stmts[2];
+      lane_permutation_t &perm_one = SLP_TREE_LANE_PERMUTATION (one);
+      lane_permutation_t &perm_two = SLP_TREE_LANE_PERMUTATION (two);
+
+      if (rearrange_pattern == SLP_OPRND_PATTERN_ABAB)
+       {
+          /* Given a single node -> A1, B1, C1, D1, A2, B2, C2, D2, ...
+             Create node "one" -> A1, B1, A1, B1, A2, B2, A2, B2, ...
+             Create node "two" -> C1, D1, C1, D1, C2, D2, C2, D2, ...  */
+
+         for (unsigned int j = 0; j < group_size; j += 4)
+           {
+             perm_one.safe_push (std::make_pair (0, j + 0));
+             perm_one.safe_push (std::make_pair (0, j + 1));
+             perm_one.safe_push (std::make_pair (0, j + 0));
+             perm_one.safe_push (std::make_pair (0, j + 1));
+
+             perm_two.safe_push (std::make_pair (0, j + 2));
+             perm_two.safe_push (std::make_pair (0, j + 3));
+             perm_two.safe_push (std::make_pair (0, j + 2));
+             perm_two.safe_push (std::make_pair (0, j + 3));
+           }
+       }
+      else if (rearrange_pattern == SLP_OPRND_PATTERN_AABB)
+       {
+          /* Given a single node -> A1, C1, B1, D1, A2, C2, B2, D2, ...
+             Create node "one" -> A1, A1, B1, B1, A2, A2, B2, B2, ...
+             Create node "two" -> C1, C1, D1, D1, C2, C2, D2, D2, ...  */
+
+         for (unsigned int j = 0; j < group_size; j += 4)
+           {
+             perm_one.safe_push (std::make_pair (0, j + 0));
+             perm_one.safe_push (std::make_pair (0, j + 0));
+             perm_one.safe_push (std::make_pair (0, j + 2));
+             perm_one.safe_push (std::make_pair (0, j + 2));
+
+             perm_two.safe_push (std::make_pair (0, j + 1));
+             perm_two.safe_push (std::make_pair (0, j + 1));
+             perm_two.safe_push (std::make_pair (0, j + 3));
+             perm_two.safe_push (std::make_pair (0, j + 3));
+           }
+       }
+      else if (rearrange_pattern == SLP_OPRND_PATTERN_ABBA)
+       {
+          /* Given a single node -> A1, B1, C1, D1, A2, B2, C2, D2, ...
+             Create node "one" -> A1, B1, B1, A1, A2, B2, B2, A2, ...
+             Create node "two" -> C1, D1, D1, C1, C2, D2, D2, C2, ...  */
+
+         for (unsigned int j = 0; j < group_size; j += 4)
+           {
+             perm_one.safe_push (std::make_pair (0, j + 0));
+             perm_one.safe_push (std::make_pair (0, j + 1));
+             perm_one.safe_push (std::make_pair (0, j + 1));
+             perm_one.safe_push (std::make_pair (0, j + 0));
+
+             perm_two.safe_push (std::make_pair (0, j + 2));
+             perm_two.safe_push (std::make_pair (0, j + 3));
+             perm_two.safe_push (std::make_pair (0, j + 3));
+             perm_two.safe_push (std::make_pair (0, j + 2));
+           }
+       }
+
+      slp_tree child;
+      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
+       SLP_TREE_REF_COUNT (child)++;
+
+      node = vect_create_new_slp_node (node, stmts, 2);
+      SLP_TREE_VECTYPE (node) = vectype;
+      SLP_TREE_CHILDREN (node).quick_push (one);
+      SLP_TREE_CHILDREN (node).quick_push (two);
+
+      SLP_TREE_LANES (one) = stmts.length ();
+      SLP_TREE_LANES (two) = stmts.length ();
+
+      children.truncate (0);
+      children.safe_splice (SLP_TREE_CHILDREN (node));
+    }
+
   if (two_operators)
     {
       /* ???  We'd likely want to either cache in bst_map sth like
-- 
2.44.0

Reply via email to