This change adds a function that checks for SLP nodes with multiple occurrences of the same statement (e.g. {A, B, A, B, ...}) and tries to rearrange the node so that there are no duplicates. A vec_perm is then introduced to recreate the original ordering. These duplicates can appear due to how two_operators nodes are handled, and they prevent vectorization in some cases.
This targets the vectorization of the SPEC2017 x264 pixel_satd functions. In some processors a larger than 10% improvement on x264 has been observed. See also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98138 gcc/ChangeLog: * tree-vect-slp.cc (enum slp_oprnd_pattern): new enum for rearrangement patterns. (try_rearrange_oprnd_info): Detect if a node corresponds to one of the patterns. gcc/testsuite/ChangeLog: * gcc.target/aarch64/vect-slp-two-operator.c: New test. Signed-off-by: Manolis Tsamis <manolis.tsa...@vrull.eu> --- .../aarch64/vect-slp-two-operator.c | 42 ++++ gcc/tree-vect-slp.cc | 234 ++++++++++++++++++ 2 files changed, 276 insertions(+) create mode 100644 gcc/testsuite/gcc.target/aarch64/vect-slp-two-operator.c diff --git a/gcc/testsuite/gcc.target/aarch64/vect-slp-two-operator.c b/gcc/testsuite/gcc.target/aarch64/vect-slp-two-operator.c new file mode 100644 index 00000000000..2db066a0b6e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/vect-slp-two-operator.c @@ -0,0 +1,42 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect -fdump-tree-vect-details" } */ + +typedef unsigned char uint8_t; +typedef unsigned int uint32_t; + +#define HADAMARD4(d0, d1, d2, d3, s0, s1, s2, s3) {\ + int t0 = s0 + s1;\ + int t1 = s0 - s1;\ + int t2 = s2 + s3;\ + int t3 = s2 - s3;\ + d0 = t0 + t2;\ + d1 = t1 + t3;\ + d2 = t0 - t2;\ + d3 = t1 - t3;\ +} + +static uint32_t abs2( uint32_t a ) +{ + uint32_t s = ((a>>15)&0x10001)*0xffff; + return (a+s)^s; +} + +void sink(uint32_t tmp[4][4]); + +int x264_pixel_satd_8x4( uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 ) +{ + uint32_t tmp[4][4]; + int sum = 0; + for( int i = 0; i < 4; i++, pix1 += i_pix1, pix2 += i_pix2 ) + { + uint32_t a0 = (pix1[0] - pix2[0]) + ((pix1[4] - pix2[4]) << 16); + uint32_t a1 = (pix1[1] - pix2[1]) + ((pix1[5] - pix2[5]) << 16); + uint32_t a2 = (pix1[2] - pix2[2]) + ((pix1[6] - pix2[6]) << 16); + uint32_t a3 = (pix1[3] - pix2[3]) + ((pix1[7] - pix2[7]) << 16); + HADAMARD4( tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], a0,a1,a2,a3 ); + } + sink(tmp); +} + +/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" } } */ +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc index bf1f467f53f..e395db0e185 100644 --- a/gcc/tree-vect-slp.cc +++ b/gcc/tree-vect-slp.cc @@ -40,6 +40,7 @@ along with GCC; see the file COPYING3. If not see #include "tree-vectorizer.h" #include "langhooks.h" #include "gimple-walk.h" +#include "gimple-pretty-print.h" #include "dbgcnt.h" #include "tree-vector-builder.h" #include "vec-perm-indices.h" @@ -1829,6 +1830,141 @@ vect_slp_build_two_operator_nodes (slp_tree perm, tree vectype, SLP_TREE_CHILDREN (perm).quick_push (child2); } +enum slp_oprnd_pattern +{ + SLP_OPRND_PATTERN_NONE, + SLP_OPRND_PATTERN_ABAB, + SLP_OPRND_PATTERN_AABB, + SLP_OPRND_PATTERN_ABBA +}; + +/* Check if OPRNDS_INFO has duplicated nodes that correspond to a predefined + pattern described by SLP_OPRND_PATTERN and return it. */ + +static int +try_rearrange_oprnd_info (vec<slp_oprnd_info> &oprnds_info, unsigned group_size) +{ + unsigned i; + slp_oprnd_info info; + + if (oprnds_info.length () != 2 || group_size % 4 != 0) + return SLP_OPRND_PATTERN_NONE; + + if (!oprnds_info[0]->def_stmts[0] + || !is_a<gassign *> (oprnds_info[0]->def_stmts[0]->stmt)) + return SLP_OPRND_PATTERN_NONE; + + enum tree_code code + = gimple_assign_rhs_code (oprnds_info[0]->def_stmts[0]->stmt); + FOR_EACH_VEC_ELT (oprnds_info, i, info) + for (unsigned int j = 0; j < group_size; j += 1) + { + if (!info->def_stmts[j] + || !is_a<gassign *> (info->def_stmts[j]->stmt) + || STMT_VINFO_DATA_REF (info->def_stmts[j])) + return SLP_OPRND_PATTERN_NONE; + /* Don't mix different operations. */ + if (gimple_assign_rhs_code (info->def_stmts[j]->stmt) != code) + return SLP_OPRND_PATTERN_NONE; + } + + if (gimple_assign_rhs_code (oprnds_info[0]->def_stmts[0]->stmt) + != gimple_assign_rhs_code (oprnds_info[1]->def_stmts[0]->stmt)) + return SLP_OPRND_PATTERN_NONE; + + int pattern = SLP_OPRND_PATTERN_NONE; + FOR_EACH_VEC_ELT (oprnds_info, i, info) + for (unsigned int j = 0; j < group_size; j += 4) + { + int cur_pattern = SLP_OPRND_PATTERN_NONE; + /* Check for an ABAB... pattern. */ + if ((info->def_stmts[j] == info->def_stmts[j + 2]) + && (info->def_stmts[j + 1] == info->def_stmts[j + 3]) + && (info->def_stmts[j] != info->def_stmts[j + 1])) + cur_pattern = SLP_OPRND_PATTERN_ABAB; + /* Check for an AABB... pattern. */ + else if ((info->def_stmts[j] == info->def_stmts[j + 1]) + && (info->def_stmts[j + 2] == info->def_stmts[j + 3]) + && (info->def_stmts[j] != info->def_stmts[j + 2])) + cur_pattern = SLP_OPRND_PATTERN_AABB; + /* Check for an ABBA... pattern. */ + else if ((info->def_stmts[j] == info->def_stmts[j + 3]) + && (info->def_stmts[j + 1] == info->def_stmts[j + 2]) + && (info->def_stmts[j] != info->def_stmts[j + 1])) + cur_pattern = SLP_OPRND_PATTERN_ABBA; + /* Unrecognised pattern. */ + else + return SLP_OPRND_PATTERN_NONE; + + if (pattern == SLP_OPRND_PATTERN_NONE) + pattern = cur_pattern; + /* Multiple patterns detected. */ + else if (cur_pattern != pattern) + return SLP_OPRND_PATTERN_NONE; + } + + gcc_checking_assert (pattern != SLP_OPRND_PATTERN_NONE); + + if (dump_enabled_p ()) + { + if (pattern == SLP_OPRND_PATTERN_ABAB) + dump_printf (MSG_NOTE, "ABAB"); + else if (pattern == SLP_OPRND_PATTERN_AABB) + dump_printf (MSG_NOTE, "AABB"); + else if (pattern == SLP_OPRND_PATTERN_ABBA) + dump_printf (MSG_NOTE, "ABBA"); + dump_printf (MSG_NOTE, " pattern detected.\n"); + } + + if (pattern == SLP_OPRND_PATTERN_ABAB || pattern == SLP_OPRND_PATTERN_ABBA) + for (unsigned int j = 0; j < group_size; j += 4) + { + /* Given oprnd[0] -> A1, B1, A1, B1, A2, B2, A2, B2, ... + Given oprnd[1] -> C1, D1, C1, D1, C2, D2, C2, D2, ... + Create a single node -> A1, B1, C1, D1, A2, B2, C2, D2, ... */ + oprnds_info[0]->def_stmts[j+2] = oprnds_info[1]->def_stmts[j]; + oprnds_info[0]->ops[j+2] = oprnds_info[1]->ops[j]; + oprnds_info[0]->def_stmts[j+3] = oprnds_info[1]->def_stmts[j+1]; + oprnds_info[0]->ops[j+3] = oprnds_info[1]->ops[j+1]; + } + else if (pattern == SLP_OPRND_PATTERN_AABB) + for (unsigned int j = 0; j < group_size; j += 4) + { + /* Given oprnd[0] -> A1, A1, B1, B1, A2, A2, B2, B2, ... + Given oprnd[1] -> C1, C1, D1, D1, C2, C2, D2, D2, ... + Create a single node -> A1, C1, B1, D1, A2, C2, B2, D2, ... */ + + /* The ordering here is at least to some extent arbitrary. + A generilized version needs to use some explicit ordering. */ + oprnds_info[0]->def_stmts[j+1] = oprnds_info[1]->def_stmts[j]; + oprnds_info[0]->ops[j+1] = oprnds_info[1]->ops[j]; + oprnds_info[0]->def_stmts[j+2] = oprnds_info[0]->def_stmts[j+2]; + oprnds_info[0]->ops[j+2] = oprnds_info[0]->ops[j+2]; + oprnds_info[0]->def_stmts[j+3] = oprnds_info[1]->def_stmts[j+2]; + oprnds_info[0]->ops[j+3] = oprnds_info[1]->ops[j+2]; + } + + if (dump_enabled_p ()) + { + dump_printf (MSG_NOTE, "Recurse with:\n"); + for (unsigned int j = 0; j < group_size; j++) + { + dump_printf (MSG_NOTE, " "); + print_gimple_stmt (dump_file, oprnds_info[0]->def_stmts[j]->stmt, 0); + } + } + + /* Since we've merged the two nodes in one, make the second one a copy of + the first. */ + for (unsigned int j = 0; j < group_size; j++) + { + oprnds_info[1]->def_stmts[j] = oprnds_info[0]->def_stmts[j]; + oprnds_info[1]->ops[j] = oprnds_info[0]->ops[j]; + } + + return pattern; +} + /* Recursively build an SLP tree starting from NODE. Fail (and return a value not equal to zero) if def-stmts are not isomorphic, require data permutation or are of unsupported types of @@ -2409,6 +2545,10 @@ out: stmt_info = stmts[0]; + int rearrange_pattern = SLP_OPRND_PATTERN_NONE; + if (is_a<gassign *> (stmt_info->stmt)) + rearrange_pattern = try_rearrange_oprnd_info (oprnds_info, group_size); + /* Create SLP_TREE nodes for the definition node/s. */ FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info) { @@ -2669,6 +2809,100 @@ fail: *tree_size += this_tree_size + 1; *max_nunits = this_max_nunits; + /* If we applied any rearrangmenets then we need to reconstruct the original + elements with an additional permutation layer. */ + if (rearrange_pattern != SLP_OPRND_PATTERN_NONE) + { + slp_tree one = new _slp_tree; + slp_tree two = new _slp_tree; + SLP_TREE_DEF_TYPE (one) = vect_internal_def; + SLP_TREE_DEF_TYPE (two) = vect_internal_def; + SLP_TREE_VECTYPE (one) = vectype; + SLP_TREE_VECTYPE (two) = vectype; + SLP_TREE_CHILDREN (one).safe_splice (children); + SLP_TREE_CHILDREN (two).safe_splice (children); + + SLP_TREE_CODE (one) = VEC_PERM_EXPR; + SLP_TREE_CODE (two) = VEC_PERM_EXPR; + SLP_TREE_REPRESENTATIVE (one) = stmts[0]; + SLP_TREE_REPRESENTATIVE (two) = stmts[2]; + lane_permutation_t &perm_one = SLP_TREE_LANE_PERMUTATION (one); + lane_permutation_t &perm_two = SLP_TREE_LANE_PERMUTATION (two); + + if (rearrange_pattern == SLP_OPRND_PATTERN_ABAB) + { + /* Given a single node -> A1, B1, C1, D1, A2, B2, C2, D2, ... + Create node "one" -> A1, B1, A1, B1, A2, B2, A2, B2, ... + Create node "two" -> C1, D1, C1, D1, C2, D2, C2, D2, ... */ + + for (unsigned int j = 0; j < group_size; j += 4) + { + perm_one.safe_push (std::make_pair (0, j + 0)); + perm_one.safe_push (std::make_pair (0, j + 1)); + perm_one.safe_push (std::make_pair (0, j + 0)); + perm_one.safe_push (std::make_pair (0, j + 1)); + + perm_two.safe_push (std::make_pair (0, j + 2)); + perm_two.safe_push (std::make_pair (0, j + 3)); + perm_two.safe_push (std::make_pair (0, j + 2)); + perm_two.safe_push (std::make_pair (0, j + 3)); + } + } + else if (rearrange_pattern == SLP_OPRND_PATTERN_AABB) + { + /* Given a single node -> A1, C1, B1, D1, A2, C2, B2, D2, ... + Create node "one" -> A1, A1, B1, B1, A2, A2, B2, B2, ... + Create node "two" -> C1, C1, D1, D1, C2, C2, D2, D2, ... */ + + for (unsigned int j = 0; j < group_size; j += 4) + { + perm_one.safe_push (std::make_pair (0, j + 0)); + perm_one.safe_push (std::make_pair (0, j + 0)); + perm_one.safe_push (std::make_pair (0, j + 2)); + perm_one.safe_push (std::make_pair (0, j + 2)); + + perm_two.safe_push (std::make_pair (0, j + 1)); + perm_two.safe_push (std::make_pair (0, j + 1)); + perm_two.safe_push (std::make_pair (0, j + 3)); + perm_two.safe_push (std::make_pair (0, j + 3)); + } + } + else if (rearrange_pattern == SLP_OPRND_PATTERN_ABBA) + { + /* Given a single node -> A1, B1, C1, D1, A2, B2, C2, D2, ... + Create node "one" -> A1, B1, B1, A1, A2, B2, B2, A2, ... + Create node "two" -> C1, D1, D1, C1, C2, D2, D2, C2, ... */ + + for (unsigned int j = 0; j < group_size; j += 4) + { + perm_one.safe_push (std::make_pair (0, j + 0)); + perm_one.safe_push (std::make_pair (0, j + 1)); + perm_one.safe_push (std::make_pair (0, j + 1)); + perm_one.safe_push (std::make_pair (0, j + 0)); + + perm_two.safe_push (std::make_pair (0, j + 2)); + perm_two.safe_push (std::make_pair (0, j + 3)); + perm_two.safe_push (std::make_pair (0, j + 3)); + perm_two.safe_push (std::make_pair (0, j + 2)); + } + } + + slp_tree child; + FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child) + SLP_TREE_REF_COUNT (child)++; + + node = vect_create_new_slp_node (node, stmts, 2); + SLP_TREE_VECTYPE (node) = vectype; + SLP_TREE_CHILDREN (node).quick_push (one); + SLP_TREE_CHILDREN (node).quick_push (two); + + SLP_TREE_LANES (one) = stmts.length (); + SLP_TREE_LANES (two) = stmts.length (); + + children.truncate (0); + children.safe_splice (SLP_TREE_CHILDREN (node)); + } + if (two_operators) { /* ??? We'd likely want to either cache in bst_map sth like -- 2.44.0