On Tue, Aug 6, 2024 at 12:38 PM Manolis Tsamis <manolis.tsa...@vrull.eu> wrote: > > Pinging this for a review and/or further feedback. > > Thanks, > Manolis > > On Wed, Jun 26, 2024 at 3:06 PM Manolis Tsamis <manolis.tsa...@vrull.eu> > wrote: > > > > This change checks when a two_operators SLP node has multiple occurrences of > > the same statement (e.g. {A, B, A, B, ...}) and tries to rearrange the > > operands > > so that there are no duplicates. Two vec_perm expressions are then > > introduced > > to recreate the original ordering. These duplicates can appear due to how > > two_operators nodes are handled, and they prevent vectorization in some > > cases. > > > > This targets the vectorization of the SPEC2017 x264 pixel_satd functions. > > In some processors a larger than 10% improvement on x264 has been observed. > > > > See also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98138
This patch is OK. Sorry for the slow reply/review. Thanks, Richard. > > gcc/ChangeLog: > > > > * tree-vect-slp.cc: Avoid duplicates in two_operators nodes. > > > > gcc/testsuite/ChangeLog: > > > > * gcc.target/aarch64/vect-slp-two-operator.c: New test. > > > > Signed-off-by: Manolis Tsamis <manolis.tsa...@vrull.eu> > > --- > > > > Changes in v2: > > - Do not use predefined patterns; support rearrangement of arbitrary > > node orderings. > > - Only apply for two_operators nodes. > > - Recurse with single SLP operand instead of two duplicated ones. > > - Refactoring of code. > > > > .../aarch64/vect-slp-two-operator.c | 36 ++++++ > > gcc/tree-vect-slp.cc | 114 ++++++++++++++++++ > > 2 files changed, 150 insertions(+) > > create mode 100644 gcc/testsuite/gcc.target/aarch64/vect-slp-two-operator.c > > > > diff --git a/gcc/testsuite/gcc.target/aarch64/vect-slp-two-operator.c > > b/gcc/testsuite/gcc.target/aarch64/vect-slp-two-operator.c > > new file mode 100644 > > index 00000000000..b6b093ffc34 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/aarch64/vect-slp-two-operator.c > > @@ -0,0 +1,36 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect > > -fdump-tree-vect-details" } */ > > + > > +typedef unsigned char uint8_t; > > +typedef unsigned int uint32_t; > > + > > +#define HADAMARD4(d0, d1, d2, d3, s0, s1, s2, s3) {\ > > + int t0 = s0 + s1;\ > > + int t1 = s0 - s1;\ > > + int t2 = s2 + s3;\ > > + int t3 = s2 - s3;\ > > + d0 = t0 + t2;\ > > + d1 = t1 + t3;\ > > + d2 = t0 - t2;\ > > + d3 = t1 - t3;\ > > +} > > + > > +void sink(uint32_t tmp[4][4]); > > + > > +int x264_pixel_satd_8x4( uint8_t *pix1, int i_pix1, uint8_t *pix2, int > > i_pix2 ) > > +{ > > + uint32_t tmp[4][4]; > > + int sum = 0; > > + for( int i = 0; i < 4; i++, pix1 += i_pix1, pix2 += i_pix2 ) > > + { > > + uint32_t a0 = (pix1[0] - pix2[0]) + ((pix1[4] - pix2[4]) << 16); > > + uint32_t a1 = (pix1[1] - pix2[1]) + ((pix1[5] - pix2[5]) << 16); > > + uint32_t a2 = (pix1[2] - pix2[2]) + ((pix1[6] - pix2[6]) << 16); > > + uint32_t a3 = (pix1[3] - pix2[3]) + ((pix1[7] - pix2[7]) << 16); > > + HADAMARD4( tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], a0,a1,a2,a3 > > ); > > + } > > + sink(tmp); > > +} > > + > > +/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" } } */ > > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ > > diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc > > index b47b7e8c979..60d0d388dff 100644 > > --- a/gcc/tree-vect-slp.cc > > +++ b/gcc/tree-vect-slp.cc > > @@ -2420,6 +2420,95 @@ out: > > } > > swap = NULL; > > > > + bool has_two_operators_perm = false; > > + auto_vec<unsigned> two_op_perm_indices[2]; > > + vec<stmt_vec_info> two_op_scalar_stmts[2] = {vNULL, vNULL}; > > + > > + if (two_operators && oprnds_info.length () == 2 && group_size > 2) > > + { > > + unsigned idx = 0; > > + hash_map<gimple *, unsigned> seen; > > + vec<slp_oprnd_info> new_oprnds_info > > + = vect_create_oprnd_info (1, group_size); > > + bool success = true; > > + > > + enum tree_code code = ERROR_MARK; > > + if (oprnds_info[0]->def_stmts[0] > > + && is_a<gassign *> (oprnds_info[0]->def_stmts[0]->stmt)) > > + code = gimple_assign_rhs_code (oprnds_info[0]->def_stmts[0]->stmt); > > + > > + for (unsigned j = 0; j < group_size; ++j) > > + { > > + FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info) > > + { > > + stmt_vec_info stmt_info = oprnd_info->def_stmts[j]; > > + if (!stmt_info || !stmt_info->stmt > > + || !is_a<gassign *> (stmt_info->stmt) > > + || gimple_assign_rhs_code (stmt_info->stmt) != code > > + || skip_args[i]) > > + { > > + success = false; > > + break; > > + } > > + > > + bool exists; > > + unsigned &stmt_idx > > + = seen.get_or_insert (stmt_info->stmt, &exists); > > + > > + if (!exists) > > + { > > + new_oprnds_info[0]->def_stmts.safe_push (stmt_info); > > + new_oprnds_info[0]->ops.safe_push (oprnd_info->ops[j]); > > + stmt_idx = idx; > > + idx++; > > + } > > + > > + two_op_perm_indices[i].safe_push (stmt_idx); > > + } > > + > > + if (!success) > > + break; > > + } > > + > > + if (success && idx == group_size) > > + { > > + if (dump_enabled_p ()) > > + { > > + dump_printf_loc (MSG_NOTE, vect_location, > > + "Replace two_operators operands:\n"); > > + > > + FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info) > > + { > > + dump_printf_loc (MSG_NOTE, vect_location, > > + "Operand %u:\n", i); > > + for (unsigned j = 0; j < group_size; j++) > > + dump_printf_loc (MSG_NOTE, vect_location, "\tstmt %u > > %G", > > + j, oprnd_info->def_stmts[j]->stmt); > > + } > > + > > + dump_printf_loc (MSG_NOTE, vect_location, > > + "With a single operand:\n"); > > + for (unsigned j = 0; j < group_size; j++) > > + dump_printf_loc (MSG_NOTE, vect_location, "\tstmt %u %G", > > + j, new_oprnds_info[0]->def_stmts[j]->stmt); > > + } > > + > > + two_op_scalar_stmts[0].safe_splice (oprnds_info[0]->def_stmts); > > + two_op_scalar_stmts[1].safe_splice (oprnds_info[1]->def_stmts); > > + > > + new_oprnds_info[0]->first_op_type = oprnds_info[0]->first_op_type; > > + new_oprnds_info[0]->first_dt = oprnds_info[0]->first_dt; > > + new_oprnds_info[0]->any_pattern = oprnds_info[0]->any_pattern; > > + new_oprnds_info[0]->first_gs_p = oprnds_info[0]->first_gs_p; > > + new_oprnds_info[0]->first_gs_info = oprnds_info[0]->first_gs_info; > > + > > + vect_free_oprnd_info (oprnds_info); > > + oprnds_info = new_oprnds_info; > > + nops = 1; > > + has_two_operators_perm = true; > > + } > > + } > > + > > auto_vec<slp_tree, 4> children; > > > > stmt_info = stmts[0]; > > @@ -2691,6 +2780,29 @@ fail: > > the true { a+b, a+b, a+b, a+b } ... but there we don't have > > explicit stmts to put in so the keying on 'stmts' doesn't > > work (but we have the same issue with nodes that use 'ops'). */ > > + > > + if (has_two_operators_perm) > > + { > > + slp_tree child = children[0]; > > + children.truncate (0); > > + for (i = 0; i < 2; i++) > > + { > > + slp_tree pnode > > + = vect_create_new_slp_node (two_op_scalar_stmts[i], 2); > > + SLP_TREE_CODE (pnode) = VEC_PERM_EXPR; > > + SLP_TREE_VECTYPE (pnode) = vectype; > > + SLP_TREE_CHILDREN (pnode).quick_push (child); > > + SLP_TREE_CHILDREN (pnode).quick_push (child); > > + lane_permutation_t& perm = SLP_TREE_LANE_PERMUTATION (pnode); > > + children.safe_push (pnode); > > + > > + for (unsigned j = 0; j < stmts.length (); j++) > > + perm.safe_push (std::make_pair (0, > > two_op_perm_indices[i][j])); > > + } > > + > > + SLP_TREE_REF_COUNT (child) += 4; > > + } > > + > > slp_tree one = new _slp_tree; > > slp_tree two = new _slp_tree; > > SLP_TREE_DEF_TYPE (one) = vect_internal_def; > > @@ -2727,12 +2839,14 @@ fail: > > else > > SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, > > i)); > > } > > + > > SLP_TREE_CODE (one) = code0; > > SLP_TREE_CODE (two) = ocode; > > SLP_TREE_LANES (one) = stmts.length (); > > SLP_TREE_LANES (two) = stmts.length (); > > SLP_TREE_REPRESENTATIVE (one) = stmts[0]; > > SLP_TREE_REPRESENTATIVE (two) = stmts[j]; > > + > > return node; > > } > > > > -- > > 2.44.0 > >