On Tue, Aug 6, 2024 at 12:38 PM Manolis Tsamis <manolis.tsa...@vrull.eu> wrote:
>
> Pinging this for a review and/or further feedback.
>
> Thanks,
> Manolis
>
> On Wed, Jun 26, 2024 at 3:06 PM Manolis Tsamis <manolis.tsa...@vrull.eu> wrote:
> >
> > This change checks when a two_operators SLP node has multiple occurrences of
> > the same statement (e.g. {A, B, A, B, ...}) and tries to rearrange the
> > operands so that there are no duplicates. Two vec_perm expressions are then
> > introduced to recreate the original ordering. These duplicates can appear
> > due to how two_operators nodes are handled, and they prevent vectorization
> > in some cases.
> >
> > This targets the vectorization of the SPEC2017 x264 pixel_satd functions.
> > On some processors, an improvement of more than 10% on x264 has been observed.
> >
> > See also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98138

This patch is OK.

Sorry for the slow reply/review.

Thanks,
Richard.

> > gcc/ChangeLog:
> >
> >         * tree-vect-slp.cc: Avoid duplicates in two_operators nodes.
> >
> > gcc/testsuite/ChangeLog:
> >
> >         * gcc.target/aarch64/vect-slp-two-operator.c: New test.
> >
> > Signed-off-by: Manolis Tsamis <manolis.tsa...@vrull.eu>
> > ---
> >
> > Changes in v2:
> >         - Do not use predefined patterns; support rearrangement of arbitrary
> >         node orderings.
> >         - Only apply for two_operators nodes.
> >         - Recurse with single SLP operand instead of two duplicated ones.
> >         - Refactoring of code.
> >
> >  .../aarch64/vect-slp-two-operator.c           |  36 ++++++
> >  gcc/tree-vect-slp.cc                          | 114 ++++++++++++++++++
> >  2 files changed, 150 insertions(+)
> >  create mode 100644 gcc/testsuite/gcc.target/aarch64/vect-slp-two-operator.c
> >
> > diff --git a/gcc/testsuite/gcc.target/aarch64/vect-slp-two-operator.c 
> > b/gcc/testsuite/gcc.target/aarch64/vect-slp-two-operator.c
> > new file mode 100644
> > index 00000000000..b6b093ffc34
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/vect-slp-two-operator.c
> > @@ -0,0 +1,36 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect 
> > -fdump-tree-vect-details" } */
> > +
> > +typedef unsigned char uint8_t;
> > +typedef unsigned int uint32_t;
> > +
> > +#define HADAMARD4(d0, d1, d2, d3, s0, s1, s2, s3) {\
> > +    int t0 = s0 + s1;\
> > +    int t1 = s0 - s1;\
> > +    int t2 = s2 + s3;\
> > +    int t3 = s2 - s3;\
> > +    d0 = t0 + t2;\
> > +    d1 = t1 + t3;\
> > +    d2 = t0 - t2;\
> > +    d3 = t1 - t3;\
> > +}
> > +
> > +void sink(uint32_t tmp[4][4]);
> > +
> > +int x264_pixel_satd_8x4( uint8_t *pix1, int i_pix1, uint8_t *pix2, int 
> > i_pix2 )
> > +{
> > +    uint32_t tmp[4][4];
> > +    int sum = 0;
> > +    for( int i = 0; i < 4; i++, pix1 += i_pix1, pix2 += i_pix2 )
> > +    {
> > +        uint32_t a0 = (pix1[0] - pix2[0]) + ((pix1[4] - pix2[4]) << 16);
> > +        uint32_t a1 = (pix1[1] - pix2[1]) + ((pix1[5] - pix2[5]) << 16);
> > +        uint32_t a2 = (pix1[2] - pix2[2]) + ((pix1[6] - pix2[6]) << 16);
> > +        uint32_t a3 = (pix1[3] - pix2[3]) + ((pix1[7] - pix2[7]) << 16);
> > +        HADAMARD4( tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], a0,a1,a2,a3 
> > );
> > +    }
> > +    sink(tmp);
> > +}
> > +
> > +/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" } } */
> > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
> > diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
> > index b47b7e8c979..60d0d388dff 100644
> > --- a/gcc/tree-vect-slp.cc
> > +++ b/gcc/tree-vect-slp.cc
> > @@ -2420,6 +2420,95 @@ out:
> >        }
> >    swap = NULL;
> >
> > +  bool has_two_operators_perm = false;
> > +  auto_vec<unsigned> two_op_perm_indices[2];
> > +  vec<stmt_vec_info> two_op_scalar_stmts[2] = {vNULL, vNULL};
> > +
> > +  if (two_operators && oprnds_info.length () == 2 && group_size > 2)
> > +    {
> > +      unsigned idx = 0;
> > +      hash_map<gimple *, unsigned> seen;
> > +      vec<slp_oprnd_info> new_oprnds_info
> > +       = vect_create_oprnd_info (1, group_size);
> > +      bool success = true;
> > +
> > +      enum tree_code code = ERROR_MARK;
> > +      if (oprnds_info[0]->def_stmts[0]
> > +         && is_a<gassign *> (oprnds_info[0]->def_stmts[0]->stmt))
> > +       code = gimple_assign_rhs_code (oprnds_info[0]->def_stmts[0]->stmt);
> > +
> > +      for (unsigned j = 0; j < group_size; ++j)
> > +       {
> > +         FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
> > +           {
> > +             stmt_vec_info stmt_info = oprnd_info->def_stmts[j];
> > +             if (!stmt_info || !stmt_info->stmt
> > +                 || !is_a<gassign *> (stmt_info->stmt)
> > +                 || gimple_assign_rhs_code (stmt_info->stmt) != code
> > +                 || skip_args[i])
> > +               {
> > +                 success = false;
> > +                 break;
> > +               }
> > +
> > +             bool exists;
> > +             unsigned &stmt_idx
> > +               = seen.get_or_insert (stmt_info->stmt, &exists);
> > +
> > +             if (!exists)
> > +               {
> > +                 new_oprnds_info[0]->def_stmts.safe_push (stmt_info);
> > +                 new_oprnds_info[0]->ops.safe_push (oprnd_info->ops[j]);
> > +                 stmt_idx = idx;
> > +                 idx++;
> > +               }
> > +
> > +             two_op_perm_indices[i].safe_push (stmt_idx);
> > +           }
> > +
> > +         if (!success)
> > +           break;
> > +       }
> > +
> > +      if (success && idx == group_size)
> > +       {
> > +         if (dump_enabled_p ())
> > +           {
> > +             dump_printf_loc (MSG_NOTE, vect_location,
> > +                              "Replace two_operators operands:\n");
> > +
> > +             FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
> > +               {
> > +                 dump_printf_loc (MSG_NOTE, vect_location,
> > +                                  "Operand %u:\n", i);
> > +                 for (unsigned j = 0; j < group_size; j++)
> > +                   dump_printf_loc (MSG_NOTE, vect_location, "\tstmt %u 
> > %G",
> > +                                    j, oprnd_info->def_stmts[j]->stmt);
> > +               }
> > +
> > +             dump_printf_loc (MSG_NOTE, vect_location,
> > +                              "With a single operand:\n");
> > +             for (unsigned j = 0; j < group_size; j++)
> > +               dump_printf_loc (MSG_NOTE, vect_location, "\tstmt %u %G",
> > +                                j, new_oprnds_info[0]->def_stmts[j]->stmt);
> > +           }
> > +
> > +         two_op_scalar_stmts[0].safe_splice (oprnds_info[0]->def_stmts);
> > +         two_op_scalar_stmts[1].safe_splice (oprnds_info[1]->def_stmts);
> > +
> > +         new_oprnds_info[0]->first_op_type = oprnds_info[0]->first_op_type;
> > +         new_oprnds_info[0]->first_dt = oprnds_info[0]->first_dt;
> > +         new_oprnds_info[0]->any_pattern = oprnds_info[0]->any_pattern;
> > +         new_oprnds_info[0]->first_gs_p = oprnds_info[0]->first_gs_p;
> > +         new_oprnds_info[0]->first_gs_info = oprnds_info[0]->first_gs_info;
> > +
> > +         vect_free_oprnd_info (oprnds_info);
> > +         oprnds_info = new_oprnds_info;
> > +         nops = 1;
> > +         has_two_operators_perm = true;
> > +       }
> > +    }
> > +
> >    auto_vec<slp_tree, 4> children;
> >
> >    stmt_info = stmts[0];
> > @@ -2691,6 +2780,29 @@ fail:
> >          the true { a+b, a+b, a+b, a+b } ... but there we don't have
> >          explicit stmts to put in so the keying on 'stmts' doesn't
> >          work (but we have the same issue with nodes that use 'ops').  */
> > +
> > +      if (has_two_operators_perm)
> > +       {
> > +         slp_tree child = children[0];
> > +         children.truncate (0);
> > +         for (i = 0; i < 2; i++)
> > +           {
> > +             slp_tree pnode
> > +               = vect_create_new_slp_node (two_op_scalar_stmts[i], 2);
> > +             SLP_TREE_CODE (pnode) = VEC_PERM_EXPR;
> > +             SLP_TREE_VECTYPE (pnode) = vectype;
> > +             SLP_TREE_CHILDREN (pnode).quick_push (child);
> > +             SLP_TREE_CHILDREN (pnode).quick_push (child);
> > +             lane_permutation_t& perm = SLP_TREE_LANE_PERMUTATION (pnode);
> > +             children.safe_push (pnode);
> > +
> > +             for (unsigned j = 0; j < stmts.length (); j++)
> > +               perm.safe_push (std::make_pair (0, 
> > two_op_perm_indices[i][j]));
> > +           }
> > +
> > +         SLP_TREE_REF_COUNT (child) += 4;
> > +       }
> > +
> >        slp_tree one = new _slp_tree;
> >        slp_tree two = new _slp_tree;
> >        SLP_TREE_DEF_TYPE (one) = vect_internal_def;
> > @@ -2727,12 +2839,14 @@ fail:
> >           else
> >             SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, 
> > i));
> >         }
> > +
> >        SLP_TREE_CODE (one) = code0;
> >        SLP_TREE_CODE (two) = ocode;
> >        SLP_TREE_LANES (one) = stmts.length ();
> >        SLP_TREE_LANES (two) = stmts.length ();
> >        SLP_TREE_REPRESENTATIVE (one) = stmts[0];
> >        SLP_TREE_REPRESENTATIVE (two) = stmts[j];
> > +
> >        return node;
> >      }
> >
> > --
> > 2.44.0
> >

Reply via email to