On Wed, Jun 24, 2020 at 1:31 AM Ilya Leoshkevich via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> Bootstrapped and regtested x86_64-redhat-linux, ppc64le-redhat-linux and
> s390x-redhat-linux.  I also ran SPEC 2006 and 2017 on these platforms,
> and the only measurable regression was 3% in 520.omnetpp_r on ppc, which
> went away after inserting a single nop at the beginning of
> cDynamicExpression::evaluate.
>
> OK for master?

As you might know, this is incredibly fragile, so I'd prefer if you
submitted and pushed the change to disable PHI biasing in the early pass
instance separately.  At first glance that change looks reasonable (but
we'll watch for fallout).

Comments on the other changes inline

>
> ---
>
> PR tree-optimization/49749 introduced code that shortens dependency
> chains containing loop accumulators by placing them last on operand
> lists of associative operations.
>
> 456.hmmer benchmark on s390 could benefit from this, however, the code
> that needs it modifies the loop accumulator before using it, and since
> only so-called loop-carried phis are treated as loop accumulators, the
> code in its present form doesn't really help.  According to Bill
> Schmidt - the original author - such a conservative approach was chosen
> so as to avoid unnecessarily swapping operands, which might cause
> unpredictable effects.  However, giving special treatment to such
> modified forms of loop accumulators is acceptable.
>
> The definition of a loop-carried phi is: a single-use phi whose use is
> in the same innermost loop it is defined in, and at least one of whose
> arguments is defined in that same innermost loop.  Given this, it seems
> natural to treat single uses of such phis as phis themselves.
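>
> For illustration, this is essentially the shape of the new reassoc.h
> testcase below:
>
>   unsigned int sum = 0;
>   for (int i = 0; i < M; i++)
>     {
>       /* The accumulator is modified before it is used; the result of
>          the |= is not a phi, so its rank used to lose the bias and the
>          dependency chain of the additions was not shortened.  */
>       sum |= 1;
>       sum += arr1[i];
>       sum += arr2[i];
>     }
>   return sum;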
>
> gcc/ChangeLog:
>
> 2020-05-06  Ilya Leoshkevich  <i...@linux.ibm.com>
>
>         * passes.def (pass_reassoc): Rename parameter to early_p.
>         * tree-ssa-reassoc.c (reassoc_bias_loop_carried_phi_ranks_p):
>         New variable.
>         (phi_rank): Don't bias loop-carried phi ranks
>         before vectorization pass.
>         (loop_carried_phi): Remove (superseded by
>         operand_rank::biased_p).
>         (propagate_rank): Propagate bias along single uses.
>         (get_rank): Pass stmt to propagate_rank.
>         (execute_reassoc): Add bias_loop_carried_phi_ranks_p parameter.
>         (pass_reassoc::pass_reassoc): Add bias_loop_carried_phi_ranks_p
>         initializer.
>         (pass_reassoc::set_param): Set bias_loop_carried_phi_ranks_p
>         value.
>         (pass_reassoc::execute): Pass bias_loop_carried_phi_ranks_p to
>         execute_reassoc.
>         (pass_reassoc::bias_loop_carried_phi_ranks_p): New member.
>
> gcc/testsuite/ChangeLog:
>
> 2020-05-06  Ilya Leoshkevich  <i...@linux.ibm.com>
>
>         * gcc.target/s390/reassoc-1.c: New test.
>         * gcc.target/s390/reassoc-2.c: New test.
>         * gcc.target/s390/reassoc-3.c: New test.
>         * gcc.target/s390/reassoc.h: New test.
> ---
>  gcc/passes.def                            |  4 +-
>  gcc/testsuite/gcc.target/s390/reassoc-1.c |  6 ++
>  gcc/testsuite/gcc.target/s390/reassoc-2.c |  7 ++
>  gcc/testsuite/gcc.target/s390/reassoc-3.c |  8 ++
>  gcc/testsuite/gcc.target/s390/reassoc.h   | 22 +++++
>  gcc/tree-ssa-reassoc.c                    | 97 ++++++++++++++---------
>  6 files changed, 105 insertions(+), 39 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/s390/reassoc-1.c
>  create mode 100644 gcc/testsuite/gcc.target/s390/reassoc-2.c
>  create mode 100644 gcc/testsuite/gcc.target/s390/reassoc-3.c
>  create mode 100644 gcc/testsuite/gcc.target/s390/reassoc.h
>
> diff --git a/gcc/passes.def b/gcc/passes.def
> index 2b1e09fdda3..6864f583f20 100644
> --- a/gcc/passes.def
> +++ b/gcc/passes.def
> @@ -235,7 +235,7 @@ along with GCC; see the file COPYING3.  If not see
>          program and isolate those paths.  */
>        NEXT_PASS (pass_isolate_erroneous_paths);
>        NEXT_PASS (pass_dse);
> -      NEXT_PASS (pass_reassoc, true /* insert_powi_p */);
> +      NEXT_PASS (pass_reassoc, true /* early_p */);
>        NEXT_PASS (pass_dce);
>        NEXT_PASS (pass_forwprop);
>        NEXT_PASS (pass_phiopt, false /* early_p */);
> @@ -312,7 +312,7 @@ along with GCC; see the file COPYING3.  If not see
>        NEXT_PASS (pass_lower_vector_ssa);
>        NEXT_PASS (pass_lower_switch);
>        NEXT_PASS (pass_cse_reciprocals);
> -      NEXT_PASS (pass_reassoc, false /* insert_powi_p */);
> +      NEXT_PASS (pass_reassoc, false /* early_p */);
>        NEXT_PASS (pass_strength_reduction);
>        NEXT_PASS (pass_split_paths);
>        NEXT_PASS (pass_tracer);
> diff --git a/gcc/testsuite/gcc.target/s390/reassoc-1.c b/gcc/testsuite/gcc.target/s390/reassoc-1.c
> new file mode 100644
> index 00000000000..8343f1cd4b7
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/s390/reassoc-1.c
> @@ -0,0 +1,6 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +
> +#include "reassoc.h"
> +
> +/* { dg-final { scan-assembler {(?n)\n\tl\t(%r\d+),.+(\n.*)*\n\ta\t(\1),.+(\n.*)*\n\tar\t(%r\d+),(\1)} } } */
> diff --git a/gcc/testsuite/gcc.target/s390/reassoc-2.c b/gcc/testsuite/gcc.target/s390/reassoc-2.c
> new file mode 100644
> index 00000000000..5e393ed4937
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/s390/reassoc-2.c
> @@ -0,0 +1,7 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +
> +#define MODIFY
> +#include "reassoc.h"
> +
> +/* { dg-final { scan-assembler {(?n)\n\tl\t(%r\d+),.+(\n.*)*\n\ta\t(\1),.+(\n.*)*\n\tar\t(%r\d+),(\1)} } } */
> diff --git a/gcc/testsuite/gcc.target/s390/reassoc-3.c b/gcc/testsuite/gcc.target/s390/reassoc-3.c
> new file mode 100644
> index 00000000000..ccbb627be7a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/s390/reassoc-3.c
> @@ -0,0 +1,8 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +
> +#define MODIFY
> +#define STORE_MODIFIED
> +#include "reassoc.h"
> +
> +/* { dg-final { scan-assembler {(?n)\n\tl\t(%r\d+),.+(\n.*)*\n\ta\t(\1),.+(\n.*)*\n\tar\t(%r\d+),(\1)} } } */

So these are all s390 target-specific testcases.  Can you please include
a generic testcase for gcc.dg/tree-ssa/?  From the above testcases I
cannot see what is supposed to be tested / improved - a comment would
also help there, like
"make sure we are doing XYZ and not ZXX"

> diff --git a/gcc/testsuite/gcc.target/s390/reassoc.h b/gcc/testsuite/gcc.target/s390/reassoc.h
> new file mode 100644
> index 00000000000..b0e2b41202a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/s390/reassoc.h
> @@ -0,0 +1,22 @@
> +#define M 1024
> +unsigned int arr1[M];
> +unsigned int arr2[M];
> +volatile unsigned int sink;
> +
> +unsigned int
> +test ()
> +{
> +  unsigned int sum = 0;
> +  for (int i = 0; i < M; i++)
> +    {
> +#ifdef MODIFY
> +      sum |= 1;
> +#ifdef STORE_MODIFIED
> +      sink = sum;
> +#endif
> +#endif
> +      sum += arr1[i];
> +      sum += arr2[i];
> +    }
> +  return sum;
> +}
> diff --git a/gcc/tree-ssa-reassoc.c b/gcc/tree-ssa-reassoc.c
> index 2e67987f6c6..67be5312f42 100644
> --- a/gcc/tree-ssa-reassoc.c
> +++ b/gcc/tree-ssa-reassoc.c
> @@ -177,6 +177,10 @@ along with GCC; see the file COPYING3.  If not see
>     point 3a in the pass header comment.  */
>  static bool reassoc_insert_powi_p;
>
> +/* Enable biasing ranks of loop accumulators.  We don't want this before
> +   vectorization, since it interferes with reduction chains.  */
> +static bool reassoc_bias_loop_carried_phi_ranks_p;
> +
>  /* Statistics */
>  static struct
>  {
> @@ -275,6 +279,9 @@ phi_rank (gimple *stmt)
>    use_operand_p use;
>    gimple *use_stmt;
>
> +  if (!reassoc_bias_loop_carried_phi_ranks_p)
> +    return bb_rank[bb->index];
> +
>    /* We only care about real loops (those with a latch).  */
>    if (!father->latch)
>      return bb_rank[bb->index];
> @@ -312,48 +319,59 @@ phi_rank (gimple *stmt)
>    return bb_rank[bb->index];
>  }
>
> -/* If EXP is an SSA_NAME defined by a PHI statement that represents a
> -   loop-carried dependence of an innermost loop, return TRUE; else
> -   return FALSE.  */
> -static bool
> -loop_carried_phi (tree exp)
> -{
> -  gimple *phi_stmt;
> -  long block_rank;
> -
> -  if (TREE_CODE (exp) != SSA_NAME
> -      || SSA_NAME_IS_DEFAULT_DEF (exp))
> -    return false;
> -
> -  phi_stmt = SSA_NAME_DEF_STMT (exp);
> -
> -  if (gimple_code (SSA_NAME_DEF_STMT (exp)) != GIMPLE_PHI)
> -    return false;
> -
> -  /* Non-loop-carried phis have block rank.  Loop-carried phis have
> -     an additional bias added in.  If this phi doesn't have block rank,
> -     it's biased and should not be propagated.  */
> -  block_rank = bb_rank[gimple_bb (phi_stmt)->index];
> -
> -  if (phi_rank (phi_stmt) != block_rank)
> -    return true;
> -
> -  return false;
> -}
> -
>  /* Return the maximum of RANK and the rank that should be propagated
>     from expression OP.  For most operands, this is just the rank of OP.
> -   For loop-carried phis, the value is zero to avoid undoing the bias
> +   For loop-carried phis, the value is block rank to avoid undoing the bias
>     in favor of the phi.  */
>  static long
> -propagate_rank (long rank, tree op)
> +propagate_rank (long rank, tree op, gimple *stmt)
>  {
>    long op_rank;
>
> -  if (loop_carried_phi (op))
> -    return rank;
> -
>    op_rank = get_rank (op);
> +  if (op_rank & PHI_LOOP_BIAS)

This looks like a fragile test.  Factoring out a bias_phi_p ()
predicate to be used both here and in phi_rank would surely be
better.  Here you can check whether op is defined by a PHI and
pass that to the predicate.
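
Something like the following maybe - an untested sketch along the lines
of what loop_carried_phi used to do, the name and exact shape are of
course up to you:

  /* Return true if PHI receives the loop-carried bias from phi_rank.
     Non-biased PHIs have plain block rank.  */
  static bool
  bias_phi_p (gimple *phi)
  {
    return phi_rank (phi) != bb_rank[gimple_bb (phi)->index];
  }

and then in propagate_rank something like

  if (TREE_CODE (op) == SSA_NAME
      && !SSA_NAME_IS_DEFAULT_DEF (op)
      && gimple_code (SSA_NAME_DEF_STMT (op)) == GIMPLE_PHI
      && bias_phi_p (SSA_NAME_DEF_STMT (op)))
    {
      ...
    }

That keeps the "is this rank biased" decision in one place instead of
encoding it in a bit test on the rank value.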

> +    {
> +      /* Treat loop-carried phis and elements of innermost-loop-internal
> +        single-use chains originating from them identically: propagate ranks
> +        (which are always biased) only along innermost-loop-internal
> +        single-use chains.  Treat only GIMPLE_ASSIGNs to SSA_NAMEs as uses,
> +        because only these participate in rank propagation.  This way in the
> +        following code:
> +
> +          x_1 = phi(x_0, x_2)
> +          q = x_1 & 1
> +          .MEM = q
> +          b = a + q
> +          c = b + d
> +          x_2 = c + e
> +
> +        the rank of q, which is essentially a form of loop accumulator, will
> +        be biased.  Ranks of b and c will also be biased, which seems
> +        unnecessary at first glance, however, this will have no consequences,
> +        since b and c are intermediate results, and their ranks will be
> +        discarded when creating + operand lists.  */
> +      use_operand_p use;
> +      imm_use_iterator use_iter;
> +      gimple *use_stmt = NULL;
> +      FOR_EACH_IMM_USE_FAST (use, use_iter, gimple_assign_lhs (stmt))
> +      if (is_gimple_assign (USE_STMT (use))
> +         && TREE_CODE (gimple_assign_lhs (USE_STMT (use))) == SSA_NAME)
> +       {
> +         if (use_stmt == NULL)
> +           {
> +             use_stmt = USE_STMT (use);
> +           }
> +         else
> +           {
> +             use_stmt = NULL;
> +             break;
> +           }
> +       }
> +      if (use_stmt == NULL
> +         || gimple_bb (stmt)->loop_father
> +                != gimple_bb (use_stmt)->loop_father)
> +       return bb_rank[gimple_bb (stmt)->index];

It's far from obvious what this does.  If we end up here we're
returning the un-biased PHI rank.  This happens when the def of the
stmt the PHI def is used in does not have exactly one use that is an
assignment to an SSA name (i.e. not a store), or when the stmt with
the PHI use is in a different loop than that single use stmt.  Other
uses are disregarded.

That means that biased PHI ranks only propagate along certain SSA
edges.  Or rather no, they do propagate along certain SSA edges, but
only if further conditions depending on unrelated SSA edges are met.

That looks far too iffy for rank propagation along SSA edges.

That said, it looks like a mess that isn't easy to understand and will
be quite fragile because of those "other" SSA edges.

Since I do not actually understand the issue you are trying to fix,
it's hard to recommend something else :/

Sorry.
Richard.

> +    }
>
>    return MAX (rank, op_rank);
>  }
> @@ -448,7 +466,7 @@ get_rank (tree e)
>          thus have rank 0.  */
>        rank = 0;
>        FOR_EACH_SSA_TREE_OPERAND (op, stmt, iter, SSA_OP_USE)
> -       rank = propagate_rank (rank, op);
> +       rank = propagate_rank (rank, op, stmt);
>
>        if (dump_file && (dump_flags & TDF_DETAILS))
>         {
> @@ -6662,9 +6680,10 @@ fini_reassoc (void)
>     optimization of a gimple conditional.  Otherwise returns zero.  */
>
>  static unsigned int
> -execute_reassoc (bool insert_powi_p)
> +execute_reassoc (bool insert_powi_p, bool bias_loop_carried_phi_ranks_p)
>  {
>    reassoc_insert_powi_p = insert_powi_p;
> +  reassoc_bias_loop_carried_phi_ranks_p = bias_loop_carried_phi_ranks_p;
>
>    init_reassoc ();
>
> @@ -6705,15 +6724,19 @@ public:
>      {
>        gcc_assert (n == 0);
>        insert_powi_p = param;
> +      bias_loop_carried_phi_ranks_p = !param;
>      }
>    virtual bool gate (function *) { return flag_tree_reassoc != 0; }
>    virtual unsigned int execute (function *)
> -    { return execute_reassoc (insert_powi_p); }
> +  {
> +    return execute_reassoc (insert_powi_p, bias_loop_carried_phi_ranks_p);
> +  }
>
>   private:
>    /* Enable insertion of __builtin_powi calls during execute_reassoc.  See
>       point 3a in the pass header comment.  */
>    bool insert_powi_p;
> +  bool bias_loop_carried_phi_ranks_p;
>  }; // class pass_reassoc
>
>  } // anon namespace
> --
> 2.25.4
>
