Bootstrapped and regtested x86_64-redhat-linux, ppc64le-redhat-linux and s390x-redhat-linux. I also ran SPEC 2006 and 2017 on these platforms, and the only measurable regression was 3% in 520.omnetpp_r on ppc, which went away after inserting a single nop at the beginning of cDynamicExpression::evaluate.
OK for master? --- PR tree-optimization/49749 introduced code that shortens dependency chains containing loop accumulators by placing them last on operand lists of associative operations. 456.hmmer benchmark on s390 could benefit from this, however, the code that needs it modifies loop accumulator before using it, and since only so-called loop-carried phis are are treated as loop accumulators, the code in the present form doesn't really help. According to Bill Schmidt - the original author - such a conservative approach was chosen so as to avoid unnecessarily swapping operands, which might cause unpredictable effects. However, giving special treatment to forms of loop accumulators is acceptable. The definition of loop-carried phi is: it's a single-use phi, which is used in the same innermost loop it's defined in, at least one argument of which is defined in the same innermost loop as the phi itself. Given this, it seems natural to treat single uses of such phis as phis themselves. gcc/ChangeLog: 2020-05-06 Ilya Leoshkevich <i...@linux.ibm.com> * passes.def (pass_reassoc): Rename parameter to early_p. * tree-ssa-reassoc.c (reassoc_bias_loop_carried_phi_ranks_p): New variable. (phi_rank): Don't bias loop-carried phi ranks before vectorization pass. (loop_carried_phi): Remove (superseded by operand_rank::biased_p). (propagate_rank): Propagate bias along single uses. (get_rank): Pass stmt to propagate_rank. (execute_reassoc): Add bias_loop_carried_phi_ranks_p parameter. (pass_reassoc::pass_reassoc): Add bias_loop_carried_phi_ranks_p initializer. (pass_reassoc::set_param): Set bias_loop_carried_phi_ranks_p value. (pass_reassoc::execute): Pass bias_loop_carried_phi_ranks_p to execute_reassoc. (pass_reassoc::bias_loop_carried_phi_ranks_p): New member. gcc/testsuite/ChangeLog: 2020-05-06 Ilya Leoshkevich <i...@linux.ibm.com> * gcc.target/s390/reassoc-1.c: New test. * gcc.target/s390/reassoc-2.c: New test. * gcc.target/s390/reassoc-3.c: New test. * gcc.target/s390/reassoc.h: New test. --- gcc/passes.def | 4 +- gcc/testsuite/gcc.target/s390/reassoc-1.c | 6 ++ gcc/testsuite/gcc.target/s390/reassoc-2.c | 7 ++ gcc/testsuite/gcc.target/s390/reassoc-3.c | 8 ++ gcc/testsuite/gcc.target/s390/reassoc.h | 22 +++++ gcc/tree-ssa-reassoc.c | 97 ++++++++++++++--------- 6 files changed, 105 insertions(+), 39 deletions(-) create mode 100644 gcc/testsuite/gcc.target/s390/reassoc-1.c create mode 100644 gcc/testsuite/gcc.target/s390/reassoc-2.c create mode 100644 gcc/testsuite/gcc.target/s390/reassoc-3.c create mode 100644 gcc/testsuite/gcc.target/s390/reassoc.h diff --git a/gcc/passes.def b/gcc/passes.def index 2b1e09fdda3..6864f583f20 100644 --- a/gcc/passes.def +++ b/gcc/passes.def @@ -235,7 +235,7 @@ along with GCC; see the file COPYING3. If not see program and isolate those paths. */ NEXT_PASS (pass_isolate_erroneous_paths); NEXT_PASS (pass_dse); - NEXT_PASS (pass_reassoc, true /* insert_powi_p */); + NEXT_PASS (pass_reassoc, true /* early_p */); NEXT_PASS (pass_dce); NEXT_PASS (pass_forwprop); NEXT_PASS (pass_phiopt, false /* early_p */); @@ -312,7 +312,7 @@ along with GCC; see the file COPYING3. If not see NEXT_PASS (pass_lower_vector_ssa); NEXT_PASS (pass_lower_switch); NEXT_PASS (pass_cse_reciprocals); - NEXT_PASS (pass_reassoc, false /* insert_powi_p */); + NEXT_PASS (pass_reassoc, false /* early_p */); NEXT_PASS (pass_strength_reduction); NEXT_PASS (pass_split_paths); NEXT_PASS (pass_tracer); diff --git a/gcc/testsuite/gcc.target/s390/reassoc-1.c b/gcc/testsuite/gcc.target/s390/reassoc-1.c new file mode 100644 index 00000000000..8343f1cd4b7 --- /dev/null +++ b/gcc/testsuite/gcc.target/s390/reassoc-1.c @@ -0,0 +1,6 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ + +#include "reassoc.h" + +/* { dg-final { scan-assembler {(?n)\n\tl\t(%r\d+),.+(\n.*)*\n\ta\t(\1),.+(\n.*)*\n\tar\t(%r\d+),(\1)} } } */ diff --git a/gcc/testsuite/gcc.target/s390/reassoc-2.c b/gcc/testsuite/gcc.target/s390/reassoc-2.c new file mode 100644 index 00000000000..5e393ed4937 --- /dev/null +++ b/gcc/testsuite/gcc.target/s390/reassoc-2.c @@ -0,0 +1,7 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ + +#define MODIFY +#include "reassoc.h" + +/* { dg-final { scan-assembler {(?n)\n\tl\t(%r\d+),.+(\n.*)*\n\ta\t(\1),.+(\n.*)*\n\tar\t(%r\d+),(\1)} } } */ diff --git a/gcc/testsuite/gcc.target/s390/reassoc-3.c b/gcc/testsuite/gcc.target/s390/reassoc-3.c new file mode 100644 index 00000000000..ccbb627be7a --- /dev/null +++ b/gcc/testsuite/gcc.target/s390/reassoc-3.c @@ -0,0 +1,8 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ + +#define MODIFY +#define STORE_MODIFIED +#include "reassoc.h" + +/* { dg-final { scan-assembler {(?n)\n\tl\t(%r\d+),.+(\n.*)*\n\ta\t(\1),.+(\n.*)*\n\tar\t(%r\d+),(\1)} } } */ diff --git a/gcc/testsuite/gcc.target/s390/reassoc.h b/gcc/testsuite/gcc.target/s390/reassoc.h new file mode 100644 index 00000000000..b0e2b41202a --- /dev/null +++ b/gcc/testsuite/gcc.target/s390/reassoc.h @@ -0,0 +1,22 @@ +#define M 1024 +unsigned int arr1[M]; +unsigned int arr2[M]; +volatile unsigned int sink; + +unsigned int +test () +{ + unsigned int sum = 0; + for (int i = 0; i < M; i++) + { +#ifdef MODIFY + sum |= 1; +#ifdef STORE_MODIFIED + sink = sum; +#endif +#endif + sum += arr1[i]; + sum += arr2[i]; + } + return sum; +} diff --git a/gcc/tree-ssa-reassoc.c b/gcc/tree-ssa-reassoc.c index 2e67987f6c6..67be5312f42 100644 --- a/gcc/tree-ssa-reassoc.c +++ b/gcc/tree-ssa-reassoc.c @@ -177,6 +177,10 @@ along with GCC; see the file COPYING3. If not see point 3a in the pass header comment. */ static bool reassoc_insert_powi_p; +/* Enable biasing ranks of loop accumulators. We don't want this before + vectorization, since it interferes with reduction chains. */ +static bool reassoc_bias_loop_carried_phi_ranks_p; + /* Statistics */ static struct { @@ -275,6 +279,9 @@ phi_rank (gimple *stmt) use_operand_p use; gimple *use_stmt; + if (!reassoc_bias_loop_carried_phi_ranks_p) + return bb_rank[bb->index]; + /* We only care about real loops (those with a latch). */ if (!father->latch) return bb_rank[bb->index]; @@ -312,48 +319,59 @@ phi_rank (gimple *stmt) return bb_rank[bb->index]; } -/* If EXP is an SSA_NAME defined by a PHI statement that represents a - loop-carried dependence of an innermost loop, return TRUE; else - return FALSE. */ -static bool -loop_carried_phi (tree exp) -{ - gimple *phi_stmt; - long block_rank; - - if (TREE_CODE (exp) != SSA_NAME - || SSA_NAME_IS_DEFAULT_DEF (exp)) - return false; - - phi_stmt = SSA_NAME_DEF_STMT (exp); - - if (gimple_code (SSA_NAME_DEF_STMT (exp)) != GIMPLE_PHI) - return false; - - /* Non-loop-carried phis have block rank. Loop-carried phis have - an additional bias added in. If this phi doesn't have block rank, - it's biased and should not be propagated. */ - block_rank = bb_rank[gimple_bb (phi_stmt)->index]; - - if (phi_rank (phi_stmt) != block_rank) - return true; - - return false; -} - /* Return the maximum of RANK and the rank that should be propagated from expression OP. For most operands, this is just the rank of OP. - For loop-carried phis, the value is zero to avoid undoing the bias + For loop-carried phis, the value is block rank to avoid undoing the bias in favor of the phi. */ static long -propagate_rank (long rank, tree op) +propagate_rank (long rank, tree op, gimple *stmt) { long op_rank; - if (loop_carried_phi (op)) - return rank; - op_rank = get_rank (op); + if (op_rank & PHI_LOOP_BIAS) + { + /* Treat loop-carried phis and elements of innermost-loop-internal + single-use chains originating from them identically: propagate ranks + (which are always biased) only along innermost-loop-internal + single-use chains. Treat only GIMPLE_ASSIGNs to SSA_NAMEs as uses, + because only these participate in rank propagation. This way in the + following code: + + x_1 = phi(x_0, x_2) + q = x_1 & 1 + .MEM = q + b = a + q + c = b + d + x_2 = c + e + + the rank of q, which is essentially a form of loop accumulator, will + be biased. Ranks of b and c will also be biased, which seems + unnecessary at first glance, however, this will have no consequences, + since b and c are intermediate results, and their ranks will be + discarded when creating + operand lists. */ + use_operand_p use; + imm_use_iterator use_iter; + gimple *use_stmt = NULL; + FOR_EACH_IMM_USE_FAST (use, use_iter, gimple_assign_lhs (stmt)) + if (is_gimple_assign (USE_STMT (use)) + && TREE_CODE (gimple_assign_lhs (USE_STMT (use))) == SSA_NAME) + { + if (use_stmt == NULL) + { + use_stmt = USE_STMT (use); + } + else + { + use_stmt = NULL; + break; + } + } + if (use_stmt == NULL + || gimple_bb (stmt)->loop_father + != gimple_bb (use_stmt)->loop_father) + return bb_rank[gimple_bb (stmt)->index]; + } return MAX (rank, op_rank); } @@ -448,7 +466,7 @@ get_rank (tree e) thus have rank 0. */ rank = 0; FOR_EACH_SSA_TREE_OPERAND (op, stmt, iter, SSA_OP_USE) - rank = propagate_rank (rank, op); + rank = propagate_rank (rank, op, stmt); if (dump_file && (dump_flags & TDF_DETAILS)) { @@ -6662,9 +6680,10 @@ fini_reassoc (void) optimization of a gimple conditional. Otherwise returns zero. */ static unsigned int -execute_reassoc (bool insert_powi_p) +execute_reassoc (bool insert_powi_p, bool bias_loop_carried_phi_ranks_p) { reassoc_insert_powi_p = insert_powi_p; + reassoc_bias_loop_carried_phi_ranks_p = bias_loop_carried_phi_ranks_p; init_reassoc (); @@ -6705,15 +6724,19 @@ public: { gcc_assert (n == 0); insert_powi_p = param; + bias_loop_carried_phi_ranks_p = !param; } virtual bool gate (function *) { return flag_tree_reassoc != 0; } virtual unsigned int execute (function *) - { return execute_reassoc (insert_powi_p); } + { + return execute_reassoc (insert_powi_p, bias_loop_carried_phi_ranks_p); + } private: /* Enable insertion of __builtin_powi calls during execute_reassoc. See point 3a in the pass header comment. */ bool insert_powi_p; + bool bias_loop_carried_phi_ranks_p; }; // class pass_reassoc } // anon namespace -- 2.25.4