Hi, As the discussion in PR96789, we found that some scalar stmts which can be eliminated by some passes after SLP, but we still modeled their costs when trying to SLP, it could impact vectorizer's decision. One typical case is the case in PR on target Power.
As Richard suggested there, this patch is to introduce one pass called scalar_cleanup which has some secondary clean up passes, for now they are FRE and DSE. It's only triggered when seeing one TODO flag called TODO_force_next_scalar_cleanup set, unlike normal TODO flags, the flag is kept in one global variable pending_TODOs and expects its downstream consumer to handle it. I verified that it can get x264's runtime performance back on Power, I also evaluated the compilation time for the SPEC2017 int benchmarks build with one single job, this patch increase the compilation time by 0.74%. Bootstrapped/regtested on powerpc64le-linux-gnu P9. Is it ok for trunk? BR, Kewen ----------- gcc/ChangeLog: PR tree-optimization/96789 * passes.c (execute_one_pass): Add support for TODO_force_next_scalar_cleanup. (pending_TODOs): Init. * passes.def (pass_scalar_cleanup): New pass, add pass_fre and pass_dse as its children. * timevar.def (TV_SCALAR_CLEANUP): New timevar. * tree-pass.h (TODO_force_next_scalar_cleanup): New TODO flag. (make_pass_scalar_cleanup): New function declare. (pending_TODOs): New variable declare. * tree-ssa-loop-ivcanon.c (pass_complete_unroll::execute): Set TODO_force_next_scalar_cleanup if there are no loops. (class pass_scalar_cleanup): New class. (pass_data_scalar_cleanup): New pass data. (make_pass_scalar_cleanup): New function. gcc/testsuite/ChangeLog: PR tree-optimization/96789 * gcc.dg/tree-ssa/ssa-dse-28.c: Adjust. * gcc.dg/tree-ssa/ssa-dse-29.c: Likewise. * gcc.dg/tree-ssa/pr96789.c: New test.
diff --git a/gcc/passes.c b/gcc/passes.c index 6ff31ec37d7..b0ab9f66557 100644 --- a/gcc/passes.c +++ b/gcc/passes.c @@ -71,6 +71,8 @@ using namespace gcc; The variable current_pass is also used for statistics and plugins. */ opt_pass *current_pass; +unsigned int pending_TODOs = 0; + /* Most passes are single-instance (within their context) and thus don't need to implement cloning, but passes that support multiple instances *must* provide their own implementation of the clone method. @@ -2538,6 +2540,12 @@ execute_one_pass (opt_pass *pass) return true; } + if (todo_after & TODO_force_next_scalar_cleanup) + { + todo_after &= ~TODO_force_next_scalar_cleanup; + pending_TODOs |= TODO_force_next_scalar_cleanup; + } + do_per_function (clear_last_verified, NULL); do_per_function (update_properties_after_pass, pass); diff --git a/gcc/passes.def b/gcc/passes.def index f865bdc19ac..3d9cccb6df1 100644 --- a/gcc/passes.def +++ b/gcc/passes.def @@ -290,11 +290,16 @@ along with GCC; see the file COPYING3. If not see /* pass_vectorize must immediately follow pass_if_conversion. Please do not add any other passes in between. */ NEXT_PASS (pass_vectorize); - PUSH_INSERT_PASSES_WITHIN (pass_vectorize) + PUSH_INSERT_PASSES_WITHIN (pass_vectorize) NEXT_PASS (pass_dce); - POP_INSERT_PASSES () - NEXT_PASS (pass_predcom); + POP_INSERT_PASSES () + NEXT_PASS (pass_predcom); NEXT_PASS (pass_complete_unroll); + NEXT_PASS (pass_scalar_cleanup); + PUSH_INSERT_PASSES_WITHIN (pass_scalar_cleanup) + NEXT_PASS (pass_fre, false /* may_iterate */); + NEXT_PASS (pass_dse); + POP_INSERT_PASSES () NEXT_PASS (pass_slp_vectorize); NEXT_PASS (pass_loop_prefetch); /* Run IVOPTs after the last pass that uses data-reference analysis diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr96789.c b/gcc/testsuite/gcc.dg/tree-ssa/pr96789.c new file mode 100644 index 00000000000..46425d83b02 --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr96789.c @@ -0,0 +1,58 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -funroll-loops -fdump-tree-dse-details" } */ + +/* Test if scalar cleanup pass takes effects, mainly check + its secondary pass DSE can remove dead stores on array + tmp. */ + +#include "stdint.h" + +static inline void +foo (int16_t *diff, int i_size, uint8_t *val1, int i_val1, uint8_t *val2, + int i_val2) +{ + for (int y = 0; y < i_size; y++) + { + for (int x = 0; x < i_size; x++) + diff[x + y * i_size] = val1[x] - val2[x]; + val1 += i_val1; + val2 += i_val2; + } +} + +void +bar (int16_t res[16], uint8_t *val1, uint8_t *val2) +{ + int16_t d[16]; + int16_t tmp[16]; + + foo (d, 4, val1, 16, val2, 32); + + for (int i = 0; i < 4; i++) + { + int s03 = d[i * 4 + 0] + d[i * 4 + 3]; + int s12 = d[i * 4 + 1] + d[i * 4 + 2]; + int d03 = d[i * 4 + 0] - d[i * 4 + 3]; + int d12 = d[i * 4 + 1] - d[i * 4 + 2]; + + tmp[0 * 4 + i] = s03 + s12; + tmp[1 * 4 + i] = 2 * d03 + d12; + tmp[2 * 4 + i] = s03 - s12; + tmp[3 * 4 + i] = d03 - 2 * d12; + } + + for (int i = 0; i < 4; i++) + { + int s03 = tmp[i * 4 + 0] + tmp[i * 4 + 3]; + int s12 = tmp[i * 4 + 1] + tmp[i * 4 + 2]; + int d03 = tmp[i * 4 + 0] - tmp[i * 4 + 3]; + int d12 = tmp[i * 4 + 1] - tmp[i * 4 + 2]; + + res[i * 4 + 0] = s03 + s12; + res[i * 4 + 1] = 2 * d03 + d12; + res[i * 4 + 2] = s03 - s12; + res[i * 4 + 3] = d03 - 2 * d12; + } +} + +/* { dg-final { scan-tree-dump {Deleted dead store:.*tmp} "dse3" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-28.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-28.c index d3a1bbc5ce5..b81cabe604a 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-28.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-28.c @@ -17,5 +17,5 @@ int foo (int *p, int b) /* { dg-final { scan-tree-dump-not "Deleted dead store" "dse1"} } */ /* { dg-final { scan-tree-dump-not "Deleted dead store" "dse2"} } */ -/* { dg-final { scan-tree-dump-not "Deleted dead store" "dse3"} } */ +/* { dg-final { scan-tree-dump-not "Deleted dead store" "dse4"} } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-29.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-29.c index 31529e7caed..f4ef89c590c 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-29.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-dse-29.c @@ -22,5 +22,5 @@ foo(int cond, struct z *s) /* { dg-final { scan-tree-dump-times "Deleted dead store" 3 "dse1"} } */ /* { dg-final { scan-tree-dump-not "Deleted dead store" "dse2"} } */ -/* { dg-final { scan-tree-dump-not "Deleted dead store" "dse3"} } */ +/* { dg-final { scan-tree-dump-not "Deleted dead store" "dse4"} } */ diff --git a/gcc/timevar.def b/gcc/timevar.def index 08c21c04009..a3031799700 100644 --- a/gcc/timevar.def +++ b/gcc/timevar.def @@ -194,6 +194,7 @@ DEFTIMEVAR (TV_TREE_LOOP_UNSWITCH , "tree loop unswitching") DEFTIMEVAR (TV_LOOP_SPLIT , "loop splitting") DEFTIMEVAR (TV_LOOP_JAM , "unroll and jam") DEFTIMEVAR (TV_COMPLETE_UNROLL , "complete unrolling") +DEFTIMEVAR (TV_SCALAR_CLEANUP , "scalar cleanup") DEFTIMEVAR (TV_TREE_PARALLELIZE_LOOPS, "tree parallelize loops") DEFTIMEVAR (TV_TREE_VECTORIZATION , "tree vectorization") DEFTIMEVAR (TV_TREE_SLP_VECTORIZATION, "tree slp vectorization") diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h index 62e5b696cab..9086455fcd4 100644 --- a/gcc/tree-pass.h +++ b/gcc/tree-pass.h @@ -301,6 +301,11 @@ protected: /* Release function body and stop pass manager. */ #define TODO_discard_function (1 << 23) +/* Used as one pending action, it expects the following scalar + cleanup pass will clear it and do the cleanup work when it's + met. */ +#define TODO_force_next_scalar_cleanup (1 << 24) + /* Internally used in execute_function_todo(). */ #define TODO_update_ssa_any \ (TODO_update_ssa \ @@ -380,6 +385,7 @@ extern gimple_opt_pass *make_pass_simduid_cleanup (gcc::context *ctxt); extern gimple_opt_pass *make_pass_slp_vectorize (gcc::context *ctxt); extern gimple_opt_pass *make_pass_complete_unroll (gcc::context *ctxt); extern gimple_opt_pass *make_pass_complete_unrolli (gcc::context *ctxt); +extern gimple_opt_pass *make_pass_scalar_cleanup (gcc::context *ctxt); extern gimple_opt_pass *make_pass_parallelize_loops (gcc::context *ctxt); extern gimple_opt_pass *make_pass_loop_prefetch (gcc::context *ctxt); extern gimple_opt_pass *make_pass_iv_optimize (gcc::context *ctxt); @@ -629,6 +635,12 @@ extern gimple_opt_pass *make_pass_convert_switch (gcc::context *ctxt); extern gimple_opt_pass *make_pass_lower_vaarg (gcc::context *ctxt); extern gimple_opt_pass *make_pass_gimple_isel (gcc::context *ctxt); +/* Different from normal TODO_flags which are handled right at the begin + or the end of one pass execution, the pending TODO_flags are allowed + to be passed down in the pipeline until one of its consumers can take + over it. */ +extern unsigned int pending_TODOs; + /* Current optimization pass. */ extern opt_pass *current_pass; diff --git a/gcc/tree-ssa-loop-ivcanon.c b/gcc/tree-ssa-loop-ivcanon.c index 298ab215530..7016f993339 100644 --- a/gcc/tree-ssa-loop-ivcanon.c +++ b/gcc/tree-ssa-loop-ivcanon.c @@ -1605,6 +1605,14 @@ pass_complete_unroll::execute (function *fun) peeled_loops = BITMAP_ALLOC (NULL); unsigned int val = tree_unroll_loops_completely (flag_cunroll_grow_size, true); + + /* There are no loops after unrolling, we assume that it's not so costly + to do the scalar cleanup since here. FIXME: Some heuristics can be + further added to guard the cost level, like nodes number total, all + the original loops should be with single exits, etc. */ + if (!current_loops->tree_root->inner) + val |= TODO_force_next_scalar_cleanup; + if (peeled_loops) { BITMAP_FREE (peeled_loops); @@ -1676,4 +1684,51 @@ make_pass_complete_unrolli (gcc::context *ctxt) return new pass_complete_unrolli (ctxt); } +/* Scalar cleanup, it has several gated cleanup passes like FRE, DSE. */ + +namespace { + +const pass_data pass_data_scalar_cleanup = +{ + GIMPLE_PASS, /* type */ + "*scalar_cleanup", /* name */ + OPTGROUP_LOOP, /* optinfo_flags */ + TV_SCALAR_CLEANUP, /* tv_id */ + ( PROP_cfg | PROP_ssa ), /* properties_required */ + 0, /* properties_provided */ + 0, /* properties_destroyed */ + 0, /* todo_flags_start */ + 0, /* todo_flags_finish */ +}; + +class pass_scalar_cleanup : public gimple_opt_pass +{ +public: + pass_scalar_cleanup (gcc::context *ctxt) + : gimple_opt_pass (pass_data_scalar_cleanup, ctxt) + { + } + + virtual bool + gate (function *) + { + return pending_TODOs & TODO_force_next_scalar_cleanup; + } + + virtual unsigned int + execute (function *) + { + pending_TODOs &= ~TODO_force_next_scalar_cleanup; + return 0; + } + +}; // class pass_scalar_cleanup + +} // anon namespace + +gimple_opt_pass * +make_pass_scalar_cleanup (gcc::context *ctxt) +{ + return new pass_scalar_cleanup (ctxt); +}