On Thu, Jun 3, 2021 at 5:33 AM Kewen.Lin <li...@linux.ibm.com> wrote: > > Hi Richard, > > on 2021/6/3 上午1:19, Richard Sandiford wrote: > > "Kewen.Lin via Gcc-patches" <gcc-patches@gcc.gnu.org> writes: > >> Hi, > >> > >> As PR100794 shows, in the current implementation PRE bypasses > >> some optimization to avoid introducing loop carried dependence > >> which stops loop vectorizer to vectorize the loop. At -O2, > >> there is no downstream pass to re-catch this kind of opportunity > >> if loop vectorizer fails to vectorize that loop. > >> > >> This patch follows Richi's suggestion in the PR, if predcom flag > >> isn't set and loop vectorization will enable predcom without any > >> unrolling implicitly. The Power9 SPEC2017 evaluation showed it > >> can speed up 521.wrf_r 3.30% and 554.roms_r 1.08% at very-cheap > >> cost model, no remarkable impact at cheap cost model, the build > >> time and size impact is fine (see the PR for the details). > >> > >> By the way, I tested another proposal to guard PRE not skip the > >> optimization for cheap and very-cheap vect cost models, the > >> evaluation results showed it's fine with very cheap cost model, > >> but it can degrade some bmks like 521.wrf_r -9.17% and > >> 549.fotonik3d_r -2.07% etc. > >> > >> Bootstrapped/regtested on powerpc64le-linux-gnu P9, > >> x86_64-redhat-linux and aarch64-linux-gnu. > >> > >> Is it ok for trunk? > >> > >> BR, > >> Kewen > >> ----- > >> gcc/ChangeLog: > >> > >> PR tree-optimization/100794 > >> * tree-predcom.c (tree_predictive_commoning_loop): Add parameter > >> allow_unroll_p and only allow unrolling when it's true. > >> (tree_predictive_commoning): Add parameter allow_unroll_p and > >> adjust for it. > >> (run_tree_predictive_commoning): Likewise. > >> (class pass_predcom): Add private member allow_unroll_p. > >> (pass_predcom::pass_predcom): Init allow_unroll_p. > >> (pass_predcom::gate): Check flag_tree_loop_vectorize and > >> global_options_set.x_flag_predictive_commoning. > >> (pass_predcom::execute): Adjust for allow_unroll_p. > >> > >> gcc/testsuite/ChangeLog: > >> > >> PR tree-optimization/100794 > >> * gcc.dg/tree-ssa/pr100794.c: New test. > >> > >> gcc/testsuite/gcc.dg/tree-ssa/pr100794.c | 20 +++++++++ > >> gcc/tree-predcom.c | 57 +++++++++++++++++------- > >> 2 files changed, 60 insertions(+), 17 deletions(-) > >> create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr100794.c > >> > >> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr100794.c > >> b/gcc/testsuite/gcc.dg/tree-ssa/pr100794.c > >> new file mode 100644 > >> index 00000000000..6f707ae7fba > >> --- /dev/null > >> +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr100794.c > >> @@ -0,0 +1,20 @@ > >> +/* { dg-do compile } */ > >> +/* { dg-options "-O2 -ftree-loop-vectorize -fdump-tree-pcom-details > >> -fdisable-tree-vect" } */ > >> + > >> +extern double arr[100]; > >> +extern double foo (double, double); > >> +extern double sum; > >> + > >> +void > >> +test (int i_0, int i_n) > >> +{ > >> + int i; > >> + for (i = i_0; i < i_n - 1; i++) > >> + { > >> + double a = arr[i]; > >> + double b = arr[i + 1]; > >> + sum += a * b; > >> + } > >> +} > >> + > >> +/* { dg-final { scan-tree-dump "Executing predictive commoning without > >> unrolling" "pcom" } } */ > >> diff --git a/gcc/tree-predcom.c b/gcc/tree-predcom.c > >> index 02f911a08bb..65a93c8e505 100644 > >> --- a/gcc/tree-predcom.c > >> +++ b/gcc/tree-predcom.c > >> @@ -3178,13 +3178,13 @@ insert_init_seqs (class loop *loop, vec<chain_p> > >> chains) > >> applied to this loop. */ > >> > >> static unsigned > >> -tree_predictive_commoning_loop (class loop *loop) > >> +tree_predictive_commoning_loop (class loop *loop, bool allow_unroll_p) > >> { > >> vec<data_reference_p> datarefs; > >> vec<ddr_p> dependences; > >> struct component *components; > >> vec<chain_p> chains = vNULL; > >> - unsigned unroll_factor; > >> + unsigned unroll_factor = 0; > >> class tree_niter_desc desc; > >> bool unroll = false, loop_closed_ssa = false; > >> > >> @@ -3272,11 +3272,13 @@ tree_predictive_commoning_loop (class loop *loop) > >> dump_chains (dump_file, chains); > >> } > >> > >> - /* Determine the unroll factor, and if the loop should be unrolled, > >> ensure > >> - that its number of iterations is divisible by the factor. */ > >> - unroll_factor = determine_unroll_factor (chains); > >> - unroll = (unroll_factor > 1 > >> - && can_unroll_loop_p (loop, unroll_factor, &desc)); > >> + if (allow_unroll_p) > >> + /* Determine the unroll factor, and if the loop should be unrolled, > >> ensure > >> + that its number of iterations is divisible by the factor. */ > >> + unroll_factor = determine_unroll_factor (chains); > >> + > >> + if (unroll_factor > 1) > >> + unroll = can_unroll_loop_p (loop, unroll_factor, &desc); > >> > >> /* Execute the predictive commoning transformations, and possibly > >> unroll the > >> loop. */ > >> @@ -3319,7 +3321,7 @@ tree_predictive_commoning_loop (class loop *loop) > >> /* Runs predictive commoning. */ > >> > >> unsigned > >> -tree_predictive_commoning (void) > >> +tree_predictive_commoning (bool allow_unroll_p) > >> { > >> class loop *loop; > >> unsigned ret = 0, changed = 0; > >> @@ -3328,7 +3330,7 @@ tree_predictive_commoning (void) > >> FOR_EACH_LOOP (loop, LI_ONLY_INNERMOST) > >> if (optimize_loop_for_speed_p (loop)) > >> { > >> - changed |= tree_predictive_commoning_loop (loop); > >> + changed |= tree_predictive_commoning_loop (loop, allow_unroll_p); > >> } > >> free_original_copy_tables (); > >> > >> @@ -3355,12 +3357,12 @@ tree_predictive_commoning (void) > >> /* Predictive commoning Pass. */ > >> > >> static unsigned > >> -run_tree_predictive_commoning (struct function *fun) > >> +run_tree_predictive_commoning (struct function *fun, bool allow_unroll_p) > >> { > >> if (number_of_loops (fun) <= 1) > >> return 0; > >> > >> - return tree_predictive_commoning (); > >> + return tree_predictive_commoning (allow_unroll_p); > >> } > >> > >> namespace { > >> @@ -3382,15 +3384,36 @@ class pass_predcom : public gimple_opt_pass > >> { > >> public: > >> pass_predcom (gcc::context *ctxt) > >> - : gimple_opt_pass (pass_data_predcom, ctxt) > >> + : gimple_opt_pass (pass_data_predcom, ctxt), > >> + allow_unroll_p (true) > >> {} > >> > >> /* opt_pass methods: */ > >> - virtual bool gate (function *) { return flag_predictive_commoning != 0; > >> } > >> - virtual unsigned int execute (function *fun) > >> - { > >> - return run_tree_predictive_commoning (fun); > >> - } > >> + virtual bool > >> + gate (function *) > >> + { > >> + if (flag_predictive_commoning != 0) > >> + return true; > >> + /* Loop vectorization enables predictive commoning implicitly > >> + only if predictive commoning isn't set explicitly, and it > >> + doesn't allow unrolling. */ > >> + if (flag_tree_loop_vectorize > >> + && !global_options_set.x_flag_predictive_commoning) > >> + { > >> + allow_unroll_p = false; > >> + return true; > >> + } > >> + return false; > >> + } > >> + > >> + virtual unsigned int > >> + execute (function *fun) > >> + { > >> + return run_tree_predictive_commoning (fun, allow_unroll_p); > >> + } > >> + > >> +private: > >> + bool allow_unroll_p; > >> > >> }; // class pass_predcom > > > > Calculating allow_unroll_p this way doesn't look robust against > > changes in options caused by pragmas, etc. Would it work if we > > dropped the member variable and just passed flag_predictive_commoning != 0 > > to run_tree_predictive commoning? > > > > Thanks for the comments! Yeah, it would work well. The updated version > of patch has been attached.
I think this version is OK. Thanks, Richard. > BR, > Kewen > ----- > gcc/ChangeLog: > > PR tree-optimization/100794 > * tree-predcom.c (tree_predictive_commoning_loop): Add parameter > allow_unroll_p and only allow unrolling when it's true. > (tree_predictive_commoning): Add parameter allow_unroll_p and > adjust for it. > (run_tree_predictive_commoning): Likewise. > (pass_predcom::gate): Check flag_tree_loop_vectorize and > global_options_set.x_flag_predictive_commoning. > (pass_predcom::execute): Adjust for allow_unroll_p. > > gcc/testsuite/ChangeLog: > > PR tree-optimization/100794 > * gcc.dg/tree-ssa/pr100794.c: New test.