On 04/09/18 17:52, Kyrill Tkachov wrote: > > On 04/09/18 15:31, Richard Biener wrote: >> On Tue, 4 Sep 2018, Kyrill Tkachov wrote: >>> Hi Richard, >>> >>> On 31/08/18 12:07, Richard Biener wrote: >>>> On Thu, 30 Aug 2018, Kyrill Tkachov wrote: >>>>> Ping. >>>>> >>>>> https://gcc.gnu.org/ml/gcc-patches/2018-08/msg01496.html >>>>> >>>>> Thanks, >>>>> Kyrill >>>>> >>>>> On 23/08/18 18:09, Kyrill Tkachov wrote: >>>>>> Hi Richard, >>>>>> >>>>>> On 23/08/18 11:13, Richard Sandiford wrote: >>>>>>> Kyrill Tkachov <kyrylo.tkac...@foss.arm.com> writes: >>>>>>>> Hi all, >>>>>>>> >>>>>>>> This patch aims to optimise sequences involving uses of 1.0 / sqrt (a) >>>>>>>> under -freciprocal-math and -funsafe-math-optimizations. >>>>>>>> In particular consider: >>>>>>>> >>>>>>>> x = 1.0 / sqrt (a); >>>>>>>> r1 = x * x; // same as 1.0 / a >>>>>>>> r2 = a * x; // same as sqrt (a) >>>>>>>> >>>>>>>> If x, r1 and r2 are all used further on in the code, this can be >>>>>>>> transformed into: >>>>>>>> tmp1 = 1.0 / a >>>>>>>> tmp2 = sqrt (a) >>>>>>>> tmp3 = tmp1 * tmp2 >>>>>>>> x = tmp3 >>>>>>>> r1 = tmp1 >>>>>>>> r2 = tmp2 >>>>>>> Nice optimisation :-) Someone who knows the pass better should review, >>>>>>> but: >>>>>> Thanks for the review. >>>>>>> There seems to be an implicit assumption that this is a win even >>>>>>> when the r1 and r2 assignments are only conditionally executed. >>>>>>> That's probably true, but it might be worth saying explicitly. >>>>>> I'll admit I had not considered that case. >>>>>> I think it won't make a difference in practice, as the really expensive >>>>>> operations here >>>>>> are the sqrt and the division and they are on the executed path in either >>>>>> case and them >>>>>> becoming independent should be a benefit of its own. >>>>>>>> +/* Return TRUE if USE_STMT is a multiplication of DEF by A. */ >>>>>>>> + >>>>>>>> +static inline bool >>>>>>>> +is_mult_by (gimple *use_stmt, tree def, tree a) >>>>>>>> +{ >>>>>>>> + if (gimple_code (use_stmt) == GIMPLE_ASSIGN >>>>>>>> + && gimple_assign_rhs_code (use_stmt) == MULT_EXPR) >>>>>>>> + { >>>>>>>> + tree op0 = gimple_assign_rhs1 (use_stmt); >>>>>>>> + tree op1 = gimple_assign_rhs2 (use_stmt); >>>>>>>> + >>>>>>>> + return (op0 == def && op1 == a) >>>>>>>> + || (op0 == a && op1 == def); >>>>>>>> + } >>>>>>>> + return 0; >>>>>>>> +} >>>>>>> Seems like is_square_of could now be a light-weight wrapper around this. >>>>>> Indeed, I've done the wrapping now. >>>>>>>> @@ -652,6 +669,180 @@ execute_cse_reciprocals_1 (gimple_stmt_iterator >>>>>>>> *def_gsi, tree def) >>>>>>>> occ_head = NULL; >>>>>>>> } >>>>>>>> +/* Transform sequences like >>>>>>>> + x = 1.0 / sqrt (a); >>>>>>>> + r1 = x * x; >>>>>>>> + r2 = a * x; >>>>>>>> + into: >>>>>>>> + tmp1 = 1.0 / a; >>>>>>>> + tmp2 = sqrt (a); >>>>>>>> + tmp3 = tmp1 * tmp2; >>>>>>>> + x = tmp3; >>>>>>>> + r1 = tmp1; >>>>>>>> + r2 = tmp2; >>>>>>>> + depending on the uses of x, r1, r2. This removes one multiplication >>>>>>>> and >>>>>>>> + allows the sqrt and division operations to execute in parallel. >>>>>>>> + DEF_GSI is the gsi of the initial division by sqrt that defines >>>>>>>> + DEF (x in the example abovs). */ >>>>>>>> + >>>>>>>> +static void >>>>>>>> +optimize_recip_sqrt (gimple_stmt_iterator *def_gsi, tree def) >>>>>>>> +{ >>>>>>>> + use_operand_p use_p; >>>>>>>> + imm_use_iterator use_iter; >>>>>>>> + gimple *stmt = gsi_stmt (*def_gsi); >>>>>>>> + tree x = def; >>>>>>>> + tree orig_sqrt_ssa_name = gimple_assign_rhs2 (stmt); >>>>>>>> + tree div_rhs1 = gimple_assign_rhs1 (stmt); >>>>>>>> + >>>>>>>> + if (TREE_CODE (orig_sqrt_ssa_name) != SSA_NAME >>>>>>>> + || TREE_CODE (div_rhs1) != REAL_CST >>>>>>>> + || !real_equal (&TREE_REAL_CST (div_rhs1), &dconst1)) >>>>>>>> + return; >>>>>>>> + >>>>>>>> + gimple *sqrt_stmt = SSA_NAME_DEF_STMT (orig_sqrt_ssa_name); >>>>>>>> + if (!is_gimple_call (sqrt_stmt) >>>>>>>> + || !gimple_call_lhs (sqrt_stmt)) >>>>>>>> + return; >>>>>>>> + >>>>>>>> + gcall *call = as_a <gcall *> (sqrt_stmt); >>>>>>> Very minor, but: >>>>>>> >>>>>>> gcall *sqrt_stmt >>>>>>> = dyn_cast <gcall *> (SSA_NAME_DEF_STMT (orig_sqrt_ssa_name)); >>>>>>> if (!sqrt_stmt || !gimple_call_lhs (sqrt_stmt)) >>>>>>> return; >>>>>>> >>>>>>> would avoid the need for the separate as_a<>, and would mean that >>>>>>> we only call gimple_call_* on gcalls. >>>>>> Ok. >>>>>>>> + if (has_other_use) >>>>>>>> + { >>>>>>>> + /* Using the two temporaries tmp1, tmp2 from above >>>>>>>> + the original x is now: >>>>>>>> + x = tmp1 * tmp2. */ >>>>>>>> + gcc_assert (mult_ssa_name); >>>>>>>> + gcc_assert (sqr_ssa_name); >>>>>>>> + gimple_stmt_iterator gsi2 = gsi_for_stmt (stmt); >>>>>>>> + >>>>>>>> + tree new_ssa_name >>>>>>>> + = make_temp_ssa_name (TREE_TYPE (a), NULL, >>>>>>>> "recip_sqrt_transformed"); >>>>>>>> + gimple *new_stmt >>>>>>>> + = gimple_build_assign (new_ssa_name, MULT_EXPR, >>>>>>>> + mult_ssa_name, sqr_ssa_name); >>>>>>>> + gsi_insert_before (&gsi2, new_stmt, GSI_SAME_STMT); >>>>>>>> + gcc_assert (gsi_stmt (gsi2) == stmt); >>>>>>>> + gimple_assign_set_rhs_from_tree (&gsi2, new_ssa_name); >>>>>>>> + fold_stmt (&gsi2); >>>>>>>> + update_stmt (stmt); >>>>>>> In this case we're replacing the statement in its original position, >>>>>>> so there's no real need to use a temporary. It seems better to >>>>>>> change the rhs_code, rhs1 and rhs2 of stmt in-place, with the same >>>>>>> lhs as before. >>>>>> Yes, that's cleaner. >>>>>>>> @@ -762,6 +953,23 @@ pass_cse_reciprocals::execute (function *fun) >>>>>>>> if (optimize_bb_for_size_p (bb)) >>>>>>>> continue; >>>>>>> Seems unnecessary to skip the new optimisation when optimising for size. >>>>>>> Like you say, it saves a multiplication overall. Also: >>>>>> Indeed. >>>>>>>> + if (flag_unsafe_math_optimizations) >>>>>>>> + { >>>>>>>> + for (gimple_stmt_iterator gsi = gsi_after_labels (bb); >>>>>>>> + !gsi_end_p (gsi); >>>>>>>> + gsi_next (&gsi)) >>>>>>>> + { >>>>>>>> + gimple *stmt = gsi_stmt (gsi); >>>>>>>> + >>>>>>>> + if (gimple_has_lhs (stmt) >>>>>>>> + && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL >>>>>>>> + && FLOAT_TYPE_P (TREE_TYPE (def)) >>>>>>>> + && TREE_CODE (def) == SSA_NAME >>>>>>>> + && is_gimple_assign (stmt) >>>>>>>> + && gimple_assign_rhs_code (stmt) == RDIV_EXPR) >>>>>>>> + optimize_recip_sqrt (&gsi, def); >>>>>>>> + } >>>>>>>> + } >>>>>>> It looks like this could safely be done in one of the existing walks >>>>>>> (e.g. the execute_cse_reciprocals_1 one, if we do this when optimising >>>>>>> for size). >>>>>> You're right. I've moved this into one of the walks above this. >>>>>> >>>>>> Bootstrapped and tested on aarch64-none-linux-gnu and >>>>>> x86_64-unknown-linux-gnu. >>>>>> CC'ing richi as he's reviewed these kinds of patches in the past. >>>>>> >>>>>> Is this ok for trunk? >>>> I wonder how it interacts with execute_cse_reciprocals_1 given it >>>> introduces a division 1.0 / a which will be not processed by >>>> execute_cse_reciprocals_1 given that operates by walking uses of 'a'. >>>> That may be just a missed optimization of course. >>> Hmm, I believe right now it doesn't interact with execute_cse_reciprocals_1 as >>> it's either one or the other. >>> I've left it as it is for now, but would you like us to call >>> execute_cse_reciprocals_1 on the potentially transformed division? >> I think that wouldn't "fix" it given execute_cse_reciprocals_1 works by >> seeing all reciprocal uses of 'a' so it may have alrady seen two and >> you introduced the third. But we can leave "solving" this for >> future enhacements (I'd avoid doing two passes over the IL just because >> of said possibility right now). >>>> + if (has_other_use) >>>> + { >>>> + /* Using the two temporaries tmp1, tmp2 from above >>>> + the original x is now: >>>> + x = tmp1 * tmp2. */ >>>> + gcc_assert (mult_ssa_name); >>>> + gcc_assert (sqr_ssa_name); >>>> + >>>> + gimple_assign_set_rhs1 (stmt, mult_ssa_name); >>>> + gimple_assign_set_rhs2 (stmt, sqr_ssa_name); >>>> + gimple_assign_set_rhs_code (stmt, MULT_EXPR); >>>> + fold_stmt_inplace (def_gsi); >>>> + update_stmt (stmt); >>>> + } >>>> >>>> so you are leaving the original stmt in place unchanged even if it is >>>> not used? Why? Note that with -fno-call-exceptions this stmt may >>>> throw, so you should arrange to code-generate 1./a in place of the >>>> original division to preserve EH behavior. Watch out because then >>>> with other uses you have to find a place to insert its computation. >>> Ok. These are oversights on my part. I've updated the patch to modify the >>> original division in place. The multiplication is placed after it. >> If the division throws internally you can't insert after it, you >> have to insert on the single non-EH edge outgoing from the BB >> with the division. Just try a C++ testcase with -fnon-call-exceptions >> and a try {} catch(...) {} around. >> >> Not sure if it's worth to handle so bailing out if >> stmt_can_throw_internal (stmt) would be an option as well. >>>> + if (is_square_of (stmt2, x)) >>>> + { >>>> + if (!sqr_stmts.contains (stmt2)) >>>> + sqr_stmts.safe_push (stmt2); >>>> + } >>>> >>>> this is quadratic in the number of square stmts... please consider >>>> making sqr_stmts a bitmap of SSA defs (so the stmt you have now >>>> is then SSA_NAME_DEF_STMT (ssa_name (bitmap-element))). >>> Done. In practice I didn't see there being more than one such use, I expect >>> them >>> to be CSE'd, but maybe if they're in different basic blocks... >> Oh, it just occured to me you could use FOR_EACH_IMM_USE_STMT () >> which will only get you each stmt once... ;) Sorry for the misleading >> bitmap suggestion. >>>> You do not seem to restrict placement of the use stmts but insert >>>> before the 1/sqrt(a) stmt. That possibly hoists the multiplications >>>> where they are not needed. Consider >>>> >>>> x = 1./sqrt(a); >>>> if (l) >>>> l1 = x * 3.; >>>> else if (l2) >>>> l1 = x * x; >>>> else if (l3) >>>> l1 = a * x; >>>> >>>> or similar where on the path to x * 3. you now perform two extra >>>> multiplications. >>> Ok, I've restricted the optimisation somewhat. >>> Now if there's an other use it won't perform the transformation unless there >>> is already >>> a multiplication present on the main path. That way it won't introduce >>> multiplications >>> on paths where there aren't any. >> OK. >>>> Your testcases do not cover the case of other uses at all. Or of >>>> EH. >>> gcc.dg/recip_sqrt_mult_1.c should handle the other uses case as tmp is a >>> global >>> being written to. I've added more testcases involving uses in different basic >>> blocks >>> and a g++.dg testcase with -fnon-call-exceptions. >>> I'm not sure what functionality to test though apart from that it doesn't ICE >>> and does >>> the transformations I expect. >> That's good enough I guess. I see you do have a testcase with >> try/catch so I wonder why foo4 doesn't ICE when you insert after >> the throwing stmt... maybe it doesn't throw? Ah - they probably >> fail your new mult_on_main_path test? > > Yeah. I did manage to reproduce an ICE eventually by removing that check > and enabling -ftrapping-math. > > I think I'll just not enter this at all if stmt_can_throw_internal (stmt) like you suggested.
And here is the version that does that. It also reverts to using a vector for the sqr_stmts for consistency and uses FOR_EACH_IMM_USE_STMT to iterate over the use statements. Bootstrapped and tested on aarch64-none-linux-gnu. Ok? Thanks, Kyrill 2018-09-03 Kyrylo Tkachov <kyrylo.tkac...@arm.com> * tree-ssa-math-opts.c (is_mult_by): New function. (is_square_of): Use the above. (optimize_recip_sqrt): New function. (pass_cse_reciprocals::execute): Use the above. 2018-09-03 Kyrylo Tkachov <kyrylo.tkac...@arm.com> * gcc.dg/recip_sqrt_mult_1.c: New test. * gcc.dg/recip_sqrt_mult_2.c: Likewise. * gcc.dg/recip_sqrt_mult_3.c: Likewise. * gcc.dg/recip_sqrt_mult_4.c: Likewise. * gcc.dg/recip_sqrt_mult_5.c: Likewise. * g++.dg/recip_sqrt_mult_1.C: Likewise. * g++.dg/recip_sqrt_mult_2.C: Likewise.
diff --git a/gcc/testsuite/g++.dg/recip_sqrt_mult_1.C b/gcc/testsuite/g++.dg/recip_sqrt_mult_1.C new file mode 100644 index 0000000000000000000000000000000000000000..11d9c6f758f1529d8ed4cadf85010f6ce379c195 --- /dev/null +++ b/gcc/testsuite/g++.dg/recip_sqrt_mult_1.C @@ -0,0 +1,49 @@ +/* { dg-do compile } */ +/* { dg-options "-Ofast -fnon-call-exceptions -fdump-tree-recip" } */ + +double res, res2, tmp; +void +foo1 (double a, double b) +{ + try { + tmp = 1.0 / __builtin_sqrt (a); + res = tmp * tmp; + res2 = a * tmp; + } + catch (...) + { ; } +} + +void +foo4 (double a, double b, int c, int d) +{ + try { + tmp = 1.0 / __builtin_sqrt (a); + } + catch (...) + { + if (c) + res = tmp * tmp; + + if (d) + res2 = a * tmp; + } +} + +void +foo5 (double a, double b, int c, int d) +{ + try { + tmp = 1.0 / __builtin_sqrt (a); + res = tmp * tmp; + + if (d) + res2 = a * tmp; + } + catch (...) + { ; } +} + +/* { dg-final { scan-tree-dump-times "Optimizing reciprocal sqrt multiplications" 2 "recip" } } */ +/* { dg-final { scan-tree-dump-times "Replacing squaring multiplication" 2 "recip" } } */ +/* { dg-final { scan-tree-dump-times "Replacing original division" 2 "recip" } } */ diff --git a/gcc/testsuite/g++.dg/recip_sqrt_mult_2.C b/gcc/testsuite/g++.dg/recip_sqrt_mult_2.C new file mode 100644 index 0000000000000000000000000000000000000000..cca12caf4d79bfa69933d9b8fce41f38bb5b7a19 --- /dev/null +++ b/gcc/testsuite/g++.dg/recip_sqrt_mult_2.C @@ -0,0 +1,49 @@ +/* { dg-do compile } */ +/* { dg-options "-w -Ofast -fnon-call-exceptions -ftrapping-math -fdump-tree-recip" } */ + +/* Check that the recip_sqrt optimization does not trigger here, causing an + ICE due to EH info. */ + + +double res, res2, tmp; +void +foo1 (double a, double b) +{ + try { + tmp = 1.0 / __builtin_sqrt (a); + res = tmp * tmp; + res2 = a * tmp; + } + catch (...) + { ; } +} + +void +foo4 (double a, double b, int c, int d) +{ + try { + tmp = 1.0 / __builtin_sqrt (a); + } + catch (...) + { + if (c) + res = tmp * tmp; + + if (d) + res2 = a * tmp; + } +} + +void +foo5 (double a, double b, int c, int d) +{ + try { + tmp = 1.0 / __builtin_sqrt (a); + res = tmp * tmp; + + if (d) + res2 = a * tmp; + } + catch (...) + { ; } +} diff --git a/gcc/testsuite/gcc.dg/recip_sqrt_mult_1.c b/gcc/testsuite/gcc.dg/recip_sqrt_mult_1.c new file mode 100644 index 0000000000000000000000000000000000000000..188390a4ecffca1b21f05c4f19abecfb7ebd188f --- /dev/null +++ b/gcc/testsuite/gcc.dg/recip_sqrt_mult_1.c @@ -0,0 +1,15 @@ +/* { dg-do compile } */ +/* { dg-options "-Ofast -fdump-tree-recip" } */ + +double res, res2, tmp; +void +foo (double a, double b) +{ + tmp = 1.0 / __builtin_sqrt (a); + res = tmp * tmp; + res2 = a * tmp; +} + +/* { dg-final { scan-tree-dump "Optimizing reciprocal sqrt multiplications" "recip" } } */ +/* { dg-final { scan-tree-dump "Replacing squaring multiplication" "recip" } } */ +/* { dg-final { scan-tree-dump "Replacing original division" "recip" } } */ diff --git a/gcc/testsuite/gcc.dg/recip_sqrt_mult_2.c b/gcc/testsuite/gcc.dg/recip_sqrt_mult_2.c new file mode 100644 index 0000000000000000000000000000000000000000..c5fc3de7b657b1769e76254b4bc874e0595e43ef --- /dev/null +++ b/gcc/testsuite/gcc.dg/recip_sqrt_mult_2.c @@ -0,0 +1,11 @@ +/* { dg-do compile } */ +/* { dg-options "-Ofast -fdump-tree-optimized" } */ + +float +foo (float a) +{ + float tmp = 1.0f / __builtin_sqrtf (a); + return a * tmp; +} + +/* { dg-final { scan-tree-dump-not " / " "optimized" } } */ diff --git a/gcc/testsuite/gcc.dg/recip_sqrt_mult_3.c b/gcc/testsuite/gcc.dg/recip_sqrt_mult_3.c new file mode 100644 index 0000000000000000000000000000000000000000..e7d185ba7e22cfc7cca72296d5ccc544f24fdb14 --- /dev/null +++ b/gcc/testsuite/gcc.dg/recip_sqrt_mult_3.c @@ -0,0 +1,11 @@ +/* { dg-do compile } */ +/* { dg-options "-Ofast -fdump-tree-optimized" } */ + +double +foo (double a) +{ + double tmp = 1.0f / __builtin_sqrt (a); + return tmp * tmp; +} + +/* { dg-final { scan-tree-dump-not "__builtin_sqrt" "optimized" } } */ diff --git a/gcc/testsuite/gcc.dg/recip_sqrt_mult_4.c b/gcc/testsuite/gcc.dg/recip_sqrt_mult_4.c new file mode 100644 index 0000000000000000000000000000000000000000..e3005f2feb6f4bacbb6eafc0155e196cb866fcdf --- /dev/null +++ b/gcc/testsuite/gcc.dg/recip_sqrt_mult_4.c @@ -0,0 +1,21 @@ +/* { dg-do compile } */ +/* { dg-options "-Ofast -fdump-tree-recip" } */ + +/* The main path doesn't have any multiplications. + Avoid introducing them in the recip pass. */ + +double res, res2, tmp; +void +foo (double a, double b, int c, int d) +{ + tmp = 1.0 / __builtin_sqrt (a); + if (c) + res = tmp * tmp; + + if (d) + res2 = a * tmp; +} + +/* { dg-final { scan-tree-dump-not "Optimizing reciprocal sqrt multiplications" "recip" } } */ +/* { dg-final { scan-tree-dump-not "Replacing squaring multiplication" "recip" } } */ +/* { dg-final { scan-tree-dump-not "Replacing original division" "recip" } } */ diff --git a/gcc/testsuite/gcc.dg/recip_sqrt_mult_5.c b/gcc/testsuite/gcc.dg/recip_sqrt_mult_5.c new file mode 100644 index 0000000000000000000000000000000000000000..e871f0fcd4feb1687f9815e4babf4d0667a15ea8 --- /dev/null +++ b/gcc/testsuite/gcc.dg/recip_sqrt_mult_5.c @@ -0,0 +1,20 @@ +/* { dg-do compile } */ +/* { dg-options "-Ofast -fdump-tree-recip" } */ + +/* We want to do the recip_sqrt transformations here there is already + a multiplication on the main path. */ + +double res, res2, tmp; +void +foo (double a, double b, int c, int d) +{ + tmp = 1.0 / __builtin_sqrt (a); + res = tmp * tmp; + + if (d) + res2 = a * tmp; +} + +/* { dg-final { scan-tree-dump "Optimizing reciprocal sqrt multiplications" "recip" } } */ +/* { dg-final { scan-tree-dump "Replacing squaring multiplication" "recip" } } */ +/* { dg-final { scan-tree-dump "Replacing original division" "recip" } } */ diff --git a/gcc/tree-ssa-math-opts.c b/gcc/tree-ssa-math-opts.c index 25378da6f4ab27dbce51e10461efc18cc416a4d7..19bff5c3c3715f3e194e1c091ae01f6c4d6d2ce8 100644 --- a/gcc/tree-ssa-math-opts.c +++ b/gcc/tree-ssa-math-opts.c @@ -337,9 +337,9 @@ is_division_by (gimple *use_stmt, tree def) && gimple_assign_rhs1 (use_stmt) != def; } -/* Return whether USE_STMT is DEF * DEF. */ +/* Return TRUE if USE_STMT is a multiplication of DEF by A. */ static inline bool -is_square_of (gimple *use_stmt, tree def) +is_mult_by (gimple *use_stmt, tree def, tree a) { if (gimple_code (use_stmt) == GIMPLE_ASSIGN && gimple_assign_rhs_code (use_stmt) == MULT_EXPR) @@ -347,11 +347,19 @@ is_square_of (gimple *use_stmt, tree def) tree op0 = gimple_assign_rhs1 (use_stmt); tree op1 = gimple_assign_rhs2 (use_stmt); - return op0 == op1 && op0 == def; + return (op0 == def && op1 == a) + || (op0 == a && op1 == def); } return 0; } +/* Return whether USE_STMT is DEF * DEF. */ +static inline bool +is_square_of (gimple *use_stmt, tree def) +{ + return is_mult_by (use_stmt, def, def); +} + /* Return whether USE_STMT is a floating-point division by DEF * DEF. */ static inline bool @@ -526,6 +534,188 @@ free_bb (struct occurrence *occ) } } +/* Transform sequences like + t = sqrt (a) + x = 1.0 / t; + r1 = x * x; + r2 = a * x; + into: + t = sqrt (a) + r1 = 1.0 / a; + r2 = t; + x = r1 * r2; + depending on the uses of x, r1, r2. This removes one multiplication and + allows the sqrt and division operations to execute in parallel. + DEF_GSI is the gsi of the initial division by sqrt that defines + DEF (x in the example abovs). */ + +static void +optimize_recip_sqrt (gimple_stmt_iterator *def_gsi, tree def) +{ + gimple *use_stmt; + imm_use_iterator use_iter; + gimple *stmt = gsi_stmt (*def_gsi); + tree x = def; + tree orig_sqrt_ssa_name = gimple_assign_rhs2 (stmt); + tree div_rhs1 = gimple_assign_rhs1 (stmt); + + if (TREE_CODE (orig_sqrt_ssa_name) != SSA_NAME + || TREE_CODE (div_rhs1) != REAL_CST + || !real_equal (&TREE_REAL_CST (div_rhs1), &dconst1)) + return; + + gcall *sqrt_stmt + = dyn_cast <gcall *> (SSA_NAME_DEF_STMT (orig_sqrt_ssa_name)); + + if (!sqrt_stmt || !gimple_call_lhs (sqrt_stmt)) + return; + + switch (gimple_call_combined_fn (sqrt_stmt)) + { + CASE_CFN_SQRT: + CASE_CFN_SQRT_FN: + break; + + default: + return; + } + tree a = gimple_call_arg (sqrt_stmt, 0); + + /* We have 'a' and 'x'. Now analyze the uses of 'x'. */ + + /* Statements that use x in x * x. */ + auto_vec<gimple *> sqr_stmts; + /* Statements that use x in a * x. */ + auto_vec<gimple *> mult_stmts; + bool has_other_use = false; + bool mult_on_main_path = false; + + FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, x) + { + if (is_gimple_debug (use_stmt)) + continue; + if (is_square_of (use_stmt, x)) + { + sqr_stmts.safe_push (use_stmt); + if (gimple_bb (use_stmt) == gimple_bb (stmt)) + mult_on_main_path = true; + } + else if (is_mult_by (use_stmt, x, a)) + { + mult_stmts.safe_push (use_stmt); + if (gimple_bb (use_stmt) == gimple_bb (stmt)) + mult_on_main_path = true; + } + else + has_other_use = true; + } + + /* In the x * x and a * x cases we just rewire stmt operands or + remove multiplications. In the has_other_use case we introduce + a multiplication so make sure we don't introduce a multiplication + on a path where there was none. */ + if (has_other_use && !mult_on_main_path) + return; + + if (sqr_stmts.is_empty () && mult_stmts.is_empty ()) + return; + + /* If x = 1.0 / sqrt (a) has uses other than those optimized here we want + to be able to compose it from the sqr and mult cases. */ + if (has_other_use && (sqr_stmts.is_empty () || mult_stmts.is_empty ())) + return; + + if (dump_file) + { + fprintf (dump_file, "Optimizing reciprocal sqrt multiplications of\n"); + print_gimple_stmt (dump_file, sqrt_stmt, 0, TDF_NONE); + print_gimple_stmt (dump_file, stmt, 0, TDF_NONE); + fprintf (dump_file, "\n"); + } + + bool delete_div = !has_other_use; + tree sqr_ssa_name = NULL_TREE; + if (!sqr_stmts.is_empty ()) + { + /* r1 = x * x. Transform the original + x = 1.0 / t + into + tmp1 = 1.0 / a + r1 = tmp1. */ + + sqr_ssa_name + = make_temp_ssa_name (TREE_TYPE (a), NULL, "recip_sqrt_sqr"); + + if (dump_file) + { + fprintf (dump_file, "Replacing original division\n"); + print_gimple_stmt (dump_file, stmt, 0, TDF_NONE); + fprintf (dump_file, "with new division\n"); + } + gimple_assign_set_lhs (stmt, sqr_ssa_name); + gimple_assign_set_rhs2 (stmt, a); + fold_stmt_inplace (def_gsi); + update_stmt (stmt); + + if (dump_file) + print_gimple_stmt (dump_file, stmt, 0, TDF_NONE); + + delete_div = false; + gimple *sqr_stmt; + unsigned int i; + FOR_EACH_VEC_ELT (sqr_stmts, i, sqr_stmt) + { + gimple_stmt_iterator gsi2 = gsi_for_stmt (sqr_stmt); + gimple_assign_set_rhs_from_tree (&gsi2, sqr_ssa_name); + update_stmt (sqr_stmt); + } + } + if (!mult_stmts.is_empty ()) + { + /* r2 = a * x. Transform this into: + r2 = t (The original sqrt (a)). */ + unsigned int i; + gimple *mult_stmt = NULL; + FOR_EACH_VEC_ELT (mult_stmts, i, mult_stmt) + { + gimple_stmt_iterator gsi2 = gsi_for_stmt (mult_stmt); + + if (dump_file) + { + fprintf (dump_file, "Replacing squaring multiplication\n"); + print_gimple_stmt (dump_file, mult_stmt, 0, TDF_NONE); + fprintf (dump_file, "with assignment\n"); + } + gimple_assign_set_rhs_from_tree (&gsi2, orig_sqrt_ssa_name); + fold_stmt_inplace (&gsi2); + update_stmt (mult_stmt); + if (dump_file) + print_gimple_stmt (dump_file, mult_stmt, 0, TDF_NONE); + } + } + + if (has_other_use) + { + /* Using the two temporaries tmp1, tmp2 from above + the original x is now: + x = tmp1 * tmp2. */ + gcc_assert (orig_sqrt_ssa_name); + gcc_assert (sqr_ssa_name); + + gimple *new_stmt + = gimple_build_assign (x, MULT_EXPR, + orig_sqrt_ssa_name, sqr_ssa_name); + gsi_insert_after (def_gsi, new_stmt, GSI_NEW_STMT); + update_stmt (stmt); + } + else if (delete_div) + { + /* Remove the original division. */ + gimple_stmt_iterator gsi2 = gsi_for_stmt (stmt); + gsi_remove (&gsi2, true); + release_defs (stmt); + } +} /* Look for floating-point divisions among DEF's uses, and try to replace them by multiplications with the reciprocal. Add @@ -756,7 +946,15 @@ pass_cse_reciprocals::execute (function *fun) && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL && FLOAT_TYPE_P (TREE_TYPE (def)) && TREE_CODE (def) == SSA_NAME) - execute_cse_reciprocals_1 (&gsi, def); + { + if (flag_unsafe_math_optimizations + && is_gimple_assign (stmt) + && !stmt_can_throw_internal (stmt) + && gimple_assign_rhs_code (stmt) == RDIV_EXPR) + optimize_recip_sqrt (&gsi, def); + else + execute_cse_reciprocals_1 (&gsi, def); + } } if (optimize_bb_for_size_p (bb))