Hi Richard, On 31/08/18 12:07, Richard Biener wrote: > On Thu, 30 Aug 2018, Kyrill Tkachov wrote: > >> Ping. >> >> https://gcc.gnu.org/ml/gcc-patches/2018-08/msg01496.html >> >> Thanks, >> Kyrill >> >> On 23/08/18 18:09, Kyrill Tkachov wrote: >>> Hi Richard, >>> >>> On 23/08/18 11:13, Richard Sandiford wrote: >>>> Kyrill Tkachov <kyrylo.tkac...@foss.arm.com> writes: >>>>> Hi all, >>>>> >>>>> This patch aims to optimise sequences involving uses of 1.0 / sqrt (a) >>>>> under -freciprocal-math and -funsafe-math-optimizations. >>>>> In particular consider: >>>>> >>>>> x = 1.0 / sqrt (a); >>>>> r1 = x * x; // same as 1.0 / a >>>>> r2 = a * x; // same as sqrt (a) >>>>> >>>>> If x, r1 and r2 are all used further on in the code, this can be >>>>> transformed into: >>>>> tmp1 = 1.0 / a >>>>> tmp2 = sqrt (a) >>>>> tmp3 = tmp1 * tmp2 >>>>> x = tmp3 >>>>> r1 = tmp1 >>>>> r2 = tmp2 >>>> Nice optimisation :-) Someone who knows the pass better should review, >>>> but: >>> >>> Thanks for the review. >>> >>>> There seems to be an implicit assumption that this is a win even >>>> when the r1 and r2 assignments are only conditionally executed. >>>> That's probably true, but it might be worth saying explicitly. >>> >>> I'll admit I had not considered that case. >>> I think it won't make a difference in practice, as the really expensive >>> operations here >>> are the sqrt and the division and they are on the executed path in either >>> case and them >>> becoming independent should be a benefit of its own. >>> >>>>> +/* Return TRUE if USE_STMT is a multiplication of DEF by A. */ >>>>> + >>>>> +static inline bool >>>>> +is_mult_by (gimple *use_stmt, tree def, tree a) >>>>> +{ >>>>> + if (gimple_code (use_stmt) == GIMPLE_ASSIGN >>>>> + && gimple_assign_rhs_code (use_stmt) == MULT_EXPR) >>>>> + { >>>>> + tree op0 = gimple_assign_rhs1 (use_stmt); >>>>> + tree op1 = gimple_assign_rhs2 (use_stmt); >>>>> + >>>>> + return (op0 == def && op1 == a) >>>>> + || (op0 == a && op1 == def); >>>>> + } >>>>> + return 0; >>>>> +} >>>> Seems like is_square_of could now be a light-weight wrapper around this. >>> >>> Indeed, I've done the wrapping now. >>> >>>>> @@ -652,6 +669,180 @@ execute_cse_reciprocals_1 (gimple_stmt_iterator >>>>> *def_gsi, tree def) >>>>> occ_head = NULL; >>>>> } >>>>> +/* Transform sequences like >>>>> + x = 1.0 / sqrt (a); >>>>> + r1 = x * x; >>>>> + r2 = a * x; >>>>> + into: >>>>> + tmp1 = 1.0 / a; >>>>> + tmp2 = sqrt (a); >>>>> + tmp3 = tmp1 * tmp2; >>>>> + x = tmp3; >>>>> + r1 = tmp1; >>>>> + r2 = tmp2; >>>>> + depending on the uses of x, r1, r2. This removes one multiplication >>>>> and >>>>> + allows the sqrt and division operations to execute in parallel. >>>>> + DEF_GSI is the gsi of the initial division by sqrt that defines >>>>> + DEF (x in the example abovs). */ >>>>> + >>>>> +static void >>>>> +optimize_recip_sqrt (gimple_stmt_iterator *def_gsi, tree def) >>>>> +{ >>>>> + use_operand_p use_p; >>>>> + imm_use_iterator use_iter; >>>>> + gimple *stmt = gsi_stmt (*def_gsi); >>>>> + tree x = def; >>>>> + tree orig_sqrt_ssa_name = gimple_assign_rhs2 (stmt); >>>>> + tree div_rhs1 = gimple_assign_rhs1 (stmt); >>>>> + >>>>> + if (TREE_CODE (orig_sqrt_ssa_name) != SSA_NAME >>>>> + || TREE_CODE (div_rhs1) != REAL_CST >>>>> + || !real_equal (&TREE_REAL_CST (div_rhs1), &dconst1)) >>>>> + return; >>>>> + >>>>> + gimple *sqrt_stmt = SSA_NAME_DEF_STMT (orig_sqrt_ssa_name); >>>>> + if (!is_gimple_call (sqrt_stmt) >>>>> + || !gimple_call_lhs (sqrt_stmt)) >>>>> + return; >>>>> + >>>>> + gcall *call = as_a <gcall *> (sqrt_stmt); >>>> Very minor, but: >>>> >>>> gcall *sqrt_stmt >>>> = dyn_cast <gcall *> (SSA_NAME_DEF_STMT (orig_sqrt_ssa_name)); >>>> if (!sqrt_stmt || !gimple_call_lhs (sqrt_stmt)) >>>> return; >>>> >>>> would avoid the need for the separate as_a<>, and would mean that >>>> we only call gimple_call_* on gcalls. >>> >>> Ok. >>> >>>>> + if (has_other_use) >>>>> + { >>>>> + /* Using the two temporaries tmp1, tmp2 from above >>>>> + the original x is now: >>>>> + x = tmp1 * tmp2. */ >>>>> + gcc_assert (mult_ssa_name); >>>>> + gcc_assert (sqr_ssa_name); >>>>> + gimple_stmt_iterator gsi2 = gsi_for_stmt (stmt); >>>>> + >>>>> + tree new_ssa_name >>>>> + = make_temp_ssa_name (TREE_TYPE (a), NULL, >>>>> "recip_sqrt_transformed"); >>>>> + gimple *new_stmt >>>>> + = gimple_build_assign (new_ssa_name, MULT_EXPR, >>>>> + mult_ssa_name, sqr_ssa_name); >>>>> + gsi_insert_before (&gsi2, new_stmt, GSI_SAME_STMT); >>>>> + gcc_assert (gsi_stmt (gsi2) == stmt); >>>>> + gimple_assign_set_rhs_from_tree (&gsi2, new_ssa_name); >>>>> + fold_stmt (&gsi2); >>>>> + update_stmt (stmt); >>>> In this case we're replacing the statement in its original position, >>>> so there's no real need to use a temporary. It seems better to >>>> change the rhs_code, rhs1 and rhs2 of stmt in-place, with the same >>>> lhs as before. >>> >>> Yes, that's cleaner. >>> >>>>> @@ -762,6 +953,23 @@ pass_cse_reciprocals::execute (function *fun) >>>>> if (optimize_bb_for_size_p (bb)) >>>>> continue; >>>> Seems unnecessary to skip the new optimisation when optimising for size. >>>> Like you say, it saves a multiplication overall. Also: >>> >>> Indeed. >>> >>>>> + if (flag_unsafe_math_optimizations) >>>>> + { >>>>> + for (gimple_stmt_iterator gsi = gsi_after_labels (bb); >>>>> + !gsi_end_p (gsi); >>>>> + gsi_next (&gsi)) >>>>> + { >>>>> + gimple *stmt = gsi_stmt (gsi); >>>>> + >>>>> + if (gimple_has_lhs (stmt) >>>>> + && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL >>>>> + && FLOAT_TYPE_P (TREE_TYPE (def)) >>>>> + && TREE_CODE (def) == SSA_NAME >>>>> + && is_gimple_assign (stmt) >>>>> + && gimple_assign_rhs_code (stmt) == RDIV_EXPR) >>>>> + optimize_recip_sqrt (&gsi, def); >>>>> + } >>>>> + } >>>> It looks like this could safely be done in one of the existing walks >>>> (e.g. the execute_cse_reciprocals_1 one, if we do this when optimising >>>> for size). >>> >>> You're right. I've moved this into one of the walks above this. >>> >>> Bootstrapped and tested on aarch64-none-linux-gnu and >>> x86_64-unknown-linux-gnu. >>> CC'ing richi as he's reviewed these kinds of patches in the past. >>> >>> Is this ok for trunk? > > I wonder how it interacts with execute_cse_reciprocals_1 given it > introduces a division 1.0 / a which will be not processed by > execute_cse_reciprocals_1 given that operates by walking uses of 'a'. > That may be just a missed optimization of course.
Hmm, I believe right now it doesn't interact with execute_cse_reciprocals_1 as it's either one or the other. I've left it as it is for now, but would you like us to call execute_cse_reciprocals_1 on the potentially transformed division? > > + if (has_other_use) > + { > + /* Using the two temporaries tmp1, tmp2 from above > + the original x is now: > + x = tmp1 * tmp2. */ > + gcc_assert (mult_ssa_name); > + gcc_assert (sqr_ssa_name); > + > + gimple_assign_set_rhs1 (stmt, mult_ssa_name); > + gimple_assign_set_rhs2 (stmt, sqr_ssa_name); > + gimple_assign_set_rhs_code (stmt, MULT_EXPR); > + fold_stmt_inplace (def_gsi); > + update_stmt (stmt); > + } > > so you are leaving the original stmt in place unchanged even if it is > not used? Why? Note that with -fno-call-exceptions this stmt may > throw, so you should arrange to code-generate 1./a in place of the > original division to preserve EH behavior. Watch out because then > with other uses you have to find a place to insert its computation. Ok. These are oversights on my part. I've updated the patch to modify the original division in place. The multiplication is placed after it. > > + if (is_square_of (stmt2, x)) > + { > + if (!sqr_stmts.contains (stmt2)) > + sqr_stmts.safe_push (stmt2); > + } > > this is quadratic in the number of square stmts... please consider > making sqr_stmts a bitmap of SSA defs (so the stmt you have now > is then SSA_NAME_DEF_STMT (ssa_name (bitmap-element))). > Done. In practice I didn't see there being more than one such use, I expect them to be CSE'd, but maybe if they're in different basic blocks... > You do not seem to restrict placement of the use stmts but insert > before the 1/sqrt(a) stmt. That possibly hoists the multiplications > where they are not needed. Consider > > x = 1./sqrt(a); > if (l) > l1 = x * 3.; > else if (l2) > l1 = x * x; > else if (l3) > l1 = a * x; > > or similar where on the path to x * 3. you now perform two extra > multiplications. > Ok, I've restricted the optimisation somewhat. Now if there's an other use it won't perform the transformation unless there is already a multiplication present on the main path. That way it won't introduce multiplications on paths where there aren't any. > Your testcases do not cover the case of other uses at all. Or of > EH. gcc.dg/recip_sqrt_mult_1.c should handle the other uses case as tmp is a global being written to. I've added more testcases involving uses in different basic blocks and a g++.dg testcase with -fnon-call-exceptions. I'm not sure what functionality to test though apart from that it doesn't ICE and does the transformations I expect. Bootstrapped and tested on aarch64-none-linux-gnu. Is this version better? Thanks, Kyrill 2018-09-03 Kyrylo Tkachov <kyrylo.tkac...@arm.com> * tree-ssa-math-opts.c (is_mult_by): New function. (is_square_of): Use the above. (optimize_recip_sqrt): New function. (pass_cse_reciprocals::execute): Use the above. 2018-09-03 Kyrylo Tkachov <kyrylo.tkac...@arm.com> * gcc.dg/recip_sqrt_mult_1.c: New test. * gcc.dg/recip_sqrt_mult_2.c: Likewise. * gcc.dg/recip_sqrt_mult_3.c: Likewise. * gcc.dg/recip_sqrt_mult_4.c: Likewise. * gcc.dg/recip_sqrt_mult_5.c: Likewise. * g++.dg/recip_sqrt_mult_1.c: Likewise.
diff --git a/gcc/testsuite/g++.dg/recip_sqrt_mult_1.C b/gcc/testsuite/g++.dg/recip_sqrt_mult_1.C new file mode 100644 index 0000000000000000000000000000000000000000..11d9c6f758f1529d8ed4cadf85010f6ce379c195 --- /dev/null +++ b/gcc/testsuite/g++.dg/recip_sqrt_mult_1.C @@ -0,0 +1,49 @@ +/* { dg-do compile } */ +/* { dg-options "-Ofast -fnon-call-exceptions -fdump-tree-recip" } */ + +double res, res2, tmp; +void +foo1 (double a, double b) +{ + try { + tmp = 1.0 / __builtin_sqrt (a); + res = tmp * tmp; + res2 = a * tmp; + } + catch (...) + { ; } +} + +void +foo4 (double a, double b, int c, int d) +{ + try { + tmp = 1.0 / __builtin_sqrt (a); + } + catch (...) + { + if (c) + res = tmp * tmp; + + if (d) + res2 = a * tmp; + } +} + +void +foo5 (double a, double b, int c, int d) +{ + try { + tmp = 1.0 / __builtin_sqrt (a); + res = tmp * tmp; + + if (d) + res2 = a * tmp; + } + catch (...) + { ; } +} + +/* { dg-final { scan-tree-dump-times "Optimizing reciprocal sqrt multiplications" 2 "recip" } } */ +/* { dg-final { scan-tree-dump-times "Replacing squaring multiplication" 2 "recip" } } */ +/* { dg-final { scan-tree-dump-times "Replacing original division" 2 "recip" } } */ diff --git a/gcc/testsuite/gcc.dg/recip_sqrt_mult_1.c b/gcc/testsuite/gcc.dg/recip_sqrt_mult_1.c new file mode 100644 index 0000000000000000000000000000000000000000..188390a4ecffca1b21f05c4f19abecfb7ebd188f --- /dev/null +++ b/gcc/testsuite/gcc.dg/recip_sqrt_mult_1.c @@ -0,0 +1,15 @@ +/* { dg-do compile } */ +/* { dg-options "-Ofast -fdump-tree-recip" } */ + +double res, res2, tmp; +void +foo (double a, double b) +{ + tmp = 1.0 / __builtin_sqrt (a); + res = tmp * tmp; + res2 = a * tmp; +} + +/* { dg-final { scan-tree-dump "Optimizing reciprocal sqrt multiplications" "recip" } } */ +/* { dg-final { scan-tree-dump "Replacing squaring multiplication" "recip" } } */ +/* { dg-final { scan-tree-dump "Replacing original division" "recip" } } */ diff --git a/gcc/testsuite/gcc.dg/recip_sqrt_mult_2.c b/gcc/testsuite/gcc.dg/recip_sqrt_mult_2.c new file mode 100644 index 0000000000000000000000000000000000000000..c5fc3de7b657b1769e76254b4bc874e0595e43ef --- /dev/null +++ b/gcc/testsuite/gcc.dg/recip_sqrt_mult_2.c @@ -0,0 +1,11 @@ +/* { dg-do compile } */ +/* { dg-options "-Ofast -fdump-tree-optimized" } */ + +float +foo (float a) +{ + float tmp = 1.0f / __builtin_sqrtf (a); + return a * tmp; +} + +/* { dg-final { scan-tree-dump-not " / " "optimized" } } */ diff --git a/gcc/testsuite/gcc.dg/recip_sqrt_mult_3.c b/gcc/testsuite/gcc.dg/recip_sqrt_mult_3.c new file mode 100644 index 0000000000000000000000000000000000000000..e7d185ba7e22cfc7cca72296d5ccc544f24fdb14 --- /dev/null +++ b/gcc/testsuite/gcc.dg/recip_sqrt_mult_3.c @@ -0,0 +1,11 @@ +/* { dg-do compile } */ +/* { dg-options "-Ofast -fdump-tree-optimized" } */ + +double +foo (double a) +{ + double tmp = 1.0f / __builtin_sqrt (a); + return tmp * tmp; +} + +/* { dg-final { scan-tree-dump-not "__builtin_sqrt" "optimized" } } */ diff --git a/gcc/testsuite/gcc.dg/recip_sqrt_mult_4.c b/gcc/testsuite/gcc.dg/recip_sqrt_mult_4.c new file mode 100644 index 0000000000000000000000000000000000000000..e3005f2feb6f4bacbb6eafc0155e196cb866fcdf --- /dev/null +++ b/gcc/testsuite/gcc.dg/recip_sqrt_mult_4.c @@ -0,0 +1,21 @@ +/* { dg-do compile } */ +/* { dg-options "-Ofast -fdump-tree-recip" } */ + +/* The main path doesn't have any multiplications. + Avoid introducing them in the recip pass. */ + +double res, res2, tmp; +void +foo (double a, double b, int c, int d) +{ + tmp = 1.0 / __builtin_sqrt (a); + if (c) + res = tmp * tmp; + + if (d) + res2 = a * tmp; +} + +/* { dg-final { scan-tree-dump-not "Optimizing reciprocal sqrt multiplications" "recip" } } */ +/* { dg-final { scan-tree-dump-not "Replacing squaring multiplication" "recip" } } */ +/* { dg-final { scan-tree-dump-not "Replacing original division" "recip" } } */ diff --git a/gcc/testsuite/gcc.dg/recip_sqrt_mult_5.c b/gcc/testsuite/gcc.dg/recip_sqrt_mult_5.c new file mode 100644 index 0000000000000000000000000000000000000000..e871f0fcd4feb1687f9815e4babf4d0667a15ea8 --- /dev/null +++ b/gcc/testsuite/gcc.dg/recip_sqrt_mult_5.c @@ -0,0 +1,20 @@ +/* { dg-do compile } */ +/* { dg-options "-Ofast -fdump-tree-recip" } */ + +/* We want to do the recip_sqrt transformations here there is already + a multiplication on the main path. */ + +double res, res2, tmp; +void +foo (double a, double b, int c, int d) +{ + tmp = 1.0 / __builtin_sqrt (a); + res = tmp * tmp; + + if (d) + res2 = a * tmp; +} + +/* { dg-final { scan-tree-dump "Optimizing reciprocal sqrt multiplications" "recip" } } */ +/* { dg-final { scan-tree-dump "Replacing squaring multiplication" "recip" } } */ +/* { dg-final { scan-tree-dump "Replacing original division" "recip" } } */ diff --git a/gcc/tree-ssa-math-opts.c b/gcc/tree-ssa-math-opts.c index 25378da6f4ab27dbce51e10461efc18cc416a4d7..297afcd7a8f9cfc5f1edc1a762371b2a127c66db 100644 --- a/gcc/tree-ssa-math-opts.c +++ b/gcc/tree-ssa-math-opts.c @@ -337,9 +337,9 @@ is_division_by (gimple *use_stmt, tree def) && gimple_assign_rhs1 (use_stmt) != def; } -/* Return whether USE_STMT is DEF * DEF. */ +/* Return TRUE if USE_STMT is a multiplication of DEF by A. */ static inline bool -is_square_of (gimple *use_stmt, tree def) +is_mult_by (gimple *use_stmt, tree def, tree a) { if (gimple_code (use_stmt) == GIMPLE_ASSIGN && gimple_assign_rhs_code (use_stmt) == MULT_EXPR) @@ -347,11 +347,19 @@ is_square_of (gimple *use_stmt, tree def) tree op0 = gimple_assign_rhs1 (use_stmt); tree op1 = gimple_assign_rhs2 (use_stmt); - return op0 == op1 && op0 == def; + return (op0 == def && op1 == a) + || (op0 == a && op1 == def); } return 0; } +/* Return whether USE_STMT is DEF * DEF. */ +static inline bool +is_square_of (gimple *use_stmt, tree def) +{ + return is_mult_by (use_stmt, def, def); +} + /* Return whether USE_STMT is a floating-point division by DEF * DEF. */ static inline bool @@ -526,6 +534,195 @@ free_bb (struct occurrence *occ) } } +/* Transform sequences like + t = sqrt (a) + x = 1.0 / t; + r1 = x * x; + r2 = a * x; + into: + t = sqrt (a) + r1 = 1.0 / a; + r2 = t; + x = r1 * r2; + depending on the uses of x, r1, r2. This removes one multiplication and + allows the sqrt and division operations to execute in parallel. + DEF_GSI is the gsi of the initial division by sqrt that defines + DEF (x in the example abovs). */ + +static void +optimize_recip_sqrt (gimple_stmt_iterator *def_gsi, tree def) +{ + use_operand_p use_p; + imm_use_iterator use_iter; + gimple *stmt = gsi_stmt (*def_gsi); + tree x = def; + tree orig_sqrt_ssa_name = gimple_assign_rhs2 (stmt); + tree div_rhs1 = gimple_assign_rhs1 (stmt); + + if (TREE_CODE (orig_sqrt_ssa_name) != SSA_NAME + || TREE_CODE (div_rhs1) != REAL_CST + || !real_equal (&TREE_REAL_CST (div_rhs1), &dconst1)) + return; + + gcall *sqrt_stmt + = dyn_cast <gcall *> (SSA_NAME_DEF_STMT (orig_sqrt_ssa_name)); + + if (!sqrt_stmt || !gimple_call_lhs (sqrt_stmt)) + return; + + switch (gimple_call_combined_fn (sqrt_stmt)) + { + CASE_CFN_SQRT: + CASE_CFN_SQRT_FN: + break; + + default: + return; + } + tree a = gimple_call_arg (sqrt_stmt, 0); + + /* We have 'a' and 'x'. Now analyze the uses of 'x'. */ + + /* Statements that use x in x * x. Since x appears twice use an SSA_NAME + bitmap to avoid recording the statement twice. */ + auto_bitmap sqr_ssa_names; + /* Statements that use x in a * x. */ + auto_vec<gimple *> mult_stmts; + bool has_other_use = false; + bool mult_on_main_path = false; + + FOR_EACH_IMM_USE_FAST (use_p, use_iter, x) + { + gimple *stmt2 = USE_STMT (use_p); + if (is_gimple_debug (stmt2)) + continue; + if (is_square_of (stmt2, x)) + { + bitmap_set_bit (sqr_ssa_names, + SSA_NAME_VERSION (gimple_assign_lhs (stmt2))); + if (gimple_bb (stmt2) == gimple_bb (stmt)) + mult_on_main_path = true; + } + else if (is_mult_by (stmt2, x, a)) + { + mult_stmts.safe_push (stmt2); + if (gimple_bb (stmt2) == gimple_bb (stmt)) + mult_on_main_path = true; + } + else + has_other_use = true; + } + + /* In the x * x and a * x cases we just rewire stmt operands or + remove multiplications. In the has_other_use case we introduce + a multiplication so make sure we don't introduce a multiplication + on a path where there was none. */ + if (has_other_use && !mult_on_main_path) + return; + + if (bitmap_empty_p (sqr_ssa_names) && mult_stmts.is_empty ()) + return; + + /* If x = 1.0 / sqrt (a) has uses other than those optimized here we want + to be able to compose it from the sqr and mult cases. */ + if (has_other_use && (bitmap_empty_p (sqr_ssa_names) + || mult_stmts.is_empty ())) + return; + + if (dump_file) + { + fprintf (dump_file, "Optimizing reciprocal sqrt multiplications of\n"); + print_gimple_stmt (dump_file, sqrt_stmt, 0, TDF_NONE); + print_gimple_stmt (dump_file, stmt, 0, TDF_NONE); + fprintf (dump_file, "\n"); + } + + bool delete_div = !has_other_use; + tree sqr_ssa_name = NULL_TREE; + if (!bitmap_empty_p (sqr_ssa_names)) + { + /* r1 = x * x. Transform the original + x = 1.0 / t + into + tmp1 = 1.0 / a + r1 = tmp1. */ + + sqr_ssa_name + = make_temp_ssa_name (TREE_TYPE (a), NULL, "recip_sqrt_sqr"); + + if (dump_file) + { + fprintf (dump_file, "Replacing original division\n"); + print_gimple_stmt (dump_file, stmt, 0, TDF_NONE); + fprintf (dump_file, "with new division\n"); + } + gimple_assign_set_lhs (stmt, sqr_ssa_name); + gimple_assign_set_rhs2 (stmt, a); + fold_stmt_inplace (def_gsi); + update_stmt (stmt); + + if (dump_file) + print_gimple_stmt (dump_file, stmt, 0, TDF_NONE); + + delete_div = false; + gimple *sqr_stmt; + bitmap_iterator bi; + unsigned int i; + EXECUTE_IF_SET_IN_BITMAP (sqr_ssa_names, 0, i, bi) + { + sqr_stmt = SSA_NAME_DEF_STMT (ssa_name (i)); + + gimple_stmt_iterator gsi2 = gsi_for_stmt (sqr_stmt); + gimple_assign_set_rhs_from_tree (&gsi2, sqr_ssa_name); + update_stmt (sqr_stmt); + } + } + if (!mult_stmts.is_empty ()) + { + /* r2 = a * x. Transform this into: + r2 = t (The original sqrt (a)). */ + unsigned int i; + gimple *mult_stmt = NULL; + FOR_EACH_VEC_ELT (mult_stmts, i, mult_stmt) + { + gimple_stmt_iterator gsi2 = gsi_for_stmt (mult_stmt); + + if (dump_file) + { + fprintf (dump_file, "Replacing squaring multiplication\n"); + print_gimple_stmt (dump_file, mult_stmt, 0, TDF_NONE); + fprintf (dump_file, "with assignment\n"); + } + gimple_assign_set_rhs_from_tree (&gsi2, orig_sqrt_ssa_name); + fold_stmt_inplace (&gsi2); + update_stmt (mult_stmt); + if (dump_file) + print_gimple_stmt (dump_file, mult_stmt, 0, TDF_NONE); + } + } + + if (has_other_use) + { + /* Using the two temporaries tmp1, tmp2 from above + the original x is now: + x = tmp1 * tmp2. */ + gcc_assert (orig_sqrt_ssa_name); + gcc_assert (sqr_ssa_name); + + gimple *new_stmt + = gimple_build_assign (x, MULT_EXPR, + orig_sqrt_ssa_name, sqr_ssa_name); + gsi_insert_after (def_gsi, new_stmt, GSI_NEW_STMT); + update_stmt (stmt); + } + else if (delete_div) + { + /* Remove the original division. */ + gimple_stmt_iterator gsi2 = gsi_for_stmt (stmt); + gsi_remove (&gsi2, true); + release_defs (stmt); + } +} /* Look for floating-point divisions among DEF's uses, and try to replace them by multiplications with the reciprocal. Add @@ -546,6 +743,7 @@ execute_cse_reciprocals_1 (gimple_stmt_iterator *def_gsi, tree def) int sqrt_recip_count = 0; gcc_assert (FLOAT_TYPE_P (TREE_TYPE (def)) && TREE_CODE (def) == SSA_NAME); + threshold = targetm.min_divisions_for_recip_mul (TYPE_MODE (TREE_TYPE (def))); /* If DEF is a square (x * x), count the number of divisions by x. @@ -756,7 +954,14 @@ pass_cse_reciprocals::execute (function *fun) && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL && FLOAT_TYPE_P (TREE_TYPE (def)) && TREE_CODE (def) == SSA_NAME) - execute_cse_reciprocals_1 (&gsi, def); + { + if (flag_unsafe_math_optimizations + && is_gimple_assign (stmt) + && gimple_assign_rhs_code (stmt) == RDIV_EXPR) + optimize_recip_sqrt (&gsi, def); + else + execute_cse_reciprocals_1 (&gsi, def); + } } if (optimize_bb_for_size_p (bb))