Hi Richard,

On 31/08/18 12:07, Richard Biener wrote:
> On Thu, 30 Aug 2018, Kyrill Tkachov wrote:
>
>> Ping.
>>
>> https://gcc.gnu.org/ml/gcc-patches/2018-08/msg01496.html
>>
>> Thanks,
>> Kyrill
>>
>> On 23/08/18 18:09, Kyrill Tkachov wrote:
>>> Hi Richard,
>>>
>>> On 23/08/18 11:13, Richard Sandiford wrote:
>>>> Kyrill  Tkachov <kyrylo.tkac...@foss.arm.com> writes:
>>>>> Hi all,
>>>>>
>>>>> This patch aims to optimise sequences involving uses of 1.0 / sqrt (a)
>>>>> under -freciprocal-math and -funsafe-math-optimizations.
>>>>> In particular consider:
>>>>>
>>>>> x = 1.0 / sqrt (a);
>>>>> r1 = x * x;  // same as 1.0 / a
>>>>> r2 = a * x; // same as sqrt (a)
>>>>>
>>>>> If x, r1 and r2 are all used further on in the code, this can be
>>>>> transformed into:
>>>>> tmp1 = 1.0 / a
>>>>> tmp2 = sqrt (a)
>>>>> tmp3 = tmp1 * tmp2
>>>>> x = tmp3
>>>>> r1 = tmp1
>>>>> r2 = tmp2
>>>> Nice optimisation :-)  Someone who knows the pass better should review,
>>>> but:
>>>
>>> Thanks for the review.
>>>
>>>> There seems to be an implicit assumption that this is a win even
>>>> when the r1 and r2 assignments are only conditionally executed.
>>>> That's probably true, but it might be worth saying explicitly.
>>>
>>> I'll admit I had not considered that case.
>>> I think it won't make a difference in practice, as the really expensive
>>> operations here
>>> are the sqrt and the division and they are on the executed path in either
>>> case and them
>>> becoming independent should be a benefit of its own.
>>>
>>>>> +/* Return TRUE if USE_STMT is a multiplication of DEF by A.  */
>>>>> +
>>>>> +static inline bool
>>>>> +is_mult_by (gimple *use_stmt, tree def, tree a)
>>>>> +{
>>>>> +  if (gimple_code (use_stmt) == GIMPLE_ASSIGN
>>>>> +      && gimple_assign_rhs_code (use_stmt) == MULT_EXPR)
>>>>> +    {
>>>>> +      tree op0 = gimple_assign_rhs1 (use_stmt);
>>>>> +      tree op1 = gimple_assign_rhs2 (use_stmt);
>>>>> +
>>>>> +      return (op0 == def && op1 == a)
>>>>> +          || (op0 == a && op1 == def);
>>>>> +    }
>>>>> +  return 0;
>>>>> +}
>>>> Seems like is_square_of could now be a light-weight wrapper around this.
>>>
>>> Indeed, I've done the wrapping now.
>>>
>>>>> @@ -652,6 +669,180 @@ execute_cse_reciprocals_1 (gimple_stmt_iterator
>>>>> *def_gsi, tree def)
>>>>>     occ_head = NULL;
>>>>>   }
>>>>>   +/* Transform sequences like
>>>>> +   x = 1.0 / sqrt (a);
>>>>> +   r1 = x * x;
>>>>> +   r2 = a * x;
>>>>> +   into:
>>>>> +   tmp1 = 1.0 / a;
>>>>> +   tmp2 = sqrt (a);
>>>>> +   tmp3 = tmp1 * tmp2;
>>>>> +   x = tmp3;
>>>>> +   r1 = tmp1;
>>>>> +   r2 = tmp2;
>>>>> +   depending on the uses of x, r1, r2.  This removes one multiplication
>>>>> and
>>>>> +   allows the sqrt and division operations to execute in parallel.
>>>>> +   DEF_GSI is the gsi of the initial division by sqrt that defines
>>>>> +   DEF (x in the example abovs).  */
>>>>> +
>>>>> +static void
>>>>> +optimize_recip_sqrt (gimple_stmt_iterator *def_gsi, tree def)
>>>>> +{
>>>>> +  use_operand_p use_p;
>>>>> +  imm_use_iterator use_iter;
>>>>> +  gimple *stmt = gsi_stmt (*def_gsi);
>>>>> +  tree x = def;
>>>>> +  tree orig_sqrt_ssa_name = gimple_assign_rhs2 (stmt);
>>>>> +  tree div_rhs1 = gimple_assign_rhs1 (stmt);
>>>>> +
>>>>> +  if (TREE_CODE (orig_sqrt_ssa_name) != SSA_NAME
>>>>> +      || TREE_CODE (div_rhs1) != REAL_CST
>>>>> +      || !real_equal (&TREE_REAL_CST (div_rhs1), &dconst1))
>>>>> +    return;
>>>>> +
>>>>> +  gimple *sqrt_stmt = SSA_NAME_DEF_STMT (orig_sqrt_ssa_name);
>>>>> +  if (!is_gimple_call (sqrt_stmt)
>>>>> +      || !gimple_call_lhs (sqrt_stmt))
>>>>> +    return;
>>>>> +
>>>>> +  gcall *call = as_a <gcall *> (sqrt_stmt);
>>>> Very minor, but:
>>>>
>>>>    gcall *sqrt_stmt
>>>>      = dyn_cast <gcall *> (SSA_NAME_DEF_STMT (orig_sqrt_ssa_name));
>>>>    if (!sqrt_stmt || !gimple_call_lhs (sqrt_stmt))
>>>>      return;
>>>>
>>>> would avoid the need for the separate as_a<>, and would mean that
>>>> we only call gimple_call_* on gcalls.
>>>
>>> Ok.
>>>
>>>>> +  if (has_other_use)
>>>>> +    {
>>>>> +      /* Using the two temporaries tmp1, tmp2 from above
>>>>> +     the original x is now:
>>>>> +     x = tmp1 * tmp2.  */
>>>>> +      gcc_assert (mult_ssa_name);
>>>>> +      gcc_assert (sqr_ssa_name);
>>>>> +      gimple_stmt_iterator gsi2 = gsi_for_stmt (stmt);
>>>>> +
>>>>> +      tree new_ssa_name
>>>>> +    = make_temp_ssa_name (TREE_TYPE (a), NULL,
>>>>> "recip_sqrt_transformed");
>>>>> +      gimple *new_stmt
>>>>> +    = gimple_build_assign (new_ssa_name, MULT_EXPR,
>>>>> +                   mult_ssa_name, sqr_ssa_name);
>>>>> +      gsi_insert_before (&gsi2, new_stmt, GSI_SAME_STMT);
>>>>> +      gcc_assert (gsi_stmt (gsi2) == stmt);
>>>>> +      gimple_assign_set_rhs_from_tree (&gsi2, new_ssa_name);
>>>>> +      fold_stmt (&gsi2);
>>>>> +      update_stmt (stmt);
>>>> In this case we're replacing the statement in its original position,
>>>> so there's no real need to use a temporary.  It seems better to
>>>> change the rhs_code, rhs1 and rhs2 of stmt in-place, with the same
>>>> lhs as before.
>>>
>>> Yes, that's cleaner.
>>>
>>>>> @@ -762,6 +953,23 @@ pass_cse_reciprocals::execute (function *fun)
>>>>>         if (optimize_bb_for_size_p (bb))
>>>>>           continue;
>>>> Seems unnecessary to skip the new optimisation when optimising for size.
>>>> Like you say, it saves a multiplication overall. Also:
>>>
>>> Indeed.
>>>
>>>>> +      if (flag_unsafe_math_optimizations)
>>>>> +    {
>>>>> +      for (gimple_stmt_iterator gsi = gsi_after_labels (bb);
>>>>> +           !gsi_end_p (gsi);
>>>>> +           gsi_next (&gsi))
>>>>> +        {
>>>>> +          gimple *stmt = gsi_stmt (gsi);
>>>>> +
>>>>> +          if (gimple_has_lhs (stmt)
>>>>> +          && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL
>>>>> +          && FLOAT_TYPE_P (TREE_TYPE (def))
>>>>> +          && TREE_CODE (def) == SSA_NAME
>>>>> +          && is_gimple_assign (stmt)
>>>>> +          && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
>>>>> +        optimize_recip_sqrt (&gsi, def);
>>>>> +        }
>>>>> +    }
>>>> It looks like this could safely be done in one of the existing walks
>>>> (e.g. the execute_cse_reciprocals_1 one, if we do this when optimising
>>>> for size).
>>>
>>> You're right. I've moved this into one of the walks above this.
>>>
>>> Bootstrapped and tested on aarch64-none-linux-gnu and
>>> x86_64-unknown-linux-gnu.
>>> CC'ing richi as he's reviewed these kinds of patches in the past.
>>>
>>> Is this ok for trunk?
>
> I wonder how it interacts with execute_cse_reciprocals_1 given it
> introduces a division 1.0 / a which will be not processed by
> execute_cse_reciprocals_1 given that operates by walking uses of 'a'.
> That may be just a missed optimization of course.

Hmm, I believe right now it doesn't interact with execute_cse_reciprocals_1 as 
it's either one or the other.
I've left it as it is for now, but would you like us to call 
execute_cse_reciprocals_1 on the potentially transformed division?


>
> +  if (has_other_use)
> +    {
> +      /* Using the two temporaries tmp1, tmp2 from above
> +        the original x is now:
> +        x = tmp1 * tmp2.  */
> +      gcc_assert (mult_ssa_name);
> +      gcc_assert (sqr_ssa_name);
> +
> +      gimple_assign_set_rhs1 (stmt, mult_ssa_name);
> +      gimple_assign_set_rhs2 (stmt, sqr_ssa_name);
> +      gimple_assign_set_rhs_code (stmt, MULT_EXPR);
> +      fold_stmt_inplace (def_gsi);
> +      update_stmt (stmt);
> +    }
>
> so you are leaving the original stmt in place unchanged even if it is
> not used?  Why?  Note that with -fno-call-exceptions this stmt may
> throw, so you should arrange to code-generate 1./a in place of the
> original division to preserve EH behavior.  Watch out because then
> with other uses you have to find a place to insert its computation.

Ok. These are oversights on my part. I've updated the patch to modify the
original division in place. The multiplication is placed after it.


>
> +      if (is_square_of (stmt2, x))
> +       {
> +         if (!sqr_stmts.contains (stmt2))
> +           sqr_stmts.safe_push (stmt2);
> +       }
>
> this is quadratic in the number of square stmts...  please consider
> making sqr_stmts a bitmap of SSA defs (so the stmt you have now
> is then SSA_NAME_DEF_STMT (ssa_name (bitmap-element))).
>

Done. In practice I didn't see there being more than one such use, I expect them
to be CSE'd, but maybe if they're in different basic blocks...

> You do not seem to restrict placement of the use stmts but insert
> before the 1/sqrt(a) stmt.  That possibly hoists the multiplications
> where they are not needed.  Consider
>
>   x = 1./sqrt(a);
>   if (l)
>     l1 = x * 3.;
>   else if (l2)
>     l1 = x * x;
>   else if (l3)
>     l1 = a * x;
>
> or similar where on the path to x * 3. you now perform two extra
> multiplications.
>

Ok, I've restricted the optimisation somewhat.
Now if there's an other use it won't perform the transformation unless there is 
already
a multiplication present on the main path. That way it won't introduce 
multiplications
on paths where there aren't any.


> Your testcases do not cover the case of other uses at all.  Or of
> EH.

gcc.dg/recip_sqrt_mult_1.c should handle the other uses case as tmp is a global
being written to. I've added more testcases involving uses in different basic 
blocks
and a g++.dg testcase with -fnon-call-exceptions.
I'm not sure what functionality to test though apart from that it doesn't ICE 
and does
the transformations I expect.

Bootstrapped and tested on aarch64-none-linux-gnu.
Is this version better?

Thanks,
Kyrill


2018-09-03  Kyrylo Tkachov  <kyrylo.tkac...@arm.com>

    * tree-ssa-math-opts.c (is_mult_by): New function.
    (is_square_of): Use the above.
    (optimize_recip_sqrt): New function.
    (pass_cse_reciprocals::execute): Use the above.

2018-09-03  Kyrylo Tkachov  <kyrylo.tkac...@arm.com>

    * gcc.dg/recip_sqrt_mult_1.c: New test.
    * gcc.dg/recip_sqrt_mult_2.c: Likewise.
    * gcc.dg/recip_sqrt_mult_3.c: Likewise.
    * gcc.dg/recip_sqrt_mult_4.c: Likewise.
    * gcc.dg/recip_sqrt_mult_5.c: Likewise.
    * g++.dg/recip_sqrt_mult_1.c: Likewise.
diff --git a/gcc/testsuite/g++.dg/recip_sqrt_mult_1.C b/gcc/testsuite/g++.dg/recip_sqrt_mult_1.C
new file mode 100644
index 0000000000000000000000000000000000000000..11d9c6f758f1529d8ed4cadf85010f6ce379c195
--- /dev/null
+++ b/gcc/testsuite/g++.dg/recip_sqrt_mult_1.C
@@ -0,0 +1,49 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast -fnon-call-exceptions -fdump-tree-recip" } */
+
+double res, res2, tmp;
+void
+foo1 (double a, double b)
+{
+  try {
+    tmp = 1.0 / __builtin_sqrt (a);
+    res = tmp * tmp;
+    res2 = a * tmp;
+  }
+  catch (...)
+    { ; }
+}
+
+void
+foo4 (double a, double b, int c, int d)
+{
+  try {
+    tmp = 1.0 / __builtin_sqrt (a);
+  }
+  catch (...)
+    {
+      if (c)
+	res = tmp * tmp;
+
+      if (d)
+	res2 = a * tmp;
+    }
+}
+
+void
+foo5 (double a, double b, int c, int d)
+{
+  try {
+    tmp = 1.0 / __builtin_sqrt (a);
+    res = tmp * tmp;
+
+    if (d)
+      res2 = a * tmp;
+  }
+  catch (...)
+    { ; }
+}
+
+/* { dg-final { scan-tree-dump-times "Optimizing reciprocal sqrt multiplications" 2 "recip" } } */
+/* { dg-final { scan-tree-dump-times "Replacing squaring multiplication" 2 "recip" } } */
+/* { dg-final { scan-tree-dump-times "Replacing original division" 2 "recip" } } */
diff --git a/gcc/testsuite/gcc.dg/recip_sqrt_mult_1.c b/gcc/testsuite/gcc.dg/recip_sqrt_mult_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..188390a4ecffca1b21f05c4f19abecfb7ebd188f
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/recip_sqrt_mult_1.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast -fdump-tree-recip" } */
+
+double res, res2, tmp;
+void
+foo (double a, double b)
+{
+  tmp = 1.0 / __builtin_sqrt (a);
+  res = tmp * tmp;
+  res2 = a * tmp;
+}
+
+/* { dg-final { scan-tree-dump "Optimizing reciprocal sqrt multiplications" "recip" } } */
+/* { dg-final { scan-tree-dump "Replacing squaring multiplication" "recip" } } */
+/* { dg-final { scan-tree-dump "Replacing original division" "recip" } } */
diff --git a/gcc/testsuite/gcc.dg/recip_sqrt_mult_2.c b/gcc/testsuite/gcc.dg/recip_sqrt_mult_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..c5fc3de7b657b1769e76254b4bc874e0595e43ef
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/recip_sqrt_mult_2.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast -fdump-tree-optimized" } */
+
+float
+foo (float a)
+{
+  float tmp = 1.0f / __builtin_sqrtf (a);
+  return a * tmp;
+}
+
+/* { dg-final { scan-tree-dump-not " / " "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/recip_sqrt_mult_3.c b/gcc/testsuite/gcc.dg/recip_sqrt_mult_3.c
new file mode 100644
index 0000000000000000000000000000000000000000..e7d185ba7e22cfc7cca72296d5ccc544f24fdb14
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/recip_sqrt_mult_3.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast -fdump-tree-optimized" } */
+
+double
+foo (double a)
+{
+  double tmp = 1.0f / __builtin_sqrt (a);
+  return tmp * tmp;
+}
+
+/* { dg-final { scan-tree-dump-not "__builtin_sqrt" "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/recip_sqrt_mult_4.c b/gcc/testsuite/gcc.dg/recip_sqrt_mult_4.c
new file mode 100644
index 0000000000000000000000000000000000000000..e3005f2feb6f4bacbb6eafc0155e196cb866fcdf
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/recip_sqrt_mult_4.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast -fdump-tree-recip" } */
+
+/* The main path doesn't have any multiplications.
+   Avoid introducing them in the recip pass.  */
+
+double res, res2, tmp;
+void
+foo (double a, double b, int c, int d)
+{
+  tmp = 1.0 / __builtin_sqrt (a);
+  if (c)
+    res = tmp * tmp;
+
+  if (d)
+    res2 = a * tmp;
+}
+
+/* { dg-final { scan-tree-dump-not "Optimizing reciprocal sqrt multiplications" "recip" } } */
+/* { dg-final { scan-tree-dump-not "Replacing squaring multiplication" "recip" } } */
+/* { dg-final { scan-tree-dump-not "Replacing original division" "recip" } } */
diff --git a/gcc/testsuite/gcc.dg/recip_sqrt_mult_5.c b/gcc/testsuite/gcc.dg/recip_sqrt_mult_5.c
new file mode 100644
index 0000000000000000000000000000000000000000..e871f0fcd4feb1687f9815e4babf4d0667a15ea8
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/recip_sqrt_mult_5.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast -fdump-tree-recip" } */
+
+/* We want to do the recip_sqrt transformations here there is already
+   a multiplication on the main path.  */
+
+double res, res2, tmp;
+void
+foo (double a, double b, int c, int d)
+{
+  tmp = 1.0 / __builtin_sqrt (a);
+  res = tmp * tmp;
+
+  if (d)
+    res2 = a * tmp;
+}
+
+/* { dg-final { scan-tree-dump "Optimizing reciprocal sqrt multiplications" "recip" } } */
+/* { dg-final { scan-tree-dump "Replacing squaring multiplication" "recip" } } */
+/* { dg-final { scan-tree-dump "Replacing original division" "recip" } } */
diff --git a/gcc/tree-ssa-math-opts.c b/gcc/tree-ssa-math-opts.c
index 25378da6f4ab27dbce51e10461efc18cc416a4d7..297afcd7a8f9cfc5f1edc1a762371b2a127c66db 100644
--- a/gcc/tree-ssa-math-opts.c
+++ b/gcc/tree-ssa-math-opts.c
@@ -337,9 +337,9 @@ is_division_by (gimple *use_stmt, tree def)
 	 && gimple_assign_rhs1 (use_stmt) != def;
 }
 
-/* Return whether USE_STMT is DEF * DEF.  */
+/* Return TRUE if USE_STMT is a multiplication of DEF by A.  */
 static inline bool
-is_square_of (gimple *use_stmt, tree def)
+is_mult_by (gimple *use_stmt, tree def, tree a)
 {
   if (gimple_code (use_stmt) == GIMPLE_ASSIGN
       && gimple_assign_rhs_code (use_stmt) == MULT_EXPR)
@@ -347,11 +347,19 @@ is_square_of (gimple *use_stmt, tree def)
       tree op0 = gimple_assign_rhs1 (use_stmt);
       tree op1 = gimple_assign_rhs2 (use_stmt);
 
-      return op0 == op1 && op0 == def;
+      return (op0 == def && op1 == a)
+	      || (op0 == a && op1 == def);
     }
   return 0;
 }
 
+/* Return whether USE_STMT is DEF * DEF.  */
+static inline bool
+is_square_of (gimple *use_stmt, tree def)
+{
+  return is_mult_by (use_stmt, def, def);
+}
+
 /* Return whether USE_STMT is a floating-point division by
    DEF * DEF.  */
 static inline bool
@@ -526,6 +534,195 @@ free_bb (struct occurrence *occ)
     }
 }
 
+/* Transform sequences like
+   t = sqrt (a)
+   x = 1.0 / t;
+   r1 = x * x;
+   r2 = a * x;
+   into:
+   t = sqrt (a)
+   r1 = 1.0 / a;
+   r2 = t;
+   x = r1 * r2;
+   depending on the uses of x, r1, r2.  This removes one multiplication and
+   allows the sqrt and division operations to execute in parallel.
+   DEF_GSI is the gsi of the initial division by sqrt that defines
+   DEF (x in the example abovs).  */
+
+static void
+optimize_recip_sqrt (gimple_stmt_iterator *def_gsi, tree def)
+{
+  use_operand_p use_p;
+  imm_use_iterator use_iter;
+  gimple *stmt = gsi_stmt (*def_gsi);
+  tree x = def;
+  tree orig_sqrt_ssa_name = gimple_assign_rhs2 (stmt);
+  tree div_rhs1 = gimple_assign_rhs1 (stmt);
+
+  if (TREE_CODE (orig_sqrt_ssa_name) != SSA_NAME
+      || TREE_CODE (div_rhs1) != REAL_CST
+      || !real_equal (&TREE_REAL_CST (div_rhs1), &dconst1))
+    return;
+
+  gcall *sqrt_stmt
+    = dyn_cast <gcall *> (SSA_NAME_DEF_STMT (orig_sqrt_ssa_name));
+
+  if (!sqrt_stmt || !gimple_call_lhs (sqrt_stmt))
+    return;
+
+  switch (gimple_call_combined_fn (sqrt_stmt))
+    {
+    CASE_CFN_SQRT:
+    CASE_CFN_SQRT_FN:
+      break;
+
+    default:
+      return;
+    }
+  tree a = gimple_call_arg (sqrt_stmt, 0);
+
+  /* We have 'a' and 'x'.  Now analyze the uses of 'x'.  */
+
+  /* Statements that use x in x * x.  Since x appears twice use an SSA_NAME
+     bitmap to avoid recording the statement twice.  */
+  auto_bitmap sqr_ssa_names;
+  /* Statements that use x in a * x.  */
+  auto_vec<gimple *> mult_stmts;
+  bool has_other_use = false;
+  bool mult_on_main_path = false;
+
+  FOR_EACH_IMM_USE_FAST (use_p, use_iter, x)
+    {
+      gimple *stmt2 = USE_STMT (use_p);
+      if (is_gimple_debug (stmt2))
+	continue;
+      if (is_square_of (stmt2, x))
+	{
+	  bitmap_set_bit (sqr_ssa_names,
+			  SSA_NAME_VERSION (gimple_assign_lhs (stmt2)));
+	  if (gimple_bb (stmt2) == gimple_bb (stmt))
+	    mult_on_main_path = true;
+	}
+      else if (is_mult_by (stmt2, x, a))
+	{
+	  mult_stmts.safe_push (stmt2);
+	  if (gimple_bb (stmt2) == gimple_bb (stmt))
+	    mult_on_main_path = true;
+	}
+      else
+	has_other_use = true;
+    }
+
+  /* In the x * x and a * x cases we just rewire stmt operands or
+     remove multiplications.  In the has_other_use case we introduce
+     a multiplication so make sure we don't introduce a multiplication
+     on a path where there was none.  */
+  if (has_other_use && !mult_on_main_path)
+    return;
+
+  if (bitmap_empty_p (sqr_ssa_names) && mult_stmts.is_empty ())
+    return;
+
+  /* If x = 1.0 / sqrt (a) has uses other than those optimized here we want
+     to be able to compose it from the sqr and mult cases.  */
+  if (has_other_use && (bitmap_empty_p (sqr_ssa_names)
+			|| mult_stmts.is_empty ()))
+    return;
+
+  if (dump_file)
+    {
+      fprintf (dump_file, "Optimizing reciprocal sqrt multiplications of\n");
+      print_gimple_stmt (dump_file, sqrt_stmt, 0, TDF_NONE);
+      print_gimple_stmt (dump_file, stmt, 0, TDF_NONE);
+      fprintf (dump_file, "\n");
+    }
+
+  bool delete_div = !has_other_use;
+  tree sqr_ssa_name = NULL_TREE;
+  if (!bitmap_empty_p (sqr_ssa_names))
+    {
+      /* r1 = x * x.  Transform the original
+	 x = 1.0 / t
+	 into
+	 tmp1 = 1.0 / a
+	 r1 = tmp1.  */
+
+      sqr_ssa_name
+	= make_temp_ssa_name (TREE_TYPE (a), NULL, "recip_sqrt_sqr");
+
+      if (dump_file)
+	{
+	  fprintf (dump_file, "Replacing original division\n");
+	  print_gimple_stmt (dump_file, stmt, 0, TDF_NONE);
+	  fprintf (dump_file, "with new division\n");
+	}
+      gimple_assign_set_lhs (stmt, sqr_ssa_name);
+      gimple_assign_set_rhs2 (stmt, a);
+      fold_stmt_inplace (def_gsi);
+      update_stmt (stmt);
+
+      if (dump_file)
+	print_gimple_stmt (dump_file, stmt, 0, TDF_NONE);
+
+      delete_div = false;
+      gimple *sqr_stmt;
+      bitmap_iterator bi;
+      unsigned int i;
+      EXECUTE_IF_SET_IN_BITMAP (sqr_ssa_names, 0, i, bi)
+	{
+	  sqr_stmt = SSA_NAME_DEF_STMT (ssa_name (i));
+
+	  gimple_stmt_iterator gsi2 = gsi_for_stmt (sqr_stmt);
+	  gimple_assign_set_rhs_from_tree (&gsi2, sqr_ssa_name);
+	  update_stmt (sqr_stmt);
+      }
+    }
+  if (!mult_stmts.is_empty ())
+    {
+      /* r2 = a * x.  Transform this into:
+	 r2 = t (The original sqrt (a)).  */
+      unsigned int i;
+      gimple *mult_stmt = NULL;
+      FOR_EACH_VEC_ELT (mult_stmts, i, mult_stmt)
+	{
+	  gimple_stmt_iterator gsi2 = gsi_for_stmt (mult_stmt);
+
+	  if (dump_file)
+	    {
+	      fprintf (dump_file, "Replacing squaring multiplication\n");
+	      print_gimple_stmt (dump_file, mult_stmt, 0, TDF_NONE);
+	      fprintf (dump_file, "with assignment\n");
+	    }
+	  gimple_assign_set_rhs_from_tree (&gsi2, orig_sqrt_ssa_name);
+	  fold_stmt_inplace (&gsi2);
+	  update_stmt (mult_stmt);
+	  if (dump_file)
+	    print_gimple_stmt (dump_file, mult_stmt, 0, TDF_NONE);
+      }
+    }
+
+  if (has_other_use)
+    {
+      /* Using the two temporaries tmp1, tmp2 from above
+	 the original x is now:
+	 x = tmp1 * tmp2.  */
+      gcc_assert (orig_sqrt_ssa_name);
+      gcc_assert (sqr_ssa_name);
+
+      gimple *new_stmt
+	= gimple_build_assign (x, MULT_EXPR,
+				orig_sqrt_ssa_name, sqr_ssa_name);
+      gsi_insert_after (def_gsi, new_stmt, GSI_NEW_STMT);
+      update_stmt (stmt);
+    }
+  else if (delete_div)
+    {
+      /* Remove the original division.  */
+      gimple_stmt_iterator gsi2 = gsi_for_stmt (stmt);
+      gsi_remove (&gsi2, true);
+      release_defs (stmt);
+    }
+}
 
 /* Look for floating-point divisions among DEF's uses, and try to
    replace them by multiplications with the reciprocal.  Add
@@ -546,6 +743,7 @@ execute_cse_reciprocals_1 (gimple_stmt_iterator *def_gsi, tree def)
   int sqrt_recip_count = 0;
 
   gcc_assert (FLOAT_TYPE_P (TREE_TYPE (def)) && TREE_CODE (def) == SSA_NAME);
+
   threshold = targetm.min_divisions_for_recip_mul (TYPE_MODE (TREE_TYPE (def)));
 
   /* If DEF is a square (x * x), count the number of divisions by x.
@@ -756,7 +954,14 @@ pass_cse_reciprocals::execute (function *fun)
 	      && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL
 	      && FLOAT_TYPE_P (TREE_TYPE (def))
 	      && TREE_CODE (def) == SSA_NAME)
-	    execute_cse_reciprocals_1 (&gsi, def);
+	    {
+	      if (flag_unsafe_math_optimizations
+		  && is_gimple_assign (stmt)
+		  && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
+		optimize_recip_sqrt (&gsi, def);
+	      else
+		execute_cse_reciprocals_1 (&gsi, def);
+	    }
 	}
 
       if (optimize_bb_for_size_p (bb))

Reply via email to