Hello,

This patch introduces an optimization for narrowing binary and builtin
math operations to the smallest type when unsafe math optimizations are
enabled (typically -Ofast or -ffast-math).

Consider the example:

   float f (float x) {
     return 1.0 / sqrt (x);
   }

   f:
     fcvt       d0, s0
     fmov       d1, 1.0e+0
     fsqrt      d0, d0
     fdiv       d0, d1, d0
     fcvt       s0, d0
     ret

Given that all outputs are of float type, we can do the whole 
calculation in single precision and avoid any potentially expensive 
conversions between single and double precision.

In other words, the expression would end up looking more like:

   float f (float x) {
     return 1.0f / sqrtf (x);
   }

   f:
     fsqrt      s0, s0
     fmov       s1, 1.0e+0
     fdiv       s0, s1, s0
     ret

This optimization narrows casts around math builtins, and also stops
searching for the widest type for the calculation when processing binary
math operations (if unsafe math optimizations are enabled).

Added tests to verify that narrower math builtins are chosen and
no unnecessary casts are introduced when appropriate.

Bootstrapped and regtested on aarch64 and x86_64 with no regressions.

I don't have write access, so if OK for trunk then can someone commit on 
my behalf?

Regards,
Barney

gcc/ChangeLog:

2019-09-02  Barnaby Wilks  <barnaby.wi...@arm.com>

        * builtins.c (mathfn_built_in): Add overload exposing the implicit parameter.
        * builtins.h (mathfn_built_in): Likewise.
        * match.pd: Add expressions for simplifying builtin and binary
        math expressions.

gcc/testsuite/ChangeLog:

2019-09-02  Barnaby Wilks  <barnaby.wi...@arm.com>

        * gcc.dg/fold-single-precision.c: New test.
diff --git a/gcc/builtins.h b/gcc/builtins.h
index 1ffb491d7850366c74bd694bf9e1c277bcde1da9..5cd02af3be55b041918ad6f1a44d5520f5689fee 100644
--- a/gcc/builtins.h
+++ b/gcc/builtins.h
@@ -108,6 +108,7 @@ extern void expand_builtin_setjmp_setup (rtx, rtx);
 extern void expand_builtin_setjmp_receiver (rtx);
 extern void expand_builtin_update_setjmp_buf (rtx);
 extern tree mathfn_built_in (tree, enum built_in_function fn);
+extern tree mathfn_built_in (tree, enum built_in_function fn, bool implicit);
 extern tree mathfn_built_in (tree, combined_fn);
 extern rtx builtin_strncpy_read_str (void *, HOST_WIDE_INT, scalar_int_mode);
 extern rtx builtin_memset_read_str (void *, HOST_WIDE_INT, scalar_int_mode);
diff --git a/gcc/builtins.c b/gcc/builtins.c
index 695a9d191af4c4922351e3e59601a87b3fedda5c..6cfd7f4af54110fec9f53ddaf71408e7efc329da 100644
--- a/gcc/builtins.c
+++ b/gcc/builtins.c
@@ -2137,6 +2137,12 @@ mathfn_built_in (tree type, enum built_in_function fn)
   return mathfn_built_in_1 (type, as_combined_fn (fn), /*implicit=*/ 1);
 }
 
+tree
+mathfn_built_in (tree type, enum built_in_function fn, bool implicit)
+{
+  return mathfn_built_in_1 (type, as_combined_fn (fn), implicit);
+}
+
 /* If BUILT_IN_NORMAL function FNDECL has an associated internal function,
    return its code, otherwise return IFN_LAST.  Note that this function
    only tests whether the function is defined in internals.def, not whether
diff --git a/gcc/match.pd b/gcc/match.pd
index 0317bc704f771f626ab72189b3a54de00087ad5a..3562548de3ebcb986da20986b868d9a3d318c4ee 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -5004,10 +5004,18 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
              && newtype == type
              && types_match (newtype, type))
            (op (convert:newtype @1) (convert:newtype @2))
-           (with { if (TYPE_PRECISION (ty1) > TYPE_PRECISION (newtype))
+           (with
+             {
+               if (!flag_unsafe_math_optimizations)
+                 {
+                   if (TYPE_PRECISION (ty1) > TYPE_PRECISION (newtype))
                      newtype = ty1;
+
                    if (TYPE_PRECISION (ty2) > TYPE_PRECISION (newtype))
-                     newtype = ty2; }
+                     newtype = ty2;
+                 }
+             }
+
               /* Sometimes this transformation is safe (cannot
                  change results through affecting double rounding
                  cases) and sometimes it is not.  If NEWTYPE is
@@ -5654,3 +5662,24 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
 (simplify
  (vec_perm vec_same_elem_p@0 @0 @1)
  @0)
+
+/* Convert expressions of the form
+   (x) math_call1 ((y) z) where (x) and z are the same type, into
+   math_call2 (z), where math_call2 is the math builtin for
+   type x.  Type x (and therefore type of z) must be a lower precision
+   than y/math_call1.  */
+(if (flag_unsafe_math_optimizations && !flag_errno_math)
+  (for op (COSH EXP EXP10 EXP2 EXPM1 GAMMA J0 J1 LGAMMA
+          POW10 SINH TGAMMA Y0 Y1 ACOS ACOSH ASIN ASINH
+          ATAN ATANH CBRT COS ERF ERFC LOG LOG10 LOG2
+          LOG1P SIN TAN TANH SQRT FABS LOGB)
+    (simplify
+      (convert (op@0 (convert@1 @2)))
+       (if (SCALAR_FLOAT_TYPE_P (type) && SCALAR_FLOAT_TYPE_P (TREE_TYPE (@1))
+             && SCALAR_FLOAT_TYPE_P (TREE_TYPE (@2))
+             && types_match (type, TREE_TYPE (@2))
+             && TYPE_PRECISION (type) < TYPE_PRECISION (TREE_TYPE (@1)))
+         (with { enum built_in_function fcode = builtin_mathfn_code (@0);
+                 tree fn = mathfn_built_in (type, fcode, false); }
+           (if (fn)
+             (convert { build_call_expr (fn, 1, @2); })))))))
diff --git a/gcc/testsuite/gcc.dg/fold-single-precision.c b/gcc/testsuite/gcc.dg/fold-single-precision.c
new file mode 100644
index 0000000000000000000000000000000000000000..9209b5ce42d87cda69e84b048f0f0e3eaf0dd973
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/fold-single-precision.c
@@ -0,0 +1,57 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast -fdump-tree-optimized" } */
+
+#include <math.h>
+
+float f (float x)
+{
+  x = 1.0 / sqrt (x);
+  return x;
+}
+
+float g (float x, float y)
+{
+  double t = 1.0 / x;
+  return t * y;
+}
+
+float h (float x, float y)
+{
+  float z = pow (y, 2.0);
+  return sqrt ((x * x) + z);
+}
+
+float i (float x)
+{
+  return x * (double) sqrtf (x);
+}
+
+void j (float* x, float* y)
+{
+  double len = h (*x, *y);
+  *x = *x / len;
+  *y = *y / len;
+}
+
+float k (float x, float y)
+{
+  double t = 4.0 * x;
+  double z = t + y;
+  return z;
+}
+
+float l (float n)
+{
+  return cbrt (n);
+}
+
+float m (float n)
+{
+  float x = n * n;
+  return sqrt (x) - 1.0f;
+}
+
+/* { dg-final { scan-tree-dump "__builtin_sqrtf" "optimized" } } */
+/* { dg-final { scan-tree-dump "__builtin_cbrtf" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "\\(double\\)" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "\\(float\\)" "optimized" } } */

Reply via email to