Benedikt,

You beat me to it! :-)  Do you have the implementation for dividing using
the Newton series as well?

I'm not sure that the series is always for all data types and on all
processors.  It would be useful to allow each AArch64 processor to enable
this or not depending on the data type.  BTW, do you have some tests showing
the speed up?

Thank you,

-- 
Evandro Menezes                              Austin, TX

> -----Original Message-----
> From: gcc-patches-ow...@gcc.gnu.org [mailto:gcc-patches-ow...@gcc.gnu.org]
On
> Behalf Of Benedikt Huber
> Sent: Thursday, June 18, 2015 7:04
> To: gcc-patches@gcc.gnu.org
> Cc: benedikt.hu...@theobroma-systems.com; philipp.tomsich@theobroma-
> systems.com
> Subject: [PATCH] [aarch64] Implemented reciprocal square root (rsqrt)
> estimation in -ffast-math
> 
> arch64 offers the instructions frsqrte and frsqrts, for rsqrt estimation
and
> a Newton-Raphson step, respectively.
> There are ARMv8 implementations where this is faster than using fdiv and
> rsqrt.
> It runs three steps for double and two steps for float to achieve the
needed
> precision.
> 
> There is one caveat and open question.
> Since -ffast-math enables flush to zero intermediate values between
> approximation steps will be flushed to zero if they are denormal.
> E.g. This happens in the case of rsqrt (DBL_MAX) and rsqrtf (FLT_MAX).
> The test cases pass, but it is unclear to me whether this is expected
> behavior with -ffast-math.
> 
> The patch applies to commit:
> svn+ssh://gcc.gnu.org/svn/gcc/trunk@224470
> 
> Please consider including this patch.
> Thank you and best regards,
> Benedikt Huber
> 
> Benedikt Huber (1):
>   2015-06-15  Benedikt Huber  <benedikt.hu...@theobroma-systems.com>
> 
>  gcc/ChangeLog                            |   9 +++
>  gcc/config/aarch64/aarch64-builtins.c    |  60 ++++++++++++++++
>  gcc/config/aarch64/aarch64-protos.h      |   2 +
>  gcc/config/aarch64/aarch64-simd.md       |  27 ++++++++
>  gcc/config/aarch64/aarch64.c             |  63 +++++++++++++++++
>  gcc/config/aarch64/aarch64.md            |   3 +
>  gcc/testsuite/gcc.target/aarch64/rsqrt.c | 113
> +++++++++++++++++++++++++++++++
>  7 files changed, 277 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt.c
> 
> --
> 1.9.1
--- Begin Message ---
       * config/aarch64/aarch64-builtins.c: Builtins for rsqrt and rsqrtf.
       * config/aarch64/aarch64-protos.h: Declare.
       * config/aarch64/aarch64-simd.md: Matching expressions for frsqrte
and frsqrts.
       * config/aarch64/aarch64.c: New functions. Emit rsqrt estimation code
in fast math mode.
       * config/aarch64/aarch64.md: Added enum entry.
       * testsuite/gcc.target/aarch64/rsqrt.c: Tests for single and double.
---
 gcc/ChangeLog                            |   9 +++
 gcc/config/aarch64/aarch64-builtins.c    |  60 ++++++++++++++++
 gcc/config/aarch64/aarch64-protos.h      |   2 +
 gcc/config/aarch64/aarch64-simd.md       |  27 ++++++++
 gcc/config/aarch64/aarch64.c             |  63 +++++++++++++++++
 gcc/config/aarch64/aarch64.md            |   3 +
 gcc/testsuite/gcc.target/aarch64/rsqrt.c | 113
+++++++++++++++++++++++++++++++
 7 files changed, 277 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt.c

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index c9b156f..690ebba 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,12 @@
+2015-06-15  Benedikt Huber  <benedikt.hu...@theobroma-systems.com>
+
+       * config/aarch64/aarch64-builtins.c: Builtins for rsqrt and rsqrtf.
+       * config/aarch64/aarch64-protos.h: Declare.
+       * config/aarch64/aarch64-simd.md: Matching expressions for frsqrte
and frsqrts.
+       * config/aarch64/aarch64.c: New functions. Emit rsqrt estimation
code in fast math mode.
+       * config/aarch64/aarch64.md: Added enum entry.
+       * testsuite/gcc.target/aarch64/rsqrt.c: Tests for single and double.
+
 2015-06-14  Richard Sandiford  <richard.sandif...@arm.com>
 
        * rtl.h (classify_insn): Declare.
diff --git a/gcc/config/aarch64/aarch64-builtins.c
b/gcc/config/aarch64/aarch64-builtins.c
index f7a39ec..484bb84 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -342,6 +342,8 @@ enum aarch64_builtins
   AARCH64_BUILTIN_GET_FPSR,
   AARCH64_BUILTIN_SET_FPSR,
 
+  AARCH64_BUILTIN_RSQRT,
+  AARCH64_BUILTIN_RSQRTF,
   AARCH64_SIMD_BUILTIN_BASE,
   AARCH64_SIMD_BUILTIN_LANE_CHECK,
 #include "aarch64-simd-builtins.def"
@@ -831,6 +833,32 @@ aarch64_init_crc32_builtins ()
 }
 
 void
+aarch64_add_builtin_rsqrt (void)
+{
+  tree fndecl = NULL;
+  tree ftype = NULL;
+  ftype = build_function_type_list (double_type_node, double_type_node,
NULL_TREE);
+
+  fndecl = add_builtin_function ("__builtin_aarch64_rsqrt",
+                                 ftype,
+                                 AARCH64_BUILTIN_RSQRT,
+                                 BUILT_IN_MD,
+                                 NULL,
+                                 NULL_TREE);
+  aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT] = fndecl;
+
+  tree ftypef = NULL;
+  ftypef = build_function_type_list (float_type_node, float_type_node,
NULL_TREE);
+  fndecl = add_builtin_function ("__builtin_aarch64_rsqrtf",
+                                 ftypef,
+                                 AARCH64_BUILTIN_RSQRTF,
+                                 BUILT_IN_MD,
+                                 NULL,
+                                 NULL_TREE);
+  aarch64_builtin_decls[AARCH64_BUILTIN_RSQRTF] = fndecl;
+}
+
+void
 aarch64_init_builtins (void)
 {
   tree ftype_set_fpr
@@ -855,6 +883,7 @@ aarch64_init_builtins (void)
     aarch64_init_simd_builtins ();
   if (TARGET_CRC32)
     aarch64_init_crc32_builtins ();
+  aarch64_add_builtin_rsqrt ();
 }
 
 tree
@@ -1099,6 +1128,23 @@ aarch64_crc32_expand_builtin (int fcode, tree exp,
rtx target)
   return target;
 }
 
+static rtx
+aarch64_expand_builtin_rsqrt (int fcode, tree exp, rtx target)
+{
+  rtx pat;
+  tree arg0 = CALL_EXPR_ARG (exp, 0);
+  rtx op0 = expand_normal (arg0);
+
+  enum insn_code c = CODE_FOR_rsqrtdf;
+  if (fcode == AARCH64_BUILTIN_RSQRTF)
+    c = CODE_FOR_rsqrtsf;
+
+  pat = GEN_FCN (c) (target, op0);
+  emit_insn (pat);
+
+  return target;
+}
+
 /* Expand an expression EXP that calls a built-in function,
    with result going to TARGET if that's convenient.  */
 rtx
@@ -1146,6 +1192,11 @@ aarch64_expand_builtin (tree exp,
   else if (fcode >= AARCH64_CRC32_BUILTIN_BASE && fcode <=
AARCH64_CRC32_BUILTIN_MAX)
     return aarch64_crc32_expand_builtin (fcode, exp, target);
 
+  if (fcode == AARCH64_BUILTIN_RSQRT ||
+      fcode == AARCH64_BUILTIN_RSQRTF)
+    return aarch64_expand_builtin_rsqrt (fcode, exp, target);
+
+  return NULL_RTX;
   gcc_unreachable ();
 }
 
@@ -1303,6 +1354,15 @@ aarch64_builtin_vectorized_function (tree fndecl,
tree type_out, tree type_in)
   return NULL_TREE;
 }
 
+tree
+aarch64_builtin_rsqrt (bool is_float)
+{
+  if (is_float)
+    return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRTF];
+  else
+    return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT];
+}
+
 #undef VAR1
 #define VAR1(T, N, MAP, A) \
   case AARCH64_SIMD_BUILTIN_##T##_##N##A:
diff --git a/gcc/config/aarch64/aarch64-protos.h
b/gcc/config/aarch64/aarch64-protos.h
index 965a11b..4f1c8ce 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -270,6 +270,8 @@ void aarch64_print_operand (FILE *, rtx, char);
 void aarch64_print_operand_address (FILE *, rtx);
 void aarch64_emit_call_insn (rtx);
 
+void aarch64_emit_swrsqrt (rtx, rtx);
+
 /* Initialize builtins for SIMD intrinsics.  */
 void init_aarch64_simd_builtins (void);
 
diff --git a/gcc/config/aarch64/aarch64-simd.md
b/gcc/config/aarch64/aarch64-simd.md
index b90f938..266800a 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -353,6 +353,33 @@
   [(set_attr "type" "neon_fp_mul_d_scalar_q")]
 )
 
+(define_insn "*rsqrte_simd"
+  [(set (match_operand:VALLF 0 "register_operand" "=w")
+       (unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")]
+                    UNSPEC_RSQRTE))]
+  "TARGET_SIMD"
+  "frsqrte\\t%<v>0<Vmtype>, %<v>1<Vmtype>"
+  [(set_attr "type" "neon_fp_rsqrte_<Vetype><q>")])
+
+(define_insn "*rsqrts_simd"
+  [(set (match_operand:VALLF 0 "register_operand" "=w")
+       (unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")
+               (match_operand:VALLF 2 "register_operand" "w")]
+                    UNSPEC_RSQRTS))]
+  "TARGET_SIMD"
+  "frsqrts\\t%<v>0<Vmtype>, %<v>1<Vmtype>, %<v>2<Vmtype>"
+  [(set_attr "type" "neon_fp_rsqrts_<Vetype><q>")])
+
+(define_expand "rsqrt<mode>"
+  [(set (match_operand:GPF 0 "register_operand" "=w")
+       (unspec:GPF [(match_operand:GPF 1 "register_operand" "w")]
+                    UNSPEC_RSQRT))]
+  "TARGET_FLOAT"
+{
+  aarch64_emit_swrsqrt (operands[0], operands[1]);
+  DONE;
+})
+
 (define_insn "*aarch64_mul3_elt_to_64v2df"
   [(set (match_operand:DF 0 "register_operand" "=w")
      (mult:DF
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index c3c2795..281f0b2 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -6816,6 +6816,66 @@ aarch64_memory_move_cost (machine_mode mode
ATTRIBUTE_UNUSED,
   return aarch64_tune_params->memmov_cost;
 }
 
+extern tree aarch64_builtin_rsqrt (bool is_float);
+
+static tree
+aarch64_builtin_reciprocal (unsigned int fn,
+                            bool md_fn ATTRIBUTE_UNUSED,
+                            bool sqrt ATTRIBUTE_UNUSED)
+{
+  if (!fast_math_flags_set_p (&global_options))
+    return NULL_TREE;
+
+  if (fn == BUILT_IN_SQRTF)
+    return aarch64_builtin_rsqrt (true);
+  else if (fn == BUILT_IN_SQRT)
+    return aarch64_builtin_rsqrt (false);
+  else
+    return NULL_TREE;
+}
+
+void
+aarch64_emit_swrsqrt (rtx dst, rtx src)
+{
+  enum machine_mode mode = GET_MODE (src);
+  rtx xsrc = gen_reg_rtx (mode);
+  emit_set_insn (xsrc, src);
+
+  rtx x0 = gen_reg_rtx (mode);
+  emit_insn (gen_rtx_SET (x0,
+                          gen_rtx_UNSPEC (mode, gen_rtvec (1, xsrc),
+                                          UNSPEC_RSQRTE)));
+
+  bool double_mode = (DFmode   == mode ||
+                      V1DFmode == mode ||
+                      V2DFmode == mode ||
+                      V4DFmode == mode ||
+                      V6DFmode == mode ||
+                      V8DFmode == mode);
+
+  int iterations = 2;
+  if (double_mode)
+    iterations = 3;
+
+  for (int i = 0; i < iterations; ++i)
+    {
+      rtx x1 = gen_reg_rtx (mode);
+      rtx x2 = gen_reg_rtx (mode);
+      rtx x3 = gen_reg_rtx (mode);
+      emit_insn (gen_rtx_SET (x2,
+                              gen_rtx_MULT (mode, x0, x0)));
+      emit_insn (gen_rtx_SET (x3,
+                              gen_rtx_UNSPEC (mode, gen_rtvec (2, xsrc,
x2),
+                                              UNSPEC_RSQRTS)));
+      emit_insn (gen_rtx_SET (x1,
+                              gen_rtx_MULT (mode, x0, x3)));
+      x0 = x1;
+    }
+
+  emit_move_insn (dst, x0);
+  return;
+}
+
 /* Return the number of instructions that can be issued per cycle.  */
 static int
 aarch64_sched_issue_rate (void)
@@ -11747,6 +11807,9 @@ aarch64_gen_adjusted_ldpstp (rtx *operands, bool
load,
 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
 
+#undef TARGET_BUILTIN_RECIPROCAL
+#define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
+
 #undef TARGET_VECTOR_MODE_SUPPORTED_P
 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
 
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 11123d6..7272d05 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -120,6 +120,9 @@
     UNSPEC_VSTRUCTDUMMY
     UNSPEC_SP_SET
     UNSPEC_SP_TEST
+    UNSPEC_RSQRT
+    UNSPEC_RSQRTE
+    UNSPEC_RSQRTS
 ])
 
 (define_c_enum "unspecv" [
diff --git a/gcc/testsuite/gcc.target/aarch64/rsqrt.c
b/gcc/testsuite/gcc.target/aarch64/rsqrt.c
new file mode 100644
index 0000000..6607483
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/rsqrt.c
@@ -0,0 +1,113 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -fno-inline --save-temps -ffast-math" } */
+
+#include <math.h>
+#include <stdio.h>
+
+#include <values.h>
+
+#define PI    3.141592653589793
+#define SQRT2 1.4142135623730951
+
+#define PI_4 0.7853981633974483
+#define SQRT1_2 0.7071067811865475
+
+/* 2^25+1, float has 24 significand bits
+ *       according to Single-precision floating-point format.  */
+#define TESTA8_FLT 33554433
+/* 2^54+1, double has 53 significand bits
+ *       according to Double-precision floating-point format.  */
+#define TESTA8_DBL 18014398509481985
+
+#define SD(a, b) t_double ((#a), (a), (b));
+#define SF(a, b) t_float ((#a), (a), (b));
+
+#define EPSILON_double __DBL_EPSILON__
+#define EPSILON_float __FLT_EPSILON__
+#define ABS_double fabs
+#define ABS_float fabsf
+#define SQRT_double sqrt
+#define SQRT_float sqrtf
+
+extern void abort (void);
+
+#define TESTTYPE(TYPE)
\
+TYPE rsqrt_##TYPE (TYPE a)
\
+{
\
+  return 1.0/SQRT_##TYPE(a);
\
+}
\
+
\
+int equals_##TYPE (TYPE a, TYPE b)
\
+{
\
+  return (a == b ||
\
+          (isnan (a) && isnan (b)) ||
\
+          (ABS_##TYPE (a - b) < EPSILON_##TYPE));
\
+}
\
+
\
+void t_##TYPE (const char *s, TYPE a, TYPE result)
\
+{
\
+  TYPE r = rsqrt_##TYPE (a);
\
+  if (!equals_##TYPE (r, result))
\
+  {
\
+    abort ();
\
+  }
\
+}
\
+
+//  printf ("Problem in %20s: %30.18A should be %30.18A\n", s, r, result);
\
+
+TESTTYPE(double)
+TESTTYPE(float)
+
+/* { dg-final { scan-assembler-times "frsqrte\\td\[0-9\]+, d\[0-9\]+" 1 } }
*/
+/* { dg-final { scan-assembler-times "frsqrts\\td\[0-9\]+, d\[0-9\]+,
d\[0-9\]+" 3 } } */
+
+/* { dg-final { scan-assembler-times "frsqrte\\ts\[0-9\]+, s\[0-9\]+" 1 } }
*/
+/* { dg-final { scan-assembler-times "frsqrts\\ts\[0-9\]+, s\[0-9\]+,
s\[0-9\]+" 2 } } */
+
+int main ()
+{
+  SD(    1.0/256, 0X1.00000000000000P+4  );
+  SD(        1.0, 0X1.00000000000000P+0  );
+  SD(       -1.0,                     NAN);
+  SD(       11.0, 0X1.34BF63D1568260P-2  );
+  SD(        0.0,                INFINITY);
+  SD(   INFINITY, 0X0.00000000000000P+0  );
+  SD(        NAN,                     NAN);
+  SD(       -NAN,                    -NAN);
+  SD(    DBL_MAX, 0X1.00000000000010P-512);
+  SD(    DBL_MIN, 0X1.00000000000000P+511);
+  SD(         PI, 0X1.20DD750429B6D0P-1  );
+  SD(       PI_4, 0X1.20DD750429B6D0P+0  );
+  SD(      SQRT2, 0X1.AE89F995AD3AE0P-1  );
+  SD(    SQRT1_2, 0X1.306FE0A31B7150P+0  );
+  SD(        -PI,                     NAN);
+  SD(     -SQRT2,                     NAN);
+  SD( TESTA8_DBL, 0X1.00000000000000P-27 );
+
+  SF(    1.0/256, 0X1.00000000000000P+4  );
+  SF(        1.0, 0X1.00000000000000P+0  );
+  SF(       -1.0,                     NAN);
+  SF(       11.0, 0X1.34BF6400000000P-2  );
+  SF(        0.0,                INFINITY);
+  SF(   INFINITY, 0X0.00000000000000P+0  );
+  SF(        NAN,                     NAN);
+  SF(       -NAN,                    -NAN);
+  SF(    FLT_MAX, 0X1.00000200000000P-64 );
+  SF(    FLT_MIN, 0X1.00000000000000P+63 );
+  SF(         PI, 0X1.20DD7400000000P-1  );
+  SF(       PI_4, 0X1.20DD7400000000P+0  );
+  SF(      SQRT2, 0X1.AE89FA00000000P-1  );
+  SF(    SQRT1_2, 0X1.306FE000000000P+0  );
+  SF(        -PI,                     NAN);
+  SF(     -SQRT2,                     NAN);
+  SF( TESTA8_FLT, 0X1.6A09E600000000P-13 );
+
+//   With -ffast-math these return positive INF.
+//   SD(       -0.0,               -INFINITY);
+//   SF(       -0.0,               -INFINITY);
+//   The reason here is that -ffast-math flushes to zero.
+//   SD(DBL_MIN/256, 0X1.00000000000000P+515);
+//   SF(FLT_MIN/256, 0X1.00000000000000P+67 );
+
+  return 0;
+}
-- 
1.9.1

--- End Message ---

Reply via email to