Emit square root using the Newton series

Evandro Menezes Sat, 19 Mar 2016 04:09:00 -0700

   2016-03-16  Evandro Menezes <e.mene...@samsung.com>
                Wilco Dijkstra  <wilco.dijks...@arm.com>


   gcc/
        * config/aarch64/aarch64-tuning-flags.def
        (AARCH64_EXTRA_TUNE_APPROX_SQRT_{SF,DF}): New tuning macros.
        * config/aarch64/aarch64-protos.h
        (aarch64_emit_approx_rsqrt): Replace with
   "aarch64_emit_approx_sqrt".
        (AARCH64_EXTRA_TUNE_APPROX_SQRT): New macro.
        * config/aarch64/aarch64.c
        (exynosm1_tunings): Use the new macro.
        (aarch64_emit_approx_sqrt): Define new function.
        * config/aarch64/aarch64.md
        (rsqrt<mode>2): Use new function instead.
        (sqrt<mode>2): New expansion and insn definitions.
        * config/aarch64/aarch64-simd.md: Likewise.
        * config/aarch64/aarch64.opt
        (mlow-precision-recip-sqrt): Expand option description.
        * doc/invoke.texi (mlow-precision-recip-sqrt): Likewise.

This patch refactors the function to emit the reciprocal square rootapproximation to also emit the square root approximation.


Feedback is welcome.

Thank you,

--
Evandro Menezes

>From 8d00622b90fa414df605011446ac058efe867cf6 Mon Sep 17 00:00:00 2001
From: Evandro Menezes <e.mene...@samsung.com>
Date: Thu, 17 Mar 2016 17:39:55 -0500
Subject: [PATCH] Emit square root using the Newton series

2016-03-17  Evandro Menezes  <e.mene...@samsung.com>
            Wilco Dijkstra  <wilco.dijks...@arm.com>

gcc/
	* config/aarch64/aarch64-tuning-flags.def
	(AARCH64_EXTRA_TUNE_APPROX_SQRT_{SF,DF}): New tuning macros.
	* config/aarch64/aarch64-protos.h
	(aarch64_emit_approx_rsqrt): Replace with "aarch64_emit_approx_sqrt".
	(AARCH64_EXTRA_TUNE_APPROX_SQRT): New macro.
	* config/aarch64/aarch64.c
	(exynosm1_tunings): Use the new macro.
	(aarch64_emit_approx_sqrt): Define new function.
	* config/aarch64/aarch64.md
	(rsqrt<mode>2): Use new function instead.
	(sqrt<mode>2): New expansion and insn definitions.
	* config/aarch64/aarch64-simd.md: Likewise.
	* config/aarch64/aarch64.opt
	(mlow-precision-recip-sqrt): Expand option description.
	* doc/invoke.texi (mlow-precision-recip-sqrt): Likewise.
---
 gcc/config/aarch64/aarch64-protos.h         |  5 +-
 gcc/config/aarch64/aarch64-simd.md          | 27 +++++++-
 gcc/config/aarch64/aarch64-tuning-flags.def |  3 +-
 gcc/config/aarch64/aarch64.c                | 97 +++++++++++++++++++++++------
 gcc/config/aarch64/aarch64.md               | 25 +++++++-
 gcc/config/aarch64/aarch64.opt              |  4 +-
 gcc/doc/invoke.texi                         |  9 +--
 7 files changed, 139 insertions(+), 31 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index dced209..3f3ae1c 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -263,6 +263,9 @@ enum aarch64_extra_tuning_flags
 };
 #undef AARCH64_EXTRA_TUNING_OPTION
 
+#define AARCH64_EXTRA_TUNE_APPROX_SQRT \
+  (AARCH64_EXTRA_TUNE_APPROX_SQRT_DF | AARCH64_EXTRA_TUNE_APPROX_SQRT_SF)
+
 extern struct tune_params aarch64_tune_params;
 
 HOST_WIDE_INT aarch64_initial_elimination_offset (unsigned, unsigned);
@@ -361,7 +364,7 @@ void aarch64_register_pragmas (void);
 void aarch64_relayout_simd_types (void);
 void aarch64_reset_previous_fndecl (void);
 void aarch64_save_restore_target_globals (tree);
-void aarch64_emit_approx_rsqrt (rtx, rtx);
+void aarch64_emit_approx_sqrt (rtx, rtx, bool);
 
 /* Initialize builtins for SIMD intrinsics.  */
 void init_aarch64_simd_builtins (void);
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index bd73bce..31191bb 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -405,7 +405,7 @@
 		     UNSPEC_RSQRT))]
   "TARGET_SIMD"
 {
-  aarch64_emit_approx_rsqrt (operands[0], operands[1]);
+  aarch64_emit_approx_sqrt (operands[0], operands[1], true);
   DONE;
 })
 
@@ -4307,7 +4307,30 @@
 
 ;; sqrt
 
-(define_insn "sqrt<mode>2"
+(define_expand "sqrt<mode>2"
+  [(set (match_operand:VDQF 0 "register_operand")
+	(sqrt:VDQF (match_operand:VDQF 1 "register_operand")))]
+  "TARGET_SIMD"
+{
+  machine_mode mode = GET_MODE_INNER (GET_MODE (operands[1]));
+
+  if (flag_finite_math_only
+      && !flag_trapping_math
+      && flag_unsafe_math_optimizations
+      && !optimize_function_for_size_p (cfun)
+      && ((mode == SFmode
+           && (aarch64_tune_params.extra_tuning_flags
+               & AARCH64_EXTRA_TUNE_APPROX_SQRT_SF))
+          || (mode == DFmode
+              && (aarch64_tune_params.extra_tuning_flags
+                  & AARCH64_EXTRA_TUNE_APPROX_SQRT_DF))))
+    {
+      aarch64_emit_approx_sqrt (operands[0], operands[1], false);
+      DONE;
+    }
+})
+
+(define_insn "*sqrt<mode>2"
   [(set (match_operand:VDQF 0 "register_operand" "=w")
         (sqrt:VDQF (match_operand:VDQF 1 "register_operand" "w")))]
   "TARGET_SIMD"
diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def b/gcc/config/aarch64/aarch64-tuning-flags.def
index 7e45a0c..725a79c 100644
--- a/gcc/config/aarch64/aarch64-tuning-flags.def
+++ b/gcc/config/aarch64/aarch64-tuning-flags.def
@@ -30,4 +30,5 @@
 
 AARCH64_EXTRA_TUNING_OPTION ("rename_fma_regs", RENAME_FMA_REGS)
 AARCH64_EXTRA_TUNING_OPTION ("approx_rsqrt", APPROX_RSQRT)
-
+AARCH64_EXTRA_TUNING_OPTION ("approx_sqrt", APPROX_SQRT_DF)
+AARCH64_EXTRA_TUNING_OPTION ("approx_sqrtf", APPROX_SQRT_SF)
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index ed0daa5..04f5633 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -38,6 +38,7 @@
 #include "recog.h"
 #include "diagnostic.h"
 #include "insn-attr.h"
+#include "insn-flags.h"
 #include "alias.h"
 #include "fold-const.h"
 #include "stor-layout.h"
@@ -7498,46 +7499,102 @@ get_rsqrts_type (machine_mode mode)
   }
 }
 
-/* Emit instruction sequence to compute the reciprocal square root using the
-   Newton-Raphson series.  Iterate over the series twice for SF
-   and thrice for DF.  */
+/* Emit instruction sequence to compute either the approximate square root
+   or its approximate reciprocal.  */
 
 void
-aarch64_emit_approx_rsqrt (rtx dst, rtx src)
+aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
 {
   machine_mode mode = GET_MODE (src);
-  gcc_assert (
-    mode == SFmode || mode == V2SFmode || mode == V4SFmode
-	|| mode == DFmode || mode == V2DFmode);
+  machine_mode mmsk;
+
+  gcc_assert (GET_MODE_INNER (mode) == SFmode
+              || GET_MODE_INNER (mode) == DFmode);
 
   rtx xsrc = gen_reg_rtx (mode);
   emit_move_insn (xsrc, src);
-  rtx x0 = gen_reg_rtx (mode);
 
-  emit_insn ((*get_rsqrte_type (mode)) (x0, xsrc));
+  rtx xcc, xne, xmsk;
+  bool scalar = !VECTOR_MODE_P (mode);
+  if (!recp)
+    {
+      if (scalar)
+	{
+	  /* Compare argument with 0.0 and set the CC.  */
+	  xcc = aarch64_gen_compare_reg (NE, xsrc, CONST0_RTX (mode));
+	  xne = gen_rtx_NE (VOIDmode, xcc, const0_rtx);
+	}
+      else
+	{
+	  /* Compare the argument with 0.0 and create a vector mask.  */
+	  mmsk = mode_for_vector (int_mode_for_mode (GET_MODE_INNER (mode)),
+				  GET_MODE_NUNITS (mode));
+	  xmsk = gen_reg_rtx (mmsk);
+	  switch (mode)
+	  {
+	    case V2SFmode:
+	      emit_insn (gen_aarch64_cmeqv2sf (xmsk, xsrc, CONST0_RTX (mode)));
+	      break;
 
-  bool double_mode = (mode == DFmode || mode == V2DFmode);
+	    case V4SFmode:
+	      emit_insn (gen_aarch64_cmeqv4sf (xmsk, xsrc, CONST0_RTX (mode)));
+	      break;
 
-  int iterations = double_mode ? 3 : 2;
+	    case V2DFmode:
+	      emit_insn (gen_aarch64_cmeqv2df (xmsk, xsrc, CONST0_RTX (mode)));
+	      break;
 
-  /* Optionally iterate over the series one less time than otherwise.  */
+	    default:
+	      gcc_unreachable ();
+	  }
+	}
+    }
+
+  /* Estimate the approximate reciprocal square root.  */
+  rtx xdst = gen_reg_rtx (mode);
+  emit_insn ((*get_rsqrte_type (mode)) (xdst, xsrc));
+
+  /* Iterate over the series twice for SF and thrice for DF.  */
+  int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
+
+  /* Optionally iterate over the series once less for faster performance
+     while sacrificing the accuracy.  */
   if (flag_mrecip_low_precision_sqrt)
     iterations--;
 
-  for (int i = 0; i < iterations; ++i)
+  /* Iterate over the series.  */
+  while (iterations--)
     {
-      rtx x1 = gen_reg_rtx (mode);
       rtx x2 = gen_reg_rtx (mode);
-      rtx x3 = gen_reg_rtx (mode);
-      emit_set_insn (x2, gen_rtx_MULT (mode, x0, x0));
+      emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
+
+      rtx x1 = gen_reg_rtx (mode);
+      emit_insn ((*get_rsqrts_type (mode)) (x1, xsrc, x2));
 
-      emit_insn ((*get_rsqrts_type (mode)) (x3, xsrc, x2));
+      emit_set_insn (xdst, gen_rtx_MULT (mode, x1, xdst));
+    }
+
+  if (!recp)
+    {
+      /* Qualify the final estimate for the approximate reciprocal square root
+	 when the argument is 0.0.  */
+      if (scalar)
+	/* Conditionally set the final estimate to 0.0.  */
+	emit_set_insn (xdst, gen_rtx_IF_THEN_ELSE (mode, xne, xdst, xsrc));
+      else
+	{
+	  /* Mask off any final vector element estimate to 0.0.  */
+	  rtx xtmp = gen_reg_rtx (mmsk);
+	  emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
+					    gen_rtx_SUBREG (mmsk, xdst, 0)));
+	  emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
+	}
 
-      emit_set_insn (x1, gen_rtx_MULT (mode, x0, x3));
-      x0 = x1;
+      /* Calculate the approximate square root.  */
+      emit_set_insn (xdst, gen_rtx_MULT (mode, xsrc, xdst));
     }
 
-  emit_move_insn (dst, x0);
+  emit_move_insn (dst, xdst);
 }
 
 /* Return the number of instructions that can be issued per cycle.  */
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 68676c9..71725e7 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -4665,7 +4665,30 @@
   [(set_attr "type" "ffarith<s>")]
 )
 
-(define_insn "sqrt<mode>2"
+(define_expand "sqrt<mode>2"
+  [(set (match_operand:GPF 0 "register_operand")
+        (sqrt:GPF (match_operand:GPF 1 "register_operand")))]
+  "TARGET_SIMD"
+{
+  machine_mode mode = GET_MODE_INNER (GET_MODE (operands[1]));
+
+  if (flag_finite_math_only
+      && !flag_trapping_math
+      && flag_unsafe_math_optimizations
+      && !optimize_function_for_size_p (cfun)
+      && ((mode == SFmode
+           && (aarch64_tune_params.extra_tuning_flags
+               & AARCH64_EXTRA_TUNE_APPROX_SQRT_SF))
+          || (mode == DFmode
+              && (aarch64_tune_params.extra_tuning_flags
+                  & AARCH64_EXTRA_TUNE_APPROX_SQRT_DF))))
+    {
+      aarch64_emit_approx_sqrt (operands[0], operands[1], false);
+      DONE;
+    }
+})
+
+(define_insn "*sqrt<mode>2"
   [(set (match_operand:GPF 0 "register_operand" "=w")
         (sqrt:GPF (match_operand:GPF 1 "register_operand" "w")))]
   "TARGET_FLOAT"
diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
index c637ff4..c5e7fc9 100644
--- a/gcc/config/aarch64/aarch64.opt
+++ b/gcc/config/aarch64/aarch64.opt
@@ -151,5 +151,5 @@ PC relative literal loads.
 
 mlow-precision-recip-sqrt
 Common Var(flag_mrecip_low_precision_sqrt) Optimization
-When calculating the reciprocal square root approximation,
-uses one less step than otherwise, thus reducing latency and precision.
+When calculating the approximate square root or its approximate reciprocal,
+use one less step than otherwise, thus reducing latency and precision.
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 99ac11b..d48c29b 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -12903,10 +12903,11 @@ corresponding flag to the linker.
 @item -mno-low-precision-recip-sqrt
 @opindex -mlow-precision-recip-sqrt
 @opindex -mno-low-precision-recip-sqrt
-When calculating the reciprocal square root approximation,
-uses one less step than otherwise, thus reducing latency and precision.
-This is only relevant if @option{-ffast-math} enables the reciprocal square root
-approximation, which in turn depends on the target processor.
+When calculating the approximate square root or its approximate reciprocal,
+use one less step than otherwise, thus reducing latency and precision.
+This is only relevant if @option{-ffast-math} enables
+the approximate square root or its approximate reciprocal,
+which in turn depends on the target processor.
 
 @item -march=@var{name}
 @opindex march
-- 
1.9.1

Emit square root using the Newton series

Reply via email to