2016-03-16 Evandro Menezes <e.mene...@samsung.com>
Wilco Dijkstra <wilco.dijks...@arm.com>
gcc/
* config/aarch64/aarch64-tuning-flags.def
(AARCH64_EXTRA_TUNE_APPROX_SQRT_{SF,DF}): New tuning macros.
* config/aarch64/aarch64-protos.h
(aarch64_emit_approx_rsqrt): Replace with
"aarch64_emit_approx_sqrt".
(AARCH64_EXTRA_TUNE_APPROX_SQRT): New macro.
* config/aarch64/aarch64.c
(exynosm1_tunings): Use the new macro.
(aarch64_emit_approx_sqrt): Define new function.
* config/aarch64/aarch64.md
(rsqrt<mode>2): Use new function instead.
(sqrt<mode>2): New expansion and insn definitions.
* config/aarch64/aarch64-simd.md: Likewise.
* config/aarch64/aarch64.opt
(mlow-precision-recip-sqrt): Expand option description.
* doc/invoke.texi (mlow-precision-recip-sqrt): Likewise.
This patch refactors the function that emits the reciprocal square root
approximation so that it can also emit the square root approximation.
Feedback is welcome.
Thank you,
--
Evandro Menezes
From 8d00622b90fa414df605011446ac058efe867cf6 Mon Sep 17 00:00:00 2001
From: Evandro Menezes <e.mene...@samsung.com>
Date: Thu, 17 Mar 2016 17:39:55 -0500
Subject: [PATCH] Emit square root using the Newton series
2016-03-17 Evandro Menezes <e.mene...@samsung.com>
Wilco Dijkstra <wilco.dijks...@arm.com>
gcc/
* config/aarch64/aarch64-tuning-flags.def
(AARCH64_EXTRA_TUNE_APPROX_SQRT_{SF,DF}): New tuning macros.
* config/aarch64/aarch64-protos.h
(aarch64_emit_approx_rsqrt): Replace with "aarch64_emit_approx_sqrt".
(AARCH64_EXTRA_TUNE_APPROX_SQRT): New macro.
* config/aarch64/aarch64.c
(exynosm1_tunings): Use the new macro.
(aarch64_emit_approx_sqrt): Define new function.
* config/aarch64/aarch64.md
(rsqrt<mode>2): Use new function instead.
(sqrt<mode>2): New expansion and insn definitions.
* config/aarch64/aarch64-simd.md: Likewise.
* config/aarch64/aarch64.opt
(mlow-precision-recip-sqrt): Expand option description.
* doc/invoke.texi (mlow-precision-recip-sqrt): Likewise.
---
gcc/config/aarch64/aarch64-protos.h | 5 +-
gcc/config/aarch64/aarch64-simd.md | 27 +++++++-
gcc/config/aarch64/aarch64-tuning-flags.def | 3 +-
gcc/config/aarch64/aarch64.c | 97 +++++++++++++++++++++++------
gcc/config/aarch64/aarch64.md | 25 +++++++-
gcc/config/aarch64/aarch64.opt | 4 +-
gcc/doc/invoke.texi | 9 +--
7 files changed, 139 insertions(+), 31 deletions(-)
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index dced209..3f3ae1c 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -263,6 +263,9 @@ enum aarch64_extra_tuning_flags
};
#undef AARCH64_EXTRA_TUNING_OPTION
+#define AARCH64_EXTRA_TUNE_APPROX_SQRT \
+ (AARCH64_EXTRA_TUNE_APPROX_SQRT_DF | AARCH64_EXTRA_TUNE_APPROX_SQRT_SF)
+
extern struct tune_params aarch64_tune_params;
HOST_WIDE_INT aarch64_initial_elimination_offset (unsigned, unsigned);
@@ -361,7 +364,7 @@ void aarch64_register_pragmas (void);
void aarch64_relayout_simd_types (void);
void aarch64_reset_previous_fndecl (void);
void aarch64_save_restore_target_globals (tree);
-void aarch64_emit_approx_rsqrt (rtx, rtx);
+void aarch64_emit_approx_sqrt (rtx, rtx, bool);
/* Initialize builtins for SIMD intrinsics. */
void init_aarch64_simd_builtins (void);
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index bd73bce..31191bb 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -405,7 +405,7 @@
UNSPEC_RSQRT))]
"TARGET_SIMD"
{
- aarch64_emit_approx_rsqrt (operands[0], operands[1]);
+ aarch64_emit_approx_sqrt (operands[0], operands[1], true);
DONE;
})
@@ -4307,7 +4307,30 @@
;; sqrt
-(define_insn "sqrt<mode>2"
+(define_expand "sqrt<mode>2"
+ [(set (match_operand:VDQF 0 "register_operand")
+ (sqrt:VDQF (match_operand:VDQF 1 "register_operand")))]
+ "TARGET_SIMD"
+{
+ machine_mode mode = GET_MODE_INNER (GET_MODE (operands[1]));
+ /* Use the Newton series only if the math flags and the tuning allow it. */
+ if (flag_finite_math_only
+ && !flag_trapping_math
+ && flag_unsafe_math_optimizations
+ && !optimize_function_for_size_p (cfun)
+ && ((mode == SFmode
+ && (aarch64_tune_params.extra_tuning_flags
+ & AARCH64_EXTRA_TUNE_APPROX_SQRT_SF))
+ || (mode == DFmode
+ && (aarch64_tune_params.extra_tuning_flags
+ & AARCH64_EXTRA_TUNE_APPROX_SQRT_DF))))
+ {
+ aarch64_emit_approx_sqrt (operands[0], operands[1], false);
+ DONE;
+ }
+})
+
+(define_insn "*sqrt<mode>2"
[(set (match_operand:VDQF 0 "register_operand" "=w")
(sqrt:VDQF (match_operand:VDQF 1 "register_operand" "w")))]
"TARGET_SIMD"
diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def b/gcc/config/aarch64/aarch64-tuning-flags.def
index 7e45a0c..725a79c 100644
--- a/gcc/config/aarch64/aarch64-tuning-flags.def
+++ b/gcc/config/aarch64/aarch64-tuning-flags.def
@@ -30,4 +30,5 @@
AARCH64_EXTRA_TUNING_OPTION ("rename_fma_regs", RENAME_FMA_REGS)
AARCH64_EXTRA_TUNING_OPTION ("approx_rsqrt", APPROX_RSQRT)
-
+AARCH64_EXTRA_TUNING_OPTION ("approx_sqrt", APPROX_SQRT_DF)
+AARCH64_EXTRA_TUNING_OPTION ("approx_sqrtf", APPROX_SQRT_SF)
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index ed0daa5..04f5633 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -38,6 +38,7 @@
#include "recog.h"
#include "diagnostic.h"
#include "insn-attr.h"
+#include "insn-flags.h"
#include "alias.h"
#include "fold-const.h"
#include "stor-layout.h"
@@ -7498,46 +7499,102 @@ get_rsqrts_type (machine_mode mode)
}
}
-/* Emit instruction sequence to compute the reciprocal square root using the
- Newton-Raphson series. Iterate over the series twice for SF
- and thrice for DF. */
+/* Emit insns that set DST to the approximate square root of SRC or, when RECP
+ is true, to the approximate reciprocal square root of SRC. */
void
-aarch64_emit_approx_rsqrt (rtx dst, rtx src)
+aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
{
machine_mode mode = GET_MODE (src);
- gcc_assert (
- mode == SFmode || mode == V2SFmode || mode == V4SFmode
- || mode == DFmode || mode == V2DFmode);
+ machine_mode mmsk;
+
+ gcc_assert (GET_MODE_INNER (mode) == SFmode
+ || GET_MODE_INNER (mode) == DFmode);
rtx xsrc = gen_reg_rtx (mode);
emit_move_insn (xsrc, src);
- rtx x0 = gen_reg_rtx (mode);
- emit_insn ((*get_rsqrte_type (mode)) (x0, xsrc));
+ rtx xcc, xne, xmsk;
+ bool scalar = !VECTOR_MODE_P (mode);
+ if (!recp)
+ {
+ if (scalar)
+ {
+ /* Compare argument with 0.0 and set the CC. */
+ xcc = aarch64_gen_compare_reg (NE, xsrc, CONST0_RTX (mode));
+ xne = gen_rtx_NE (VOIDmode, xcc, const0_rtx);
+ }
+ else
+ {
+ /* Compare the argument with 0.0 and create a vector mask. */
+ mmsk = mode_for_vector (int_mode_for_mode (GET_MODE_INNER (mode)),
+ GET_MODE_NUNITS (mode));
+ xmsk = gen_reg_rtx (mmsk);
+ switch (mode)
+ {
+ case V2SFmode:
+ emit_insn (gen_aarch64_cmeqv2sf (xmsk, xsrc, CONST0_RTX (mode)));
+ break;
- bool double_mode = (mode == DFmode || mode == V2DFmode);
+ case V4SFmode:
+ emit_insn (gen_aarch64_cmeqv4sf (xmsk, xsrc, CONST0_RTX (mode)));
+ break;
- int iterations = double_mode ? 3 : 2;
+ case V2DFmode:
+ emit_insn (gen_aarch64_cmeqv2df (xmsk, xsrc, CONST0_RTX (mode)));
+ break;
- /* Optionally iterate over the series one less time than otherwise. */
+ default:
+ gcc_unreachable ();
+ }
+ }
+ }
+
+ /* Estimate the approximate reciprocal square root. */
+ rtx xdst = gen_reg_rtx (mode);
+ emit_insn ((*get_rsqrte_type (mode)) (xdst, xsrc));
+
+ /* Iterate over the series twice for SF and thrice for DF. */
+ int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
+
+ /* Optionally iterate over the series once less for faster performance
+ while sacrificing the accuracy. */
if (flag_mrecip_low_precision_sqrt)
iterations--;
- for (int i = 0; i < iterations; ++i)
+ /* Iterate over the series. */
+ while (iterations--)
{
- rtx x1 = gen_reg_rtx (mode);
rtx x2 = gen_reg_rtx (mode);
- rtx x3 = gen_reg_rtx (mode);
- emit_set_insn (x2, gen_rtx_MULT (mode, x0, x0));
+ emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
+
+ rtx x1 = gen_reg_rtx (mode);
+ emit_insn ((*get_rsqrts_type (mode)) (x1, xsrc, x2));
- emit_insn ((*get_rsqrts_type (mode)) (x3, xsrc, x2));
+ emit_set_insn (xdst, gen_rtx_MULT (mode, x1, xdst));
+ }
+
+ if (!recp)
+ {
+ /* Zero the reciprocal square root estimate where the argument is 0.0, so
+ that the multiplication by the argument below yields zero there. */
+ if (scalar)
+ /* Conditionally set the final estimate to 0.0. */
+ emit_set_insn (xdst, gen_rtx_IF_THEN_ELSE (mode, xne, xdst, xsrc));
+ else
+ {
+ /* Clear the estimate in every vector element whose argument is 0.0. */
+ rtx xtmp = gen_reg_rtx (mmsk);
+ emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
+ gen_rtx_SUBREG (mmsk, xdst, 0)));
+ emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
+ }
- emit_set_insn (x1, gen_rtx_MULT (mode, x0, x3));
- x0 = x1;
+ /* Calculate the approximate square root. */
+ emit_set_insn (xdst, gen_rtx_MULT (mode, xsrc, xdst));
}
- emit_move_insn (dst, x0);
+ emit_move_insn (dst, xdst);
}
/* Return the number of instructions that can be issued per cycle. */
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 68676c9..71725e7 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -4665,7 +4665,30 @@
[(set_attr "type" "ffarith<s>")]
)
-(define_insn "sqrt<mode>2"
+(define_expand "sqrt<mode>2"
+ [(set (match_operand:GPF 0 "register_operand")
+ (sqrt:GPF (match_operand:GPF 1 "register_operand")))]
+ "TARGET_FLOAT"
+{
+ machine_mode mode = GET_MODE_INNER (GET_MODE (operands[1]));
+ /* Use the Newton series only if the math flags and the tuning allow it. */
+ if (flag_finite_math_only
+ && !flag_trapping_math
+ && flag_unsafe_math_optimizations
+ && !optimize_function_for_size_p (cfun)
+ && ((mode == SFmode
+ && (aarch64_tune_params.extra_tuning_flags
+ & AARCH64_EXTRA_TUNE_APPROX_SQRT_SF))
+ || (mode == DFmode
+ && (aarch64_tune_params.extra_tuning_flags
+ & AARCH64_EXTRA_TUNE_APPROX_SQRT_DF))))
+ {
+ aarch64_emit_approx_sqrt (operands[0], operands[1], false);
+ DONE;
+ }
+})
+
+(define_insn "*sqrt<mode>2"
[(set (match_operand:GPF 0 "register_operand" "=w")
(sqrt:GPF (match_operand:GPF 1 "register_operand" "w")))]
"TARGET_FLOAT"
diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
index c637ff4..c5e7fc9 100644
--- a/gcc/config/aarch64/aarch64.opt
+++ b/gcc/config/aarch64/aarch64.opt
@@ -151,5 +151,5 @@ PC relative literal loads.
mlow-precision-recip-sqrt
Common Var(flag_mrecip_low_precision_sqrt) Optimization
-When calculating the reciprocal square root approximation,
-uses one less step than otherwise, thus reducing latency and precision.
+When calculating the approximate square root or its approximate reciprocal,
+use one less step than otherwise, thus reducing latency and precision.
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 99ac11b..d48c29b 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -12903,10 +12903,11 @@ corresponding flag to the linker.
@item -mno-low-precision-recip-sqrt
@opindex -mlow-precision-recip-sqrt
@opindex -mno-low-precision-recip-sqrt
-When calculating the reciprocal square root approximation,
-uses one less step than otherwise, thus reducing latency and precision.
-This is only relevant if @option{-ffast-math} enables the reciprocal square root
-approximation, which in turn depends on the target processor.
+When calculating the approximate square root or its approximate reciprocal,
+use one less step than otherwise, thus reducing latency and precision.
+This is only relevant if @option{-ffast-math} enables
+the approximate square root or its approximate reciprocal,
+which in turn depends on the target processor.
@item -march=@var{name}
@opindex march
--
1.9.1