On 05/25/16 10:52, James Greenhalgh wrote:
On Wed, Apr 27, 2016 at 04:15:45PM -0500, Evandro Menezes wrote:
gcc/
* config/aarch64/aarch64-protos.h
(aarch64_emit_approx_rsqrt): Replace with new function
"aarch64_emit_approx_sqrt".
(tune_params): New member "approx_sqrt_modes".
* config/aarch64/aarch64.c
(generic_tunings): New member "approx_rsqrt_modes".
(cortexa35_tunings): Likewise.
(cortexa53_tunings): Likewise.
(cortexa57_tunings): Likewise.
(cortexa72_tunings): Likewise.
(exynosm1_tunings): Likewise.
(thunderx_tunings): Likewise.
(xgene1_tunings): Likewise.
(aarch64_emit_approx_rsqrt): Replace with new function
"aarch64_emit_approx_sqrt".
(aarch64_override_options_after_change_1): Handle new option.
* config/aarch64/aarch64-simd.md
(rsqrt<mode>2): Use new function instead.
(sqrt<mode>2): New expansion and insn definitions.
* config/aarch64/aarch64.md: Likewise.
* config/aarch64/aarch64.opt
(mlow-precision-sqrt): Add new option description.
* doc/invoke.texi (mlow-precision-sqrt): Likewise.
diff --git a/gcc/config/aarch64/aarch64-protos.h
b/gcc/config/aarch64/aarch64-protos.h
index 50f1d24..437f6af 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -244,6 +244,7 @@ struct tune_params
} autoprefetcher_model;
unsigned int extra_tuning_flags;
+ unsigned int approx_sqrt_modes;
unsigned int approx_rsqrt_modes;
};
This should go in struct recommended in 1/3 too.
OK
@@ -396,7 +397,7 @@ void aarch64_register_pragmas (void);
void aarch64_relayout_simd_types (void);
void aarch64_reset_previous_fndecl (void);
void aarch64_save_restore_target_globals (tree);
-void aarch64_emit_approx_rsqrt (rtx, rtx);
+bool aarch64_emit_approx_sqrt (rtx, rtx, bool);
/* Initialize builtins for SIMD intrinsics. */
void init_aarch64_simd_builtins (void);
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 68381bf..589871b 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -38,6 +38,7 @@
#include "recog.h"
#include "diagnostic.h"
#include "insn-attr.h"
+#include "insn-flags.h"
Can you remember what you needed this include for? I couldn't spot it below
and removing the include didn't seem to cause any trouble to the build.
If it isn't needed, drop it.
I don't recall. Dropped.
#include "insn-modes.h"
#include "alias.h"
#include "fold-const.h"
@@ -7521,46 +7530,78 @@ get_rsqrts_type (machine_mode mode)
}
}
-/* Emit instruction sequence to compute the reciprocal square root using the
- Newton-Raphson series. Iterate over the series twice for SF
- and thrice for DF. */
+/* Emit instruction sequence to compute either the approximate square root
+ or its approximate reciprocal. */
As you are updating this function a new parameter and a return value, please
comment it appropriately. Describing the purpose of the parameter and the
meaning of the return value.
OK
-void
-aarch64_emit_approx_rsqrt (rtx dst, rtx src)
+bool
+aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
{
- machine_mode mode = GET_MODE (src);
- gcc_assert (
- mode == SFmode || mode == V2SFmode || mode == V4SFmode
- || mode == DFmode || mode == V2DFmode);
-
- rtx xsrc = gen_reg_rtx (mode);
- emit_move_insn (xsrc, src);
- rtx x0 = gen_reg_rtx (mode);
+ machine_mode mode = GET_MODE (dst);
+ machine_mode mmsk = mode_for_vector (int_mode_for_mode (GET_MODE_INNER (mode)),
+ GET_MODE_NUNITS (mode));
+
+ if (!flag_finite_math_only
+ || flag_trapping_math
+ || !flag_unsafe_math_optimizations
+ || optimize_function_for_size_p (cfun)
+ || !((recp && (flag_mrecip_low_precision_sqrt
+ || (aarch64_tune_params.approx_rsqrt_modes
+ & AARCH64_APPROX_MODE (mode))))
+ || (!recp && (flag_mlow_precision_sqrt
+ || (aarch64_tune_params.approx_sqrt_modes
+ & AARCH64_APPROX_MODE (mode))))))
+ return false;
Can you pull out these sub-clauses and assign them somewhere for clarity.
These expressions are a bit too big to grok at a distance. I'd rather be
reading:
bool use_low_precision_rsqrt_p
= recp
&& (flag_mrecip_low_precision_sqrt
|| (aarch64_tune_params.approx_rsqrt_modes
& AARCH64_APPROX_MODE (mode)))
<...>
!((use_low_precision_sqrt_p)
|| (use_low_precision_rsqrt_p))
OK
- emit_insn ((*get_rsqrte_type (mode)) (x0, xsrc));
+ rtx xmsk = gen_reg_rtx (mmsk);
+ if (!recp)
+ /* When calculating the approximate square root, compare the argument with
+ 0.0 and create a mask. */
+ emit_insn (gen_rtx_SET (xmsk, gen_rtx_NEG (mmsk, gen_rtx_EQ (mmsk, src,
+ CONST0_RTX (mode)))));
- bool double_mode = (mode == DFmode || mode == V2DFmode);
+ /* Estimate the approximate reciprocal square root. */
+ rtx xdst = gen_reg_rtx (mode);
+ emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
- int iterations = double_mode ? 3 : 2;
+ /* Iterate over the series twice for SF and thrice for DF. */
+ int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
- /* Optionally iterate over the series one less time than otherwise. */
- if (flag_mrecip_low_precision_sqrt)
+ /* Optionally iterate over the series once less for faster performance
+ while sacrificing the accuracy. */
+ if ((recp && flag_mrecip_low_precision_sqrt)
+ || (!recp && flag_mlow_precision_sqrt))
iterations--;
- for (int i = 0; i < iterations; ++i)
+ /* Iterate over the series to calculate the approximate reciprocal square
root. */
+ rtx x1 = gen_reg_rtx (mode);
Why do you generate a register for x1 outside the loop but x2 inside it?
Is it just because you want x1 to survive this scope? It seems you could
either reuse a single x2 register too, or repeatedly generate a new x1
inside the loop too.
Yes, x1 is needed from here through the last step before the function
returns.
+ while (iterations--)
{
- rtx x1 = gen_reg_rtx (mode);
rtx x2 = gen_reg_rtx (mode);
- rtx x3 = gen_reg_rtx (mode);
- emit_set_insn (x2, gen_rtx_MULT (mode, x0, x0));
+ emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
+
+ emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
- emit_insn ((*get_rsqrts_type (mode)) (x3, xsrc, x2));
+ if (iterations > 0)
+ emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
+ }
+
+ if (!recp)
+ {
+ /* Qualify the approximate reciprocal square root when the argument is
+ 0.0 by squashing the intermediary result to 0.0. */
+ rtx xtmp = gen_reg_rtx (mmsk);
+ emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
+ gen_rtx_SUBREG (mmsk, xdst, 0)));
+ emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
- emit_set_insn (x1, gen_rtx_MULT (mode, x0, x3));
- x0 = x1;
+ /* Calculate the approximate square root. */
+ emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
}
- emit_move_insn (dst, x0);
+ /* Return the approximation. */
+ emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
+
+ return true;
}
/* Return the number of instructions that can be issued per cycle. */
@@ -8090,6 +8131,12 @@ aarch64_override_options_after_change_1 (struct
gcc_options *opts)
&& (aarch64_cmodel == AARCH64_CMODEL_TINY
|| aarch64_cmodel == AARCH64_CMODEL_TINY_PIC))
aarch64_nopcrelative_literal_loads = false;
+
+ /* When enabling the lower precision Newton series for the square root, also
+ enable it for the reciprocal square root, since the latter is an
+ intermediary step for the former. */
+ if (flag_mlow_precision_sqrt)
+ flag_mrecip_low_precision_sqrt = true;
}
/* 'Unpack' up the internal tuning structs and update the options
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 9b282f1..aab3e00 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -4683,7 +4683,16 @@
[(set_attr "type" "ffarith<s>")]
)
-(define_insn "sqrt<mode>2"
+(define_expand "sqrt<mode>2"
+ [(set (match_operand:GPF 0 "register_operand")
+ (sqrt:GPF (match_operand:GPF 1 "register_operand")))]
+ "TARGET_SIMD"
+{
+ if (aarch64_emit_approx_sqrt (operands[0], operands[1], false))
+ DONE;
+})
+
+(define_insn "*sqrt<mode>2"
[(set (match_operand:GPF 0 "register_operand" "=w")
(sqrt:GPF (match_operand:GPF 1 "register_operand" "w")))]
"TARGET_FLOAT"
diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
index c637ff4..ffd5540 100644
--- a/gcc/config/aarch64/aarch64.opt
+++ b/gcc/config/aarch64/aarch64.opt
@@ -151,5 +151,10 @@ PC relative literal loads.
mlow-precision-recip-sqrt
Common Var(flag_mrecip_low_precision_sqrt) Optimization
-When calculating the reciprocal square root approximation,
-uses one less step than otherwise, thus reducing latency and precision.
+When calculating the approximate reciprocal square root,
+use one less step than otherwise, thus reducing latency and precision.
+
+mlow-precision-sqrt
+Common Var(flag_mlow_precision_sqrt) Optimization
+When calculating the approximate square root,
+use one less step than otherwise, thus reducing latency and precision.
Do we really want separate options for each of these optimizations? Could
they all be wrapped under one -mlow-precision-fp or similar? I can see
benefits to either approach, but let me know your thoughts.
Since the results of these approximations are not accurate, I prefer to
let the user decide which of them may be relaxed. Again, I lean towards
the way done for RS6000.
Thank you,
--
Evandro Menezes
From 42e1a3d1289ba3951d3b9a7fd523422b6c0e56c8 Mon Sep 17 00:00:00 2001
From: Evandro Menezes <e.mene...@samsung.com>
Date: Mon, 4 Apr 2016 11:23:29 -0500
Subject: [PATCH 2/3] [AArch64] Emit square root using the Newton series
2016-04-04 Evandro Menezes <e.mene...@samsung.com>
Wilco Dijkstra <wilco.dijks...@arm.com>
gcc/
* config/aarch64/aarch64-protos.h
(aarch64_emit_approx_rsqrt): Replace with new function
"aarch64_emit_approx_sqrt".
(cpu_approx_modes): New member "sqrt".
* config/aarch64/aarch64.c
(generic_approx_modes): New member "sqrt".
(exynosm1_approx_modes): Likewise.
(xgene1_approx_modes): Likewise.
(aarch64_emit_approx_rsqrt): Replace with new function
"aarch64_emit_approx_sqrt".
(aarch64_override_options_after_change_1): Handle new option.
* config/aarch64/aarch64-simd.md
(rsqrt<mode>2): Use new function instead.
(sqrt<mode>2): New expansion and insn definitions.
* config/aarch64/aarch64.md: Likewise.
* config/aarch64/aarch64.opt
(mlow-precision-sqrt): Add new option description.
* doc/invoke.texi (mlow-precision-sqrt): Likewise.
---
gcc/config/aarch64/aarch64-protos.h | 5 +-
gcc/config/aarch64/aarch64-simd.md | 13 ++++-
gcc/config/aarch64/aarch64.c | 108 +++++++++++++++++++++++++-----------
gcc/config/aarch64/aarch64.md | 11 +++-
gcc/config/aarch64/aarch64.opt | 9 ++-
gcc/doc/invoke.texi | 10 ++++
6 files changed, 117 insertions(+), 39 deletions(-)
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 6156281..2f407fd 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -192,7 +192,8 @@ struct cpu_branch_cost
/* Allowed modes for approximations. */
struct cpu_approx_modes
{
- const unsigned int recip_sqrt; /* Reciprocal square root. */
+ const unsigned int sqrt; /* Square root. */
+ const unsigned int recip_sqrt; /* Reciprocal square root. */
};
struct tune_params
@@ -388,7 +389,7 @@ void aarch64_register_pragmas (void);
void aarch64_relayout_simd_types (void);
void aarch64_reset_previous_fndecl (void);
void aarch64_save_restore_target_globals (tree);
-void aarch64_emit_approx_rsqrt (rtx, rtx);
+bool aarch64_emit_approx_sqrt (rtx, rtx, bool);
/* Initialize builtins for SIMD intrinsics. */
void init_aarch64_simd_builtins (void);
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index bd73bce..47ccb18 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -405,7 +405,7 @@
UNSPEC_RSQRT))]
"TARGET_SIMD"
{
- aarch64_emit_approx_rsqrt (operands[0], operands[1]);
+ aarch64_emit_approx_sqrt (operands[0], operands[1], true);
DONE;
})
@@ -4307,7 +4307,16 @@
;; sqrt
-(define_insn "sqrt<mode>2"
+(define_expand "sqrt<mode>2"
+ [(set (match_operand:VDQF 0 "register_operand")
+ (sqrt:VDQF (match_operand:VDQF 1 "register_operand")))]
+ "TARGET_SIMD"
+{
+ if (aarch64_emit_approx_sqrt (operands[0], operands[1], false))
+ DONE;
+})
+
+(define_insn "*sqrt<mode>2"
[(set (match_operand:VDQF 0 "register_operand" "=w")
(sqrt:VDQF (match_operand:VDQF 1 "register_operand" "w")))]
"TARGET_SIMD"
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index e532cfc..d55ca34 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -397,18 +397,21 @@ static const struct cpu_branch_cost cortexa57_branch_cost =
/* Generic approximation modes. */
static const cpu_approx_modes generic_approx_modes =
{
+ AARCH64_APPROX_NONE, /* sqrt */
AARCH64_APPROX_NONE /* recip_sqrt */
};
/* Approximation modes for Exynos M1. */
static const cpu_approx_modes exynosm1_approx_modes =
{
+ AARCH64_APPROX_ALL, /* sqrt */
AARCH64_APPROX_ALL /* recip_sqrt */
};
/* Approximation modes for Xgene1. */
static const cpu_approx_modes xgene1_approx_modes =
{
+ AARCH64_APPROX_NONE, /* sqrt */
AARCH64_APPROX_ALL /* recip_sqrt */
};
@@ -7503,8 +7506,8 @@ aarch64_builtin_reciprocal (tree fndecl)
typedef rtx (*rsqrte_type) (rtx, rtx);
-/* Select reciprocal square root initial estimate
- insn depending on machine mode. */
+/* Select reciprocal square root initial estimate insn depending on machine
+ mode. */
rsqrte_type
get_rsqrte_type (machine_mode mode)
@@ -7516,14 +7519,13 @@ get_rsqrte_type (machine_mode mode)
case V2DFmode: return gen_aarch64_rsqrte_v2df2;
case V2SFmode: return gen_aarch64_rsqrte_v2sf2;
case V4SFmode: return gen_aarch64_rsqrte_v4sf2;
- default: gcc_unreachable ();
+ default: gcc_unreachable ();
}
}
typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
-/* Select reciprocal square root Newton-Raphson step
- insn depending on machine mode. */
+/* Select reciprocal square root series step insn depending on machine mode. */
rsqrts_type
get_rsqrts_type (machine_mode mode)
@@ -7535,50 +7537,86 @@ get_rsqrts_type (machine_mode mode)
case V2DFmode: return gen_aarch64_rsqrts_v2df3;
case V2SFmode: return gen_aarch64_rsqrts_v2sf3;
case V4SFmode: return gen_aarch64_rsqrts_v4sf3;
- default: gcc_unreachable ();
+ default: gcc_unreachable ();
}
}
-/* Emit instruction sequence to compute the reciprocal square root using the
- Newton-Raphson series. Iterate over the series twice for SF
- and thrice for DF. */
+/* Emit instruction sequence to compute either the approximate square root
+ or its approximate reciprocal, depending on the flag RECP, and return
+ whether the sequence was emitted or not. */
-void
-aarch64_emit_approx_rsqrt (rtx dst, rtx src)
+bool
+aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
{
- machine_mode mode = GET_MODE (src);
- gcc_assert (
- mode == SFmode || mode == V2SFmode || mode == V4SFmode
- || mode == DFmode || mode == V2DFmode);
-
- rtx xsrc = gen_reg_rtx (mode);
- emit_move_insn (xsrc, src);
- rtx x0 = gen_reg_rtx (mode);
+ machine_mode mode = GET_MODE (dst);
+ machine_mode mmsk = mode_for_vector (int_mode_for_mode (GET_MODE_INNER (mode)),
+ GET_MODE_NUNITS (mode));
+ bool use_approx_sqrt_p = (!recp
+ && (flag_mlow_precision_sqrt
+ || (aarch64_tune_params.approx_modes->sqrt
+ & AARCH64_APPROX_MODE (mode))));
+ bool use_approx_rsqrt_p = (recp
+ && (flag_mrecip_low_precision_sqrt
+ || (aarch64_tune_params.approx_modes->recip_sqrt
+ & AARCH64_APPROX_MODE (mode))));
+
+ if (!flag_finite_math_only
+ || flag_trapping_math
+ || !flag_unsafe_math_optimizations
+ || optimize_function_for_size_p (cfun)
+ || !(use_approx_sqrt_p || use_approx_rsqrt_p))
+ return false;
- emit_insn ((*get_rsqrte_type (mode)) (x0, xsrc));
+ rtx xmsk = gen_reg_rtx (mmsk);
+ if (!recp)
+ /* When calculating the approximate square root, compare the argument with
+ 0.0 and create a mask. */
+ emit_insn (gen_rtx_SET (xmsk, gen_rtx_NEG (mmsk, gen_rtx_EQ (mmsk, src,
+ CONST0_RTX (mode)))));
- bool double_mode = (mode == DFmode || mode == V2DFmode);
+ /* Estimate the approximate reciprocal square root. */
+ rtx xdst = gen_reg_rtx (mode);
+ emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
- int iterations = double_mode ? 3 : 2;
+ /* Iterate over the series twice for SF and thrice for DF. */
+ int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
- /* Optionally iterate over the series one less time than otherwise. */
- if (flag_mrecip_low_precision_sqrt)
+ /* Optionally iterate over the series once less for faster performance
+ while sacrificing the accuracy. */
+ if ((recp && flag_mrecip_low_precision_sqrt)
+ || (!recp && flag_mlow_precision_sqrt))
iterations--;
- for (int i = 0; i < iterations; ++i)
+ /* Iterate over the series to calculate the approximate reciprocal square root. */
+ rtx x1 = gen_reg_rtx (mode);
+ while (iterations--)
{
- rtx x1 = gen_reg_rtx (mode);
rtx x2 = gen_reg_rtx (mode);
- rtx x3 = gen_reg_rtx (mode);
- emit_set_insn (x2, gen_rtx_MULT (mode, x0, x0));
+ emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
- emit_insn ((*get_rsqrts_type (mode)) (x3, xsrc, x2));
+ emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
- emit_set_insn (x1, gen_rtx_MULT (mode, x0, x3));
- x0 = x1;
+ if (iterations > 0)
+ emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
}
- emit_move_insn (dst, x0);
+ if (!recp)
+ {
+ /* Qualify the approximate reciprocal square root when the argument is
+ 0.0 by squashing the intermediary result to 0.0. */
+ rtx xtmp = gen_reg_rtx (mmsk);
+ emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
+ gen_rtx_SUBREG (mmsk, xdst, 0)));
+ emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
+
+ /* Calculate the approximate square root. */
+ emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
+ }
+
+ /* Finalize the approximation. */
+ emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
+
+ return true;
}
/* Return the number of instructions that can be issued per cycle. */
@@ -8108,6 +8146,12 @@ aarch64_override_options_after_change_1 (struct gcc_options *opts)
&& (aarch64_cmodel == AARCH64_CMODEL_TINY
|| aarch64_cmodel == AARCH64_CMODEL_TINY_PIC))
aarch64_nopcrelative_literal_loads = false;
+
+ /* When enabling the lower precision Newton series for the square root, also
+ enable it for the reciprocal square root, since the latter is an
+ intermediary step for the former. */
+ if (flag_mlow_precision_sqrt)
+ flag_mrecip_low_precision_sqrt = true;
}
/* 'Unpack' up the internal tuning structs and update the options
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 9b282f1..aab3e00 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -4683,7 +4683,16 @@
[(set_attr "type" "ffarith<s>")]
)
-(define_insn "sqrt<mode>2"
+(define_expand "sqrt<mode>2"
+ [(set (match_operand:GPF 0 "register_operand")
+ (sqrt:GPF (match_operand:GPF 1 "register_operand")))]
+ "TARGET_SIMD"
+{
+ if (aarch64_emit_approx_sqrt (operands[0], operands[1], false))
+ DONE;
+})
+
+(define_insn "*sqrt<mode>2"
[(set (match_operand:GPF 0 "register_operand" "=w")
(sqrt:GPF (match_operand:GPF 1 "register_operand" "w")))]
"TARGET_FLOAT"
diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
index c637ff4..ffd5540 100644
--- a/gcc/config/aarch64/aarch64.opt
+++ b/gcc/config/aarch64/aarch64.opt
@@ -151,5 +151,10 @@ PC relative literal loads.
mlow-precision-recip-sqrt
Common Var(flag_mrecip_low_precision_sqrt) Optimization
-When calculating the reciprocal square root approximation,
-uses one less step than otherwise, thus reducing latency and precision.
+When calculating the approximate reciprocal square root,
+use one less step than otherwise, thus reducing latency and precision.
+
+mlow-precision-sqrt
+Common Var(flag_mlow_precision_sqrt) Optimization
+When calculating the approximate square root,
+use one less step than otherwise, thus reducing latency and precision.
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 4340b08..76b7a5c 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -574,6 +574,7 @@ Objective-C and Objective-C++ Dialects}.
-mfix-cortex-a53-835769 -mno-fix-cortex-a53-835769 @gol
-mfix-cortex-a53-843419 -mno-fix-cortex-a53-843419 @gol
-mlow-precision-recip-sqrt -mno-low-precision-recip-sqrt@gol
+-mlow-precision-sqrt -mno-low-precision-sqrt@gol
-march=@var{name} -mcpu=@var{name} -mtune=@var{name}}
@emph{Adapteva Epiphany Options}
@@ -12941,6 +12942,15 @@ uses one less step than otherwise, thus reducing latency and precision.
This is only relevant if @option{-ffast-math} enables the reciprocal square root
approximation.
+@item -mlow-precision-sqrt
+@item -mno-low-precision-sqrt
+@opindex -mlow-precision-sqrt
+@opindex -mno-low-precision-sqrt
+When calculating the square root approximation,
+uses one less step than otherwise, thus reducing latency and precision.
+This is only relevant if @option{-ffast-math} enables the square root
+approximation.
+
@item -march=@var{name}
@opindex march
Specify the name of the target architecture and, optionally, one or
--
2.6.3