On 04/05/16 13:37, Wilco Dijkstra wrote:
> I can't get any of these to work...  Not only do I get a large number of
> conflicts and duplicated code between these patches, but when I try to
> resolve them, all I get is crashes whenever I use sqrt (even rsqrt stopped
> working).  Do you have a patch set that applies cleanly so that I can try
> all of the approximation routines?
Hi, Wilco.

The original patches were meant to be independent of each other, so they do
indeed duplicate some code.  The combined patch set below should apply cleanly
and be suitable for testing all of the approximation routines.

HTH
--
Evandro Menezes
From cbc2b62f7df5c3e2fef2a24157b1bdd1a6de191b Mon Sep 17 00:00:00 2001
From: Evandro Menezes <e.mene...@samsung.com>
Date: Mon, 4 Apr 2016 14:02:24 -0500
Subject: [PATCH 3/3] [AArch64] Emit division using the Newton series
2016-04-04 Evandro Menezes <e.mene...@samsung.com>
Wilco Dijkstra <wilco.dijks...@arm.com>
gcc/
* config/aarch64/aarch64-protos.h
(tune_params): Add new member "approx_div_modes".
(aarch64_emit_approx_div): Declare new function.
* config/aarch64/aarch64.c
(generic_tunings): New member "approx_div_modes".
(cortexa35_tunings): Likewise.
(cortexa53_tunings): Likewise.
(cortexa57_tunings): Likewise.
(cortexa72_tunings): Likewise.
(exynosm1_tunings): Likewise.
(thunderx_tunings): Likewise.
(xgene1_tunings): Likewise.
(aarch64_emit_approx_div): Define new function.
* config/aarch64/aarch64.md ("div<mode>3"): New expansion.
* config/aarch64/aarch64-simd.md ("div<mode>3"): Likewise.
* config/aarch64/aarch64.opt (-mlow-precision-div): Add new option.
* doc/invoke.texi (-mlow-precision-div): Describe new option.
---
gcc/config/aarch64/aarch64-protos.h | 2 +
gcc/config/aarch64/aarch64-simd.md | 14 +++++-
gcc/config/aarch64/aarch64.c | 85 +++++++++++++++++++++++++++++++++++++
gcc/config/aarch64/aarch64.md | 19 +++++++--
gcc/config/aarch64/aarch64.opt | 5 +++
gcc/doc/invoke.texi | 10 +++++
6 files changed, 130 insertions(+), 5 deletions(-)
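
For reference, the sequence emitted by aarch64_emit_approx_div below is the
classic Newton-Raphson reciprocal series built on FRECPE/FRECPS: FRECPE gives
a rough, limited-precision estimate of 1/d and FRECPS computes the correction
factor 2 - a*b.  A minimal scalar sketch in plain C (the frecpe/frecps helpers
are crude stand-ins for the instructions, not the real intrinsics, and the
estimate is modeled with a float-precision reciprocal) illustrates the math:

#include <stdio.h>

/* Stand-ins for the AArch64 instructions: FRECPE returns a rough estimate
   of 1/d and FRECPS returns the Newton-Raphson correction factor 2 - a*b.  */
static double frecpe (double d) { return (double) (1.0f / (float) d); }
static double frecps (double a, double b) { return 2.0 - a * b; }

/* Mirror of the emitted sequence: estimate 1/den, refine it, and fold the
   numerator in together with the last correction step.  */
static double
approx_div (double num, double den, int iterations)
{
  double x = frecpe (den);
  double step = frecps (x, den);

  while (--iterations > 0)
    {
      x *= step;
      step = frecps (x, den);
    }

  return num * x * step;
}

int
main (void)
{
  printf ("approx: %.17g\n", approx_div (1.0, 3.0, 3));  /* 3 steps for DF.  */
  printf ("exact:  %.17g\n", 1.0 / 3.0);
  return 0;
}

As in aarch64_emit_approx_div, the sequence only makes sense under -ffast-math
(finite math, no trapping, unsafe optimizations) and not when optimizing for
size; -mlow-precision-div additionally drops one of the steps above.
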
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 85ad796..649faf7 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -244,6 +244,7 @@ struct tune_params
} autoprefetcher_model;
unsigned int extra_tuning_flags;
+ unsigned int approx_div_modes;
unsigned int approx_sqrt_modes;
unsigned int approx_rsqrt_modes;
};
@@ -390,6 +391,7 @@ void aarch64_relayout_simd_types (void);
void aarch64_reset_previous_fndecl (void);
void aarch64_save_restore_target_globals (tree);
bool aarch64_emit_approx_sqrt (rtx, rtx, bool);
+bool aarch64_emit_approx_div (rtx, rtx, rtx);
/* Initialize builtins for SIMD intrinsics. */
void init_aarch64_simd_builtins (void);
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 47ccb18..7e99e16 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1509,7 +1509,19 @@
[(set_attr "type" "neon_fp_mul_<Vetype><q>")]
)
-(define_insn "div<mode>3"
+(define_expand "div<mode>3"
+ [(set (match_operand:VDQF 0 "register_operand")
+ (div:VDQF (match_operand:VDQF 1 "general_operand")
+ (match_operand:VDQF 2 "register_operand")))]
+ "TARGET_SIMD"
+{
+ if (aarch64_emit_approx_div (operands[0], operands[1], operands[2]))
+ DONE;
+
+ operands[1] = force_reg (<MODE>mode, operands[1]);
+})
+
+(define_insn "*div<mode>3"
[(set (match_operand:VDQF 0 "register_operand" "=w")
(div:VDQF (match_operand:VDQF 1 "register_operand" "w")
(match_operand:VDQF 2 "register_operand" "w")))]
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 4af2175..74310e8 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -417,6 +417,7 @@ static const struct tune_params generic_tunings =
0, /* cache_line_size. */
tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
+ (AARCH64_APPROX_NONE), /* approx_div_modes. */
(AARCH64_APPROX_NONE), /* approx_sqrt_modes. */
(AARCH64_APPROX_NONE) /* approx_rsqrt_modes. */
};
@@ -444,6 +445,7 @@ static const struct tune_params cortexa35_tunings =
0, /* cache_line_size. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
+ (AARCH64_APPROX_NONE), /* approx_div_modes. */
(AARCH64_APPROX_NONE), /* approx_sqrt_modes. */
(AARCH64_APPROX_NONE) /* approx_rsqrt_modes. */
};
@@ -471,6 +473,7 @@ static const struct tune_params cortexa53_tunings =
0, /* cache_line_size. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
+ (AARCH64_APPROX_NONE), /* approx_div_modes. */
(AARCH64_APPROX_NONE), /* approx_sqrt_modes. */
(AARCH64_APPROX_NONE) /* approx_rsqrt_modes. */
};
@@ -498,6 +501,7 @@ static const struct tune_params cortexa57_tunings =
0, /* cache_line_size. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
+ (AARCH64_APPROX_NONE), /* approx_div_modes. */
(AARCH64_APPROX_NONE), /* approx_sqrt_modes. */
(AARCH64_APPROX_NONE) /* approx_rsqrt_modes. */
};
@@ -525,6 +529,7 @@ static const struct tune_params cortexa72_tunings =
0, /* cache_line_size. */
tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
+ (AARCH64_APPROX_NONE), /* approx_div_modes. */
(AARCH64_APPROX_NONE), /* approx_sqrt_modes. */
(AARCH64_APPROX_NONE) /* approx_rsqrt_modes. */
};
@@ -551,6 +556,7 @@ static const struct tune_params exynosm1_tunings =
64, /* cache_line_size. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
+ (AARCH64_APPROX_NONE), /* approx_div_modes. */
(AARCH64_APPROX_ALL), /* approx_sqrt_modes. */
(AARCH64_APPROX_ALL) /* approx_rsqrt_modes. */
};
@@ -577,6 +583,7 @@ static const struct tune_params thunderx_tunings =
0, /* cache_line_size. */
tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
+ (AARCH64_APPROX_NONE), /* approx_div_modes. */
(AARCH64_APPROX_NONE), /* approx_sqrt_modes. */
(AARCH64_APPROX_NONE) /* approx_rsqrt_modes. */
};
@@ -603,6 +610,7 @@ static const struct tune_params xgene1_tunings =
0, /* cache_line_size. */
tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
+ (AARCH64_APPROX_NONE), /* approx_div_modes. */
(AARCH64_APPROX_NONE), /* approx_sqrt_modes. */
(AARCH64_APPROX_ALL) /* approx_rsqrt_modes. */
};
@@ -7604,6 +7612,83 @@ aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
return true;
}
+/* Emit the instruction sequence to compute the approximation for a division. */
+
+bool
+aarch64_emit_approx_div (rtx quo, rtx num, rtx div)
+{
+ machine_mode mode = GET_MODE (quo);
+
+ if (!flag_finite_math_only
+ || flag_trapping_math
+ || !flag_unsafe_math_optimizations
+ || optimize_function_for_size_p (cfun)
+ || !(flag_mlow_precision_div
+ || (aarch64_tune_params.approx_div_modes & AARCH64_APPROX_MODE (mode))))
+ return false;
+
+ /* Estimate the approximate reciprocal. */
+ rtx xrcp = gen_reg_rtx (mode);
+ switch (mode)
+ {
+ case SFmode:
+ emit_insn (gen_aarch64_frecpesf (xrcp, div)); break;
+ case V2SFmode:
+ emit_insn (gen_aarch64_frecpev2sf (xrcp, div)); break;
+ case V4SFmode:
+ emit_insn (gen_aarch64_frecpev4sf (xrcp, div)); break;
+ case DFmode:
+ emit_insn (gen_aarch64_frecpedf (xrcp, div)); break;
+ case V2DFmode:
+ emit_insn (gen_aarch64_frecpev2df (xrcp, div)); break;
+ default:
+ gcc_unreachable ();
+ }
+
+ /* Iterate over the series twice for SF and thrice for DF. */
+ int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
+
+  /* Optionally iterate over the series one less time for better performance
+     at the expense of accuracy.  */
+ if (flag_mlow_precision_div)
+ iterations--;
+
+ /* Iterate over the series to calculate the approximate reciprocal. */
+ rtx xtmp = gen_reg_rtx (mode);
+ while (iterations--)
+ {
+ switch (mode)
+ {
+ case SFmode:
+ emit_insn (gen_aarch64_frecpssf (xtmp, xrcp, div)); break;
+ case V2SFmode:
+ emit_insn (gen_aarch64_frecpsv2sf (xtmp, xrcp, div)); break;
+ case V4SFmode:
+ emit_insn (gen_aarch64_frecpsv4sf (xtmp, xrcp, div)); break;
+ case DFmode:
+ emit_insn (gen_aarch64_frecpsdf (xtmp, xrcp, div)); break;
+ case V2DFmode:
+ emit_insn (gen_aarch64_frecpsv2df (xtmp, xrcp, div)); break;
+ default:
+ gcc_unreachable ();
+ }
+
+ if (iterations > 0)
+ emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
+ }
+
+ if (num != CONST1_RTX (mode))
+ {
+ /* Calculate the approximate division. */
+ rtx xnum = force_reg (mode, num);
+ emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
+ }
+
+ /* Return the approximation. */
+ emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
+ return true;
+}
+
/* Return the number of instructions that can be issued per cycle. */
static int
aarch64_sched_issue_rate (void)
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 43fa318..b42ce1a 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -4647,11 +4647,22 @@
[(set_attr "type" "fmul<s>")]
)
-(define_insn "div<mode>3"
+(define_expand "div<mode>3"
+ [(set (match_operand:GPF 0 "register_operand")
+ (div:GPF (match_operand:GPF 1 "general_operand")
+ (match_operand:GPF 2 "register_operand")))]
+ "TARGET_SIMD"
+{
+ if (aarch64_emit_approx_div (operands[0], operands[1], operands[2]))
+ DONE;
+
+ operands[1] = force_reg (<MODE>mode, operands[1]);
+})
+
+(define_insn "*div<mode>3"
[(set (match_operand:GPF 0 "register_operand" "=w")
- (div:GPF
- (match_operand:GPF 1 "register_operand" "w")
- (match_operand:GPF 2 "register_operand" "w")))]
+ (div:GPF (match_operand:GPF 1 "register_operand" "w")
+ (match_operand:GPF 2 "register_operand" "w")))]
"TARGET_FLOAT"
"fdiv\\t%<s>0, %<s>1, %<s>2"
[(set_attr "type" "fdiv<s>")]
diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
index ffd5540..760bd50 100644
--- a/gcc/config/aarch64/aarch64.opt
+++ b/gcc/config/aarch64/aarch64.opt
@@ -158,3 +158,8 @@ mlow-precision-sqrt
Common Var(flag_mlow_precision_sqrt) Optimization
When calculating the approximate square root,
use one less step than otherwise, thus reducing latency and precision.
+
+mlow-precision-div
+Common Var(flag_mlow_precision_div) Optimization
+When calculating the approximate division,
+use one less step than otherwise, thus reducing latency and precision.
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 01c3e87..8d33997 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -574,6 +574,7 @@ Objective-C and Objective-C++ Dialects}.
-mfix-cortex-a53-843419 -mno-fix-cortex-a53-843419 @gol
-mlow-precision-recip-sqrt -mno-low-precision-recip-sqrt@gol
-mlow-precision-sqrt -mno-low-precision-sqrt@gol
+-mlow-precision-div -mno-low-precision-div @gol
-march=@var{name} -mcpu=@var{name} -mtune=@var{name}}
@emph{Adapteva Epiphany Options}
@@ -12931,6 +12932,15 @@ uses one less step than otherwise, thus reducing latency and precision.
This is only relevant if @option{-ffast-math} enables the square root
approximation.
+@item -mlow-precision-div
+@itemx -mno-low-precision-div
+@opindex -mlow-precision-div
+@opindex -mno-low-precision-div
+When calculating the division approximation,
+uses one less step than otherwise, thus reducing latency and precision.
+This is only relevant if @option{-ffast-math} enables the division
+approximation.
+
@item -march=@var{name}
@opindex march
Specify the name of the target architecture and, optionally, one or
--
1.9.1
From ea7079be1850290146096e2b69c537875713ef62 Mon Sep 17 00:00:00 2001
From: Evandro Menezes <e.mene...@samsung.com>
Date: Mon, 4 Apr 2016 11:23:29 -0500
Subject: [PATCH 2/3] [AArch64] Emit square root using the Newton series
2016-04-04 Evandro Menezes <e.mene...@samsung.com>
Wilco Dijkstra <wilco.dijks...@arm.com>
gcc/
* config/aarch64/aarch64-protos.h
(aarch64_emit_approx_rsqrt): Replace with new function
"aarch64_emit_approx_sqrt".
(tune_params): New member "approx_sqrt_modes".
* config/aarch64/aarch64.c
(generic_tunings): New member "approx_sqrt_modes".
(cortexa35_tunings): Likewise.
(cortexa53_tunings): Likewise.
(cortexa57_tunings): Likewise.
(cortexa72_tunings): Likewise.
(exynosm1_tunings): Likewise.
(thunderx_tunings): Likewise.
(xgene1_tunings): Likewise.
(aarch64_emit_approx_rsqrt): Replace with new function
"aarch64_emit_approx_sqrt".
(aarch64_override_options_after_change_1): Handle new option.
* config/aarch64/aarch64-simd.md
(rsqrt<mode>2): Use new function instead.
(sqrt<mode>2): New expansion and insn definitions.
* config/aarch64/aarch64.md: Likewise.
* config/aarch64/aarch64.opt
(mlow-precision-sqrt): Add new option description.
* doc/invoke.texi (mlow-precision-sqrt): Likewise.
---
gcc/config/aarch64/aarch64-protos.h | 3 +-
gcc/config/aarch64/aarch64-simd.md | 13 ++++-
gcc/config/aarch64/aarch64.c | 99 +++++++++++++++++++++++++++----------
gcc/config/aarch64/aarch64.md | 11 ++++-
gcc/config/aarch64/aarch64.opt | 9 +++-
gcc/doc/invoke.texi | 10 ++++
6 files changed, 113 insertions(+), 32 deletions(-)
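
For reference, the rewritten aarch64_emit_approx_sqrt builds both rsqrt and
sqrt from the FRSQRTE/FRSQRTS pair: FRSQRTE gives a rough estimate of
1/sqrt(d) and FRSQRTS computes the correction factor (3 - a*b)/2, and the
square root is then obtained as d * 1/sqrt(d), with a mask squashing the
d == 0.0 case so that sqrt(0.0) yields 0.0 rather than a NaN.  A minimal
scalar sketch in plain C (the frsqrte/frsqrts helpers are stand-ins for the
instructions; the emitted code defers the last correction step and uses a
vector mask rather than a branch, but the math is the same):

#include <math.h>
#include <stdio.h>

/* Stand-ins for the AArch64 instructions: FRSQRTE returns a rough estimate
   of 1/sqrt(d) and FRSQRTS returns the correction factor (3 - a*b) / 2.  */
static double frsqrte (double d) { return (double) (1.0f / sqrtf ((float) d)); }
static double frsqrts (double a, double b) { return (3.0 - a * b) / 2.0; }

static double
approx_rsqrt (double d, int iterations)
{
  double x = frsqrte (d);

  while (iterations-- > 0)
    x *= frsqrts (d, x * x);     /* x *= (3 - d*x*x) / 2.  */

  return x;
}

static double
approx_sqrt (double d)
{
  if (d == 0.0)                  /* The emitted code uses a mask instead.  */
    return 0.0;

  return d * approx_rsqrt (d, 3);  /* sqrt(d) = d * 1/sqrt(d); 3 steps for DF.  */
}

int
main (void)
{
  printf ("approx: %.17g\n", approx_sqrt (2.0));
  printf ("exact:  %.17g\n", sqrt (2.0));
  return 0;
}

As with the division series, -mlow-precision-sqrt drops one step; the new code
in aarch64_override_options_after_change_1 also makes it imply
-mlow-precision-recip-sqrt, since the reciprocal square root is the
intermediary step.
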
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index fe1746b..85ad796 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -244,6 +244,7 @@ struct tune_params
} autoprefetcher_model;
unsigned int extra_tuning_flags;
+ unsigned int approx_sqrt_modes;
unsigned int approx_rsqrt_modes;
};
@@ -388,7 +389,7 @@ void aarch64_register_pragmas (void);
void aarch64_relayout_simd_types (void);
void aarch64_reset_previous_fndecl (void);
void aarch64_save_restore_target_globals (tree);
-void aarch64_emit_approx_rsqrt (rtx, rtx);
+bool aarch64_emit_approx_sqrt (rtx, rtx, bool);
/* Initialize builtins for SIMD intrinsics. */
void init_aarch64_simd_builtins (void);
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index bd73bce..47ccb18 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -405,7 +405,7 @@
UNSPEC_RSQRT))]
"TARGET_SIMD"
{
- aarch64_emit_approx_rsqrt (operands[0], operands[1]);
+ aarch64_emit_approx_sqrt (operands[0], operands[1], true);
DONE;
})
@@ -4307,7 +4307,16 @@
;; sqrt
-(define_insn "sqrt<mode>2"
+(define_expand "sqrt<mode>2"
+ [(set (match_operand:VDQF 0 "register_operand")
+ (sqrt:VDQF (match_operand:VDQF 1 "register_operand")))]
+ "TARGET_SIMD"
+{
+ if (aarch64_emit_approx_sqrt (operands[0], operands[1], false))
+ DONE;
+})
+
+(define_insn "*sqrt<mode>2"
[(set (match_operand:VDQF 0 "register_operand" "=w")
(sqrt:VDQF (match_operand:VDQF 1 "register_operand" "w")))]
"TARGET_SIMD"
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index b0ee11e..4af2175 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -38,6 +38,7 @@
#include "recog.h"
#include "diagnostic.h"
#include "insn-attr.h"
+#include "insn-flags.h"
#include "insn-modes.h"
#include "alias.h"
#include "fold-const.h"
@@ -416,6 +417,7 @@ static const struct tune_params generic_tunings =
0, /* cache_line_size. */
tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
+ (AARCH64_APPROX_NONE), /* approx_sqrt_modes. */
(AARCH64_APPROX_NONE) /* approx_rsqrt_modes. */
};
@@ -442,6 +444,7 @@ static const struct tune_params cortexa35_tunings =
0, /* cache_line_size. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
+ (AARCH64_APPROX_NONE), /* approx_sqrt_modes. */
(AARCH64_APPROX_NONE) /* approx_rsqrt_modes. */
};
@@ -468,6 +471,7 @@ static const struct tune_params cortexa53_tunings =
0, /* cache_line_size. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
+ (AARCH64_APPROX_NONE), /* approx_sqrt_modes. */
(AARCH64_APPROX_NONE) /* approx_rsqrt_modes. */
};
@@ -494,6 +498,7 @@ static const struct tune_params cortexa57_tunings =
0, /* cache_line_size. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
+ (AARCH64_APPROX_NONE), /* approx_sqrt_modes. */
(AARCH64_APPROX_NONE) /* approx_rsqrt_modes. */
};
@@ -520,6 +525,7 @@ static const struct tune_params cortexa72_tunings =
0, /* cache_line_size. */
tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
+ (AARCH64_APPROX_NONE), /* approx_sqrt_modes. */
(AARCH64_APPROX_NONE) /* approx_rsqrt_modes. */
};
@@ -545,6 +551,7 @@ static const struct tune_params exynosm1_tunings =
64, /* cache_line_size. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
+ (AARCH64_APPROX_ALL), /* approx_sqrt_modes. */
(AARCH64_APPROX_ALL) /* approx_rsqrt_modes. */
};
@@ -570,6 +577,7 @@ static const struct tune_params thunderx_tunings =
0, /* cache_line_size. */
tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
+ (AARCH64_APPROX_NONE), /* approx_sqrt_modes. */
(AARCH64_APPROX_NONE) /* approx_rsqrt_modes. */
};
@@ -595,6 +603,7 @@ static const struct tune_params xgene1_tunings =
0, /* cache_line_size. */
tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
+ (AARCH64_APPROX_NONE), /* approx_sqrt_modes. */
(AARCH64_APPROX_ALL) /* approx_rsqrt_modes. */
};
@@ -7521,46 +7530,78 @@ get_rsqrts_type (machine_mode mode)
}
}
-/* Emit instruction sequence to compute the reciprocal square root using the
- Newton-Raphson series. Iterate over the series twice for SF
- and thrice for DF. */
+/* Emit instruction sequence to compute either the approximate square root
+ or its approximate reciprocal. */
-void
-aarch64_emit_approx_rsqrt (rtx dst, rtx src)
+bool
+aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
{
- machine_mode mode = GET_MODE (src);
- gcc_assert (
- mode == SFmode || mode == V2SFmode || mode == V4SFmode
- || mode == DFmode || mode == V2DFmode);
-
- rtx xsrc = gen_reg_rtx (mode);
- emit_move_insn (xsrc, src);
- rtx x0 = gen_reg_rtx (mode);
+ machine_mode mode = GET_MODE (dst);
+ machine_mode mmsk = mode_for_vector (int_mode_for_mode (GET_MODE_INNER (mode)),
+ GET_MODE_NUNITS (mode));
+
+ if (!flag_finite_math_only
+ || flag_trapping_math
+ || !flag_unsafe_math_optimizations
+ || optimize_function_for_size_p (cfun)
+ || !((recp && (flag_mrecip_low_precision_sqrt
+ || (aarch64_tune_params.approx_rsqrt_modes
+ & AARCH64_APPROX_MODE (mode))))
+ || (!recp && (flag_mlow_precision_sqrt
+ || (aarch64_tune_params.approx_sqrt_modes
+ & AARCH64_APPROX_MODE (mode))))))
+ return false;
- emit_insn ((*get_rsqrte_type (mode)) (x0, xsrc));
+ rtx xmsk = gen_reg_rtx (mmsk);
+ if (!recp)
+ /* When calculating the approximate square root, compare the argument with
+ 0.0 and create a mask. */
+ emit_insn (gen_rtx_SET (xmsk, gen_rtx_NEG (mmsk, gen_rtx_EQ (mmsk, src,
+ CONST0_RTX (mode)))));
- bool double_mode = (mode == DFmode || mode == V2DFmode);
+ /* Estimate the approximate reciprocal square root. */
+ rtx xdst = gen_reg_rtx (mode);
+ emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
- int iterations = double_mode ? 3 : 2;
+ /* Iterate over the series twice for SF and thrice for DF. */
+ int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
- /* Optionally iterate over the series one less time than otherwise. */
- if (flag_mrecip_low_precision_sqrt)
+  /* Optionally iterate over the series one less time for better performance
+     at the expense of accuracy.  */
+ if ((recp && flag_mrecip_low_precision_sqrt)
+ || (!recp && flag_mlow_precision_sqrt))
iterations--;
- for (int i = 0; i < iterations; ++i)
+ /* Iterate over the series to calculate the approximate reciprocal square root. */
+ rtx x1 = gen_reg_rtx (mode);
+ while (iterations--)
{
- rtx x1 = gen_reg_rtx (mode);
rtx x2 = gen_reg_rtx (mode);
- rtx x3 = gen_reg_rtx (mode);
- emit_set_insn (x2, gen_rtx_MULT (mode, x0, x0));
+ emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
+
+ emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
- emit_insn ((*get_rsqrts_type (mode)) (x3, xsrc, x2));
+ if (iterations > 0)
+ emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
+ }
+
+ if (!recp)
+ {
+ /* Qualify the approximate reciprocal square root when the argument is
+ 0.0 by squashing the intermediary result to 0.0. */
+ rtx xtmp = gen_reg_rtx (mmsk);
+ emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
+ gen_rtx_SUBREG (mmsk, xdst, 0)));
+ emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
- emit_set_insn (x1, gen_rtx_MULT (mode, x0, x3));
- x0 = x1;
+ /* Calculate the approximate square root. */
+ emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
}
- emit_move_insn (dst, x0);
+ /* Return the approximation. */
+ emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
+
+ return true;
}
/* Return the number of instructions that can be issued per cycle. */
@@ -8167,6 +8208,12 @@ aarch64_override_options_after_change_1 (struct gcc_options *opts)
&& (aarch64_cmodel == AARCH64_CMODEL_TINY
|| aarch64_cmodel == AARCH64_CMODEL_TINY_PIC))
aarch64_nopcrelative_literal_loads = false;
+
+  /* When enabling the lower precision Newton series for the square root, also
+     enable it for the reciprocal square root, since the latter is an
+     intermediary step for the former.  */
+ if (flag_mlow_precision_sqrt)
+ flag_mrecip_low_precision_sqrt = true;
}
/* 'Unpack' up the internal tuning structs and update the options
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 68676c9..43fa318 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -4665,7 +4665,16 @@
[(set_attr "type" "ffarith<s>")]
)
-(define_insn "sqrt<mode>2"
+(define_expand "sqrt<mode>2"
+ [(set (match_operand:GPF 0 "register_operand")
+ (sqrt:GPF (match_operand:GPF 1 "register_operand")))]
+ "TARGET_SIMD"
+{
+ if (aarch64_emit_approx_sqrt (operands[0], operands[1], false))
+ DONE;
+})
+
+(define_insn "*sqrt<mode>2"
[(set (match_operand:GPF 0 "register_operand" "=w")
(sqrt:GPF (match_operand:GPF 1 "register_operand" "w")))]
"TARGET_FLOAT"
diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
index c637ff4..ffd5540 100644
--- a/gcc/config/aarch64/aarch64.opt
+++ b/gcc/config/aarch64/aarch64.opt
@@ -151,5 +151,10 @@ PC relative literal loads.
mlow-precision-recip-sqrt
Common Var(flag_mrecip_low_precision_sqrt) Optimization
-When calculating the reciprocal square root approximation,
-uses one less step than otherwise, thus reducing latency and precision.
+When calculating the approximate reciprocal square root,
+use one less step than otherwise, thus reducing latency and precision.
+
+mlow-precision-sqrt
+Common Var(flag_mlow_precision_sqrt) Optimization
+When calculating the approximate square root,
+use one less step than otherwise, thus reducing latency and precision.
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 488c52c..01c3e87 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -573,6 +573,7 @@ Objective-C and Objective-C++ Dialects}.
-mfix-cortex-a53-835769 -mno-fix-cortex-a53-835769 @gol
-mfix-cortex-a53-843419 -mno-fix-cortex-a53-843419 @gol
-mlow-precision-recip-sqrt -mno-low-precision-recip-sqrt@gol
+-mlow-precision-sqrt -mno-low-precision-sqrt@gol
-march=@var{name} -mcpu=@var{name} -mtune=@var{name}}
@emph{Adapteva Epiphany Options}
@@ -12921,6 +12922,15 @@ uses one less step than otherwise, thus reducing latency and precision.
This is only relevant if @option{-ffast-math} enables the reciprocal square root
approximation.
+@item -mlow-precision-sqrt
+@itemx -mno-low-precision-sqrt
+@opindex -mlow-precision-sqrt
+@opindex -mno-low-precision-sqrt
+When calculating the square root approximation,
+uses one less step than otherwise, thus reducing latency and precision.
+This is only relevant if @option{-ffast-math} enables the square root
+approximation.
+
@item -march=@var{name}
@opindex march
Specify the name of the target architecture and, optionally, one or
--
1.9.1
From 428d21df1ae04ad263ddb9b0493cc40a3e566e04 Mon Sep 17 00:00:00 2001
From: Evandro Menezes <e.mene...@samsung.com>
Date: Thu, 3 Mar 2016 18:13:46 -0600
Subject: [PATCH 1/3] [AArch64] Add more choices for the reciprocal square root
approximation
Allow a target to opt in to the approximation depending on the operation mode.
2016-03-03 Evandro Menezes <e.mene...@samsung.com>
gcc/
* config/aarch64/aarch64-protos.h
(AARCH64_APPROX_MODE): New macro.
(AARCH64_APPROX_{NONE,SP,DP,DFORM,QFORM,SCALAR,VECTOR,ALL}): Likewise.
(tune_params): New member "approx_rsqrt_modes".
* config/aarch64/aarch64-tuning-flags.def
(AARCH64_EXTRA_TUNE_APPROX_RSQRT): Remove macro.
* config/aarch64/aarch64.c
(generic_tunings): New member "approx_rsqrt_modes".
(cortexa35_tunings): Likewise.
(cortexa53_tunings): Likewise.
(cortexa57_tunings): Likewise.
(cortexa72_tunings): Likewise.
(exynosm1_tunings): Likewise.
(thunderx_tunings): Likewise.
(xgene1_tunings): Likewise.
(use_rsqrt_p): Add mode argument and use the new member of
"tune_params".
(aarch64_builtin_reciprocal): Determine the mode from the builtin.
(aarch64_optab_supported_p): Add mode argument.
* doc/invoke.texi (-mlow-precision-recip-sqrt): Reword description.
---
gcc/config/aarch64/aarch64-protos.h | 27 ++++++++++++++++++++
gcc/config/aarch64/aarch64-tuning-flags.def | 2 --
gcc/config/aarch64/aarch64.c | 39 ++++++++++++++++++-----------
gcc/doc/invoke.texi | 2 +-
4 files changed, 53 insertions(+), 17 deletions(-)
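
For reference, AARCH64_APPROX_MODE simply assigns one bit per scalar or vector
FP mode so that a tuning structure can opt in per mode.  A toy model in plain
C (made-up enum values and a hypothetical tuning choice, not GCC's real
machine_mode ordering) shows the intent:

#include <stdio.h>

/* Toy stand-ins for the FP modes; the real macro offsets into GCC's
   machine_mode enumeration instead.  */
enum toy_mode { SFmode, DFmode, V2SFmode, V4SFmode, V2DFmode, NUM_MODES };

#define APPROX_MODE(M)  (1u << (M))

int
main (void)
{
  /* Hypothetical tuning choice: approximate rsqrt only for the quad-word
     vector modes, much like AARCH64_APPROX_QFORM would.  */
  unsigned int approx_rsqrt_modes
    = APPROX_MODE (V4SFmode) | APPROX_MODE (V2DFmode);

  static const char *const names[NUM_MODES]
    = { "SFmode", "DFmode", "V2SFmode", "V4SFmode", "V2DFmode" };

  for (int m = 0; m < NUM_MODES; m++)
    printf ("%-8s -> %s\n", names[m],
            (approx_rsqrt_modes & APPROX_MODE (m)) ? "approximate" : "exact");
  return 0;
}

In the patch itself, exynosm1_tunings and xgene1_tunings select
AARCH64_APPROX_ALL for approx_rsqrt_modes, while the remaining tunings keep
AARCH64_APPROX_NONE, so -mlow-precision-recip-sqrt stays the only way to
enable the sequence there.
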
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 58c9d0d..fe1746b 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -178,6 +178,32 @@ struct cpu_branch_cost
const int unpredictable; /* Unpredictable branch or optimizing for speed. */
};
+/* Control approximate alternatives to certain FP operators. */
+#define AARCH64_APPROX_MODE(MODE) \
+ ((MIN_MODE_FLOAT <= (MODE) && (MODE) <= MAX_MODE_FLOAT) \
+ ? (1 << ((MODE) - MIN_MODE_FLOAT)) \
+ : (MIN_MODE_VECTOR_FLOAT <= (MODE) && (MODE) <= MAX_MODE_VECTOR_FLOAT) \
+ ? (1 << ((MODE) - MIN_MODE_VECTOR_FLOAT \
+ + MAX_MODE_FLOAT - MIN_MODE_FLOAT + 1)) \
+ : (0))
+#define AARCH64_APPROX_NONE (0)
+#define AARCH64_APPROX_SP (AARCH64_APPROX_MODE (SFmode) \
+ | AARCH64_APPROX_MODE (V2SFmode) \
+ | AARCH64_APPROX_MODE (V4SFmode))
+#define AARCH64_APPROX_DP (AARCH64_APPROX_MODE (DFmode) \
+ | AARCH64_APPROX_MODE (V2DFmode))
+#define AARCH64_APPROX_DFORM (AARCH64_APPROX_MODE (SFmode) \
+ | AARCH64_APPROX_MODE (DFmode) \
+ | AARCH64_APPROX_MODE (V2SFmode))
+#define AARCH64_APPROX_QFORM (AARCH64_APPROX_MODE (V4SFmode) \
+ | AARCH64_APPROX_MODE (V2DFmode))
+#define AARCH64_APPROX_SCALAR (AARCH64_APPROX_MODE (SFmode) \
+ | AARCH64_APPROX_MODE (DFmode))
+#define AARCH64_APPROX_VECTOR (AARCH64_APPROX_MODE (V2SFmode) \
+ | AARCH64_APPROX_MODE (V4SFmode) \
+ | AARCH64_APPROX_MODE (V2DFmode))
+#define AARCH64_APPROX_ALL (-1)
+
struct tune_params
{
const struct cpu_cost_table *insn_extra_cost;
@@ -218,6 +244,7 @@ struct tune_params
} autoprefetcher_model;
unsigned int extra_tuning_flags;
+ unsigned int approx_rsqrt_modes;
};
#define AARCH64_FUSION_PAIR(x, name) \
diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def b/gcc/config/aarch64/aarch64-tuning-flags.def
index 7e45a0c..048c2a3 100644
--- a/gcc/config/aarch64/aarch64-tuning-flags.def
+++ b/gcc/config/aarch64/aarch64-tuning-flags.def
@@ -29,5 +29,3 @@
AARCH64_TUNE_ to give an enum name. */
AARCH64_EXTRA_TUNING_OPTION ("rename_fma_regs", RENAME_FMA_REGS)
-AARCH64_EXTRA_TUNING_OPTION ("approx_rsqrt", APPROX_RSQRT)
-
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index b7086dd..b0ee11e 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -38,6 +38,7 @@
#include "recog.h"
#include "diagnostic.h"
#include "insn-attr.h"
+#include "insn-modes.h"
#include "alias.h"
#include "fold-const.h"
#include "stor-layout.h"
@@ -414,7 +415,8 @@ static const struct tune_params generic_tunings =
0, /* max_case_values. */
0, /* cache_line_size. */
tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
- (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
+ (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
+ (AARCH64_APPROX_NONE) /* approx_rsqrt_modes. */
};
static const struct tune_params cortexa35_tunings =
@@ -439,7 +441,8 @@ static const struct tune_params cortexa35_tunings =
0, /* max_case_values. */
0, /* cache_line_size. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
- (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
+ (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
+ (AARCH64_APPROX_NONE) /* approx_rsqrt_modes. */
};
static const struct tune_params cortexa53_tunings =
@@ -464,7 +467,8 @@ static const struct tune_params cortexa53_tunings =
0, /* max_case_values. */
0, /* cache_line_size. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
- (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
+ (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
+ (AARCH64_APPROX_NONE) /* approx_rsqrt_modes. */
};
static const struct tune_params cortexa57_tunings =
@@ -489,7 +493,8 @@ static const struct tune_params cortexa57_tunings =
0, /* max_case_values. */
0, /* cache_line_size. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
- (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS) /* tune_flags. */
+ (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
+ (AARCH64_APPROX_NONE) /* approx_rsqrt_modes. */
};
static const struct tune_params cortexa72_tunings =
@@ -514,7 +519,8 @@ static const struct tune_params cortexa72_tunings =
0, /* max_case_values. */
0, /* cache_line_size. */
tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
- (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
+ (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
+ (AARCH64_APPROX_NONE) /* approx_rsqrt_modes. */
};
static const struct tune_params exynosm1_tunings =
@@ -538,7 +544,8 @@ static const struct tune_params exynosm1_tunings =
48, /* max_case_values. */
64, /* cache_line_size. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
- (AARCH64_EXTRA_TUNE_APPROX_RSQRT) /* tune_flags. */
+ (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
+ (AARCH64_APPROX_ALL) /* approx_rsqrt_modes. */
};
static const struct tune_params thunderx_tunings =
@@ -562,7 +569,8 @@ static const struct tune_params thunderx_tunings =
0, /* max_case_values. */
0, /* cache_line_size. */
tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
- (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
+ (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
+ (AARCH64_APPROX_NONE) /* approx_rsqrt_modes. */
};
static const struct tune_params xgene1_tunings =
@@ -586,7 +594,8 @@ static const struct tune_params xgene1_tunings =
0, /* max_case_values. */
0, /* cache_line_size. */
tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
- (AARCH64_EXTRA_TUNE_APPROX_RSQRT) /* tune_flags. */
+ (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
+ (AARCH64_APPROX_ALL) /* approx_rsqrt_modes. */
};
/* Support for fine-grained override of the tuning structures. */
@@ -7452,12 +7461,12 @@ aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
to optimize 1.0/sqrt. */
static bool
-use_rsqrt_p (void)
+use_rsqrt_p (machine_mode mode)
{
return (!flag_trapping_math
&& flag_unsafe_math_optimizations
- && ((aarch64_tune_params.extra_tuning_flags
- & AARCH64_EXTRA_TUNE_APPROX_RSQRT)
+ && ((aarch64_tune_params.approx_rsqrt_modes
+ & AARCH64_APPROX_MODE (mode))
|| flag_mrecip_low_precision_sqrt));
}
@@ -7467,7 +7476,9 @@ use_rsqrt_p (void)
static tree
aarch64_builtin_reciprocal (tree fndecl)
{
- if (!use_rsqrt_p ())
+ machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
+
+ if (!use_rsqrt_p (mode))
return NULL_TREE;
return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
}
@@ -13964,13 +13975,13 @@ aarch64_promoted_type (const_tree t)
/* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
static bool
-aarch64_optab_supported_p (int op, machine_mode, machine_mode,
+aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
optimization_type opt_type)
{
switch (op)
{
case rsqrt_optab:
- return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p ();
+ return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
default:
return true;
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index e9763d4..488c52c 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -12919,7 +12919,7 @@ corresponding flag to the linker.
When calculating the reciprocal square root approximation,
uses one less step than otherwise, thus reducing latency and precision.
This is only relevant if @option{-ffast-math} enables the reciprocal square root
-approximation, which in turn depends on the target processor.
+approximation.
@item -march=@var{name}
@opindex march
--
1.9.1