gcc/ * config/aarch64/aarch64-protos.h (aarch64_emit_approx_rsqrt): Replace with new function "aarch64_emit_approx_sqrt". (tune_params): New member "approx_sqrt_modes". * config/aarch64/aarch64.c (generic_tunings): New member "approx_sqrt_modes". (cortexa35_tunings): Likewise. (cortexa53_tunings): Likewise. (cortexa57_tunings): Likewise. (cortexa72_tunings): Likewise. (exynosm1_tunings): Likewise. (thunderx_tunings): Likewise. (xgene1_tunings): Likewise. (aarch64_emit_approx_rsqrt): Replace with new function "aarch64_emit_approx_sqrt". (aarch64_override_options_after_change_1): Handle new option. * config/aarch64/aarch64-simd.md (rsqrt<mode>2): Use new function instead. (sqrt<mode>2): New expansion and insn definitions. * config/aarch64/aarch64.md: Likewise. * config/aarch64/aarch64.opt (mlow-precision-sqrt): Add new option description. * doc/invoke.texi (mlow-precision-sqrt): Likewise.
-- Evandro Menezes
>From 753115a8691afd7aed4a510d9e9cb0a8e859acf4 Mon Sep 17 00:00:00 2001 From: Evandro Menezes <e.mene...@samsung.com> Date: Mon, 4 Apr 2016 11:23:29 -0500 Subject: [PATCH 2/3] [AArch64] Emit square root using the Newton series 2016-04-04 Evandro Menezes <e.mene...@samsung.com> Wilco Dijkstra <wilco.dijks...@arm.com> gcc/ * config/aarch64/aarch64-protos.h (aarch64_emit_approx_rsqrt): Replace with new function "aarch64_emit_approx_sqrt". (tune_params): New member "approx_sqrt_modes". * config/aarch64/aarch64.c (generic_tunings): New member "approx_sqrt_modes". (cortexa35_tunings): Likewise. (cortexa53_tunings): Likewise. (cortexa57_tunings): Likewise. (cortexa72_tunings): Likewise. (exynosm1_tunings): Likewise. (thunderx_tunings): Likewise. (xgene1_tunings): Likewise. (aarch64_emit_approx_rsqrt): Replace with new function "aarch64_emit_approx_sqrt". (aarch64_override_options_after_change_1): Handle new option. * config/aarch64/aarch64-simd.md (rsqrt<mode>2): Use new function instead. (sqrt<mode>2): New expansion and insn definitions. * config/aarch64/aarch64.md: Likewise. * config/aarch64/aarch64.opt (mlow-precision-sqrt): Add new option description. * doc/invoke.texi (mlow-precision-sqrt): Likewise. 
--- gcc/config/aarch64/aarch64-protos.h | 3 +- gcc/config/aarch64/aarch64-simd.md | 13 ++++- gcc/config/aarch64/aarch64.c | 99 +++++++++++++++++++++++++++---------- gcc/config/aarch64/aarch64.md | 11 ++++- gcc/config/aarch64/aarch64.opt | 9 +++- gcc/doc/invoke.texi | 10 ++++ 6 files changed, 113 insertions(+), 32 deletions(-) diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h index 50f1d24..437f6af 100644 --- a/gcc/config/aarch64/aarch64-protos.h +++ b/gcc/config/aarch64/aarch64-protos.h @@ -244,6 +244,7 @@ struct tune_params } autoprefetcher_model; unsigned int extra_tuning_flags; + unsigned int approx_sqrt_modes; unsigned int approx_rsqrt_modes; }; @@ -396,7 +397,7 @@ void aarch64_register_pragmas (void); void aarch64_relayout_simd_types (void); void aarch64_reset_previous_fndecl (void); void aarch64_save_restore_target_globals (tree); -void aarch64_emit_approx_rsqrt (rtx, rtx); +bool aarch64_emit_approx_sqrt (rtx, rtx, bool); /* Initialize builtins for SIMD intrinsics. 
*/ void init_aarch64_simd_builtins (void); diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index bd73bce..47ccb18 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -405,7 +405,7 @@ UNSPEC_RSQRT))] "TARGET_SIMD" { - aarch64_emit_approx_rsqrt (operands[0], operands[1]); + aarch64_emit_approx_sqrt (operands[0], operands[1], true); DONE; }) @@ -4307,7 +4307,16 @@ ;; sqrt -(define_insn "sqrt<mode>2" +(define_expand "sqrt<mode>2" + [(set (match_operand:VDQF 0 "register_operand") + (sqrt:VDQF (match_operand:VDQF 1 "register_operand")))] + "TARGET_SIMD" +{ + if (aarch64_emit_approx_sqrt (operands[0], operands[1], false)) + DONE; +}) + +(define_insn "*sqrt<mode>2" [(set (match_operand:VDQF 0 "register_operand" "=w") (sqrt:VDQF (match_operand:VDQF 1 "register_operand" "w")))] "TARGET_SIMD" diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 68381bf..589871b 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -38,6 +38,7 @@ #include "recog.h" #include "diagnostic.h" #include "insn-attr.h" +#include "insn-flags.h" #include "insn-modes.h" #include "alias.h" #include "fold-const.h" @@ -416,6 +417,7 @@ static const struct tune_params generic_tunings = 0, /* cache_line_size. */ tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */ (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ + (AARCH64_APPROX_NONE), /* approx_sqrt_modes. */ (AARCH64_APPROX_NONE) /* approx_rsqrt_modes. */ }; @@ -442,6 +444,7 @@ static const struct tune_params cortexa35_tunings = 0, /* cache_line_size. */ tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ + (AARCH64_APPROX_NONE), /* approx_sqrt_modes. */ (AARCH64_APPROX_NONE) /* approx_rsqrt_modes. */ }; @@ -468,6 +471,7 @@ static const struct tune_params cortexa53_tunings = 0, /* cache_line_size. */ tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. 
*/ (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ + (AARCH64_APPROX_NONE), /* approx_sqrt_modes. */ (AARCH64_APPROX_NONE) /* approx_rsqrt_modes. */ }; @@ -494,6 +498,7 @@ static const struct tune_params cortexa57_tunings = 0, /* cache_line_size. */ tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */ + (AARCH64_APPROX_NONE), /* approx_sqrt_modes. */ (AARCH64_APPROX_NONE) /* approx_rsqrt_modes. */ }; @@ -520,6 +525,7 @@ static const struct tune_params cortexa72_tunings = 0, /* cache_line_size. */ tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */ (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ + (AARCH64_APPROX_NONE), /* approx_sqrt_modes. */ (AARCH64_APPROX_NONE) /* approx_rsqrt_modes. */ }; @@ -545,6 +551,7 @@ static const struct tune_params exynosm1_tunings = 64, /* cache_line_size. */ tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ + (AARCH64_APPROX_ALL), /* approx_sqrt_modes. */ (AARCH64_APPROX_ALL) /* approx_rsqrt_modes. */ }; @@ -570,6 +577,7 @@ static const struct tune_params thunderx_tunings = 0, /* cache_line_size. */ tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */ (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ + (AARCH64_APPROX_NONE), /* approx_sqrt_modes. */ (AARCH64_APPROX_NONE) /* approx_rsqrt_modes. */ }; @@ -595,6 +603,7 @@ static const struct tune_params xgene1_tunings = 0, /* cache_line_size. */ tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */ (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ + (AARCH64_APPROX_NONE), /* approx_sqrt_modes. */ (AARCH64_APPROX_ALL) /* approx_rsqrt_modes. */ }; @@ -7521,46 +7530,78 @@ get_rsqrts_type (machine_mode mode) } } -/* Emit instruction sequence to compute the reciprocal square root using the - Newton-Raphson series. Iterate over the series twice for SF - and thrice for DF. 
*/ +/* Emit instruction sequence to compute either the approximate square root + or its approximate reciprocal. */ -void -aarch64_emit_approx_rsqrt (rtx dst, rtx src) +bool +aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp) { - machine_mode mode = GET_MODE (src); - gcc_assert ( - mode == SFmode || mode == V2SFmode || mode == V4SFmode - || mode == DFmode || mode == V2DFmode); - - rtx xsrc = gen_reg_rtx (mode); - emit_move_insn (xsrc, src); - rtx x0 = gen_reg_rtx (mode); + machine_mode mode = GET_MODE (dst); + machine_mode mmsk = mode_for_vector (int_mode_for_mode (GET_MODE_INNER (mode)), + GET_MODE_NUNITS (mode)); + + if (!flag_finite_math_only + || flag_trapping_math + || !flag_unsafe_math_optimizations + || optimize_function_for_size_p (cfun) + || !((recp && (flag_mrecip_low_precision_sqrt + || (aarch64_tune_params.approx_rsqrt_modes + & AARCH64_APPROX_MODE (mode)))) + || (!recp && (flag_mlow_precision_sqrt + || (aarch64_tune_params.approx_sqrt_modes + & AARCH64_APPROX_MODE (mode)))))) + return false; - emit_insn ((*get_rsqrte_type (mode)) (x0, xsrc)); + rtx xmsk = gen_reg_rtx (mmsk); + if (!recp) + /* When calculating the approximate square root, compare the argument with + 0.0 and create a mask. */ + emit_insn (gen_rtx_SET (xmsk, gen_rtx_NEG (mmsk, gen_rtx_EQ (mmsk, src, + CONST0_RTX (mode))))); - bool double_mode = (mode == DFmode || mode == V2DFmode); + /* Estimate the approximate reciprocal square root. */ + rtx xdst = gen_reg_rtx (mode); + emit_insn ((*get_rsqrte_type (mode)) (xdst, src)); - int iterations = double_mode ? 3 : 2; + /* Iterate over the series twice for SF and thrice for DF. */ + int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2; - /* Optionally iterate over the series one less time than otherwise. */ - if (flag_mrecip_low_precision_sqrt) + /* Optionally iterate over the series once less for faster performance + while sacrificing the accuracy. 
*/ + if ((recp && flag_mrecip_low_precision_sqrt) + || (!recp && flag_mlow_precision_sqrt)) iterations--; - for (int i = 0; i < iterations; ++i) + /* Iterate over the series to calculate the approximate reciprocal square root. */ + rtx x1 = gen_reg_rtx (mode); + while (iterations--) { - rtx x1 = gen_reg_rtx (mode); rtx x2 = gen_reg_rtx (mode); - rtx x3 = gen_reg_rtx (mode); - emit_set_insn (x2, gen_rtx_MULT (mode, x0, x0)); + emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst)); + + emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2)); - emit_insn ((*get_rsqrts_type (mode)) (x3, xsrc, x2)); + if (iterations > 0) + emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1)); + } + + if (!recp) + { + /* Qualify the approximate reciprocal square root when the argument is + 0.0 by squashing the intermediary result to 0.0. */ + rtx xtmp = gen_reg_rtx (mmsk); + emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk), + gen_rtx_SUBREG (mmsk, xdst, 0))); + emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0)); - emit_set_insn (x1, gen_rtx_MULT (mode, x0, x3)); - x0 = x1; + /* Calculate the approximate square root. */ + emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src)); } - emit_move_insn (dst, x0); + /* Return the approximation. */ + emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1)); + + return true; } /* Return the number of instructions that can be issued per cycle. */ @@ -8090,6 +8131,12 @@ aarch64_override_options_after_change_1 (struct gcc_options *opts) && (aarch64_cmodel == AARCH64_CMODEL_TINY || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)) aarch64_nopcrelative_literal_loads = false; + + /* When enabling the lower precision Newton series for the square root, also + enable it for the reciprocal square root, since the latter is an + intermediary step for the former. 
*/ + if (flag_mlow_precision_sqrt) + flag_mrecip_low_precision_sqrt = true; } /* 'Unpack' up the internal tuning structs and update the options diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index 9b282f1..aab3e00 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -4683,7 +4683,16 @@ [(set_attr "type" "ffarith<s>")] ) -(define_insn "sqrt<mode>2" +(define_expand "sqrt<mode>2" + [(set (match_operand:GPF 0 "register_operand") + (sqrt:GPF (match_operand:GPF 1 "register_operand")))] + "TARGET_SIMD" +{ + if (aarch64_emit_approx_sqrt (operands[0], operands[1], false)) + DONE; +}) + +(define_insn "*sqrt<mode>2" [(set (match_operand:GPF 0 "register_operand" "=w") (sqrt:GPF (match_operand:GPF 1 "register_operand" "w")))] "TARGET_FLOAT" diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt index c637ff4..ffd5540 100644 --- a/gcc/config/aarch64/aarch64.opt +++ b/gcc/config/aarch64/aarch64.opt @@ -151,5 +151,10 @@ PC relative literal loads. mlow-precision-recip-sqrt Common Var(flag_mrecip_low_precision_sqrt) Optimization -When calculating the reciprocal square root approximation, -uses one less step than otherwise, thus reducing latency and precision. +When calculating the approximate reciprocal square root, +use one less step than otherwise, thus reducing latency and precision. + +mlow-precision-sqrt +Common Var(flag_mlow_precision_sqrt) Optimization +When calculating the approximate square root, +use one less step than otherwise, thus reducing latency and precision. diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index 4340b08..76b7a5c 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -574,6 +574,7 @@ Objective-C and Objective-C++ Dialects}. 
-mfix-cortex-a53-835769 -mno-fix-cortex-a53-835769 @gol -mfix-cortex-a53-843419 -mno-fix-cortex-a53-843419 @gol -mlow-precision-recip-sqrt -mno-low-precision-recip-sqrt@gol +-mlow-precision-sqrt -mno-low-precision-sqrt @gol -march=@var{name} -mcpu=@var{name} -mtune=@var{name}} @emph{Adapteva Epiphany Options} @@ -12941,6 +12942,15 @@ uses one less step than otherwise, thus reducing latency and precision. This is only relevant if @option{-ffast-math} enables the reciprocal square root approximation. +@item -mlow-precision-sqrt +@itemx -mno-low-precision-sqrt +@opindex mlow-precision-sqrt +@opindex mno-low-precision-sqrt +When calculating the square root approximation, +use one less step than otherwise, thus reducing latency and precision. +This is only relevant if @option{-ffast-math} enables the square root +approximation. + @item -march=@var{name} @opindex march Specify the name of the target architecture and, optionally, one or -- 2.6.3