* config/aarch64/aarch64-builtins.c: Builtins for rsqrt and rsqrtf. * config/aarch64/aarch64-opts.h: -mrecip has a default value depending on the core. * config/aarch64/aarch64-protos.h: Declare aarch64_emit_swrsqrt and aarch64_builtin_rsqrt. * config/aarch64/aarch64-simd.md: Matching expressions for frsqrte and frsqrts. * config/aarch64/aarch64-tuning-flags.def: Added MRECIP_DEFAULT_ENABLED. * config/aarch64/aarch64.c: New functions. Emit rsqrt estimation code in fast math mode. * config/aarch64/aarch64.md: Added enum entries. * config/aarch64/aarch64.opt: Added options -mrecip and -mlow-precision-recip-sqrt. * testsuite/gcc.target/aarch64/rsqrt-asm-check.c: Assembly scans for frsqrte and frsqrts. * testsuite/gcc.target/aarch64/rsqrt.c: Functional tests for rsqrt.
Signed-off-by: Philipp Tomsich <philipp.toms...@theobroma-systems.com> --- gcc/ChangeLog | 21 ++++ gcc/config/aarch64/aarch64-builtins.c | 107 +++++++++++++++++++ gcc/config/aarch64/aarch64-opts.h | 7 ++ gcc/config/aarch64/aarch64-protos.h | 3 + gcc/config/aarch64/aarch64-simd.md | 27 +++++ gcc/config/aarch64/aarch64-tuning-flags.def | 1 + gcc/config/aarch64/aarch64.c | 113 ++++++++++++++++++++- gcc/config/aarch64/aarch64.md | 3 + gcc/config/aarch64/aarch64.opt | 8 ++ gcc/doc/invoke.texi | 19 ++++ gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check.c | 63 ++++++++++++ gcc/testsuite/gcc.target/aarch64/rsqrt.c | 107 +++++++++++++++++++ 12 files changed, 474 insertions(+), 5 deletions(-) create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check.c create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt.c diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 77fb2c1..382f6b3 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,24 @@ +2015-09-03 Benedikt Huber <benedikt.hu...@theobroma-systems.com> + Philipp Tomsich <philipp.toms...@theobroma-systems.com> + + * config/aarch64/aarch64-builtins.c: Builtins for rsqrt and + rsqrtf. + * config/aarch64/aarch64-opts.h: -mrecip has a default value + depending on the core. + * config/aarch64/aarch64-protos.h: Declare. + * config/aarch64/aarch64-simd.md: Matching expressions for + frsqrte and frsqrts. + * config/aarch64/aarch64-tuning-flags.def: Added + MRECIP_DEFAULT_ENABLED. + * config/aarch64/aarch64.c: New functions. Emit rsqrt + estimation code in fast math mode. + * config/aarch64/aarch64.md: Added enum entries. + * config/aarch64/aarch64.opt: Added options -mrecip and + -mlow-precision-recip-sqrt. + * testsuite/gcc.target/aarch64/rsqrt-asm-check.c: Assembly scans + for frsqrte and frsqrts + * testsuite/gcc.target/aarch64/rsqrt.c: Functional tests for rsqrt. 
+ 2015-09-02 Charles Baylis <charles.bay...@linaro.org> * cgraphunit.c (cgraph_node::create_wrapper): Set can_throw_external diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c index e3a90b5..729f384 100644 --- a/gcc/config/aarch64/aarch64-builtins.c +++ b/gcc/config/aarch64/aarch64-builtins.c @@ -337,6 +337,11 @@ enum aarch64_builtins AARCH64_BUILTIN_GET_FPSR, AARCH64_BUILTIN_SET_FPSR, + AARCH64_BUILTIN_RSQRT_DF, + AARCH64_BUILTIN_RSQRT_SF, + AARCH64_BUILTIN_RSQRT_V2DF, + AARCH64_BUILTIN_RSQRT_V2SF, + AARCH64_BUILTIN_RSQRT_V4SF, AARCH64_SIMD_BUILTIN_BASE, AARCH64_SIMD_BUILTIN_LANE_CHECK, #include "aarch64-simd-builtins.def" @@ -835,6 +840,43 @@ aarch64_init_crc32_builtins () } } +/* Add builtins for reciprocal square root. */ +void +aarch64_add_builtin_rsqrt (void) +{ + tree fndecl = NULL; + tree ftype = NULL; + + tree V2SF_type_node = build_vector_type (float_type_node, 2); + tree V2DF_type_node = build_vector_type (double_type_node, 2); + tree V4SF_type_node = build_vector_type (float_type_node, 4); + + ftype = build_function_type_list (double_type_node, double_type_node, NULL_TREE); + fndecl = add_builtin_function ("__builtin_aarch64_rsqrt_df", + ftype, AARCH64_BUILTIN_RSQRT_DF, BUILT_IN_MD, NULL, NULL_TREE); + aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_DF] = fndecl; + + ftype = build_function_type_list (float_type_node, float_type_node, NULL_TREE); + fndecl = add_builtin_function ("__builtin_aarch64_rsqrt_sf", + ftype, AARCH64_BUILTIN_RSQRT_SF, BUILT_IN_MD, NULL, NULL_TREE); + aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_SF] = fndecl; + + ftype = build_function_type_list (V2DF_type_node, V2DF_type_node, NULL_TREE); + fndecl = add_builtin_function ("__builtin_aarch64_rsqrt_v2df", + ftype, AARCH64_BUILTIN_RSQRT_V2DF, BUILT_IN_MD, NULL, NULL_TREE); + aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V2DF] = fndecl; + + ftype = build_function_type_list (V2SF_type_node, V2SF_type_node, NULL_TREE); + fndecl = add_builtin_function 
("__builtin_aarch64_rsqrt_v2sf", + ftype, AARCH64_BUILTIN_RSQRT_V2SF, BUILT_IN_MD, NULL, NULL_TREE); + aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V2SF] = fndecl; + + ftype = build_function_type_list (V4SF_type_node, V4SF_type_node, NULL_TREE); + fndecl = add_builtin_function ("__builtin_aarch64_rsqrt_v4sf", + ftype, AARCH64_BUILTIN_RSQRT_V4SF, BUILT_IN_MD, NULL, NULL_TREE); + aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V4SF] = fndecl; +} + void aarch64_init_builtins (void) { @@ -866,6 +908,7 @@ aarch64_init_builtins (void) aarch64_init_simd_builtins (); aarch64_init_crc32_builtins (); + aarch64_add_builtin_rsqrt (); } tree @@ -1129,6 +1172,40 @@ aarch64_crc32_expand_builtin (int fcode, tree exp, rtx target) return target; } +/* Function to expand reciprocal square root builtins. */ +static rtx +aarch64_expand_builtin_rsqrt (int fcode, tree exp, rtx target) +{ + rtx pat; + tree arg0 = CALL_EXPR_ARG (exp, 0); + rtx op0 = expand_normal (arg0); + + enum insn_code c; + + switch (fcode) + { + case AARCH64_BUILTIN_RSQRT_DF: + c = CODE_FOR_rsqrt_df2; break; + case AARCH64_BUILTIN_RSQRT_SF: + c = CODE_FOR_rsqrt_sf2; break; + case AARCH64_BUILTIN_RSQRT_V2DF: + c = CODE_FOR_rsqrt_v2df2; break; + case AARCH64_BUILTIN_RSQRT_V2SF: + c = CODE_FOR_rsqrt_v2sf2; break; + case AARCH64_BUILTIN_RSQRT_V4SF: + c = CODE_FOR_rsqrt_v4sf2; break; + default: gcc_unreachable (); + } + + if (!target) + target = gen_reg_rtx (GET_MODE (op0)); + + pat = GEN_FCN (c) (target, op0); + emit_insn (pat); + + return target; +} + /* Expand an expression EXP that calls a built-in function, with result going to TARGET if that's convenient. 
*/ rtx @@ -1176,6 +1253,13 @@ aarch64_expand_builtin (tree exp, else if (fcode >= AARCH64_CRC32_BUILTIN_BASE && fcode <= AARCH64_CRC32_BUILTIN_MAX) return aarch64_crc32_expand_builtin (fcode, exp, target); + if (fcode == AARCH64_BUILTIN_RSQRT_DF + || fcode == AARCH64_BUILTIN_RSQRT_SF + || fcode == AARCH64_BUILTIN_RSQRT_V2DF + || fcode == AARCH64_BUILTIN_RSQRT_V2SF + || fcode == AARCH64_BUILTIN_RSQRT_V4SF) + return aarch64_expand_builtin_rsqrt (fcode, exp, target); + gcc_unreachable (); } @@ -1333,6 +1417,29 @@ aarch64_builtin_vectorized_function (tree fndecl, tree type_out, tree type_in) return NULL_TREE; } +/* Return builtin for reciprocal square root. */ +tree +aarch64_builtin_rsqrt (unsigned int fn, bool md_fn) +{ + if (md_fn) + { + if (fn == AARCH64_SIMD_BUILTIN_UNOP_sqrtv2df) + return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V2DF]; + if (fn == AARCH64_SIMD_BUILTIN_UNOP_sqrtv2sf) + return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V2SF]; + if (fn == AARCH64_SIMD_BUILTIN_UNOP_sqrtv4sf) + return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V4SF]; + } + else + { + if (fn == BUILT_IN_SQRT) + return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_DF]; + if (fn == BUILT_IN_SQRTF) + return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_SF]; + } + return NULL_TREE; +} + #undef VAR1 #define VAR1(T, N, MAP, A) \ case AARCH64_SIMD_BUILTIN_##T##_##N##A: diff --git a/gcc/config/aarch64/aarch64-opts.h b/gcc/config/aarch64/aarch64-opts.h index bf6bb7b..f8e79cb 100644 --- a/gcc/config/aarch64/aarch64-opts.h +++ b/gcc/config/aarch64/aarch64-opts.h @@ -73,4 +73,11 @@ enum aarch64_code_model { AARCH64_CMODEL_LARGE }; +/* Each core can have -mrecip enabled or disabled by default. 
*/ +enum aarch64_mrecip { + AARCH64_MRECIP_OFF = 0, + AARCH64_MRECIP_ON, + AARCH64_MRECIP_DEFAULT, +}; + #endif diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h index ff19851..767050a 100644 --- a/gcc/config/aarch64/aarch64-protos.h +++ b/gcc/config/aarch64/aarch64-protos.h @@ -347,6 +347,8 @@ void aarch64_register_pragmas (void); void aarch64_relayout_simd_types (void); void aarch64_reset_previous_fndecl (void); +void aarch64_emit_swrsqrt (rtx, rtx); + /* Initialize builtins for SIMD intrinsics. */ void init_aarch64_simd_builtins (void); @@ -412,4 +414,5 @@ int aarch64_ccmp_mode_to_code (enum machine_mode mode); bool extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset); bool aarch64_operands_ok_for_ldpstp (rtx *, bool, enum machine_mode); bool aarch64_operands_adjust_ok_for_ldpstp (rtx *, bool, enum machine_mode); +tree aarch64_builtin_rsqrt (unsigned int fn, bool md_fn); #endif /* GCC_AARCH64_PROTOS_H */ diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 75fa0ab..627d2ae 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -353,6 +353,33 @@ [(set_attr "type" "neon_fp_mul_d_scalar_q")] ) +(define_insn "rsqrte_<mode>2" + [(set (match_operand:VALLF 0 "register_operand" "=w") + (unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")] + UNSPEC_RSQRTE))] + "TARGET_SIMD" + "frsqrte\\t%<v>0<Vmtype>, %<v>1<Vmtype>" + [(set_attr "type" "neon_fp_rsqrte_<Vetype><q>")]) + +(define_insn "rsqrts_<mode>3" + [(set (match_operand:VALLF 0 "register_operand" "=w") + (unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w") + (match_operand:VALLF 2 "register_operand" "w")] + UNSPEC_RSQRTS))] + "TARGET_SIMD" + "frsqrts\\t%<v>0<Vmtype>, %<v>1<Vmtype>, %<v>2<Vmtype>" + [(set_attr "type" "neon_fp_rsqrts_<Vetype><q>")]) + +(define_expand "rsqrt_<mode>2" + [(set (match_operand:VALLF 0 "register_operand" "=w") + (unspec:VALLF [(match_operand:VALLF 1 
"register_operand" "w")] + UNSPEC_RSQRT))] + "TARGET_SIMD" +{ + aarch64_emit_swrsqrt (operands[0], operands[1]); + DONE; +}) + (define_insn "*aarch64_mul3_elt_to_64v2df" [(set (match_operand:DF 0 "register_operand" "=w") (mult:DF diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def b/gcc/config/aarch64/aarch64-tuning-flags.def index 628386b..300dab6 100644 --- a/gcc/config/aarch64/aarch64-tuning-flags.def +++ b/gcc/config/aarch64/aarch64-tuning-flags.def @@ -29,4 +29,5 @@ AARCH64_TUNE_ to give an enum name. */ AARCH64_EXTRA_TUNING_OPTION ("rename_fma_regs", RENAME_FMA_REGS) +AARCH64_EXTRA_TUNING_OPTION ("mrecip_default_enabled", MRECIP_DEFAULT_ENABLED) diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index bc612e4..247bdaf 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -364,7 +364,7 @@ static const struct tune_params generic_tunings = 1, /* vec_reassoc_width. */ 2, /* min_div_recip_mul_sf. */ 2, /* min_div_recip_mul_df. */ - (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */ + (AARCH64_EXTRA_TUNE_MRECIP_DEFAULT_ENABLED) /* tune_flags. */ }; static const struct tune_params cortexa53_tunings = @@ -386,7 +386,7 @@ static const struct tune_params cortexa53_tunings = 1, /* vec_reassoc_width. */ 2, /* min_div_recip_mul_sf. */ 2, /* min_div_recip_mul_df. */ - (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */ + (AARCH64_EXTRA_TUNE_MRECIP_DEFAULT_ENABLED) /* tune_flags. */ }; static const struct tune_params cortexa57_tunings = @@ -408,7 +408,8 @@ static const struct tune_params cortexa57_tunings = 1, /* vec_reassoc_width. */ 2, /* min_div_recip_mul_sf. */ 2, /* min_div_recip_mul_df. */ - (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS) /* tune_flags. */ + (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS /* tune_flags. */ + | AARCH64_EXTRA_TUNE_MRECIP_DEFAULT_ENABLED) }; static const struct tune_params cortexa72_tunings = @@ -430,7 +431,7 @@ static const struct tune_params cortexa72_tunings = 1, /* vec_reassoc_width. */ 2, /* min_div_recip_mul_sf. 
*/ 2, /* min_div_recip_mul_df. */ - (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */ + (AARCH64_EXTRA_TUNE_MRECIP_DEFAULT_ENABLED) /* tune_flags. */ }; static const struct tune_params thunderx_tunings = @@ -472,7 +473,7 @@ static const struct tune_params xgene1_tunings = 1, /* vec_reassoc_width. */ 2, /* min_div_recip_mul_sf. */ 2, /* min_div_recip_mul_df. */ - (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */ + (AARCH64_EXTRA_TUNE_MRECIP_DEFAULT_ENABLED) /* tune_flags. */ }; /* Support for fine-grained override of the tuning structures. */ @@ -7121,6 +7122,105 @@ aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED, return aarch64_tune_params.memmov_cost; } +/* Function to decide when to use + * reciprocal square root builtins. */ +static tree +aarch64_builtin_reciprocal (unsigned int fn, + bool md_fn, + bool) +{ + if (!flag_finite_math_only + || flag_trapping_math + || !flag_unsafe_math_optimizations + || optimize_size + || flag_mrecip == AARCH64_MRECIP_OFF + || (flag_mrecip == AARCH64_MRECIP_DEFAULT + && !(aarch64_tune_params.extra_tuning_flags + & AARCH64_EXTRA_TUNE_MRECIP_DEFAULT_ENABLED))) + { + return NULL_TREE; + } + + return aarch64_builtin_rsqrt (fn, md_fn); +} + +typedef rtx (*rsqrte_type) (rtx, rtx); + +/* Select reciprocal square root initial estimate + * insn depending on machine mode. */ +rsqrte_type get_rsqrte_type (enum machine_mode mode) +{ + switch (mode) + { + case DFmode: return gen_rsqrte_df2; + case SFmode: return gen_rsqrte_sf2; + case V2DFmode: return gen_rsqrte_v2df2; + case V2SFmode: return gen_rsqrte_v2sf2; + case V4SFmode: return gen_rsqrte_v4sf2; + default: gcc_unreachable (); + } +} + +typedef rtx (*rsqrts_type) (rtx, rtx, rtx); + +/* Select reciprocal square root Newton-Raphson step + * insn depending on machine mode. 
*/ +rsqrts_type get_rsqrts_type (enum machine_mode mode) +{ + switch (mode) + { + case DFmode: return gen_rsqrts_df3; + case SFmode: return gen_rsqrts_sf3; + case V2DFmode: return gen_rsqrts_v2df3; + case V2SFmode: return gen_rsqrts_v2sf3; + case V4SFmode: return gen_rsqrts_v4sf3; + default: gcc_unreachable (); + } +} + +/* Emit instruction sequence to compute + * reciprocal square root. Use two Newton-Raphson steps + * for single precision and three for double precision. */ +void +aarch64_emit_swrsqrt (rtx dst, rtx src) +{ + enum machine_mode mode = GET_MODE (src); + gcc_assert ( + mode == SFmode || mode == V2SFmode || mode == V4SFmode || + mode == DFmode || mode == V2DFmode); + + rtx xsrc = gen_reg_rtx (mode); + emit_move_insn (xsrc, src); + rtx x0 = gen_reg_rtx (mode); + + emit_insn ((*get_rsqrte_type (mode)) (x0, xsrc)); + + bool double_mode = (mode == DFmode || mode == V2DFmode); + + int iterations = 2; + if (double_mode) + iterations = 3; + + if (flag_mrecip_low_precision_sqrt) + iterations--; + + for (int i = 0; i < iterations; ++i) + { + rtx x1 = gen_reg_rtx (mode); + rtx x2 = gen_reg_rtx (mode); + rtx x3 = gen_reg_rtx (mode); + emit_set_insn (x2, gen_rtx_MULT (mode, x0, x0)); + + emit_insn ((*get_rsqrts_type (mode)) (x3, xsrc, x2)); + + emit_set_insn (x1, gen_rtx_MULT (mode, x0, x3)); + x0 = x1; + } + + emit_move_insn (dst, x0); + return; +} + /* Return the number of instructions that can be issued per cycle. 
*/ static int aarch64_sched_issue_rate (void) @@ -13375,6 +13475,9 @@ aarch64_promoted_type (const_tree t) #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p +#undef TARGET_BUILTIN_RECIPROCAL +#define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal + #undef TARGET_VECTOR_MODE_SUPPORTED_P #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index 2522982..23077a6 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -126,6 +126,9 @@ UNSPEC_VSTRUCTDUMMY UNSPEC_SP_SET UNSPEC_SP_TEST + UNSPEC_RSQRT + UNSPEC_RSQRTE + UNSPEC_RSQRTS ]) (define_c_enum "unspecv" [ diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt index 8642bdb..a9393a3 100644 --- a/gcc/config/aarch64/aarch64.opt +++ b/gcc/config/aarch64/aarch64.opt @@ -144,3 +144,11 @@ Enum(aarch64_abi) String(ilp32) Value(AARCH64_ABI_ILP32) EnumValue Enum(aarch64_abi) String(lp64) Value(AARCH64_ABI_LP64) + +mrecip +Common Report Var(flag_mrecip) Optimization Init(AARCH64_MRECIP_DEFAULT) +Generate software reciprocal square root for better throughput. + +mlow-precision-recip-sqrt +Common Var(flag_mrecip_low_precision_sqrt) Optimization +Run fewer approximation steps to reduce latency and precision. diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index e7bbcec..66544f7 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -517,6 +517,8 @@ Objective-C and Objective-C++ Dialects}. -mtls-size=@var{size} @gol -mfix-cortex-a53-835769 -mno-fix-cortex-a53-835769 @gol -mfix-cortex-a53-843419 -mno-fix-cortex-a53-843419 @gol +-mrecip -mno-recip @gol +-mlow-precision-recip-sqrt -mno-low-precision-recip-sqrt@gol -march=@var{name} -mcpu=@var{name} -mtune=@var{name}} @emph{Adapteva Epiphany Options} @@ -12342,6 +12344,23 @@ Enable or disable the workaround for the ARM Cortex-A53 erratum number 843419. 
This erratum workaround is made at link time and this will only pass the corresponding flag to the linker. +@item -mrecip +@itemx -mno-recip +@opindex mrecip +@opindex mno-recip +This option enables use of the +reciprocal square root estimate instructions with additional +Newton-Raphson steps to increase precision instead of doing a square root and +divide for floating-point arguments. +It can only be used together with @option{-ffast-math}. + +@item -mlow-precision-recip-sqrt +@itemx -mno-low-precision-recip-sqrt +@opindex mlow-precision-recip-sqrt +@opindex mno-low-precision-recip-sqrt +The reciprocal square root estimate uses two steps instead of three for double-precision, +and one step instead of two for single-precision, thus reducing both latency and precision. + @item -march=@var{name} @opindex march Specify the name of the target architecture, optionally suffixed by one or diff --git a/gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check.c b/gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check.c new file mode 100644 index 0000000..d6cfe11 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check.c @@ -0,0 +1,63 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 --save-temps -fverbose-asm -ffast-math -mrecip" } */ + +#include <math.h> + +#define sqrt_float __builtin_sqrtf +#define sqrt_double __builtin_sqrt + +#define TESTTYPE(TYPE) \ +typedef struct { \ + TYPE a; \ + TYPE b; \ + TYPE c; \ + TYPE d; \ +} s4_##TYPE; \ + \ +typedef struct { \ + TYPE a; \ + TYPE b; \ +} s2_##TYPE; \ + \ +s4_##TYPE rsqrtv4_##TYPE (s4_##TYPE i) \ +{ \ + s4_##TYPE o; \ + o.a = 1.0 / sqrt_##TYPE (i.a); \ + o.b = 1.0 / sqrt_##TYPE (i.b); \ + o.c = 1.0 / sqrt_##TYPE (i.c); \ + o.d = 1.0 / sqrt_##TYPE (i.d); \ + return o; \ +} \ + \ +s2_##TYPE rsqrtv2_##TYPE (s2_##TYPE i) \ +{ \ + s2_##TYPE o; \ + o.a = 1.0 / sqrt_##TYPE (i.a); \ + o.b = 1.0 / sqrt_##TYPE (i.b); \ + return o; \ +} \ + \ +TYPE rsqrt_##TYPE (TYPE i) \ +{ \ + return 1.0 / sqrt_##TYPE (i); \ +} \ + \ + +TESTTYPE(double) 
+TESTTYPE(float) + +/* { dg-final { scan-assembler-times "frsqrte\\td\[0-9\]+, d\[0-9\]+" 1 } } */ +/* { dg-final { scan-assembler-times "frsqrts\\td\[0-9\]+, d\[0-9\]+, d\[0-9\]+" 3 } } */ + +/* { dg-final { scan-assembler-times "frsqrte\\tv\[0-9\]+.2d, v\[0-9\]+.2d" 3 } } */ +/* { dg-final { scan-assembler-times "frsqrts\\tv\[0-9\]+.2d, v\[0-9\]+.2d, v\[0-9\]+.2d" 9 } } */ + + +/* { dg-final { scan-assembler-times "frsqrte\\ts\[0-9\]+, s\[0-9\]+" 1 } } */ +/* { dg-final { scan-assembler-times "frsqrts\\ts\[0-9\]+, s\[0-9\]+, s\[0-9\]+" 2 } } */ + +/* { dg-final { scan-assembler-times "frsqrte\\tv\[0-9\]+.4s, v\[0-9\]+.4s" 1 } } */ +/* { dg-final { scan-assembler-times "frsqrts\\tv\[0-9\]+.4s, v\[0-9\]+.4s, v\[0-9\]+.4s" 2 } } */ + +/* { dg-final { scan-assembler-times "frsqrte\\tv\[0-9\]+.2s, v\[0-9\]+.2s" 1 } } */ +/* { dg-final { scan-assembler-times "frsqrts\\tv\[0-9\]+.2s, v\[0-9\]+.2s, v\[0-9\]+.2s" 2 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/rsqrt.c b/gcc/testsuite/gcc.target/aarch64/rsqrt.c new file mode 100644 index 0000000..4a5c008 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/rsqrt.c @@ -0,0 +1,107 @@ +/* { dg-do run } */ +/* { dg-options "-O3 --save-temps -fverbose-asm -ffast-math -mrecip" } */ + +#include <math.h> +#include <stdio.h> + +#include <float.h> + +#define PI 3.141592653589793 +#define SQRT2 1.4142135623730951 + +#define PI_4 0.7853981633974483 +#define SQRT1_2 0.7071067811865475 + +/* 2^25+1, float has 24 significand bits + * according to Single-precision floating-point format. */ +#define TESTA8_FLT 33554433 +/* 2^54+1, double has 53 significand bits + * according to Double-precision floating-point format. 
*/ +#define TESTA8_DBL 18014398509481985 + +#define SD(a, b) t_double ((#a), (a), (b)); +#define SF(a, b) t_float ((#a), (a), (b)); + +#define EPSILON_double __DBL_EPSILON__ +#define EPSILON_float __FLT_EPSILON__ +#define ABS_double __builtin_fabs +#define ABS_float __builtin_fabsf +#define SQRT_double __builtin_sqrt +#define SQRT_float __builtin_sqrtf + +extern void abort (void); + +#define TESTTYPE(TYPE) \ +TYPE rsqrt_##TYPE (TYPE a) \ +{ \ + return 1.0/SQRT_##TYPE(a); \ +} \ + \ +int equals_##TYPE (TYPE a, TYPE b) \ +{ \ + return (a == b || \ + (isnan (a) && isnan (b)) || \ + (ABS_##TYPE (a - b) < EPSILON_##TYPE)); \ +} \ + \ +void t_##TYPE (const char *s, TYPE a, TYPE result) \ +{ \ + TYPE r = rsqrt_##TYPE (a); \ + if (!equals_##TYPE (r, result)) \ + { \ + abort (); \ + } \ +} \ + +// printf ("Problem in %20s: %30.18A should be %30.18A\n", s, r, result); \ + +TESTTYPE(double) +TESTTYPE(float) + +int main () +{ + SD( 1.0/256, 0X1.00000000000000P+4 ); + SD( 1.0, 0X1.00000000000000P+0 ); + SD( -1.0, NAN); + SD( 11.0, 0X1.34BF63D1568260P-2 ); + SD( 0.0, INFINITY); + SD( INFINITY, 0X0.00000000000000P+0 ); + SD( NAN, NAN); + SD( -NAN, -NAN); + SD( DBL_MAX, 0X1.00000000000010P-512); + SD( DBL_MIN, 0X1.00000000000000P+511); + SD( PI, 0X1.20DD750429B6D0P-1 ); + SD( PI_4, 0X1.20DD750429B6D0P+0 ); + SD( SQRT2, 0X1.AE89F995AD3AE0P-1 ); + SD( SQRT1_2, 0X1.306FE0A31B7150P+0 ); + SD( -PI, NAN); + SD( -SQRT2, NAN); + SD( TESTA8_DBL, 0X1.00000000000000P-27 ); + + SF( 1.0/256, 0X1.00000000000000P+4 ); + SF( 1.0, 0X1.00000000000000P+0 ); + SF( -1.0, NAN); + SF( 11.0, 0X1.34BF6400000000P-2 ); + SF( 0.0, INFINITY); + SF( INFINITY, 0X0.00000000000000P+0 ); + SF( NAN, NAN); + SF( -NAN, -NAN); + SF( FLT_MAX, 0X1.00000200000000P-64 ); + SF( FLT_MIN, 0X1.00000000000000P+63 ); + SF( PI, 0X1.20DD7400000000P-1 ); + SF( PI_4, 0X1.20DD7400000000P+0 ); + SF( SQRT2, 0X1.AE89FA00000000P-1 ); + SF( SQRT1_2, 0X1.306FE000000000P+0 ); + SF( -PI, NAN); + SF( -SQRT2, NAN); + SF( TESTA8_FLT, 
0X1.6A09E600000000P-13 ); + +// With -ffast-math these return positive INF. +// SD( -0.0, -INFINITY); +// SF( -0.0, -INFINITY); +// The reason here is that -ffast-math flushes to zero. +// SD(DBL_MIN/256, 0X1.00000000000000P+515); +// SF(FLT_MIN/256, 0X1.00000000000000P+67 ); + + return 0; +} -- 1.9.1