On 03/17/16 15:09, Evandro Menezes wrote:
This patch implements FP division by an approximation using the Newton
series.
With this patch, DF division is sped up by over 100% and SF division,
zilch, both on A57 and on M1.
gcc/
* config/aarch64/aarch64-tuning-flags.def
(AARCH64_EXTRA_TUNE_APPROX_DIV_{SF,DF}: New tuning macros.
* config/aarch64/aarch64-protos.h
(AARCH64_EXTRA_TUNE_APPROX_DIV): New macro.
(aarch64_emit_approx_div): Declare new function.
* config/aarch64/aarch64.c
(aarch64_emit_approx_div): Define new function.
* config/aarch64/aarch64.md ("div<mode>3"): New expansion.
* config/aarch64/aarch64-simd.md ("div<mode>3"): Likewise.
This version of the patch cleans up the changes to the MD files and
optimizes the division when the numerator is 1.0.
Again, I look forward to your feedback.
Thank you,
--
Evandro Menezes
>From 5cd2a628086af3656b3242f0c4f41784646f52b1 Mon Sep 17 00:00:00 2001
From: Evandro Menezes <e.mene...@samsung.com>
Date: Thu, 17 Mar 2016 14:44:55 -0500
Subject: [PATCH] [AArch64] Emit division using the Newton series
2016-03-17 Evandro Menezes <e.mene...@samsung.com>
gcc/
* config/aarch64/aarch64-tuning-flags.def
(AARCH64_EXTRA_TUNE_APPROX_DIV_{SF,DF}: New tuning macros.
* config/aarch64/aarch64-protos.h
(AARCH64_EXTRA_TUNE_APPROX_DIV): New macro.
(aarch64_emit_approx_div): Declare new function.
* config/aarch64/aarch64.c
(aarch64_emit_approx_div): Define new function.
* config/aarch64/aarch64.md ("div<mode>3"): New expansion.
* config/aarch64/aarch64-simd.md ("div<mode>3"): Likewise.
---
gcc/config/aarch64/aarch64-protos.h | 4 ++
gcc/config/aarch64/aarch64-simd.md | 14 +++++-
gcc/config/aarch64/aarch64-tuning-flags.def | 3 +-
gcc/config/aarch64/aarch64.c | 73 +++++++++++++++++++++++++++++
gcc/config/aarch64/aarch64.md | 19 ++++++--
5 files changed, 107 insertions(+), 6 deletions(-)
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index dced209..52c4838 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -263,6 +263,9 @@ enum aarch64_extra_tuning_flags
};
#undef AARCH64_EXTRA_TUNING_OPTION
+#define AARCH64_EXTRA_TUNE_APPROX_DIV \
+ (AARCH64_EXTRA_TUNE_APPROX_DIV_DF | AARCH64_EXTRA_TUNE_APPROX_DIV_SF)
+
extern struct tune_params aarch64_tune_params;
HOST_WIDE_INT aarch64_initial_elimination_offset (unsigned, unsigned);
@@ -362,6 +365,7 @@ void aarch64_relayout_simd_types (void);
void aarch64_reset_previous_fndecl (void);
void aarch64_save_restore_target_globals (tree);
void aarch64_emit_approx_rsqrt (rtx, rtx);
+bool aarch64_emit_approx_div (rtx, rtx, rtx);
/* Initialize builtins for SIMD intrinsics. */
void init_aarch64_simd_builtins (void);
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index bd73bce..99be92e 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1509,7 +1509,19 @@
[(set_attr "type" "neon_fp_mul_<Vetype><q>")]
)
-(define_insn "div<mode>3"
+(define_expand "div<mode>3"
+ [(set (match_operand:VDQF 0 "register_operand")
+ (div:VDQF (match_operand:VDQF 1 "general_operand")
+ (match_operand:VDQF 2 "register_operand")))]
+ "TARGET_SIMD"
+{
+ if (aarch64_emit_approx_div (operands[0], operands[1], operands[2]))
+ DONE;
+
+ operands[1] = force_reg (<MODE>mode, operands[1]);
+})
+
+(define_insn "*div<mode>3"
[(set (match_operand:VDQF 0 "register_operand" "=w")
(div:VDQF (match_operand:VDQF 1 "register_operand" "w")
(match_operand:VDQF 2 "register_operand" "w")))]
diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def b/gcc/config/aarch64/aarch64-tuning-flags.def
index 7e45a0c..ececdc1 100644
--- a/gcc/config/aarch64/aarch64-tuning-flags.def
+++ b/gcc/config/aarch64/aarch64-tuning-flags.def
@@ -30,4 +30,5 @@
AARCH64_EXTRA_TUNING_OPTION ("rename_fma_regs", RENAME_FMA_REGS)
AARCH64_EXTRA_TUNING_OPTION ("approx_rsqrt", APPROX_RSQRT)
-
+AARCH64_EXTRA_TUNING_OPTION ("approx_div", APPROX_DIV_DF)
+AARCH64_EXTRA_TUNING_OPTION ("approx_divf", APPROX_DIV_SF)
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 12e498d..2c878ce 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -7540,6 +7540,79 @@ aarch64_emit_approx_rsqrt (rtx dst, rtx src)
emit_move_insn (dst, x0);
}
+/* Emit the instruction sequence to compute the approximation for a reciprocal. */
+
+bool
+aarch64_emit_approx_div (rtx quo, rtx num, rtx div)
+{
+ machine_mode mode = GET_MODE (quo);
+
+ if (!flag_finite_math_only
+ || flag_trapping_math
+ || !flag_unsafe_math_optimizations
+ || optimize_function_for_size_p (cfun)
+ || ((GET_MODE_INNER (mode) != SFmode
+ || !(aarch64_tune_params.extra_tuning_flags
+ & AARCH64_EXTRA_TUNE_APPROX_DIV_SF))
+ && (GET_MODE_INNER (mode) != DFmode
+ || !(aarch64_tune_params.extra_tuning_flags
+ & AARCH64_EXTRA_TUNE_APPROX_DIV_DF))))
+ return false;
+
+ /* Estimate the approximate reciprocal. */
+ rtx xrcp = gen_reg_rtx (mode);
+ switch (mode)
+ {
+ case SFmode:
+ emit_insn (gen_aarch64_frecpesf (xrcp, div)); break;
+ case V2SFmode:
+ emit_insn (gen_aarch64_frecpev2sf (xrcp, div)); break;
+ case V4SFmode:
+ emit_insn (gen_aarch64_frecpev4sf (xrcp, div)); break;
+ case DFmode:
+ emit_insn (gen_aarch64_frecpedf (xrcp, div)); break;
+ case V2DFmode:
+ emit_insn (gen_aarch64_frecpev2df (xrcp, div)); break;
+ default:
+ gcc_unreachable ();
+ }
+
+ /* Iterate over the series twice for SF and thrice for DF. */
+ int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
+
+ rtx xtmp = gen_reg_rtx (mode);
+ while (iterations--)
+ {
+ switch (mode)
+ {
+ case SFmode:
+ emit_insn (gen_aarch64_frecpssf (xtmp, xrcp, div)); break;
+ case V2SFmode:
+ emit_insn (gen_aarch64_frecpsv2sf (xtmp, xrcp, div)); break;
+ case V4SFmode:
+ emit_insn (gen_aarch64_frecpsv4sf (xtmp, xrcp, div)); break;
+ case DFmode:
+ emit_insn (gen_aarch64_frecpsdf (xtmp, xrcp, div)); break;
+ case V2DFmode:
+ emit_insn (gen_aarch64_frecpsv2df (xtmp, xrcp, div)); break;
+ default:
+ gcc_unreachable ();
+ }
+
+ emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
+ }
+
+ if (num != CONST1_RTX (mode))
+ {
+ rtx xnum = force_reg (mode, num);
+ emit_set_insn (quo, gen_rtx_MULT (mode, xnum, xrcp));
+ }
+ else
+ emit_move_insn (quo, xrcp);
+
+ return true;
+}
+
/* Return the number of instructions that can be issued per cycle. */
static int
aarch64_sched_issue_rate (void)
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 68676c9..985915e 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -4647,11 +4647,22 @@
[(set_attr "type" "fmul<s>")]
)
-(define_insn "div<mode>3"
+(define_expand "div<mode>3"
+ [(set (match_operand:GPF 0 "register_operand")
+ (div:GPF (match_operand:GPF 1 "general_operand")
+ (match_operand:GPF 2 "register_operand")))]
+ "TARGET_SIMD"
+{
+ if (aarch64_emit_approx_div (operands[0], operands[1], operands[2]))
+ DONE;
+
+ operands[1] = force_reg (<MODE>mode, operands[1]);
+})
+
+(define_insn "*div<mode>3"
[(set (match_operand:GPF 0 "register_operand" "=w")
- (div:GPF
- (match_operand:GPF 1 "register_operand" "w")
- (match_operand:GPF 2 "register_operand" "w")))]
+ (div:GPF (match_operand:GPF 1 "register_operand" "w")
+ (match_operand:GPF 2 "register_operand" "w")))]
"TARGET_FLOAT"
"fdiv\\t%<s>0, %<s>1, %<s>2"
[(set_attr "type" "fdiv<s>")]
--
1.9.1