On 04/01/16 18:08, Wilco Dijkstra wrote:
Evandro Menezes wrote:
I hope that this gets in the ballpark of what's been discussed previously.
Yes that's very close to what I had in mind. A minor issue is that the vector
modes cannot work as they start at MAX_MODE_FLOAT (which is > 32):
+/* Control approximate alternatives to certain FP operators. */
+#define AARCH64_APPROX_MODE(MODE) \
+ ((MIN_MODE_FLOAT <= (MODE) && (MODE) <= MAX_MODE_FLOAT) \
+ ? (1 << ((MODE) - MIN_MODE_FLOAT)) \
+ : (MIN_MODE_VECTOR_FLOAT <= (MODE) && (MODE) <= MAX_MODE_VECTOR_FLOAT) \
+ ? (1 << ((MODE) - MIN_MODE_VECTOR_FLOAT + MAX_MODE_FLOAT + 1)) \
+ : (0))
That should be:
+ ? (1 << ((MODE) - MIN_MODE_VECTOR_FLOAT + MAX_MODE_FLOAT - MIN_MODE_FLOAT + 1)) \
It would be worth testing all the obvious cases to be sure they work.
Also, I don't think it is a good idea to enable all modes on Exynos-M1 and
XGene-1 - I haven't seen any evidence that shows it gives a speedup on real
code for all modes (or at least on a good micro benchmark like the unit vector
test I suggested - a simple throughput test does not count!).
This approximation does benefit M1 in general across several
benchmarks. As for my choice for Xgene1, it preserves the original
setting. I believe that, with this more granular option, developers can
fine tune their targets.
The issue is that it hides performance gains from an improved divider/sqrt on
new revisions or microarchitectures. That means you should only enable cases
where there is evidence of a major speedup that cannot be matched by a future
improved divider/sqrt.
I did notice that, in some benchmarks with heavy use of multiplication or
multiply-accumulation, the series may be detrimental, since it increases
the competition for the unit(s) that perform such operations.
But those micro-architectures that get a better unit for division or
sqrt() are free to add their own tuning parameters. Granted, I assume
that running legacy code is not much of an issue in only a few markets.
Thank you,
--
Evandro Menezes
>From 63a39df80104c504ffdfba698aab9dc2f73221a1 Mon Sep 17 00:00:00 2001
From: Evandro Menezes <e.mene...@samsung.com>
Date: Thu, 3 Mar 2016 18:13:46 -0600
Subject: [PATCH 1/2] [AArch64] Add more choices for the reciprocal square root
approximation
Allow a target to prefer such operation depending on the operation mode.
gcc/
* config/aarch64/aarch64-protos.h
(AARCH64_APPROX_MODE): New macro.
(AARCH64_APPROX_{NONE,SP,DP,DFORM,QFORM,SCALAR,VECTOR,ALL}): Likewise.
(tune_params): New member "approx_rsqrt_modes".
* config/aarch64/aarch64-tuning-flags.def
(AARCH64_EXTRA_TUNE_APPROX_RSQRT): Remove macro.
* config/aarch64/aarch64.c
(generic_tunings): New member "approx_rsqrt_modes".
(cortexa35_tunings): Likewise.
(cortexa53_tunings): Likewise.
(cortexa57_tunings): Likewise.
(cortexa72_tunings): Likewise.
(exynosm1_tunings): Likewise.
(thunderx_tunings): Likewise.
(xgene1_tunings): Likewise.
(use_rsqrt_p): New argument for the mode and use new member
"approx_rsqrt_modes" from "tune_params".
(aarch64_builtin_reciprocal): Devise mode from builtin.
(aarch64_optab_supported_p): New argument for the mode.
---
gcc/config/aarch64/aarch64-protos.h | 30 ++++++++++++++++++++++
gcc/config/aarch64/aarch64-tuning-flags.def | 2 --
gcc/config/aarch64/aarch64.c | 39 ++++++++++++++++++-----------
3 files changed, 55 insertions(+), 16 deletions(-)
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 58c9d0d..a31ee35 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -178,6 +178,32 @@ struct cpu_branch_cost
const int unpredictable; /* Unpredictable branch or optimizing for speed. */
};
+/* Control approximate alternatives to certain FP operators. */
+#define AARCH64_APPROX_MODE(MODE) \
+ ((MIN_MODE_FLOAT <= (MODE) && (MODE) <= MAX_MODE_FLOAT) \
+ ? (1 << ((MODE) - MIN_MODE_FLOAT)) \
+ : (MIN_MODE_VECTOR_FLOAT <= (MODE) && (MODE) <= MAX_MODE_VECTOR_FLOAT) \
+ ? (1 << ((MODE) - MIN_MODE_VECTOR_FLOAT \
+ + MAX_MODE_FLOAT - MIN_MODE_FLOAT + 1)) \
+ : (0))
+#define AARCH64_APPROX_NONE (0)
+#define AARCH64_APPROX_SP (AARCH64_APPROX_MODE (SFmode) \
+ | AARCH64_APPROX_MODE (V2SFmode) \
+ | AARCH64_APPROX_MODE (V4SFmode))
+#define AARCH64_APPROX_DP (AARCH64_APPROX_MODE (DFmode) \
+ | AARCH64_APPROX_MODE (V2DFmode))
+#define AARCH64_APPROX_DFORM (AARCH64_APPROX_MODE (SFmode) \
+ | AARCH64_APPROX_MODE (DFmode) \
+ | AARCH64_APPROX_MODE (V2SFmode))
+#define AARCH64_APPROX_QFORM (AARCH64_APPROX_MODE (V4SFmode) \
+ | AARCH64_APPROX_MODE (V2DFmode))
+#define AARCH64_APPROX_SCALAR (AARCH64_APPROX_MODE (SFmode) \
+ | AARCH64_APPROX_MODE (DFmode))
+#define AARCH64_APPROX_VECTOR (AARCH64_APPROX_MODE (V2SFmode) \
+ | AARCH64_APPROX_MODE (V4SFmode) \
+ | AARCH64_APPROX_MODE (V2DFmode))
+#define AARCH64_APPROX_ALL (-1)
+
struct tune_params
{
const struct cpu_cost_table *insn_extra_cost;
@@ -218,6 +244,7 @@ struct tune_params
} autoprefetcher_model;
unsigned int extra_tuning_flags;
+ unsigned int approx_rsqrt_modes;
};
#define AARCH64_FUSION_PAIR(x, name) \
@@ -263,6 +290,9 @@ enum aarch64_extra_tuning_flags
};
#undef AARCH64_EXTRA_TUNING_OPTION
+#define AARCH64_EXTRA_TUNE_APPROX_RSQRT \
+ (AARCH64_EXTRA_TUNE_APPROX_RSQRT_DF | AARCH64_EXTRA_TUNE_APPROX_RSQRT_SF)
+
extern struct tune_params aarch64_tune_params;
HOST_WIDE_INT aarch64_initial_elimination_offset (unsigned, unsigned);
diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def b/gcc/config/aarch64/aarch64-tuning-flags.def
index 7e45a0c..048c2a3 100644
--- a/gcc/config/aarch64/aarch64-tuning-flags.def
+++ b/gcc/config/aarch64/aarch64-tuning-flags.def
@@ -29,5 +29,3 @@
AARCH64_TUNE_ to give an enum name. */
AARCH64_EXTRA_TUNING_OPTION ("rename_fma_regs", RENAME_FMA_REGS)
-AARCH64_EXTRA_TUNING_OPTION ("approx_rsqrt", APPROX_RSQRT)
-
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index b7086dd..b0ee11e 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -38,6 +38,7 @@
#include "recog.h"
#include "diagnostic.h"
#include "insn-attr.h"
+#include "insn-modes.h"
#include "alias.h"
#include "fold-const.h"
#include "stor-layout.h"
@@ -414,7 +415,8 @@ static const struct tune_params generic_tunings =
0, /* max_case_values. */
0, /* cache_line_size. */
tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
- (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
+ (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
+ (AARCH64_APPROX_NONE) /* approx_rsqrt_modes. */
};
static const struct tune_params cortexa35_tunings =
@@ -439,7 +441,8 @@ static const struct tune_params cortexa35_tunings =
0, /* max_case_values. */
0, /* cache_line_size. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
- (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
+ (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
+ (AARCH64_APPROX_NONE) /* approx_rsqrt_modes. */
};
static const struct tune_params cortexa53_tunings =
@@ -464,7 +467,8 @@ static const struct tune_params cortexa53_tunings =
0, /* max_case_values. */
0, /* cache_line_size. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
- (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
+ (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
+ (AARCH64_APPROX_NONE) /* approx_rsqrt_modes. */
};
static const struct tune_params cortexa57_tunings =
@@ -489,7 +493,8 @@ static const struct tune_params cortexa57_tunings =
0, /* max_case_values. */
0, /* cache_line_size. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
- (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS) /* tune_flags. */
+ (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
+ (AARCH64_APPROX_NONE) /* approx_rsqrt_modes. */
};
static const struct tune_params cortexa72_tunings =
@@ -514,7 +519,8 @@ static const struct tune_params cortexa72_tunings =
0, /* max_case_values. */
0, /* cache_line_size. */
tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
- (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
+ (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
+ (AARCH64_APPROX_NONE) /* approx_rsqrt_modes. */
};
static const struct tune_params exynosm1_tunings =
@@ -538,7 +544,8 @@ static const struct tune_params exynosm1_tunings =
48, /* max_case_values. */
64, /* cache_line_size. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
- (AARCH64_EXTRA_TUNE_APPROX_RSQRT) /* tune_flags. */
+ (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
+ (AARCH64_APPROX_ALL) /* approx_rsqrt_modes. */
};
static const struct tune_params thunderx_tunings =
@@ -562,7 +569,8 @@ static const struct tune_params thunderx_tunings =
0, /* max_case_values. */
0, /* cache_line_size. */
tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
- (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
+ (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
+ (AARCH64_APPROX_NONE) /* approx_rsqrt_modes. */
};
static const struct tune_params xgene1_tunings =
@@ -586,7 +594,8 @@ static const struct tune_params xgene1_tunings =
0, /* max_case_values. */
0, /* cache_line_size. */
tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
- (AARCH64_EXTRA_TUNE_APPROX_RSQRT) /* tune_flags. */
+ (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
+ (AARCH64_APPROX_ALL) /* approx_rsqrt_modes. */
};
/* Support for fine-grained override of the tuning structures. */
@@ -7452,12 +7461,12 @@ aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
to optimize 1.0/sqrt. */
static bool
-use_rsqrt_p (void)
+use_rsqrt_p (machine_mode mode)
{
return (!flag_trapping_math
&& flag_unsafe_math_optimizations
- && ((aarch64_tune_params.extra_tuning_flags
- & AARCH64_EXTRA_TUNE_APPROX_RSQRT)
+ && ((aarch64_tune_params.approx_rsqrt_modes
+ & AARCH64_APPROX_MODE (mode))
|| flag_mrecip_low_precision_sqrt));
}
@@ -7467,7 +7476,9 @@ use_rsqrt_p (void)
static tree
aarch64_builtin_reciprocal (tree fndecl)
{
- if (!use_rsqrt_p ())
+ machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
+
+ if (!use_rsqrt_p (mode))
return NULL_TREE;
return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
}
@@ -13964,13 +13975,13 @@ aarch64_promoted_type (const_tree t)
/* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
static bool
-aarch64_optab_supported_p (int op, machine_mode, machine_mode,
+aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
optimization_type opt_type)
{
switch (op)
{
case rsqrt_optab:
- return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p ();
+ return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
default:
return true;
--
1.9.1