Re: [PATCH 1/3][AArch64] Add more choices for the reciprocal square root approximation

Evandro Menezes Fri, 03 Jun 2016 14:51:07 -0700

On 06/01/16 03:35, James Greenhalgh wrote:

On Fri, May 27, 2016 at 05:57:23PM -0500, Evandro Menezes wrote:

 From 86d7690632d03ec85fd69bfaef8e89c0542518ad Mon Sep 17 00:00:00 2001
From: Evandro Menezes <e.mene...@samsung.com>
Date: Thu, 3 Mar 2016 18:13:46 -0600
Subject: [PATCH 1/3] [AArch64] Add more choices for the reciprocal square root
  approximation


Allow a target to prefer such operation depending on the operation mode.

2016-03-03  Evandro Menezes  <e.mene...@samsung.com>

gcc/
        * config/aarch64/aarch64-protos.h
        (AARCH64_APPROX_MODE): New macro.
        (AARCH64_APPROX_{NONE,ALL}): Likewise.
        (cpu_approx_modes): New structure.
        (tune_params): New member "approx_modes".
        * config/aarch64/aarch64-tuning-flags.def
        (AARCH64_EXTRA_TUNE_APPROX_RSQRT): Remove macro.
        * config/aarch64/aarch64.c
        ({generic,exynosm1,xgene1}_approx_modes): New core
        "cpu_approx_modes" structures.
        (generic_tunings): New member "approx_modes".
        (cortexa35_tunings): Likewise.
        (cortexa53_tunings): Likewise.
        (cortexa57_tunings): Likewise.
        (cortexa72_tunings): Likewise.
        (exynosm1_tunings): Likewise.
        (thunderx_tunings): Likewise.
        (xgene1_tunings): Likewise.
        (use_rsqrt_p): New argument for the mode and use new member from
        "tune_params".
        (aarch64_builtin_reciprocal): Devise mode from builtin.
        (aarch64_optab_supported_p): New argument for the mode.
        * doc/invoke.texi (-mlow-precision-recip-sqrt): Reword description.

We're almost there, just a couple of style comments left on this one
I think. Thanks for sticking with it so far.

diff --git a/gcc/config/aarch64/aarch64-protos.h 
b/gcc/config/aarch64/aarch64-protos.h
index f22a31c..6156281 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -178,6 +178,23 @@ struct cpu_branch_cost
    const int unpredictable;  /* Unpredictable branch or optimizing for speed.  
*/
  };

+/* Control approximate alternatives to certain FP operators. */

+#define AARCH64_APPROX_MODE(MODE) \
+  ((MIN_MODE_FLOAT <= (MODE) && (MODE) <= MAX_MODE_FLOAT) \
+   ? (1 << ((MODE) - MIN_MODE_FLOAT)) \
+   : (MIN_MODE_VECTOR_FLOAT <= (MODE) && (MODE) <= MAX_MODE_VECTOR_FLOAT) \
+     ? (1 << ((MODE) - MIN_MODE_VECTOR_FLOAT \
+             + MAX_MODE_FLOAT - MIN_MODE_FLOAT + 1)) \
+     : (0))
+#define AARCH64_APPROX_NONE (0)
+#define AARCH64_APPROX_ALL (-1)
+
+/* Allowed modes for approximations.  */
+struct cpu_approx_modes
+{
+  const unsigned int recip_sqrt; /* Reciprocal square root.  */
+};
+
  struct tune_params
  {
    const struct cpu_cost_table *insn_extra_cost;
@@ -218,6 +235,8 @@ struct tune_params
    } autoprefetcher_model;

unsigned int extra_tuning_flags;

+
+  const struct cpu_approx_modes *approx_modes;

Sorry to be irritating, but would you mind putting this up beside the
other "struct" members of tune_params. So directly under the branch_costs
member?

OK

  };
#define AARCH64_FUSION_PAIR(x, name) \

diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def 
b/gcc/config/aarch64/aarch64-tuning-flags.def
index 7e45a0c..048c2a3 100644
--- a/gcc/config/aarch64/aarch64-tuning-flags.def
+++ b/gcc/config/aarch64/aarch64-tuning-flags.def
@@ -29,5 +29,3 @@
       AARCH64_TUNE_ to give an enum name. */

AARCH64_EXTRA_TUNING_OPTION ("rename_fma_regs", RENAME_FMA_REGS)

-AARCH64_EXTRA_TUNING_OPTION ("approx_rsqrt", APPROX_RSQRT)
-
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 9995494..e532cfc 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -38,6 +38,7 @@
  #include "recog.h"
  #include "diagnostic.h"
  #include "insn-attr.h"
+#include "insn-modes.h"

Do we need this include? My build test of just this patch suggests not (maybe
you need it later in the series?). This can probably be dropped.

OK

  #include "alias.h"
  #include "fold-const.h"
  #include "stor-layout.h"
@@ -393,6 +394,24 @@ static const struct cpu_branch_cost cortexa57_branch_cost =
    3   /* Unpredictable.  */
  };

+/* Generic approximation modes. */

+static const cpu_approx_modes generic_approx_modes =
+{
+  AARCH64_APPROX_NONE  /* recip_sqrt  */
+};
+
+/* Approximation modes for Exynos M1.  */
+static const cpu_approx_modes exynosm1_approx_modes =
+{
+  AARCH64_APPROX_ALL   /* recip_sqrt  */
+};
+
+/* Approximation modes for Xgene1.  */

This should be "X-Gene 1" looking at how Applied Micro style it in their
marketing materials.

OK

+static const cpu_approx_modes xgene1_approx_modes =
+{
+  AARCH64_APPROX_ALL   /* recip_sqrt  */
+};
+
  static bool
-use_rsqrt_p (void)
+use_rsqrt_p (machine_mode mode)
  {
    return (!flag_trapping_math
          && flag_unsafe_math_optimizations
-         && ((aarch64_tune_params.extra_tuning_flags
-              & AARCH64_EXTRA_TUNE_APPROX_RSQRT)
+         && ((aarch64_tune_params.approx_modes->recip_sqrt
+              & AARCH64_APPROX_MODE (mode))
              || flag_mrecip_low_precision_sqrt));
  }

@@ -7467,7 +7494,9 @@ use_rsqrt_p (void)

  static tree
  aarch64_builtin_reciprocal (tree fndecl)
  {
-  if (!use_rsqrt_p ())
+  machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
+
+  if (!use_rsqrt_p (mode))
      return NULL_TREE;
    return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
  }
@@ -13889,13 +13918,13 @@ aarch64_promoted_type (const_tree t)
  /* Implement the TARGET_OPTAB_SUPPORTED_P hook.  */

static bool

-aarch64_optab_supported_p (int op, machine_mode, machine_mode,
+aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
                           optimization_type opt_type)
  {
    switch (op)
      {
      case rsqrt_optab:
-      return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p ();
+      return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);

default:

        return true;
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index f1ac257..4340b08 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -12939,7 +12939,7 @@ corresponding flag to the linker.
  When calculating the reciprocal square root approximation,
  uses one less step than otherwise, thus reducing latency and precision.
  This is only relevant if @option{-ffast-math} enables the reciprocal square 
root
-approximation, which in turn depends on the target processor.
+approximation.

This hunk no longer applies on trunk over Wilco's changes. Can you check
whether you're happy with the wording that is on trunk now, and drop or
update this hunk as appropriate.


OK

Thank you,

--
Evandro Menezes

>From 763562f829d4fec54d21555b2bfd6478d449294f Mon Sep 17 00:00:00 2001
From: Evandro Menezes <e.mene...@samsung.com>
Date: Thu, 3 Mar 2016 18:13:46 -0600
Subject: [PATCH 1/3] [AArch64] Add more choices for the reciprocal square root
 approximation

Allow a target to prefer such operation depending on the operation mode.

2016-03-03  Evandro Menezes  <e.mene...@samsung.com>

gcc/
	* config/aarch64/aarch64-protos.h
	(AARCH64_APPROX_MODE): New macro.
	(AARCH64_APPROX_{NONE,ALL}): Likewise.
	(cpu_approx_modes): New structure.
	(tune_params): New member "approx_modes".
	* config/aarch64/aarch64-tuning-flags.def
	(AARCH64_EXTRA_TUNE_APPROX_RSQRT): Remove macro.
	* config/aarch64/aarch64.c
	({generic,exynosm1,xgene1}_approx_modes): New core
	"cpu_approx_modes" structures.
	(generic_tunings): New member "approx_modes".
	(cortexa35_tunings): Likewise.
	(cortexa53_tunings): Likewise.
	(cortexa57_tunings): Likewise.
	(cortexa72_tunings): Likewise.
	(exynosm1_tunings): Likewise.
	(thunderx_tunings): Likewise.
	(xgene1_tunings): Likewise.
	(use_rsqrt_p): New argument for the mode and use new member from
	"tune_params".
	(aarch64_builtin_reciprocal): Devise mode from builtin.
	(aarch64_optab_supported_p): New argument for the mode.
	* doc/invoke.texi (-mlow-precision-recip-sqrt): Reword description.
---
 gcc/config/aarch64/aarch64-protos.h         | 18 ++++++++++++
 gcc/config/aarch64/aarch64-tuning-flags.def |  2 --
 gcc/config/aarch64/aarch64.c                | 44 +++++++++++++++++++++++------
 gcc/doc/invoke.texi                         |  2 +-
 4 files changed, 55 insertions(+), 11 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index ab9b37a..1c56a1d 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -178,6 +178,23 @@ struct cpu_branch_cost
   const int unpredictable;  /* Unpredictable branch or optimizing for speed.  */
 };
 
+/* Control approximate alternatives to certain FP operators.  */
+#define AARCH64_APPROX_MODE(MODE) \
+  ((MIN_MODE_FLOAT <= (MODE) && (MODE) <= MAX_MODE_FLOAT) \
+   ? (1 << ((MODE) - MIN_MODE_FLOAT)) \
+   : (MIN_MODE_VECTOR_FLOAT <= (MODE) && (MODE) <= MAX_MODE_VECTOR_FLOAT) \
+     ? (1 << ((MODE) - MIN_MODE_VECTOR_FLOAT \
+	      + MAX_MODE_FLOAT - MIN_MODE_FLOAT + 1)) \
+     : (0))
+#define AARCH64_APPROX_NONE (0)
+#define AARCH64_APPROX_ALL (-1)
+
+/* Allowed modes for approximations.  */
+struct cpu_approx_modes
+{
+  const unsigned int recip_sqrt; /* Reciprocal square root.  */
+};
+
 struct tune_params
 {
   const struct cpu_cost_table *insn_extra_cost;
@@ -185,6 +202,7 @@ struct tune_params
   const struct cpu_regmove_cost *regmove_cost;
   const struct cpu_vector_cost *vec_costs;
   const struct cpu_branch_cost *branch_costs;
+  const struct cpu_approx_modes *approx_modes;
   int memmov_cost;
   int issue_rate;
   unsigned int fusible_ops;
diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def b/gcc/config/aarch64/aarch64-tuning-flags.def
index 7e45a0c..048c2a3 100644
--- a/gcc/config/aarch64/aarch64-tuning-flags.def
+++ b/gcc/config/aarch64/aarch64-tuning-flags.def
@@ -29,5 +29,3 @@
      AARCH64_TUNE_ to give an enum name. */
 
 AARCH64_EXTRA_TUNING_OPTION ("rename_fma_regs", RENAME_FMA_REGS)
-AARCH64_EXTRA_TUNING_OPTION ("approx_rsqrt", APPROX_RSQRT)
-
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index ad07fe1..f7a3bd6 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -393,6 +393,24 @@ static const struct cpu_branch_cost cortexa57_branch_cost =
   3   /* Unpredictable.  */
 };
 
+/* Generic approximation modes.  */
+static const cpu_approx_modes generic_approx_modes =
+{
+  AARCH64_APPROX_NONE	/* recip_sqrt  */
+};
+
+/* Approximation modes for Exynos M1.  */
+static const cpu_approx_modes exynosm1_approx_modes =
+{
+  AARCH64_APPROX_ALL	/* recip_sqrt  */
+};
+
+/* Approximation modes for X-Gene 1.  */
+static const cpu_approx_modes xgene1_approx_modes =
+{
+  AARCH64_APPROX_ALL	/* recip_sqrt  */
+};
+
 static const struct tune_params generic_tunings =
 {
   &cortexa57_extra_costs,
@@ -400,6 +418,7 @@ static const struct tune_params generic_tunings =
   &generic_regmove_cost,
   &generic_vector_cost,
   &generic_branch_cost,
+  &generic_approx_modes,
   4, /* memmov_cost  */
   2, /* issue_rate  */
   AARCH64_FUSE_NOTHING, /* fusible_ops  */
@@ -424,6 +443,7 @@ static const struct tune_params cortexa35_tunings =
   &cortexa53_regmove_cost,
   &generic_vector_cost,
   &generic_branch_cost,
+  &generic_approx_modes,
   4, /* memmov_cost  */
   1, /* issue_rate  */
   (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
@@ -449,6 +469,7 @@ static const struct tune_params cortexa53_tunings =
   &cortexa53_regmove_cost,
   &generic_vector_cost,
   &generic_branch_cost,
+  &generic_approx_modes,
   4, /* memmov_cost  */
   2, /* issue_rate  */
   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
@@ -474,6 +495,7 @@ static const struct tune_params cortexa57_tunings =
   &cortexa57_regmove_cost,
   &cortexa57_vector_cost,
   &cortexa57_branch_cost,
+  &generic_approx_modes,
   4, /* memmov_cost  */
   3, /* issue_rate  */
   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
@@ -499,6 +521,7 @@ static const struct tune_params cortexa72_tunings =
   &cortexa57_regmove_cost,
   &cortexa57_vector_cost,
   &generic_branch_cost,
+  &generic_approx_modes,
   4, /* memmov_cost  */
   3, /* issue_rate  */
   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
@@ -524,6 +547,7 @@ static const struct tune_params exynosm1_tunings =
   &exynosm1_regmove_cost,
   &exynosm1_vector_cost,
   &generic_branch_cost,
+  &exynosm1_approx_modes,
   4,	/* memmov_cost  */
   3,	/* issue_rate  */
   (AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
@@ -538,7 +562,7 @@ static const struct tune_params exynosm1_tunings =
   48,	/* max_case_values.  */
   64,	/* cache_line_size.  */
   tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_APPROX_RSQRT) /* tune_flags.  */
+  (AARCH64_EXTRA_TUNE_NONE) /* tune_flags.  */
 };
 
 static const struct tune_params thunderx_tunings =
@@ -548,6 +572,7 @@ static const struct tune_params thunderx_tunings =
   &thunderx_regmove_cost,
   &generic_vector_cost,
   &generic_branch_cost,
+  &generic_approx_modes,
   6, /* memmov_cost  */
   2, /* issue_rate  */
   AARCH64_FUSE_CMP_BRANCH, /* fusible_ops  */
@@ -572,6 +597,7 @@ static const struct tune_params xgene1_tunings =
   &xgene1_regmove_cost,
   &xgene1_vector_cost,
   &generic_branch_cost,
+  &xgene1_approx_modes,
   6, /* memmov_cost  */
   4, /* issue_rate  */
   AARCH64_FUSE_NOTHING, /* fusible_ops  */
@@ -586,7 +612,7 @@ static const struct tune_params xgene1_tunings =
   0,	/* max_case_values.  */
   0,	/* cache_line_size.  */
   tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_APPROX_RSQRT)	/* tune_flags.  */
+  (AARCH64_EXTRA_TUNE_NONE)	/* tune_flags.  */
 };
 
 /* Support for fine-grained override of the tuning structures.  */
@@ -7319,12 +7345,12 @@ aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
    to optimize 1.0/sqrt.  */
 
 static bool
-use_rsqrt_p (void)
+use_rsqrt_p (machine_mode mode)
 {
   return (!flag_trapping_math
 	  && flag_unsafe_math_optimizations
-	  && ((aarch64_tune_params.extra_tuning_flags
-	       & AARCH64_EXTRA_TUNE_APPROX_RSQRT)
+	  && ((aarch64_tune_params.approx_modes->recip_sqrt
+	       & AARCH64_APPROX_MODE (mode))
 	      || flag_mrecip_low_precision_sqrt));
 }
 
@@ -7334,7 +7360,9 @@ use_rsqrt_p (void)
 static tree
 aarch64_builtin_reciprocal (tree fndecl)
 {
-  if (!use_rsqrt_p ())
+  machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
+
+  if (!use_rsqrt_p (mode))
     return NULL_TREE;
   return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
 }
@@ -13730,13 +13758,13 @@ aarch64_promoted_type (const_tree t)
 /* Implement the TARGET_OPTAB_SUPPORTED_P hook.  */
 
 static bool
-aarch64_optab_supported_p (int op, machine_mode, machine_mode,
+aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
 			   optimization_type opt_type)
 {
   switch (op)
     {
     case rsqrt_optab:
-      return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p ();
+      return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
 
     default:
       return true;
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 89bd983..d5391c1 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -13014,7 +13014,7 @@ corresponding flag to the linker.
 @item -mno-low-precision-recip-sqrt
 @opindex mlow-precision-recip-sqrt
 @opindex mno-low-precision-recip-sqrt
-Enable or disable reciprocal square root approximation.
+Enable or disable the reciprocal square root approximation.
 This option only has an effect if @option{-ffast-math} or
 @option{-funsafe-math-optimizations} is used as well.  Enabling this reduces
 precision of reciprocal square root results to about 16 bits for
-- 
2.6.3

Re: [PATCH 1/3][AArch64] Add more choices for the reciprocal square root approximation

Reply via email to