Hi Richard,

> On 22 Jan 2025, at 13:21, Richard Sandiford <richard.sandif...@arm.com> wrote:
> 
> GCC 15 is the first release to support FP8 intrinsics.
> The underlying instructions depend on the value of a new register,
> FPMR.  Unlike FPCR, FPMR is a normal call-clobbered/caller-save
> register rather than a global register.  So:
> 
> - The FP8 intrinsics take a final uint64_t argument that
>  specifies what value FPMR should have.
> 
> - If an FP8 operation is split across multiple functions,
>  it is likely that those functions would have a similar argument.
> 
> If the object code has the structure:
> 
>    for (...)
>      fp8_kernel (..., fpmr_value);
> 
> then fp8_kernel would set FPMR to fpmr_value each time it is
> called, even though FPMR will already have that value for at
> least the second and subsequent calls (and possibly the first).
> 
> The working assumption for the ABI has been that writes to
> registers like FPMR can in general be more expensive than
> reads and so it would be better to use a conditional write like:
> 
>       mrs     tmp, fpmr
>       cmp     tmp, <value>
>       beq     1f
>       nsr     fpmr, <value>

Typo: “nsr” should be “msr” — both here and in the comment in the code.

>     1:
> 
> instead of writing the same value to FPMR repeatedly.
> 
> This patch implements that.  It also adds a tuning flag that suppresses
> the behaviour, both to make testing easier and to support any future
> cores that (for example) are able to rename FPMR.
> 
> Hopefully this really is the last part of the FP8 enablement.
> 
> Tested on aarch64-linux-gnu.  I'll push in about 24 hours
> if there are no comments before then.
> 
> Richard
> 
> 
> gcc/
> * config/aarch64/aarch64-tuning-flags.def
> (AARCH64_EXTRA_TUNE_CHEAP_FPMR_WRITE): New tuning flag.
> * config/aarch64/aarch64.h (TARGET_CHEAP_FPMR_WRITE): New macro.
> * config/aarch64/aarch64.md: Split moves into FPMR into a test
> and branch around.
> (aarch64_write_fpmr): New pattern.
> 
> gcc/testsuite/
> * g++.target/aarch64/sve2/acle/aarch64-sve2-acle-asm.exp: Add
> cheap_fpmr_write by default.
> * gcc.target/aarch64/sve2/acle/aarch64-sve2-acle-asm.exp: Likewise.
> * gcc.target/aarch64/acle/fp8.c: Add cheap_fpmr_write.
> * gcc.target/aarch64/acle/fpmr-2.c: Likewise.
> * gcc.target/aarch64/simd/vcvt_fpm.c: Likewise.
> * gcc.target/aarch64/simd/vdot2_fpm.c: Likewise.
> * gcc.target/aarch64/simd/vdot4_fpm.c: Likewise.
> * gcc.target/aarch64/simd/vmla_fpm.c: Likewise.
> * gcc.target/aarch64/acle/fpmr-6.c: New test.
> ---
> gcc/config/aarch64/aarch64-tuning-flags.def   | 15 +++++++
> gcc/config/aarch64/aarch64.h                  |  5 +++
> gcc/config/aarch64/aarch64.md                 | 39 +++++++++++++++++++
> .../sve2/acle/aarch64-sve2-acle-asm.exp       |  2 +-
> gcc/testsuite/gcc.target/aarch64/acle/fp8.c   |  2 +-
> .../gcc.target/aarch64/acle/fpmr-2.c          |  2 +-
> .../gcc.target/aarch64/acle/fpmr-6.c          | 36 +++++++++++++++++
> .../gcc.target/aarch64/simd/vcvt_fpm.c        |  2 +-
> .../gcc.target/aarch64/simd/vdot2_fpm.c       |  2 +-
> .../gcc.target/aarch64/simd/vdot4_fpm.c       |  2 +-
> .../gcc.target/aarch64/simd/vmla_fpm.c        |  2 +-
> .../sve2/acle/aarch64-sve2-acle-asm.exp       |  2 +-
> 12 files changed, 103 insertions(+), 8 deletions(-)
> create mode 100644 gcc/testsuite/gcc.target/aarch64/acle/fpmr-6.c
> 
> diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def 
> b/gcc/config/aarch64/aarch64-tuning-flags.def
> index 60967aac903..7a67d6197d9 100644
> --- a/gcc/config/aarch64/aarch64-tuning-flags.def
> +++ b/gcc/config/aarch64/aarch64-tuning-flags.def
> @@ -48,6 +48,21 @@ AARCH64_EXTRA_TUNING_OPTION ("fully_pipelined_fma", 
> FULLY_PIPELINED_FMA)
>    rather than re-use an input predicate register.  */
> AARCH64_EXTRA_TUNING_OPTION ("avoid_pred_rmw", AVOID_PRED_RMW)
> 
> +/* Whether writes to the FPMR are cheap enough that:
> +
> +       msr     fpmr, <value>
> +
> +   is better than:
> +
> +       mrs     tmp, fpmr
> +       cmp     tmp, <value>
> +       beq     1f
> +       nsr     fpmr, <value>
> +     1:
> +
> +   even when the branch is predictably taken.  */
> +AARCH64_EXTRA_TUNING_OPTION ("cheap_fpmr_write", CHEAP_FPMR_WRITE)
> +
> /* Baseline tuning settings suitable for all modern cores.  */
> #define AARCH64_EXTRA_TUNE_BASE (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND \
> | AARCH64_EXTRA_TUNE_FULLY_PIPELINED_FMA)
> diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
> index 218868a5246..5cbf442130b 100644
> --- a/gcc/config/aarch64/aarch64.h
> +++ b/gcc/config/aarch64/aarch64.h
> @@ -486,6 +486,11 @@ constexpr auto AARCH64_FL_DEFAULT_ISA_MODE 
> ATTRIBUTE_UNUSED
> /* fp8 instructions are enabled through +fp8.  */
> #define TARGET_FP8 AARCH64_HAVE_ISA (FP8)
> 
> +/* See the comment above the tuning flag for details.  */
> +#define TARGET_CHEAP_FPMR_WRITE \
> +  (bool (aarch64_tune_params.extra_tuning_flags \
> + & AARCH64_EXTRA_TUNE_CHEAP_FPMR_WRITE))
> +
> /* Combinatorial tests.  */
> 
> #define TARGET_SVE2_OR_SME2 \
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index 776c4c4ceee..071058dbeb3 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -356,6 +356,7 @@ (define_c_enum "unspec" [
>     UNSPEC_UPDATE_FFRT
>     UNSPEC_RDFFR
>     UNSPEC_WRFFR
> +    UNSPEC_WRITE_FPMR
>     UNSPEC_SYSREG_RDI
>     UNSPEC_SYSREG_RTI
>     UNSPEC_SYSREG_WDI
> @@ -1883,6 +1884,44 @@ (define_split
>   }
> )
> 
> +;; The preferred way of writing to the FPMR is to test whether it already
> +;; has the desired value and branch around the write if so.  This reduces
> +;; the number of redundant FPMR writes caused by ABI boundaries, such as in:
> +;;
> +;;    for (...)
> +;;      fp8_kernel (..., fpmr_value);
> +;;
> +;; Without this optimization, fp8_kernel would set FPMR to fpmr_value each
> +;; time that it is called.
> +;;
> +;; We do this as a split so that hardreg_pre can optimize the moves first.
> +(define_split
> +  [(set (reg:DI FPM_REGNUM)
> +        (match_operand:DI 0 "aarch64_reg_or_zero"))]
> +  "TARGET_FP8 && !TARGET_CHEAP_FPMR_WRITE && can_create_pseudo_p ()"
> +  [(const_int 0)]
> +  {
> +    auto label = gen_label_rtx ();
> +    rtx current = copy_to_reg (gen_rtx_REG (DImode, FPM_REGNUM));
> +    rtx cond = gen_rtx_EQ (VOIDmode, current, operands[0]);
> +    emit_jump_insn (gen_cbranchdi4 (cond, current, operands[0], label));

Do you think it’s worth marking this jump as likely, given that the branch is
expected to be taken when FPMR already holds the desired value?
In some other expand code in the backend where we emit jumps, we sometimes use
aarch64_emit_unlikely_jump.
Thanks,
Kyrill

> +    emit_insn (gen_aarch64_write_fpmr (operands[0]));
> +    emit_label (label);
> +    DONE;
> +  }
> +)
> +
> +;; A write to the FPMR that is already protected by a conditional branch.
> +;; Since this instruction is introduced late, it shouldn't matter too much
> +;; that we're using an unspec for a move.
> +(define_insn "aarch64_write_fpmr"
> +  [(set (reg:DI FPM_REGNUM)
> +        (unspec:DI [(match_operand:DI 0 "aarch64_reg_or_zero" "rZ")]
> +   UNSPEC_WRITE_FPMR))]
> +  "TARGET_FP8"
> +  "msr\tfpmr, %x0"
> +)
> +
> (define_expand "aarch64_cpymemdi"
>   [(parallel
>      [(set (match_operand 2) (const_int 0))
> diff --git 
> a/gcc/testsuite/g++.target/aarch64/sve2/acle/aarch64-sve2-acle-asm.exp 
> b/gcc/testsuite/g++.target/aarch64/sve2/acle/aarch64-sve2-acle-asm.exp
> index 4323e5f62ae..7fc33e99b05 100644
> --- a/gcc/testsuite/g++.target/aarch64/sve2/acle/aarch64-sve2-acle-asm.exp
> +++ b/gcc/testsuite/g++.target/aarch64/sve2/acle/aarch64-sve2-acle-asm.exp
> @@ -39,7 +39,7 @@ if { [check_effective_target_aarch64_sve2] } {
> 
> # Turn off any codegen tweaks by default that may affect expected assembly.
> # Tests relying on those should turn them on explicitly.
> -set sve2_flags "$sve2_flags -mtune=generic -moverride=tune=none"
> +set sve2_flags "$sve2_flags -mtune=generic -moverride=tune=none 
> -moverride=tune=cheap_fpmr_write"
> 
> set gcc_subdir [string replace $subdir 0 2 gcc]
> lappend extra_flags "-fno-ipa-icf" "-I$srcdir/$gcc_subdir/../../sve/acle/asm"
> diff --git a/gcc/testsuite/gcc.target/aarch64/acle/fp8.c 
> b/gcc/testsuite/gcc.target/aarch64/acle/fp8.c
> index 63f88e24dfb..f0e7035ffc0 100644
> --- a/gcc/testsuite/gcc.target/aarch64/acle/fp8.c
> +++ b/gcc/testsuite/gcc.target/aarch64/acle/fp8.c
> @@ -1,6 +1,6 @@
> /* Test the fp8 ACLE intrinsics family.  */
> /* { dg-do compile } */
> -/* { dg-options "-O1 -march=armv8-a" } */
> +/* { dg-options "-O1 -march=armv8-a -moverride=tune=cheap_fpmr_write" } */
> /* { dg-final { check-function-bodies "**" "" "" } } */
> 
> #include <arm_acle.h>
> diff --git a/gcc/testsuite/gcc.target/aarch64/acle/fpmr-2.c 
> b/gcc/testsuite/gcc.target/aarch64/acle/fpmr-2.c
> index c5b255b0a9a..79a9535126e 100644
> --- a/gcc/testsuite/gcc.target/aarch64/acle/fpmr-2.c
> +++ b/gcc/testsuite/gcc.target/aarch64/acle/fpmr-2.c
> @@ -1,5 +1,5 @@
> /* { dg-do compile } */
> -/* { dg-options "-O1 -march=armv8-a+fp8fma" } */
> +/* { dg-options "-O1 -march=armv8-a+fp8fma -moverride=tune=cheap_fpmr_write" 
> } */
> 
> #include <arm_neon.h>
> 
> diff --git a/gcc/testsuite/gcc.target/aarch64/acle/fpmr-6.c 
> b/gcc/testsuite/gcc.target/aarch64/acle/fpmr-6.c
> new file mode 100644
> index 00000000000..6a00e017af9
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/acle/fpmr-6.c
> @@ -0,0 +1,36 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-O3 -march=armv9-a+fp8dot4 -moverride=tune=none" 
> } */
> +/* { dg-final { check-function-bodies "**" "" "" { target *-*-* } 
> {\.L[0-9]+} } } */
> +
> +#include "arm_neon.h"
> +
> +/*
> +** f1:
> +** mrs (x[0-9]+), fpmr
> +** cmp \1, x0
> +** beq ([^\n]+)
> +** msr fpmr, x0
> +** ?\2:
> +** fdot v0.2s, v1.8b, v2.8b
> +** ret
> +*/
> +float32x2_t
> +f1 (float32x2_t a, mfloat8x8_t b, mfloat8x8_t c, fpm_t d)
> +{
> +  return vdot_f32_mf8_fpm (a, b, c, d);
> +}
> +
> +/*
> +** f2:
> +** mrs (x[0-9]+), fpmr
> +** cbz \1, ([^\n]+)
> +** msr fpmr, xzr
> +** ?\2:
> +** fdot v0.2s, v1.8b, v2.8b
> +** ret
> +*/
> +float32x2_t
> +f2 (float32x2_t a, mfloat8x8_t b, mfloat8x8_t c)
> +{
> +  return vdot_f32_mf8_fpm (a, b, c, 0);
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vcvt_fpm.c 
> b/gcc/testsuite/gcc.target/aarch64/simd/vcvt_fpm.c
> index 39076684345..29dece61d4a 100644
> --- a/gcc/testsuite/gcc.target/aarch64/simd/vcvt_fpm.c
> +++ b/gcc/testsuite/gcc.target/aarch64/simd/vcvt_fpm.c
> @@ -1,5 +1,5 @@
> /* { dg-do compile } */
> -/* { dg-additional-options "-O3 -march=armv9-a+fp8" } */
> +/* { dg-additional-options "-O3 -march=armv9-a+fp8 
> -moverride=tune=cheap_fpmr_write" } */
> /* { dg-final { check-function-bodies "**" "" } } */
> 
> #include "arm_neon.h"
> diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vdot2_fpm.c 
> b/gcc/testsuite/gcc.target/aarch64/simd/vdot2_fpm.c
> index 5fe139106c6..07decd71926 100644
> --- a/gcc/testsuite/gcc.target/aarch64/simd/vdot2_fpm.c
> +++ b/gcc/testsuite/gcc.target/aarch64/simd/vdot2_fpm.c
> @@ -1,5 +1,5 @@
> /* { dg-do compile } */
> -/* { dg-additional-options "-O3 -march=armv9-a+fp8dot2" } */
> +/* { dg-additional-options "-O3 -march=armv9-a+fp8dot2 
> -moverride=tune=cheap_fpmr_write" } */
> /* { dg-final { check-function-bodies "**" "" } } */
> 
> #include "arm_neon.h"
> diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vdot4_fpm.c 
> b/gcc/testsuite/gcc.target/aarch64/simd/vdot4_fpm.c
> index e47a737e8b5..27c1d38434f 100644
> --- a/gcc/testsuite/gcc.target/aarch64/simd/vdot4_fpm.c
> +++ b/gcc/testsuite/gcc.target/aarch64/simd/vdot4_fpm.c
> @@ -1,5 +1,5 @@
> /* { dg-do compile } */
> -/* { dg-additional-options "-O3 -march=armv9-a+fp8dot4" } */
> +/* { dg-additional-options "-O3 -march=armv9-a+fp8dot4 
> -moverride=tune=cheap_fpmr_write" } */
> /* { dg-final { check-function-bodies "**" "" } } */
> 
> #include "arm_neon.h"
> diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vmla_fpm.c 
> b/gcc/testsuite/gcc.target/aarch64/simd/vmla_fpm.c
> index 51b47055ca2..8e5835af5a2 100644
> --- a/gcc/testsuite/gcc.target/aarch64/simd/vmla_fpm.c
> +++ b/gcc/testsuite/gcc.target/aarch64/simd/vmla_fpm.c
> @@ -1,5 +1,5 @@
> /* { dg-do compile } */
> -/* { dg-additional-options "-O3 -march=armv9-a+fp8fma" } */
> +/* { dg-additional-options "-O3 -march=armv9-a+fp8fma 
> -moverride=tune=cheap_fpmr_write" } */
> /* { dg-final { check-function-bodies "**" "" } } */
> 
> #include "arm_neon.h"
> diff --git 
> a/gcc/testsuite/gcc.target/aarch64/sve2/acle/aarch64-sve2-acle-asm.exp 
> b/gcc/testsuite/gcc.target/aarch64/sve2/acle/aarch64-sve2-acle-asm.exp
> index 69a3a1786f2..e950f8613da 100644
> --- a/gcc/testsuite/gcc.target/aarch64/sve2/acle/aarch64-sve2-acle-asm.exp
> +++ b/gcc/testsuite/gcc.target/aarch64/sve2/acle/aarch64-sve2-acle-asm.exp
> @@ -39,7 +39,7 @@ if { [check_effective_target_aarch64_sve2] } {
> 
> # Turn off any codegen tweaks by default that may affect expected assembly.
> # Tests relying on those should turn them on explicitly.
> -set sve2_flags "$sve2_flags -mtune=generic -moverride=tune=none"
> +set sve2_flags "$sve2_flags -mtune=generic -moverride=tune=none 
> -moverride=tune=cheap_fpmr_write"
> 
> lappend extra_flags "-fno-ipa-icf"
> 
> -- 
> 2.25.1
> 

Reply via email to