Hi Richard,

> On 22 Jan 2025, at 13:21, Richard Sandiford <richard.sandif...@arm.com> wrote:
> 
> GCC 15 is the first release to support FP8 intrinsics.
> The underlying instructions depend on the value of a new register,
> FPMR.  Unlike FPCR, FPMR is a normal call-clobbered/caller-save
> register rather than a global register.  So:
> 
> - The FP8 intrinsics take a final uint64_t argument that
>   specifies what value FPMR should have.
> 
> - If an FP8 operation is split across multiple functions,
>   it is likely that those functions would have a similar argument.
> 
> If the object code has the structure:
> 
>     for (...)
>       fp8_kernel (..., fpmr_value);
> 
> then fp8_kernel would set FPMR to fpmr_value each time it is
> called, even though FPMR will already have that value for at
> least the second and subsequent calls (and possibly the first).
> 
> The working assumption for the ABI has been that writes to
> registers like FPMR can in general be more expensive than
> reads, and so it would be better to use a conditional write like:
> 
>     mrs tmp, fpmr
>     cmp tmp, <value>
>     beq 1f
>     nsr fpmr, <value>

Typo: “nsr” should be “msr”, both here and in the matching comment in
aarch64-tuning-flags.def below.

> 1:
> 
> instead of writing the same value to FPMR repeatedly.
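To make the caller/callee split concrete, here is a minimal sketch of the
kind of kernel being described.  It reuses vdot_f32_mf8_fpm and the FP8
types from the tests below; the kernel/loop pairing itself is illustrative
rather than taken from the patch, and it needs something like
-march=armv9-a+fp8dot4 to build:

    #include <arm_neon.h>

    /* The intrinsic's final fpm_t argument becomes a write to FPMR, so
       without the conditional-write sequence every iteration of the
       loop below would re-execute "msr fpmr" with an unchanged value.  */
    float32x2_t
    fp8_kernel (float32x2_t acc, mfloat8x8_t a, mfloat8x8_t b,
                fpm_t fpmr_value)
    {
      return vdot_f32_mf8_fpm (acc, a, b, fpmr_value);
    }

    void
    fp8_loop (float32x2_t *acc, const mfloat8x8_t *a, const mfloat8x8_t *b,
              int n, fpm_t fpmr_value)
    {
      for (int i = 0; i < n; i++)
        acc[i] = fp8_kernel (acc[i], a[i], b[i], fpmr_value);
    }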
> This patch implements that.  It also adds a tuning flag that suppresses
> the behaviour, both to make testing easier and to support any future
> cores that (for example) are able to rename FPMR.
> 
> Hopefully this really is the last part of the FP8 enablement.
> 
> Tested on aarch64-linux-gnu.  I'll push in about 24 hours
> if there are no comments before then.
> 
> Richard
> 
> 
> gcc/
> 	* config/aarch64/aarch64-tuning-flags.def
> 	(AARCH64_EXTRA_TUNE_CHEAP_FPMR_WRITE): New tuning flag.
> 	* config/aarch64/aarch64.h (TARGET_CHEAP_FPMR_WRITE): New macro.
> 	* config/aarch64/aarch64.md: Split moves into FPMR into a test
> 	and branch around.
> 	(aarch64_write_fpmr): New pattern.
> 
> gcc/testsuite/
> 	* g++.target/aarch64/sve2/acle/aarch64-sve2-acle-asm.exp: Add
> 	cheap_fpmr_write by default.
> 	* gcc.target/aarch64/sve2/acle/aarch64-sve2-acle-asm.exp: Likewise.
> 	* gcc.target/aarch64/acle/fp8.c: Add cheap_fpmr_write.
> 	* gcc.target/aarch64/acle/fpmr-2.c: Likewise.
> 	* gcc.target/aarch64/simd/vcvt_fpm.c: Likewise.
> 	* gcc.target/aarch64/simd/vdot2_fpm.c: Likewise.
> 	* gcc.target/aarch64/simd/vdot4_fpm.c: Likewise.
> 	* gcc.target/aarch64/simd/vmla_fpm.c: Likewise.
> 	* gcc.target/aarch64/acle/fpmr-6.c: New test.
> ---
>  gcc/config/aarch64/aarch64-tuning-flags.def | 15 +++++++
>  gcc/config/aarch64/aarch64.h                |  5 +++
>  gcc/config/aarch64/aarch64.md               | 39 +++++++++++++++++++
>  .../sve2/acle/aarch64-sve2-acle-asm.exp     |  2 +-
>  gcc/testsuite/gcc.target/aarch64/acle/fp8.c |  2 +-
>  .../gcc.target/aarch64/acle/fpmr-2.c        |  2 +-
>  .../gcc.target/aarch64/acle/fpmr-6.c        | 36 +++++++++++++++++
>  .../gcc.target/aarch64/simd/vcvt_fpm.c      |  2 +-
>  .../gcc.target/aarch64/simd/vdot2_fpm.c     |  2 +-
>  .../gcc.target/aarch64/simd/vdot4_fpm.c     |  2 +-
>  .../gcc.target/aarch64/simd/vmla_fpm.c      |  2 +-
>  .../sve2/acle/aarch64-sve2-acle-asm.exp     |  2 +-
>  12 files changed, 103 insertions(+), 8 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/acle/fpmr-6.c
> 
> diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def b/gcc/config/aarch64/aarch64-tuning-flags.def
> index 60967aac903..7a67d6197d9 100644
> --- a/gcc/config/aarch64/aarch64-tuning-flags.def
> +++ b/gcc/config/aarch64/aarch64-tuning-flags.def
> @@ -48,6 +48,21 @@ AARCH64_EXTRA_TUNING_OPTION ("fully_pipelined_fma", FULLY_PIPELINED_FMA)
>     rather than re-use an input predicate register.  */
>  AARCH64_EXTRA_TUNING_OPTION ("avoid_pred_rmw", AVOID_PRED_RMW)
>  
> +/* Whether writes to the FPMR are cheap enough that:
> +
> +     msr fpmr, <value>
> +
> +   is better than:
> +
> +     mrs tmp, fpmr
> +     cmp tmp, <value>
> +     beq 1f
> +     nsr fpmr, <value>
> +   1:
> +
> +   even when the branch is predictably taken.  */
> +AARCH64_EXTRA_TUNING_OPTION ("cheap_fpmr_write", CHEAP_FPMR_WRITE)
> +
>  /* Baseline tuning settings suitable for all modern cores.  */
>  #define AARCH64_EXTRA_TUNE_BASE (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND \
>  				 | AARCH64_EXTRA_TUNE_FULLY_PIPELINED_FMA)
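As an aside: a future core that does rename FPMR would presumably opt in
through the extra_tuning_flags field of its tune_params entry in
aarch64.cc, along these lines (a hypothetical fragment, not part of this
patch):

    /* Hypothetical tuning entry for a core with cheap FPMR writes.  */
    (AARCH64_EXTRA_TUNE_BASE
     | AARCH64_EXTRA_TUNE_CHEAP_FPMR_WRITE),	/* extra_tuning_flags.  */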
> diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
> index 218868a5246..5cbf442130b 100644
> --- a/gcc/config/aarch64/aarch64.h
> +++ b/gcc/config/aarch64/aarch64.h
> @@ -486,6 +486,11 @@ constexpr auto AARCH64_FL_DEFAULT_ISA_MODE ATTRIBUTE_UNUSED
>  /* fp8 instructions are enabled through +fp8.  */
>  #define TARGET_FP8 AARCH64_HAVE_ISA (FP8)
>  
> +/* See the comment above the tuning flag for details.  */
> +#define TARGET_CHEAP_FPMR_WRITE \
> +  (bool (aarch64_tune_params.extra_tuning_flags \
> +	 & AARCH64_EXTRA_TUNE_CHEAP_FPMR_WRITE))
> +
>  /* Combinatorial tests.  */
>  
>  #define TARGET_SVE2_OR_SME2 \
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index 776c4c4ceee..071058dbeb3 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -356,6 +356,7 @@ (define_c_enum "unspec" [
>      UNSPEC_UPDATE_FFRT
>      UNSPEC_RDFFR
>      UNSPEC_WRFFR
> +    UNSPEC_WRITE_FPMR
>      UNSPEC_SYSREG_RDI
>      UNSPEC_SYSREG_RTI
>      UNSPEC_SYSREG_WDI
> @@ -1883,6 +1884,44 @@ (define_split
>  }
>  )
>  
> +;; The preferred way of writing to the FPMR is to test whether it already
> +;; has the desired value and branch around the write if so.  This reduces
> +;; the number of redundant FPMR writes caused by ABI boundaries, such as in:
> +;;
> +;;   for (...)
> +;;     fp8_kernel (..., fpmr_value);
> +;;
> +;; Without this optimization, fp8_kernel would set FPMR to fpmr_value each
> +;; time that it is called.
> +;;
> +;; We do this as a split so that hardreg_pre can optimize the moves first.
> +(define_split
> +  [(set (reg:DI FPM_REGNUM)
> +	(match_operand:DI 0 "aarch64_reg_or_zero"))]
> +  "TARGET_FP8 && !TARGET_CHEAP_FPMR_WRITE && can_create_pseudo_p ()"
> +  [(const_int 0)]
> +  {
> +    auto label = gen_label_rtx ();
> +    rtx current = copy_to_reg (gen_rtx_REG (DImode, FPM_REGNUM));
> +    rtx cond = gen_rtx_EQ (VOIDmode, current, operands[0]);
> +    emit_jump_insn (gen_cbranchdi4 (cond, current, operands[0], label));

Do you think it’s worth marking this jump as likely?  In some other expand
code in the backend where we emit jumps, we sometimes use
aarch64_emit_unlikely_jump.
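For concreteness, one possible shape (a sketch only; it assumes the
generic add_reg_br_prob_note from predict.h is usable here, since this
branch is expected to be taken rather than unlikely):

    /* Sketch: mark the branch around the FPMR write as predictably taken.
       Whether very_likely is the right probability is a judgment call.  */
    rtx_insn *jump
      = emit_jump_insn (gen_cbranchdi4 (cond, current, operands[0], label));
    add_reg_br_prob_note (jump, profile_probability::very_likely ());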
Thanks,
Kyrill

> +    emit_insn (gen_aarch64_write_fpmr (operands[0]));
> +    emit_label (label);
> +    DONE;
> +  }
> +)
> +
> +;; A write to the FPMR that is already protected by a conditional branch.
> +;; Since this instruction is introduced late, it shouldn't matter too much
> +;; that we're using an unspec for a move.
> +(define_insn "aarch64_write_fpmr"
> +  [(set (reg:DI FPM_REGNUM)
> +	(unspec:DI [(match_operand:DI 0 "aarch64_reg_or_zero" "rZ")]
> +		   UNSPEC_WRITE_FPMR))]
> +  "TARGET_FP8"
> +  "msr\tfpmr, %x0"
> +)
> +
>  (define_expand "aarch64_cpymemdi"
>    [(parallel
>       [(set (match_operand 2) (const_int 0))
> diff --git a/gcc/testsuite/g++.target/aarch64/sve2/acle/aarch64-sve2-acle-asm.exp b/gcc/testsuite/g++.target/aarch64/sve2/acle/aarch64-sve2-acle-asm.exp
> index 4323e5f62ae..7fc33e99b05 100644
> --- a/gcc/testsuite/g++.target/aarch64/sve2/acle/aarch64-sve2-acle-asm.exp
> +++ b/gcc/testsuite/g++.target/aarch64/sve2/acle/aarch64-sve2-acle-asm.exp
> @@ -39,7 +39,7 @@ if { [check_effective_target_aarch64_sve2] } {
>  
>  # Turn off any codegen tweaks by default that may affect expected assembly.
>  # Tests relying on those should turn them on explicitly.
> -set sve2_flags "$sve2_flags -mtune=generic -moverride=tune=none"
> +set sve2_flags "$sve2_flags -mtune=generic -moverride=tune=none -moverride=tune=cheap_fpmr_write"
>  
>  set gcc_subdir [string replace $subdir 0 2 gcc]
>  lappend extra_flags "-fno-ipa-icf" "-I$srcdir/$gcc_subdir/../../sve/acle/asm"
> diff --git a/gcc/testsuite/gcc.target/aarch64/acle/fp8.c b/gcc/testsuite/gcc.target/aarch64/acle/fp8.c
> index 63f88e24dfb..f0e7035ffc0 100644
> --- a/gcc/testsuite/gcc.target/aarch64/acle/fp8.c
> +++ b/gcc/testsuite/gcc.target/aarch64/acle/fp8.c
> @@ -1,6 +1,6 @@
>  /* Test the fp8 ACLE intrinsics family.  */
>  /* { dg-do compile } */
> -/* { dg-options "-O1 -march=armv8-a" } */
> +/* { dg-options "-O1 -march=armv8-a -moverride=tune=cheap_fpmr_write" } */
>  /* { dg-final { check-function-bodies "**" "" "" } } */
>  
>  #include <arm_acle.h>
> diff --git a/gcc/testsuite/gcc.target/aarch64/acle/fpmr-2.c b/gcc/testsuite/gcc.target/aarch64/acle/fpmr-2.c
> index c5b255b0a9a..79a9535126e 100644
> --- a/gcc/testsuite/gcc.target/aarch64/acle/fpmr-2.c
> +++ b/gcc/testsuite/gcc.target/aarch64/acle/fpmr-2.c
> @@ -1,5 +1,5 @@
>  /* { dg-do compile } */
> -/* { dg-options "-O1 -march=armv8-a+fp8fma" } */
> +/* { dg-options "-O1 -march=armv8-a+fp8fma -moverride=tune=cheap_fpmr_write" } */
>  
>  #include <arm_neon.h>
>  
> diff --git a/gcc/testsuite/gcc.target/aarch64/acle/fpmr-6.c b/gcc/testsuite/gcc.target/aarch64/acle/fpmr-6.c
> new file mode 100644
> index 00000000000..6a00e017af9
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/acle/fpmr-6.c
> @@ -0,0 +1,36 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-O3 -march=armv9-a+fp8dot4 -moverride=tune=none" } */
> +/* { dg-final { check-function-bodies "**" "" "" { target *-*-* } {\.L[0-9]+} } } */
> +
> +#include "arm_neon.h"
> +
> +/*
> +** f1:
> +**	mrs	(x[0-9]+), fpmr
> +**	cmp	\1, x0
> +**	beq	([^\n]+)
> +**	msr	fpmr, x0
> +** ?\2:
> +**	fdot	v0.2s, v1.8b, v2.8b
> +**	ret
> +*/
> +float32x2_t
> +f1 (float32x2_t a, mfloat8x8_t b, mfloat8x8_t c, fpm_t d)
> +{
> +  return vdot_f32_mf8_fpm (a, b, c, d);
> +}
> +
> +/*
> +** f2:
> +**	mrs	(x[0-9]+), fpmr
> +**	cbz	\1, ([^\n]+)
> +**	msr	fpmr, xzr
> +** ?\2:
> +**	fdot	v0.2s, v1.8b, v2.8b
> +**	ret
> +*/
> +float32x2_t
> +f2 (float32x2_t a, mfloat8x8_t b, mfloat8x8_t c)
> +{
> +  return vdot_f32_mf8_fpm (a, b, c, 0);
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vcvt_fpm.c b/gcc/testsuite/gcc.target/aarch64/simd/vcvt_fpm.c
> index 39076684345..29dece61d4a 100644
> --- a/gcc/testsuite/gcc.target/aarch64/simd/vcvt_fpm.c
> +++ b/gcc/testsuite/gcc.target/aarch64/simd/vcvt_fpm.c
> @@ -1,5 +1,5 @@
>  /* { dg-do compile } */
> -/* { dg-additional-options "-O3 -march=armv9-a+fp8" } */
> +/* { dg-additional-options "-O3 -march=armv9-a+fp8 -moverride=tune=cheap_fpmr_write" } */
>  /* { dg-final { check-function-bodies "**" "" } } */
>  
>  #include "arm_neon.h"
> diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vdot2_fpm.c b/gcc/testsuite/gcc.target/aarch64/simd/vdot2_fpm.c
> index 5fe139106c6..07decd71926 100644
> --- a/gcc/testsuite/gcc.target/aarch64/simd/vdot2_fpm.c
> +++ b/gcc/testsuite/gcc.target/aarch64/simd/vdot2_fpm.c
> @@ -1,5 +1,5 @@
>  /* { dg-do compile } */
> -/* { dg-additional-options "-O3 -march=armv9-a+fp8dot2" } */
> +/* { dg-additional-options "-O3 -march=armv9-a+fp8dot2 -moverride=tune=cheap_fpmr_write" } */
>  /* { dg-final { check-function-bodies "**" "" } } */
>  
>  #include "arm_neon.h"
> diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vdot4_fpm.c b/gcc/testsuite/gcc.target/aarch64/simd/vdot4_fpm.c
> index e47a737e8b5..27c1d38434f 100644
> --- a/gcc/testsuite/gcc.target/aarch64/simd/vdot4_fpm.c
> +++ b/gcc/testsuite/gcc.target/aarch64/simd/vdot4_fpm.c
> @@ -1,5 +1,5 @@
>  /* { dg-do compile } */
> -/* { dg-additional-options "-O3 -march=armv9-a+fp8dot4" } */
> +/* { dg-additional-options "-O3 -march=armv9-a+fp8dot4 -moverride=tune=cheap_fpmr_write" } */
>  /* { dg-final { check-function-bodies "**" "" } } */
>  
>  #include "arm_neon.h"
> diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vmla_fpm.c b/gcc/testsuite/gcc.target/aarch64/simd/vmla_fpm.c
> index 51b47055ca2..8e5835af5a2 100644
> --- a/gcc/testsuite/gcc.target/aarch64/simd/vmla_fpm.c
> +++ b/gcc/testsuite/gcc.target/aarch64/simd/vmla_fpm.c
> @@ -1,5 +1,5 @@
>  /* { dg-do compile } */
> -/* { dg-additional-options "-O3 -march=armv9-a+fp8fma" } */
> +/* { dg-additional-options "-O3 -march=armv9-a+fp8fma -moverride=tune=cheap_fpmr_write" } */
>  /* { dg-final { check-function-bodies "**" "" } } */
>  
>  #include "arm_neon.h"
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/acle/aarch64-sve2-acle-asm.exp b/gcc/testsuite/gcc.target/aarch64/sve2/acle/aarch64-sve2-acle-asm.exp
> index 69a3a1786f2..e950f8613da 100644
> --- a/gcc/testsuite/gcc.target/aarch64/sve2/acle/aarch64-sve2-acle-asm.exp
> +++ b/gcc/testsuite/gcc.target/aarch64/sve2/acle/aarch64-sve2-acle-asm.exp
> @@ -39,7 +39,7 @@ if { [check_effective_target_aarch64_sve2] } {
>  
>  # Turn off any codegen tweaks by default that may affect expected assembly.
>  # Tests relying on those should turn them on explicitly.
> -set sve2_flags "$sve2_flags -mtune=generic -moverride=tune=none"
> +set sve2_flags "$sve2_flags -mtune=generic -moverride=tune=none -moverride=tune=cheap_fpmr_write"
>  
>  lappend extra_flags "-fno-ipa-icf"
> 
> -- 
> 2.25.1
> 