When FPCR.AH is 1, the behaviour of some instructions changes: * AdvSIMD BFCVT, BFCVTN, BFCVTN2, BFMLALB, BFMLALT * SVE BFCVT, BFCVTNT, BFMLALB, BFMLALT, BFMLSLB, BFMLSLT * SME BFCVT, BFCVTN, BFMLAL, BFMLSL (these are all in SME2 which QEMU does not yet implement) * FRECPE, FRECPS, FRECPX, FRSQRTE, FRSQRTS
The behaviour change is: * the instructions do not update the FPSR cumulative exception flags * trapped floating point exceptions are disabled (a no-op for QEMU, which doesn't implement FPCR.{IDE,IXE,UFE,OFE,DZE,IOE}) * rounding is always round-to-nearest-even regardless of FPCR.RMode * denormalized inputs and outputs are always flushed to zero, as if FPCR.{FZ,FIZ} is {1,1} * FPCR.FZ16 is still honoured for half-precision inputs (See the Arm ARM DDI0487L.a section A1.5.9.) We can provide all these behaviours with another pair of float_status fields which we use only for these insns, when FPCR.AH is 1. These float_status fields will always have: * flush_to_zero and flush_inputs_to_zero set for the non-F16 field * rounding mode set to round-to-nearest-even and so the only FPCR fields they need to honour are DN and FZ16. In this commit we only define the new fp_status fields and give them the required behaviour when FPSR is updated. In subsequent commits we will arrange to use this new fp_status field for the instructions that should be affected by FPCR.AH in this way. Signed-off-by: Peter Maydell <peter.mayd...@linaro.org> --- I'm not super enthusiastic about the ah_fp_status naming, which sort of suggests it's always to be used when AH=1, rather than "for this specific group of insns when AH=1". But I couldn't think of a better name that was still reasonably short... --- target/arm/cpu.h | 15 +++++++++++++++ target/arm/internals.h | 2 ++ target/arm/tcg/translate.h | 14 ++++++++++++++ target/arm/cpu.c | 4 ++++ target/arm/vfp_helper.c | 13 ++++++++++++- 5 files changed, 47 insertions(+), 1 deletion(-) diff --git a/target/arm/cpu.h b/target/arm/cpu.h index c8b44c725d0..cfb16151577 100644 --- a/target/arm/cpu.h +++ b/target/arm/cpu.h @@ -640,6 +640,13 @@ typedef struct CPUArchState { * standard_fp_status : the ARM "Standard FPSCR Value" * standard_fp_status_fp16 : used for half-precision * calculations with the ARM "Standard FPSCR Value" + * ah_fp_status: used for the A64 insns which change behaviour + * when FPCR.AH == 1 (bfloat16 conversions and multiplies, + * and the reciprocal and square root estimate/step insns) + * ah_fp_status_f16: used for the A64 insns which change behaviour + * when FPCR.AH == 1 (bfloat16 conversions and multiplies, + * and the reciprocal and square root estimate/step insns); + * for half-precision * * Half-precision operations are governed by a separate * flush-to-zero control bit in FPSCR:FZ16. We pass a separate @@ -654,6 +661,12 @@ typedef struct CPUArchState { * the "standard FPSCR" tracks the FPSCR.FZ16 bit rather than * using a fixed value for it. * + * The ah_fp_status is needed because some insns have different + * behaviour when FPCR.AH == 1: they don't update cumulative + * exception flags, they act like FPCR.{FZ,FIZ} = {1,1} and + * they ignore FPCR.RMode. But they don't ignore FPCR.FZ16, + * which means we need an ah_fp_status_f16 as well. + * * To avoid having to transfer exception bits around, we simply * say that the FPSCR cumulative exception flags are the logical * OR of the flags in the four fp statuses. This relies on the @@ -666,6 +679,8 @@ typedef struct CPUArchState { float_status fp_status_f16_a64; float_status standard_fp_status; float_status standard_fp_status_f16; + float_status ah_fp_status; + float_status ah_fp_status_f16; uint64_t zcr_el[4]; /* ZCR_EL[1-3] */ uint64_t smcr_el[4]; /* SMCR_EL[1-3] */ diff --git a/target/arm/internals.h b/target/arm/internals.h index 98073acc276..b3187341456 100644 --- a/target/arm/internals.h +++ b/target/arm/internals.h @@ -1831,5 +1831,7 @@ int alle1_tlbmask(CPUARMState *env); /* Set the float_status behaviour to match the Arm defaults */ void arm_set_default_fp_behaviours(float_status *s); +/* Set the float_status behaviour to match Arm FPCR.AH=1 behaviour */ +void arm_set_ah_fp_behaviours(float_status *s); #endif diff --git a/target/arm/tcg/translate.h b/target/arm/tcg/translate.h index c37c0b539e2..d6edd8db76b 100644 --- a/target/arm/tcg/translate.h +++ b/target/arm/tcg/translate.h @@ -676,6 +676,8 @@ typedef enum ARMFPStatusFlavour { FPST_FPCR_A64, FPST_FPCR_F16_A32, FPST_FPCR_F16_A64, + FPST_FPCR_AH, + FPST_FPCR_AH_F16, FPST_STD, FPST_STD_F16, } ARMFPStatusFlavour; @@ -696,6 +698,12 @@ typedef enum ARMFPStatusFlavour { * for AArch32 operations controlled by the FPCR where FPCR.FZ16 is to be used * FPST_FPCR_F16_A64 * for AArch64 operations controlled by the FPCR where FPCR.FZ16 is to be used + * FPST_FPCR_AH: + * for AArch64 operations which change behaviour when AH=1 (specifically, + * bfloat16 conversions and multiplies, and the reciprocal and square root + * estimate/step insns) + * FPST_FPCR_AH_F16: + * ditto, but for half-precision operations * FPST_STD * for A32/T32 Neon operations using the "standard FPSCR value" * FPST_STD_F16 @@ -719,6 +727,12 @@ static inline TCGv_ptr fpstatus_ptr(ARMFPStatusFlavour flavour) case FPST_FPCR_F16_A64: offset = offsetof(CPUARMState, vfp.fp_status_f16_a64); break; + case FPST_FPCR_AH: + offset = offsetof(CPUARMState, vfp.ah_fp_status); + break; + case FPST_FPCR_AH_F16: + offset = offsetof(CPUARMState, vfp.ah_fp_status_f16); + break; case FPST_STD: offset = offsetof(CPUARMState, vfp.standard_fp_status); break; diff --git a/target/arm/cpu.c b/target/arm/cpu.c index 1ba22c4c7aa..8fa220a7165 100644 --- a/target/arm/cpu.c +++ b/target/arm/cpu.c @@ -556,6 +556,10 @@ static void arm_cpu_reset_hold(Object *obj, ResetType type) arm_set_default_fp_behaviours(&env->vfp.fp_status_f16_a32); arm_set_default_fp_behaviours(&env->vfp.fp_status_f16_a64); arm_set_default_fp_behaviours(&env->vfp.standard_fp_status_f16); + arm_set_ah_fp_behaviours(&env->vfp.ah_fp_status); + set_flush_to_zero(1, &env->vfp.ah_fp_status); + set_flush_inputs_to_zero(1, &env->vfp.ah_fp_status); + arm_set_ah_fp_behaviours(&env->vfp.ah_fp_status_f16); #ifndef CONFIG_USER_ONLY if (kvm_enabled()) { diff --git a/target/arm/vfp_helper.c b/target/arm/vfp_helper.c index 2eb75bd7ecc..50a8a659577 100644 --- a/target/arm/vfp_helper.c +++ b/target/arm/vfp_helper.c @@ -64,7 +64,7 @@ void arm_set_default_fp_behaviours(float_status *s) * set Invalid for a QNaN * * default NaN has sign bit set, msb frac bit set */ -static void arm_set_ah_fp_behaviours(float_status *s) +void arm_set_ah_fp_behaviours(float_status *s) { set_float_detect_tininess(float_tininess_after_rounding, s); set_float_detect_ftz(detect_ftz_after_rounding, s); @@ -128,6 +128,11 @@ static uint32_t vfp_get_fpsr_from_host(CPUARMState *env) a64_flags |= get_float_exception_flags(&env->vfp.fp_status_a64); a64_flags |= (get_float_exception_flags(&env->vfp.fp_status_f16_a64) & ~(float_flag_input_denormal_flushed | float_flag_input_denormal_used)); + /* + * We do not merge in flags from ah_fp_status or ah_fp_status_f16, because + * they are used for insns that must not set the cumulative exception bits. + */ + /* * Flushing an input denormal only because FPCR.FIZ == 1 does * not set FPSR.IDC. So squash it unless (FPCR.AH == 0 && FPCR.FZ == 1). @@ -154,6 +159,8 @@ static void vfp_clear_float_status_exc_flags(CPUARMState *env) set_float_exception_flags(0, &env->vfp.fp_status_f16_a64); set_float_exception_flags(0, &env->vfp.standard_fp_status); set_float_exception_flags(0, &env->vfp.standard_fp_status_f16); + set_float_exception_flags(0, &env->vfp.ah_fp_status); + set_float_exception_flags(0, &env->vfp.ah_fp_status_f16); } static void vfp_sync_and_clear_float_status_exc_flags(CPUARMState *env) @@ -199,9 +206,11 @@ static void vfp_set_fpcr_to_host(CPUARMState *env, uint32_t val, uint32_t mask) set_flush_to_zero(ftz_enabled, &env->vfp.fp_status_f16_a32); set_flush_to_zero(ftz_enabled, &env->vfp.fp_status_f16_a64); set_flush_to_zero(ftz_enabled, &env->vfp.standard_fp_status_f16); + set_flush_to_zero(ftz_enabled, &env->vfp.ah_fp_status_f16); set_flush_inputs_to_zero(ftz_enabled, &env->vfp.fp_status_f16_a32); set_flush_inputs_to_zero(ftz_enabled, &env->vfp.fp_status_f16_a64); set_flush_inputs_to_zero(ftz_enabled, &env->vfp.standard_fp_status_f16); + set_flush_inputs_to_zero(ftz_enabled, &env->vfp.ah_fp_status_f16); } if (changed & FPCR_FZ) { bool ftz_enabled = val & FPCR_FZ; @@ -225,6 +234,8 @@ static void vfp_set_fpcr_to_host(CPUARMState *env, uint32_t val, uint32_t mask) set_default_nan_mode(dnan_enabled, &env->vfp.fp_status_a64); set_default_nan_mode(dnan_enabled, &env->vfp.fp_status_f16_a32); set_default_nan_mode(dnan_enabled, &env->vfp.fp_status_f16_a64); + set_default_nan_mode(dnan_enabled, &env->vfp.ah_fp_status); + set_default_nan_mode(dnan_enabled, &env->vfp.ah_fp_status_f16); } if (changed & FPCR_AH) { bool ah_enabled = val & FPCR_AH; -- 2.34.1