Signed-off-by: Richard Henderson <richard.hender...@linaro.org>
---
 target/arm/tcg/vec_helper.c | 71 ++++++++++++++++++++++++-------------
 1 file changed, 46 insertions(+), 25 deletions(-)

diff --git a/target/arm/tcg/vec_helper.c b/target/arm/tcg/vec_helper.c
index b3ed6533bb..9b14885ef2 100644
--- a/target/arm/tcg/vec_helper.c
+++ b/target/arm/tcg/vec_helper.c
@@ -2126,27 +2126,24 @@ static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2)
  */
 
 static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst,
-                     uint32_t desc, bool fz16)
+                     uint64_t negx, int negf, uint32_t desc, bool fz16)
 {
     intptr_t i, oprsz = simd_oprsz(desc);
-    int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
     int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
     int is_q = oprsz == 16;
     uint64_t n_4, m_4;
 
-    /* Pre-load all of the f16 data, avoiding overlap issues. */
-    n_4 = load4_f16(vn, is_q, is_2);
+    /*
+     * Pre-load all of the f16 data, avoiding overlap issues.
+     * Negate all inputs for AH=0 FMLSL at once.
+     */
+    n_4 = load4_f16(vn, is_q, is_2) ^ negx;
     m_4 = load4_f16(vm, is_q, is_2);
 
-    /* Negate all inputs for FMLSL at once. */
-    if (is_s) {
-        n_4 ^= 0x8000800080008000ull;
-    }
-
     for (i = 0; i < oprsz / 4; i++) {
         float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
         float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16);
-        d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
+        d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], negf, fpst);
     }
     clear_tail(d, oprsz, simd_maxsz(desc));
 }
@@ -2154,14 +2151,28 @@ static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst,
 void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm,
                             CPUARMState *env, uint32_t desc)
 {
-    do_fmlal(vd, vn, vm, &env->vfp.fp_status[FPST_STD], desc,
+    bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
+    uint64_t negx = is_s ? 0x8000800080008000ull : 0;
+
+    do_fmlal(vd, vn, vm, &env->vfp.fp_status[FPST_STD], negx, 0, desc,
              get_flush_inputs_to_zero(&env->vfp.fp_status[FPST_A32_F16]));
 }
 
 void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm,
                             CPUARMState *env, uint32_t desc)
 {
-    do_fmlal(vd, vn, vm, &env->vfp.fp_status[FPST_A64], desc,
+    bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
+    uint64_t negx = 0;
+    int negf = 0;
+
+    if (is_s) {
+        if (env->vfp.fpcr & FPCR_AH) {
+            negf = float_muladd_negate_product;
+        } else {
+            negx = 0x8000800080008000ull;
+        }
+    }
+    do_fmlal(vd, vn, vm, &env->vfp.fp_status[FPST_A64], negx, negf, desc,
              get_flush_inputs_to_zero(&env->vfp.fp_status[FPST_A64_F16]));
 }
 
@@ -2186,29 +2197,25 @@ void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va,
 }
 
 static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst,
-                         uint32_t desc, bool fz16)
+                         uint64_t negx, int negf, uint32_t desc, bool fz16)
 {
     intptr_t i, oprsz = simd_oprsz(desc);
-    int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
     int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
     int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3);
     int is_q = oprsz == 16;
     uint64_t n_4;
     float32 m_1;
 
-    /* Pre-load all of the f16 data, avoiding overlap issues. */
-    n_4 = load4_f16(vn, is_q, is_2);
-
-    /* Negate all inputs for FMLSL at once. */
-    if (is_s) {
-        n_4 ^= 0x8000800080008000ull;
-    }
-
+    /*
+     * Pre-load all of the f16 data, avoiding overlap issues.
+     * Negate all inputs for AH=0 FMLSL at once.
+     */
+    n_4 = load4_f16(vn, is_q, is_2) ^ negx;
     m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16);
 
     for (i = 0; i < oprsz / 4; i++) {
         float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
-        d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
+        d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], negf, fpst);
     }
     clear_tail(d, oprsz, simd_maxsz(desc));
 }
@@ -2216,14 +2223,28 @@ static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst,
 void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm,
                                 CPUARMState *env, uint32_t desc)
 {
-    do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status[FPST_STD], desc,
+    bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
+    uint64_t negx = is_s ? 0x8000800080008000ull : 0;
+
+    do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status[FPST_STD], negx, 0, desc,
                  get_flush_inputs_to_zero(&env->vfp.fp_status[FPST_A32_F16]));
 }
 
 void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm,
                                 CPUARMState *env, uint32_t desc)
 {
-    do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status[FPST_A64], desc,
+    bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
+    uint64_t negx = 0;
+    int negf = 0;
+
+    if (is_s) {
+        if (env->vfp.fpcr & FPCR_AH) {
+            negf = float_muladd_negate_product;
+        } else {
+            negx = 0x8000800080008000ull;
+        }
+    }
+    do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status[FPST_A64], negx, negf, desc,
                  get_flush_inputs_to_zero(&env->vfp.fp_status[FPST_A64_F16]));
 }
 
-- 
2.43.0