Signed-off-by: Richard Henderson <richard.hender...@linaro.org> --- target/arm/tcg/translate-a64.c | 2 +- target/arm/tcg/vec_helper.c | 44 ++++++++++++++++++++-------------- 2 files changed, 27 insertions(+), 19 deletions(-)
diff --git a/target/arm/tcg/translate-a64.c b/target/arm/tcg/translate-a64.c index 3748f7d145..9e751e737a 100644 --- a/target/arm/tcg/translate-a64.c +++ b/target/arm/tcg/translate-a64.c @@ -6922,7 +6922,7 @@ static bool trans_FCMLA_vi(DisasContext *s, arg_FCMLA_vi *a) if (fp_access_check(s)) { gen_gvec_op4_fpst(s, a->q, a->rd, a->rn, a->rm, a->rd, a->esz == MO_16 ? FPST_A64_F16 : FPST_A64, - (a->idx << 2) | a->rot, fn); + (s->fpcr_ah << 4) | (a->idx << 2) | a->rot, fn); } return true; } diff --git a/target/arm/tcg/vec_helper.c b/target/arm/tcg/vec_helper.c index 76637d072d..964000773a 100644 --- a/target/arm/tcg/vec_helper.c +++ b/target/arm/tcg/vec_helper.c @@ -995,29 +995,33 @@ void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, void *va, uintptr_t opr_sz = simd_oprsz(desc); float16 *d = vd, *n = vn, *m = vm, *a = va; intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); - uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); + uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2); - uint32_t neg_real = flip ^ neg_imag; + uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 4, 1); + uint32_t negf_real = flip ^ negf_imag; intptr_t elements = opr_sz / sizeof(float16); intptr_t eltspersegment = MIN(16 / sizeof(float16), elements); + float16 negx_imag, negx_real; intptr_t i, j; - /* Shift boolean to the sign bit so we can xor to negate. */ - neg_real <<= 15; - neg_imag <<= 15; + /* With AH=0, use negx; with AH=1 use negf. */ + negx_real = (negf_real & ~fpcr_ah) << 15; + negx_imag = (negf_imag & ~fpcr_ah) << 15; + negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0); + negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0); for (i = 0; i < elements; i += eltspersegment) { float16 mr = m[H2(i + 2 * index + 0)]; float16 mi = m[H2(i + 2 * index + 1)]; - float16 e1 = neg_real ^ (flip ? mi : mr); - float16 e3 = neg_imag ^ (flip ? mr : mi); + float16 e1 = negx_real ^ (flip ? mi : mr); + float16 e3 = negx_imag ^ (flip ? mr : mi); for (j = i; j < i + eltspersegment; j += 2) { float16 e2 = n[H2(j + flip)]; float16 e4 = e2; - d[H2(j)] = float16_muladd(e2, e1, a[H2(j)], 0, fpst); - d[H2(j + 1)] = float16_muladd(e4, e3, a[H2(j + 1)], 0, fpst); + d[H2(j)] = float16_muladd(e2, e1, a[H2(j)], negf_real, fpst); + d[H2(j + 1)] = float16_muladd(e4, e3, a[H2(j + 1)], negf_imag, fpst); } } clear_tail(d, opr_sz, simd_maxsz(desc)); @@ -1059,29 +1063,33 @@ void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, void *va, uintptr_t opr_sz = simd_oprsz(desc); float32 *d = vd, *n = vn, *m = vm, *a = va; intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); - uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); + uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2); - uint32_t neg_real = flip ^ neg_imag; + uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 4, 1); + uint32_t negf_real = flip ^ negf_imag; intptr_t elements = opr_sz / sizeof(float32); intptr_t eltspersegment = MIN(16 / sizeof(float32), elements); + float32 negx_imag, negx_real; intptr_t i, j; - /* Shift boolean to the sign bit so we can xor to negate. */ - neg_real <<= 31; - neg_imag <<= 31; + /* With AH=0, use negx; with AH=1 use negf. */ + negx_real = (negf_real & ~fpcr_ah) << 31; + negx_imag = (negf_imag & ~fpcr_ah) << 31; + negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0); + negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0); for (i = 0; i < elements; i += eltspersegment) { float32 mr = m[H4(i + 2 * index + 0)]; float32 mi = m[H4(i + 2 * index + 1)]; - float32 e1 = neg_real ^ (flip ? mi : mr); - float32 e3 = neg_imag ^ (flip ? mr : mi); + float32 e1 = negx_real ^ (flip ? mi : mr); + float32 e3 = negx_imag ^ (flip ? mr : mi); for (j = i; j < i + eltspersegment; j += 2) { float32 e2 = n[H4(j + flip)]; float32 e4 = e2; - d[H4(j)] = float32_muladd(e2, e1, a[H4(j)], 0, fpst); - d[H4(j + 1)] = float32_muladd(e4, e3, a[H4(j + 1)], 0, fpst); + d[H4(j)] = float32_muladd(e2, e1, a[H4(j)], negf_real, fpst); + d[H4(j + 1)] = float32_muladd(e4, e3, a[H4(j + 1)], negf_imag, fpst); } } clear_tail(d, opr_sz, simd_maxsz(desc)); -- 2.43.0