On Fri, 30 Apr 2021 at 21:57, Richard Henderson <richard.hender...@linaro.org> wrote: > > For SVE, we potentially have a 4th argument coming from the > movprfx instruction. Currently we do not optimize movprfx, > so the problem is not visible. > > Signed-off-by: Richard Henderson <richard.hender...@linaro.org> > --- > v4: Fix double addition (zhiwei). > --- > target/arm/helper.h | 20 +++--- > target/arm/sve.decode | 7 ++- > target/arm/translate-a64.c | 15 ++++- > target/arm/translate-neon.c | 10 +-- > target/arm/translate-sve.c | 13 ++-- > target/arm/vec_helper.c | 120 ++++++++++++++++++++---------------- > 6 files changed, 109 insertions(+), 76 deletions(-)
> diff --git a/target/arm/vec_helper.c b/target/arm/vec_helper.c > index a3d80ecad0..f88e572132 100644 > --- a/target/arm/vec_helper.c > +++ b/target/arm/vec_helper.c > @@ -375,71 +375,76 @@ void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void > *vm, > * All elements are treated equally, no matter where they are. > */ > > -void HELPER(gvec_sdot_b)(void *vd, void *vn, void *vm, uint32_t desc) > +void HELPER(gvec_sdot_b)(void *vd, void *vn, void *vm, void *va, uint32_t > desc) > { > intptr_t i, opr_sz = simd_oprsz(desc); > - uint32_t *d = vd; > + int32_t *d = vd, *a = va; Why the type change? > int8_t *n = vn, *m = vm; > > for (i = 0; i < opr_sz / 4; ++i) { > - d[i] += n[i * 4 + 0] * m[i * 4 + 0] > - + n[i * 4 + 1] * m[i * 4 + 1] > - + n[i * 4 + 2] * m[i * 4 + 2] > - + n[i * 4 + 3] * m[i * 4 + 3]; > + d[i] = (a[i] + > + n[i * 4 + 0] * m[i * 4 + 0] + > + n[i * 4 + 1] * m[i * 4 + 1] + > + n[i * 4 + 2] * m[i * 4 + 2] + > + n[i * 4 + 3] * m[i * 4 + 3]); > } > clear_tail(d, opr_sz, simd_maxsz(desc)); > } > -void HELPER(gvec_sdot_h)(void *vd, void *vn, void *vm, uint32_t desc) > +void HELPER(gvec_sdot_h)(void *vd, void *vn, void *vm, void *va, uint32_t > desc) > { > intptr_t i, opr_sz = simd_oprsz(desc); > - uint64_t *d = vd; > + int64_t *d = vd, *a = va; Ditto. 
> int16_t *n = vn, *m = vm; > > for (i = 0; i < opr_sz / 8; ++i) { > - d[i] += (int64_t)n[i * 4 + 0] * m[i * 4 + 0] > - + (int64_t)n[i * 4 + 1] * m[i * 4 + 1] > - + (int64_t)n[i * 4 + 2] * m[i * 4 + 2] > - + (int64_t)n[i * 4 + 3] * m[i * 4 + 3]; > + d[i] = (a[i] + > + (int64_t)n[i * 4 + 0] * m[i * 4 + 0] + > + (int64_t)n[i * 4 + 1] * m[i * 4 + 1] + > + (int64_t)n[i * 4 + 2] * m[i * 4 + 2] + > + (int64_t)n[i * 4 + 3] * m[i * 4 + 3]); > } > clear_tail(d, opr_sz, simd_maxsz(desc)); > } > > -void HELPER(gvec_sdot_idx_b)(void *vd, void *vn, void *vm, uint32_t desc) > +void HELPER(gvec_sdot_idx_b)(void *vd, void *vn, void *vm, > + void *va, uint32_t desc) > { > intptr_t i, segend, opr_sz = simd_oprsz(desc), opr_sz_4 = opr_sz / 4; > intptr_t index = simd_data(desc); > - uint32_t *d = vd; > + int32_t *d = vd, *a = va; > int8_t *n = vn; > int8_t *m_indexed = (int8_t *)vm + H4(index) * 4; Ditto. > -void HELPER(gvec_sdot_idx_h)(void *vd, void *vn, void *vm, uint32_t desc) > +void HELPER(gvec_sdot_idx_h)(void *vd, void *vn, void *vm, > + void *va, uint32_t desc) > { > intptr_t i, opr_sz = simd_oprsz(desc), opr_sz_8 = opr_sz / 8; > intptr_t index = simd_data(desc); > - uint64_t *d = vd; > + int64_t *d = vd, *a = va; > int16_t *n = vn; > int16_t *m_indexed = (int16_t *)vm + index * 4; Ditto. > @@ -509,30 +518,33 @@ void HELPER(gvec_sdot_idx_h)(void *vd, void *vn, void > *vm, uint32_t desc) > * Process the entire segment all at once, writing back the results > * only after we've consumed all of the inputs. > */ > - for (i = 0; i < opr_sz_8 ; i += 2) { > - uint64_t d0, d1; > + for (i = 0; i < opr_sz_8; i += 2) { > + int64_t d0, d1; Ditto. 
> > - d0 = n[i * 4 + 0] * (int64_t)m_indexed[i * 4 + 0]; > + d0 = a[i + 0]; > + d0 += n[i * 4 + 0] * (int64_t)m_indexed[i * 4 + 0]; > d0 += n[i * 4 + 1] * (int64_t)m_indexed[i * 4 + 1]; > d0 += n[i * 4 + 2] * (int64_t)m_indexed[i * 4 + 2]; > d0 += n[i * 4 + 3] * (int64_t)m_indexed[i * 4 + 3]; > - d1 = n[i * 4 + 4] * (int64_t)m_indexed[i * 4 + 0]; > + > + d1 = a[i + 1]; > + d1 += n[i * 4 + 4] * (int64_t)m_indexed[i * 4 + 0]; > d1 += n[i * 4 + 5] * (int64_t)m_indexed[i * 4 + 1]; > d1 += n[i * 4 + 6] * (int64_t)m_indexed[i * 4 + 2]; > d1 += n[i * 4 + 7] * (int64_t)m_indexed[i * 4 + 3]; > > - d[i + 0] += d0; > - d[i + 1] += d1; > + d[i + 0] = d0; > + d[i + 1] = d1; > } > - > clear_tail(d, opr_sz, simd_maxsz(desc)); > } Otherwise Reviewed-by: Peter Maydell <peter.mayd...@linaro.org> thanks -- PMM