On Tue, 28 Jun 2022 at 05:36, Richard Henderson <richard.hender...@linaro.org> wrote: > > Signed-off-by: Richard Henderson <richard.hender...@linaro.org> > --- > v4: Drop restrict. > --- > target/arm/helper-sme.h | 5 +++ > target/arm/sme.decode | 11 +++++ > target/arm/sme_helper.c | 90 ++++++++++++++++++++++++++++++++++++++ > target/arm/translate-sme.c | 31 +++++++++++++ > 4 files changed, 137 insertions(+)
> #undef DO_ST > + > +void HELPER(sme_addha_s)(void *vzda, void *vzn, void *vpn, > + void *vpm, uint32_t desc) > +{ > + intptr_t row, col, oprsz = simd_oprsz(desc) / 4; > + uint64_t *pn = vpn, *pm = vpm; > + uint32_t *zda = vzda, *zn = vzn; > + > + for (row = 0; row < oprsz; ) { > + uint64_t pa = pn[row >> 4]; > + do { > + if (pa & 1) { > + for (col = 0; col < oprsz; ) { > + uint64_t pb = pm[col >> 4]; > + do { > + if (pb & 1) { > + zda[row * sizeof(ARMVectorReg) + col] += zn[col]; > + } > + pb >>= 4; > + } while (++col & 15); > + } > + } > + pa >>= 4; > + } while (++row & 15); > + } > +} > + > +void HELPER(sme_addha_d)(void *vzda, void *vzn, void *vpn, > + void *vpm, uint32_t desc) > +{ > + intptr_t row, col, oprsz = simd_oprsz(desc) / 8; > + uint8_t *pn = vpn, *pm = vpm; > + uint64_t *zda = vzda, *zn = vzn; > + > + for (row = 0; row < oprsz; ++row) { > + if (pn[H1(row)] & 1) { > + for (col = 0; col < oprsz; ++col) { > + if (pm[H1(col)] & 1) { > + zda[row * sizeof(ARMVectorReg) + col] += zn[col]; > + } > + } > + } > + } > +} These array index calculations look wrong again? zda is a pointer to uint64_t here, so the row stride has to be counted in elements rather than bytes, but sizeof(ARMVectorReg) is a byte count. Should be 'row * (sizeof(ARMVectorReg) / 8) + col' or equivalent, I think (and '/ 4' in the _s variants, where zda is a pointer to uint32_t). 
> + > +void HELPER(sme_addva_s)(void *vzda, void *vzn, void *vpn, > + void *vpm, uint32_t desc) > +{ > + intptr_t row, col, oprsz = simd_oprsz(desc) / 4; > + uint64_t *pn = vpn, *pm = vpm; > + uint32_t *zda = vzda, *zn = vzn; > + > + for (row = 0; row < oprsz; ) { > + uint64_t pa = pn[row >> 4]; > + do { > + if (pa & 1) { > + uint32_t zn_row = zn[row]; > + for (col = 0; col < oprsz; ) { > + uint64_t pb = pm[col >> 4]; > + do { > + if (pb & 1) { > + zda[row * sizeof(ARMVectorReg) + col] += zn_row; > + } > + pb >>= 4; > + } while (++col & 15); > + } > + } > + pa >>= 4; > + } while (++row & 15); > + } > +} > + > +void HELPER(sme_addva_d)(void *vzda, void *vzn, void *vpn, > + void *vpm, uint32_t desc) > +{ > + intptr_t row, col, oprsz = simd_oprsz(desc) / 8; > + uint8_t *pn = vpn, *pm = vpm; > + uint64_t *zda = vzda, *zn = vzn; > + > + for (row = 0; row < oprsz; ++row) { > + if (pn[H1(row)] & 1) { > + uint64_t zn_row = zn[row]; > + for (col = 0; col < oprsz; ++col) { > + if (pm[H1(col)] & 1) { > + zda[row * sizeof(ARMVectorReg) + col] += zn_row; > + } > + } > + } > + } > +} -- PMM