Make the dpps and dppd helpers AVX-ready

I can't see any obvious reason why dppd shouldn't work on 256-bit ymm
registers, but both AMD and Intel agree that it is xmm-only.
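Purely for illustration (not part of the patch): a minimal sketch of the
lane-wise behaviour the 256-bit form of dpps is expected to have.  The names
dpps_lane and vdpps_ymm are hypothetical, plain float arithmetic stands in
for the softfloat calls, and the (A+B)+(C+D) evaluation order the real helper
keeps for correct intermediate rounding is ignored here.

#include <stdint.h>

/*
 * One 128-bit lane of dpps: bits 4-7 of the immediate select which element
 * products enter the sum, bits 0-3 select which destination elements
 * receive it (the rest are zeroed).
 */
static void dpps_lane(float *d, const float *v, const float *s, uint32_t mask)
{
    float sum = 0.0f;
    int i;

    for (i = 0; i < 4; i++) {
        if (mask & (1 << (4 + i))) {
            sum += v[i] * s[i];
        }
    }
    for (i = 0; i < 4; i++) {
        d[i] = (mask & (1 << i)) ? sum : 0.0f;
    }
}

/*
 * The ymm form repeats the operation on the upper lane with the same
 * immediate, which is what the SHIFT == 2 block in the diff below does
 * with elements 4-7.  dppd has no ymm form, so its helper stays xmm-only.
 */
static void vdpps_ymm(float d[8], const float v[8], const float s[8],
                      uint32_t mask)
{
    dpps_lane(d, v, s, mask);             /* low lane  */
    dpps_lane(d + 4, v + 4, s + 4, mask); /* high lane */
}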
Signed-off-by: Paul Brook <p...@nowt.org>
---
 target/i386/ops_sse.h | 54 ++++++++++++++++++++++++++++++++++++-------
 1 file changed, 46 insertions(+), 8 deletions(-)

diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h
index d308a1ec40..4137e6e1fa 100644
--- a/target/i386/ops_sse.h
+++ b/target/i386/ops_sse.h
@@ -2366,8 +2366,10 @@ SSE_HELPER_I(helper_blendps, L, 4, FBLENDP)
 SSE_HELPER_I(helper_blendpd, Q, 2, FBLENDP)
 SSE_HELPER_I(helper_pblendw, W, 8, FBLENDP)
 
-void glue(helper_dpps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, uint32_t mask)
+void glue(helper_dpps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
+                               uint32_t mask)
 {
+    Reg *v = d;
     float32 prod, iresult, iresult2;
 
     /*
@@ -2375,23 +2377,23 @@ void glue(helper_dpps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, uint32_t mask)
      * to correctly round the intermediate results
      */
     if (mask & (1 << 4)) {
-        iresult = float32_mul(d->ZMM_S(0), s->ZMM_S(0), &env->sse_status);
+        iresult = float32_mul(v->ZMM_S(0), s->ZMM_S(0), &env->sse_status);
     } else {
         iresult = float32_zero;
     }
     if (mask & (1 << 5)) {
-        prod = float32_mul(d->ZMM_S(1), s->ZMM_S(1), &env->sse_status);
+        prod = float32_mul(v->ZMM_S(1), s->ZMM_S(1), &env->sse_status);
     } else {
         prod = float32_zero;
     }
     iresult = float32_add(iresult, prod, &env->sse_status);
     if (mask & (1 << 6)) {
-        iresult2 = float32_mul(d->ZMM_S(2), s->ZMM_S(2), &env->sse_status);
+        iresult2 = float32_mul(v->ZMM_S(2), s->ZMM_S(2), &env->sse_status);
     } else {
         iresult2 = float32_zero;
     }
     if (mask & (1 << 7)) {
-        prod = float32_mul(d->ZMM_S(3), s->ZMM_S(3), &env->sse_status);
+        prod = float32_mul(v->ZMM_S(3), s->ZMM_S(3), &env->sse_status);
     } else {
         prod = float32_zero;
     }
@@ -2402,26 +2404,62 @@ void glue(helper_dpps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, uint32_t mask)
     d->ZMM_S(1) = (mask & (1 << 1)) ? iresult : float32_zero;
     d->ZMM_S(2) = (mask & (1 << 2)) ? iresult : float32_zero;
     d->ZMM_S(3) = (mask & (1 << 3)) ? iresult : float32_zero;
+#if SHIFT == 2
+    if (mask & (1 << 4)) {
+        iresult = float32_mul(v->ZMM_S(4), s->ZMM_S(4), &env->sse_status);
+    } else {
+        iresult = float32_zero;
+    }
+    if (mask & (1 << 5)) {
+        prod = float32_mul(v->ZMM_S(5), s->ZMM_S(5), &env->sse_status);
+    } else {
+        prod = float32_zero;
+    }
+    iresult = float32_add(iresult, prod, &env->sse_status);
+    if (mask & (1 << 6)) {
+        iresult2 = float32_mul(v->ZMM_S(6), s->ZMM_S(6), &env->sse_status);
+    } else {
+        iresult2 = float32_zero;
+    }
+    if (mask & (1 << 7)) {
+        prod = float32_mul(v->ZMM_S(7), s->ZMM_S(7), &env->sse_status);
+    } else {
+        prod = float32_zero;
+    }
+    iresult2 = float32_add(iresult2, prod, &env->sse_status);
+    iresult = float32_add(iresult, iresult2, &env->sse_status);
+
+    d->ZMM_S(4) = (mask & (1 << 0)) ? iresult : float32_zero;
+    d->ZMM_S(5) = (mask & (1 << 1)) ? iresult : float32_zero;
+    d->ZMM_S(6) = (mask & (1 << 2)) ? iresult : float32_zero;
+    d->ZMM_S(7) = (mask & (1 << 3)) ? iresult : float32_zero;
+#endif
 }
 
-void glue(helper_dppd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, uint32_t mask)
+#if SHIFT == 1
+/* Oddly, there is no ymm version of dppd */
+void glue(helper_dppd, SUFFIX)(CPUX86State *env,
+                               Reg *d, Reg *s, uint32_t mask)
 {
+    Reg *v = d;
     float64 iresult;
 
     if (mask & (1 << 4)) {
-        iresult = float64_mul(d->ZMM_D(0), s->ZMM_D(0), &env->sse_status);
+        iresult = float64_mul(v->ZMM_D(0), s->ZMM_D(0), &env->sse_status);
     } else {
         iresult = float64_zero;
     }
+
     if (mask & (1 << 5)) {
         iresult = float64_add(iresult,
-                              float64_mul(d->ZMM_D(1), s->ZMM_D(1),
+                              float64_mul(v->ZMM_D(1), s->ZMM_D(1),
                                           &env->sse_status),
                               &env->sse_status);
     }
     d->ZMM_D(0) = (mask & (1 << 0)) ? iresult : float64_zero;
     d->ZMM_D(1) = (mask & (1 << 1)) ? iresult : float64_zero;
 }
+#endif
 
 void glue(helper_mpsadbw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
                                   uint32_t offset)
-- 
2.36.0