Convert psubb to a TCG gvec operation, since it is the second-highest-overhead SSE operation, at 0.9%. It's simple to include add and the other sizes at the same time.
Signed-off-by: Richard Henderson <richard.hender...@linaro.org> --- target/i386/ops_sse.h | 10 --------- target/i386/ops_sse_header.h | 10 --------- target/i386/tcg/translate.c | 39 ++++++++++++++++++++++++++++-------- 3 files changed, 31 insertions(+), 28 deletions(-) diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h index 94440a9dc5..6f035b5c16 100644 --- a/target/i386/ops_sse.h +++ b/target/i386/ops_sse.h @@ -389,16 +389,6 @@ static inline int satsw(int x) #define FAVG(a, b) (((a) + (b) + 1) >> 1) #endif -SSE_HELPER_B(helper_paddb, FADD) -SSE_HELPER_W(helper_paddw, FADD) -SSE_HELPER_L(helper_paddl, FADD) -SSE_HELPER_Q(helper_paddq, FADD) - -SSE_HELPER_B(helper_psubb, FSUB) -SSE_HELPER_W(helper_psubw, FSUB) -SSE_HELPER_L(helper_psubl, FSUB) -SSE_HELPER_Q(helper_psubq, FSUB) - SSE_HELPER_B(helper_paddusb, FADDUB) SSE_HELPER_B(helper_paddsb, FADDSB) SSE_HELPER_B(helper_psubusb, FSUBUB) diff --git a/target/i386/ops_sse_header.h b/target/i386/ops_sse_header.h index b9f957daf8..da630fbc40 100644 --- a/target/i386/ops_sse_header.h +++ b/target/i386/ops_sse_header.h @@ -60,16 +60,6 @@ DEF_HELPER_3(glue(pslldq, SUFFIX), void, env, Reg, Reg) #define SSE_HELPER_Q(name, F)\ DEF_HELPER_3(glue(name, SUFFIX), void, env, Reg, Reg) -SSE_HELPER_B(paddb, FADD) -SSE_HELPER_W(paddw, FADD) -SSE_HELPER_L(paddl, FADD) -SSE_HELPER_Q(paddq, FADD) - -SSE_HELPER_B(psubb, FSUB) -SSE_HELPER_W(psubw, FSUB) -SSE_HELPER_L(psubl, FSUB) -SSE_HELPER_Q(psubq, FSUB) - SSE_HELPER_B(paddusb, FADDUB) SSE_HELPER_B(paddsb, FADDSB) SSE_HELPER_B(psubusb, FSUBUB) diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c index 467d018b68..2a8ea3369a 100644 --- a/target/i386/tcg/translate.c +++ b/target/i386/tcg/translate.c @@ -2882,7 +2882,7 @@ static const SSEFunc_0_epp sse_op_table1[256][4] = { [0xd1] = MMX_OP2(psrlw), [0xd2] = MMX_OP2(psrld), [0xd3] = MMX_OP2(psrlq), - [0xd4] = MMX_OP2(paddq), + [0xd4] = { SSE_DUMMY, SSE_DUMMY }, /* paddq */ [0xd5] = MMX_OP2(pmullw), [0xd6] = { NULL, 
SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, [0xd7] = { SSE_SPECIAL, SSE_SPECIAL }, /* pmovmskb */ @@ -2919,13 +2919,13 @@ static const SSEFunc_0_epp sse_op_table1[256][4] = { [0xf6] = MMX_OP2(psadbw), [0xf7] = { (SSEFunc_0_epp)gen_helper_maskmov_mmx, (SSEFunc_0_epp)gen_helper_maskmov_xmm }, /* XXX: casts */ - [0xf8] = MMX_OP2(psubb), - [0xf9] = MMX_OP2(psubw), - [0xfa] = MMX_OP2(psubl), - [0xfb] = MMX_OP2(psubq), - [0xfc] = MMX_OP2(paddb), - [0xfd] = MMX_OP2(paddw), - [0xfe] = MMX_OP2(paddl), + [0xf8] = { SSE_DUMMY, SSE_DUMMY }, /* psubb */ + [0xf9] = { SSE_DUMMY, SSE_DUMMY }, /* psubw */ + [0xfa] = { SSE_DUMMY, SSE_DUMMY }, /* psubl */ + [0xfb] = { SSE_DUMMY, SSE_DUMMY }, /* psubq */ + [0xfc] = { SSE_DUMMY, SSE_DUMMY }, /* paddb */ + [0xfd] = { SSE_DUMMY, SSE_DUMMY }, /* paddw */ + [0xfe] = { SSE_DUMMY, SSE_DUMMY }, /* paddl */ }; static const SSEFunc_0_epp sse_op_table2[3 * 8][2] = { @@ -4551,6 +4551,29 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, tcg_gen_gvec_cmp(TCG_COND_EQ, b - 0x74, op1_offset, op1_offset, op2_offset, vec_len, vec_len); break; + case 0xf8: /* psubb */ + case 0xf9: /* psubw */ + case 0xfa: /* psubl */ + case 0xfb: /* psubq */ + op1_offset += xmm_ofs; + op2_offset += xmm_ofs; + tcg_gen_gvec_sub(b - 0xf8, op1_offset, op1_offset, + op2_offset, vec_len, vec_len); + break; + case 0xfc: /* paddb */ + case 0xfd: /* paddw */ + case 0xfe: /* paddl */ + op1_offset += xmm_ofs; + op2_offset += xmm_ofs; + tcg_gen_gvec_add(b - 0xfc, op1_offset, op1_offset, + op2_offset, vec_len, vec_len); + break; + case 0xd4: /* paddq */ + op1_offset += xmm_ofs; + op2_offset += xmm_ofs; + tcg_gen_gvec_add(MO_64, op1_offset, op1_offset, + op2_offset, vec_len, vec_len); + break; default: tcg_gen_addi_ptr(s->ptr0, cpu_env, op1_offset); tcg_gen_addi_ptr(s->ptr1, cpu_env, op2_offset); -- 2.34.1