As pcmpeqb is used by strlen et al., it is the highest-overhead SSE operation, at 2.5%. It is simple to include the other compares at the same time.
Signed-off-by: Richard Henderson <richard.hender...@linaro.org> --- target/i386/ops_sse.h | 8 -------- target/i386/ops_sse_header.h | 8 -------- target/i386/tcg/translate.c | 31 +++++++++++++++++++++++++------ 3 files changed, 25 insertions(+), 22 deletions(-) diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h index 535440f882..94440a9dc5 100644 --- a/target/i386/ops_sse.h +++ b/target/i386/ops_sse.h @@ -420,14 +420,6 @@ SSE_HELPER_Q(helper_pandn, FANDN) SSE_HELPER_Q(helper_por, FOR) SSE_HELPER_Q(helper_pxor, FXOR) -SSE_HELPER_B(helper_pcmpgtb, FCMPGTB) -SSE_HELPER_W(helper_pcmpgtw, FCMPGTW) -SSE_HELPER_L(helper_pcmpgtl, FCMPGTL) - -SSE_HELPER_B(helper_pcmpeqb, FCMPEQ) -SSE_HELPER_W(helper_pcmpeqw, FCMPEQ) -SSE_HELPER_L(helper_pcmpeql, FCMPEQ) - SSE_HELPER_W(helper_pmullw, FMULLW) #if SHIFT == 0 SSE_HELPER_W(helper_pmulhrw, FMULHRW) diff --git a/target/i386/ops_sse_header.h b/target/i386/ops_sse_header.h index cef28f2aae..b9f957daf8 100644 --- a/target/i386/ops_sse_header.h +++ b/target/i386/ops_sse_header.h @@ -91,14 +91,6 @@ SSE_HELPER_Q(pandn, FANDN) SSE_HELPER_Q(por, FOR) SSE_HELPER_Q(pxor, FXOR) -SSE_HELPER_B(pcmpgtb, FCMPGTB) -SSE_HELPER_W(pcmpgtw, FCMPGTW) -SSE_HELPER_L(pcmpgtl, FCMPGTL) - -SSE_HELPER_B(pcmpeqb, FCMPEQ) -SSE_HELPER_W(pcmpeqw, FCMPEQ) -SSE_HELPER_L(pcmpeql, FCMPEQ) - SSE_HELPER_W(pmullw, FMULLW) #if SHIFT == 0 SSE_HELPER_W(pmulhrw, FMULHRW) diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c index c1f1f6f66b..467d018b68 100644 --- a/target/i386/tcg/translate.c +++ b/target/i386/tcg/translate.c @@ -2847,9 +2847,9 @@ static const SSEFunc_0_epp sse_op_table1[256][4] = { [0x61] = MMX_OP2(punpcklwd), [0x62] = MMX_OP2(punpckldq), [0x63] = MMX_OP2(packsswb), - [0x64] = MMX_OP2(pcmpgtb), - [0x65] = MMX_OP2(pcmpgtw), - [0x66] = MMX_OP2(pcmpgtl), + [0x64] = { SSE_DUMMY, SSE_DUMMY }, /* pcmpgtb */ + [0x65] = { SSE_DUMMY, SSE_DUMMY }, /* pcmpgtw */ + [0x66] = { SSE_DUMMY, SSE_DUMMY }, /* pcmpgtl */ [0x67] = MMX_OP2(packuswb), 
[0x68] = MMX_OP2(punpckhbw), [0x69] = MMX_OP2(punpckhwd), @@ -2866,9 +2866,9 @@ static const SSEFunc_0_epp sse_op_table1[256][4] = { [0x71] = { SSE_SPECIAL, SSE_SPECIAL }, /* shiftw */ [0x72] = { SSE_SPECIAL, SSE_SPECIAL }, /* shiftd */ [0x73] = { SSE_SPECIAL, SSE_SPECIAL }, /* shiftq */ - [0x74] = MMX_OP2(pcmpeqb), - [0x75] = MMX_OP2(pcmpeqw), - [0x76] = MMX_OP2(pcmpeql), + [0x74] = { SSE_DUMMY, SSE_DUMMY }, /* pcmpeqb */ + [0x75] = { SSE_DUMMY, SSE_DUMMY }, /* pcmpeqw */ + [0x76] = { SSE_DUMMY, SSE_DUMMY }, /* pcmpeql */ [0x77] = { SSE_DUMMY }, /* emms */ [0x78] = { NULL, SSE_SPECIAL, NULL, SSE_SPECIAL }, /* extrq_i, insertq_i */ [0x79] = { NULL, gen_helper_extrq_r, NULL, gen_helper_insertq_r }, @@ -4415,6 +4415,9 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, return; } } else { + int vec_len = is_xmm ? 16 : 8; + int xmm_ofs = is_xmm ? offsetof(ZMMReg, ZMM_X(0)) : 0; + /* generic MMX or SSE operation */ switch(b) { case 0x70: /* pshufx insn */ @@ -4532,6 +4535,22 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, sse_fn_eppt = (SSEFunc_0_eppt)sse_fn_epp; sse_fn_eppt(cpu_env, s->ptr0, s->ptr1, s->A0); break; + case 0x64: /* pcmpgtb */ + case 0x65: /* pcmpgtw */ + case 0x66: /* pcmpgtl */ + op1_offset += xmm_ofs; + op2_offset += xmm_ofs; + tcg_gen_gvec_cmp(TCG_COND_GT, b - 0x64, op1_offset, op1_offset, + op2_offset, vec_len, vec_len); + break; + case 0x74: /* pcmpeqb */ + case 0x75: /* pcmpeqw */ + case 0x76: /* pcmpeql */ + op1_offset += xmm_ofs; + op2_offset += xmm_ofs; + tcg_gen_gvec_cmp(TCG_COND_EQ, b - 0x74, op1_offset, op1_offset, + op2_offset, vec_len, vec_len); + break; default: tcg_gen_addi_ptr(s->ptr0, cpu_env, op1_offset); tcg_gen_addi_ptr(s->ptr1, cpu_env, op2_offset); -- 2.34.1