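Now that ENV has two source slots (CC_SRC and CC_SRC2), store two of the three ADC/SBB inputs there (one operand and the carry-in), alongside the result in CC_DST. This lets us do less work when the carry-out is not needed, and avoids leaving CC_OP unpredictable (CC_OP_DYNAMIC) after translating these instructions.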
Signed-off-by: Richard Henderson <r...@twiddle.net> --- target-i386/cc_helper_template.h | 44 +++++++++++++++++------------- target-i386/cpu.h | 7 ++--- target-i386/translate.c | 58 ++++++++++------------------------------ 3 files changed, 44 insertions(+), 65 deletions(-) diff --git a/target-i386/cc_helper_template.h b/target-i386/cc_helper_template.h index 951ceaf..fcb14db 100644 --- a/target-i386/cc_helper_template.h +++ b/target-i386/cc_helper_template.h @@ -61,16 +61,19 @@ static int glue(compute_all_add, SUFFIX)(CPUX86State *env) static int glue(compute_all_adc, SUFFIX)(CPUX86State *env) { int cf, pf, af, zf, sf, of; - target_long src1, src2; + DATA_TYPE dst, src1, src2, src3; + dst = CC_DST; src1 = CC_SRC; - src2 = CC_DST - CC_SRC - 1; - cf = (DATA_TYPE)CC_DST <= (DATA_TYPE)src1; - pf = parity_table[(uint8_t)CC_DST]; - af = (CC_DST ^ src1 ^ src2) & 0x10; - zf = ((DATA_TYPE)CC_DST == 0) << 6; - sf = lshift(CC_DST, 8 - DATA_BITS) & 0x80; - of = lshift((src1 ^ src2 ^ -1) & (src1 ^ CC_DST), 12 - DATA_BITS) & CC_O; + src3 = CC_SRC2; /* carry-in: always 0/1. */ + src2 = dst - src1; + + cf = dst < src1 || dst < src3; + pf = parity_table[(uint8_t)dst]; + af = (dst ^ src1 ^ src2) & 0x10; + zf = (dst == 0) << 6; + sf = lshift(dst, 8 - DATA_BITS) & 0x80; + of = lshift((src1 ^ src2 ^ -1) & (src1 ^ dst), 12 - DATA_BITS) & CC_O; return cf | pf | af | zf | sf | of; } @@ -93,16 +96,21 @@ static int glue(compute_all_sub, SUFFIX)(CPUX86State *env) static int glue(compute_all_sbb, SUFFIX)(CPUX86State *env) { int cf, pf, af, zf, sf, of; - target_long src1, src2; - - src1 = CC_DST + CC_SRC + 1; - src2 = CC_SRC; - cf = (DATA_TYPE)src1 <= (DATA_TYPE)src2; - pf = parity_table[(uint8_t)CC_DST]; - af = (CC_DST ^ src1 ^ src2) & 0x10; - zf = ((DATA_TYPE)CC_DST == 0) << 6; - sf = lshift(CC_DST, 8 - DATA_BITS) & 0x80; - of = lshift((src1 ^ src2) & (src1 ^ CC_DST), 12 - DATA_BITS) & CC_O; + DATA_TYPE dst, src1, src2, src3; + + dst = CC_DST; + src3 = CC_SRC2; /* borrow-in: always 0/1. 
*/ + src2 = CC_SRC + src3; + src1 = dst + src2; + + /* If src2 + src3 overflows, then we're logically subtracting a larger + value than src1 could have held, and thus we must have borrow out. */ + cf = src2 < src3 || src1 < src2; + pf = parity_table[(uint8_t)dst]; + af = (dst ^ src1 ^ src2) & 0x10; + zf = (dst == 0) << 6; + sf = lshift(dst, 8 - DATA_BITS) & 0x80; + of = lshift((src1 ^ src2) & (src1 ^ dst), 12 - DATA_BITS) & CC_O; return cf | pf | af | zf | sf | of; } diff --git a/target-i386/cpu.h b/target-i386/cpu.h index 868627e..fa34ff2 100644 --- a/target-i386/cpu.h +++ b/target-i386/cpu.h @@ -1119,9 +1119,10 @@ static inline int cpu_mmu_index (CPUX86State *env) #define EIP (env->eip) #define DF (env->df) -#define CC_SRC (env->cc_src) -#define CC_DST (env->cc_dst) -#define CC_OP (env->cc_op) +#define CC_DST (env->cc_dst) +#define CC_SRC (env->cc_src) +#define CC_SRC2 (env->cc_src2) +#define CC_OP (env->cc_op) /* n must be a constant to be efficient */ static inline target_long lshift(target_long x, int n) diff --git a/target-i386/translate.c b/target-i386/translate.c index aaee393..77d86b0 100644 --- a/target-i386/translate.c +++ b/target-i386/translate.c @@ -196,9 +196,9 @@ static const uint8_t cc_op_live[CC_OP_NB] = { [CC_OP_EFLAGS] = USES_CC_SRC, [CC_OP_MULB ... CC_OP_MULQ] = USES_CC_DST | USES_CC_SRC, [CC_OP_ADDB ... CC_OP_ADDQ] = USES_CC_DST | USES_CC_SRC, - [CC_OP_ADCB ... CC_OP_ADCQ] = USES_CC_DST | USES_CC_SRC, + [CC_OP_ADCB ... CC_OP_ADCQ] = USES_CC_DST | USES_CC_SRC | USES_CC_SRC2, [CC_OP_SUBB ... CC_OP_SUBQ] = USES_CC_DST | USES_CC_SRC | USES_CC_SRC2, - [CC_OP_SBBB ... CC_OP_SBBQ] = USES_CC_DST | USES_CC_SRC, + [CC_OP_SBBB ... CC_OP_SBBQ] = USES_CC_DST | USES_CC_SRC | USES_CC_SRC2, [CC_OP_LOGICB ... CC_OP_LOGICQ] = USES_CC_DST, [CC_OP_INCB ... CC_OP_INCQ] = USES_CC_DST | USES_CC_SRC, [CC_OP_DECB ... 
CC_OP_DECQ] = USES_CC_DST | USES_CC_SRC, @@ -876,6 +876,13 @@ static void gen_op_update2_cc(void) tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]); } +static void gen_op_update3_cc(TCGv reg) +{ + tcg_gen_mov_tl(cpu_cc_src2, reg); + tcg_gen_mov_tl(cpu_cc_src, cpu_T[1]); + tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]); +} + static inline void gen_op_testl_T0_T1_cc(void) { tcg_gen_and_tl(cpu_cc_dst, cpu_T[0], cpu_T[1]); @@ -936,30 +943,6 @@ static CCPrepare gen_prepare_eflags_c(DisasContext *s, TCGv reg) return (CCPrepare) { .cond = TCG_COND_LTU, .reg = t0, .reg2 = t1, .mask = -1, .use_reg2 = true }; - case CC_OP_SBBB ... CC_OP_SBBQ: - /* (DATA_TYPE)(CC_DST + CC_SRC + 1) <= (DATA_TYPE)CC_SRC */ - size = s->cc_op - CC_OP_SBBB; - t1 = gen_ext_tl(cpu_tmp0, cpu_cc_src, size, false); - if (TCGV_EQUAL(t1, reg) && TCGV_EQUAL(reg, cpu_cc_src)) { - tcg_gen_mov_tl(cpu_tmp0, cpu_cc_src); - t1 = cpu_tmp0; - } - - tcg_gen_add_tl(reg, cpu_cc_dst, cpu_cc_src); - tcg_gen_addi_tl(reg, reg, 1); - gen_extu(size, reg); - t0 = reg; - goto adc_sbb; - - case CC_OP_ADCB ... CC_OP_ADCQ: - /* (DATA_TYPE)CC_DST <= (DATA_TYPE)CC_SRC */ - size = s->cc_op - CC_OP_ADCB; - t1 = gen_ext_tl(cpu_tmp0, cpu_cc_src, size, false); - t0 = gen_ext_tl(reg, cpu_cc_dst, size, false); - adc_sbb: - return (CCPrepare) { .cond = TCG_COND_LEU, .reg = t0, - .reg2 = t1, .mask = -1, .use_reg2 = true }; - case CC_OP_LOGICB ... 
CC_OP_LOGICQ: return (CCPrepare) { .cond = TCG_COND_NEVER, .mask = -1 }; @@ -1421,18 +1404,10 @@ static void gen_op(DisasContext *s1, int op, int ot, int d) gen_op_mov_reg_T0(ot, d); else gen_op_st_T0_A0(ot + s1->mem_index); - tcg_gen_mov_tl(cpu_cc_src, cpu_T[1]); - tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]); - tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_tmp4); - tcg_gen_shli_i32(cpu_tmp2_i32, cpu_tmp2_i32, 2); - tcg_gen_addi_i32(cpu_cc_op, cpu_tmp2_i32, CC_OP_ADDB + ot); - set_cc_op(s1, CC_OP_DYNAMIC); + gen_op_update3_cc(cpu_tmp4); + set_cc_op(s1, CC_OP_ADCB + ot); break; case OP_SBBL: - /* - * No need to store cpu_cc_src2, because it is used only - * when the cc_op is known. - */ gen_compute_eflags_c(s1, cpu_tmp4); tcg_gen_sub_tl(cpu_T[0], cpu_T[0], cpu_T[1]); tcg_gen_sub_tl(cpu_T[0], cpu_T[0], cpu_tmp4); @@ -1440,12 +1415,8 @@ static void gen_op(DisasContext *s1, int op, int ot, int d) gen_op_mov_reg_T0(ot, d); else gen_op_st_T0_A0(ot + s1->mem_index); - tcg_gen_mov_tl(cpu_cc_src, cpu_T[1]); - tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]); - tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_tmp4); - tcg_gen_shli_i32(cpu_tmp2_i32, cpu_tmp2_i32, 2); - tcg_gen_addi_i32(cpu_cc_op, cpu_tmp2_i32, CC_OP_SUBB + ot); - set_cc_op(s1, CC_OP_DYNAMIC); + gen_op_update3_cc(cpu_tmp4); + set_cc_op(s1, CC_OP_SBBB + ot); break; case OP_ADDL: gen_op_addl_T0_T1(); @@ -1463,8 +1434,7 @@ static void gen_op(DisasContext *s1, int op, int ot, int d) gen_op_mov_reg_T0(ot, d); else gen_op_st_T0_A0(ot + s1->mem_index); - gen_op_update2_cc(); - tcg_gen_mov_tl(cpu_cc_src2, cpu_tmp0); + gen_op_update3_cc(cpu_tmp0); set_cc_op(s1, CC_OP_SUBB + ot); break; default: -- 1.7.11.7