An initial cut at conditional moves for the i386 backend: implement the
setcond_i32, setcond2_i32 and movcond_i32 opcodes, using cmov when the
compiler guarantees it is available and a compare plus short forward
branch otherwise.

Signed-off-by: Richard Henderson <r...@twiddle.net>
---
 elf.h                 |    2 +
 tcg/i386/tcg-target.c |  280 ++++++++++++++++++++++++++++++++++++++++---------
 2 files changed, 233 insertions(+), 49 deletions(-)
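Not part of the patch, just a reviewer note: the rewritten tcg_out_brcond2
(and tcg_out_setcond2 built on top of it) reduces every double-word
comparison to three single-word tests -- an optional branch on the high
parts (c1), an optional "skip if the high parts differ" test (c2), and an
unsigned test on the low parts (c3).  The standalone C model below, with
made-up names, sketches that decomposition for the TCG_COND_LT case; it is
an illustration of the intended logic, not QEMU code.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* 64-bit signed "a < b" on a 32-bit host, split the way brcond2 does it.  */
static bool model_brcond2_lt(uint32_t al, uint32_t ah,
                             uint32_t bl, uint32_t bh)
{
    /* c1: the high parts alone can prove the condition true.  */
    if ((int32_t)ah < (int32_t)bh) {
        return true;
    }
    /* c2 = NE: unequal high parts that failed c1 prove it false.  */
    if (ah != bh) {
        return false;
    }
    /* c3: equal high parts -- the low parts decide, always unsigned.  */
    return al < bl;
}

int main(void)
{
    /* -1 < 0 as 64-bit values, although the low words compare the other way.  */
    printf("%d\n", model_brcond2_lt(0xffffffffu, 0xffffffffu, 0u, 0u));
    return 0;
}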
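Also not part of the patch: have_cmov() below returns 0 unless the compiler
guarantees cmov via __i686__.  If the "??? Use cpuid" comment is ever
addressed, a run-time check could look roughly like the sketch below,
assuming GCC's <cpuid.h> (__get_cpuid, bit_CMOV); the function name is made
up for illustration.

#include <cpuid.h>
#include <stdio.h>

static int detect_cmov_at_runtime(void)
{
    unsigned int eax, ebx, ecx, edx;

    /* CPUID leaf 1: the CMOV feature flag is EDX bit 15 (bit_CMOV).  */
    if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {
        return (edx & bit_CMOV) != 0;
    }
    return 0;
}

int main(void)
{
    printf("cmov: %d\n", detect_cmov_at_runtime());
    return 0;
}

The result would presumably be computed once at startup and cached, rather
than executing cpuid for every translated conditional move.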
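A last illustrative note, again not part of the patch: the non-cmov path in
tcg_out_movcond() stores VF into D unconditionally and then emits a short
forward branch that skips the "D = VT" move, which is why the condition is
inverted before the jcc.  A minimal C model of that shape (names are
hypothetical):

#include <stdint.h>
#include <stdio.h>

static uint32_t model_movcond(int cond_holds, uint32_t vt, uint32_t vf)
{
    uint32_t d = vf;          /* 1: d = VF, emitted before the branch     */
    int inv = !cond_holds;    /* 2: tcg_invert_cond(cond)                 */
    if (!inv) {               /* 3: jcc on the inverted condition jumps   */
        d = vt;               /*    past this move when cond is false     */
    }
    return d;
}

int main(void)
{
    /* Prints "10 20": vt is chosen when the condition holds, vf otherwise.  */
    printf("%u %u\n", model_movcond(1, 10, 20), model_movcond(0, 10, 20));
    return 0;
}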
diff --git a/elf.h b/elf.h
index 11674d7..c84c8ab 100644
--- a/elf.h
+++ b/elf.h
@@ -243,6 +243,8 @@ typedef struct {
 #define R_386_GOTOFF    9
 #define R_386_GOTPC     10
 #define R_386_NUM       11
+/* Not a dynamic reloc, so not included in R_386_NUM.  Used in TCG.  */
+#define R_386_PC8       23
 
 #define R_MIPS_NONE     0
 #define R_MIPS_16       1
diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
index 972b102..90dbbe9 100644
--- a/tcg/i386/tcg-target.c
+++ b/tcg/i386/tcg-target.c
@@ -61,6 +61,9 @@ static void patch_reloc(uint8_t *code_ptr, int type,
     case R_386_PC32:
         *(uint32_t *)code_ptr = value - (long)code_ptr;
         break;
+    case R_386_PC8:
+        *(uint8_t *)code_ptr = value - (long)code_ptr;
+        break;
     default:
         tcg_abort();
     }
@@ -305,7 +308,8 @@ static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
         tgen_arithi(s, ARITH_ADD, reg, val, 0);
 }
 
-static void tcg_out_jxx(TCGContext *s, int opc, int label_index)
+/* Use SMALL != 0 to force a short forward branch.  */
+static void tcg_out_jxx(TCGContext *s, int opc, int label_index, int small)
 {
     int32_t val, val1;
     TCGLabel *l = &s->labels[label_index];
@@ -320,6 +324,7 @@ static void tcg_out_jxx(TCGContext *s, int opc, int label_index)
             tcg_out8(s, 0x70 + opc);
             tcg_out8(s, val1);
         } else {
+            assert (!small);
             if (opc == -1) {
                 tcg_out8(s, 0xe9);
                 tcg_out32(s, val - 5);
@@ -329,6 +334,15 @@ static void tcg_out_jxx(TCGContext *s, int opc, int label_index)
                 tcg_out32(s, val - 6);
             }
         }
+    } else if (small) {
+        if (opc == -1) {
+            tcg_out8(s, 0xeb);
+        } else {
+            tcg_out8(s, 0x0f);
+            tcg_out8(s, 0x70 + opc);
+        }
+        tcg_out_reloc(s, s->code_ptr, R_386_PC8, label_index, -1);
+        s->code_ptr += 1;
     } else {
         if (opc == -1) {
             tcg_out8(s, 0xe9);
@@ -341,9 +355,8 @@ static void tcg_out_jxx(TCGContext *s, int opc, int label_index)
     }
 }
 
-static void tcg_out_brcond(TCGContext *s, int cond,
-                           TCGArg arg1, TCGArg arg2, int const_arg2,
-                           int label_index)
+static void tcg_out_cond(TCGContext *s, int cond,
+                         TCGArg arg1, TCGArg arg2, int const_arg2)
 {
     if (const_arg2) {
         if (arg2 == 0) {
@@ -355,71 +368,225 @@ static void tcg_out_brcond(TCGContext *s, int cond,
     } else {
         tcg_out_modrm(s, 0x01 | (ARITH_CMP << 3), arg2, arg1);
     }
-    tcg_out_jxx(s, tcg_cond_to_jcc[cond], label_index);
+}
+
+static void tcg_out_brcond(TCGContext *s, int cond,
+                           TCGArg arg1, TCGArg arg2, int const_arg2,
+                           int label_index, int small)
+{
+    tcg_out_cond(s, cond, arg1, arg2, const_arg2);
+    tcg_out_jxx(s, tcg_cond_to_jcc[cond], label_index, small);
 }
 
 /* XXX: we implement it at the target level to avoid having to
    handle cross basic blocks temporaries */
-static void tcg_out_brcond2(TCGContext *s,
-                            const TCGArg *args, const int *const_args)
+static void tcg_out_brcond2(TCGContext *s, const TCGArg *args,
+                            const int *const_args, int small)
 {
-    int label_next;
-    label_next = gen_new_label();
-    switch(args[4]) {
+    int label_next = gen_new_label();
+    int label_dest = args[5];
+    int cond = args[4], c1, c2, c3;
+
+    switch (cond) {
     case TCG_COND_EQ:
-        tcg_out_brcond(s, TCG_COND_NE, args[0], args[2], const_args[2], label_next);
-        tcg_out_brcond(s, TCG_COND_EQ, args[1], args[3], const_args[3], args[5]);
+        c1 = -1, c2 = TCG_COND_NE, c3 = TCG_COND_EQ;
         break;
     case TCG_COND_NE:
-        tcg_out_brcond(s, TCG_COND_NE, args[0], args[2], const_args[2], args[5]);
-        tcg_out_brcond(s, TCG_COND_NE, args[1], args[3], const_args[3], args[5]);
+        c1 = TCG_COND_NE, c2 = -1, c3 = TCG_COND_NE;
         break;
     case TCG_COND_LT:
-        tcg_out_brcond(s, TCG_COND_LT, args[1], args[3], const_args[3], args[5]);
-        tcg_out_jxx(s, JCC_JNE, label_next);
-        tcg_out_brcond(s, TCG_COND_LTU, args[0], args[2], const_args[2], args[5]);
-        break;
-    case TCG_COND_LE:
-        tcg_out_brcond(s, TCG_COND_LT, args[1], args[3], const_args[3], args[5]);
-        tcg_out_jxx(s, JCC_JNE, label_next);
-        tcg_out_brcond(s, TCG_COND_LEU, args[0], args[2], const_args[2], args[5]);
-        break;
-    case TCG_COND_GT:
-        tcg_out_brcond(s, TCG_COND_GT, args[1], args[3], const_args[3], args[5]);
-        tcg_out_jxx(s, JCC_JNE, label_next);
-        tcg_out_brcond(s, TCG_COND_GTU, args[0], args[2], const_args[2], args[5]);
-        break;
-    case TCG_COND_GE:
-        tcg_out_brcond(s, TCG_COND_GT, args[1], args[3], const_args[3], args[5]);
-        tcg_out_jxx(s, JCC_JNE, label_next);
-        tcg_out_brcond(s, TCG_COND_GEU, args[0], args[2], const_args[2], args[5]);
-        break;
     case TCG_COND_LTU:
-        tcg_out_brcond(s, TCG_COND_LTU, args[1], args[3], const_args[3], args[5]);
-        tcg_out_jxx(s, JCC_JNE, label_next);
-        tcg_out_brcond(s, TCG_COND_LTU, args[0], args[2], const_args[2], args[5]);
+        c1 = cond, c2 = TCG_COND_NE, c3 = TCG_COND_LTU;
         break;
+    case TCG_COND_LE:
     case TCG_COND_LEU:
-        tcg_out_brcond(s, TCG_COND_LTU, args[1], args[3], const_args[3], args[5]);
-        tcg_out_jxx(s, JCC_JNE, label_next);
-        tcg_out_brcond(s, TCG_COND_LEU, args[0], args[2], const_args[2], args[5]);
+        c1 = cond, c2 = TCG_COND_NE, c3 = TCG_COND_LEU;
         break;
+    case TCG_COND_GT:
     case TCG_COND_GTU:
-        tcg_out_brcond(s, TCG_COND_GTU, args[1], args[3], const_args[3], args[5]);
-        tcg_out_jxx(s, JCC_JNE, label_next);
-        tcg_out_brcond(s, TCG_COND_GTU, args[0], args[2], const_args[2], args[5]);
+        c1 = cond, c2 = TCG_COND_NE, c3 = TCG_COND_GTU;
         break;
+    case TCG_COND_GE:
     case TCG_COND_GEU:
-        tcg_out_brcond(s, TCG_COND_GTU, args[1], args[3], const_args[3], args[5]);
-        tcg_out_jxx(s, JCC_JNE, label_next);
-        tcg_out_brcond(s, TCG_COND_GEU, args[0], args[2], const_args[2], args[5]);
+        c1 = cond, c2 = TCG_COND_NE, c3 = TCG_COND_GEU;
         break;
     default:
-        tcg_abort();
+        tcg_abort ();
+    }
+
+    tcg_out_cond(s, cond, args[1], args[3], const_args[3]);
+    if (c1 != -1) {
+        tcg_out_jxx(s, tcg_cond_to_jcc[c1], label_dest, small);
+    }
+    if (c2 != -1) {
+        tcg_out_jxx(s, tcg_cond_to_jcc[c2], label_next, 1);
     }
+    tcg_out_brcond(s, c3, args[0], args[2], const_args[2], label_dest, small);
+
     tcg_out_label(s, label_next, (tcg_target_long)s->code_ptr);
 }
 
+static void tcg_out_setcond(TCGContext *s, int cond, TCGArg arg0,
+                            TCGArg arg1, TCGArg arg2, int const_arg2)
+{
+    int use_xor = (arg0 != arg1 && (const_arg2 || arg0 != arg2));
+
+    if (use_xor)
+        tcg_out_movi(s, TCG_TYPE_I32, arg0, 0);
+    tcg_out_cond(s, cond, arg1, arg2, const_arg2);
+    tcg_out_modrm(s, 0x90 | tcg_cond_to_jcc[cond] | P_EXT, 0, arg0);
+    if (!use_xor)
+        tgen_arithi(s, ARITH_AND, arg0, 0xff, 0);
+}
+
+static void tcg_out_setcond2(TCGContext *s, const TCGArg *args,
+                             const int *const_args)
+{
+    int overlapl, overlaph;
+    TCGArg new_args[6];
+    int label_true, label_over;
+
+    overlapl = (args[0] == args[1] || (!const_args[3] && args[0] == args[3]));
+    overlaph = (args[0] == args[2] || (!const_args[4] && args[0] == args[4]));
+    memcpy(new_args, args+1, 5*sizeof(TCGArg));
+
+    if (!overlapl && !overlaph) {
+        /* ??? For EQ and NE, and output register in 'q', we could
+           implement this as cmp lows; setb %al; cmp highs; setb %ah;
+           andb %ah, %al; movzbl %al, %eax.  It's not clear it's worth
+           it though.  */
+
+        /* When possible, clear the destination first and increment in
+           the true case.  This results in smaller code than the
+           general case below.  */
+        tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
+
+        label_over = gen_new_label();
+        new_args[5] = label_over;
+        tcg_out_brcond2(s, new_args, const_args+1, 1);
+
+        tgen_arithi(s, ARITH_ADD, args[0], 1, 0);
+        tcg_out_label(s, label_over, (tcg_target_long)s->code_ptr);
+    } else {
+        label_true = gen_new_label();
+        label_over = gen_new_label();
+
+        new_args[5] = label_true;
+        tcg_out_brcond2(s, new_args, const_args+1, 1);
+
+        tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
+        tcg_out_jxx(s, JCC_JMP, label_over, 1);
+        tcg_out_label(s, label_true, (tcg_target_long)s->code_ptr);
+
+        tcg_out_movi(s, TCG_TYPE_I32, args[0], 1);
+        tcg_out_label(s, label_over, (tcg_target_long)s->code_ptr);
+    }
+}
+
+static inline int have_cmov(void)
+{
+#ifdef __i686__
+    /* Compiler options say that cmov is available.  */
+    return 1;
+#else
+    /* ??? Use cpuid or something and figure out what's running.  */
+    return 0;
+#endif
+}
+
+static void tcg_out_movcond(TCGContext *s, const TCGArg *args,
+                            const int *const_args)
+{
+    int vtc, vfc, cond, use_cmov = 0, do_swap = 0;
+    TCGArg d, vt, vf;
+
+    d = args[0];
+    vt = args[3];
+    vf = args[4];
+    vtc = const_args[3];
+    vfc = const_args[4];
+
+    /* ??? The jcc code path below assumes that one mov insn must be skipped.
+       Rather than complicate the code below, make sure to simplify the
+       conditional move here.  */
+    if (vtc == vfc && vt == vf) {
+        if (vtc)
+            tcg_out_movi(s, TCG_TYPE_I32, d, vt);
+        else
+            tcg_out_mov(s, d, vt);
+        return;
+    }
+
+    cond = args[5];
+
+    /* If both arguments are constants, we *could* do all the funny bits that
+       gcc does with sbc, masks, etc.  There's likely no point.  Just use the
+       jcc version in this case.  We also have to be careful about clobbering
+       inputs when trying to move constants into position.  */
+
+    if (have_cmov()) {
+        use_cmov = 1;
+        if (vtc) {
+            if (vfc || d == vf)
+                use_cmov = 0;
+            else
+                do_swap = 1;
+        } else if (d == vt) {
+            if (vfc)
+                use_cmov = 0;
+            else
+                do_swap = 1;
+        }
+    }
+
+    if (!use_cmov) {
+        /* We're going to follow the lead of cmov and set D=VF first,
+           which means inverting the condition upon which we jump.  */
+        cond = tcg_invert_cond(cond);
+
+        /* Don't allow the move we jump over to be a nop.  */
+        do_swap = (!vtc && d == vt);
+    }
+
+    if (do_swap) {
+        TCGArg t;
+        cond = tcg_invert_cond(cond);
+        t = vf, vf = vt, vt = t;
+        t = vfc, vfc = vtc, vtc = t;
+    }
+
+    /* If possible, set D=0 before the compare, so that we can use XOR.  */
+    if (vfc && vf == 0 && d != args[1] && (const_args[2] || d != args[2])) {
+        tcg_out_movi(s, TCG_TYPE_I32, d, vf);
+        vf = d, vfc = 0;
+    }
+
+    tcg_out_cond(s, cond, args[1], args[2], const_args[2]);
+
+    if (vfc) {
+        /* Force the use of "mov $0, d" to avoid clobbering flags.  */
+        tcg_out8(s, 0xb8 + d);
+        tcg_out32(s, vf);
+    } else {
+        tcg_out_mov(s, d, vf);
+    }
+
+    if (use_cmov) {
+        assert (!vtc);
+        tcg_out_modrm(s, 0x40 | tcg_cond_to_jcc[cond] | P_EXT, d, vt);
+    } else {
+        int label_next = gen_new_label();
+
+        tcg_out_jxx(s, tcg_cond_to_jcc[cond], label_next, 1);
+        if (vtc)
+            tcg_out_movi(s, TCG_TYPE_I32, d, vt);
+        else
+            tcg_out_mov(s, d, vt);
+
+        tcg_out_label(s, label_next, (tcg_target_long)s->code_ptr);
+    }
+}
+
 #if defined(CONFIG_SOFTMMU)
 
 #include "../../softmmu_defs.h"
@@ -913,7 +1080,7 @@ static inline void tcg_out_op(TCGContext *s, int opc,
         }
         break;
     case INDEX_op_br:
-        tcg_out_jxx(s, JCC_JMP, args[0]);
+        tcg_out_jxx(s, JCC_JMP, args[0], 0);
        break;
     case INDEX_op_movi_i32:
         tcg_out_movi(s, TCG_TYPE_I32, args[0], args[1]);
@@ -1044,10 +1211,11 @@ static inline void tcg_out_op(TCGContext *s, int opc,
         tcg_out_modrm(s, 0x01 | (ARITH_SBB << 3), args[5], args[1]);
         break;
     case INDEX_op_brcond_i32:
-        tcg_out_brcond(s, args[2], args[0], args[1], const_args[1], args[3]);
+        tcg_out_brcond(s, args[2], args[0], args[1], const_args[1],
+                       args[3], 0);
         break;
     case INDEX_op_brcond2_i32:
-        tcg_out_brcond2(s, args, const_args);
+        tcg_out_brcond2(s, args, const_args, 0);
         break;
 
     case INDEX_op_bswap16_i32:
@@ -1080,6 +1248,16 @@ static inline void tcg_out_op(TCGContext *s, int opc,
         tcg_out_modrm(s, 0xb7 | P_EXT, args[0], args[1]);
         break;
 
+    case INDEX_op_setcond_i32:
+        tcg_out_setcond(s, args[3], args[0], args[1], args[2], const_args[2]);
+        break;
+    case INDEX_op_movcond_i32:
+        tcg_out_movcond(s, args, const_args);
+        break;
+    case INDEX_op_setcond2_i32:
+        tcg_out_setcond2(s, args, const_args);
+        break;
+
     case INDEX_op_qemu_ld8u:
         tcg_out_qemu_ld(s, args, 0);
         break;
@@ -1168,6 +1346,10 @@ static const TCGTargetOpDef x86_op_defs[] = {
     { INDEX_op_ext8u_i32, { "r", "q"} },
     { INDEX_op_ext16u_i32, { "r", "r"} },
 
+    { INDEX_op_setcond_i32, { "q", "r", "ri" } },
+    { INDEX_op_movcond_i32, { "r", "r", "ri", "ri", "ri" } },
+    { INDEX_op_setcond2_i32, { "r", "r", "r", "ri", "ri" } },
+
 #if TARGET_LONG_BITS == 32
     { INDEX_op_qemu_ld8u, { "r", "L" } },
     { INDEX_op_qemu_ld8s, { "r", "L" } },
-- 
1.6.2.5