On Fri, Dec 20, 2013 at 03:00:12PM -0800, Richard Henderson wrote:
> As present on Atom and Haswell processors.
>
> Signed-off-by: Richard Henderson <r...@twiddle.net>
> ---
>  disas/i386.c          |   8 ++--
>  tcg/i386/tcg-target.c | 127
> ++++++++++++++++++++++++++++++++++----------------
>  2 files changed, 91 insertions(+), 44 deletions(-)
>
> Here's to "stress testing" a Haswell laptop before Santa delivers it. ;-)
>
>
> r~
>
>
> diff --git a/disas/i386.c b/disas/i386.c
> index 47f1f2e..beb33f0 100644
> --- a/disas/i386.c
> +++ b/disas/i386.c
> @@ -2632,16 +2632,16 @@ static const struct dis386 prefix_user_table[][4] = {
>
>    /* PREGRP87 */
>    {
> -    { "(bad)", { XX } },
> -    { "(bad)", { XX } },
> +    { "movbe", { Gv, M } },
> +    { "movbe", { Gw, M } },
This is not correct; this disassembles movbe with the REPZ prefix.

>      { "(bad)", { XX } },

You want it there instead; that entry is the one for the DATA prefix.

>      { "crc32", { Gdq, { CRC32_Fixup, b_mode } } },
>    },
>
>    /* PREGRP88 */
>    {
> -    { "(bad)", { XX } },
> -    { "(bad)", { XX } },
> +    { "movbe", { M, Gv } },
> +    { "movbe", { M, Gw } },
>      { "(bad)", { XX } },

Ditto.

>      { "crc32", { Gdq, { CRC32_Fixup, v_mode } } },
>    },
> diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
> index 7ac8e45..e76f3f3 100644
> --- a/tcg/i386/tcg-target.c
> +++ b/tcg/i386/tcg-target.c
> @@ -99,18 +99,28 @@ static const int tcg_target_call_oarg_regs[] = {
>  # define TCG_REG_L1 TCG_REG_EDX
>  #endif
>
> +#ifdef CONFIG_CPUID_H
> +#include <cpuid.h>
> +#endif
> +
>  /* For 32-bit, we are going to attempt to determine at runtime whether cmov
>     is available.  However, the host compiler must supply <cpuid.h>, as we're
>     not going to go so far as our own inline assembly.  */
>  #if TCG_TARGET_REG_BITS == 64
>  # define have_cmov 1
>  #elif defined(CONFIG_CPUID_H)
> -#include <cpuid.h>
>  static bool have_cmov;
>  #else
>  # define have_cmov 0
>  #endif
>
> +/* There is no pre-processor definition for MOVBE to fall back on. */
> +#ifdef CONFIG_CPUID_H

As you already remarked, you should check for bit_MOVBE, as it was only
introduced in GCC 4.6.

> +static bool have_movbe;
> +#else
> +#define have_movbe 0

Very minor nitpick, but you probably want to indent the define, like above
for cmov.

> +#endif
> +
>  static uint8_t *tb_ret_addr;
>
>  static void patch_reloc(uint8_t *code_ptr, int type,
> @@ -254,6 +264,7 @@ static inline int tcg_target_const_match(tcg_target_long
> val,
>  # define P_REXB_RM 0
>  # define P_GS 0
>  #endif
> +#define P_EXT38 0x8000 /* 0x0f 0x38 opcode prefix */
>
>  #define OPC_ARITH_EvIz (0x81)
>  #define OPC_ARITH_EvIb (0x83)
> @@ -279,6 +290,8 @@ static inline int tcg_target_const_match(tcg_target_long
> val,
>  #define OPC_MOVB_EvIz   (0xc6)
>  #define OPC_MOVL_EvIz   (0xc7)
>  #define OPC_MOVL_Iv     (0xb8)
> +#define OPC_MOVBE_GyMy  (0xf0 | P_EXT | P_EXT38)
> +#define OPC_MOVBE_MyGy  (0xf1 | P_EXT | P_EXT38)
>  #define OPC_MOVSBL      (0xbe | P_EXT)
>  #define OPC_MOVSWL      (0xbf | P_EXT)
>  #define OPC_MOVSLQ      (0x63 | P_REXW)
> @@ -400,6 +413,9 @@ static void tcg_out_opc(TCGContext *s, int opc, int r,
> int rm, int x)
>
>      if (opc & P_EXT) {
>          tcg_out8(s, 0x0f);
> +        if (opc & P_EXT38) {
> +            tcg_out8(s, 0x38);
> +        }
>      }
>      tcg_out8(s, opc);
>  }
> @@ -411,6 +427,9 @@ static void tcg_out_opc(TCGContext *s, int opc)
>      }
>      if (opc & P_EXT) {
>          tcg_out8(s, 0x0f);
> +        if (opc & P_EXT38) {
> +            tcg_out8(s, 0x38);
> +        }
>      }
>      tcg_out8(s, opc);
>  }
> @@ -1336,7 +1355,14 @@ static void tcg_out_qemu_ld_direct(TCGContext *s,
> TCGReg datalo, TCGReg datahi,
>                                     TCGReg base, intptr_t ofs, int seg,
>                                     TCGMemOp memop)
>  {
> -    const TCGMemOp bswap = memop & MO_BSWAP;
> +    const TCGMemOp real_bswap = memop & MO_BSWAP;
> +    TCGMemOp bswap = real_bswap;
> +    int movop = OPC_MOVL_GvEv;
> +
> +    if (real_bswap && have_movbe) {
> +        bswap = 0;
> +        movop = OPC_MOVBE_GyMy;
> +    }

Defining movop is clever; that said, the name real_bswap doesn't sound very
self-describing.  Moreover, you still need to check for have_movbe below...
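For reference, movbe is nothing more than a load/store that byte-swaps on
the fly, so at the C level the fast path above amounts to something like the
snippet below.  This is only my own illustration, not code from the patch,
and load_be32 is a made-up name (it assumes GCC/clang for __builtin_bswap32):

    #include <stdint.h>
    #include <string.h>

    /* movbe reg32, [mem] behaves like a 32-bit load followed by a bswap,
       which is why the code above can clear "bswap" and switch movop to
       OPC_MOVBE_GyMy when the host supports the instruction. */
    static inline uint32_t load_be32(const void *p)
    {
        uint32_t v;

        memcpy(&v, p, 4);              /* plain host-order (LE) load */
        return __builtin_bswap32(v);   /* the swap movbe does for free */
    }

With movbe the load and the swap collapse into one instruction, which is the
whole point of the patch.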
>      switch (memop & MO_SSIZE) {
>      case MO_UB:
> @@ -1346,32 +1372,45 @@ static void tcg_out_qemu_ld_direct(TCGContext *s,
> TCGReg datalo, TCGReg datahi,
>          tcg_out_modrm_offset(s, OPC_MOVSBL + P_REXW + seg, datalo, base,
> ofs);
>          break;
>      case MO_UW:
> -        tcg_out_modrm_offset(s, OPC_MOVZWL + seg, datalo, base, ofs);
> -        if (bswap) {
> -            tcg_out_rolw_8(s, datalo);
> +        if (real_bswap && have_movbe) {
> +            tcg_out_modrm_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
> +                                 datalo, base, ofs);
> +            tcg_out_ext16u(s, datalo, datalo);
> +        } else {
> +            tcg_out_modrm_offset(s, OPC_MOVZWL + seg, datalo, base, ofs);
> +            if (bswap) {
> +                tcg_out_rolw_8(s, datalo);
> +            }

What about keeping the bswap definition like before and doing something like:

        if (bswap && have_movbe) {
            tcg_out_modrm_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
                                 datalo, base, ofs);
            tcg_out_ext16u(s, datalo, datalo);
        } else {
            tcg_out_modrm_offset(s, OPC_MOVZWL + seg, datalo, base, ofs);
            if (bswap) {
                tcg_out_rolw_8(s, datalo);
            }
        }

>          }
>          break;
>      case MO_SW:
> -        if (bswap) {
> -            tcg_out_modrm_offset(s, OPC_MOVZWL + seg, datalo, base, ofs);
> -            tcg_out_rolw_8(s, datalo);
> -            tcg_out_modrm(s, OPC_MOVSWL + P_REXW, datalo, datalo);
> +        if (real_bswap) {
> +            if (have_movbe) {
> +                tcg_out_modrm_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
> +                                     datalo, base, ofs);
> +            } else {
> +                tcg_out_modrm_offset(s, OPC_MOVZWL + seg, datalo, base, ofs);
> +                tcg_out_rolw_8(s, datalo);
> +            }
> +            tcg_out_ext16s(s, datalo, datalo, P_REXW);
>          } else {
>              tcg_out_modrm_offset(s, OPC_MOVSWL + P_REXW + seg,
>                                   datalo, base, ofs);
>          }
>          break;
>      case MO_UL:
> -        tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg, datalo, base, ofs);
> +        tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
>          if (bswap) {
>              tcg_out_bswap32(s, datalo);
>          }

And here:

        tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
        if (bswap && !have_movbe) {
            tcg_out_bswap32(s, datalo);
        }

>          break;
>  #if TCG_TARGET_REG_BITS == 64
>      case MO_SL:
> -        if (bswap) {
> -            tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg, datalo, base, ofs);
> -            tcg_out_bswap32(s, datalo);
> +        if (real_bswap) {
> +            tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
> +            if (bswap) {
> +                tcg_out_bswap32(s, datalo);
> +            }
>              tcg_out_ext32s(s, datalo, datalo);
>          } else {

And here:

        if (bswap) {
            tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
            if (!have_movbe) {
                tcg_out_bswap32(s, datalo);
            }
            tcg_out_ext32s(s, datalo, datalo);
        } else {

>              tcg_out_modrm_offset(s, OPC_MOVSLQ + seg, datalo, base, ofs);
> @@ -1380,27 +1419,22 @@ static void tcg_out_qemu_ld_direct(TCGContext *s,
> TCGReg datalo, TCGReg datahi,
>  #endif
>      case MO_Q:
>          if (TCG_TARGET_REG_BITS == 64) {
> -            tcg_out_modrm_offset(s, OPC_MOVL_GvEv + P_REXW + seg,
> -                                 datalo, base, ofs);
> +            tcg_out_modrm_offset(s, movop + P_REXW + seg, datalo, base, ofs);
>              if (bswap) {
>                  tcg_out_bswap64(s, datalo);
>              }
>          } else {
> -            if (bswap) {
> +            if (real_bswap) {
>                  int t = datalo;
>                  datalo = datahi;
>                  datahi = t;
>              }
>              if (base != datalo) {
> -                tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg,
> -                                     datalo, base, ofs);
> -                tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg,
> -                                     datahi, base, ofs + 4);
> +                tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
> +                tcg_out_modrm_offset(s, movop + seg, datahi, base, ofs + 4);
>              } else {
> -                tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg,
> -                                     datahi, base, ofs + 4);
> -                tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg,
> -                                     datalo, base, ofs);
> +                tcg_out_modrm_offset(s, movop + seg, datahi, base, ofs + 4);
> +                tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
>              }
>          if (bswap) {
>              tcg_out_bswap32(s, datalo);
> @@ -1476,13 +1510,19 @@ static void tcg_out_qemu_st_direct(TCGContext *s,
> TCGReg datalo, TCGReg datahi,
>                                     TCGReg base, intptr_t ofs, int seg,
>                                     TCGMemOp memop)
>  {
> -    const TCGMemOp bswap = memop & MO_BSWAP;
> -
>      /* ??? Ideally we wouldn't need a scratch register.  For user-only,
>         we could perform the bswap twice to restore the original value
>         instead of moving to the scratch.  But as it is, the L constraint
>         means that TCG_REG_L0 is definitely free here.  */
>      const TCGReg scratch = TCG_REG_L0;
> +    const TCGMemOp real_bswap = memop & MO_BSWAP;
> +    TCGMemOp bswap = real_bswap;
> +    int movop = OPC_MOVL_EvGv;
> +
> +    if (real_bswap && have_movbe) {
> +        bswap = 0;
> +        movop = OPC_MOVBE_MyGy;
> +    }

Ditto here.

>      switch (memop & MO_SIZE) {
>      case MO_8:
> @@ -1501,8 +1541,7 @@ static void tcg_out_qemu_st_direct(TCGContext *s,
> TCGReg datalo, TCGReg datahi,
>              tcg_out_rolw_8(s, scratch);
>              datalo = scratch;
>          }
> -        tcg_out_modrm_offset(s, OPC_MOVL_EvGv + P_DATA16 + seg,
> -                             datalo, base, ofs);
> +        tcg_out_modrm_offset(s, movop + P_DATA16 + seg, datalo, base, ofs);
>          break;
>      case MO_32:
>          if (bswap) {
> @@ -1510,7 +1549,7 @@ static void tcg_out_qemu_st_direct(TCGContext *s,
> TCGReg datalo, TCGReg datahi,
>              tcg_out_bswap32(s, scratch);
>              datalo = scratch;
>          }
> -        tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, datalo, base, ofs);
> +        tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
>          break;
>      case MO_64:
>          if (TCG_TARGET_REG_BITS == 64) {
> @@ -1519,8 +1558,7 @@ static void tcg_out_qemu_st_direct(TCGContext *s,
> TCGReg datalo, TCGReg datahi,
>                  tcg_out_bswap64(s, scratch);
>                  datalo = scratch;
>              }
> -            tcg_out_modrm_offset(s, OPC_MOVL_EvGv + P_REXW + seg,
> -                                 datalo, base, ofs);
> +            tcg_out_modrm_offset(s, movop + P_REXW + seg, datalo, base, ofs);
>          } else if (bswap) {
>              tcg_out_mov(s, TCG_TYPE_I32, scratch, datahi);
>              tcg_out_bswap32(s, scratch);
> @@ -1529,8 +1567,13 @@ static void tcg_out_qemu_st_direct(TCGContext *s,
> TCGReg datalo, TCGReg datahi,
>              tcg_out_bswap32(s, scratch);
>              tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, scratch, base,
> ofs+4);
>          } else {
> -            tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, datalo, base, ofs);
> -            tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, datahi, base,
> ofs+4);
> +            if (real_bswap) {
> +                int t = datalo;
> +                datalo = datahi;
> +                datahi = t;
> +            }
> +            tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
> +            tcg_out_modrm_offset(s, movop + seg, datahi, base, ofs+4);
>          }
>          break;
>      default:
> @@ -2157,15 +2200,19 @@ static void tcg_target_qemu_prologue(TCGContext *s)
>
>  static void tcg_target_init(TCGContext *s)
>  {
> -    /* For 32-bit, 99% certainty that we're running on hardware that supports
> -       cmov, but we still need to check.  In case cmov is not available,
> we'll
> -       use a small forward branch.  */
> +    unsigned a, b, c, d;
> +
> +    if (__get_cpuid(1, &a, &b, &c, &d)) {
>  #ifndef have_cmov
> -    {
> -        unsigned a, b, c, d;
> -        have_cmov = (__get_cpuid(1, &a, &b, &c, &d) && (d & bit_CMOV));
> -    }
> +        /* For 32-bit, 99% certainty that we're running on hardware that
> +           supports cmov, but we still need to check.  In case cmov is not
> +           available, we'll use a small forward branch.  */
> +        have_cmov = (d & bit_CMOV) != 0;
>  #endif
> +#ifndef have_movbe
> +        have_movbe = (c & bit_MOVBE) != 0;
> +#endif
> +    }

Your code no longer uses the result of __get_cpuid to initialize have_cmov
or have_movbe; when __get_cpuid() fails, they are simply left untouched.
While the C standard mandates that static variables are initialized to 0,
it might be a good idea to do so explicitly.
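Something along these lines would make both points explicit (untested
sketch, only to illustrate; it drops the 64-bit "#define have_cmov 1"
special case for brevity, uses a made-up helper name, and folds in the
bit_MOVBE fallback for pre-4.6 cpuid.h mentioned above):

    #include <stdbool.h>

    #ifdef CONFIG_CPUID_H
    # include <cpuid.h>
    # ifndef bit_MOVBE                 /* cpuid.h before GCC 4.6 lacks it */
    #  define bit_MOVBE (1 << 22)      /* CPUID.1:ECX bit 22 */
    # endif
    static bool have_cmov = false;     /* explicit, even if static init is 0 */
    static bool have_movbe = false;
    #else
    # define have_cmov 0
    # define have_movbe 0
    #endif

    static void detect_host_features(void)   /* made-up helper name */
    {
    #ifdef CONFIG_CPUID_H
        unsigned a, b, c, d;

        if (__get_cpuid(1, &a, &b, &c, &d)) {
            have_cmov = (d & bit_CMOV) != 0;
            have_movbe = (c & bit_MOVBE) != 0;
        }
        /* if __get_cpuid() fails, both flags keep their explicit false */
    #endif
    }

That way a failing cpuid leaves the flags in a well-defined state without
the reader having to remember the static-initialization rule.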
>      if (TCG_TARGET_REG_BITS == 64) {
>          tcg_regset_set32(tcg_target_available_regs[TCG_TYPE_I32], 0, 0xffff);

Otherwise looks good.

-- 
Aurelien Jarno                          GPG: 1024D/F1BCDB73
aurel...@aurel32.net                    http://www.aurel32.net