Richard Henderson <richard.hender...@linaro.org> writes:
> This is an incremental step toward moving the qemu_ld/st
> code sequence out of line.
>
> Signed-off-by: Richard Henderson <richard.hender...@linaro.org>
> ---
>  tcg/i386/tcg-target.inc.c | 203 +++++++++++++++++++++++++++++++-------
>  1 file changed, 169 insertions(+), 34 deletions(-)
>
> diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c
> index 07df4b2b12..50e5dc31b3 100644
> --- a/tcg/i386/tcg-target.inc.c
> +++ b/tcg/i386/tcg-target.inc.c
> @@ -171,6 +171,80 @@ static bool have_lzcnt;
>
>  static tcg_insn_unit *tb_ret_addr;
>
> +typedef enum {
> +    ARG_ADDR,
> +    ARG_STVAL,
> +    ARG_LDVAL,
> +} QemuMemArgType;
> +
> +#ifdef CONFIG_SOFTMMU
> +/*
> + * Constraint to choose a particular register.  This is used for softmmu
> + * loads and stores.  Registers with no assignment get an empty string.
> + */
> +static const char * const one_reg_constraint[TCG_TARGET_NB_REGS] = {
> +    [TCG_REG_EAX] = "a",
> +    [TCG_REG_EBX] = "b",
> +    [TCG_REG_ECX] = "c",
> +    [TCG_REG_EDX] = "d",
> +    [TCG_REG_ESI] = "S",
> +    [TCG_REG_EDI] = "D",
> +#if TCG_TARGET_REG_BITS == 64
> +    [TCG_REG_R8]  = "E",
> +    [TCG_REG_R9]  = "N",
> +#endif
> +};
> +
> +/*
> + * Calling convention for the softmmu load and store thunks.
> + *
> + * For 64-bit, we mostly use the host calling convention, therefore the
> + * real first argument is reserved for the ENV parameter that is passed
> + * on to the slow path helpers.
> + *
> + * For 32-bit, the host calling convention is stack based; we invent a
> + * private convention that uses 4 of the 6 available host registers.
> + * We reserve EAX and EDX as temporaries for use by the thunk, we require
> + * INDEX_op_qemu_st_i32 to have a 'q' register from which to store, and
> + * further complicate this last by wanting a call-clobbered for that store.
> + * The 'q' requirement allows MO_8 stores at all; the call-clobbered part
> + * allows bswap to operate in-place, clobbering the input.
> + */
> +static TCGReg softmmu_arg(QemuMemArgType type, bool is_64, int hi)
> +{
> +    switch (type) {
> +    case ARG_ADDR:
> +        tcg_debug_assert(!hi || TARGET_LONG_BITS > TCG_TARGET_REG_BITS);
> +        if (TCG_TARGET_REG_BITS == 64) {
> +            return tcg_target_call_iarg_regs[1];
> +        } else {
> +            return hi ? TCG_REG_EDI : TCG_REG_ESI;
> +        }
> +    case ARG_STVAL:
> +        tcg_debug_assert(!hi || (TCG_TARGET_REG_BITS == 32 && is_64));
> +        if (TCG_TARGET_REG_BITS == 64) {
> +            return tcg_target_call_iarg_regs[2];
> +        } else {
> +            return hi ? TCG_REG_EBX : TCG_REG_ECX;
> +        }
> +    case ARG_LDVAL:
> +        tcg_debug_assert(!hi || (TCG_TARGET_REG_BITS == 32 && is_64));
> +        return tcg_target_call_oarg_regs[hi];
> +    }
> +    g_assert_not_reached();
> +}
> +
> +static const char *constrain_memop_arg(QemuMemArgType type, bool is_64, int hi)
> +{
> +    return one_reg_constraint[softmmu_arg(type, is_64, hi)];
> +}
> +#else
> +static const char *constrain_memop_arg(QemuMemArgType type, bool is_64, int hi)
> +{
> +    return "L";
> +}
> +#endif /* CONFIG_SOFTMMU */
> +
>  static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
>                          intptr_t value, intptr_t addend)
>  {
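
Just to check I follow the convention: tracing softmmu_arg() by hand
(assuming the usual SysV tcg_target_call_iarg_regs order of
RDI/RSI/RDX/... on a 64-bit host), INDEX_op_qemu_st_i64 should end up
constrained to { "d", "S" } on a 64-bit host, and to
{ "c", "b", "S", "D" } on a 32-bit host with a 64-bit guest - value in
ECX:EBX, address in ESI:EDI, leaving EAX/EDX free as the thunk
temporaries you mention. If that is right it might be worth spelling
one such example out in the comment block.
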
> @@ -1680,11 +1754,15 @@ static TCGReg tcg_out_tlb_load(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
>        copies the entire guest address for the slow path, while truncation
>        for the 32-bit host happens with the fastpath ADDL below.  */
>     if (TCG_TARGET_REG_BITS == 64) {
> -        base = tcg_target_call_iarg_regs[1];
> +        tcg_debug_assert(addrlo == tcg_target_call_iarg_regs[1]);
> +        if (TARGET_LONG_BITS == 32) {
> +            tcg_out_ext32u(s, addrlo, addrlo);
> +        }
> +        base = addrlo;
>     } else {
>         base = r1;
> +        tcg_out_mov(s, ttype, base, addrlo);
>     }
> -    tcg_out_mov(s, ttype, base, addrlo);
>
>     /* jne slow_path */
>     tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
> @@ -2009,16 +2087,22 @@ static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
>        common. */
>  static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64)
>  {
> -    TCGReg datalo, datahi, addrlo;
> -    TCGReg addrhi __attribute__((unused));
> +    TCGReg datalo, addrlo;
> +    TCGReg datahi __attribute__((unused)) = -1;
> +    TCGReg addrhi __attribute__((unused)) = -1;
>      TCGMemOpIdx oi;
>      TCGMemOp opc;
> +    int i = -1;
>
> -    datalo = *args++;
> -    datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
> -    addrlo = *args++;
> -    addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
> -    oi = *args++;
> +    datalo = args[++i];

Swapping to i = -1 and pre-increment indexing seems a little unnatural
here compared to the more usual i = 0 and i++, unless there was a
specific reason to have i end up in the range 2-4?
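
i.e. I would have expected the more idiomatic (untested sketch):

  int i = 0;

  datalo = args[i++];
  if (TCG_TARGET_REG_BITS == 32 && is64) {
      datahi = args[i++];
  }
  addrlo = args[i++];
  ...

which consumes the same argument slots in the same order.
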
> +    if (TCG_TARGET_REG_BITS == 32 && is64) {
> +        datahi = args[++i];
> +    }
> +    addrlo = args[++i];
> +    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
> +        addrhi = args[++i];
> +    }
> +    oi = args[++i];
>      opc = get_memop(oi);
>
>  #if defined(CONFIG_SOFTMMU)
> @@ -2027,6 +2111,15 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64)
>      tcg_insn_unit *label_ptr[2];
>      TCGReg base;
>
> +    tcg_debug_assert(datalo == softmmu_arg(ARG_LDVAL, is64, 0));
> +    if (TCG_TARGET_REG_BITS == 32 && is64) {
> +        tcg_debug_assert(datahi == softmmu_arg(ARG_LDVAL, is64, 1));
> +    }
> +    tcg_debug_assert(addrlo == softmmu_arg(ARG_ADDR, 0, 0));
> +    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
> +        tcg_debug_assert(addrhi == softmmu_arg(ARG_ADDR, 0, 1));
> +    }
> +
>      base = tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
>                              label_ptr, offsetof(CPUTLBEntry, addr_read));
>
> @@ -2149,16 +2242,22 @@ static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
>
>  static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
>  {
> -    TCGReg datalo, datahi, addrlo;
> -    TCGReg addrhi __attribute__((unused));
> +    TCGReg datalo, addrlo;
> +    TCGReg datahi __attribute__((unused)) = -1;
> +    TCGReg addrhi __attribute__((unused)) = -1;
>      TCGMemOpIdx oi;
>      TCGMemOp opc;
> +    int i = -1;

And again here.

>
> -    datalo = *args++;
> -    datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
> -    addrlo = *args++;
> -    addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
> -    oi = *args++;
> +    datalo = args[++i];
> +    if (TCG_TARGET_REG_BITS == 32 && is64) {
> +        datahi = args[++i];
> +    }
> +    addrlo = args[++i];
> +    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
> +        addrhi = args[++i];
> +    }
> +    oi = args[++i];
>      opc = get_memop(oi);
>
>  #if defined(CONFIG_SOFTMMU)
> @@ -2167,6 +2266,15 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
>      tcg_insn_unit *label_ptr[2];
>      TCGReg base;
>
> +    tcg_debug_assert(datalo == softmmu_arg(ARG_STVAL, is64, 0));
> +    if (TCG_TARGET_REG_BITS == 32 && is64) {
> +        tcg_debug_assert(datahi == softmmu_arg(ARG_STVAL, is64, 1));
> +    }
> +    tcg_debug_assert(addrlo == softmmu_arg(ARG_ADDR, 0, 0));
> +    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
> +        tcg_debug_assert(addrhi == softmmu_arg(ARG_ADDR, 0, 1));
> +    }
> +
>      base = tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
>                              label_ptr, offsetof(CPUTLBEntry, addr_write));
>
> @@ -2836,15 +2944,6 @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
>      static const TCGTargetOpDef r_r_re = { .args_ct_str = { "r", "r", "re" } };
>      static const TCGTargetOpDef r_0_re = { .args_ct_str = { "r", "0", "re" } };
>      static const TCGTargetOpDef r_0_ci = { .args_ct_str = { "r", "0", "ci" } };
> -    static const TCGTargetOpDef r_L = { .args_ct_str = { "r", "L" } };
> -    static const TCGTargetOpDef L_L = { .args_ct_str = { "L", "L" } };
> -    static const TCGTargetOpDef r_L_L = { .args_ct_str = { "r", "L", "L" } };
> -    static const TCGTargetOpDef r_r_L = { .args_ct_str = { "r", "r", "L" } };
> -    static const TCGTargetOpDef L_L_L = { .args_ct_str = { "L", "L", "L" } };
> -    static const TCGTargetOpDef r_r_L_L
> -        = { .args_ct_str = { "r", "r", "L", "L" } };
> -    static const TCGTargetOpDef L_L_L_L
> -        = { .args_ct_str = { "L", "L", "L", "L" } };
>      static const TCGTargetOpDef x_x = { .args_ct_str = { "x", "x" } };
>      static const TCGTargetOpDef x_x_x = { .args_ct_str = { "x", "x", "x" } };
>      static const TCGTargetOpDef x_x_x_x
> @@ -3026,17 +3125,53 @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
>      }
>
>      case INDEX_op_qemu_ld_i32:
> -        return TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &r_L : &r_L_L;
> -    case INDEX_op_qemu_st_i32:
> -        return TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &L_L : &L_L_L;
> +        {
> +            static TCGTargetOpDef ld32;
> +            int i;
> +
> +            ld32.args_ct_str[0] = constrain_memop_arg(ARG_LDVAL, 0, 0);
> +            for (i = 0; i * TCG_TARGET_REG_BITS < TARGET_LONG_BITS; ++i) {

This formulation is a bit weird to follow; at first I thought it was
going to overflow TCG_MAX_OP_ARGS until I realised what it was doing.
Maybe a helper function would be a little clearer:

  for (i = 0; i < tcg_target_regs_for(TARGET_LONG_BITS); ++i)

Same comment applies below.
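
where the hypothetical tcg_target_regs_for() (name invented here) would
just return the number of host registers needed to hold a value of the
given width, something like:

  /* How many host registers a value of 'bits' bits occupies.  */
  static inline int tcg_target_regs_for(int bits)
  {
      return bits <= TCG_TARGET_REG_BITS ? 1 : 2;
  }

so the loop bound is visibly capped at two instead of making the
reader do the arithmetic.
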
> +                ld32.args_ct_str[i + 1] = constrain_memop_arg(ARG_ADDR, 0, i);
> +            }
> +            return &ld32;
> +        }
>      case INDEX_op_qemu_ld_i64:
> -        return (TCG_TARGET_REG_BITS == 64 ? &r_L
> -                : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &r_r_L
> -                : &r_r_L_L);
> +        {
> +            static TCGTargetOpDef ld64;
> +            int i, j = 0;
> +
> +            for (i = 0; i * TCG_TARGET_REG_BITS < 64; ++i) {
> +                ld64.args_ct_str[j++] = constrain_memop_arg(ARG_LDVAL, 1, i);
> +            }
> +            for (i = 0; i * TCG_TARGET_REG_BITS < TARGET_LONG_BITS; ++i) {
> +                ld64.args_ct_str[j++] = constrain_memop_arg(ARG_ADDR, 0, i);
> +            }
> +            return &ld64;
> +        }
> +    case INDEX_op_qemu_st_i32:
> +        {
> +            static TCGTargetOpDef st32;
> +            int i;
> +
> +            st32.args_ct_str[0] = constrain_memop_arg(ARG_STVAL, 0, 0);
> +            for (i = 0; i * TCG_TARGET_REG_BITS < TARGET_LONG_BITS; ++i) {
> +                st32.args_ct_str[i + 1] = constrain_memop_arg(ARG_ADDR, 0, i);
> +            }
> +            return &st32;
> +        }
>      case INDEX_op_qemu_st_i64:
> -        return (TCG_TARGET_REG_BITS == 64 ? &L_L
> -                : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &L_L_L
> -                : &L_L_L_L);
> +        {
> +            static TCGTargetOpDef st64;
> +            int i, j = 0;
> +
> +            for (i = 0; i * TCG_TARGET_REG_BITS < 64; ++i) {
> +                st64.args_ct_str[j++] = constrain_memop_arg(ARG_STVAL, 1, i);
> +            }
> +            for (i = 0; i * TCG_TARGET_REG_BITS < TARGET_LONG_BITS; ++i) {
> +                st64.args_ct_str[j++] = constrain_memop_arg(ARG_ADDR, 0, i);
> +            }
> +            return &st64;
> +        }
>
>      case INDEX_op_brcond2_i32:
>      {

--
Alex Bennée