Add optimized TCG qemu_ld/st generation which puts the code for the TLB miss case (the slow path) at the end of the TB, after all other IRs have been generated. On a TLB hit the fast path now falls straight through, instead of taking a jump over the inline slow-path code, which reduces the jump overhead of every qemu_ld/st.
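For illustration only (not part of the patch), here is a minimal C analogue of the code layout this change produces: the TLB compare takes a 32-bit jne straight to a per-access stub emitted after all other code in the TB, and the stub calls the MMU helper and jumps back. tlb_hit() and slow_load() are hypothetical stand-ins for the TLB compare and the __ext_ld_mmu helper call:

  #include <stdint.h>
  #include <stdio.h>

  static int tlb_hit(uint32_t addr)        { return (addr & 3) == 0; }
  static uint32_t slow_load(uint32_t addr) { return addr ^ 0xffffffffu; }

  static uint32_t load_example(uint32_t addr)
  {
      uint32_t val;
      if (!tlb_hit(addr)) {
          goto slow_path;    /* like the new 32-bit jne to the stub */
      }
      val = addr + 1;        /* fast path: inline host memory access */
  done:
      return val;            /* raddr: where the stub jumps back to */

  slow_path:                 /* emitted out of line, at the end of the TB */
      val = slow_load(addr); /* stands in for the __ext_ld_mmu helper call */
      goto done;             /* tcg_out_jmp(s, raddr) */
  }

  int main(void)
  {
      printf("%u %u\n", load_example(4), load_example(5));
      return 0;
  }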
Signed-off-by: Yeongkyoon Lee
---
 tcg/i386/tcg-target.c |  328 +++++++++++++++++++++++++++++++++++++++++++++++++
 tcg/tcg.c             |   12 ++
 tcg/tcg.h             |   35 +++
 3 files changed, 375 insertions(+), 0 deletions(-)

diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
index da17bba..3f2f640 100644
--- a/tcg/i386/tcg-target.c
+++ b/tcg/i386/tcg-target.c
@@ -984,6 +984,8 @@ static const void *qemu_st_helpers[4] = {
     helper_stq_mmu,
 };
 #else
+
+#ifndef CONFIG_QEMU_LDST_OPTIMIZATION
 /* legacy helper signature: __ld_mmu(target_ulong addr,
    int mmu_idx) */
 static void *qemu_ld_helpers[4] = {
@@ -1001,6 +1003,35 @@ static void *qemu_st_helpers[4] = {
     __stl_mmu,
     __stq_mmu,
 };
+#else
+/* extended legacy helper signature: __ext_ld_mmu(target_ulong addr,
+   int mmu_idx, uintptr_t raddr) */
+static void *qemu_ld_helpers[4] = {
+    __ext_ldb_mmu,
+    __ext_ldw_mmu,
+    __ext_ldl_mmu,
+    __ext_ldq_mmu,
+};
+
+/* extended legacy helper signature: __ext_st_mmu(target_ulong addr,
+   uintxx_t val, int mmu_idx, uintptr_t raddr) */
+static void *qemu_st_helpers[4] = {
+    __ext_stb_mmu,
+    __ext_stw_mmu,
+    __ext_stl_mmu,
+    __ext_stq_mmu,
+};
+
+static void add_qemu_ldst_label(TCGContext *s,
+                                int opc_ext,
+                                int data_reg,
+                                int data_reg2,
+                                int addrlo_reg,
+                                int addrhi_reg,
+                                int mem_index,
+                                uint8_t *raddr,
+                                uint8_t **label_ptr);
+#endif /* !CONFIG_QEMU_LDST_OPTIMIZATION */
 #endif
 
 /* Perform the TLB load and compare.
@@ -1061,19 +1092,36 @@ static inline void tcg_out_tlb_load(TCGContext *s, int addrlo_idx,
 
     tcg_out_mov(s, type, r0, addrlo);
 
+#ifdef CONFIG_QEMU_LDST_OPTIMIZATION
+    /* jne slow_path */
+    tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
+    if (!label_ptr) {
+        tcg_abort();
+    }
+    label_ptr[0] = s->code_ptr;
+    s->code_ptr += 4;
+#else
     /* jne label1 */
     tcg_out8(s, OPC_JCC_short + JCC_JNE);
     label_ptr[0] = s->code_ptr;
     s->code_ptr++;
+#endif
 
     if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
         /* cmp 4(r1), addrhi */
         tcg_out_modrm_offset(s, OPC_CMP_GvEv, args[addrlo_idx+1], r1, 4);
 
+#ifdef CONFIG_QEMU_LDST_OPTIMIZATION
+        /* jne slow_path */
+        tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
+        label_ptr[1] = s->code_ptr;
+        s->code_ptr += 4;
+#else
         /* jne label1 */
         tcg_out8(s, OPC_JCC_short + JCC_JNE);
         label_ptr[1] = s->code_ptr;
         s->code_ptr++;
+#endif
     }
 
     /* TLB Hit.  */
@@ -1171,11 +1219,13 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
     int addrlo_idx;
 #if defined(CONFIG_SOFTMMU)
     int mem_index, s_bits;
+#if !defined(CONFIG_QEMU_LDST_OPTIMIZATION)
 #if TCG_TARGET_REG_BITS == 64
     int arg_idx;
 #else
     int stack_adjust;
 #endif
+#endif /* !CONFIG_QEMU_LDST_OPTIMIZATION */
     uint8_t *label_ptr[3];
 #endif
 
@@ -1197,6 +1247,18 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
     tcg_out_qemu_ld_direct(s, data_reg, data_reg2,
                            tcg_target_call_iarg_regs[0], 0, opc);
 
+#if defined(CONFIG_QEMU_LDST_OPTIMIZATION)
+    /* helper stub will be jumped back here */
+    add_qemu_ldst_label(s,
+                        opc,
+                        data_reg,
+                        data_reg2,
+                        args[addrlo_idx],
+                        args[addrlo_idx + 1],
+                        mem_index,
+                        s->code_ptr,
+                        label_ptr);
+#else
     /* jmp label2 */
     tcg_out8(s, OPC_JMP_short);
     label_ptr[2] = s->code_ptr;
@@ -1292,6 +1354,7 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
 
     /* label2: */
     *label_ptr[2] = s->code_ptr - label_ptr[2] - 1;
+#endif /* CONFIG_QEMU_LDST_OPTIMIZATION */
 #else
     {
         int32_t offset = GUEST_BASE;
@@ -1385,7 +1448,9 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
     int addrlo_idx;
 #if defined(CONFIG_SOFTMMU)
     int mem_index, s_bits;
+#if !defined(CONFIG_QEMU_LDST_OPTIMIZATION)
     int stack_adjust;
+#endif
     uint8_t *label_ptr[3];
 #endif
 
@@ -1407,6 +1472,18 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
     tcg_out_qemu_st_direct(s, data_reg, data_reg2,
                            tcg_target_call_iarg_regs[0], 0, opc);
 
+#if defined(CONFIG_QEMU_LDST_OPTIMIZATION)
+    /* helper stub will be jumped back here */
+    add_qemu_ldst_label(s,
+                        opc | HL_ST_MASK,
+                        data_reg,
+                        data_reg2,
+                        args[addrlo_idx],
+                        args[addrlo_idx + 1],
+                        mem_index,
+                        s->code_ptr,
+                        label_ptr);
+#else
     /* jmp label2 */
     tcg_out8(s, OPC_JMP_short);
     label_ptr[2] = s->code_ptr;
@@ -1469,6 +1546,7 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
 
     /* label2: */
     *label_ptr[2] = s->code_ptr - label_ptr[2] - 1;
+#endif /* CONFIG_QEMU_LDST_OPTIMIZATION */
 #else
     {
         int32_t offset = GUEST_BASE;
@@ -1496,6 +1574,256 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
 #endif
 }
 
+#if defined(CONFIG_QEMU_LDST_OPTIMIZATION)
+/* optimization to reduce jump overheads for qemu_ld/st IRs */
+
+/*
+ * The qemu_ld/st code generators call add_qemu_ldst_label() so that the
+ * slow case (TLB miss or I/O read/write) is handled at the end of the TB.
+ */
+static void add_qemu_ldst_label(TCGContext *s,
+                                int opc_ext,
+                                int data_reg,
+                                int data_reg2,
+                                int addrlo_reg,
+                                int addrhi_reg,
+                                int mem_index,
+                                uint8_t *raddr,
+                                uint8_t **label_ptr)
+{
+    int idx;
+    TCGLabelQemuLdst *label;
+
+    if (s->nb_qemu_ldst_labels >= TCG_MAX_QEMU_LDST) {
+        tcg_abort();
+    }
+
+    idx = s->nb_qemu_ldst_labels++;
+    label = (TCGLabelQemuLdst *)&s->qemu_ldst_labels[idx];
+    label->opc_ext = opc_ext;
+    label->datalo_reg = data_reg;
+    label->datahi_reg = data_reg2;
+    label->addrlo_reg = addrlo_reg;
+    label->addrhi_reg = addrhi_reg;
+    label->mem_index = mem_index;
+    label->raddr = raddr;
+    if (!label_ptr) {
+        tcg_abort();
+    }
+    label->label_ptr[0] = label_ptr[0];
+    label->label_ptr[1] = label_ptr[1];
+}
+
+/* generates the slow case of qemu_ld at the end of the TB */
+static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *label)
+{
+    int s_bits;
+    int opc = label->opc_ext & HL_OPC_MASK;
+    int mem_index = label->mem_index;
+#if TCG_TARGET_REG_BITS == 64
+    int arg_idx;
+#else
+    int stack_adjust;
+    int addrlo_reg = label->addrlo_reg;
+    int addrhi_reg = label->addrhi_reg;
+#endif
+    int data_reg = label->datalo_reg;
+    int data_reg2 = label->datahi_reg;
+    uint8_t *raddr = label->raddr;
+    uint8_t **label_ptr = &label->label_ptr[0];
+
+    s_bits = opc & 3;
+
+    /* resolve label address */
+    *(uint32_t *)label_ptr[0] = (uint32_t)(s->code_ptr - label_ptr[0] - 4);
+    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
+        *(uint32_t *)label_ptr[1] = (uint32_t)(s->code_ptr - label_ptr[1] - 4);
+    }
+
+    /* extended helper signature: __ext_ld_mmu(target_ulong addr, int mmu_idx,
+       uintptr_t raddr) */
+#if TCG_TARGET_REG_BITS == 32
+    tcg_out_pushi(s, (uintptr_t)(raddr - 1)); /* return address */
+    stack_adjust = 4;
+    tcg_out_pushi(s, mem_index); /* mmu index */
+    stack_adjust += 4;
+    if (TARGET_LONG_BITS == 64) {
+        tcg_out_push(s, addrhi_reg);
+        stack_adjust += 4;
+    }
+    tcg_out_push(s, addrlo_reg); /* guest addr */
+    stack_adjust += 4;
+#ifdef CONFIG_TCG_PASS_AREG0
+    tcg_out_push(s, TCG_AREG0);
+    stack_adjust += 4;
+#endif
+#else
+    /* The first argument is already loaded with addrlo. */
+    arg_idx = 1;
+    tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[arg_idx++],
+                 mem_index);
+    tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[arg_idx++],
+                 (uintptr_t)(raddr - 1));
+#ifdef CONFIG_TCG_PASS_AREG0
+    /* XXX/FIXME: suboptimal */
+    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[3],
+                tcg_target_call_iarg_regs[2]);
+    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[2],
+                tcg_target_call_iarg_regs[1]);
+    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[1],
+                tcg_target_call_iarg_regs[0]);
+    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[0],
+                TCG_AREG0);
+#endif
+#endif
+
+    tcg_out_calli(s, (tcg_target_long)qemu_ld_helpers[s_bits]);
+
+#if TCG_TARGET_REG_BITS == 32
+    if (stack_adjust == (TCG_TARGET_REG_BITS / 8)) {
+        /* Pop and discard.  This is 2 bytes smaller than the add.  */
+        tcg_out_pop(s, TCG_REG_ECX);
+    } else if (stack_adjust != 0) {
+        tcg_out_addi(s, TCG_REG_CALL_STACK, stack_adjust);
+    }
+#endif
+
+    switch (opc) {
+    case 0 | 4:
+        tcg_out_ext8s(s, data_reg, TCG_REG_EAX, P_REXW);
+        break;
+    case 1 | 4:
+        tcg_out_ext16s(s, data_reg, TCG_REG_EAX, P_REXW);
+        break;
+    case 0:
+        tcg_out_ext8u(s, data_reg, TCG_REG_EAX);
+        break;
+    case 1:
+        tcg_out_ext16u(s, data_reg, TCG_REG_EAX);
+        break;
+    case 2:
+        tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
+        break;
+#if TCG_TARGET_REG_BITS == 64
+    case 2 | 4:
+        tcg_out_ext32s(s, data_reg, TCG_REG_EAX);
+        break;
+#endif
+    case 3:
+        if (TCG_TARGET_REG_BITS == 64) {
+            tcg_out_mov(s, TCG_TYPE_I64, data_reg, TCG_REG_RAX);
+        } else if (data_reg == TCG_REG_EDX) {
+            /* xchg %edx, %eax */
+            tcg_out_opc(s, OPC_XCHG_ax_r32 + TCG_REG_EDX, 0, 0, 0);
+            tcg_out_mov(s, TCG_TYPE_I32, data_reg2, TCG_REG_EAX);
+        } else {
+            tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
+            tcg_out_mov(s, TCG_TYPE_I32, data_reg2, TCG_REG_EDX);
+        }
+        break;
+    default:
+        tcg_abort();
+    }
+
+    /* jump back to original code */
+    tcg_out_jmp(s, (tcg_target_long)raddr);
+}
+
+/* generates the slow case of qemu_st at the end of the TB */
+static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *label)
+{
+    int s_bits;
+    int stack_adjust;
+    int opc = label->opc_ext & HL_OPC_MASK;
+    int mem_index = label->mem_index;
+    int data_reg = label->datalo_reg;
+#if TCG_TARGET_REG_BITS == 32
+    int data_reg2 = label->datahi_reg;
+    int addrlo_reg = label->addrlo_reg;
+    int addrhi_reg = label->addrhi_reg;
+#endif
+    uint8_t *raddr = label->raddr;
+    uint8_t **label_ptr = &label->label_ptr[0];
+
+    s_bits = opc & 3;
+
+    /* resolve label address */
+    *(uint32_t *)label_ptr[0] = (uint32_t)(s->code_ptr - label_ptr[0] - 4);
+    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
+        *(uint32_t *)label_ptr[1] = (uint32_t)(s->code_ptr - label_ptr[1] - 4);
+    }
+
+    /* extended helper signature: __ext_st_mmu(target_ulong addr, uintxx_t val,
+       int mmu_idx, uintptr_t raddr) */
+#if TCG_TARGET_REG_BITS == 32
+    tcg_out_pushi(s, (uintptr_t)(raddr - 1)); /* return address */
+    stack_adjust = 4;
+    tcg_out_pushi(s, mem_index); /* mmu index */
+    stack_adjust += 4;
+    if (opc == 3) {
+        tcg_out_push(s, data_reg2);
+        stack_adjust += 4;
+    }
+    tcg_out_push(s, data_reg); /* guest data */
+    stack_adjust += 4;
+    if (TARGET_LONG_BITS == 64) {
+        tcg_out_push(s, addrhi_reg);
+        stack_adjust += 4;
+    }
+    tcg_out_push(s, addrlo_reg); /* guest addr */
+    stack_adjust += 4;
+#ifdef CONFIG_TCG_PASS_AREG0
+    tcg_out_push(s, TCG_AREG0);
+    stack_adjust += 4;
+#endif
+#else
+    tcg_out_mov(s, (opc == 3 ? TCG_TYPE_I64 : TCG_TYPE_I32),
+                tcg_target_call_iarg_regs[1], data_reg);
+    tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[2], mem_index);
+    tcg_out_movi(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[3],
+                 (uintptr_t)(raddr - 1));
+    stack_adjust = 0;
+#ifdef CONFIG_TCG_PASS_AREG0
+    /* XXX/FIXME: suboptimal */
+    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[3],
+                tcg_target_call_iarg_regs[2]);
+    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[2],
+                tcg_target_call_iarg_regs[1]);
+    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[1],
+                tcg_target_call_iarg_regs[0]);
+    tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[0],
+                TCG_AREG0);
+#endif
+#endif
+
+    tcg_out_calli(s, (tcg_target_long)qemu_st_helpers[s_bits]);
+
+    if (stack_adjust == (TCG_TARGET_REG_BITS / 8)) {
+        /* Pop and discard.  This is 2 bytes smaller than the add.  */
+        tcg_out_pop(s, TCG_REG_ECX);
+    } else if (stack_adjust != 0) {
+        tcg_out_addi(s, TCG_REG_CALL_STACK, stack_adjust);
+    }
+
+    /* jump back to original code */
+    tcg_out_jmp(s, (tcg_target_long)raddr);
+}
+
+/* generates all of the slow cases of qemu_ld/st at the end of the TB */
+void tcg_out_qemu_ldst_slow_path(TCGContext *s)
+{
+    int i;
+    TCGLabelQemuLdst *label;
+
+    for (i = 0; i < s->nb_qemu_ldst_labels; i++) {
+        label = (TCGLabelQemuLdst *)&s->qemu_ldst_labels[i];
+        if (IS_QEMU_LD_LABEL(label)) {
+            tcg_out_qemu_ld_slow_path(s, label);
+        } else {
+            tcg_out_qemu_st_slow_path(s, label);
+        }
+    }
+}
+#endif /* CONFIG_QEMU_LDST_OPTIMIZATION */
+
 static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
                               const TCGArg *args, const int *const_args)
 {
diff --git a/tcg/tcg.c b/tcg/tcg.c
index 8386b70..c33cb96 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -301,6 +301,14 @@ void tcg_func_start(TCGContext *s)
     gen_opc_ptr = gen_opc_buf;
     gen_opparam_ptr = gen_opparam_buf;
+#if defined(CONFIG_QEMU_LDST_OPTIMIZATION)
+    /* initialize the qemu_ld/st labels, which help to generate the TLB miss
+       case code at the end of the TB */
+    s->qemu_ldst_labels = tcg_malloc(sizeof(TCGLabelQemuLdst) *
+                                     TCG_MAX_QEMU_LDST);
+    if (!s->qemu_ldst_labels) {
+        tcg_abort();
+    }
+    s->nb_qemu_ldst_labels = 0;
+#endif
 }
 
 static inline void tcg_temp_alloc(TCGContext *s, int n)
@@ -2169,6 +2177,10 @@ static inline int tcg_gen_code_common(TCGContext *s, uint8_t *gen_code_buf,
 #endif
     }
  the_end:
+#if defined(CONFIG_QEMU_LDST_OPTIMIZATION)
+    /* Generate the MMU call helpers at the end of the block (currently only
+       for qemu_ld/st) */
+    tcg_out_qemu_ldst_slow_path(s);
+#endif
     return -1;
 }
 
diff --git a/tcg/tcg.h b/tcg/tcg.h
index d710694..e52d3a4 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -187,6 +187,29 @@ typedef tcg_target_ulong TCGArg;
    are aliases for target_ulong and host pointer sized values respectively. */
 
+#if defined(CONFIG_QEMU_LDST_OPTIMIZATION)
+/* Macros and structures for qemu_ld/st IR code optimization:
+   TCG_MAX_QEMU_LDST is set to half of OPC_BUF_SIZE in exec-all.h. */
+#define TCG_MAX_QEMU_LDST       320
+#define HL_LDST_SHIFT           4
+#define HL_LDST_MASK            (1 << HL_LDST_SHIFT)
+#define HL_ST_MASK              HL_LDST_MASK
+#define HL_OPC_MASK             (HL_LDST_MASK - 1)
+#define IS_QEMU_LD_LABEL(L)     (!((L)->opc_ext & HL_LDST_MASK))
+#define IS_QEMU_ST_LABEL(L)     ((L)->opc_ext & HL_LDST_MASK)
+
+typedef struct TCGLabelQemuLdst {
+    int opc_ext;           /* | 27 bits (reserved) | 1 bit (ld/st) | 4 bits (opc) | */
+    int addrlo_reg;        /* reg index for the low word of the guest virtual address */
+    int addrhi_reg;        /* reg index for the high word of the guest virtual address */
+    int datalo_reg;        /* reg index for the low word to be loaded or stored */
+    int datahi_reg;        /* reg index for the high word to be loaded or stored */
+    int mem_index;         /* soft MMU memory index */
+    uint8_t *raddr;        /* return address (located at the end of the TB) */
+    uint8_t *label_ptr[2]; /* label pointers to be updated */
+} TCGLabelQemuLdst;
+#endif /* CONFIG_QEMU_LDST_OPTIMIZATION */
+
 #ifdef CONFIG_DEBUG_TCG
 #define DEBUG_TCGV 1
 #endif
@@ -389,6 +412,13 @@ struct TCGContext {
 #ifdef CONFIG_DEBUG_TCG
     int temps_in_use;
 #endif
+
+#if defined(CONFIG_QEMU_LDST_OPTIMIZATION)
+    /* label info for the qemu_ld/st IRs; the labels help to generate the
+       TLB miss case code at the end of the TB */
+    TCGLabelQemuLdst *qemu_ldst_labels;
+    int nb_qemu_ldst_labels;
+#endif
 };
 
 extern TCGContext tcg_ctx;
@@ -588,3 +618,8 @@ extern uint8_t code_gen_prologue[];
 #endif
 
 void tcg_register_jit(void *buf, size_t buf_size);
+
+#if defined(CONFIG_QEMU_LDST_OPTIMIZATION)
+/* qemu_ld/st slow path generation at the end of the TB */
+void tcg_out_qemu_ldst_slow_path(TCGContext *s);
+#endif
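For reference, and purely illustrative (not part of the patch): a standalone program exercising the opc_ext encoding introduced above. The HL_* macros are copied from the tcg.h hunk; the values and the main() driver are made up for the example.

  #include <stdio.h>

  #define HL_LDST_SHIFT 4
  #define HL_LDST_MASK  (1 << HL_LDST_SHIFT)
  #define HL_ST_MASK    HL_LDST_MASK
  #define HL_OPC_MASK   (HL_LDST_MASK - 1)

  int main(void)
  {
      int ld_opc_ext = 2;              /* qemu_ld32: ld/st flag clear, opc = 2 */
      int st_opc_ext = 2 | HL_ST_MASK; /* qemu_st32: ld/st flag set, as in
                                          tcg_out_qemu_st() above */

      printf("ld: is_store=%d opc=%d\n",
             !!(ld_opc_ext & HL_LDST_MASK), ld_opc_ext & HL_OPC_MASK);
      printf("st: is_store=%d opc=%d\n",
             !!(st_opc_ext & HL_LDST_MASK), st_opc_ext & HL_OPC_MASK);
      return 0;
  }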