On Thu, Mar 28, 2013 at 08:32:55AM -0700, Richard Henderson wrote: > Eliminate 2 disabled code blocks. Choose the load-to-pc method of > jumping so that we can eliminate the 16M code_gen_buffer limitation. > Remove a test in the indirect jump method that is always true. > > Signed-off-by: Richard Henderson <r...@twiddle.net> > --- > include/exec/exec-all.h | 23 +++-------------------- > tcg/arm/tcg-target.c | 26 ++++++-------------------- > translate-all.c | 2 -- > 3 files changed, 9 insertions(+), 42 deletions(-)
I already proposed such a patch, but it seems to improve things only on some specific cases (kernel boot), while increasing the I/D cache and TLB pressure. See https://lists.gnu.org/archive/html/qemu-devel/2012-10/msg01684.html > diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h > index e856191..190effa 100644 > --- a/include/exec/exec-all.h > +++ b/include/exec/exec-all.h > @@ -233,26 +233,9 @@ static inline void tb_set_jmp_target1(uintptr_t > jmp_addr, uintptr_t addr) > #elif defined(__arm__) > static inline void tb_set_jmp_target1(uintptr_t jmp_addr, uintptr_t addr) > { > -#if !QEMU_GNUC_PREREQ(4, 1) > - register unsigned long _beg __asm ("a1"); > - register unsigned long _end __asm ("a2"); > - register unsigned long _flg __asm ("a3"); > -#endif > - > - /* we could use a ldr pc, [pc, #-4] kind of branch and avoid the flush */ > - *(uint32_t *)jmp_addr = > - (*(uint32_t *)jmp_addr & ~0xffffff) > - | (((addr - (jmp_addr + 8)) >> 2) & 0xffffff); > - > -#if QEMU_GNUC_PREREQ(4, 1) > - __builtin___clear_cache((char *) jmp_addr, (char *) jmp_addr + 4); > -#else > - /* flush icache */ > - _beg = jmp_addr; > - _end = jmp_addr + 4; > - _flg = 0; > - __asm __volatile__ ("swi 0x9f0002" : : "r" (_beg), "r" (_end), "r" > (_flg)); > -#endif > + /* We're using "ldr pc, [pc,#-4]", so we can just store the raw > + address, without caring for flushing the icache. */ > + *(uint32_t *)jmp_addr = addr; > } > #elif defined(__sparc__) > void tb_set_jmp_target1(uintptr_t jmp_addr, uintptr_t addr); > diff --git a/tcg/arm/tcg-target.c b/tcg/arm/tcg-target.c > index 7bcba19..6de5e90 100644 > --- a/tcg/arm/tcg-target.c > +++ b/tcg/arm/tcg-target.c > @@ -1595,30 +1595,16 @@ static inline void tcg_out_op(TCGContext *s, > TCGOpcode opc, > break; > case INDEX_op_goto_tb: > if (s->tb_jmp_offset) { > - /* Direct jump method */ > -#if defined(USE_DIRECT_JUMP) > - s->tb_jmp_offset[args[0]] = s->code_ptr - s->code_buf; > - tcg_out_b_noaddr(s, COND_AL); > -#else > + /* "Direct" jump method. Rather than limit the code gen buffer > + to 16M, load the destination from the next word. */ > tcg_out_ld32_12(s, COND_AL, TCG_REG_PC, TCG_REG_PC, -4); > s->tb_jmp_offset[args[0]] = s->code_ptr - s->code_buf; > - tcg_out32(s, 0); > -#endif > + s->code_ptr += 4; > } else { > - /* Indirect jump method */ > -#if 1 > - c = (int) (s->tb_next + args[0]) - ((int) s->code_ptr + 8); > - if (c > 0xfff || c < -0xfff) { > - tcg_out_movi32(s, COND_AL, TCG_REG_R0, > - (tcg_target_long) (s->tb_next + args[0])); > - tcg_out_ld32_12(s, COND_AL, TCG_REG_PC, TCG_REG_R0, 0); > - } else > - tcg_out_ld32_12(s, COND_AL, TCG_REG_PC, TCG_REG_PC, c); > -#else > - tcg_out_ld32_12(s, COND_AL, TCG_REG_R0, TCG_REG_PC, 0); > + /* Indirect jump method. */ > + tcg_out_movi32(s, COND_AL, TCG_REG_R0, > + (tcg_target_long) (s->tb_next + args[0])); > tcg_out_ld32_12(s, COND_AL, TCG_REG_PC, TCG_REG_R0, 0); > - tcg_out32(s, (tcg_target_long) (s->tb_next + args[0])); > -#endif > } > s->tb_next_offset[args[0]] = s->code_ptr - s->code_buf; > break; > diff --git a/translate-all.c b/translate-all.c > index a98c646..3ca839f 100644 > --- a/translate-all.c > +++ b/translate-all.c > @@ -460,8 +460,6 @@ static inline PageDesc *page_find(tb_page_addr_t index) > # define MAX_CODE_GEN_BUFFER_SIZE (2ul * 1024 * 1024 * 1024) > #elif defined(__sparc__) > # define MAX_CODE_GEN_BUFFER_SIZE (2ul * 1024 * 1024 * 1024) > -#elif defined(__arm__) > -# define MAX_CODE_GEN_BUFFER_SIZE (16u * 1024 * 1024) > #elif defined(__s390x__) > /* We have a +- 4GB range on the branches; leave some slop. */ > # define MAX_CODE_GEN_BUFFER_SIZE (3ul * 1024 * 1024 * 1024) > -- > 1.8.1.4 > > > -- Aurelien Jarno GPG: 1024D/F1BCDB73 aurel...@aurel32.net http://www.aurel32.net