On Thu, Dec 10, 2015 at 4:02 PM, Aurelien Jarno <aurel...@aurel32.net> wrote: > Use ldr pc, [pc, #-4] kind of branch for direct jump. This removes the > need to flush the icache on TB linking, and allow to remove the limit > on the code generation buffer. > > Cc: Richard Henderson <r...@twiddle.net> > Cc: TeLeMan <gele...@gmail.com> > Cc: Andrzej Zaborowski <balr...@gmail.com> > Signed-off-by: Aurelien Jarno <aurel...@aurel32.net> > --- > include/exec/exec-all.h | 24 ++++-------------------- > tcg/arm/tcg-target.c | 8 +++----- > translate-all.c | 2 -- > 3 files changed, 7 insertions(+), 27 deletions(-) > > Note: I don't really get the reason for the current 16MB limit. With the > standard branch instructions the offset is coded on 24 bits, but shifted > right by 2, which should give us a +/-32MB jumps, and therefore a 32MB > limit. > > Therefore it might be a good idea to benchmark the original QEMU vs a > QEMU with a 32MB buffer vs QEMU with this patch. > > If mixing data and instructions like in this patch causes too much > performances trouble, at least on ARMv7 we might want to try with movw + > movt + movpc. It's only 4 bytes longer than the current patch. > > diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h > index d900b0d..3cd63c9 100644 > --- a/include/exec/exec-all.h > +++ b/include/exec/exec-all.h > @@ -274,26 +274,10 @@ void aarch64_tb_set_jmp_target(uintptr_t jmp_addr, > uintptr_t addr); > #elif defined(__arm__) > static inline void tb_set_jmp_target1(uintptr_t jmp_addr, uintptr_t addr) > { > -#if !QEMU_GNUC_PREREQ(4, 1) > - register unsigned long _beg __asm ("a1"); > - register unsigned long _end __asm ("a2"); > - register unsigned long _flg __asm ("a3"); > -#endif > - > - /* we could use a ldr pc, [pc, #-4] kind of branch and avoid the flush */ > - *(uint32_t *)jmp_addr = > - (*(uint32_t *)jmp_addr & ~0xffffff) > - | (((addr - (jmp_addr + 8)) >> 2) & 0xffffff); > - > -#if QEMU_GNUC_PREREQ(4, 1) > - __builtin___clear_cache((char *) jmp_addr, (char *) jmp_addr + 4); > -#else > - /* flush icache */ > - _beg = jmp_addr; > - _end = jmp_addr + 4; > - _flg = 0; > - __asm __volatile__ ("swi 0x9f0002" : : "r" (_beg), "r" (_end), "r" > (_flg)); > -#endif > + /* Patch the branch destination. It uses a ldr pc, [pc, #-4] kind > + of branch so we write absolute address and we don't need to > + flush icache. */ > + *(uint32_t *)jmp_addr = addr; > } > #elif defined(__sparc__) || defined(__mips__) > void tb_set_jmp_target1(uintptr_t jmp_addr, uintptr_t addr); > diff --git a/tcg/arm/tcg-target.c b/tcg/arm/tcg-target.c > index 3edf6a6..f28b9ba 100644 > --- a/tcg/arm/tcg-target.c > +++ b/tcg/arm/tcg-target.c > @@ -986,10 +986,6 @@ static inline void tcg_out_st8(TCGContext *s, int cond, > tcg_out_st8_12(s, cond, rd, rn, offset); > } > > -/* The _goto case is normally between TBs within the same code buffer, and > - * with the code buffer limited to 16MB we wouldn't need the long case. > - * But we also use it for the tail-call to the qemu_ld/st helpers, which > does. > - */ > static inline void tcg_out_goto(TCGContext *s, int cond, tcg_insn_unit *addr) > { > intptr_t addri = (intptr_t)addr; > @@ -1649,8 +1645,10 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode > opc, > case INDEX_op_goto_tb: > if (s->tb_jmp_offset) { > /* Direct jump method */ > + tcg_out_ld32_12(s, COND_AL, TCG_REG_PC, TCG_REG_PC, -4); > s->tb_jmp_offset[args[0]] = tcg_current_code_size(s); > - tcg_out_b_noaddr(s, COND_AL); > + /* Skip over address */ > + s->code_ptr++; > } else { > /* Indirect jump method */ > intptr_t ptr = (intptr_t)(s->tb_next + args[0]); > diff --git a/translate-all.c b/translate-all.c > index 042a857..1ca113c 100644 > --- a/translate-all.c > +++ b/translate-all.c > @@ -472,8 +472,6 @@ static inline PageDesc *page_find(tb_page_addr_t index) > # define MAX_CODE_GEN_BUFFER_SIZE (2ul * 1024 * 1024 * 1024) > #elif defined(__aarch64__) > # define MAX_CODE_GEN_BUFFER_SIZE (128ul * 1024 * 1024) > -#elif defined(__arm__) > -# define MAX_CODE_GEN_BUFFER_SIZE (16u * 1024 * 1024) > #elif defined(__s390x__) > /* We have a +- 4GB range on the branches; leave some slop. */ > # define MAX_CODE_GEN_BUFFER_SIZE (3ul * 1024 * 1024 * 1024) > -- > 2.6.2 >
Tested-by: TeLeMan <gele...@gmail.com> tb_size flush_count boot_time 32MB about 90 about 37 minutes 128MB 4 about 30 minutes 256MB 1 about 30 minutes boot_time is the time of booting Windows XP until the qemu's cpu usage is under 50%. I tested on ARM A7 1.5GHz and boot_time is not precise.