On Tue, Oct 9, 2012 at 10:30 PM, Aurelien Jarno <aurel...@aurel32.net> wrote: > Use ldr pc, [pc, #-4] kind of branch for direct jump. This removes the > need to flush the icache on TB linking, and allow to remove the limit > on the code generation buffer.
I'm not sure I like it. In general having data in the middle of code will increase I/D cache and I/D TLB pressure. > This improves the boot-up speed of a MIPS guest by 11%. Boot speed is very specific. Did you test some other code? Also what was your host? Testing on a quad core Cortex-A9, using all of your patches (including TCG optimizations), I get this running nbench i386 in user mode: TEST : Iter/sec. : Old Index : New Index : : Pentium 90 : AMD K6/233 --------------------:------------:------------:----------- NUMERIC SORT : 119.48 : 3.06 : 1.01 STRING SORT : 7.7907 : 3.48 : 0.54 BITFIELD : 2.2049e+07 : 3.78 : 0.79 FP EMULATION : 5.094 : 2.44 : 0.56 FOURIER : 483.73 : 0.55 : 0.31 ASSIGNMENT : 1.778 : 6.77 : 1.75 IDEA : 341.43 : 5.22 : 1.55 HUFFMAN : 45.942 : 1.27 : 0.41 NEURAL NET : 0.16667 : 0.27 : 0.11 LU DECOMPOSITION : 5.969 : 0.31 : 0.22 ===================ORIGINAL BYTEMARK RESULTS============== INTEGER INDEX : 3.319 FLOATING-POINT INDEX: 0.357 =======================LINUX DATA BELOW=================== MEMORY INDEX : 0.907 INTEGER INDEX : 0.774 FLOATING-POINT INDEX: 0.198 Not using this patch, I get: TEST : Iter/sec. : Old Index : New Index : : Pentium 90 : AMD K6/233 --------------------:------------:------------:----------- NUMERIC SORT : 121.88 : 3.13 : 1.03 STRING SORT : 7.8438 : 3.50 : 0.54 BITFIELD : 2.2597e+07 : 3.88 : 0.81 FP EMULATION : 5.1424 : 2.47 : 0.57 FOURIER : 466.04 : 0.53 : 0.30 ASSIGNMENT : 1.809 : 6.88 : 1.79 IDEA : 359.28 : 5.50 : 1.63 HUFFMAN : 46.225 : 1.28 : 0.41 NEURAL NET : 0.16644 : 0.27 : 0.11 LU DECOMPOSITION : 5.77 : 0.30 : 0.22 ===================ORIGINAL BYTEMARK RESULTS============== INTEGER INDEX : 3.384 FLOATING-POINT INDEX: 0.349 =======================LINUX DATA BELOW=================== MEMORY INDEX : 0.922 INTEGER INDEX : 0.790 FLOATING-POINT INDEX: 0.193 This patch doesn't bring any speedup in that case. I guess we need more testing as a synthetic benchmark is as specific as kernel booting :-) Laurent > Cc: Andrzej Zaborowski <balr...@gmail.com> > Cc: Peter Maydell <peter.mayd...@linaro.org> > Signed-off-by: Aurelien Jarno <aurel...@aurel32.net> > --- > exec-all.h | 24 ++++-------------------- > exec.c | 4 ---- > tcg/arm/tcg-target.c | 7 +------ > 3 files changed, 5 insertions(+), 30 deletions(-) > > diff --git a/exec-all.h b/exec-all.h > index 6516da0..662b916 100644 > --- a/exec-all.h > +++ b/exec-all.h > @@ -224,26 +224,10 @@ static inline void tb_set_jmp_target1(uintptr_t > jmp_addr, uintptr_t addr) > #elif defined(__arm__) > static inline void tb_set_jmp_target1(uintptr_t jmp_addr, uintptr_t addr) > { > -#if !QEMU_GNUC_PREREQ(4, 1) > - register unsigned long _beg __asm ("a1"); > - register unsigned long _end __asm ("a2"); > - register unsigned long _flg __asm ("a3"); > -#endif > - > - /* we could use a ldr pc, [pc, #-4] kind of branch and avoid the flush */ > - *(uint32_t *)jmp_addr = > - (*(uint32_t *)jmp_addr & ~0xffffff) > - | (((addr - (jmp_addr + 8)) >> 2) & 0xffffff); > - > -#if QEMU_GNUC_PREREQ(4, 1) > - __builtin___clear_cache((char *) jmp_addr, (char *) jmp_addr + 4); > -#else > - /* flush icache */ > - _beg = jmp_addr; > - _end = jmp_addr + 4; > - _flg = 0; > - __asm __volatile__ ("swi 0x9f0002" : : "r" (_beg), "r" (_end), "r" > (_flg)); > -#endif > + /* Patch the branch destination. It uses a ldr pc, [pc, #-4] kind > + of branch so we write absolute address and we don't need to > + flush icache. */ > + *(uint32_t *)jmp_addr = addr; > } > #elif defined(__sparc__) > void tb_set_jmp_target1(uintptr_t jmp_addr, uintptr_t addr); > diff --git a/exec.c b/exec.c > index 7899042..8d115ac 100644 > --- a/exec.c > +++ b/exec.c > @@ -546,10 +546,6 @@ static void code_gen_alloc(unsigned long tb_size) > start = (void *) 0x40000000UL; > if (code_gen_buffer_size > (512 * 1024 * 1024)) > code_gen_buffer_size = (512 * 1024 * 1024); > -#elif defined(__arm__) > - /* Keep the buffer no bigger than 16MB to branch between blocks */ > - if (code_gen_buffer_size > 16 * 1024 * 1024) > - code_gen_buffer_size = 16 * 1024 * 1024; > #elif defined(__s390x__) > /* Map the buffer so that we can use direct calls and branches. */ > /* We have a +- 4GB range on the branches; leave some slop. */ > diff --git a/tcg/arm/tcg-target.c b/tcg/arm/tcg-target.c > index fafbd5d..e04cfa7 100644 > --- a/tcg/arm/tcg-target.c > +++ b/tcg/arm/tcg-target.c > @@ -1501,14 +1501,9 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode > opc, > case INDEX_op_goto_tb: > if (s->tb_jmp_offset) { > /* Direct jump method */ > -#if defined(USE_DIRECT_JUMP) > - s->tb_jmp_offset[args[0]] = s->code_ptr - s->code_buf; > - tcg_out_b_noaddr(s, COND_AL); > -#else > tcg_out_ld32_12(s, COND_AL, TCG_REG_PC, TCG_REG_PC, -4); > s->tb_jmp_offset[args[0]] = s->code_ptr - s->code_buf; > - tcg_out32(s, 0); > -#endif > + tcg_out32(s, (int)s->code_ptr + 4); > } else { > /* Indirect jump method */ > #if 1 > -- > 1.7.10.4 > >