Alex Bennée <alex.ben...@linaro.org> writes:

> Emilio G. Cota <c...@braap.org> writes:
>
>> This is a first attempt at making tb_flush not have to stop all CPUs.
>> There are issues as pointed out below, but this could be a good start.
>>
>> Context:
>>   https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg04658.html
>>   https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg06942.html
>>
>> Known issues:
>> - Basically compile-tested only, since I've only run this with
>>   single-threaded TCG; I also tried running it with linux-user,
>>   but in order to trigger tb_flush I had to make code_gen_buffer
>>   so small that the CPU calling tb_flush would immediately fill
>>   the 2nd buffer, triggering the assert. If you have a working
>>   multi-threaded workload that would be good to test this, please
>>   let me know.
>
> With my latest mttcg unit tests:
>
> ./arm-softmmu/qemu-system-arm -machine virt,accel=tcg -cpu cortex-a15 \
>   -device virtio-serial-device -device virtconsole,chardev=ctd \
>   -chardev testdev,id=ctd -display none -serial stdio \
>   -kernel arm/tcg-test.flat -smp 4 -tcg mttcg=on \
>   -append "tight smc irq mod=1 rounds=100000" -name arm,debug-threads=on

Ahh, I just realised you wanted a linux-user workload.

>
>
>> - Windows; not even compile-tested!
>>
>> Signed-off-by: Emilio G. Cota <c...@braap.org>
>> ---
>>  translate-all.c | 122 +++++++++++++++++++++++++++++++++++++++++++++++++++++---
>>  1 file changed, 117 insertions(+), 5 deletions(-)
>>
>> diff --git a/translate-all.c b/translate-all.c
>> index bba9b62..4c14b4d 100644
>> --- a/translate-all.c
>> +++ b/translate-all.c
>> @@ -536,8 +536,13 @@ static inline void *split_cross_256mb(void *buf1, size_t size1)
>>  #endif
>>
>>  #ifdef USE_STATIC_CODE_GEN_BUFFER
>> -static uint8_t static_code_gen_buffer[DEFAULT_CODE_GEN_BUFFER_SIZE]
>> +static uint8_t static_code_gen_buffer1[DEFAULT_CODE_GEN_BUFFER_SIZE]
>>      __attribute__((aligned(CODE_GEN_ALIGN)));
>> +static uint8_t static_code_gen_buffer2[DEFAULT_CODE_GEN_BUFFER_SIZE]
>> +    __attribute__((aligned(CODE_GEN_ALIGN)));
>> +static int static_buf_mask = 1;
>> +static void *static_buf1;
>> +static void *static_buf2;
>>
>>  # ifdef _WIN32
>>  static inline void do_protect(void *addr, long size, int prot)
>> @@ -580,13 +585,12 @@ static inline void map_none(void *addr, long size)
>>  }
>>  # endif /* WIN32 */
>>
>> -static inline void *alloc_code_gen_buffer(void)
>> +static void *alloc_static_code_gen_buffer(void *buf)
>>  {
>> -    void *buf = static_code_gen_buffer;
>>      size_t full_size, size;
>>
>>      /* The size of the buffer, rounded down to end on a page boundary. */
>> -    full_size = (((uintptr_t)buf + sizeof(static_code_gen_buffer))
>> +    full_size = (((uintptr_t)buf + sizeof(static_code_gen_buffer1))
>>                   & qemu_real_host_page_mask) - (uintptr_t)buf;
>>
>>      /* Reserve a guard page. */
>> @@ -612,6 +616,15 @@ static inline void *alloc_code_gen_buffer(void)
>>
>>      return buf;
>>  }
>> +
>> +static inline void *alloc_code_gen_buffer(void)
>> +{
>> +    static_buf1 = alloc_static_code_gen_buffer(static_code_gen_buffer1);
>> +    static_buf2 = alloc_static_code_gen_buffer(static_code_gen_buffer2);
>> +
>> +    assert(static_buf_mask == 1);
>> +    return static_buf1;
>> +}
>>  #elif defined(_WIN32)
>>  static inline void *alloc_code_gen_buffer(void)
>>  {
>> @@ -829,8 +842,100 @@ static void page_flush_tb(void)
>>      }
>>  }
>>
>> +#ifdef USE_STATIC_CODE_GEN_BUFFER
>> +
>> +struct code_gen_desc {
>> +    struct rcu_head rcu;
>> +    int clear_bit;
>> +};
>> +
>> +static void code_gen_buffer_clear(struct rcu_head *rcu)
>> +{
>> +    struct code_gen_desc *desc = container_of(rcu, struct code_gen_desc, rcu);
>> +
>> +    tb_lock();
>> +    static_buf_mask &= ~desc->clear_bit;
>> +    tb_unlock();
>> +    g_free(desc);
>> +}
>> +
>> +static void *code_gen_buffer_replace(void)
>> +{
>> +    struct code_gen_desc *desc = g_malloc0(sizeof(*desc));
>> +
>> +    /*
>> +     * If both bits are set, we're having two concurrent flushes. This
>> +     * can easily happen if the buffers are heavily undersized.
>> +     */
>> +    assert(static_buf_mask == 1 || static_buf_mask == 2);
>> +
>> +    desc->clear_bit = static_buf_mask;
>> +    call_rcu1(&desc->rcu, code_gen_buffer_clear);
>> +
>> +    if (static_buf_mask == 1) {
>> +        static_buf_mask |= 2;
>> +        return static_buf2;
>> +    }
>> +    static_buf_mask |= 1;
>> +    return static_buf1;
>> +}
>> +
>> +#elif defined(_WIN32)
>> +
>> +struct code_gen_desc {
>> +    struct rcu_head rcu;
>> +    void *buf;
>> +};
>> +
>> +static void code_gen_buffer_vfree(struct rcu_head *rcu)
>> +{
>> +    struct code_gen_desc *desc = container_of(rcu, struct code_gen_desc, rcu);
>> +
>> +    VirtualFree(desc->buf, 0, MEM_RELEASE);
>> +    g_free(desc);
>> +}
>> +
>> +static void *code_gen_buffer_replace(void)
>> +{
>> +    struct code_gen_desc *desc;
>> +
>> +    desc = g_malloc0(sizeof(*desc));
>> +    desc->buf = tcg_ctx.code_gen_buffer;
>> +    call_rcu1(&desc->rcu, code_gen_buffer_vfree);
>> +
>> +    return alloc_code_gen_buffer();
>> +}
>> +
>> +#else /* UNIX, dynamically-allocated code buffer */
>> +
>> +struct code_gen_desc {
>> +    struct rcu_head rcu;
>> +    void *buf;
>> +    size_t size;
>> +};
>> +
>> +static void code_gen_buffer_unmap(struct rcu_head *rcu)
>> +{
>> +    struct code_gen_desc *desc = container_of(rcu, struct code_gen_desc, rcu);
>> +
>> +    munmap(desc->buf, desc->size + qemu_real_host_page_size);
>> +    g_free(desc);
>> +}
>> +
>> +static void *code_gen_buffer_replace(void)
>> +{
>> +    struct code_gen_desc *desc;
>> +
>> +    desc = g_malloc0(sizeof(*desc));
>> +    desc->buf = tcg_ctx.code_gen_buffer;
>> +    desc->size = tcg_ctx.code_gen_buffer_size;
>> +    call_rcu1(&desc->rcu, code_gen_buffer_unmap);
>> +
>> +    return alloc_code_gen_buffer();
>> +}
>> +#endif /* USE_STATIC_CODE_GEN_BUFFER */
>> +
>>  /* flush all the translation blocks */
>> -/* XXX: tb_flush is currently not thread safe */
>>  void tb_flush(CPUState *cpu)
>>  {
>>  #if defined(DEBUG_FLUSH)
>> @@ -853,10 +958,17 @@ void tb_flush(CPUState *cpu)
>>      qht_reset_size(&tcg_ctx.tb_ctx.htable, CODE_GEN_HTABLE_SIZE);
>>      page_flush_tb();
>>
>> +    tcg_ctx.code_gen_buffer = code_gen_buffer_replace();
>>      tcg_ctx.code_gen_ptr = tcg_ctx.code_gen_buffer;
>> +    tcg_prologue_init(&tcg_ctx);
>>      /* XXX: flush processor icache at this point if cache flush is
>>         expensive */
>>      tcg_ctx.tb_ctx.tb_flush_count++;
>> +
>> +    /* exit all CPUs so that the old buffer is quickly cleared. */
>> +    CPU_FOREACH(cpu) {
>> +        cpu_exit(cpu);
>> +    }
>>  }
>>
>>  #ifdef DEBUG_TB_CHECK

-- 
Alex Bennée