Emilio G. Cota <c...@braap.org> writes:

> This is a first attempt at making tb_flush not have to stop all CPUs.
> There are issues as pointed out below, but this could be a good start.
>
> Context:
> https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg04658.html
> https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg06942.html
>
> Known issues:
> - Basically compile-tested only, since I've only run this with
>   single-threaded TCG; I also tried running it with linux-user,
>   but in order to trigger tb_flush I had to make code_gen_buffer
>   so small that the CPU calling tb_flush would immediately fill
>   the 2nd buffer, triggering the assert. If you have a working
>   multi-threaded workload that would be good to test this, please
>   let me know.
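(One way to provoke flushes without a big workload is to shrink the
translation buffer at build time; a local hack along these lines, the
1 MiB figure picked arbitrarily:

    /* translate-all.c: make the code generation buffer tiny so that
     * tb_flush fires almost immediately.  Local testing hack only,
     * not for merging. */
    #define DEFAULT_CODE_GEN_BUFFER_SIZE  (1 * 1024 * 1024)

which of course also makes the two-flushes-in-one-grace-period assert
much easier to hit, as you found.)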
With my latest mttcg unit tests:

  ./arm-softmmu/qemu-system-arm -machine virt,accel=tcg -cpu cortex-a15 \
    -device virtio-serial-device -device virtconsole,chardev=ctd \
    -chardev testdev,id=ctd -display none -serial stdio \
    -kernel arm/tcg-test.flat -smp 4 -tcg mttcg=on \
    -append "tight smc irq mod=1 rounds=100000" -name arm,debug-threads=on

> - Windows; not even compile-tested!
>
> Signed-off-by: Emilio G. Cota <c...@braap.org>
> ---
>  translate-all.c | 122 +++++++++++++++++++++++++++++++++++++++++++++++++++++---
>  1 file changed, 117 insertions(+), 5 deletions(-)
>
> diff --git a/translate-all.c b/translate-all.c
> index bba9b62..4c14b4d 100644
> --- a/translate-all.c
> +++ b/translate-all.c
> @@ -536,8 +536,13 @@ static inline void *split_cross_256mb(void *buf1, size_t size1)
>  #endif
>
>  #ifdef USE_STATIC_CODE_GEN_BUFFER
> -static uint8_t static_code_gen_buffer[DEFAULT_CODE_GEN_BUFFER_SIZE]
> +static uint8_t static_code_gen_buffer1[DEFAULT_CODE_GEN_BUFFER_SIZE]
>      __attribute__((aligned(CODE_GEN_ALIGN)));
> +static uint8_t static_code_gen_buffer2[DEFAULT_CODE_GEN_BUFFER_SIZE]
> +    __attribute__((aligned(CODE_GEN_ALIGN)));
> +static int static_buf_mask = 1;
> +static void *static_buf1;
> +static void *static_buf2;
>
>  # ifdef _WIN32
>  static inline void do_protect(void *addr, long size, int prot)
> @@ -580,13 +585,12 @@ static inline void map_none(void *addr, long size)
>  }
>  # endif /* WIN32 */
>
> -static inline void *alloc_code_gen_buffer(void)
> +static void *alloc_static_code_gen_buffer(void *buf)
>  {
> -    void *buf = static_code_gen_buffer;
>      size_t full_size, size;
>
>      /* The size of the buffer, rounded down to end on a page boundary. */
> -    full_size = (((uintptr_t)buf + sizeof(static_code_gen_buffer))
> +    full_size = (((uintptr_t)buf + sizeof(static_code_gen_buffer1))
>                   & qemu_real_host_page_mask) - (uintptr_t)buf;
>
>      /* Reserve a guard page. */
> @@ -612,6 +616,15 @@ static inline void *alloc_code_gen_buffer(void)
>
>      return buf;
>  }
> +
> +static inline void *alloc_code_gen_buffer(void)
> +{
> +    static_buf1 = alloc_static_code_gen_buffer(static_code_gen_buffer1);
> +    static_buf2 = alloc_static_code_gen_buffer(static_code_gen_buffer2);
> +
> +    assert(static_buf_mask == 1);
> +    return static_buf1;
> +}
>  #elif defined(_WIN32)
>  static inline void *alloc_code_gen_buffer(void)
>  {
> @@ -829,8 +842,100 @@ static void page_flush_tb(void)
>      }
>  }
>
> +#ifdef USE_STATIC_CODE_GEN_BUFFER
> +
> +struct code_gen_desc {
> +    struct rcu_head rcu;
> +    int clear_bit;
> +};
> +
> +static void code_gen_buffer_clear(struct rcu_head *rcu)
> +{
> +    struct code_gen_desc *desc = container_of(rcu, struct code_gen_desc, rcu);
> +
> +    tb_lock();
> +    static_buf_mask &= ~desc->clear_bit;
> +    tb_unlock();
> +    g_free(desc);
> +}
> +
> +static void *code_gen_buffer_replace(void)
> +{
> +    struct code_gen_desc *desc = g_malloc0(sizeof(*desc));
> +
> +    /*
> +     * If both bits are set, we're having two concurrent flushes. This
> +     * can easily happen if the buffers are heavily undersized.
> +     */
> +    assert(static_buf_mask == 1 || static_buf_mask == 2);
> +
> +    desc->clear_bit = static_buf_mask;
> +    call_rcu1(&desc->rcu, code_gen_buffer_clear);
> +
> +    if (static_buf_mask == 1) {
> +        static_buf_mask |= 2;
> +        return static_buf2;
> +    }
> +    static_buf_mask |= 1;
> +    return static_buf1;
> +}
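The flip-flop here is neat: static_buf_mask tracks which of the two
static buffers are live, and the retired buffer's bit is only cleared
once the RCU grace period expires, so a second flush arriving before
every CPU has left the old code trips the assert above. To check my
reading, a standalone toy of the pattern (names are mine, and the RCU
callback is faked by an explicit grace_period_end() between flushes):

    #include <assert.h>
    #include <stdio.h>

    static char buf1[16], buf2[16];
    static int mask = 1;        /* bit 1 = buf1 live, bit 2 = buf2 live */
    static int pending_clear;   /* bit to drop when the grace period ends */

    /* Stands in for the code_gen_buffer_clear() RCU callback. */
    static void grace_period_end(void)
    {
        mask &= ~pending_clear;
        pending_clear = 0;
    }

    /* Stands in for code_gen_buffer_replace(). */
    static char *buffer_replace(void)
    {
        /* Both bits set would mean a flush raced with an unfinished
         * grace period, i.e. both buffers are undersized. */
        assert(mask == 1 || mask == 2);

        pending_clear = mask;   /* queued via call_rcu1() in the patch */
        if (mask == 1) {
            mask |= 2;
            return buf2;
        }
        mask |= 1;
        return buf1;
    }

    int main(void)
    {
        int i;

        for (i = 0; i < 4; i++) {
            char *cur = buffer_replace();
            printf("flush %d -> %s, mask now %d\n",
                   i, cur == buf1 ? "buf1" : "buf2", mask);
            grace_period_end(); /* pretend all vCPUs left the old buffer */
        }
        return 0;
    }

That alternates buf1/buf2 as expected; drop the grace_period_end() call
and the second flush asserts, which matches the undersized-buffer
failure described in your cover note.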
> +
> +#elif defined(_WIN32)
> +
> +struct code_gen_desc {
> +    struct rcu_head rcu;
> +    void *buf;
> +};
> +
> +static void code_gen_buffer_vfree(struct rcu_head *rcu)
> +{
> +    struct code_gen_desc *desc = container_of(rcu, struct code_gen_desc, rcu);
> +
> +    VirtualFree(desc->buf, 0, MEM_RELEASE);
> +    g_free(desc);
> +}
> +
> +static void *code_gen_buffer_replace(void)
> +{
> +    struct code_gen_desc *desc;
> +
> +    desc = g_malloc0(sizeof(*desc));
> +    desc->buf = tcg_ctx.code_gen_buffer;
> +    call_rcu1(&desc->rcu, code_gen_buffer_vfree);
> +
> +    return alloc_code_gen_buffer();
> +}
> +
> +#else /* UNIX, dynamically-allocated code buffer */
> +
> +struct code_gen_desc {
> +    struct rcu_head rcu;
> +    void *buf;
> +    size_t size;
> +};
> +
> +static void code_gen_buffer_unmap(struct rcu_head *rcu)
> +{
> +    struct code_gen_desc *desc = container_of(rcu, struct code_gen_desc, rcu);
> +
> +    munmap(desc->buf, desc->size + qemu_real_host_page_size);
> +    g_free(desc);
> +}
> +
> +static void *code_gen_buffer_replace(void)
> +{
> +    struct code_gen_desc *desc;
> +
> +    desc = g_malloc0(sizeof(*desc));
> +    desc->buf = tcg_ctx.code_gen_buffer;
> +    desc->size = tcg_ctx.code_gen_buffer_size;
> +    call_rcu1(&desc->rcu, code_gen_buffer_unmap);
> +
> +    return alloc_code_gen_buffer();
> +}
> +#endif /* USE_STATIC_CODE_GEN_BUFFER */
> +
>  /* flush all the translation blocks */
> -/* XXX: tb_flush is currently not thread safe */
>  void tb_flush(CPUState *cpu)
>  {
>  #if defined(DEBUG_FLUSH)
> @@ -853,10 +958,17 @@ void tb_flush(CPUState *cpu)
>      qht_reset_size(&tcg_ctx.tb_ctx.htable, CODE_GEN_HTABLE_SIZE);
>      page_flush_tb();
>
> +    tcg_ctx.code_gen_buffer = code_gen_buffer_replace();
>      tcg_ctx.code_gen_ptr = tcg_ctx.code_gen_buffer;
> +    tcg_prologue_init(&tcg_ctx);
>      /* XXX: flush processor icache at this point if cache flush is expensive */
>      tcg_ctx.tb_ctx.tb_flush_count++;
> +
> +    /* exit all CPUs so that the old buffer is quickly cleared. */
> +    CPU_FOREACH(cpu) {
> +        cpu_exit(cpu);
> +    }
>  }
>
>  #ifdef DEBUG_TB_CHECK
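The tcg_prologue_init() call is the subtle bit: the prologue/epilogue
live at the start of the code buffer, so they have to be regenerated in
the replacement buffer before any new TB can run. The UNIX path itself
then reduces to "map a fresh buffer plus guard page, publish it, defer
the munmap". A standalone sketch of that shape (sizes arbitrary, the
unmap done inline rather than via call_rcu1 since a toy has no
concurrent readers, and assuming the host allows RWX anonymous
mappings):

    #include <stdio.h>
    #include <sys/mman.h>
    #include <unistd.h>

    #define CODE_SIZE (1u << 20)   /* 1 MiB of code, plus one guard page */

    static size_t page_sz;

    static void *buf_alloc(void)
    {
        void *p = mmap(NULL, CODE_SIZE + page_sz,
                       PROT_READ | PROT_WRITE | PROT_EXEC,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED) {
            return NULL;
        }
        /* Guard page at the end, as alloc_code_gen_buffer() reserves. */
        mprotect((char *)p + CODE_SIZE, page_sz, PROT_NONE);
        return p;
    }

    int main(void)
    {
        void *retired, *live;

        page_sz = (size_t)sysconf(_SC_PAGESIZE);

        retired = buf_alloc();          /* the buffer being flushed */
        live = buf_alloc();             /* code_gen_buffer_replace() */

        /* The patch defers this munmap with call_rcu1() so that vCPUs
         * still executing from the old mapping remain safe; with no
         * concurrent readers we can do it inline. */
        munmap(retired, CODE_SIZE + page_sz);

        printf("retired %p, now generating into %p\n", retired, live);
        munmap(live, CODE_SIZE + page_sz);
        return 0;
    }

And the cpu_exit() loop at the end only shortens the grace period, as
the comment says; correctness comes from the RCU deferral itself.

--
Alex Bennée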