Emilio G. Cota <c...@braap.org> writes:

> This is a first attempt at making tb_flush not have to stop all CPUs.
> There are issues as pointed out below, but this could be a good start.
>
> Context:
> https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg04658.html
> https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg06942.html
>
> Known issues:
> - Basically compile-tested only, since I've only run this with
>   single-threaded TCG; I also tried running it with linux-user,
>   but in order to trigger tb_flush I had to make code_gen_buffer
>   so small that the CPU calling tb_flush would immediately fill
>   the 2nd buffer, triggering the assert. If you have a working
>   multi-threaded workload that would be good to test this, please
>   let me know.
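(One way to provoke flushes without a big workload is to shrink the
translation buffer at build time; a local hack along these lines, the
1 MiB figure picked arbitrarily:

    /* translate-all.c: make the code generation buffer tiny so that
     * tb_flush fires almost immediately.  Local testing hack only,
     * not for merging. */
    #define DEFAULT_CODE_GEN_BUFFER_SIZE  (1 * 1024 * 1024)

which of course also makes the two-flushes-in-one-grace-period assert
much easier to hit, as you found.)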
With my latest mttcg unit tests:

  ./arm-softmmu/qemu-system-arm -machine virt,accel=tcg -cpu cortex-a15 \
    -device virtio-serial-device -device virtconsole,chardev=ctd \
    -chardev testdev,id=ctd -display none -serial stdio \
    -kernel arm/tcg-test.flat -smp 4 -tcg mttcg=on \
    -append "tight smc irq mod=1 rounds=100000" -name arm,debug-threads=on

> - Windows; not even compile-tested!
>
> Signed-off-by: Emilio G. Cota <c...@braap.org>
> ---
>  translate-all.c | 122 +++++++++++++++++++++++++++++++++++++++++++++++++++++---
>  1 file changed, 117 insertions(+), 5 deletions(-)
>
> diff --git a/translate-all.c b/translate-all.c
> index bba9b62..4c14b4d 100644
> --- a/translate-all.c
> +++ b/translate-all.c
> @@ -536,8 +536,13 @@ static inline void *split_cross_256mb(void *buf1, size_t size1)
>  #endif
>
>  #ifdef USE_STATIC_CODE_GEN_BUFFER
> -static uint8_t static_code_gen_buffer[DEFAULT_CODE_GEN_BUFFER_SIZE]
> +static uint8_t static_code_gen_buffer1[DEFAULT_CODE_GEN_BUFFER_SIZE]
>      __attribute__((aligned(CODE_GEN_ALIGN)));
> +static uint8_t static_code_gen_buffer2[DEFAULT_CODE_GEN_BUFFER_SIZE]
> +    __attribute__((aligned(CODE_GEN_ALIGN)));
> +static int static_buf_mask = 1;
> +static void *static_buf1;
> +static void *static_buf2;
>
>  # ifdef _WIN32
>  static inline void do_protect(void *addr, long size, int prot)
> @@ -580,13 +585,12 @@ static inline void map_none(void *addr, long size)
>  }
>  # endif /* WIN32 */
>
> -static inline void *alloc_code_gen_buffer(void)
> +static void *alloc_static_code_gen_buffer(void *buf)
>  {
> -    void *buf = static_code_gen_buffer;
>      size_t full_size, size;
>
>      /* The size of the buffer, rounded down to end on a page boundary. */
> -    full_size = (((uintptr_t)buf + sizeof(static_code_gen_buffer))
> +    full_size = (((uintptr_t)buf + sizeof(static_code_gen_buffer1))
>                   & qemu_real_host_page_mask) - (uintptr_t)buf;
>
>      /* Reserve a guard page. */
> @@ -612,6 +616,15 @@ static inline void *alloc_code_gen_buffer(void)
>
>      return buf;
>  }
> +
> +static inline void *alloc_code_gen_buffer(void)
> +{
> +    static_buf1 = alloc_static_code_gen_buffer(static_code_gen_buffer1);
> +    static_buf2 = alloc_static_code_gen_buffer(static_code_gen_buffer2);
> +
> +    assert(static_buf_mask == 1);
> +    return static_buf1;
> +}
>  #elif defined(_WIN32)
>  static inline void *alloc_code_gen_buffer(void)
>  {
> @@ -829,8 +842,100 @@ static void page_flush_tb(void)
>      }
>  }
>
> +#ifdef USE_STATIC_CODE_GEN_BUFFER
> +
> +struct code_gen_desc {
> +    struct rcu_head rcu;
> +    int clear_bit;
> +};
> +
> +static void code_gen_buffer_clear(struct rcu_head *rcu)
> +{
> +    struct code_gen_desc *desc = container_of(rcu, struct code_gen_desc, rcu);
> +
> +    tb_lock();
> +    static_buf_mask &= ~desc->clear_bit;
> +    tb_unlock();
> +    g_free(desc);
> +}
> +
> +static void *code_gen_buffer_replace(void)
> +{
> +    struct code_gen_desc *desc = g_malloc0(sizeof(*desc));
> +
> +    /*
> +     * If both bits are set, we're having two concurrent flushes. This
> +     * can easily happen if the buffers are heavily undersized.
> +     */
> +    assert(static_buf_mask == 1 || static_buf_mask == 2);
> +
> +    desc->clear_bit = static_buf_mask;
> +    call_rcu1(&desc->rcu, code_gen_buffer_clear);
> +
> +    if (static_buf_mask == 1) {
> +        static_buf_mask |= 2;
> +        return static_buf2;
> +    }
> +    static_buf_mask |= 1;
> +    return static_buf1;
> +}
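The flip-flop here is neat: static_buf_mask tracks which of the two
static buffers are live, and the retired buffer's bit is only cleared
once the RCU grace period expires, so a second flush arriving before
every CPU has left the old code trips the assert above. To check my
reading, a standalone toy of the pattern (names are mine, and the RCU
callback is faked by an explicit grace_period_end() between flushes):

    #include <assert.h>
    #include <stdio.h>

    static char buf1[16], buf2[16];
    static int mask = 1;        /* bit 1 = buf1 live, bit 2 = buf2 live */
    static int pending_clear;   /* bit to drop when the grace period ends */

    /* Stands in for the code_gen_buffer_clear() RCU callback. */
    static void grace_period_end(void)
    {
        mask &= ~pending_clear;
        pending_clear = 0;
    }

    /* Stands in for code_gen_buffer_replace(). */
    static char *buffer_replace(void)
    {
        /* Both bits set would mean a flush raced with an unfinished
         * grace period, i.e. both buffers are undersized. */
        assert(mask == 1 || mask == 2);

        pending_clear = mask;   /* queued via call_rcu1() in the patch */
        if (mask == 1) {
            mask |= 2;
            return buf2;
        }
        mask |= 1;
        return buf1;
    }

    int main(void)
    {
        int i;

        for (i = 0; i < 4; i++) {
            char *cur = buffer_replace();
            printf("flush %d -> %s, mask now %d\n",
                   i, cur == buf1 ? "buf1" : "buf2", mask);
            grace_period_end(); /* pretend all vCPUs left the old buffer */
        }
        return 0;
    }

That alternates buf1/buf2 as expected; drop the grace_period_end() call
and the second flush asserts, which matches the undersized-buffer
failure described in your cover note.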
> +
> +#elif defined(_WIN32)
> +
> +struct code_gen_desc {
> +    struct rcu_head rcu;
> +    void *buf;
> +};
> +
> +static void code_gen_buffer_vfree(struct rcu_head *rcu)
> +{
> +    struct code_gen_desc *desc = container_of(rcu, struct code_gen_desc, rcu);
> +
> +    VirtualFree(desc->buf, 0, MEM_RELEASE);
> +    g_free(desc);
> +}
> +
> +static void *code_gen_buffer_replace(void)
> +{
> +    struct code_gen_desc *desc;
> +
> +    desc = g_malloc0(sizeof(*desc));
> +    desc->buf = tcg_ctx.code_gen_buffer;
> +    call_rcu1(&desc->rcu, code_gen_buffer_vfree);
> +
> +    return alloc_code_gen_buffer();
> +}
> +
> +#else /* UNIX, dynamically-allocated code buffer */
> +
> +struct code_gen_desc {
> +    struct rcu_head rcu;
> +    void *buf;
> +    size_t size;
> +};
> +
> +static void code_gen_buffer_unmap(struct rcu_head *rcu)
> +{
> +    struct code_gen_desc *desc = container_of(rcu, struct code_gen_desc, rcu);
> +
> +    munmap(desc->buf, desc->size + qemu_real_host_page_size);
> +    g_free(desc);
> +}
> +
> +static void *code_gen_buffer_replace(void)
> +{
> +    struct code_gen_desc *desc;
> +
> +    desc = g_malloc0(sizeof(*desc));
> +    desc->buf = tcg_ctx.code_gen_buffer;
> +    desc->size = tcg_ctx.code_gen_buffer_size;
> +    call_rcu1(&desc->rcu, code_gen_buffer_unmap);
> +
> +    return alloc_code_gen_buffer();
> +}
> +#endif /* USE_STATIC_CODE_GEN_BUFFER */
> +
>  /* flush all the translation blocks */
> -/* XXX: tb_flush is currently not thread safe */
>  void tb_flush(CPUState *cpu)
>  {
>  #if defined(DEBUG_FLUSH)
> @@ -853,10 +958,17 @@ void tb_flush(CPUState *cpu)
>      qht_reset_size(&tcg_ctx.tb_ctx.htable, CODE_GEN_HTABLE_SIZE);
>      page_flush_tb();
>
> +    tcg_ctx.code_gen_buffer = code_gen_buffer_replace();
>      tcg_ctx.code_gen_ptr = tcg_ctx.code_gen_buffer;
> +    tcg_prologue_init(&tcg_ctx);
>      /* XXX: flush processor icache at this point if cache flush is expensive */
>      tcg_ctx.tb_ctx.tb_flush_count++;
> +
> +    /* exit all CPUs so that the old buffer is quickly cleared. */
> +    CPU_FOREACH(cpu) {
> +        cpu_exit(cpu);
> +    }
>  }
>
>  #ifdef DEBUG_TB_CHECK
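The tcg_prologue_init() call is the subtle bit: the prologue/epilogue
live at the start of the code buffer, so they have to be regenerated in
the replacement buffer before any new TB can run. The UNIX path itself
then reduces to "map a fresh buffer plus guard page, publish it, defer
the munmap". A standalone sketch of that shape (sizes arbitrary, the
unmap done inline rather than via call_rcu1 since a toy has no
concurrent readers, and assuming the host allows RWX anonymous
mappings):

    #include <stdio.h>
    #include <sys/mman.h>
    #include <unistd.h>

    #define CODE_SIZE (1u << 20)   /* 1 MiB of code, plus one guard page */

    static size_t page_sz;

    static void *buf_alloc(void)
    {
        void *p = mmap(NULL, CODE_SIZE + page_sz,
                       PROT_READ | PROT_WRITE | PROT_EXEC,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED) {
            return NULL;
        }
        /* Guard page at the end, as alloc_code_gen_buffer() reserves. */
        mprotect((char *)p + CODE_SIZE, page_sz, PROT_NONE);
        return p;
    }

    int main(void)
    {
        void *retired, *live;

        page_sz = (size_t)sysconf(_SC_PAGESIZE);

        retired = buf_alloc();          /* the buffer being flushed */
        live = buf_alloc();             /* code_gen_buffer_replace() */

        /* The patch defers this munmap with call_rcu1() so that vCPUs
         * still executing from the old mapping remain safe; with no
         * concurrent readers we can do it inline. */
        munmap(retired, CODE_SIZE + page_sz);

        printf("retired %p, now generating into %p\n", retired, live);
        munmap(live, CODE_SIZE + page_sz);
        return 0;
    }

And the cpu_exit() loop at the end only shortens the grace period, as
the comment says; correctness comes from the RCU deferral itself.

--
Alex Bennée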