Context:
  https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg04658.html
  https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg06942.html
This seems to half-work[*], although I'm uneasy about the whole idea.
I see two major hurdles:

* If the TB size is too small, this breaks badly, because we're not out
  of the RCU read critical section when another call to tb_flush is made.
  For instance, when booting ARM with this patch applied, I need to pass
  at least -tb-size 10 for it to fully boot debian jessie.

* We have different tb_flush callers that should be audited:

  $ git grep '\stb_flush(' | grep -v '//' | grep -v 'CPUState'
  exec.c:        tb_flush(cpu);
  gdbstub.c:        tb_flush(cpu);
  gdbstub.c:        tb_flush(cpu);
  hw/ppc/spapr_hcall.c:        tb_flush(CPU(cpu));
  target-alpha/sys_helper.c:    tb_flush(CPU(alpha_env_get_cpu(env)));
  target-i386/translate.c:    tb_flush(CPU(x86_env_get_cpu(env)));
  translate-all.c:    tb_flush(cpu);

  With two code_gen "halves", if two tb_flush calls are done in the same
  RCU read critical section, we're screwed. I added a cpu_exit at the end
  of tb_flush to try to mitigate this, but I haven't audited all the
  callers (for instance, what does the gdbstub do?).

If we end up having a mechanism to "stop all CPUs to do something", as I
think we'll end up needing for correct LL/SC emulation, we'll probably be
better off using that mechanism for tb_flush as well -- plus, we'll avoid
wasting memory.

Other issues:

- This could be split into at least two patches: one that touches tcg/
  and another that deals with translate-all. Note that in translate-all,
  the initial allocation of code_gen doesn't allocate extra space for the
  guard page; reserving guard page space is done instead by the added
  split_code_gen_buffer function.

- Windows: not even compile-tested.

[*] "Seems to half-work": at least it boots ARM OK with MTTCG and -smp 4.
Alex's tests, however, sometimes fail with:

Unhandled exception 3 (pabt)
Exception frame registers:
pc : [<fffffb44>]    lr : [<00000001>]    psr: 20000173
sp : 4004f528  ip : 40012048  fp : 40032ca8
r10: 40032ca8  r9 : 00000000  r8 : 00000000
r7 : 0000000e  r6 : 40030000  r5 : 40032ca8  r4 : 00001ac6
r3 : 40012030  r2 : 40012030  r1 : d5ffffe7  r0 : 00000028
Flags: nzCv  IRQs on  FIQs off  Mode SVC_32
Control: 00c5107d  Table: 40060000  DAC: 00000000
IFAR: fffffb44 IFSR: 00000205

or with:

CPU0: 16986 irqs (0 races, 11 slow, 1322 ticks avg latency)
FAIL: smc: irq: 17295 IRQs sent, 16986 received
Unhandled exception 6 (irq)
Exception frame registers:
pc : [<00000020>]    lr : [<40010800>]    psr: 60000153
sp : 400b45c0  ip : 400b34e8  fp : 40032ca8
r10: 00000000  r9 : 00000000  r8 : 00000000
r7 : 00000000  r6 : 00000000  r5 : 00000000  r4 : 00000000
r3 : 00000000  r2 : 00000000  r1 : 000000ff  r0 : 00000000
Flags: nZCv  IRQs on  FIQs off  Mode SVC_32
Control: 00c5107d  Table: 40060000  DAC: 00000000

I built with --enable-tcg-debug.
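For reviewers who want to poke at the buffer-flipping logic in isolation,
here's a stand-alone toy model of the flip. This is not QEMU code:
fake_call_rcu() and end_grace_period() are stand-ins I made up for
call_rcu1() and the end of a grace period; the mask handling mirrors
replace_code_gen_buffer()/clear_code_gen_buffer() in the patch below.

#include <assert.h>
#include <stdio.h>

/* bit 1 = half 1 in use, bit 2 = half 2 in use */
static int buf_mask = 1;        /* execution starts from half 1 */

typedef void (*rcu_cb)(int clear_bit);
static rcu_cb pending_cb;
static int pending_bit;

/* stand-in for call_rcu1(): remember the callback so that the "grace
   period" can be ended by hand below */
static void fake_call_rcu(rcu_cb cb, int clear_bit)
{
    pending_cb = cb;
    pending_bit = clear_bit;
}

/* all readers have left their RCU read critical sections */
static void end_grace_period(void)
{
    if (pending_cb) {
        pending_cb(pending_bit);
        pending_cb = NULL;
    }
}

/* mirrors clear_code_gen_buffer(): the old half becomes reusable */
static void clear_half(int clear_bit)
{
    buf_mask &= ~clear_bit;
}

/* mirrors replace_code_gen_buffer(): flip to the other half */
static void flush(void)
{
    /* both bits set would mean a second flush before the previous
       grace period ended -- the "we're screwed" case above */
    assert(buf_mask == 1 || buf_mask == 2);
    fake_call_rcu(clear_half, buf_mask);
    buf_mask |= (buf_mask == 1 ? 2 : 1);
}

int main(void)
{
    flush();              /* mask: 1 -> 3; half 1 awaits reclaim */
    end_grace_period();   /* mask: 3 -> 2 */
    flush();              /* mask: 2 -> 3; half 2 awaits reclaim */
    end_grace_period();   /* mask: 3 -> 1 */
    flush();              /* fine again... */
    /* flush();              ...but flushing again here would abort */
    printf("final mask: %d\n", buf_mask);
    return 0;
}

Uncommenting the back-to-back flush() trips the same assert as in
replace_code_gen_buffer(), which is exactly the undersized-buffer
failure mode described above.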
Signed-off-by: Emilio G. Cota <c...@braap.org>
---
 tcg/tcg.c       |  22 +++++---
 tcg/tcg.h       |   1 +
 translate-all.c | 155 ++++++++++++++++++++++++++++++++++++++++++++++----------
 3 files changed, 144 insertions(+), 34 deletions(-)

diff --git a/tcg/tcg.c b/tcg/tcg.c
index b46bf1a..7db8ce9 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -380,6 +380,20 @@ void tcg_context_init(TCGContext *s)
     }
 }
 
+void tcg_code_gen_init(TCGContext *s, void *buf, size_t prologue_size)
+{
+    size_t size = s->code_gen_buffer_size - prologue_size;
+
+    s->code_gen_ptr = buf;
+    s->code_gen_buffer = buf;
+    s->code_buf = buf;
+
+    /* Compute a high-water mark, at which we voluntarily flush the buffer
+       and start over.  The size here is arbitrary, significantly larger
+       than we expect the code generation for any one opcode to require.  */
+    s->code_gen_highwater = s->code_buf + (size - 1024);
+}
+
 void tcg_prologue_init(TCGContext *s)
 {
     size_t prologue_size, total_size;
@@ -398,16 +412,10 @@ void tcg_prologue_init(TCGContext *s)
     /* Deduct the prologue from the buffer.  */
     prologue_size = tcg_current_code_size(s);
-    s->code_gen_ptr = buf1;
-    s->code_gen_buffer = buf1;
-    s->code_buf = buf1;
     total_size = s->code_gen_buffer_size - prologue_size;
     s->code_gen_buffer_size = total_size;
 
-    /* Compute a high-water mark, at which we voluntarily flush the buffer
-       and start over.  The size here is arbitrary, significantly larger
-       than we expect the code generation for any one opcode to require.  */
-    s->code_gen_highwater = s->code_gen_buffer + (total_size - 1024);
+    tcg_code_gen_init(s, buf1, prologue_size);
 
     tcg_register_jit(s->code_gen_buffer, total_size);
 
diff --git a/tcg/tcg.h b/tcg/tcg.h
index 40c8fbe..e849d3e 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -634,6 +634,7 @@ static inline void *tcg_malloc(int size)
 
 void tcg_context_init(TCGContext *s);
 void tcg_prologue_init(TCGContext *s);
+void tcg_code_gen_init(TCGContext *s, void *buf, size_t prologue_size);
 void tcg_func_start(TCGContext *s);
 
 int tcg_gen_code(TCGContext *s, TranslationBlock *tb);
 
diff --git a/translate-all.c b/translate-all.c
index bba9b62..7a83176 100644
--- a/translate-all.c
+++ b/translate-all.c
@@ -485,6 +485,11 @@ static inline PageDesc *page_find(tb_page_addr_t index)
     (DEFAULT_CODE_GEN_BUFFER_SIZE_1 < MAX_CODE_GEN_BUFFER_SIZE \
      ? DEFAULT_CODE_GEN_BUFFER_SIZE_1 : MAX_CODE_GEN_BUFFER_SIZE)
 
+static int code_gen_buf_mask;
+static void *code_gen_buf1;
+static void *code_gen_buf2;
+static size_t code_gen_prologue_size;
+
 static inline size_t size_code_gen_buffer(size_t tb_size)
 {
     /* Size the buffer.  */
@@ -508,6 +513,43 @@ static inline size_t size_code_gen_buffer(size_t tb_size)
     return tb_size;
 }
 
+static void *split_code_gen_buffer(void *buf, size_t size)
+{
+    /* enforce alignment of the buffer to a page boundary */
+    if (unlikely((uintptr_t)buf & ~qemu_real_host_page_mask)) {
+        uintptr_t b;
+
+        b = QEMU_ALIGN_UP((uintptr_t)buf, qemu_real_host_page_size);
+        size -= b - (uintptr_t)buf;
+        buf = (void *)b;
+    }
+    /*
+     * Make sure the size is an even number of pages so that both halves will
+     * end on a page boundary.
+     */
+    size = QEMU_ALIGN_DOWN(size, 2 * qemu_real_host_page_size);
+
+    /* split in half, making room for 2 guard pages */
+    size -= 2 * qemu_real_host_page_size;
+    size /= 2;
+    code_gen_buf1 = buf;
+    code_gen_buf2 = buf + size + qemu_real_host_page_size;
+
+    /*
+     * write the prologue into buf2. This is safe because we'll later call
+     * tcg_prologue_init on buf1, from which we'll start execution.
+     */
+    tcg_ctx.code_gen_buffer = code_gen_buf2;
+    tcg_prologue_init(&tcg_ctx);
+    code_gen_prologue_size = (void *)tcg_ctx.code_ptr - code_gen_buf2;
+
+    tcg_ctx.code_gen_buffer_size = size;
+
+    /* start execution from buf1 */
+    code_gen_buf_mask = 1;
+    return code_gen_buf1;
+}
+
 #ifdef __mips__
 /* In order to use J and JAL within the code_gen_buffer, we require
    that the buffer not cross a 256MB boundary.  */
@@ -583,21 +625,17 @@ static inline void map_none(void *addr, long size)
 static inline void *alloc_code_gen_buffer(void)
 {
     void *buf = static_code_gen_buffer;
-    size_t full_size, size;
+    size_t size;
 
     /* The size of the buffer, rounded down to end on a page boundary.  */
-    full_size = (((uintptr_t)buf + sizeof(static_code_gen_buffer))
-                 & qemu_real_host_page_mask) - (uintptr_t)buf;
-
-    /* Reserve a guard page.  */
-    size = full_size - qemu_real_host_page_size;
+    size = (((uintptr_t)buf + sizeof(static_code_gen_buffer))
+            & qemu_real_host_page_mask) - (uintptr_t)buf;
 
     /* Honor a command-line option limiting the size of the buffer.  */
     if (size > tcg_ctx.code_gen_buffer_size) {
         size = (((uintptr_t)buf + tcg_ctx.code_gen_buffer_size)
                 & qemu_real_host_page_mask) - (uintptr_t)buf;
     }
-    tcg_ctx.code_gen_buffer_size = size;
 
 #ifdef __mips__
     if (cross_256mb(buf, size)) {
@@ -607,27 +645,40 @@ static inline void *alloc_code_gen_buffer(void)
 #endif
 
     map_exec(buf, size);
-    map_none(buf + size, qemu_real_host_page_size);
     qemu_madvise(buf, size, QEMU_MADV_HUGEPAGE);
 
+    buf = split_code_gen_buffer(buf, size);
+    size = tcg_ctx.code_gen_buffer_size;
+
+    /* page guards */
+    map_none(code_gen_buf1 + size, qemu_real_host_page_size);
+    map_none(code_gen_buf2 + size, qemu_real_host_page_size);
+
     return buf;
 }
 #elif defined(_WIN32)
 static inline void *alloc_code_gen_buffer(void)
 {
     size_t size = tcg_ctx.code_gen_buffer_size;
-    void *buf1, *buf2;
+    void *buf;
+    void *ret;
 
-    /* Perform the allocation in two steps, so that the guard page
-       is reserved but uncommitted.  */
-    buf1 = VirtualAlloc(NULL, size + qemu_real_host_page_size,
-                        MEM_RESERVE, PAGE_NOACCESS);
-    if (buf1 != NULL) {
-        buf2 = VirtualAlloc(buf1, size, MEM_COMMIT, PAGE_EXECUTE_READWRITE);
-        assert(buf1 == buf2);
+    /* Perform the allocation in two steps, so that the guard pages
+       are reserved but uncommitted.  */
+    ret = VirtualAlloc(NULL, size, MEM_RESERVE, PAGE_NOACCESS);
+    if (ret == NULL) {
+        return NULL;
     }
-    return buf1;
+
+    ret = split_code_gen_buffer(ret, size);
+    size = tcg_ctx.code_gen_buffer_size;
+
+    buf = VirtualAlloc(code_gen_buf1, size, MEM_COMMIT, PAGE_EXECUTE_READWRITE);
+    assert(buf);
+    buf = VirtualAlloc(code_gen_buf2, size, MEM_COMMIT, PAGE_EXECUTE_READWRITE);
+    assert(buf);
+
+    return ret;
 }
 #else
 static inline void *alloc_code_gen_buffer(void)
@@ -665,8 +716,7 @@ static inline void *alloc_code_gen_buffer(void)
 # endif
 # endif
 
-    buf = mmap((void *)start, size + qemu_real_host_page_size,
-               PROT_NONE, flags, -1, 0);
+    buf = mmap((void *)start, size, PROT_NONE, flags, -1, 0);
     if (buf == MAP_FAILED) {
         return NULL;
     }
 
@@ -676,24 +726,24 @@
         /* Try again, with the original still mapped, to avoid
            re-acquiring that 256mb crossing.  This time don't specify
           an address.  */
        size_t size2;
-        void *buf2 = mmap(NULL, size + qemu_real_host_page_size,
-                          PROT_NONE, flags, -1, 0);
+        void *buf2 = mmap(NULL, size, PROT_NONE, flags, -1, 0);
+
        switch (buf2 != MAP_FAILED) {
        case 1:
            if (!cross_256mb(buf2, size)) {
                /* Success!  Use the new buffer.  */
-                munmap(buf, size + qemu_real_host_page_size);
+                munmap(buf, size);
                break;
            }
            /* Failure.  Work with what we had.  */
-            munmap(buf2, size + qemu_real_host_page_size);
+            munmap(buf2, size);
            /* fallthru */
        default:
            /* Split the original buffer.  Free the smaller half.  */
            buf2 = split_cross_256mb(buf, size);
            size2 = tcg_ctx.code_gen_buffer_size;
            if (buf == buf2) {
-                munmap(buf + size2 + qemu_real_host_page_size, size - size2);
+                munmap(buf + size2, size - size2);
            } else {
                munmap(buf, size - size2);
            }
@@ -704,13 +754,19 @@
     }
 #endif
 
-    /* Make the final buffer accessible.  The guard page at the end
-       will remain inaccessible with PROT_NONE.  */
+    /* Make the final buffer accessible.  */
     mprotect(buf, size, PROT_WRITE | PROT_READ | PROT_EXEC);
 
     /* Request large pages for the buffer.  */
     qemu_madvise(buf, size, QEMU_MADV_HUGEPAGE);
 
+    buf = split_code_gen_buffer(buf + 1, size);
+    size = tcg_ctx.code_gen_buffer_size;
+
+    /* page guards */
+    mprotect(code_gen_buf1 + size, qemu_real_host_page_size, PROT_NONE);
+    mprotect(code_gen_buf2 + size, qemu_real_host_page_size, PROT_NONE);
+
     return buf;
 }
 #endif /* USE_STATIC_CODE_GEN_BUFFER, WIN32, POSIX */
@@ -829,10 +885,48 @@ static void page_flush_tb(void)
     }
 }
 
+struct code_gen_desc {
+    struct rcu_head rcu;
+    int clear_bit;
+};
+
+static void clear_code_gen_buffer(struct rcu_head *rcu)
+{
+    struct code_gen_desc *desc = container_of(rcu, struct code_gen_desc, rcu);
+
+    tb_lock();
+    code_gen_buf_mask &= ~desc->clear_bit;
+    tb_unlock();
+    g_free(desc);
+}
+
+static void *replace_code_gen_buffer(void)
+{
+    struct code_gen_desc *desc = g_malloc0(sizeof(*desc));
+
+    /*
+     * If both bits are set, we're having two concurrent flushes. This
+     * can easily happen if the buffers are heavily undersized.
+     */
+    assert(code_gen_buf_mask == 1 || code_gen_buf_mask == 2);
+
+    desc->clear_bit = code_gen_buf_mask;
+    call_rcu1(&desc->rcu, clear_code_gen_buffer);
+
+    if (code_gen_buf_mask == 1) {
+        code_gen_buf_mask |= 2;
+        return code_gen_buf2 + code_gen_prologue_size;
+    }
+    code_gen_buf_mask |= 1;
+    return code_gen_buf1 + code_gen_prologue_size;
+}
+
 /* flush all the translation blocks */
 /* XXX: tb_flush is currently not thread safe */
 void tb_flush(CPUState *cpu)
 {
+    void *buf;
+
 #if defined(DEBUG_FLUSH)
     printf("qemu: flush code_size=%ld nb_tbs=%d avg_tb_size=%ld\n",
            (unsigned long)(tcg_ctx.code_gen_ptr - tcg_ctx.code_gen_buffer),
@@ -853,10 +947,17 @@ void tb_flush(CPUState *cpu)
     qht_reset_size(&tcg_ctx.tb_ctx.htable, CODE_GEN_HTABLE_SIZE);
     page_flush_tb();
 
-    tcg_ctx.code_gen_ptr = tcg_ctx.code_gen_buffer;
+    buf = replace_code_gen_buffer();
+    tcg_code_gen_init(&tcg_ctx, buf, code_gen_prologue_size);
+
     /* XXX: flush processor icache at this point if cache flush is
        expensive */
     tcg_ctx.tb_ctx.tb_flush_count++;
+
+    /* exit all CPUs so that the old buffer is quickly cleared */
+    CPU_FOREACH(cpu) {
+        cpu_exit(cpu);
+    }
 }
 
 #ifdef DEBUG_TB_CHECK
--
2.5.0
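P.S. For completeness, here is the reader side that the final cpu_exit()
loop banks on. This is only a sketch of the cpu-exec loop, not the actual
code (tb_find_fast()/cpu_tb_exec() roughly as on current master):

/* vCPU thread: translated code runs inside an RCU read critical
 * section, so the half it is executing from cannot be recycled under
 * its feet. cpu_exit() makes each vCPU leave this loop soon, ending
 * its read section so that the call_rcu1() callback can run and mark
 * the old half reusable again. */
rcu_read_lock();
while (!cpu->exit_request) {
    TranslationBlock *tb = tb_find_fast(cpu); /* may live in the old half */
    cpu_tb_exec(cpu, tb->tc_ptr);
}
rcu_read_unlock(); /* reader gone; the grace period can now elapse */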