From: KONRAD Frederic <fred.kon...@greensocs.com>

This protects TBContext with tb_lock to make tb_* thread safe.

We can still have an issue with tb_flush in the multithreaded TCG case:
another CPU can be executing code during a flush. This can be fixed later by
making all other TCG threads exit before calling tb_flush().

tb_find_slow is split into tb_find_slow and tb_find_physical, as the whole of
tb_find_slow doesn't need to hold the tb lock.

Signed-off-by: KONRAD Frederic <fred.kon...@greensocs.com>

Changes:
V6 -> V7:
  * Drop a tb_lock in already locked restore_state_to_opc.
V5 -> V6:
  * Drop a tb_lock around tb_find_fast in cpu-exec.c.
---
 cpu-exec.c              |  58 +++++++++++++-------
 include/exec/exec-all.h |   1 +
 target-arm/translate.c  |   3 ++
 tcg/tcg.h               |  14 ++++-
 translate-all.c         | 137 +++++++++++++++++++++++++++++++++++++-----------
 5 files changed, 162 insertions(+), 51 deletions(-)
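As a review aid (not part of the change itself): the patch boils down to two
ideas, a per-thread recursive lock built from a mutex plus a thread-local
nesting counter, and a lookup/lock/retry step in tb_find_slow() so that two
vCPUs which miss the hash table at the same time don't both generate the same
TB. The sketch below is illustrative only; it borrows the patch's function
names but is otherwise made up: plain pthreads stand in for QemuMutex and a
one-entry cache stands in for the real physical hash table.

  /* tb_lock_sketch.c -- illustrative sketch, not QEMU code */
  #include <assert.h>
  #include <pthread.h>
  #include <stdio.h>
  #include <stdlib.h>

  static pthread_mutex_t tb_mutex = PTHREAD_MUTEX_INITIALIZER;
  static __thread int have_tb_lock;          /* per-thread nesting count */

  static void tb_lock(void)
  {
      if (!have_tb_lock) {                   /* take the mutex only once */
          pthread_mutex_lock(&tb_mutex);
      }
      have_tb_lock++;
  }

  static void tb_unlock(void)
  {
      assert(have_tb_lock > 0);
      if (--have_tb_lock == 0) {
          pthread_mutex_unlock(&tb_mutex);
      }
  }

  /* Drop the lock whatever the nesting level, e.g. before a siglongjmp(). */
  static void tb_lock_reset(void)
  {
      if (have_tb_lock) {
          pthread_mutex_unlock(&tb_mutex);
          have_tb_lock = 0;
      }
  }

  /* A one-entry "translation cache" is enough to show the retry pattern. */
  typedef struct TB { unsigned long pc; } TB;
  static TB *cached_tb;                      /* written only with the lock held */

  static TB *tb_find_physical(unsigned long pc)  /* lock-free lookup */
  {
      TB *tb = cached_tb;
      return (tb && tb->pc == pc) ? tb : NULL;
  }

  static TB *tb_gen_code(unsigned long pc)       /* called with the lock held */
  {
      TB *tb = malloc(sizeof(*tb));
      tb->pc = pc;
      cached_tb = tb;
      return tb;
  }

  static TB *tb_find_slow(unsigned long pc)
  {
      TB *tb = tb_find_physical(pc);         /* fast path, no lock */
      if (!tb) {
          tb_lock();
          /* another thread may have translated pc while we waited */
          tb = tb_find_physical(pc);
          if (!tb) {
              tb = tb_gen_code(pc);
          }
          tb_unlock();
      }
      return tb;
  }

  int main(void)
  {
      TB *tb = tb_find_slow(0x1000);
      printf("tb for pc=0x%lx at %p\n", tb->pc, (void *)tb);
      tb_lock_reset();                       /* no-op here, shown for completeness */
      return 0;
  }

The re-check under the lock is what avoids duplicate TBs when two vCPUs miss
simultaneously, and tb_lock_reset() mirrors what the cpu_loop_exit() /
cpu_resume_from_signal() paths need: a longjmp can happen at any nesting
depth, so the lock is dropped unconditionally there.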
diff --git a/cpu-exec.c b/cpu-exec.c
index f3358a9..a012e9d 100644
--- a/cpu-exec.c
+++ b/cpu-exec.c
@@ -131,6 +131,8 @@ static void init_delay_params(SyncClocks *sc, const CPUState *cpu)
 void cpu_loop_exit(CPUState *cpu)
 {
     cpu->current_tb = NULL;
+    /* Release the mutex before the long jump so other threads can work. */
+    tb_lock_reset();
     siglongjmp(cpu->jmp_env, 1);
 }
 
@@ -143,6 +145,8 @@ void cpu_resume_from_signal(CPUState *cpu, void *puc)
     /* XXX: restore cpu registers saved in host registers */
 
     cpu->exception_index = -1;
+    /* Release the mutex before the long jump so other threads can work. */
+    tb_lock_reset();
     siglongjmp(cpu->jmp_env, 1);
 }
 
@@ -253,10 +257,8 @@ static void cpu_exec_nocache(CPUState *cpu, int max_cycles,
     tb_free(tb);
 }
 
-static TranslationBlock *tb_find_slow(CPUState *cpu,
-                                      target_ulong pc,
-                                      target_ulong cs_base,
-                                      uint64_t flags)
+static TranslationBlock *tb_find_physical(CPUState *cpu, target_ulong pc,
+                                          target_ulong cs_base, uint64_t flags)
 {
     CPUArchState *env = (CPUArchState *)cpu->env_ptr;
     TranslationBlock *tb, **ptb1;
@@ -273,8 +275,9 @@ static TranslationBlock *tb_find_slow(CPUState *cpu,
     ptb1 = &tcg_ctx.tb_ctx.tb_phys_hash[h];
     for(;;) {
         tb = *ptb1;
-        if (!tb)
-            goto not_found;
+        if (!tb) {
+            return tb;
+        }
         if (tb->pc == pc &&
             tb->page_addr[0] == phys_page1 &&
             tb->cs_base == cs_base &&
@@ -282,28 +285,42 @@ static TranslationBlock *tb_find_slow(CPUState *cpu,
             /* check next page if needed */
             if (tb->page_addr[1] != -1) {
                 tb_page_addr_t phys_page2;
-
                 virt_page2 = (pc & TARGET_PAGE_MASK) + TARGET_PAGE_SIZE;
                 phys_page2 = get_page_addr_code(env, virt_page2);
-                if (tb->page_addr[1] == phys_page2)
-                    goto found;
+                if (tb->page_addr[1] == phys_page2) {
+                    return tb;
+                }
             } else {
-                goto found;
+                return tb;
             }
         }
         ptb1 = &tb->phys_hash_next;
     }
- not_found:
-    /* if no translated code available, then translate it now */
-    tb = tb_gen_code(cpu, pc, cs_base, flags, 0);
-
- found:
-    /* Move the last found TB to the head of the list */
-    if (likely(*ptb1)) {
-        *ptb1 = tb->phys_hash_next;
-        tb->phys_hash_next = tcg_ctx.tb_ctx.tb_phys_hash[h];
-        tcg_ctx.tb_ctx.tb_phys_hash[h] = tb;
+    return tb;
+}
+
+static TranslationBlock *tb_find_slow(CPUState *cpu, target_ulong pc,
+                                      target_ulong cs_base, uint64_t flags)
+{
+    /*
+     * First try to get the tb; if we don't find it we need to take the lock
+     * and compile it.
+     */
+    TranslationBlock *tb;
+
+    tb = tb_find_physical(cpu, pc, cs_base, flags);
+    if (!tb) {
+        tb_lock();
+        /*
+         * Retry to get the TB in case another CPU just translated it, to
+         * avoid having duplicate TBs in the pool.
+         */
+        tb = tb_find_physical(cpu, pc, cs_base, flags);
+        if (!tb) {
+            tb = tb_gen_code(cpu, pc, cs_base, flags, 0);
+        }
+        tb_unlock();
     }
     /* we add the TB in the virtual pc hash table */
     cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)] = tb;
@@ -326,6 +343,7 @@ static inline TranslationBlock *tb_find_fast(CPUState *cpu)
                  tb->flags != flags)) {
         tb = tb_find_slow(cpu, pc, cs_base, flags);
     }
+
     return tb;
 }
 
diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
index 55a6ff2..9f1c1cb 100644
--- a/include/exec/exec-all.h
+++ b/include/exec/exec-all.h
@@ -74,6 +74,7 @@ typedef struct TranslationBlock TranslationBlock;
 
 void gen_intermediate_code(CPUArchState *env, struct TranslationBlock *tb);
 void gen_intermediate_code_pc(CPUArchState *env, struct TranslationBlock *tb);
+/* Called with tb_lock held. */
 void restore_state_to_opc(CPUArchState *env, struct TranslationBlock *tb,
                           int pc_pos);
 
diff --git a/target-arm/translate.c b/target-arm/translate.c
index 69ac18c..960c75e 100644
--- a/target-arm/translate.c
+++ b/target-arm/translate.c
@@ -11166,6 +11166,8 @@ static inline void gen_intermediate_code_internal(ARMCPU *cpu,
 
     dc->tb = tb;
 
+    tb_lock();
+
     dc->is_jmp = DISAS_NEXT;
     dc->pc = pc_start;
     dc->singlestep_enabled = cs->singlestep_enabled;
@@ -11506,6 +11508,7 @@ done_generating:
         tb->size = dc->pc - pc_start;
         tb->icount = num_insns;
     }
+    tb_unlock();
 }
 
 void gen_intermediate_code(CPUARMState *env, TranslationBlock *tb)
diff --git a/tcg/tcg.h b/tcg/tcg.h
index 231a781..1932323 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -590,21 +590,33 @@ static inline bool tcg_op_buf_full(void)
 
 /* pool based memory allocation */
+/* tb_lock must be held for tcg_malloc_internal. */
 void *tcg_malloc_internal(TCGContext *s, int size);
+
 void tcg_pool_reset(TCGContext *s);
 void tcg_pool_delete(TCGContext *s);
 
+void tb_lock(void);
+void tb_unlock(void);
+void tb_lock_reset(void);
+
 static inline void *tcg_malloc(int size)
 {
     TCGContext *s = &tcg_ctx;
     uint8_t *ptr, *ptr_end;
+    void *ret;
+
+    tb_lock();
     size = (size + sizeof(long) - 1) & ~(sizeof(long) - 1);
     ptr = s->pool_cur;
     ptr_end = ptr + size;
     if (unlikely(ptr_end > s->pool_end)) {
-        return tcg_malloc_internal(&tcg_ctx, size);
+        ret = tcg_malloc_internal(&tcg_ctx, size);
+        tb_unlock();
+        return ret;
     } else {
         s->pool_cur = ptr_end;
+        tb_unlock();
         return ptr;
     }
 }
diff --git a/translate-all.c b/translate-all.c
index 60a3d8b..046565c 100644
--- a/translate-all.c
+++ b/translate-all.c
@@ -129,6 +129,34 @@ static void *l1_map[V_L1_SIZE];
 /* code generation context */
 TCGContext tcg_ctx;
 
+/* translation block context */
+__thread volatile int have_tb_lock;
+
+void tb_lock(void)
+{
+    if (!have_tb_lock) {
+        qemu_mutex_lock(&tcg_ctx.tb_ctx.tb_lock);
+    }
+    have_tb_lock++;
+}
+
+void tb_unlock(void)
+{
+    assert(have_tb_lock > 0);
+    have_tb_lock--;
+    if (!have_tb_lock) {
+        qemu_mutex_unlock(&tcg_ctx.tb_ctx.tb_lock);
+    }
+}
+
+void tb_lock_reset(void)
+{
+    if (have_tb_lock) {
+        qemu_mutex_unlock(&tcg_ctx.tb_ctx.tb_lock);
+    }
+    have_tb_lock = 0;
+}
+
 static void tb_link_page(TranslationBlock *tb, tb_page_addr_t phys_pc,
                          tb_page_addr_t phys_page2);
 static TranslationBlock *tb_find_pc(uintptr_t tc_ptr);
@@ -217,6 +245,7 @@ static int cpu_restore_state_from_tb(CPUState *cpu, TranslationBlock *tb,
 #ifdef CONFIG_PROFILER
     ti = profile_getclock();
 #endif
+    tb_lock();
     tcg_func_start(s);
     gen_intermediate_code_pc(env, tb);
 
@@ -230,8 +259,10 @@ static int cpu_restore_state_from_tb(CPUState *cpu, TranslationBlock *tb,
 
     /* find opc index corresponding to search_pc */
     tc_ptr = (uintptr_t)tb->tc_ptr;
-    if (searched_pc < tc_ptr)
+    if (searched_pc < tc_ptr) {
+        tb_unlock();
         return -1;
+    }
 
     s->tb_next_offset = tb->tb_next_offset;
 #ifdef USE_DIRECT_JUMP
@@ -243,8 +274,10 @@ static int cpu_restore_state_from_tb(CPUState *cpu, TranslationBlock *tb,
 #endif
     j = tcg_gen_code_search_pc(s, (tcg_insn_unit *)tc_ptr,
                                searched_pc - tc_ptr);
-    if (j < 0)
+    if (j < 0) {
+        tb_unlock();
         return -1;
+    }
     /* now find start of instruction before */
     while (s->gen_opc_instr_start[j] == 0) {
         j--;
@@ -257,6 +290,8 @@ static int cpu_restore_state_from_tb(CPUState *cpu, TranslationBlock *tb,
     s->restore_time += profile_getclock() - ti;
     s->restore_count++;
 #endif
+
+    tb_unlock();
     return 0;
 }
 
@@ -675,6 +710,7 @@ static inline void code_gen_alloc(size_t tb_size)
             CODE_GEN_AVG_BLOCK_SIZE;
     tcg_ctx.tb_ctx.tbs =
             g_malloc(tcg_ctx.code_gen_max_blocks * sizeof(TranslationBlock));
+    qemu_mutex_init(&tcg_ctx.tb_ctx.tb_lock);
 }
 
 /* Must be called before using the QEMU cpus. 'tb_size' is the size
@@ -699,16 +735,22 @@ bool tcg_enabled(void)
     return tcg_ctx.code_gen_buffer != NULL;
 }
 
-/* Allocate a new translation block. Flush the translation buffer if
-   too many translation blocks or too much generated code. */
+/*
+ * Allocate a new translation block. Flush the translation buffer if
+ * too many translation blocks or too much generated code.
+ * tb_alloc is not thread safe but tb_gen_code is protected by a mutex so this
+ * function is called only by one thread.
+ */
 static TranslationBlock *tb_alloc(target_ulong pc)
 {
-    TranslationBlock *tb;
+    TranslationBlock *tb = NULL;
 
     if (tcg_ctx.tb_ctx.nb_tbs >= tcg_ctx.code_gen_max_blocks ||
         (tcg_ctx.code_gen_ptr - tcg_ctx.code_gen_buffer) >=
          tcg_ctx.code_gen_buffer_max_size) {
-        return NULL;
+        tb = &tcg_ctx.tb_ctx.tbs[tcg_ctx.tb_ctx.nb_tbs++];
+        tb->pc = pc;
+        tb->cflags = 0;
     }
     tb = &tcg_ctx.tb_ctx.tbs[tcg_ctx.tb_ctx.nb_tbs++];
     tb->pc = pc;
@@ -721,11 +763,16 @@ void tb_free(TranslationBlock *tb)
 {
     /* In practice this is mostly used for single use temporary TB
        Ignore the hard cases and just back up if this TB happens to
        be the last one generated.  */
+
+    tb_lock();
+
     if (tcg_ctx.tb_ctx.nb_tbs > 0 &&
             tb == &tcg_ctx.tb_ctx.tbs[tcg_ctx.tb_ctx.nb_tbs - 1]) {
         tcg_ctx.code_gen_ptr = tb->tc_ptr;
         tcg_ctx.tb_ctx.nb_tbs--;
     }
+
+    tb_unlock();
 }
 
 static inline void invalidate_page_bitmap(PageDesc *p)
@@ -774,6 +821,8 @@ static void page_flush_tb(void)
 /* XXX: tb_flush is currently not thread safe */
 void tb_flush(CPUState *cpu)
 {
+    tb_lock();
+
 #if defined(DEBUG_FLUSH)
     printf("qemu: flush code_size=%ld nb_tbs=%d avg_tb_size=%ld\n",
            (unsigned long)(tcg_ctx.code_gen_ptr - tcg_ctx.code_gen_buffer),
@@ -798,6 +847,8 @@ void tb_flush(CPUState *cpu)
     /* XXX: flush processor icache at this point if cache flush is
        expensive */
     tcg_ctx.tb_ctx.tb_flush_count++;
+
+    tb_unlock();
 }
 
 #ifdef DEBUG_TB_CHECK
@@ -807,6 +858,8 @@ static void tb_invalidate_check(target_ulong address)
     TranslationBlock *tb;
     int i;
 
+    tb_lock();
+
     address &= TARGET_PAGE_MASK;
     for (i = 0; i < CODE_GEN_PHYS_HASH_SIZE; i++) {
         for (tb = tb_ctx.tb_phys_hash[i]; tb != NULL; tb = tb->phys_hash_next) {
@@ -818,6 +871,8 @@ static void tb_invalidate_check(target_ulong address)
             }
         }
     }
+
+    tb_unlock();
 }
 
 /* verify that all the pages have correct rights for code */
@@ -826,6 +881,8 @@ static void tb_page_check(void)
     TranslationBlock *tb;
     int i, flags1, flags2;
 
+    tb_lock();
+
     for (i = 0; i < CODE_GEN_PHYS_HASH_SIZE; i++) {
         for (tb = tcg_ctx.tb_ctx.tb_phys_hash[i]; tb != NULL;
                 tb = tb->phys_hash_next) {
@@ -837,6 +894,8 @@ static void tb_page_check(void)
             }
         }
     }
+
+    tb_unlock();
 }
 
 #endif
 
@@ -917,6 +976,8 @@ void tb_phys_invalidate(TranslationBlock *tb, tb_page_addr_t page_addr)
     tb_page_addr_t phys_pc;
     TranslationBlock *tb1, *tb2;
 
+    tb_lock();
+
     /* remove the TB from the hash list */
     phys_pc = tb->page_addr[0] + (tb->pc & ~TARGET_PAGE_MASK);
     h = tb_phys_hash_func(phys_pc);
@@ -964,6 +1025,7 @@ void tb_phys_invalidate(TranslationBlock *tb, tb_page_addr_t page_addr)
     tb->jmp_first = (TranslationBlock *)((uintptr_t)tb | 2); /* fail safe */
 
     tcg_ctx.tb_ctx.tb_phys_invalidate_count++;
+    tb_unlock();
 }
 
 static void build_page_bitmap(PageDesc *p)
@@ -1005,6 +1067,8 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
     target_ulong virt_page2;
     int code_gen_size;
 
+    tb_lock();
+
     phys_pc = get_page_addr_code(env, pc);
     if (use_icount) {
         cflags |= CF_USE_ICOUNT;
@@ -1033,6 +1097,8 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
         phys_page2 = get_page_addr_code(env, virt_page2);
     }
     tb_link_page(tb, phys_pc, phys_page2);
+
+    tb_unlock();
     return tb;
 }
 
@@ -1331,13 +1397,15 @@ static inline void tb_alloc_page(TranslationBlock *tb,
 }
 
 /* add a new TB and link it to the physical page tables. phys_page2 is
-   (-1) to indicate that only one page contains the TB. */
+ * (-1) to indicate that only one page contains the TB. */
 static void tb_link_page(TranslationBlock *tb, tb_page_addr_t phys_pc,
                          tb_page_addr_t phys_page2)
 {
     unsigned int h;
     TranslationBlock **ptb;
 
+    tb_lock();
+
     /* Grab the mmap lock to stop another thread invalidating this TB
        before we are done.  */
     mmap_lock();
 
@@ -1371,6 +1439,8 @@ static void tb_link_page(TranslationBlock *tb, tb_page_addr_t phys_pc,
     tb_page_check();
 #endif
     mmap_unlock();
+
+    tb_unlock();
 }
 
 /* find the TB 'tb' such that tb[0].tc_ptr <= tc_ptr <
@@ -1379,31 +1449,34 @@ static TranslationBlock *tb_find_pc(uintptr_t tc_ptr)
 {
     int m_min, m_max, m;
     uintptr_t v;
-    TranslationBlock *tb;
-
-    if (tcg_ctx.tb_ctx.nb_tbs <= 0) {
-        return NULL;
-    }
-    if (tc_ptr < (uintptr_t)tcg_ctx.code_gen_buffer ||
-        tc_ptr >= (uintptr_t)tcg_ctx.code_gen_ptr) {
-        return NULL;
-    }
-    /* binary search (cf Knuth) */
-    m_min = 0;
-    m_max = tcg_ctx.tb_ctx.nb_tbs - 1;
-    while (m_min <= m_max) {
-        m = (m_min + m_max) >> 1;
-        tb = &tcg_ctx.tb_ctx.tbs[m];
-        v = (uintptr_t)tb->tc_ptr;
-        if (v == tc_ptr) {
-            return tb;
-        } else if (tc_ptr < v) {
-            m_max = m - 1;
-        } else {
-            m_min = m + 1;
+    TranslationBlock *tb = NULL;
+
+    tb_lock();
+
+    if ((tcg_ctx.tb_ctx.nb_tbs > 0)
+        && (tc_ptr >= (uintptr_t)tcg_ctx.code_gen_buffer &&
+            tc_ptr < (uintptr_t)tcg_ctx.code_gen_ptr)) {
+        /* binary search (cf Knuth) */
+        m_min = 0;
+        m_max = tcg_ctx.tb_ctx.nb_tbs - 1;
+        while (m_min <= m_max) {
+            m = (m_min + m_max) >> 1;
+            tb = &tcg_ctx.tb_ctx.tbs[m];
+            v = (uintptr_t)tb->tc_ptr;
+            if (v == tc_ptr) {
+                tb_unlock();
+                return tb;
+            } else if (tc_ptr < v) {
+                m_max = m - 1;
+            } else {
+                m_min = m + 1;
+            }
         }
+        tb = &tcg_ctx.tb_ctx.tbs[m_max];
     }
-    return &tcg_ctx.tb_ctx.tbs[m_max];
+
+    tb_unlock();
+    return tb;
 }
 
 #if !defined(CONFIG_USER_ONLY)
@@ -1565,6 +1638,8 @@ void dump_exec_info(FILE *f, fprintf_function cpu_fprintf)
     int direct_jmp_count, direct_jmp2_count, cross_page;
     TranslationBlock *tb;
 
+    tb_lock();
+
     target_code_size = 0;
     max_target_code_size = 0;
     cross_page = 0;
@@ -1620,6 +1695,8 @@ void dump_exec_info(FILE *f, fprintf_function cpu_fprintf)
                 tcg_ctx.tb_ctx.tb_phys_invalidate_count);
     cpu_fprintf(f, "TLB flush count %d\n", tlb_flush_count);
     tcg_dump_info(f, cpu_fprintf);
+
+    tb_unlock();
 }
 
 void dump_opcount_info(FILE *f, fprintf_function cpu_fprintf)
-- 
1.9.0