fred.kon...@greensocs.com writes:

> From: KONRAD Frederic <fred.kon...@greensocs.com>
>
> This protects TBContext with tb_lock to make tb_* thread safe.
>
> We can still have an issue with tb_flush in the multithreaded TCG case:
> another CPU can be executing code during a flush.
>
> This can be fixed later by making all other TCG threads exit before
> calling tb_flush().
>
> tb_find_slow is separated into tb_find_slow and tb_find_physical, as the
> whole of tb_find_slow doesn't require the tb lock to be held.
>
> Signed-off-by: KONRAD Frederic <fred.kon...@greensocs.com>

So my comments from earlier about the different locking between CONFIG_USER
and system emulation still stand. Ultimately we need a good reason (or an
abstraction) before sprinkling #ifdefs in the code, if only for ease of
reading.
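To illustrate what I mean by an abstraction (a rough sketch only, not a
tested patch): if user-mode and system emulation really do want different
behaviour, a single pair of inline helpers keeps the #ifdef in one header
instead of scattered through the code. The helpers are hypothetical; the
mutex is the one this patch adds to TBContext:

    /* Hypothetical tb-lock helpers: the configuration #ifdef lives here
     * and nowhere else. */
    static inline void tb_lock(void)
    {
    #ifdef CONFIG_USER_ONLY
        qemu_mutex_lock(&tcg_ctx.tb_ctx.tb_lock);
    #else
        /* system emulation: currently serialised elsewhere, so this
         * could stay a no-op until multi-threaded TCG actually lands */
    #endif
    }

    static inline void tb_unlock(void)
    {
    #ifdef CONFIG_USER_ONLY
        qemu_mutex_unlock(&tcg_ctx.tb_ctx.tb_lock);
    #endif
    }

Callers then never need to know which configuration they are built for.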
> Changes:
> V1 -> V2:
>   * Drop a tb_lock around tb_find_fast in cpu-exec.c.
> ---
>  cpu-exec.c             |  60 ++++++++++++++--------
>  target-arm/translate.c |   5 ++
>  tcg/tcg.h              |   7 +++
>  translate-all.c        | 137 ++++++++++++++++++++++++++++++++++++++-----------
>  4 files changed, 158 insertions(+), 51 deletions(-)
>
> diff --git a/cpu-exec.c b/cpu-exec.c
> index d6336d9..5d9b518 100644
> --- a/cpu-exec.c
> +++ b/cpu-exec.c
> @@ -130,6 +130,8 @@ static void init_delay_params(SyncClocks *sc, const CPUState *cpu)
>  void cpu_loop_exit(CPUState *cpu)
>  {
>      cpu->current_tb = NULL;
> +    /* Release those mutex before long jump so other thread can work. */
> +    tb_lock_reset();
>      siglongjmp(cpu->jmp_env, 1);
>  }
>
> @@ -142,6 +144,8 @@ void cpu_resume_from_signal(CPUState *cpu, void *puc)
>      /* XXX: restore cpu registers saved in host registers */
>
>      cpu->exception_index = -1;
> +    /* Release those mutex before long jump so other thread can work. */
> +    tb_lock_reset();
>      siglongjmp(cpu->jmp_env, 1);
>  }
>
> @@ -253,12 +257,9 @@ static void cpu_exec_nocache(CPUArchState *env, int max_cycles,
>      tb_free(tb);
>  }
>
> -static TranslationBlock *tb_find_slow(CPUArchState *env,
> -                                      target_ulong pc,
> -                                      target_ulong cs_base,
> -                                      uint64_t flags)
> +static TranslationBlock *tb_find_physical(CPUArchState *env, target_ulong pc,
> +                                          target_ulong cs_base, uint64_t flags)
>  {

As Paolo has already mentioned, functions that expect to be called with
locks held should have comments saying so.
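Given the series adds a per-thread have_tb_lock counter in translate-all.c
(see further down), such comments could even be backed by a cheap assertion.
A sketch only — the helper and the example function are hypothetical:

    /* Hypothetical helper built on the patch's __thread have_tb_lock: */
    static inline bool tb_lock_held(void)
    {
        return have_tb_lock > 0;
    }

    /* Called with tb_lock held.  */
    static int tb_count_locked(void)
    {
        assert(tb_lock_held());        /* enforce what the comment promises */
        return tcg_ctx.tb_ctx.nb_tbs;  /* safe to read under the lock */
    }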
> -    CPUState *cpu = ENV_GET_CPU(env);
>      TranslationBlock *tb, **ptb1;
>      unsigned int h;
>      tb_page_addr_t phys_pc, phys_page1;
> @@ -273,8 +274,9 @@ static TranslationBlock *tb_find_slow(CPUArchState *env,
>      ptb1 = &tcg_ctx.tb_ctx.tb_phys_hash[h];
>      for(;;) {
>          tb = *ptb1;
> -        if (!tb)
> -            goto not_found;
> +        if (!tb) {
> +            return tb;
> +        }
>          if (tb->pc == pc &&
>              tb->page_addr[0] == phys_page1 &&
>              tb->cs_base == cs_base &&
> @@ -282,28 +284,43 @@ static TranslationBlock *tb_find_slow(CPUArchState *env,
>              /* check next page if needed */
>              if (tb->page_addr[1] != -1) {
>                  tb_page_addr_t phys_page2;
> -
>                  virt_page2 = (pc & TARGET_PAGE_MASK) + TARGET_PAGE_SIZE;
>                  phys_page2 = get_page_addr_code(env, virt_page2);
> -                if (tb->page_addr[1] == phys_page2)
> -                    goto found;
> +                if (tb->page_addr[1] == phys_page2) {
> +                    return tb;
> +                }
>              } else {
> -                goto found;
> +                return tb;
>              }
>          }
>          ptb1 = &tb->phys_hash_next;
>      }
> - not_found:
> -    /* if no translated code available, then translate it now */
> -    tb = tb_gen_code(cpu, pc, cs_base, flags, 0);
> -
> - found:
> -    /* Move the last found TB to the head of the list */
> -    if (likely(*ptb1)) {
> -        *ptb1 = tb->phys_hash_next;
> -        tb->phys_hash_next = tcg_ctx.tb_ctx.tb_phys_hash[h];
> -        tcg_ctx.tb_ctx.tb_phys_hash[h] = tb;
> +    return tb;
> +}
> +
> +static TranslationBlock *tb_find_slow(CPUArchState *env, target_ulong pc,
> +                                      target_ulong cs_base, uint64_t flags)
> +{
> +    /*
> +     * First try to get the tb if we don't find it we need to lock and compile
> +     * it.
> +     */
> +    CPUState *cpu = ENV_GET_CPU(env);
> +    TranslationBlock *tb;
> +
> +    tb = tb_find_physical(env, pc, cs_base, flags);
> +    if (!tb) {
> +        tb_lock();
> +        /*
> +         * Retry to get the TB in case a CPU just translate it to avoid having
> +         * duplicated TB in the pool.
> +         */
> +        tb = tb_find_physical(env, pc, cs_base, flags);
> +        if (!tb) {
> +            tb = tb_gen_code(cpu, pc, cs_base, flags, 0);
> +        }
> +        tb_unlock();
>      }
>      /* we add the TB in the virtual pc hash table */
>      cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)] = tb;
> @@ -326,6 +343,7 @@ static inline TranslationBlock *tb_find_fast(CPUArchState *env)
>                   tb->flags != flags)) {
>          tb = tb_find_slow(env, pc, cs_base, flags);
>      }
> +
>      return tb;
>  }
>
> diff --git a/target-arm/translate.c b/target-arm/translate.c
> index 971b6db..47345aa 100644
> --- a/target-arm/translate.c
> +++ b/target-arm/translate.c
> @@ -11162,6 +11162,8 @@ static inline void gen_intermediate_code_internal(ARMCPU *cpu,
>
>      dc->tb = tb;
>
> +    tb_lock();
> +
>      dc->is_jmp = DISAS_NEXT;
>      dc->pc = pc_start;
>      dc->singlestep_enabled = cs->singlestep_enabled;
> @@ -11499,6 +11501,7 @@ done_generating:
>          tb->size = dc->pc - pc_start;
>          tb->icount = num_insns;
>      }
> +    tb_unlock();
>  }
>
>  void gen_intermediate_code(CPUARMState *env, TranslationBlock *tb)
> @@ -11567,6 +11570,7 @@ void arm_cpu_dump_state(CPUState *cs, FILE *f, fprintf_function cpu_fprintf,
>
>  void restore_state_to_opc(CPUARMState *env, TranslationBlock *tb, int pc_pos)
>  {
> +    tb_lock();
>      if (is_a64(env)) {
>          env->pc = tcg_ctx.gen_opc_pc[pc_pos];
>          env->condexec_bits = 0;
> @@ -11574,4 +11578,5 @@ void restore_state_to_opc(CPUARMState *env, TranslationBlock *tb, int pc_pos)
>          env->regs[15] = tcg_ctx.gen_opc_pc[pc_pos];
>          env->condexec_bits = gen_opc_condexec_bits[pc_pos];
>      }
> +    tb_unlock();
>  }
> diff --git a/tcg/tcg.h b/tcg/tcg.h
> index 41e4869..032fe10 100644
> --- a/tcg/tcg.h
> +++ b/tcg/tcg.h
> @@ -592,17 +592,24 @@ void *tcg_malloc_internal(TCGContext *s, int size);
>  void tcg_pool_reset(TCGContext *s);
>  void tcg_pool_delete(TCGContext *s);
>
> +void tb_lock(void);
> +void tb_unlock(void);
> +void tb_lock_reset(void);
> +
>  static inline void *tcg_malloc(int size)
>  {
>      TCGContext *s = &tcg_ctx;
>      uint8_t *ptr, *ptr_end;
> +    tb_lock();
>      size = (size + sizeof(long) - 1) & ~(sizeof(long) - 1);
>      ptr = s->pool_cur;
>      ptr_end = ptr + size;
>      if (unlikely(ptr_end > s->pool_end)) {
> +        tb_unlock();
>          return tcg_malloc_internal(&tcg_ctx, size);

If the purpose of the lock is to protect the global tcg_ctx then we
shouldn't be unlocking before calling the _internal function which also
messes with the context.
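Keeping the lock held over both paths would look something like this
(untested sketch; it assumes nothing inside tcg_malloc_internal() takes
tb_lock itself, and tb_lock() nests anyway with this patch's counter):

    static inline void *tcg_malloc(int size)
    {
        TCGContext *s = &tcg_ctx;
        uint8_t *ptr, *ptr_end;
        void *ret;

        tb_lock();
        size = (size + sizeof(long) - 1) & ~(sizeof(long) - 1);
        ptr = s->pool_cur;
        ptr_end = ptr + size;
        if (unlikely(ptr_end > s->pool_end)) {
            /* the slow path also modifies tcg_ctx, so stay locked for it */
            ret = tcg_malloc_internal(&tcg_ctx, size);
        } else {
            s->pool_cur = ptr_end;
            ret = ptr;
        }
        tb_unlock();
        return ret;
    }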
>      } else {
>          s->pool_cur = ptr_end;
> +        tb_unlock();
>          return ptr;
>      }
>  }
> diff --git a/translate-all.c b/translate-all.c
> index b6b0e1c..c25b79b 100644
> --- a/translate-all.c
> +++ b/translate-all.c
> @@ -127,6 +127,34 @@ static void *l1_map[V_L1_SIZE];
>  /* code generation context */
>  TCGContext tcg_ctx;
>
> +/* translation block context */
> +__thread volatile int have_tb_lock;
> +
> +void tb_lock(void)
> +{
> +    if (!have_tb_lock) {
> +        qemu_mutex_lock(&tcg_ctx.tb_ctx.tb_lock);
> +    }
> +    have_tb_lock++;
> +}
> +
> +void tb_unlock(void)
> +{
> +    assert(have_tb_lock > 0);
> +    have_tb_lock--;
> +    if (!have_tb_lock) {
> +        qemu_mutex_unlock(&tcg_ctx.tb_ctx.tb_lock);
> +    }
> +}
> +
> +void tb_lock_reset(void)
> +{
> +    if (have_tb_lock) {
> +        qemu_mutex_unlock(&tcg_ctx.tb_ctx.tb_lock);
> +    }
> +    have_tb_lock = 0;
> +}
> +
>  static void tb_link_page(TranslationBlock *tb, tb_page_addr_t phys_pc,
>                           tb_page_addr_t phys_page2);
>  static TranslationBlock *tb_find_pc(uintptr_t tc_ptr);
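A note for other reviewers on the semantics here: because of the counter,
tb_lock() nests within a thread, which is what lets tb_gen_code() (which
takes the lock later in the patch) call tb_link_page() (which takes it
again). The effect is this (illustrative trace):

    tb_lock();      /* have_tb_lock 0 -> 1, qemu_mutex_lock() taken     */
    tb_lock();      /* have_tb_lock 1 -> 2, no mutex operation          */
    tb_unlock();    /* have_tb_lock 2 -> 1, mutex still held            */
    tb_unlock();    /* have_tb_lock 1 -> 0, qemu_mutex_unlock() runs    */

tb_lock_reset() then exists to drop whatever is still held before the
siglongjmp() in cpu_loop_exit() / cpu_resume_from_signal().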
> @@ -215,6 +243,7 @@ static int cpu_restore_state_from_tb(CPUState *cpu, TranslationBlock *tb,
>  #ifdef CONFIG_PROFILER
>      ti = profile_getclock();
>  #endif
> +    tb_lock();
>      tcg_func_start(s);
>
>      gen_intermediate_code_pc(env, tb);
> @@ -228,8 +257,10 @@ static int cpu_restore_state_from_tb(CPUState *cpu, TranslationBlock *tb,
>
>      /* find opc index corresponding to search_pc */
>      tc_ptr = (uintptr_t)tb->tc_ptr;
> -    if (searched_pc < tc_ptr)
> +    if (searched_pc < tc_ptr) {
> +        tb_unlock();
>          return -1;
> +    }
>
>      s->tb_next_offset = tb->tb_next_offset;
>  #ifdef USE_DIRECT_JUMP
> @@ -241,8 +272,10 @@ static int cpu_restore_state_from_tb(CPUState *cpu, TranslationBlock *tb,
>  #endif
>      j = tcg_gen_code_search_pc(s, (tcg_insn_unit *)tc_ptr,
>                                 searched_pc - tc_ptr);
> -    if (j < 0)
> +    if (j < 0) {
> +        tb_unlock();
>          return -1;
> +    }
>      /* now find start of instruction before */
>      while (s->gen_opc_instr_start[j] == 0) {
>          j--;
> @@ -255,6 +288,8 @@ static int cpu_restore_state_from_tb(CPUState *cpu, TranslationBlock *tb,
>      s->restore_time += profile_getclock() - ti;
>      s->restore_count++;
>  #endif
> +
> +    tb_unlock();
>      return 0;
>  }
>
> @@ -672,6 +707,7 @@ static inline void code_gen_alloc(size_t tb_size)
>              CODE_GEN_AVG_BLOCK_SIZE;
>      tcg_ctx.tb_ctx.tbs =
>              g_malloc(tcg_ctx.code_gen_max_blocks * sizeof(TranslationBlock));
> +    qemu_mutex_init(&tcg_ctx.tb_ctx.tb_lock);
>  }
>
>  /* Must be called before using the QEMU cpus. 'tb_size' is the size
> @@ -696,16 +732,22 @@ bool tcg_enabled(void)
>  {
>      return tcg_ctx.code_gen_buffer != NULL;
>  }
>
> -/* Allocate a new translation block. Flush the translation buffer if
> -   too many translation blocks or too much generated code. */
> +/*
> + * Allocate a new translation block. Flush the translation buffer if
> + * too many translation blocks or too much generated code.
> + * tb_alloc is not thread safe but tb_gen_code is protected by a mutex so this
> + * function is called only by one thread.

maybe: "..is not thread safe but tb_gen_code is protected by tb_lock so
only one thread calls it at a time."?

> + */
>  static TranslationBlock *tb_alloc(target_ulong pc)
>  {
> -    TranslationBlock *tb;
> +    TranslationBlock *tb = NULL;
>
>      if (tcg_ctx.tb_ctx.nb_tbs >= tcg_ctx.code_gen_max_blocks ||
>          (tcg_ctx.code_gen_ptr - tcg_ctx.code_gen_buffer) >=
>           tcg_ctx.code_gen_buffer_max_size) {
> -        return NULL;
> +        tb = &tcg_ctx.tb_ctx.tbs[tcg_ctx.tb_ctx.nb_tbs++];
> +        tb->pc = pc;
> +        tb->cflags = 0;
>      }
>      tb = &tcg_ctx.tb_ctx.tbs[tcg_ctx.tb_ctx.nb_tbs++];
>      tb->pc = pc;

That looks weird: if (cond) we take &tcg_ctx.tb_ctx.tbs[tcg_ctx.tb_ctx.nb_tbs++],
and then we take &tcg_ctx.tb_ctx.tbs[tcg_ctx.tb_ctx.nb_tbs++] again anyway?
It also renders the setting of tb = NULL pointless, as tb will always come
from the array.
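As written, hitting the limit consumes two entries and nothing ever
triggers a flush. Presumably the original behaviour was meant to survive,
in which case the function collapses to something like this (sketch,
assuming tb_gen_code() still reacts to a NULL return with a tb_flush()):

    static TranslationBlock *tb_alloc(target_ulong pc)
    {
        TranslationBlock *tb;

        if (tcg_ctx.tb_ctx.nb_tbs >= tcg_ctx.code_gen_max_blocks ||
            (tcg_ctx.code_gen_ptr - tcg_ctx.code_gen_buffer) >=
             tcg_ctx.code_gen_buffer_max_size) {
            /* let the caller notice and flush the translation buffer */
            return NULL;
        }
        tb = &tcg_ctx.tb_ctx.tbs[tcg_ctx.tb_ctx.nb_tbs++];
        tb->pc = pc;
        tb->cflags = 0;
        return tb;
    }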
> @@ -718,11 +760,16 @@ void tb_free(TranslationBlock *tb)
>  {
>      /* In practice this is mostly used for single use temporary TB
>         Ignore the hard cases and just back up if this TB happens to
>         be the last one generated.  */
> +
> +    tb_lock();
> +
>      if (tcg_ctx.tb_ctx.nb_tbs > 0 &&
>          tb == &tcg_ctx.tb_ctx.tbs[tcg_ctx.tb_ctx.nb_tbs - 1]) {
>          tcg_ctx.code_gen_ptr = tb->tc_ptr;
>          tcg_ctx.tb_ctx.nb_tbs--;
>      }
> +
> +    tb_unlock();
>  }
>
>  static inline void invalidate_page_bitmap(PageDesc *p)
> @@ -773,6 +820,8 @@ void tb_flush(CPUArchState *env1)
>  {
>      CPUState *cpu = ENV_GET_CPU(env1);
>
> +    tb_lock();
> +
>  #if defined(DEBUG_FLUSH)
>      printf("qemu: flush code_size=%ld nb_tbs=%d avg_tb_size=%ld\n",
>             (unsigned long)(tcg_ctx.code_gen_ptr - tcg_ctx.code_gen_buffer),
> @@ -797,6 +846,8 @@ void tb_flush(CPUArchState *env1)
>      /* XXX: flush processor icache at this point if cache flush is
>         expensive */
>      tcg_ctx.tb_ctx.tb_flush_count++;
> +
> +    tb_unlock();
>  }
>
>  #ifdef DEBUG_TB_CHECK
> @@ -806,6 +857,8 @@ static void tb_invalidate_check(target_ulong address)
>      TranslationBlock *tb;
>      int i;
>
> +    tb_lock();
> +
>      address &= TARGET_PAGE_MASK;
>      for (i = 0; i < CODE_GEN_PHYS_HASH_SIZE; i++) {
>          for (tb = tb_ctx.tb_phys_hash[i]; tb != NULL; tb = tb->phys_hash_next) {
> @@ -817,6 +870,8 @@ static void tb_invalidate_check(target_ulong address)
>              }
>          }
>      }
> +
> +    tb_unlock();
>  }
>
>  /* verify that all the pages have correct rights for code */
> @@ -825,6 +880,8 @@ static void tb_page_check(void)
>      TranslationBlock *tb;
>      int i, flags1, flags2;
>
> +    tb_lock();
> +
>      for (i = 0; i < CODE_GEN_PHYS_HASH_SIZE; i++) {
>          for (tb = tcg_ctx.tb_ctx.tb_phys_hash[i]; tb != NULL;
>                  tb = tb->phys_hash_next) {
> @@ -836,6 +893,8 @@ static void tb_page_check(void)
>              }
>          }
>      }
> +
> +    tb_unlock();
>  }
>
>  #endif
>
> @@ -916,6 +975,8 @@ void tb_phys_invalidate(TranslationBlock *tb, tb_page_addr_t page_addr)
>      tb_page_addr_t phys_pc;
>      TranslationBlock *tb1, *tb2;
>
> +    tb_lock();
> +
>      /* remove the TB from the hash list */
>      phys_pc = tb->page_addr[0] + (tb->pc & ~TARGET_PAGE_MASK);
>      h = tb_phys_hash_func(phys_pc);
> @@ -963,6 +1024,7 @@ void tb_phys_invalidate(TranslationBlock *tb, tb_page_addr_t page_addr)
>      tb->jmp_first = (TranslationBlock *)((uintptr_t)tb | 2); /* fail safe */
>
>      tcg_ctx.tb_ctx.tb_phys_invalidate_count++;
> +    tb_unlock();
>  }
>
>  static void build_page_bitmap(PageDesc *p)
> @@ -1004,6 +1066,8 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
>      target_ulong virt_page2;
>      int code_gen_size;
>
> +    tb_lock();
> +
>      phys_pc = get_page_addr_code(env, pc);
>      if (use_icount) {
>          cflags |= CF_USE_ICOUNT;
> @@ -1032,6 +1096,8 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
>          phys_page2 = get_page_addr_code(env, virt_page2);
>      }
>      tb_link_page(tb, phys_pc, phys_page2);
> +
> +    tb_unlock();
>      return tb;
>  }
>
> @@ -1330,13 +1396,15 @@ static inline void tb_alloc_page(TranslationBlock *tb,
>  }
>
>  /* add a new TB and link it to the physical page tables. phys_page2 is
> -   (-1) to indicate that only one page contains the TB. */
> + * (-1) to indicate that only one page contains the TB.
> + */
>  static void tb_link_page(TranslationBlock *tb, tb_page_addr_t phys_pc,
>                           tb_page_addr_t phys_page2)
>  {
>      unsigned int h;
>      TranslationBlock **ptb;
>
> +    tb_lock();
> +
>      /* Grab the mmap lock to stop another thread invalidating this TB
>         before we are done.  */
>      mmap_lock();
> @@ -1370,6 +1438,8 @@ static void tb_link_page(TranslationBlock *tb, tb_page_addr_t phys_pc,
>      tb_page_check();
>  #endif
>      mmap_unlock();
> +
> +    tb_unlock();
>  }
>
>  /* find the TB 'tb' such that tb[0].tc_ptr <= tc_ptr <
> @@ -1378,31 +1448,34 @@ static TranslationBlock *tb_find_pc(uintptr_t tc_ptr)
>  {
>      int m_min, m_max, m;
>      uintptr_t v;
> -    TranslationBlock *tb;
> -
> -    if (tcg_ctx.tb_ctx.nb_tbs <= 0) {
> -        return NULL;
> -    }
> -    if (tc_ptr < (uintptr_t)tcg_ctx.code_gen_buffer ||
> -        tc_ptr >= (uintptr_t)tcg_ctx.code_gen_ptr) {
> -        return NULL;
> -    }
> -    /* binary search (cf Knuth) */
> -    m_min = 0;
> -    m_max = tcg_ctx.tb_ctx.nb_tbs - 1;
> -    while (m_min <= m_max) {
> -        m = (m_min + m_max) >> 1;
> -        tb = &tcg_ctx.tb_ctx.tbs[m];
> -        v = (uintptr_t)tb->tc_ptr;
> -        if (v == tc_ptr) {
> -            return tb;
> -        } else if (tc_ptr < v) {
> -            m_max = m - 1;
> -        } else {
> -            m_min = m + 1;
> +    TranslationBlock *tb = NULL;
> +
> +    tb_lock();
> +
> +    if ((tcg_ctx.tb_ctx.nb_tbs > 0)
> +        && (tc_ptr >= (uintptr_t)tcg_ctx.code_gen_buffer &&
> +            tc_ptr < (uintptr_t)tcg_ctx.code_gen_ptr)) {
> +        /* binary search (cf Knuth) */
> +        m_min = 0;
> +        m_max = tcg_ctx.tb_ctx.nb_tbs - 1;
> +        while (m_min <= m_max) {
> +            m = (m_min + m_max) >> 1;
> +            tb = &tcg_ctx.tb_ctx.tbs[m];
> +            v = (uintptr_t)tb->tc_ptr;
> +            if (v == tc_ptr) {
> +                tb_unlock();
> +                return tb;
> +            } else if (tc_ptr < v) {
> +                m_max = m - 1;
> +            } else {
> +                m_min = m + 1;
> +            }
>          }
> +        tb = &tcg_ctx.tb_ctx.tbs[m_max];
>      }
> -    return &tcg_ctx.tb_ctx.tbs[m_max];
> +
> +    tb_unlock();
> +    return tb;
>  }
>
>  #if !defined(CONFIG_USER_ONLY)
> @@ -1564,6 +1637,8 @@ void dump_exec_info(FILE *f, fprintf_function cpu_fprintf)
>      int direct_jmp_count, direct_jmp2_count, cross_page;
>      TranslationBlock *tb;
>
> +    tb_lock();
> +
>      target_code_size = 0;
>      max_target_code_size = 0;
>      cross_page = 0;
> @@ -1619,6 +1694,8 @@ void dump_exec_info(FILE *f, fprintf_function cpu_fprintf)
>             tcg_ctx.tb_ctx.tb_phys_invalidate_count);
>      cpu_fprintf(f, "TLB flush count %d\n", tlb_flush_count);
>      tcg_dump_info(f, cpu_fprintf);
> +
> +    tb_unlock();
>  }
>
>  void dump_opcount_info(FILE *f, fprintf_function cpu_fprintf)

--
Alex Bennée