Sergey Fedorov <sergey.fedo...@linaro.org> writes:

> From: Sergey Fedorov <serge.f...@gmail.com>
>
> This patch is based on ideas found in the work of KONRAD Frederic [1],
> Alex Bennée [2], and Alvise Rigo [3].
>
> This mechanism allows an operation to be performed safely in a quiescent
> state. Quiescent state means: (1) no vCPU is running and (2) the BQL in
> system-mode or 'exclusive_lock' in user-mode emulation is held while
> performing the operation. This functionality is required e.g. for
> performing a translation buffer flush safely in multi-threaded user-mode
> emulation.

<snip>

>
>  QemuCond qemu_work_cond;
> +QemuCond qemu_safe_work_cond;
> +QemuCond qemu_exclusive_cond;
> +
> +static int safe_work_pending;
> +
> +void wait_safe_cpu_work(void)
> +{
> +    while (atomic_mb_read(&safe_work_pending) > 0) {
> +        qemu_cond_wait(&qemu_safe_work_cond, qemu_get_cpu_work_mutex());
> +    }
> +}
Testing in system mode I got a deadlock with one thread sitting here (I
assume after doing its work) and the other threads all waiting on the
exclusive condition. I think we need this:

void wait_safe_cpu_work(void)
{
    while (atomic_mb_read(&safe_work_pending) > 0) {
        /*
         * If there is pending safe work and no pending threads we
         * need to signal another thread to start its work.
         */
        if (tcg_pending_threads == 0) {
            qemu_cond_signal(&qemu_exclusive_cond);
        }
        qemu_cond_wait(&qemu_safe_work_cond, qemu_get_cpu_work_mutex());
    }
}

>
>  static void queue_work_on_cpu(CPUState *cpu, struct qemu_work_item *wi)
>  {
> @@ -91,9 +103,18 @@ static void queue_work_on_cpu(CPUState *cpu, struct qemu_work_item *wi)
>      cpu->queued_work_last = wi;
>      wi->next = NULL;
>      wi->done = false;
> +    if (wi->safe) {
> +        atomic_inc(&safe_work_pending);
> +    }
>      qemu_mutex_unlock(&cpu->work_mutex);
>
> -    qemu_cpu_kick(cpu);
> +    if (!wi->safe) {
> +        qemu_cpu_kick(cpu);
> +    } else {
> +        CPU_FOREACH(cpu) {
> +            qemu_cpu_kick(cpu);
> +        }
> +    }
>  }
>
>  void run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data)
> @@ -108,6 +129,7 @@ void run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data)
>      wi.func = func;
>      wi.data = data;
>      wi.free = false;
> +    wi.safe = false;
>
>      queue_work_on_cpu(cpu, &wi);
>      while (!atomic_mb_read(&wi.done)) {
> @@ -131,6 +153,20 @@ void async_run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data)
>      wi->func = func;
>      wi->data = data;
>      wi->free = true;
> +    wi->safe = false;
> +
> +    queue_work_on_cpu(cpu, wi);
> +}
> +
> +void async_safe_run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data)
> +{
> +    struct qemu_work_item *wi;
> +
> +    wi = g_malloc0(sizeof(struct qemu_work_item));
> +    wi->func = func;
> +    wi->data = data;
> +    wi->free = true;
> +    wi->safe = true;
>
>      queue_work_on_cpu(cpu, wi);
>  }
> @@ -150,9 +186,20 @@ void process_queued_cpu_work(CPUState *cpu)
>          if (!cpu->queued_work_first) {
>              cpu->queued_work_last = NULL;
>          }
> +        if (wi->safe) {
> +            while (tcg_pending_threads) {
> +                qemu_cond_wait(&qemu_exclusive_cond,
> +                               qemu_get_cpu_work_mutex());
> +            }
> +        }
>          qemu_mutex_unlock(&cpu->work_mutex);
>          wi->func(cpu, wi->data);
>          qemu_mutex_lock(&cpu->work_mutex);
> +        if (wi->safe) {
> +            if (!atomic_dec_fetch(&safe_work_pending)) {
> +                qemu_cond_broadcast(&qemu_safe_work_cond);
> +            }
> +        }
>          if (wi->free) {
>              g_free(wi);
>          } else {
> diff --git a/cpus.c b/cpus.c
> index 282d7e399902..b7122043f650 100644
> --- a/cpus.c
> +++ b/cpus.c
> @@ -903,6 +903,8 @@ void qemu_init_cpu_loop(void)
>      qemu_cond_init(&qemu_cpu_cond);
>      qemu_cond_init(&qemu_pause_cond);
>      qemu_cond_init(&qemu_work_cond);
> +    qemu_cond_init(&qemu_safe_work_cond);
> +    qemu_cond_init(&qemu_exclusive_cond);
>      qemu_cond_init(&qemu_io_proceeded_cond);
>      qemu_mutex_init(&qemu_global_mutex);
>
> @@ -926,6 +928,20 @@ static void qemu_tcg_destroy_vcpu(CPUState *cpu)
>  {
>  }
>
> +/* called with qemu_global_mutex held */
> +static inline void tcg_cpu_exec_start(CPUState *cpu)
> +{
> +    tcg_pending_threads++;
> +}
> +
> +/* called with qemu_global_mutex held */
> +static inline void tcg_cpu_exec_end(CPUState *cpu)
> +{
> +    if (--tcg_pending_threads) {
> +        qemu_cond_broadcast(&qemu_exclusive_cond);

Is the broadcast a bit excessive here? Shouldn't we just wake up one
thread with a qemu_cond_signal?
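Something like this, say (just a sketch; it keeps the decrement test
exactly as posted and only swaps the wake-up call):

static inline void tcg_cpu_exec_end(CPUState *cpu)
{
    if (--tcg_pending_threads) {
        /* Wake a single waiter rather than the whole herd; the waiters
         * contend for the same lock and run their work serially anyway. */
        qemu_cond_signal(&qemu_exclusive_cond);
    }
}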
> +    }
> +}
> +
>  static void qemu_wait_io_event_common(CPUState *cpu)
>  {
>      if (cpu->stop) {
> @@ -950,6 +966,8 @@ static void qemu_tcg_wait_io_event(CPUState *cpu)
>      CPU_FOREACH(cpu) {
>          qemu_wait_io_event_common(cpu);
>      }
> +
> +    wait_safe_cpu_work();
>  }
>
>  static void qemu_kvm_wait_io_event(CPUState *cpu)
> @@ -1485,7 +1503,9 @@ static void tcg_exec_all(void)
>                            (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);
>
>          if (cpu_can_run(cpu)) {
> +            tcg_cpu_exec_start(cpu);
>              r = tcg_cpu_exec(cpu);
> +            tcg_cpu_exec_end(cpu);
>              if (r == EXCP_DEBUG) {
>                  cpu_handle_guest_debug(cpu);
>                  break;
> diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
> index 8d5c7dbcf5a9..1f0272beb362 100644
> --- a/include/exec/exec-all.h
> +++ b/include/exec/exec-all.h
> @@ -405,12 +405,22 @@ extern int singlestep;
>
>  /* cpu-exec.c, accessed with atomic_mb_read/atomic_mb_set */
>  extern CPUState *tcg_current_cpu;
> +extern int tcg_pending_threads;
>  extern bool exit_request;
>
>  /**
>   * qemu_work_cond - condition to wait for CPU work items completion
>   */
>  extern QemuCond qemu_work_cond;
> +/**
> + * qemu_safe_work_cond - condition to wait for safe CPU work items completion
> + */
> +extern QemuCond qemu_safe_work_cond;
> +/**
> + * qemu_exclusive_cond - condition to wait for all TCG threads to be out of
> + * the guest code execution loop
> + */
> +extern QemuCond qemu_exclusive_cond;
>
>  /**
>   * qemu_get_cpu_work_mutex() - get the mutex which protects CPU work execution
> @@ -423,5 +433,9 @@ QemuMutex *qemu_get_cpu_work_mutex(void);
>   * @cpu: The CPU which work queue to process.
>   */
>  void process_queued_cpu_work(CPUState *cpu);
> +/**
> + * wait_safe_cpu_work() - wait until all safe CPU work items have been processed
> + */
> +void wait_safe_cpu_work(void);
>
>  #endif
> diff --git a/include/qom/cpu.h b/include/qom/cpu.h
> index c19a673f9f68..ab67bf2ba19f 100644
> --- a/include/qom/cpu.h
> +++ b/include/qom/cpu.h
> @@ -238,6 +238,7 @@ struct qemu_work_item {
>      void *data;
>      int done;
>      bool free;
> +    bool safe;
>  };
>
>  /**
> @@ -632,6 +633,19 @@ void run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data);
>  void async_run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data);
>
>  /**
> + * async_safe_run_on_cpu:
> + * @cpu: The vCPU to run on.
> + * @func: The function to be executed.
> + * @data: Data to pass to the function.
> + *
> + * Schedules the function @func for execution on the vCPU @cpu asynchronously
> + * and in quiescent state. Quiescent state means: (1) all other vCPUs are
> + * halted and (2) #qemu_global_mutex (a.k.a. BQL) in system-mode or
> + * #exclusive_lock in user-mode emulation is held while @func is executing.
> + */
> +void async_safe_run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data);
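As an aside, for anyone wondering how this gets used: the translation
buffer flush from the commit message would presumably be scheduled along
these lines (a minimal sketch; the do_tb_flush helper is hypothetical,
not part of this patch):

static void do_tb_flush(CPUState *cpu, void *data)
{
    /* Called in the quiescent state: no other vCPU is executing guest
     * code and the BQL/exclusive_lock is held, so dropping every TB is
     * safe. */
    tb_flush(cpu);
}

/* ... then, from any vCPU thread, instead of flushing inline: */
async_safe_run_on_cpu(cpu, do_tb_flush, NULL);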
> +
> +/**
>   * qemu_get_cpu:
>   * @index: The CPUState@cpu_index value of the CPU to obtain.
>   *
> diff --git a/linux-user/main.c b/linux-user/main.c
> index fce61d5a35fc..d0ff5f9976e5 100644
> --- a/linux-user/main.c
> +++ b/linux-user/main.c
> @@ -110,18 +110,17 @@ int cpu_get_pic_interrupt(CPUX86State *env)
>     which requires quite a lot of per host/target work.  */
>  static QemuMutex cpu_list_mutex;
>  static QemuMutex exclusive_lock;
> -static QemuCond exclusive_cond;
>  static QemuCond exclusive_resume;
>  static bool exclusive_pending;
> -static int tcg_pending_threads;
>
>  void qemu_init_cpu_loop(void)
>  {
>      qemu_mutex_init(&cpu_list_mutex);
>      qemu_mutex_init(&exclusive_lock);
> -    qemu_cond_init(&exclusive_cond);
>      qemu_cond_init(&exclusive_resume);
>      qemu_cond_init(&qemu_work_cond);
> +    qemu_cond_init(&qemu_safe_work_cond);
> +    qemu_cond_init(&qemu_exclusive_cond);
>  }
>
>  /* Make sure everything is in a consistent state for calling fork(). */
> @@ -148,9 +147,10 @@ void fork_end(int child)
>          exclusive_pending = false;
>          qemu_mutex_init(&exclusive_lock);
>          qemu_mutex_init(&cpu_list_mutex);
> -        qemu_cond_init(&exclusive_cond);
>          qemu_cond_init(&exclusive_resume);
>          qemu_cond_init(&qemu_work_cond);
> +        qemu_cond_init(&qemu_safe_work_cond);
> +        qemu_cond_init(&qemu_exclusive_cond);
>          qemu_mutex_init(&tcg_ctx.tb_ctx.tb_lock);
>          gdbserver_fork(thread_cpu);
>      } else {
> @@ -190,7 +190,7 @@ static inline void start_exclusive(void)
>          }
>      }
>      while (tcg_pending_threads) {
> -        qemu_cond_wait(&exclusive_cond, &exclusive_lock);
> +        qemu_cond_wait(&qemu_exclusive_cond, &exclusive_lock);
>      }
>  }
>
> @@ -219,10 +219,11 @@ static inline void cpu_exec_end(CPUState *cpu)
>      cpu->running = false;
>      tcg_pending_threads--;
>      if (!tcg_pending_threads) {
> -        qemu_cond_signal(&exclusive_cond);
> +        qemu_cond_broadcast(&qemu_exclusive_cond);
>      }
>      exclusive_idle();
>      process_queued_cpu_work(cpu);
> +    wait_safe_cpu_work();
>      qemu_mutex_unlock(&exclusive_lock);
>  }

--
Alex Bennée