Sergey Fedorov <sergey.fedo...@linaro.org> writes:

> From: Sergey Fedorov <serge.f...@gmail.com>
>
> This patch is based on the ideas found in the work of KONRAD Frederic [1],
> Alex Bennée [2], and Alvise Rigo [3].
>
> This mechanism allows an operation to be performed safely in a quiescent
> state. Quiescent state means: (1) no vCPU is running and (2) the BQL in
> system-mode or 'exclusive_lock' in user-mode emulation is held while the
> operation is performed. This functionality is required e.g. for performing
> a translation buffer flush safely in multi-threaded user-mode emulation.
>
> The existing CPU work queue is used to schedule such safe operations. A
> new 'safe' flag is added to struct qemu_work_item to designate the special
> requirements of safe work. An operation in a quiescent state can be
> scheduled with the async_safe_run_on_cpu() function, which is the same as
> async_run_on_cpu() except that it marks the queued work item with the
> 'safe' flag set to true. When this flag is set, queue_work_on_cpu()
> atomically increments the 'safe_work_pending' global counter and kicks all
> CPUs, instead of just the target CPU as for normal CPU work. This forces
> the other CPUs to exit their execution loops and wait in the
> wait_safe_cpu_work() function for the safe work to finish. When a CPU
> drains its work queue and encounters a work item marked as safe, it first
> waits for the other CPUs to exit their execution loops, then calls the
> work item function, and finally decrements the 'safe_work_pending'
> counter, signalling the other CPUs to continue execution as soon as all
> pending safe work items have been processed. The 'tcg_pending_threads'
> counter, protected by 'exclusive_lock' in user-mode or by
> 'qemu_global_mutex' in system-mode emulation, is used to determine whether
> any CPU is running and to wait for it to exit the execution loop. The
> fairness of all the CPU work queues is ensured by draining all the pending
> safe work items before any CPU can run.
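To make the intended use concrete: a caller that needs the translation
buffer flushed (the motivating case mentioned above) could schedule it as
safe work roughly as in the sketch below. The do_tb_flush_safe() helper and
the call site are illustrative only, not part of this patch; the only thing
taken from the patch is the async_safe_run_on_cpu() signature.

    /* Illustrative sketch: request a TB flush as "safe" work.  The helper
     * has the run_on_cpu_func signature and runs in the quiescent state,
     * i.e. with no other vCPU executing and the BQL/exclusive_lock held. */
    static void do_tb_flush_safe(CPUState *cpu, void *data)
    {
        tb_flush(cpu);
    }

    /* e.g. on the requesting vCPU when the translation buffer is full */
    async_safe_run_on_cpu(cpu, do_tb_flush_safe, NULL);
    cpu_loop_exit(cpu);  /* leave the execution loop so the work can run */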
>
> [1] http://lists.nongnu.org/archive/html/qemu-devel/2015-08/msg01128.html
> [2] http://lists.nongnu.org/archive/html/qemu-devel/2016-04/msg02531.html
> [3] http://lists.nongnu.org/archive/html/qemu-devel/2016-05/msg04792.html
>
> Signed-off-by: Sergey Fedorov <serge.f...@gmail.com>
> Signed-off-by: Sergey Fedorov <sergey.fedo...@linaro.org>
> ---
>
> Changes in v2:
>  - some conditional variables moved to cpu-exec-common.c
>  - documentation comment for new public API added
> ---
>  cpu-exec-common.c       | 49 ++++++++++++++++++++++++++++++++++++++++++++++++-
>  cpus.c                  | 20 ++++++++++++++++++++
>  include/exec/exec-all.h | 14 ++++++++++++++
>  include/qom/cpu.h       | 14 ++++++++++++++
>  linux-user/main.c       | 13 +++++++------
>  5 files changed, 103 insertions(+), 7 deletions(-)
>
> diff --git a/cpu-exec-common.c b/cpu-exec-common.c
> index a233f0124559..6f278d6d3b70 100644
> --- a/cpu-exec-common.c
> +++ b/cpu-exec-common.c
> @@ -25,6 +25,7 @@
>  
>  bool exit_request;
>  CPUState *tcg_current_cpu;
> +int tcg_pending_threads;
>  
>  /* exit the current TB, but without causing any exception to be raised */
>  void cpu_loop_exit_noexc(CPUState *cpu)
> @@ -79,6 +80,17 @@ void cpu_loop_exit_restore(CPUState *cpu, uintptr_t pc)
>  }
>  
>  QemuCond qemu_work_cond;
> +QemuCond qemu_safe_work_cond;
> +QemuCond qemu_exclusive_cond;
> +
> +static int safe_work_pending;
> +
> +void wait_safe_cpu_work(void)
> +{
> +    while (atomic_mb_read(&safe_work_pending) > 0) {
> +        qemu_cond_wait(&qemu_safe_work_cond, qemu_get_cpu_work_mutex());
> +    }
> +}
>  
>  static void queue_work_on_cpu(CPUState *cpu, struct qemu_work_item *wi)
>  {
> @@ -91,9 +103,18 @@ static void queue_work_on_cpu(CPUState *cpu, struct qemu_work_item *wi)
>      cpu->queued_work_last = wi;
>      wi->next = NULL;
>      wi->done = false;
> +    if (wi->safe) {
> +        atomic_inc(&safe_work_pending);
> +    }
>      qemu_mutex_unlock(&cpu->work_mutex);
>  
> -    qemu_cpu_kick(cpu);
> +    if (!wi->safe) {
> +        qemu_cpu_kick(cpu);
> +    } else {
> +        CPU_FOREACH(cpu) {
> +            qemu_cpu_kick(cpu);
> +        }
> +    }
>  }
>  
>  void run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data)
> @@ -108,6 +129,7 @@ void run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data)
>      wi.func = func;
>      wi.data = data;
>      wi.free = false;
> +    wi.safe = false;
>  
>      queue_work_on_cpu(cpu, &wi);
>      while (!atomic_mb_read(&wi.done)) {
> @@ -131,6 +153,20 @@ void async_run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data)
>      wi->func = func;
>      wi->data = data;
>      wi->free = true;
> +    wi->safe = false;
> +
> +    queue_work_on_cpu(cpu, wi);
> +}
> +
> +void async_safe_run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data)
> +{
> +    struct qemu_work_item *wi;
> +
> +    wi = g_malloc0(sizeof(struct qemu_work_item));
> +    wi->func = func;
> +    wi->data = data;
> +    wi->free = true;
> +    wi->safe = true;
>  
>      queue_work_on_cpu(cpu, wi);
>  }
> @@ -150,9 +186,20 @@ void process_queued_cpu_work(CPUState *cpu)
>          if (!cpu->queued_work_first) {
>              cpu->queued_work_last = NULL;
>          }
> +        if (wi->safe) {
> +            while (tcg_pending_threads) {
> +                qemu_cond_wait(&qemu_exclusive_cond,
> +                               qemu_get_cpu_work_mutex());
> +            }
> +        }
>          qemu_mutex_unlock(&cpu->work_mutex);
>          wi->func(cpu, wi->data);
>          qemu_mutex_lock(&cpu->work_mutex);
> +        if (wi->safe) {
> +            if (!atomic_dec_fetch(&safe_work_pending)) {
> +                qemu_cond_broadcast(&qemu_safe_work_cond);
> +            }
> +        }
>          if (wi->free) {
>              g_free(wi);
>          } else {
> diff --git a/cpus.c b/cpus.c
> index 282d7e399902..b7122043f650 100644
> --- a/cpus.c
> +++ b/cpus.c
> @@ -903,6 +903,8 @@ void qemu_init_cpu_loop(void)
>      qemu_cond_init(&qemu_cpu_cond);
>      qemu_cond_init(&qemu_pause_cond);
>      qemu_cond_init(&qemu_work_cond);
> +    qemu_cond_init(&qemu_safe_work_cond);
> +    qemu_cond_init(&qemu_exclusive_cond);
>      qemu_cond_init(&qemu_io_proceeded_cond);
>      qemu_mutex_init(&qemu_global_mutex);
>  
> @@ -926,6 +928,20 @@ static void qemu_tcg_destroy_vcpu(CPUState *cpu)
>  {
>  }
>  
> +/* called with qemu_global_mutex held */
> +static inline void tcg_cpu_exec_start(CPUState *cpu)
> +{
> +    tcg_pending_threads++;
> +}
> +
> +/* called with qemu_global_mutex held */
> +static inline void tcg_cpu_exec_end(CPUState *cpu)
> +{
> +    if (--tcg_pending_threads) {
> +        qemu_cond_broadcast(&qemu_exclusive_cond);
> +    }
> +}
> +
>  static void qemu_wait_io_event_common(CPUState *cpu)
>  {
>      if (cpu->stop) {
> @@ -950,6 +966,8 @@ static void qemu_tcg_wait_io_event(CPUState *cpu)
>      CPU_FOREACH(cpu) {
>          qemu_wait_io_event_common(cpu);
>      }
> +
> +    wait_safe_cpu_work();
>  }
>  
>  static void qemu_kvm_wait_io_event(CPUState *cpu)
> @@ -1485,7 +1503,9 @@ static void tcg_exec_all(void)
>                            (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);
>  
>          if (cpu_can_run(cpu)) {
> +            tcg_cpu_exec_start(cpu);
>              r = tcg_cpu_exec(cpu);
> +            tcg_cpu_exec_end(cpu);
>              if (r == EXCP_DEBUG) {
>                  cpu_handle_guest_debug(cpu);
>                  break;
> diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
> index bceecbd5222a..992055ce36bf 100644
> --- a/include/exec/exec-all.h
> +++ b/include/exec/exec-all.h
> @@ -405,12 +405,22 @@ extern int singlestep;
>  
>  /* cpu-exec.c, accessed with atomic_mb_read/atomic_mb_set */
>  extern CPUState *tcg_current_cpu;
> +extern int tcg_pending_threads;
>  extern bool exit_request;
>  
>  /**
>   * qemu_work_cond - condition to wait for CPU work items completion
>   */
>  extern QemuCond qemu_work_cond;
> +/**
> + * qemu_safe_work_cond - condition to wait for safe CPU work items completion
> + */
> +extern QemuCond qemu_safe_work_cond;
> +/**
> + * qemu_exclusive_cond - condition to wait for all TCG threads to be out of
> + *                       guest code execution loop
> + */
> +extern QemuCond qemu_exclusive_cond;
>  
>  /**
>   * qemu_get_cpu_work_mutex() - get the mutex which protects CPU work execution
> @@ -423,5 +433,9 @@ QemuMutex *qemu_get_cpu_work_mutex(void);
>   * @cpu: The CPU which work queue to process.
>   */
>  void process_queued_cpu_work(CPUState *cpu);
> +/**
> + * wait_safe_cpu_work() - wait until all safe CPU work items have been processed
> + */
> +void wait_safe_cpu_work(void);
>  
>  #endif
> diff --git a/include/qom/cpu.h b/include/qom/cpu.h
> index 4e688f645b4a..5128fcc1745a 100644
> --- a/include/qom/cpu.h
> +++ b/include/qom/cpu.h
> @@ -231,6 +231,7 @@ struct qemu_work_item {
>      void *data;
>      int done;
>      bool free;
> +    bool safe;
>  };
>  
>  /**
> @@ -625,6 +626,19 @@ void run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data);
>  void async_run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data);
>  
>  /**
> + * async_safe_run_on_cpu:
> + * @cpu: The vCPU to run on.
> + * @func: The function to be executed.
> + * @data: Data to pass to the function.
> + *
> + * Schedules the function @func for execution on the vCPU @cpu asynchronously
> + * and in quiescent state. Quiescent state means: (1) all other vCPUs are
> + * halted and (2) #qemu_global_mutex (a.k.a. BQL) in system-mode or
> + * #exclusive_lock in user-mode emulation is held while @func is executing.
> + */
> +void async_safe_run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data);
> +
> +/**
>   * qemu_get_cpu:
>   * @index: The CPUState@cpu_index value of the CPU to obtain.
>   *
> diff --git a/linux-user/main.c b/linux-user/main.c
> index fce61d5a35fc..d0ff5f9976e5 100644
> --- a/linux-user/main.c
> +++ b/linux-user/main.c
> @@ -110,18 +110,17 @@ int cpu_get_pic_interrupt(CPUX86State *env)
>     which requires quite a lot of per host/target work. */
>  static QemuMutex cpu_list_mutex;
>  static QemuMutex exclusive_lock;
> -static QemuCond exclusive_cond;
>  static QemuCond exclusive_resume;
>  static bool exclusive_pending;
> -static int tcg_pending_threads;
>  
>  void qemu_init_cpu_loop(void)
>  {
>      qemu_mutex_init(&cpu_list_mutex);
>      qemu_mutex_init(&exclusive_lock);
> -    qemu_cond_init(&exclusive_cond);
>      qemu_cond_init(&exclusive_resume);
>      qemu_cond_init(&qemu_work_cond);
> +    qemu_cond_init(&qemu_safe_work_cond);
> +    qemu_cond_init(&qemu_exclusive_cond);
>  }
>  
>  /* Make sure everything is in a consistent state for calling fork(). */
> @@ -148,9 +147,10 @@ void fork_end(int child)
>          exclusive_pending = false;
>          qemu_mutex_init(&exclusive_lock);
>          qemu_mutex_init(&cpu_list_mutex);
> -        qemu_cond_init(&exclusive_cond);
>          qemu_cond_init(&exclusive_resume);
>          qemu_cond_init(&qemu_work_cond);
> +        qemu_cond_init(&qemu_safe_work_cond);
> +        qemu_cond_init(&qemu_exclusive_cond);
>          qemu_mutex_init(&tcg_ctx.tb_ctx.tb_lock);
>          gdbserver_fork(thread_cpu);
>      } else {
> @@ -190,7 +190,7 @@ static inline void start_exclusive(void)
>          }
>      }
>      while (tcg_pending_threads) {
> -        qemu_cond_wait(&exclusive_cond, &exclusive_lock);
> +        qemu_cond_wait(&qemu_exclusive_cond, &exclusive_lock);
>      }
>  }
>  
> @@ -219,10 +219,11 @@ static inline void cpu_exec_end(CPUState *cpu)
>      cpu->running = false;
>      tcg_pending_threads--;
>      if (!tcg_pending_threads) {
> -        qemu_cond_signal(&exclusive_cond);
> +        qemu_cond_broadcast(&qemu_exclusive_cond);
>      }
>      exclusive_idle();
>      process_queued_cpu_work(cpu);
> +    wait_safe_cpu_work();
>      qemu_mutex_unlock(&exclusive_lock);
>  }
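As a cross-file summary of the handshake, here is the user-mode exit path
again, condensed from the hunk above with explanatory comments added (not a
literal copy of the patch):

    static inline void cpu_exec_end(CPUState *cpu)
    {
        /* exclusive_lock is taken just before this point (context not
         * shown in the hunk) and held for the whole function. */
        cpu->running = false;
        tcg_pending_threads--;
        if (!tcg_pending_threads) {
            /* Last vCPU out of the execution loop: wake anyone waiting on
             * qemu_exclusive_cond, including a CPU about to run a safe
             * work item in process_queued_cpu_work(). */
            qemu_cond_broadcast(&qemu_exclusive_cond);
        }
        exclusive_idle();
        /* Safe items wait here until tcg_pending_threads drops to zero,
         * run their function, then decrement safe_work_pending and
         * broadcast qemu_safe_work_cond once it reaches zero. */
        process_queued_cpu_work(cpu);
        /* Do not re-enter guest code while safe work is still pending. */
        wait_safe_cpu_work();
        qemu_mutex_unlock(&exclusive_lock);
    }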
Reviewed-by: Alex Bennée <alex.ben...@linaro.org>

-- 
Alex Bennée