With this patch, the iothread lock will only be held around:
1. kvm_flush_coalesced_mmio_buffer();
2. kvm_handle_io();
3. cpu_physical_memory_rw()

And I think the cpu_lock can be treated as the apic_state device's lock; apic_state is the first device that we free out of the iothread lock.
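For reference, the qemu_mutex_lock_cpu()/qemu_mutex_unlock_cpu() helpers used throughout the patch below are not shown in the hunks; they are meant as thin wrappers around the per-env mutex, along the lines of this sketch (it assumes env->cpu_lock is a QemuMutex * set up at vcpu creation):

    /* Sketch only -- the patch does not show these definitions. */
    static inline void qemu_mutex_lock_cpu(CPUArchState *env)
    {
        qemu_mutex_lock(env->cpu_lock);
    }

    static inline void qemu_mutex_unlock_cpu(CPUArchState *env)
    {
        qemu_mutex_unlock(env->cpu_lock);
    }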
Regards,
pingfan

On Sat, Jun 23, 2012 at 5:32 PM, liu ping fan <qemul...@gmail.com> wrote:
> On Sat, Jun 23, 2012 at 4:06 AM, Anthony Liguori <anth...@codemonkey.ws> wrote:
>> On 06/21/2012 09:49 AM, Liu Ping Fan wrote:
>>>
>>> In order to break the big lock, use a per-cpu lock in kvm_cpu_exec()
>>> to protect against other cpus' access to env->apic_state and the
>>> related fields in env.
>>> Also, we need to protect against run_on_cpu().
>>>
>>> The race conditions look like this:
>>> 1. vcpu-1 IPIs vcpu-2
>>>    vcpu-3 IPIs vcpu-2
>>>    An open window exists for access to vcpu-2's apic_state and env.
>>>
>>> 2. run_on_cpu() writes env->queued_work_last while flush_queued_work()
>>>    reads it.
>>>
>>> Signed-off-by: Liu Ping Fan <pingf...@linux.vnet.ibm.com>
>>> ---
>>>  cpus.c    |    6 ++++--
>>>  hw/apic.c |   58 ++++++++++++++++++++++++++++++++++++++++++++++++++++++----
>>>  hw/pc.c   |    8 +++++++-
>>>  kvm-all.c |   13 +++++++++++--
>>>  4 files changed, 76 insertions(+), 9 deletions(-)
>>>
>>> diff --git a/cpus.c b/cpus.c
>>> index 554f7bc..ac99afe 100644
>>> --- a/cpus.c
>>> +++ b/cpus.c
>>> @@ -649,6 +649,7 @@ void run_on_cpu(CPUArchState *env, void (*func)(void *data), void *data)
>>>
>>>      wi.func = func;
>>>      wi.data = data;
>>> +    qemu_mutex_lock(env->cpu_lock);
>>>      if (!env->queued_work_first) {
>>>          env->queued_work_first = &wi;
>>>      } else {
>>> @@ -657,6 +658,7 @@ void run_on_cpu(CPUArchState *env, void (*func)(void *data), void *data)
>>>      env->queued_work_last = &wi;
>>>      wi.next = NULL;
>>>      wi.done = false;
>>> +    qemu_mutex_unlock(env->cpu_lock);
>>>
>>>      qemu_cpu_kick(env);
>>>      while (!wi.done) {
>>> @@ -718,7 +720,7 @@ static void qemu_tcg_wait_io_event(void)
>>>  static void qemu_kvm_wait_io_event(CPUArchState *env)
>>>  {
>>>      while (cpu_thread_is_idle(env)) {
>>> -        qemu_cond_wait(env->halt_cond, &qemu_global_mutex);
>>> +        qemu_cond_wait(env->halt_cond, env->cpu_lock);
>>>      }
>>>
>>>      qemu_kvm_eat_signals(env);
>>> @@ -729,8 +731,8 @@ static void *qemu_kvm_cpu_thread_fn(void *arg)
>>>  {
>>>      CPUArchState *env = arg;
>>>      int r;
>>> +    qemu_mutex_lock_cpu(env);
>>>
>>> -    qemu_mutex_lock(&qemu_global_mutex);
>>>      qemu_thread_get_self(env->thread);
>>>      env->thread_id = qemu_get_thread_id();
>>>      cpu_single_env = env;
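A note on the run_on_cpu() hunks above: they serialize the producer side of the work queue with cpu_lock. The consumer, flush_queued_work(), runs in the vcpu thread, which presumably already holds cpu_lock at that point (cf. the qemu_kvm_wait_io_event() change), so the list manipulation is serialized as well. For illustration only, the consumer side is assumed to look roughly like this (it is not part of these hunks; qemu_work_cond as in cpus.c):

    /* Sketch: runs in the vcpu thread with env->cpu_lock already held. */
    static void flush_queued_work(CPUArchState *env)
    {
        struct qemu_work_item *wi;

        while ((wi = env->queued_work_first) != NULL) {
            env->queued_work_first = wi->next;
            wi->func(wi->data);
            wi->done = true;            /* run_on_cpu() waiters poll wi.done */
        }
        env->queued_work_last = NULL;
        qemu_cond_broadcast(&qemu_work_cond);
    }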
>>> diff --git a/hw/apic.c b/hw/apic.c
>>> index 4eeaf88..b999a40 100644
>>> --- a/hw/apic.c
>>> +++ b/hw/apic.c
>>> @@ -22,6 +22,7 @@
>>>  #include "host-utils.h"
>>>  #include "trace.h"
>>>  #include "pc.h"
>>> +#include "qemu-thread.h"
>>>
>>>  #define MAX_APIC_WORDS 8
>>>
>>> @@ -94,6 +95,7 @@ static int get_highest_priority_int(uint32_t *tab)
>>>      return -1;
>>>  }
>>>
>>> +/* Caller must hold the lock */
>>>  static void apic_sync_vapic(APICCommonState *s, int sync_type)
>>>  {
>>>      VAPICState vapic_state;
>>> @@ -141,11 +143,13 @@ static void apic_sync_vapic(APICCommonState *s, int sync_type)
>>>      }
>>>  }
>>>
>>> +/* Caller must hold lock */
>>>  static void apic_vapic_base_update(APICCommonState *s)
>>>  {
>>>      apic_sync_vapic(s, SYNC_TO_VAPIC);
>>>  }
>>>
>>> +/* Caller must hold the lock */
>>>  static void apic_local_deliver(APICCommonState *s, int vector)
>>>  {
>>>      uint32_t lvt = s->lvt[vector];
>>> @@ -175,9 +179,11 @@ static void apic_local_deliver(APICCommonState *s, int vector)
>>>              (lvt & APIC_LVT_LEVEL_TRIGGER))
>>>              trigger_mode = APIC_TRIGGER_LEVEL;
>>>          apic_set_irq(s, lvt & 0xff, trigger_mode);
>>> +        break;
>>>      }
>>>  }
>>>
>>> +/* Caller must hold the lock */
>>>  void apic_deliver_pic_intr(DeviceState *d, int level)
>>>  {
>>>      APICCommonState *s = DO_UPCAST(APICCommonState, busdev.qdev, d);
>>> @@ -200,9 +206,12 @@ void apic_deliver_pic_intr(DeviceState *d, int level)
>>>      }
>>>  }
>>>
>>> +/* Must hold lock */
>>>  static void apic_external_nmi(APICCommonState *s)
>>>  {
>>> +    qemu_mutex_lock_cpu(s->cpu_env);
>>>      apic_local_deliver(s, APIC_LVT_LINT1);
>>> +    qemu_mutex_unlock_cpu(s->cpu_env);
>>>  }
>>>
>>>  #define foreach_apic(apic, deliver_bitmask, code) \
>>> @@ -215,7 +224,9 @@ static void apic_external_nmi(APICCommonState *s)
>>>          if (__mask & (1 << __j)) {\
>>>              apic = local_apics[__i * 32 + __j];\
>>>              if (apic) {\
>>> +                qemu_mutex_lock_cpu(apic->cpu_env);\
>>>                  code;\
>>> +                qemu_mutex_unlock_cpu(apic->cpu_env);\
>>>              }\
>>>          }\
>>>      }\
>>> @@ -244,7 +255,9 @@ static void apic_bus_deliver(const uint32_t *deliver_bitmask,
>>>              if (d >= 0) {
>>>                  apic_iter = local_apics[d];
>>>                  if (apic_iter) {
>>> +                    qemu_mutex_lock_cpu(apic_iter->cpu_env);
>>>                      apic_set_irq(apic_iter, vector_num, trigger_mode);
>>> +                    qemu_mutex_unlock_cpu(apic_iter->cpu_env);
>>>                  }
>>>              }
>>>          }
>>> @@ -293,6 +306,7 @@ void apic_deliver_irq(uint8_t dest, uint8_t dest_mode, uint8_t delivery_mode,
>>>      apic_bus_deliver(deliver_bitmask, delivery_mode, vector_num, trigger_mode);
>>>  }
>>>
>>> +/* Caller must hold lock */
>>>  static void apic_set_base(APICCommonState *s, uint64_t val)
>>>  {
>>>      s->apicbase = (val & 0xfffff000) |
>>> @@ -305,6 +319,7 @@ static void apic_set_base(APICCommonState *s, uint64_t val)
>>>      }
>>>  }
>>>
>>> +/* caller must hold lock */
>>>  static void apic_set_tpr(APICCommonState *s, uint8_t val)
>>>  {
>>>      /* Updates from cr8 are ignored while the VAPIC is active */
>>> @@ -314,12 +329,14 @@ static void apic_set_tpr(APICCommonState *s, uint8_t val)
>>>      }
>>>  }
>>>
>>> +/* caller must hold lock */
>>>  static uint8_t apic_get_tpr(APICCommonState *s)
>>>  {
>>>      apic_sync_vapic(s, SYNC_FROM_VAPIC);
>>>      return s->tpr >> 4;
>>>  }
>>>
>>> +/* Caller must hold the lock */
>>>  static int apic_get_ppr(APICCommonState *s)
>>>  {
>>>      int tpr, isrv, ppr;
>>> @@ -348,6 +365,7 @@ static int apic_get_arb_pri(APICCommonState *s)
>>>   * 0 - no interrupt,
>>>   * >0 - interrupt number
>>>   */
>>> +/* Caller must hold cpu_lock */
>>>  static int apic_irq_pending(APICCommonState *s)
>>>  {
>>>      int irrv, ppr;
>>> @@ -363,6 +381,7 @@ static int apic_irq_pending(APICCommonState *s)
>>>      return irrv;
>>>  }
>>>
>>> +/* caller must ensure the lock has been taken */
>>>  /* signal the CPU if an irq is pending */
>>>  static void apic_update_irq(APICCommonState *s)
>>>  {
>>> @@ -380,11 +399,13 @@ static void apic_update_irq(APICCommonState *s)
>>>  void apic_poll_irq(DeviceState *d)
>>>  {
>>>      APICCommonState *s = APIC_COMMON(d);
>>> -
>>> +    qemu_mutex_lock_cpu(s->cpu_env);
>>>      apic_sync_vapic(s, SYNC_FROM_VAPIC);
>>>      apic_update_irq(s);
>>> +    qemu_mutex_unlock_cpu(s->cpu_env);
>>>  }
>>>
>>> +/* caller must hold the lock */
>>>  static void apic_set_irq(APICCommonState *s, int vector_num, int trigger_mode)
>>>  {
>>>      apic_report_irq_delivered(!get_bit(s->irr, vector_num));
>>> @@ -407,6 +428,7 @@ static void apic_set_irq(APICCommonState *s, int vector_num, int trigger_mode)
>>>      apic_update_irq(s);
>>>  }
>>>
>>> +/* caller must hold the lock */
>>>  static void apic_eoi(APICCommonState *s)
>>>  {
>>>      int isrv;
>>> @@ -415,6 +437,7 @@ static void apic_eoi(APICCommonState *s)
>>>          return;
>>>      reset_bit(s->isr, isrv);
>>>      if (!(s->spurious_vec & APIC_SV_DIRECTED_IO) && get_bit(s->tmr, isrv)) {
>>> +        /* FIXME: need to take an extra lock for the bus? */
>>>          ioapic_eoi_broadcast(isrv);
>>>      }
>>>      apic_sync_vapic(s, SYNC_FROM_VAPIC | SYNC_TO_VAPIC);
>>> @@ -480,18 +503,23 @@ static void apic_get_delivery_bitmask(uint32_t *deliver_bitmask,
>>>  static void apic_startup(APICCommonState *s, int vector_num)
>>>  {
>>>      s->sipi_vector = vector_num;
>>> +    qemu_mutex_lock_cpu(s->cpu_env);
>>>      cpu_interrupt(s->cpu_env, CPU_INTERRUPT_SIPI);
>>> +    qemu_mutex_unlock_cpu(s->cpu_env);
>>>  }
>>>
>>>  void apic_sipi(DeviceState *d)
>>>  {
>>>      APICCommonState *s = DO_UPCAST(APICCommonState, busdev.qdev, d);
>>> -
>>> +    qemu_mutex_lock_cpu(s->cpu_env);
>>>      cpu_reset_interrupt(s->cpu_env, CPU_INTERRUPT_SIPI);
>>>
>>> -    if (!s->wait_for_sipi)
>>> +    if (!s->wait_for_sipi) {
>>> +        qemu_mutex_unlock_cpu(s->cpu_env);
>>>          return;
>>> +    }
>>>      cpu_x86_load_seg_cache_sipi(s->cpu_env, s->sipi_vector);
>>> +    qemu_mutex_unlock_cpu(s->cpu_env);
>>>      s->wait_for_sipi = 0;
>>>  }
>>>
>>> @@ -543,6 +571,7 @@ static void apic_deliver(DeviceState *d, uint8_t dest, uint8_t dest_mode,
>>>      apic_bus_deliver(deliver_bitmask, delivery_mode, vector_num, trigger_mode);
>>>  }
>>>
>>> +/* Caller must take lock */
>>>  int apic_get_interrupt(DeviceState *d)
>>>  {
>>>      APICCommonState *s = DO_UPCAST(APICCommonState, busdev.qdev, d);
>>> @@ -572,6 +601,7 @@ int apic_get_interrupt(DeviceState *d)
>>>      return intno;
>>>  }
>>>
>>> +/* Caller must hold the cpu_lock */
>>>  int apic_accept_pic_intr(DeviceState *d)
>>>  {
>>>      APICCommonState *s = DO_UPCAST(APICCommonState, busdev.qdev, d);
>>> @@ -589,6 +619,7 @@ int apic_accept_pic_intr(DeviceState *d)
>>>      return 0;
>>>  }
>>>
>>> +/* Caller must hold lock */
>>>  static uint32_t apic_get_current_count(APICCommonState *s)
>>>  {
>>>      int64_t d;
>>> @@ -619,9 +650,10 @@ static void apic_timer_update(APICCommonState *s, int64_t current_time)
>>>  static void apic_timer(void *opaque)
>>>  {
>>>      APICCommonState *s = opaque;
>>> -
>>> +    qemu_mutex_lock_cpu(s->cpu_env);
>>>      apic_local_deliver(s, APIC_LVT_TIMER);
>>>      apic_timer_update(s, s->next_time);
>>> +    qemu_mutex_unlock_cpu(s->cpu_env);
>>>  }
>>>
>>>  static uint32_t apic_mem_readb(void *opaque, target_phys_addr_t addr)
>>> @@ -664,18 +696,22 @@ static uint32_t apic_mem_readl(void *opaque, target_phys_addr_t addr)
>>>          val = 0x11 | ((APIC_LVT_NB - 1) << 16); /* version 0x11 */
>>>          break;
>>>      case 0x08:
>>> +        qemu_mutex_lock_cpu(s->cpu_env);
>>>          apic_sync_vapic(s, SYNC_FROM_VAPIC);
>>>          if (apic_report_tpr_access) {
>>>              cpu_report_tpr_access(s->cpu_env, TPR_ACCESS_READ);
>>>          }
>>>          val = s->tpr;
>>> +        qemu_mutex_unlock_cpu(s->cpu_env);
>>>          break;
>>>      case 0x09:
>>>          val = apic_get_arb_pri(s);
>>>          break;
>>>      case 0x0a:
>>>          /* ppr */
>>> +        qemu_mutex_lock_cpu(s->cpu_env);
>>>          val = apic_get_ppr(s);
>>> +        qemu_mutex_unlock_cpu(s->cpu_env);
>>>          break;
>>>      case 0x0b:
>>>          val = 0;
>>> @@ -712,7 +748,9 @@ static uint32_t apic_mem_readl(void *opaque, target_phys_addr_t addr)
>>>          val = s->initial_count;
>>>          break;
>>>      case 0x39:
>>> +        qemu_mutex_lock_cpu(s->cpu_env);
>>>          val = apic_get_current_count(s);
>>> +        qemu_mutex_unlock_cpu(s->cpu_env);
>>>          break;
>>>      case 0x3e:
>>>          val = s->divide_conf;
>>> @@ -767,18 +805,22 @@ static void apic_mem_writel(void *opaque, target_phys_addr_t addr, uint32_t val)
>>>      case 0x03:
>>>          break;
>>>      case 0x08:
>>> +        qemu_mutex_lock_cpu(s->cpu_env);
>>>          if (apic_report_tpr_access) {
>>>              cpu_report_tpr_access(s->cpu_env, TPR_ACCESS_WRITE);
>>>          }
>>>          s->tpr = val;
>>>          apic_sync_vapic(s, SYNC_TO_VAPIC);
>>>          apic_update_irq(s);
>>> +        qemu_mutex_unlock_cpu(s->cpu_env);
>>>          break;
>>>      case 0x09:
>>>      case 0x0a:
>>>          break;
>>>      case 0x0b: /* EOI */
>>> +        qemu_mutex_lock_cpu(s->cpu_env);
>>>          apic_eoi(s);
>>> +        qemu_mutex_unlock_cpu(s->cpu_env);
>>>          break;
>>>      case 0x0d:
>>>          s->log_dest = val >> 24;
>>> @@ -787,8 +829,10 @@ static void apic_mem_writel(void *opaque, target_phys_addr_t addr, uint32_t val)
>>>          s->dest_mode = val >> 28;
>>>          break;
>>>      case 0x0f:
>>> +        qemu_mutex_lock_cpu(s->cpu_env);
>>>          s->spurious_vec = val & 0x1ff;
>>>          apic_update_irq(s);
>>> +        qemu_mutex_unlock_cpu(s->cpu_env);
>>>          break;
>>>      case 0x10 ... 0x17:
>>>      case 0x18 ... 0x1f:
>>> @@ -807,24 +851,30 @@ static void apic_mem_writel(void *opaque, target_phys_addr_t addr, uint32_t val)
>>>      case 0x32 ... 0x37:
>>>          {
>>>              int n = index - 0x32;
>>> +            qemu_mutex_lock_cpu(s->cpu_env);
>>>              s->lvt[n] = val;
>>>              if (n == APIC_LVT_TIMER)
>>>                  apic_timer_update(s, qemu_get_clock_ns(vm_clock));
>>> +            qemu_mutex_unlock_cpu(s->cpu_env);
>>>          }
>>>          break;
>>>      case 0x38:
>>> +        qemu_mutex_lock_cpu(s->cpu_env);
>>>          s->initial_count = val;
>>>          s->initial_count_load_time = qemu_get_clock_ns(vm_clock);
>>>          apic_timer_update(s, s->initial_count_load_time);
>>> +        qemu_mutex_unlock_cpu(s->cpu_env);
>>>          break;
>>>      case 0x39:
>>>          break;
>>>      case 0x3e:
>>>          {
>>>              int v;
>>> +            qemu_mutex_lock_cpu(s->cpu_env);
>>>              s->divide_conf = val & 0xb;
>>>              v = (s->divide_conf & 3) | ((s->divide_conf >> 1) & 4);
>>>              s->count_shift = (v + 1) & 7;
>>> +            qemu_mutex_unlock_cpu(s->cpu_env);
>>>          }
>>>          break;
>>>      default:
>>> diff --git a/hw/pc.c b/hw/pc.c
>>> index 4d34a33..5e7350c 100644
>>> --- a/hw/pc.c
>>> +++ b/hw/pc.c
>>> @@ -147,7 +147,7 @@ void cpu_smm_update(CPUX86State *env)
>>>      smm_set(!!(env->hflags & HF_SMM_MASK), smm_arg);
>>>  }
>>>
>>> -
>>> +/* Called by kvm_cpu_exec(), already with lock protection */
>>>  /* IRQ handling */
>>>  int cpu_get_pic_interrupt(CPUX86State *env)
>>>  {
>>> @@ -173,9 +173,15 @@ static void pic_irq_request(void *opaque, int irq, int level)
>>>      DPRINTF("pic_irqs: %s irq %d\n", level ? "raise" : "lower", irq);
>>>      if (env->apic_state) {
>>>          while (env) {
>>> +            if (!qemu_cpu_is_self(env)) {
>>> +                qemu_mutex_lock_cpu(env);
>>> +            }
>>>              if (apic_accept_pic_intr(env->apic_state)) {
>>>                  apic_deliver_pic_intr(env->apic_state, level);
>>>              }
>>> +            if (!qemu_cpu_is_self(env)) {
>>> +                qemu_mutex_unlock_cpu(env);
>>> +            }
>>>              env = env->next_cpu;
>>>          }
>>>      } else {
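As a roadmap for the kvm-all.c hunks below: the final shape I am driving kvm_cpu_exec() toward is roughly the following sketch (my proposal, not code from the patch; only two exit reasons shown, error paths and the remaining cases omitted):

    do {
        kvm_arch_pre_run(env, run);            /* vcpu state: under cpu_lock */
        qemu_mutex_unlock(env->cpu_lock);

        run_ret = kvm_vcpu_ioctl(env, KVM_RUN, 0);

        qemu_mutex_lock(env->cpu_lock);
        kvm_arch_post_run(env, run);
        qemu_mutex_unlock_cpu(env);

        qemu_mutex_lock_iothread();            /* device work: short BQL section */
        kvm_flush_coalesced_mmio_buffer();
        switch (run->exit_reason) {
        case KVM_EXIT_IO:
            kvm_handle_io(run->io.port,
                          (uint8_t *)run + run->io.data_offset,
                          run->io.direction, run->io.size, run->io.count);
            ret = 0;
            break;
        case KVM_EXIT_MMIO:
            cpu_physical_memory_rw(run->mmio.phys_addr, run->mmio.data,
                                   run->mmio.len, run->mmio.is_write);
            ret = 0;
            break;
        /* ... other exit reasons as today ... */
        }
        qemu_mutex_unlock_iothread();

        qemu_mutex_lock_cpu(env);
    } while (ret == 0);

That is, cpu_lock stays a vcpu-thread affair, and the BQL is taken only for the device and memory accesses.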
>>> diff --git a/kvm-all.c b/kvm-all.c
>>> index 9b73ccf..dc9aa54 100644
>>> --- a/kvm-all.c
>>> +++ b/kvm-all.c
>>> @@ -29,6 +29,7 @@
>>>  #include "bswap.h"
>>>  #include "memory.h"
>>>  #include "exec-memory.h"
>>> +#include "qemu-thread.h"
>>>
>>>  /* This check must be after config-host.h is included */
>>>  #ifdef CONFIG_EVENTFD
>>> @@ -1252,12 +1253,15 @@ int kvm_cpu_exec(CPUArchState *env)
>>>               */
>>>              qemu_cpu_kick_self();
>>>          }
>>> -        qemu_mutex_unlock_iothread();
>>> +        qemu_mutex_unlock(env->cpu_lock);
>>>
>>>          run_ret = kvm_vcpu_ioctl(env, KVM_RUN, 0);
>>>
>>> -        qemu_mutex_lock_iothread();
>>> +        qemu_mutex_lock(env->cpu_lock);
>>>          kvm_arch_post_run(env, run);
>>> +        qemu_mutex_unlock_cpu(env);
>>> +
>>> +        qemu_mutex_lock_iothread();
>>>
>>>          kvm_flush_coalesced_mmio_buffer();
>>>
>>> @@ -1265,6 +1269,8 @@ int kvm_cpu_exec(CPUArchState *env)
>>>          if (run_ret == -EINTR || run_ret == -EAGAIN) {
>>>              DPRINTF("io window exit\n");
>>>              ret = EXCP_INTERRUPT;
>>> +            qemu_mutex_unlock_iothread();
>>> +            qemu_mutex_lock_cpu(env);
>>>              break;
>>>          }
>>>          fprintf(stderr, "error: kvm run failed %s\n",
>>> @@ -1312,6 +1318,9 @@ int kvm_cpu_exec(CPUArchState *env)
>>>              ret = kvm_arch_handle_exit(env, run);
>>>              break;
>>>          }
>>> +
>>> +        qemu_mutex_unlock_iothread();
>>> +        qemu_mutex_lock_cpu(env);
>>>      } while (ret == 0);
>>
>> This really confuses me.
>>
>> Shouldn't we be moving qemu_mutex_unlock_iothread() to the very top of
>> kvm_cpu_exec()?
>>
> Sorry, the code is not arranged well enough yet, but I think the final
> style will be like this:
>
>     do {
>         qemu_mutex_lock_iothread();
>         kvm_flush_coalesced_mmio_buffer();
>         qemu_mutex_unlock_iothread();
>
>         switch (run->exit_reason) {
>         case KVM_EXIT_IO:
>             qemu_mutex_lock_iothread();
>             kvm_handle_io();
>             qemu_mutex_unlock_iothread();
>             break;
>         case KVM_EXIT_MMIO:
>             qemu_mutex_lock_iothread();
>             cpu_physical_memory_rw();
>             qemu_mutex_unlock_iothread();
>             break;
>         ................
>         }
>     } while ();
>
> Right? Can we push the qemu_mutex_lock_iothread() out of the do {}?
>
>> There's only a fixed amount of state that gets manipulated in
>> kvm_cpu_exec() so shouldn't we try to specifically describe what state is
>> covered by cpu_lock?
>>
> In the current code, apart from run_on_cpu() writing env->queued_work_last
> while flush_queued_work() reads it, the BQL protects the CPUArchState and
> the DeviceState from the other threads. As to the CPUArchState, the
> emulated registers are local, so the only part that can be accessed by
> other threads is the irq emulation. These components include env's
> interrupt_request, eflags, halted, exit_request and exception_injected.
> All of them are decided by env->apic_state, so we consider them an
> atomic context; that is to say, while env's context is being updated,
> we do not allow apic_state to change.
>
>> What's your strategy for ensuring that all accesses to that state are
>> protected by cpu_lock?
>>
> The cpu_lock is a lock for env->apic_state and all of the related irq
> emulation context, and the apic_state is the only entrance through
> which other threads can affect this env's irq emulation context. So
> when other threads want to access this_env->apic_state, they must
> first acquire the cpu_lock.
>
> Thanks and regards,
> pingfan
>
>> Regards,
>>
>> Anthony Liguori
>>
>>>
>>>      if (ret < 0) {
>>
>>
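P.S. To restate the rule from the discussion above in code form (a sketch only; target_env and target_apic are placeholder names, not identifiers from the patch):

    /* Any thread that is not the owning vcpu thread must take the
     * target's cpu_lock before touching its apic_state. */
    if (!qemu_cpu_is_self(target_env)) {
        qemu_mutex_lock_cpu(target_env);
    }
    apic_set_irq(target_apic, vector_num, trigger_mode);
    if (!qemu_cpu_is_self(target_env)) {
        qemu_mutex_unlock_cpu(target_env);
    }

This is the same pattern that pic_irq_request() uses in the hw/pc.c hunk.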