On 28/01/2015 06:45, Fam Zheng wrote:
> On Thu, 01/22 15:47, Paolo Bonzini wrote:
>> Note that even after this patch, most callers of address_space_*
>> functions must still be under the big QEMU lock, otherwise the memory
>> region returned by address_space_translate can disappear as soon as
>> address_space_translate returns.  This will be fixed in the next part
>> of this series.
>>
>> Signed-off-by: Paolo Bonzini <pbonz...@redhat.com>
>> ---
>>  cpu-exec.c              | 13 ++++++++++++-
>>  cpus.c                  |  2 +-
>>  cputlb.c                |  8 ++++++--
>>  exec.c                  | 34 ++++++++++++++++++++++++++--------
>>  hw/i386/intel_iommu.c   |  3 +++
>>  hw/pci-host/apb.c       |  1 +
>>  hw/ppc/spapr_iommu.c    |  1 +
>>  include/exec/exec-all.h |  1 +
>>  8 files changed, 51 insertions(+), 12 deletions(-)
>>
>> diff --git a/cpu-exec.c b/cpu-exec.c
>> index 12473ff..edc5eb9 100644
>> --- a/cpu-exec.c
>> +++ b/cpu-exec.c
>> @@ -26,6 +26,7 @@
>>  #include "qemu/timer.h"
>>  #include "exec/address-spaces.h"
>>  #include "exec/memory-internal.h"
>> +#include "qemu/rcu.h"
>>
>>  /* -icount align implementation. */
>>
>> @@ -149,8 +150,15 @@ void cpu_resume_from_signal(CPUState *cpu, void *puc)
>>
>>  void cpu_reload_memory_map(CPUState *cpu)
>>  {
>> +    AddressSpaceDispatch *d;
>> +
>> +    if (qemu_in_vcpu_thread()) {
>> +        rcu_read_unlock();
>
> Could you explain why we need to split the critical section? Maybe a
> line of comment would help here.
It is needed because otherwise the guest could prolong the critical
section as much as it desires.  Currently this is prevented by the I/O
thread kicking the VCPU thread (iothread_requesting_mutex,
qemu_cpu_kick_thread), but that will go away once TCG execution moves
out of the global mutex.

This pair matches cpu_exec's rcu_read_lock()/rcu_read_unlock(), which
only protects cpu->as->dispatch.  Since we reload it below, we can
split the critical section.
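Something along these lines, perhaps (only a sketch: the code is what
the patch already does, the comment wording is just a suggestion):

    void cpu_reload_memory_map(CPUState *cpu)
    {
        AddressSpaceDispatch *d;

        if (qemu_in_vcpu_thread()) {
            /* Do not let the guest prolong the RCU critical section
             * indefinitely: briefly leave it, so that a grace period
             * can elapse and pending call_rcu callbacks can run.
             * This is safe because the critical section only protects
             * cpu->as->dispatch, which we reload right below anyway.
             */
            rcu_read_unlock();
            rcu_read_lock();
        }

        /* The CPU and TLB are protected by the iothread lock. */
        d = atomic_rcu_read(&cpu->as->dispatch);
        cpu->memory_dispatch = d;
        tlb_flush(cpu, 1);
    }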
Paolo

>> +        rcu_read_lock();
>> +    }
>> +
>>      /* The CPU and TLB are protected by the iothread lock. */
>> -    AddressSpaceDispatch *d = cpu->as->dispatch;
>> +    d = atomic_rcu_read(&cpu->as->dispatch);
>>      cpu->memory_dispatch = d;
>>      tlb_flush(cpu, 1);
>>  }
>> @@ -365,6 +373,8 @@ int cpu_exec(CPUArchState *env)
>>       * an instruction scheduling constraint on modern architectures.  */
>>      smp_mb();
>>
>> +    rcu_read_lock();
>> +
>>      if (unlikely(exit_request)) {
>>          cpu->exit_request = 1;
>>      }
>> @@ -567,6 +577,7 @@ int cpu_exec(CPUArchState *env)
>>      } /* for(;;) */
>>
>>      cc->cpu_exec_exit(cpu);
>> +    rcu_read_unlock();
>>
>>      /* fail safe : never use current_cpu outside cpu_exec() */
>>      current_cpu = NULL;
>> diff --git a/cpus.c b/cpus.c
>> index 3a5323b..b02c793 100644
>> --- a/cpus.c
>> +++ b/cpus.c
>> @@ -1121,7 +1121,7 @@ bool qemu_cpu_is_self(CPUState *cpu)
>>      return qemu_thread_is_self(cpu->thread);
>>  }
>>
>> -static bool qemu_in_vcpu_thread(void)
>> +bool qemu_in_vcpu_thread(void)
>>  {
>>      return current_cpu && qemu_cpu_is_self(current_cpu);
>>  }
>> diff --git a/cputlb.c b/cputlb.c
>> index f92db5e..38f2151 100644
>> --- a/cputlb.c
>> +++ b/cputlb.c
>> @@ -243,8 +243,12 @@ static void tlb_add_large_page(CPUArchState *env, target_ulong vaddr,
>>  }
>>
>>  /* Add a new TLB entry. At most one entry for a given virtual address
>> -   is permitted. Only a single TARGET_PAGE_SIZE region is mapped, the
>> -   supplied size is only used by tlb_flush_page. */
>> + * is permitted. Only a single TARGET_PAGE_SIZE region is mapped, the
>> + * supplied size is only used by tlb_flush_page.
>> + *
>> + * Called from TCG-generated code, which is under an RCU read-side
>> + * critical section.
>> + */
>>  void tlb_set_page(CPUState *cpu, target_ulong vaddr,
>>                    hwaddr paddr, int prot,
>>                    int mmu_idx, target_ulong size)
>> diff --git a/exec.c b/exec.c
>> index 762ec76..262e8bc 100644
>> --- a/exec.c
>> +++ b/exec.c
>> @@ -115,6 +115,8 @@ struct PhysPageEntry {
>>  typedef PhysPageEntry Node[P_L2_SIZE];
>>
>>  typedef struct PhysPageMap {
>> +    struct rcu_head rcu;
>> +
>>      unsigned sections_nb;
>>      unsigned sections_nb_alloc;
>>      unsigned nodes_nb;
>> @@ -124,6 +126,8 @@ typedef struct PhysPageMap {
>>  } PhysPageMap;
>>
>>  struct AddressSpaceDispatch {
>> +    struct rcu_head rcu;
>> +
>>      /* This is a multi-level map on the physical address space.
>>       * The bottom level has pointers to MemoryRegionSections.
>>       */
>> @@ -315,6 +319,7 @@ bool memory_region_is_unassigned(MemoryRegion *mr)
>>          && mr != &io_mem_watch;
>>  }
>>
>> +/* Called from RCU critical section */
>>  static MemoryRegionSection *address_space_lookup_region(AddressSpaceDispatch *d,
>>                                                          hwaddr addr,
>>                                                          bool resolve_subpage)
>> @@ -330,6 +335,7 @@ static MemoryRegionSection *address_space_lookup_region(AddressSpaceDispatch *d,
>>      return section;
>>  }
>>
>> +/* Called from RCU critical section */
>>  static MemoryRegionSection *
>>  address_space_translate_internal(AddressSpaceDispatch *d, hwaddr addr, hwaddr *xlat,
>>                                   hwaddr *plen, bool resolve_subpage)
>> @@ -370,8 +376,10 @@ MemoryRegion *address_space_translate(AddressSpace *as, hwaddr addr,
>>      MemoryRegion *mr;
>>      hwaddr len = *plen;
>>
>> +    rcu_read_lock();
>>      for (;;) {
>> -        section = address_space_translate_internal(as->dispatch, addr, &addr, plen, true);
>> +        AddressSpaceDispatch *d = atomic_rcu_read(&as->dispatch);
>> +        section = address_space_translate_internal(d, addr, &addr, plen, true);
>>          mr = section->mr;
>>
>>          if (!mr->iommu_ops) {
>> @@ -397,9 +405,11 @@ MemoryRegion *address_space_translate(AddressSpace *as, hwaddr addr,
>>
>>      *plen = len;
>>      *xlat = addr;
>> +    rcu_read_unlock();
>>      return mr;
>>  }
>>
>> +/* Called from RCU critical section */
>>  MemoryRegionSection *
>>  address_space_translate_for_iotlb(CPUState *cpu, hwaddr addr,
>>                                    hwaddr *xlat, hwaddr *plen)
>> @@ -852,6 +862,7 @@ static void cpu_physical_memory_set_dirty_tracking(bool enable)
>>      in_migration = enable;
>>  }
>>
>> +/* Called from RCU critical section */
>>  hwaddr memory_region_section_get_iotlb(CPUState *cpu,
>>                                         MemoryRegionSection *section,
>>                                         target_ulong vaddr,
>> @@ -1963,7 +1974,8 @@ static uint16_t dummy_section(PhysPageMap *map, AddressSpace *as,
>>
>>  MemoryRegion *iotlb_to_region(CPUState *cpu, hwaddr index)
>>  {
>> -    MemoryRegionSection *sections = cpu->memory_dispatch->map.sections;
>> +    AddressSpaceDispatch *d = atomic_rcu_read(&cpu->memory_dispatch);
>> +    MemoryRegionSection *sections = d->map.sections;
>>
>>      return sections[index & ~TARGET_PAGE_MASK].mr;
>>  }
>> @@ -1999,6 +2011,12 @@ static void mem_begin(MemoryListener *listener)
>>      as->next_dispatch = d;
>>  }
>>
>> +static void address_space_dispatch_free(AddressSpaceDispatch *d)
>> +{
>> +    phys_sections_free(&d->map);
>
> If I understand it correctly, this code doesn't hold the iothread lock
> when releasing the memory region, but one of the memory region
> destructors, memory_region_destructor_ram_from_ptr, assumes that it is
> held:
>
> void qemu_ram_free_from_ptr(ram_addr_t addr)
> {
>     RAMBlock *block;
>
>     /* This assumes the iothread lock is taken here too.  */
>     qemu_mutex_lock_ramlist();
>     QTAILQ_FOREACH(block, &ram_list.blocks, next) {
> ...
>
> Is the comment stale, or did I miss something?
>
> Fam

>> +    g_free(d);
>> +}
>> +
>>  static void mem_commit(MemoryListener *listener)
>>  {
>>      AddressSpace *as = container_of(listener, AddressSpace, dispatch_listener);
>> @@ -2007,11 +2025,9 @@ static void mem_commit(MemoryListener *listener)
>>
>>      phys_page_compact_all(next, next->map.nodes_nb);
>>
>> -    as->dispatch = next;
>> -
>> +    atomic_rcu_set(&as->dispatch, next);
>>      if (cur) {
>> -        phys_sections_free(&cur->map);
>> -        g_free(cur);
>> +        call_rcu(cur, address_space_dispatch_free, rcu);
>>      }
>>  }
>>
>> @@ -2066,8 +2082,10 @@ void address_space_destroy_dispatch(AddressSpace *as)
>>      AddressSpaceDispatch *d = as->dispatch;
>>
>>      memory_listener_unregister(&as->dispatch_listener);
>> -    g_free(d);
>> -    as->dispatch = NULL;
>> +    atomic_rcu_set(&as->dispatch, NULL);
>> +    if (d) {
>> +        call_rcu(d, address_space_dispatch_free, rcu);
>> +    }
>>  }
>>
>>  static void memory_map_init(void)
>> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
>> index 0a4282a..7da70ff 100644
>> --- a/hw/i386/intel_iommu.c
>> +++ b/hw/i386/intel_iommu.c
>> @@ -745,6 +745,9 @@ static inline bool vtd_is_interrupt_addr(hwaddr addr)
>>
>>  /* Map dev to context-entry then do a paging-structures walk to do a iommu
>>   * translation.
>> + *
>> + * Called from RCU critical section.
>> + *
>>   * @bus_num: The bus number
>>   * @devfn: The devfn, which is the combined of device and function number
>>   * @is_write: The access is a write operation
>> diff --git a/hw/pci-host/apb.c b/hw/pci-host/apb.c
>> index f573875..832b6c7 100644
>> --- a/hw/pci-host/apb.c
>> +++ b/hw/pci-host/apb.c
>> @@ -205,6 +205,7 @@ static AddressSpace *pbm_pci_dma_iommu(PCIBus *bus, void *opaque, int devfn)
>>      return &is->iommu_as;
>>  }
>>
>> +/* Called from RCU critical section */
>>  static IOMMUTLBEntry pbm_translate_iommu(MemoryRegion *iommu, hwaddr addr,
>>                                           bool is_write)
>>  {
>> diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c
>> index da47474..ba003da 100644
>> --- a/hw/ppc/spapr_iommu.c
>> +++ b/hw/ppc/spapr_iommu.c
>> @@ -59,6 +59,7 @@ static sPAPRTCETable *spapr_tce_find_by_liobn(uint32_t liobn)
>>      return NULL;
>>  }
>>
>> +/* Called from RCU critical section */
>>  static IOMMUTLBEntry spapr_tce_translate_iommu(MemoryRegion *iommu, hwaddr addr,
>>                                                 bool is_write)
>>  {
>> diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
>> index bb3fd37..8eb0db3 100644
>> --- a/include/exec/exec-all.h
>> +++ b/include/exec/exec-all.h
>> @@ -96,6 +96,7 @@ void tb_invalidate_phys_page_range(tb_page_addr_t start, tb_page_addr_t end,
>>  void tb_invalidate_phys_range(tb_page_addr_t start, tb_page_addr_t end,
>>                                int is_cpu_write_access);
>>  #if !defined(CONFIG_USER_ONLY)
>> +bool qemu_in_vcpu_thread(void);
>>  void cpu_reload_memory_map(CPUState *cpu);
>>  void tcg_cpu_address_space_init(CPUState *cpu, AddressSpace *as);
>>  /* cputlb.c */
>> --
>> 1.8.3.1
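For reviewers new to the RCU APIs used above, the lifecycle this patch
sets up for AddressSpaceDispatch boils down to the following condensed
sketch.  It uses the same primitives as the patch (atomic_rcu_read,
atomic_rcu_set, call_rcu with an embedded rcu_head); the MyDispatch
type and the my_* helper names are made up for illustration, not code
from the series:

    #include <glib.h>
    #include "qemu/atomic.h"
    #include "qemu/rcu.h"

    typedef struct MyDispatch {
        struct rcu_head rcu;   /* embedded head, named by call_rcu() below */
        int payload;
    } MyDispatch;

    static MyDispatch *current_map;

    static void my_dispatch_free(MyDispatch *d)
    {
        g_free(d);
    }

    /* Updater, under the big QEMU lock, like mem_commit(): publish the
     * new table, then defer freeing the old one until every reader
     * that could still see it has left its critical section. */
    static void my_commit(MyDispatch *next)
    {
        MyDispatch *cur = current_map;

        atomic_rcu_set(&current_map, next);
        if (cur) {
            call_rcu(cur, my_dispatch_free, rcu);
        }
    }

    /* Reader: the pointer read here stays valid until
     * rcu_read_unlock(), even if an updater concurrently installs a
     * new table. */
    static int my_read(void)
    {
        int val;

        rcu_read_lock();
        val = atomic_rcu_read(&current_map)->payload;
        rcu_read_unlock();
        return val;
    }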