On Mon, May 13, 2013 at 5:20 PM, Paolo Bonzini <pbonz...@redhat.com> wrote: > Il 13/05/2013 05:21, Liu Ping Fan ha scritto: >> From: Liu Ping Fan <pingf...@linux.vnet.ibm.com> >> >> Now, each AddressSpaceDispatch has its own radix-tree, and all of them >> lie on phys_section[] and phys_map_nodes[]. When we want lockless >> mmio dispatch, we need something like RCU. >> >> Acheive this with PhysPageTable which contains all of the info for all >> radix trees. After all address space listeners update (ie. excluding the >> readers) we switch from PhysPageTable *cur_pgtbl to *next_pgtbl. >> (The real RCU style is adopt by listener, see next patch) >> >> Signed-off-by: Liu Ping Fan <pingf...@linux.vnet.ibm.com> >> --- >> exec.c | 197 >> +++++++++++++++++++++++++------------------------ >> include/exec/memory.h | 2 + >> memory.c | 2 + >> 3 files changed, 106 insertions(+), 95 deletions(-) >> >> diff --git a/exec.c b/exec.c >> index c5f8082..bb4e540 100644 >> --- a/exec.c >> +++ b/exec.c >> @@ -80,23 +80,34 @@ int use_icount; >> #if !defined(CONFIG_USER_ONLY) >> >> #define SUBSECTION_IDX(addr) ((addr) & ~TARGET_PAGE_MASK) >> -#define PHYS_SECTION_ID(psection) ((psection) - phys_sections) >> +#define PHYS_SECTION_ID(psection, base) ((psection) - base->phys_sections) >> >> typedef struct PhysSection { >> MemoryRegionSection section; >> uint16_t *sub_section; >> } PhysSection; >> >> -static PhysSection *phys_sections; >> -static unsigned phys_sections_nb, phys_sections_nb_alloc; >> -static uint16_t phys_section_unassigned; >> -static uint16_t phys_section_notdirty; >> -static uint16_t phys_section_rom; >> -static uint16_t phys_section_watch; >> +typedef PhysPageEntry Node[L2_SIZE]; >> >> -/* Simple allocator for PhysPageEntry nodes */ >> -static PhysPageEntry (*phys_map_nodes)[L2_SIZE]; >> -static unsigned phys_map_nodes_nb, phys_map_nodes_nb_alloc; >> +typedef struct PhysPageTable PhysPageTable; >> + >> +struct PhysPageTable { >> + int ref; >> + PhysSection *phys_sections; >> 
+ unsigned phys_sections_nb; >> + unsigned phys_sections_nb_alloc; >> + uint16_t phys_section_unassigned; >> + uint16_t phys_section_notdirty; >> + uint16_t phys_section_rom; >> + uint16_t phys_section_watch; > > These four should be constants. Please make them #defines and add > assertions that they have the expected values (in a separate patch). > Something like the following? #define phys_section_unassigned 0 t = dummy_section(&io_mem_unassigned); assert(t == phys_section_unassigned);
>> + >> + Node *phys_map_nodes; >> + unsigned phys_map_nodes_nb; >> + unsigned phys_map_nodes_nb_alloc; >> +}; >> +static PhysPageTable *cur_pgtbl; >> +static PhysPageTable *next_pgtbl; > > You shouldn't need cur_pgtbl. Instead, each AddressSpaceDispatch should > have a pointer to its own cur_pgtbl. In the commit hook you can then > take a lock, unref the old table, assign cur_pgtbl = next_pgtbl and ref > the new one. > Here is a trick: all AddressSpaceDispatches update their radix trees at the same time, so the reclaimer runs only after all of them have committed, and drops the old phys_map_nodes[] (which holds all of the radix trees) in one go. Is that OK? Or would you still prefer that each AddressSpaceDispatch have its own space for its radix tree? > The lock must also be taken in address_space_translate, together with a > relatively expensive ref/unref pair (two atomic operations). We > probably need real RCU so that we can skip the ref/unref. > Currently, without RCU, do we rely on as->lock, so that there is no need to ref/unref cur_pgtbl? With RCU, I think we need ref/unref on mr, since mr->ops->read/write can block and should be excluded from the RCU section. 
>> >> #define PHYS_MAP_NODE_NIL (((uint16_t)~0) >> 1) >> >> @@ -111,13 +122,13 @@ static MemoryRegion io_mem_watch; >> >> static void phys_map_node_reserve(unsigned nodes) >> { >> - if (phys_map_nodes_nb + nodes > phys_map_nodes_nb_alloc) { >> + if (next_pgtbl->phys_map_nodes_nb + nodes > >> next_pgtbl->phys_map_nodes_nb_alloc) { >> typedef PhysPageEntry Node[L2_SIZE]; >> - phys_map_nodes_nb_alloc = MAX(phys_map_nodes_nb_alloc * 2, 16); >> - phys_map_nodes_nb_alloc = MAX(phys_map_nodes_nb_alloc, >> - phys_map_nodes_nb + nodes); >> - phys_map_nodes = g_renew(Node, phys_map_nodes, >> - phys_map_nodes_nb_alloc); >> + next_pgtbl->phys_map_nodes_nb_alloc = >> MAX(next_pgtbl->phys_map_nodes_nb_alloc * 2, 16); >> + next_pgtbl->phys_map_nodes_nb_alloc = >> MAX(next_pgtbl->phys_map_nodes_nb_alloc, >> + next_pgtbl->phys_map_nodes_nb + >> nodes); >> + next_pgtbl->phys_map_nodes = g_renew(Node, >> next_pgtbl->phys_map_nodes, >> + next_pgtbl->phys_map_nodes_nb_alloc); >> } >> } >> >> @@ -126,22 +137,16 @@ static uint16_t phys_map_node_alloc(void) >> unsigned i; >> uint16_t ret; >> >> - ret = phys_map_nodes_nb++; >> + ret = next_pgtbl->phys_map_nodes_nb++; >> assert(ret != PHYS_MAP_NODE_NIL); >> - assert(ret != phys_map_nodes_nb_alloc); >> + assert(ret != next_pgtbl->phys_map_nodes_nb_alloc); >> for (i = 0; i < L2_SIZE; ++i) { >> - phys_map_nodes[ret][i].is_leaf = 0; >> - phys_map_nodes[ret][i].ptr = PHYS_MAP_NODE_NIL; >> + next_pgtbl->phys_map_nodes[ret][i].is_leaf = 0; >> + next_pgtbl->phys_map_nodes[ret][i].ptr = PHYS_MAP_NODE_NIL; >> } >> return ret; >> } >> >> -static void phys_map_nodes_reset(void) >> -{ >> - phys_map_nodes_nb = 0; >> -} >> - >> - >> static void phys_page_set_level(PhysPageEntry *lp, hwaddr *index, >> hwaddr *nb, uint16_t leaf, >> int level) >> @@ -152,15 +157,15 @@ static void phys_page_set_level(PhysPageEntry *lp, >> hwaddr *index, >> >> if (!lp->is_leaf && lp->ptr == PHYS_MAP_NODE_NIL) { >> lp->ptr = phys_map_node_alloc(); >> - p = 
phys_map_nodes[lp->ptr]; >> + p = next_pgtbl->phys_map_nodes[lp->ptr]; >> if (level == 0) { >> for (i = 0; i < L2_SIZE; i++) { >> p[i].is_leaf = 1; >> - p[i].ptr = phys_section_unassigned; >> + p[i].ptr = next_pgtbl->phys_section_unassigned; >> } >> } >> } else { >> - p = phys_map_nodes[lp->ptr]; >> + p = next_pgtbl->phys_map_nodes[lp->ptr]; >> } >> lp = &p[(*index >> (level * L2_BITS)) & (L2_SIZE - 1)]; >> >> @@ -192,11 +197,13 @@ static PhysSection >> *phys_section_find(AddressSpaceDispatch *d, >> { >> PhysPageEntry lp = d->phys_map; >> PhysPageEntry *p; >> + PhysSection *phys_sections = cur_pgtbl->phys_sections; >> + Node *phys_map_nodes = cur_pgtbl->phys_map_nodes; >> int i; >> >> for (i = P_L2_LEVELS - 1; i >= 0 && !lp.is_leaf; i--) { >> if (lp.ptr == PHYS_MAP_NODE_NIL) { >> - return &phys_sections[phys_section_unassigned]; >> + return &phys_sections[cur_pgtbl->phys_section_unassigned]; >> } >> p = phys_map_nodes[lp.ptr]; >> lp = p[(index >> (i * L2_BITS)) & (L2_SIZE - 1)]; >> @@ -209,6 +216,7 @@ static MemoryRegionSection >> *address_space_lookup_region(AddressSpace *as, >> { >> PhysSection *psection; >> uint16_t idx; >> + PhysSection *phys_sections = cur_pgtbl->phys_sections; >> >> psection = phys_section_find(as->dispatch, addr >> TARGET_PAGE_BITS); >> if (psection->sub_section) { >> @@ -246,7 +254,7 @@ MemoryRegionSection >> *address_space_translate(AddressSpace *as, hwaddr addr, >> | (addr & iotlb.addr_mask)); >> len = MIN(len, (addr | iotlb.addr_mask) - addr + 1); >> if (!iotlb.perm[is_write]) { >> - section = &phys_sections[phys_section_unassigned].section; >> + section = >> &cur_pgtbl->phys_sections[cur_pgtbl->phys_section_unassigned].section; >> break; >> } >> >> @@ -690,12 +698,12 @@ hwaddr memory_region_section_get_iotlb(CPUArchState >> *env, >> iotlb = (memory_region_get_ram_addr(section->mr) & TARGET_PAGE_MASK) >> + xlat; >> if (!section->readonly) { >> - iotlb |= phys_section_notdirty; >> + iotlb |= cur_pgtbl->phys_section_notdirty; >> } else { 
>> - iotlb |= phys_section_rom; >> + iotlb |= cur_pgtbl->phys_section_rom; >> } >> } else { >> - iotlb = PHYS_SECTION_ID(container_of(section, PhysSection, >> section)); >> + iotlb = PHYS_SECTION_ID(container_of(section, PhysSection, >> section), cur_pgtbl); >> iotlb += xlat; >> } >> >> @@ -705,7 +713,7 @@ hwaddr memory_region_section_get_iotlb(CPUArchState *env, >> if (vaddr == (wp->vaddr & TARGET_PAGE_MASK)) { >> /* Avoid trapping reads of pages with a write breakpoint. */ >> if ((prot & PAGE_WRITE) || (wp->flags & BP_MEM_READ)) { >> - iotlb = phys_section_watch + paddr; >> + iotlb = cur_pgtbl->phys_section_watch + paddr; >> *address |= TLB_MMIO; >> break; >> } >> @@ -721,59 +729,40 @@ static int subsection_register(PhysSection *psection, >> uint32_t start, >> uint32_t end, uint16_t section); >> static void subsections_init(PhysSection *psection); >> >> -static void destroy_page_desc(uint16_t section_index) >> +/* Call after all listener has been commit. >> + * we do not walk over tree, just simply drop. 
>> + */ >> +static void destroy_pagetable(PhysPageTable *pgtbl) >> { >> - g_free(phys_sections[section_index].sub_section); >> -} >> - >> -static void destroy_l2_mapping(PhysPageEntry *lp, unsigned level) >> -{ >> - unsigned i; >> - PhysPageEntry *p; >> + int i; >> >> - if (lp->ptr == PHYS_MAP_NODE_NIL) { >> - return; >> - } >> + g_free(pgtbl->phys_map_nodes); >> >> - p = phys_map_nodes[lp->ptr]; >> - for (i = 0; i < L2_SIZE; ++i) { >> - if (!p[i].is_leaf) { >> - destroy_l2_mapping(&p[i], level - 1); >> + for (i = 0; i < pgtbl->phys_sections_nb_alloc; i++) { >> + if (pgtbl->phys_sections[i].sub_section) { >> + g_free(pgtbl->phys_sections[i].sub_section); >> } else { >> - destroy_page_desc(p[i].ptr); >> + memory_region_unref(pgtbl->phys_sections[i].section.mr); >> } >> } >> - lp->is_leaf = 0; >> - lp->ptr = PHYS_MAP_NODE_NIL; >> -} >> + g_free(pgtbl->phys_sections); >> >> -static void destroy_all_mappings(AddressSpaceDispatch *d) >> -{ >> - destroy_l2_mapping(&d->phys_map, P_L2_LEVELS - 1); >> - phys_map_nodes_reset(); >> + g_free(pgtbl); >> } > > This should be a separate patch. > Sorry, this is part of the reclaimer, which performs the destruction of the previous pgtbl. Why should it be separated out? >> -static uint16_t phys_section_add(MemoryRegionSection *section) >> +static uint16_t phys_section_add(MemoryRegionSection *section, >> PhysPageTable *pgtbl) > > phys_section_add will always add to next_pgtbl. > Ok, will drop @pgtbl. 
>> { >> - assert(phys_sections_nb < TARGET_PAGE_SIZE); >> + assert(pgtbl->phys_sections_nb < TARGET_PAGE_SIZE); >> >> - if (phys_sections_nb == phys_sections_nb_alloc) { >> - phys_sections_nb_alloc = MAX(phys_sections_nb_alloc * 2, 16); >> - phys_sections = g_renew(PhysSection, phys_sections, >> - phys_sections_nb_alloc); >> + if (pgtbl->phys_sections_nb == pgtbl->phys_sections_nb_alloc) { >> + pgtbl->phys_sections_nb_alloc = MAX(pgtbl->phys_sections_nb_alloc * >> 2, 16); >> + pgtbl->phys_sections = g_renew(PhysSection, pgtbl->phys_sections, >> + pgtbl->phys_sections_nb_alloc); >> } >> - phys_sections[phys_sections_nb].section = *section; >> - phys_sections[phys_sections_nb].sub_section = NULL; >> + pgtbl->phys_sections[pgtbl->phys_sections_nb].section = *section; >> + pgtbl->phys_sections[pgtbl->phys_sections_nb].sub_section = NULL; >> memory_region_ref(section->mr); >> - return phys_sections_nb++; >> -} >> - >> -static void phys_sections_clear(void) >> -{ >> - while (phys_sections_nb > 0) { >> - PhysSection *phys_section = &phys_sections[--phys_sections_nb]; >> - memory_region_unref(phys_section->section.mr); >> - } >> + return pgtbl->phys_sections_nb++; >> } >> >> static void register_subsection(AddressSpaceDispatch *d, >> @@ -793,18 +782,18 @@ static void register_subsection(AddressSpaceDispatch >> *d, >> psection->section.mr == &io_mem_unassigned); >> >> if (!psection->sub_section) { >> - new_section = phys_section_add(&subsection); >> - psection = &phys_sections[new_section]; >> + new_section = phys_section_add(&subsection, next_pgtbl); >> + psection = &next_pgtbl->phys_sections[new_section]; >> subsections_init(psection); >> phys_page_set(d, base >> TARGET_PAGE_BITS, 1, new_section); >> } else { >> - new_section = PHYS_SECTION_ID(psection); >> + new_section = PHYS_SECTION_ID(psection, next_pgtbl); >> } >> >> - new_subsection = phys_section_add(section); >> + new_subsection = phys_section_add(section, next_pgtbl); >> >> /* phys_section_add invalidates 
psection, reload it */ >> - psection = &phys_sections[new_section]; >> + psection = &next_pgtbl->phys_sections[new_section]; >> start = section->offset_within_address_space & ~TARGET_PAGE_MASK; >> end = start + section->size - 1; >> subsection_register(psection, start, end, new_subsection); >> @@ -816,7 +805,7 @@ static void register_multipage(AddressSpaceDispatch *d, >> MemoryRegionSection *sec >> hwaddr start_addr = section->offset_within_address_space; >> ram_addr_t size = section->size; >> hwaddr addr; >> - uint16_t section_index = phys_section_add(section); >> + uint16_t section_index = phys_section_add(section, next_pgtbl); >> >> assert(size); >> >> @@ -1653,7 +1642,7 @@ static void subsections_init(PhysSection *psection) >> { >> psection->sub_section = g_malloc0(sizeof(uint16_t) * TARGET_PAGE_SIZE); >> subsection_register(psection, 0, TARGET_PAGE_SIZE-1, >> - phys_section_unassigned); >> + next_pgtbl->phys_section_unassigned); >> } >> >> static uint16_t dummy_section(MemoryRegion *mr) >> @@ -1665,12 +1654,12 @@ static uint16_t dummy_section(MemoryRegion *mr) >> .size = UINT64_MAX, >> }; >> >> - return phys_section_add(§ion); >> + return phys_section_add(§ion, next_pgtbl); >> } >> >> MemoryRegion *iotlb_to_region(hwaddr index) >> { >> - return phys_sections[index & ~TARGET_PAGE_MASK].section.mr; >> + return cur_pgtbl->phys_sections[index & ~TARGET_PAGE_MASK].section.mr; >> } >> >> static void io_mem_init(void) >> @@ -1685,21 +1674,40 @@ static void io_mem_init(void) >> "watch", UINT64_MAX); >> } >> >> +void global_begin(void) >> +{ >> + next_pgtbl = g_new0(PhysPageTable, 1); >> + next_pgtbl->ref = 1; >> + next_pgtbl->phys_section_unassigned = dummy_section(&io_mem_unassigned); >> + next_pgtbl->phys_section_notdirty = dummy_section(&io_mem_notdirty); >> + next_pgtbl->phys_section_rom = dummy_section(&io_mem_rom); >> + next_pgtbl->phys_section_watch = dummy_section(&io_mem_watch); >> +} >> + >> +/* other listeners finished */ >> +void global_commit(void) >> +{ 
>> + PhysPageTable *t = cur_pgtbl; >> + >> + cur_pgtbl = next_pgtbl; >> + /* Fix me, currently, we rely on each address space listener agaist its >> + * reader. So when we come here, no readers will touch old >> phys_map_node. >> + * After rcu, should changed to call_rcu() >> + */ >> + if (__sync_sub_and_fetch(&t->ref, 1) == 0) { >> + destroy_pagetable(t); >> + } > > global_commit should not be needed, and global_begin should be simply > the new core_begin. It should unref next_pgtbl and reallocate a new one. > Good idea, thanks. Regards, Pingfan > >> +} >> + >> static void mem_begin(MemoryListener *listener) >> { >> AddressSpaceDispatch *d = container_of(listener, AddressSpaceDispatch, >> listener); >> >> - destroy_all_mappings(d); >> d->phys_map.ptr = PHYS_MAP_NODE_NIL; >> } >> >> static void core_begin(MemoryListener *listener) >> { >> - phys_sections_clear(); >> - phys_section_unassigned = dummy_section(&io_mem_unassigned); >> - phys_section_notdirty = dummy_section(&io_mem_notdirty); >> - phys_section_rom = dummy_section(&io_mem_rom); >> - phys_section_watch = dummy_section(&io_mem_watch); >> } >> >> static void tcg_commit(MemoryListener *listener) >> @@ -1779,7 +1787,6 @@ void address_space_destroy_dispatch(AddressSpace *as) >> AddressSpaceDispatch *d = as->dispatch; >> >> memory_listener_unregister(&d->listener); >> - destroy_l2_mapping(&d->phys_map, P_L2_LEVELS - 1); >> g_free(d); >> as->dispatch = NULL; >> } >> @@ -2386,7 +2393,7 @@ void stl_phys_notdirty(hwaddr addr, uint32_t val) >> >> if (!memory_region_is_ram(section->mr) || section->readonly) { >> if (memory_region_is_ram(section->mr)) { >> - section = &phys_sections[phys_section_rom].section; >> + section = >> &cur_pgtbl->phys_sections[cur_pgtbl->phys_section_rom].section; >> } >> io_mem_write(section->mr, addr, val, 4); >> } else { >> @@ -2422,7 +2429,7 @@ void stq_phys_notdirty(hwaddr addr, uint64_t val) >> >> if (!memory_region_is_ram(section->mr) || section->readonly) { >> if 
(memory_region_is_ram(section->mr)) { >> - section = &phys_sections[phys_section_rom].section; >> + section = >> &cur_pgtbl->phys_sections[cur_pgtbl->phys_section_rom].section; >> } >> #ifdef TARGET_WORDS_BIGENDIAN >> io_mem_write(section->mr, addr, val >> 32, 4); >> @@ -2455,7 +2462,7 @@ static inline void stl_phys_internal(hwaddr addr, >> uint32_t val, >> >> if (!memory_region_is_ram(section->mr) || section->readonly) { >> if (memory_region_is_ram(section->mr)) { >> - section = &phys_sections[phys_section_rom].section; >> + section = >> &cur_pgtbl->phys_sections[cur_pgtbl->phys_section_rom].section; >> } >> #if defined(TARGET_WORDS_BIGENDIAN) >> if (endian == DEVICE_LITTLE_ENDIAN) { >> @@ -2526,7 +2533,7 @@ static inline void stw_phys_internal(hwaddr addr, >> uint32_t val, >> >> if (!memory_region_is_ram(section->mr) || section->readonly) { >> if (memory_region_is_ram(section->mr)) { >> - section = &phys_sections[phys_section_rom].section; >> + section = >> &cur_pgtbl->phys_sections[cur_pgtbl->phys_section_rom].section; >> } >> #if defined(TARGET_WORDS_BIGENDIAN) >> if (endian == DEVICE_LITTLE_ENDIAN) { >> diff --git a/include/exec/memory.h b/include/exec/memory.h >> index b97ace7..cc654fa 100644 >> --- a/include/exec/memory.h >> +++ b/include/exec/memory.h >> @@ -992,6 +992,8 @@ void *address_space_map(AddressSpace *as, hwaddr addr, >> void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len, >> int is_write, hwaddr access_len); >> >> +void global_begin(void); >> +void global_commit(void); >> >> #endif >> >> diff --git a/memory.c b/memory.c >> index 1a86607..da06dfd 100644 >> --- a/memory.c >> +++ b/memory.c >> @@ -805,6 +805,7 @@ void memory_region_transaction_commit(void) >> --memory_region_transaction_depth; >> if (!memory_region_transaction_depth && memory_region_update_pending) { >> memory_region_update_pending = false; >> + global_begin(); >> MEMORY_LISTENER_CALL_GLOBAL(begin, Forward); >> >> QTAILQ_FOREACH(as, &address_spaces, 
address_spaces_link) { >> @@ -812,6 +813,7 @@ void memory_region_transaction_commit(void) >> } >> >> MEMORY_LISTENER_CALL_GLOBAL(commit, Forward); >> + global_commit(); >> } >> } >> >> >