On Tue, May 14, 2013 at 5:27 PM, Paolo Bonzini <pbonz...@redhat.com> wrote: > Il 14/05/2013 05:38, liu ping fan ha scritto: >>>> +struct PhysPageTable { >>>> + int ref; >>>> + PhysSection *phys_sections; >>>> + unsigned phys_sections_nb; >>>> + unsigned phys_sections_nb_alloc; >>>> + uint16_t phys_section_unassigned; >>>> + uint16_t phys_section_notdirty; >>>> + uint16_t phys_section_rom; >>>> + uint16_t phys_section_watch; >>> >>> These four should be constants. Please make them #defines and add >>> assertions that they have the expected values (in a separate patch). >>> >> Things like the following? >> #define phys_section_unassigned 0 >> t = dummy_section(&io_mem_unassigned); >> assert(t ==phys_section_unassigned ); > > Yes. > >>>> + >>>> + Node *phys_map_nodes; >>>> + unsigned phys_map_nodes_nb; >>>> + unsigned phys_map_nodes_nb_alloc; >>>> +}; >>>> +static PhysPageTable *cur_pgtbl; >>>> +static PhysPageTable *next_pgtbl; >>> >>> You shouldn't need cur_pgtbl. Instead, each AddressSpaceDispatch should >>> have a pointer to its own cur_pgtbl. In the commit hook you can then >>> take a lock, unref the old table, assign cur_pgtbl = next_pgtbl and ref >>> the new one. >>> >> Here is a trick. All AddressSpaceDispatch will update its radix tree >> at the same time, so the re-claimer will come after all of them commit >> to drop the old phys_map_node[] which holds the radix trees all at >> once. Is it OK? Or you still prefer each AddressSpaceDispatch has its >> own space for radix tree? > > I'm not sure... I prefer to have more stuff in AddressSpaceDispatch to > make things as streamlined as possible on the read side. > Ok, will implement based on each AddressSpaceDispatch
>>> The lock must also be taken in address_space_translate, together with a >>> relatively expensive ref/unref pair (two atomic operations). We >>> probably need real RCU so that we can skip the ref/unref. >>> >> Currently, without RCU, do we rely on as->lock? So no need ref/unref >> on cur_pgtbl? With RCU, I think we need ref/unref on mr, since >> mr->ops->read/write can block, and should be excluded from RCU >> section. > > Yes, we need ref/unref on the region, but in many cases that will be a > no-op (most important, for access to system RAM). And if we can do the > dispatch in the RCU section, similar to what I proposed in reviewing > your patch 2, we have no ref/unref in the common case of accessing RAM. > >>>> #define PHYS_MAP_NODE_NIL (((uint16_t)~0) >> 1) >>>> >>>> @@ -111,13 +122,13 @@ static MemoryRegion io_mem_watch; >>>> >>>> static void phys_map_node_reserve(unsigned nodes) >>>> { >>>> - if (phys_map_nodes_nb + nodes > phys_map_nodes_nb_alloc) { >>>> + if (next_pgtbl->phys_map_nodes_nb + nodes > >>>> next_pgtbl->phys_map_nodes_nb_alloc) { >>>> typedef PhysPageEntry Node[L2_SIZE]; >>>> - phys_map_nodes_nb_alloc = MAX(phys_map_nodes_nb_alloc * 2, 16); >>>> - phys_map_nodes_nb_alloc = MAX(phys_map_nodes_nb_alloc, >>>> - phys_map_nodes_nb + nodes); >>>> - phys_map_nodes = g_renew(Node, phys_map_nodes, >>>> - phys_map_nodes_nb_alloc); >>>> + next_pgtbl->phys_map_nodes_nb_alloc = >>>> MAX(next_pgtbl->phys_map_nodes_nb_alloc * 2, 16); >>>> + next_pgtbl->phys_map_nodes_nb_alloc = >>>> MAX(next_pgtbl->phys_map_nodes_nb_alloc, >>>> + next_pgtbl->phys_map_nodes_nb + >>>> nodes); >>>> + next_pgtbl->phys_map_nodes = g_renew(Node, >>>> next_pgtbl->phys_map_nodes, >>>> + next_pgtbl->phys_map_nodes_nb_alloc); >>>> } >>>> } >>>> >>>> @@ -126,22 +137,16 @@ static uint16_t phys_map_node_alloc(void) >>>> unsigned i; >>>> uint16_t ret; >>>> >>>> - ret = phys_map_nodes_nb++; >>>> + ret = next_pgtbl->phys_map_nodes_nb++; >>>> assert(ret != PHYS_MAP_NODE_NIL); >>>> - assert(ret != phys_map_nodes_nb_alloc); >>>> + assert(ret != next_pgtbl->phys_map_nodes_nb_alloc); >>>> for (i = 0; i < L2_SIZE; ++i) { >>>> - phys_map_nodes[ret][i].is_leaf = 0; >>>> - phys_map_nodes[ret][i].ptr = PHYS_MAP_NODE_NIL; >>>> + next_pgtbl->phys_map_nodes[ret][i].is_leaf = 0; >>>> + next_pgtbl->phys_map_nodes[ret][i].ptr = PHYS_MAP_NODE_NIL; >>>> } >>>> return ret; >>>> } >>>> >>>> -static void phys_map_nodes_reset(void) >>>> -{ >>>> - phys_map_nodes_nb = 0; >>>> -} >>>> - >>>> - >>>> static void phys_page_set_level(PhysPageEntry *lp, hwaddr *index, >>>> hwaddr *nb, uint16_t leaf, >>>> int level) >>>> @@ -152,15 +157,15 @@ static void phys_page_set_level(PhysPageEntry *lp, >>>> hwaddr *index, >>>> >>>> if (!lp->is_leaf && lp->ptr == PHYS_MAP_NODE_NIL) { >>>> lp->ptr = phys_map_node_alloc(); >>>> - p = phys_map_nodes[lp->ptr]; >>>> + p = next_pgtbl->phys_map_nodes[lp->ptr]; >>>> if (level == 0) { >>>> for (i = 0; i < L2_SIZE; i++) { >>>> p[i].is_leaf = 1; >>>> - p[i].ptr = phys_section_unassigned; >>>> + p[i].ptr = next_pgtbl->phys_section_unassigned; >>>> } >>>> } >>>> } else { >>>> - p = phys_map_nodes[lp->ptr]; >>>> + p = next_pgtbl->phys_map_nodes[lp->ptr]; >>>> } >>>> lp = &p[(*index >> (level * L2_BITS)) & (L2_SIZE - 1)]; >>>> >>>> @@ -192,11 +197,13 @@ static PhysSection >>>> *phys_section_find(AddressSpaceDispatch *d, >>>> { >>>> PhysPageEntry lp = d->phys_map; >>>> PhysPageEntry *p; >>>> + PhysSection *phys_sections = cur_pgtbl->phys_sections; >>>> + Node *phys_map_nodes = cur_pgtbl->phys_map_nodes; >>>> int i; >>>> >>>> for (i = P_L2_LEVELS - 1; i >= 0 && !lp.is_leaf; i--) { >>>> if (lp.ptr == PHYS_MAP_NODE_NIL) { >>>> - return &phys_sections[phys_section_unassigned]; >>>> + return &phys_sections[cur_pgtbl->phys_section_unassigned]; >>>> } >>>> p = phys_map_nodes[lp.ptr]; >>>> lp = p[(index >> (i * L2_BITS)) & (L2_SIZE - 1)]; >>>> @@ -209,6 +216,7 @@ static MemoryRegionSection >>>> *address_space_lookup_region(AddressSpace *as, >>>> { >>>> PhysSection *psection; >>>> uint16_t idx; >>>> + PhysSection *phys_sections = cur_pgtbl->phys_sections; >>>> >>>> psection = phys_section_find(as->dispatch, addr >> TARGET_PAGE_BITS); >>>> if (psection->sub_section) { >>>> @@ -246,7 +254,7 @@ MemoryRegionSection >>>> *address_space_translate(AddressSpace *as, hwaddr addr, >>>> | (addr & iotlb.addr_mask)); >>>> len = MIN(len, (addr | iotlb.addr_mask) - addr + 1); >>>> if (!iotlb.perm[is_write]) { >>>> - section = &phys_sections[phys_section_unassigned].section; >>>> + section = >>>> &cur_pgtbl->phys_sections[cur_pgtbl->phys_section_unassigned].section; >>>> break; >>>> } >>>> >>>> @@ -690,12 +698,12 @@ hwaddr memory_region_section_get_iotlb(CPUArchState >>>> *env, >>>> iotlb = (memory_region_get_ram_addr(section->mr) & >>>> TARGET_PAGE_MASK) >>>> + xlat; >>>> if (!section->readonly) { >>>> - iotlb |= phys_section_notdirty; >>>> + iotlb |= cur_pgtbl->phys_section_notdirty; >>>> } else { >>>> - iotlb |= phys_section_rom; >>>> + iotlb |= cur_pgtbl->phys_section_rom; >>>> } >>>> } else { >>>> - iotlb = PHYS_SECTION_ID(container_of(section, PhysSection, >>>> section)); >>>> + iotlb = PHYS_SECTION_ID(container_of(section, PhysSection, >>>> section), cur_pgtbl); >>>> iotlb += xlat; >>>> } >>>> >>>> @@ -705,7 +713,7 @@ hwaddr memory_region_section_get_iotlb(CPUArchState >>>> *env, >>>> if (vaddr == (wp->vaddr & TARGET_PAGE_MASK)) { >>>> /* Avoid trapping reads of pages with a write breakpoint. */ >>>> if ((prot & PAGE_WRITE) || (wp->flags & BP_MEM_READ)) { >>>> - iotlb = phys_section_watch + paddr; >>>> + iotlb = cur_pgtbl->phys_section_watch + paddr; >>>> *address |= TLB_MMIO; >>>> break; >>>> } >>>> @@ -721,59 +729,40 @@ static int subsection_register(PhysSection >>>> *psection, uint32_t start, >>>> uint32_t end, uint16_t section); >>>> static void subsections_init(PhysSection *psection); >>>> >>>> -static void destroy_page_desc(uint16_t section_index) >>>> +/* Call after all listener has been commit. >>>> + * we do not walk over tree, just simply drop. >>>> + */ >>>> +static void destroy_pagetable(PhysPageTable *pgtbl) >>>> { >>>> - g_free(phys_sections[section_index].sub_section); >>>> -} >>>> - >>>> -static void destroy_l2_mapping(PhysPageEntry *lp, unsigned level) >>>> -{ >>>> - unsigned i; >>>> - PhysPageEntry *p; >>>> + int i; >>>> >>>> - if (lp->ptr == PHYS_MAP_NODE_NIL) { >>>> - return; >>>> - } >>>> + g_free(pgtbl->phys_map_nodes); >>>> >>>> - p = phys_map_nodes[lp->ptr]; >>>> - for (i = 0; i < L2_SIZE; ++i) { >>>> - if (!p[i].is_leaf) { >>>> - destroy_l2_mapping(&p[i], level - 1); >>>> + for (i = 0; i < pgtbl->phys_sections_nb_alloc; i++) { >>>> + if (pgtbl->phys_sections[i].sub_section) { >>>> + g_free(pgtbl->phys_sections[i].sub_section); >>>> } else { >>>> - destroy_page_desc(p[i].ptr); >>>> + memory_region_unref(pgtbl->phys_sections[i].section.mr); >>>> } >>>> } >>>> - lp->is_leaf = 0; >>>> - lp->ptr = PHYS_MAP_NODE_NIL; >>>> -} >>>> + g_free(pgtbl->phys_sections); >>>> >>>> -static void destroy_all_mappings(AddressSpaceDispatch *d) >>>> -{ >>>> - destroy_l2_mapping(&d->phys_map, P_L2_LEVELS - 1); >>>> - phys_map_nodes_reset(); >>>> + g_free(pgtbl); >>>> } >>> >>> This should be a separate patch. >>> >> Sorry, this is part of reclaimer to play destruction of prev pgtbl. >> Why to separate it? > > You are changing the algorithm by which the old pagetable is destroyed. > This applies just as well to the current code. > > BTW, your patch includes Jan's subpage patch which was broken. I'll > drop it from the branch next time I push. > I will update based on your next version Regards, Pingfan > Paolo > >>>> -static uint16_t phys_section_add(MemoryRegionSection *section) >>>> +static uint16_t phys_section_add(MemoryRegionSection *section, >>>> PhysPageTable *pgtbl) >>> >>> phys_section_add will always add to next_pgtbl. >>> >> Ok, will drop @pgtbl. >>>> { >>>> - assert(phys_sections_nb < TARGET_PAGE_SIZE); >>>> + assert(pgtbl->phys_sections_nb < TARGET_PAGE_SIZE); >>>> >>>> - if (phys_sections_nb == phys_sections_nb_alloc) { >>>> - phys_sections_nb_alloc = MAX(phys_sections_nb_alloc * 2, 16); >>>> - phys_sections = g_renew(PhysSection, phys_sections, >>>> - phys_sections_nb_alloc); >>>> + if (pgtbl->phys_sections_nb == pgtbl->phys_sections_nb_alloc) { >>>> + pgtbl->phys_sections_nb_alloc = MAX(pgtbl->phys_sections_nb_alloc >>>> * 2, 16); >>>> + pgtbl->phys_sections = g_renew(PhysSection, pgtbl->phys_sections, >>>> + pgtbl->phys_sections_nb_alloc); >>>> } >>>> - phys_sections[phys_sections_nb].section = *section; >>>> - phys_sections[phys_sections_nb].sub_section = NULL; >>>> + pgtbl->phys_sections[pgtbl->phys_sections_nb].section = *section; >>>> + pgtbl->phys_sections[pgtbl->phys_sections_nb].sub_section = NULL; >>>> memory_region_ref(section->mr); >>>> - return phys_sections_nb++; >>>> -} >>>> - >>>> -static void phys_sections_clear(void) >>>> -{ >>>> - while (phys_sections_nb > 0) { >>>> - PhysSection *phys_section = &phys_sections[--phys_sections_nb]; >>>> - memory_region_unref(phys_section->section.mr); >>>> - } >>>> + return pgtbl->phys_sections_nb++; >>>> } >>>> >>>> static void register_subsection(AddressSpaceDispatch *d, >>>> @@ -793,18 +782,18 @@ static void register_subsection(AddressSpaceDispatch >>>> *d, >>>> psection->section.mr == &io_mem_unassigned); >>>> >>>> if (!psection->sub_section) { >>>> - new_section = phys_section_add(&subsection); >>>> - psection = &phys_sections[new_section]; >>>> + new_section = phys_section_add(&subsection, next_pgtbl); >>>> + psection = &next_pgtbl->phys_sections[new_section]; >>>> subsections_init(psection); >>>> phys_page_set(d, base >> TARGET_PAGE_BITS, 1, new_section); >>>> } else { >>>> - new_section = PHYS_SECTION_ID(psection); >>>> + new_section = PHYS_SECTION_ID(psection, next_pgtbl); >>>> } >>>> >>>> - new_subsection = phys_section_add(section); >>>> + new_subsection = phys_section_add(section, next_pgtbl); >>>> >>>> /* phys_section_add invalidates psection, reload it */ >>>> - psection = &phys_sections[new_section]; >>>> + psection = &next_pgtbl->phys_sections[new_section]; >>>> start = section->offset_within_address_space & ~TARGET_PAGE_MASK; >>>> end = start + section->size - 1; >>>> subsection_register(psection, start, end, new_subsection); >>>> @@ -816,7 +805,7 @@ static void register_multipage(AddressSpaceDispatch >>>> *d, MemoryRegionSection *sec >>>> hwaddr start_addr = section->offset_within_address_space; >>>> ram_addr_t size = section->size; >>>> hwaddr addr; >>>> - uint16_t section_index = phys_section_add(section); >>>> + uint16_t section_index = phys_section_add(section, next_pgtbl); >>>> >>>> assert(size); >>>> >>>> @@ -1653,7 +1642,7 @@ static void subsections_init(PhysSection *psection) >>>> { >>>> psection->sub_section = g_malloc0(sizeof(uint16_t) * >>>> TARGET_PAGE_SIZE); >>>> subsection_register(psection, 0, TARGET_PAGE_SIZE-1, >>>> - phys_section_unassigned); >>>> + next_pgtbl->phys_section_unassigned); >>>> } >>>> >>>> static uint16_t dummy_section(MemoryRegion *mr) >>>> @@ -1665,12 +1654,12 @@ static uint16_t dummy_section(MemoryRegion *mr) >>>> .size = UINT64_MAX, >>>> }; >>>> >>>> - return phys_section_add(§ion); >>>> + return phys_section_add(§ion, next_pgtbl); >>>> } >>>> >>>> MemoryRegion *iotlb_to_region(hwaddr index) >>>> { >>>> - return phys_sections[index & ~TARGET_PAGE_MASK].section.mr; >>>> + return cur_pgtbl->phys_sections[index & ~TARGET_PAGE_MASK].section.mr; >>>> } >>>> >>>> static void io_mem_init(void) >>>> @@ -1685,21 +1674,40 @@ static void io_mem_init(void) >>>> "watch", UINT64_MAX); >>>> } >>>> >>>> +void global_begin(void) >>>> +{ >>>> + next_pgtbl = g_new0(PhysPageTable, 1); >>>> + next_pgtbl->ref = 1; >>>> + next_pgtbl->phys_section_unassigned = >>>> dummy_section(&io_mem_unassigned); >>>> + next_pgtbl->phys_section_notdirty = dummy_section(&io_mem_notdirty); >>>> + next_pgtbl->phys_section_rom = dummy_section(&io_mem_rom); >>>> + next_pgtbl->phys_section_watch = dummy_section(&io_mem_watch); >>>> +} >>>> + >>>> +/* other listeners finished */ >>>> +void global_commit(void) >>>> +{ >>>> + PhysPageTable *t = cur_pgtbl; >>>> + >>>> + cur_pgtbl = next_pgtbl; >>>> + /* Fix me, currently, we rely on each address space listener agaist >>>> its >>>> + * reader. So when we come here, no readers will touch old >>>> phys_map_node. >>>> + * After rcu, should changed to call_rcu() >>>> + */ >>>> + if (__sync_sub_and_fetch(&t->ref, 1) == 0) { >>>> + destroy_pagetable(t); >>>> + } >>> >>> global_commit should not be needed, and global_begin should be simply >>> the new core_begin. It should unref next_pgtbl and reallocate a new one. >>> >> Good idea, thanks. >> >> Regards, >> Pingfan >>> >>>> +} >>>> + >>>> static void mem_begin(MemoryListener *listener) >>>> { >>>> AddressSpaceDispatch *d = container_of(listener, >>>> AddressSpaceDispatch, listener); >>>> >>>> - destroy_all_mappings(d); >>>> d->phys_map.ptr = PHYS_MAP_NODE_NIL; >>>> } >>>> >>>> static void core_begin(MemoryListener *listener) >>>> { >>>> - phys_sections_clear(); >>>> - phys_section_unassigned = dummy_section(&io_mem_unassigned); >>>> - phys_section_notdirty = dummy_section(&io_mem_notdirty); >>>> - phys_section_rom = dummy_section(&io_mem_rom); >>>> - phys_section_watch = dummy_section(&io_mem_watch); >>>> } >>>> >>>> static void tcg_commit(MemoryListener *listener) >>>> @@ -1779,7 +1787,6 @@ void address_space_destroy_dispatch(AddressSpace *as) >>>> AddressSpaceDispatch *d = as->dispatch; >>>> >>>> memory_listener_unregister(&d->listener); >>>> - destroy_l2_mapping(&d->phys_map, P_L2_LEVELS - 1); >>>> g_free(d); >>>> as->dispatch = NULL; >>>> } >>>> @@ -2386,7 +2393,7 @@ void stl_phys_notdirty(hwaddr addr, uint32_t val) >>>> >>>> if (!memory_region_is_ram(section->mr) || section->readonly) { >>>> if (memory_region_is_ram(section->mr)) { >>>> - section = &phys_sections[phys_section_rom].section; >>>> + section = >>>> &cur_pgtbl->phys_sections[cur_pgtbl->phys_section_rom].section; >>>> } >>>> io_mem_write(section->mr, addr, val, 4); >>>> } else { >>>> @@ -2422,7 +2429,7 @@ void stq_phys_notdirty(hwaddr addr, uint64_t val) >>>> >>>> if (!memory_region_is_ram(section->mr) || section->readonly) { >>>> if (memory_region_is_ram(section->mr)) { >>>> - section = &phys_sections[phys_section_rom].section; >>>> + section = >>>> &cur_pgtbl->phys_sections[cur_pgtbl->phys_section_rom].section; >>>> } >>>> #ifdef TARGET_WORDS_BIGENDIAN >>>> io_mem_write(section->mr, addr, val >> 32, 4); >>>> @@ -2455,7 +2462,7 @@ static inline void stl_phys_internal(hwaddr addr, >>>> uint32_t val, >>>> >>>> if (!memory_region_is_ram(section->mr) || section->readonly) { >>>> if (memory_region_is_ram(section->mr)) { >>>> - section = &phys_sections[phys_section_rom].section; >>>> + section = >>>> &cur_pgtbl->phys_sections[cur_pgtbl->phys_section_rom].section; >>>> } >>>> #if defined(TARGET_WORDS_BIGENDIAN) >>>> if (endian == DEVICE_LITTLE_ENDIAN) { >>>> @@ -2526,7 +2533,7 @@ static inline void stw_phys_internal(hwaddr addr, >>>> uint32_t val, >>>> >>>> if (!memory_region_is_ram(section->mr) || section->readonly) { >>>> if (memory_region_is_ram(section->mr)) { >>>> - section = &phys_sections[phys_section_rom].section; >>>> + section = >>>> &cur_pgtbl->phys_sections[cur_pgtbl->phys_section_rom].section; >>>> } >>>> #if defined(TARGET_WORDS_BIGENDIAN) >>>> if (endian == DEVICE_LITTLE_ENDIAN) { >>>> diff --git a/include/exec/memory.h b/include/exec/memory.h >>>> index b97ace7..cc654fa 100644 >>>> --- a/include/exec/memory.h >>>> +++ b/include/exec/memory.h >>>> @@ -992,6 +992,8 @@ void *address_space_map(AddressSpace *as, hwaddr addr, >>>> void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len, >>>> int is_write, hwaddr access_len); >>>> >>>> +void global_begin(void); >>>> +void global_commit(void); >>>> >>>> #endif >>>> >>>> diff --git a/memory.c b/memory.c >>>> index 1a86607..da06dfd 100644 >>>> --- a/memory.c >>>> +++ b/memory.c >>>> @@ -805,6 +805,7 @@ void memory_region_transaction_commit(void) >>>> --memory_region_transaction_depth; >>>> if (!memory_region_transaction_depth && memory_region_update_pending) >>>> { >>>> memory_region_update_pending = false; >>>> + global_begin(); >>>> MEMORY_LISTENER_CALL_GLOBAL(begin, Forward); >>>> >>>> QTAILQ_FOREACH(as, &address_spaces, address_spaces_link) { >>>> @@ -812,6 +813,7 @@ void memory_region_transaction_commit(void) >>>> } >>>> >>>> MEMORY_LISTENER_CALL_GLOBAL(commit, Forward); >>>> + global_commit(); >>>> } >>>> } >>>> >>>> >>> >> >> >