On Wed, 04 Apr 2007 18:55:18 +1000 Nick Piggin <[EMAIL PROTECTED]> wrote:
> Peter Zijlstra wrote:
> > On Wed, 2007-04-04 at 12:22 +1000, Nick Piggin wrote:
> >
> >> Eric Dumazet wrote:
> >>
> >>> I do think such workloads might benefit from a vma_cache not shared
> >>> by all threads but private to each thread. A sequence could
> >>> invalidate the cache(s).
> >>>
> >>> ie instead of a mm->mmap_cache, having a mm->sequence, and each
> >>> thread having a current->mmap_cache and current->mm_sequence
> >>
> >> I have a patchset to do exactly this, btw.
> >
> > /me too
> >
> > However, I decided against pushing it because when it does happen that
> > a task is not involved with a vma lookup for longer than it takes the
> > seq count to wrap, we have a stale pointer...
> >
> > We could go and walk the tasks once in a while to reset the pointer,
> > but it all got a tad involved.
>
> Well here is my core patch (against I think 2.6.16 + a set of vma cache
> cleanups and abstractions). I didn't think the wrapping aspect was
> terribly involved.

Well, I believe this one is too expensive. I was thinking of a lighter
one: I am not deleting mmap_sem, but adding a sequence number to
mm_struct that is incremented each time a vma is added or deleted, not
each time mmap_sem is taken (read or write).

Each thread keeps its own copy of the sequence, taken the last time
find_vma() had to do a full lookup.

I believe some optimized paths could call check_vma_cache() without the
mmap_sem read lock taken and, if it fails, take mmap_sem and do the slow
path; a rough sketch of such a caller follows the patch below.
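To make the idea concrete before the patch itself, here is a minimal
stand-alone sketch of the scheme. The names here (struct mm,
struct thread_cache, cache_lookup(), cache_store()) are made up for
illustration only; the real ones are in the patch:

#include <stddef.h>

struct vma {                            /* stand-in for vm_area_struct */
        unsigned long vm_start;
        unsigned long vm_end;
};

struct mm {
        unsigned int sequence;          /* bumped on every vma add/delete */
        /* ... rb-tree of vmas, mmap_sem, etc. ... */
};

struct thread_cache {                   /* one per thread, not per mm */
        struct vma *cached;             /* last full-lookup result */
        unsigned int sequence;          /* mm->sequence at lookup time */
};

/* Hit only if the address matches AND no vma was added/deleted since. */
static struct vma *cache_lookup(struct mm *mm, struct thread_cache *tc,
                                unsigned long addr)
{
        if (tc->cached && tc->sequence == mm->sequence &&
            tc->cached->vm_start <= addr && addr < tc->cached->vm_end)
                return tc->cached;
        return NULL;    /* stale or empty: caller does the full tree walk */
}

/* After a full lookup, remember the result together with the sequence. */
static void cache_store(struct mm *mm, struct thread_cache *tc,
                        struct vma *v)
{
        tc->cached = v;
        tc->sequence = mm->sequence;
}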
--- linux-2.6.21-rc5/include/linux/sched.h
+++ linux-2.6.21-rc5-ed/include/linux/sched.h
@@ -319,10 +319,14 @@ typedef unsigned long mm_counter_t;
 	(mm)->hiwater_vm = (mm)->total_vm;	\
 } while (0)
 
+struct vm_area_cache {
+	struct vm_area_struct * mmap_cache;	/* last find_vma result */
+	unsigned int sequence;
+};
+
 struct mm_struct {
 	struct vm_area_struct * mmap;		/* list of VMAs */
 	struct rb_root mm_rb;
-	struct vm_area_struct * mmap_cache;	/* last find_vma result */
 	unsigned long (*get_unmapped_area) (struct file *filp,
 				unsigned long addr, unsigned long len,
 				unsigned long pgoff, unsigned long flags);
@@ -336,6 +340,7 @@ struct mm_struct {
 	atomic_t mm_count;	/* How many references to "struct mm_struct" (users count as 1) */
 	int map_count;		/* number of VMAs */
 	struct rw_semaphore mmap_sem;
+	unsigned int mm_sequence;
 	spinlock_t page_table_lock;	/* Protects page tables and some counters */
 
 	struct list_head mmlist;	/* List of maybe swapped mm's.  These are globally strung
@@ -875,7 +880,7 @@ struct task_struct {
 	struct list_head tasks;
 
 	struct mm_struct *mm, *active_mm;
-
+	struct vm_area_cache vmacache;
 /* task state */
 	struct linux_binfmt *binfmt;
 	int exit_state;
--- linux-2.6.21-rc5/include/linux/mm.h
+++ linux-2.6.21-rc5-ed/include/linux/mm.h
@@ -1176,15 +1176,18 @@ extern int expand_upwards(struct vm_area
 #endif
 
 /* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */
-extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr);
+extern struct vm_area_struct * find_vma(struct mm_struct * mm,
+					unsigned long addr,
+					struct vm_area_cache *cache);
 extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr,
 					     struct vm_area_struct **pprev);
 
 /* Look up the first VMA which intersects the interval start_addr..end_addr-1,
    NULL if none.  Assume start_addr < end_addr. */
-static inline struct vm_area_struct * find_vma_intersection(struct mm_struct * mm, unsigned long start_addr, unsigned long end_addr)
+static inline struct vm_area_struct * find_vma_intersection(struct mm_struct * mm,
+	unsigned long start_addr, unsigned long end_addr, struct vm_area_cache *cache)
 {
-	struct vm_area_struct * vma = find_vma(mm,start_addr);
+	struct vm_area_struct * vma = find_vma(mm,start_addr,cache);
 
 	if (vma && end_addr <= vma->vm_start)
 		vma = NULL;
--- linux-2.6.21-rc5/mm/mmap.c
+++ linux-2.6.21-rc5-ed/mm/mmap.c
@@ -267,7 +267,7 @@ asmlinkage unsigned long sys_brk(unsigne
 	}
 
 	/* Check against existing mmap mappings. */
-	if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE))
+	if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE, &current->vmacache))
 		goto out;
 
 	/* Ok, looks good - let it rip. */
@@ -447,6 +447,7 @@ static void vma_link(struct mm_struct *m
 		spin_unlock(&mapping->i_mmap_lock);
 
 	mm->map_count++;
+	mm->mm_sequence++;
 	validate_mm(mm);
 }
 
@@ -473,8 +474,7 @@ __vma_unlink(struct mm_struct *mm, struc
 {
 	prev->vm_next = vma->vm_next;
 	rb_erase(&vma->vm_rb, &mm->mm_rb);
-	if (mm->mmap_cache == vma)
-		mm->mmap_cache = prev;
+	mm->mm_sequence++;
 }
 
 /*
@@ -1201,7 +1201,7 @@ arch_get_unmapped_area(struct file *filp
 
 	if (addr) {
 		addr = PAGE_ALIGN(addr);
-		vma = find_vma(mm, addr);
+		vma = find_vma(mm, addr, &current->vmacache);
 		if (TASK_SIZE - len >= addr &&
 		    (!vma || addr + len <= vma->vm_start))
 			return addr;
@@ -1214,7 +1214,7 @@ arch_get_unmapped_area(struct file *filp
 	}
 
 full_search:
-	for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
+	for (vma = find_vma(mm, addr, &current->vmacache); ; vma = vma->vm_next) {
 		/* At this point:  (!vma || addr < vma->vm_end). */
 		if (TASK_SIZE - len < addr) {
 			/*
@@ -1275,7 +1275,7 @@ arch_get_unmapped_area_topdown(struct fi
 	/* requesting a specific address */
 	if (addr) {
 		addr = PAGE_ALIGN(addr);
-		vma = find_vma(mm, addr);
+		vma = find_vma(mm, addr, &current->vmacache);
 		if (TASK_SIZE - len >= addr &&
 		    (!vma || addr + len <= vma->vm_start))
 			return addr;
@@ -1292,7 +1292,7 @@ arch_get_unmapped_area_topdown(struct fi
 
 	/* make sure it can fit in the remaining address space */
 	if (addr > len) {
-		vma = find_vma(mm, addr-len);
+		vma = find_vma(mm, addr-len, &current->vmacache);
 		if (!vma || addr <= vma->vm_start)
 			/* remember the address as a hint for next time */
 			return (mm->free_area_cache = addr-len);
@@ -1309,7 +1309,7 @@ arch_get_unmapped_area_topdown(struct fi
 		 * else if new region fits below vma->vm_start,
 		 * return with success:
 		 */
-		vma = find_vma(mm, addr);
+		vma = find_vma(mm, addr, &current->vmacache);
 		if (!vma || addr+len <= vma->vm_start)
 			/* remember the address as a hint for next time */
 			return (mm->free_area_cache = addr);
@@ -1397,16 +1397,28 @@ get_unmapped_area(struct file *file, uns
 
 EXPORT_SYMBOL(get_unmapped_area);
 
+struct vm_area_struct * check_vma_cache(struct mm_struct * mm, unsigned long addr, struct vm_area_cache *cache)
+{
+	struct vm_area_struct *vma = cache->mmap_cache;
+	unsigned int mmseq = mm->mm_sequence;
+	smp_rmb();
+	if (cache->sequence == mmseq &&
+	    vma &&
+	    addr < vma->vm_end && vma->vm_start <= addr)
+		return vma;
+	return NULL;
+}
+
 /* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */
-struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr)
+struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr, struct vm_area_cache *cache)
 {
 	struct vm_area_struct *vma = NULL;
 
 	if (mm) {
 		/* Check the cache first. */
 		/* (Cache hit rate is typically around 35%.) */
-		vma = mm->mmap_cache;
-		if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
+		vma = check_vma_cache(mm, addr, cache);
+		if (!vma) {
 			struct rb_node * rb_node;
 
 			rb_node = mm->mm_rb.rb_node;
@@ -1426,8 +1438,10 @@ struct vm_area_struct * find_vma(struct
 				} else
 					rb_node = rb_node->rb_right;
 			}
-			if (vma)
-				mm->mmap_cache = vma;
+			if (vma) {
+				cache->mmap_cache = vma;
+				cache->sequence = mm->mm_sequence;
+			}
 		}
 	}
 	return vma;
@@ -1638,7 +1652,7 @@ find_extend_vma(struct mm_struct * mm, u
 	unsigned long start;
 
 	addr &= PAGE_MASK;
-	vma = find_vma(mm,addr);
+	vma = find_vma(mm,addr,&current->vmacache);
 	if (!vma)
 		return NULL;
 	if (vma->vm_start <= addr)
@@ -1726,7 +1740,7 @@ detach_vmas_to_be_unmapped(struct mm_str
 	else
 		addr = vma ?  vma->vm_end : mm->mmap_base;
 	mm->unmap_area(mm, addr);
-	mm->mmap_cache = NULL;		/* Kill the cache. */
+	mm->mm_sequence++;
 }
 
 /*
@@ -1823,7 +1837,7 @@ int do_munmap(struct mm_struct *mm, unsi
 	}
 
 	/* Does it split the last one? */
-	last = find_vma(mm, end);
+	last = find_vma(mm, end, &current->vmacache);
 	if (last && end > last->vm_start) {
 		int error = split_vma(mm, last, end, 1);
 		if (error)
--- linux-2.6.21-rc5/kernel/fork.c
+++ linux-2.6.21-rc5-ed/kernel/fork.c
@@ -213,7 +213,6 @@ static inline int dup_mmap(struct mm_str
 	mm->locked_vm = 0;
 	mm->mmap = NULL;
-	mm->mmap_cache = NULL;
 	mm->free_area_cache = oldmm->mmap_base;
 	mm->cached_hole_size = ~0UL;
 	mm->map_count = 0;
@@ -564,6 +563,7 @@ good_mm:
 	tsk->mm = mm;
 	tsk->active_mm = mm;
+	tsk->vmacache.mmap_cache = NULL;
 	return 0;
 
 fail_nomem:
--- linux-2.6.21-rc5/mm/mempolicy.c
+++ linux-2.6.21-rc5-ed/mm/mempolicy.c
@@ -532,7 +532,7 @@ long do_get_mempolicy(int *policy, nodem
 		return -EINVAL;
 	if (flags & MPOL_F_ADDR) {
 		down_read(&mm->mmap_sem);
-		vma = find_vma_intersection(mm, addr, addr+1);
+		vma = find_vma_intersection(mm, addr, addr+1, &current->vmacache);
 		if (!vma) {
 			up_read(&mm->mmap_sem);
 			return -EFAULT;
--- linux-2.6.21-rc5/arch/i386/mm/fault.c
+++ linux-2.6.21-rc5-ed/arch/i386/mm/fault.c
@@ -374,7 +374,7 @@ fastcall void __kprobes do_page_fault(st
 		down_read(&mm->mmap_sem);
 	}
 
-	vma = find_vma(mm, address);
+	vma = find_vma(mm, address, &tsk->vmacache);
 	if (!vma)
 		goto bad_area;
 	if (vma->vm_start <= address)
--- linux-2.6.21-rc5/kernel/futex.c
+++ linux-2.6.21-rc5-ed/kernel/futex.c
@@ -346,7 +346,7 @@ static int futex_handle_fault(unsigned l
 	struct vm_area_struct * vma;
 	struct mm_struct *mm = current->mm;
 
-	if (attempt > 2 || !(vma = find_vma(mm, address)) ||
+	if (attempt > 2 || !(vma = find_vma(mm, address, &current->vmacache)) ||
 	    vma->vm_start > address || !(vma->vm_flags & VM_WRITE))
 		return -EFAULT;
--- linux-2.6.21-rc5/mm/fremap.c
+++ linux-2.6.21-rc5-ed/mm/fremap.c
@@ -146,7 +146,7 @@ asmlinkage long sys_remap_file_pages(uns
 	/* We need down_write() to change vma->vm_flags. */
 	down_read(&mm->mmap_sem);
 retry:
-	vma = find_vma(mm, start);
+	vma = find_vma(mm, start, &current->vmacache);
 
 	/*
 	 * Make sure the vma is shared, that it supports prefaulting,
--- linux-2.6.21-rc5/mm/madvise.c
+++ linux-2.6.21-rc5-ed/mm/madvise.c
@@ -329,7 +329,7 @@ asmlinkage long sys_madvise(unsigned lon
 		if (prev)
 			vma = prev->vm_next;
 		else	/* madvise_remove dropped mmap_sem */
-			vma = find_vma(current->mm, start);
+			vma = find_vma(current->mm, start, &current->vmacache);
 	}
 out:
 	up_write(&current->mm->mmap_sem);
--- linux-2.6.21-rc5/mm/memory.c
+++ linux-2.6.21-rc5-ed/mm/memory.c
@@ -2556,7 +2556,7 @@ int make_pages_present(unsigned long add
 	int ret, len, write;
 	struct vm_area_struct * vma;
 
-	vma = find_vma(current->mm, addr);
+	vma = find_vma(current->mm, addr, &current->vmacache);
 	if (!vma)
 		return -1;
 	write = (vma->vm_flags & VM_WRITE) != 0;
--- linux-2.6.21-rc5/mm/mincore.c
+++ linux-2.6.21-rc5-ed/mm/mincore.c
@@ -63,7 +63,7 @@ static long do_mincore(unsigned long add
 	unsigned long nr;
 	int i;
 	pgoff_t pgoff;
-	struct vm_area_struct *vma = find_vma(current->mm, addr);
+	struct vm_area_struct *vma = find_vma(current->mm, addr, &current->vmacache);
 
 	/*
 	 * find_vma() didn't find anything above us, or we're
--- linux-2.6.21-rc5/mm/mremap.c
+++ linux-2.6.21-rc5-ed/mm/mremap.c
@@ -315,7 +315,7 @@ unsigned long do_mremap(unsigned long ad
 	 * Ok, we need to grow..  or relocate.
 	 */
 	ret = -EFAULT;
-	vma = find_vma(mm, addr);
+	vma = find_vma(mm, addr, &current->vmacache);
 	if (!vma || vma->vm_start > addr)
 		goto out;
 	if (is_vm_hugetlb_page(vma)) {
--- linux-2.6.21-rc5/mm/msync.c
+++ linux-2.6.21-rc5-ed/mm/msync.c
@@ -54,7 +54,7 @@ asmlinkage long sys_msync(unsigned long
 	 * just ignore them, but return -ENOMEM at the end.
 	 */
 	down_read(&mm->mmap_sem);
-	vma = find_vma(mm, start);
+	vma = find_vma(mm, start, &current->vmacache);
 	for (;;) {
 		struct file *file;
 
@@ -86,7 +86,7 @@ asmlinkage long sys_msync(unsigned long
 			if (error || start >= end)
 				goto out;
 			down_read(&mm->mmap_sem);
-			vma = find_vma(mm, start);
+			vma = find_vma(mm, start, &current->vmacache);
 		} else {
 			if (start >= end) {
 				error = 0;
--- linux-2.6.21-rc5/fs/proc/task_mmu.c
+++ linux-2.6.21-rc5-ed/fs/proc/task_mmu.c
@@ -405,9 +405,15 @@ static void *m_start(struct seq_file *m,
 	down_read(&mm->mmap_sem);
 
 	/* Start with last addr hint */
-	if (last_addr && (vma = find_vma(mm, last_addr))) {
-		vma = vma->vm_next;
-		goto out;
+	if (last_addr) {
+		struct vm_area_cache nocache = {
+			.sequence = mm->mm_sequence - 1,
+		};
+		vma = find_vma(mm, last_addr, &nocache);
+		if (vma) {
+			vma = vma->vm_next;
+			goto out;
+		}
 	}
 
 	/*
--- linux-2.6.21-rc5/drivers/char/mem.c
+++ linux-2.6.21-rc5-ed/drivers/char/mem.c
@@ -633,7 +633,7 @@ static inline size_t read_zero_pagealign
 	down_read(&mm->mmap_sem);
 
 	/* For private mappings, just map in zero pages. */
-	for (vma = find_vma(mm, addr); vma; vma = vma->vm_next) {
+	for (vma = find_vma(mm, addr, &current->vmacache); vma; vma = vma->vm_next) {
 		unsigned long count;
 
 		if (vma->vm_start > addr || (vma->vm_flags & VM_WRITE) == 0)
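As promised above, here is a rough sketch of the kind of optimized
caller I have in mind. fast_lookup() is a made-up name and this fragment
is not part of the patch; check_vma_cache() and find_vma() are the ones
added/changed above:

/*
 * Hypothetical caller, not part of the patch: try the per-thread cache
 * without taking mmap_sem at all; on a miss, take the lock and do the
 * normal find_vma() slow path (which refreshes the cache).
 *
 * Caveat, as Peter noted: if mm_sequence wraps all the way around
 * (2^32 vma adds/deletes) while a thread never refreshes its cache,
 * the sequence can match again on a vma pointer that was long freed.
 */
static struct vm_area_struct *
fast_lookup(struct mm_struct *mm, unsigned long addr)
{
	struct vm_area_struct *vma;

	vma = check_vma_cache(mm, addr, &current->vmacache);
	if (vma)
		return vma;		/* fast path: no lock taken */

	down_read(&mm->mmap_sem);
	vma = find_vma(mm, addr, &current->vmacache);
	up_read(&mm->mmap_sem);		/* a real caller would use vma before dropping this */
	return vma;
}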