2/2 -- SUSE Labs, Novell Inc.
Remove PageReserved() calls from core code by tightening VM_RESERVED
handling in mm/ to cover PageReserved functionality.

PageReserved special-casing is removed from get_page and put_page.

All setting and clearing of PageReserved is retained, and it is now
flagged in the page_alloc checks to help ensure we don't introduce any
refcount-based freeing of Reserved pages.

MAP_PRIVATE, PROT_WRITE of VM_RESERVED regions is tentatively being
deprecated. We never completely handled it correctly anyway, and it is
difficult to handle nicely - difficult but not impossible; it could be
reintroduced in future if required (Hugh has a proof of concept).

Once PageReserved() calls are removed from kernel/power/swsusp.c, and
from all arch/ and driver code, the Set and Clear calls, and the
PG_reserved bit, can be trivially removed.

The last real user of PageReserved is swsusp, which uses PageReserved
to determine whether a struct page points to valid memory or not. This
still needs to be addressed.

Many thanks to Hugh Dickins for input.

Signed-off-by: Nick Piggin <[EMAIL PROTECTED]>
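For reference, here is roughly the kind of driver mapping the new
VM_RESERVED rules are aimed at - a minimal sketch only, not part of the
patch; foo_dev, foo_mmap and the buffer fields are made-up names:

#include <linux/fs.h>
#include <linux/mm.h>

struct foo_dev {			/* hypothetical device state */
	unsigned long buf_phys;		/* physical base of the buffer */
	unsigned long buf_size;		/* size in bytes, page aligned */
};

static int foo_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct foo_dev *dev = file->private_data;
	unsigned long size = vma->vm_end - vma->vm_start;

	if (size > dev->buf_size)
		return -EINVAL;

	/*
	 * VM_RESERVED now tells the core VM that pages and ptes in this
	 * region are not managed with the regular pagecache or rmap
	 * routines, so the copy/zap/msync paths skip struct page
	 * accounting for them.
	 */
	vma->vm_flags |= VM_IO | VM_RESERVED;

	if (remap_pfn_range(vma, vma->vm_start,
			    dev->buf_phys >> PAGE_SHIFT,
			    size, vma->vm_page_prot))
		return -EAGAIN;

	return 0;
}

Note that a MAP_PRIVATE, PROT_WRITE mapping of such a region is now
refused - see the mm/mmap.c and mm/mprotect.c hunks below.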
Index: linux-2.6/include/linux/mm.h
===================================================================
--- linux-2.6.orig/include/linux/mm.h
+++ linux-2.6/include/linux/mm.h
@@ -156,7 +156,8 @@ extern unsigned int kobjsize(const void
 
 #define VM_DONTCOPY	0x00020000	/* Do not copy this vma on fork */
 #define VM_DONTEXPAND	0x00040000	/* Cannot expand with mremap() */
-#define VM_RESERVED	0x00080000	/* Don't unmap it from swap_out */
+#define VM_RESERVED	0x00080000	/* Pages and ptes in region aren't managed with regular pagecache or rmap routines */
+
 #define VM_ACCOUNT	0x00100000	/* Is a VM accounted object */
 #define VM_HUGETLB	0x00400000	/* Huge TLB Page VM */
 #define VM_NONLINEAR	0x00800000	/* Is non-linear (remap_file_pages) */
@@ -337,7 +338,7 @@ static inline void get_page(struct page
 
 static inline void put_page(struct page *page)
 {
-	if (!PageReserved(page) && put_page_testzero(page))
+	if (put_page_testzero(page))
 		__page_cache_release(page);
 }
 
@@ -723,6 +724,9 @@ void install_arg_page(struct vm_area_str
 
 int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start,
 		int len, int write, int force, struct page **pages, struct vm_area_struct **vmas);
+#define invalid_pfn(pte, vm_flags, vaddr) \
+	__invalid_pfn(__FUNCTION__, pte, vm_flags, vaddr)
+void __invalid_pfn(const char *, pte_t, unsigned long, unsigned long);
 
 int __set_page_dirty_buffers(struct page *page);
 int __set_page_dirty_nobuffers(struct page *page);
Index: linux-2.6/mm/madvise.c
===================================================================
--- linux-2.6.orig/mm/madvise.c
+++ linux-2.6/mm/madvise.c
@@ -123,7 +123,7 @@ static long madvise_dontneed(struct vm_a
 			     unsigned long start, unsigned long end)
 {
 	*prev = vma;
-	if ((vma->vm_flags & VM_LOCKED) || is_vm_hugetlb_page(vma))
+	if ((vma->vm_flags & (VM_LOCKED|VM_RESERVED)) || is_vm_hugetlb_page(vma))
 		return -EINVAL;
 
 	if (unlikely(vma->vm_flags & VM_NONLINEAR)) {
Index: linux-2.6/mm/memory.c
===================================================================
--- linux-2.6.orig/mm/memory.c
+++ linux-2.6/mm/memory.c
@@ -333,6 +333,21 @@ out:
 }
 
 /*
+ * This function is called to print an error when a pte in a
+ * !VM_RESERVED region is found pointing to an invalid pfn (which
+ * is an error).
+ *
+ * The calling function must still handle the error.
+ */
+void __invalid_pfn(const char *errfunc, pte_t pte,
+		unsigned long vm_flags, unsigned long vaddr)
+{
+	printk(KERN_ERR "%s: pte does not point to valid memory. "
+		"process = %s, pte = %08lx, vm_flags = %lx, vaddr = %lx\n",
+		errfunc, current->comm, (long)pte_val(pte), vm_flags, vaddr);
+}
+
+/*
  * copy one vm_area from one task to the other. Assumes the page tables
  * already present in the new task to be cleared in the whole range
  * covered by this vma.
@@ -361,25 +376,29 @@ copy_one_pte(struct mm_struct *dst_mm, s
 				spin_unlock(&mmlist_lock);
 			}
 		}
-		set_pte_at(dst_mm, addr, dst_pte, pte);
-		return;
+		goto out_set_pte;
 	}
 
+	/* If the region is VM_RESERVED, the mapping is not
+	 * mapped via rmap - duplicate the pte as is.
+	 */
+	if (vm_flags & VM_RESERVED)
+		goto out_set_pte;
+
+	/* If the pte points outside of valid memory but
+	 * the region is not VM_RESERVED, we have a problem.
+	 */
 	pfn = pte_pfn(pte);
-	/* the pte points outside of valid memory, the
-	 * mapping is assumed to be good, meaningful
-	 * and not mapped via rmap - duplicate the
-	 * mapping as is.
-	 */
-	page = NULL;
-	if (pfn_valid(pfn))
-		page = pfn_to_page(pfn);
-
-	if (!page || PageReserved(page)) {
-		set_pte_at(dst_mm, addr, dst_pte, pte);
-		return;
+	if (unlikely(!pfn_valid(pfn))) {
+		invalid_pfn(pte, vm_flags, addr);
+		goto out_set_pte; /* try to do something sane */
 	}
+	page = pfn_to_page(pfn);
+	/* Mappings to zero pages aren't covered by rmap either. */
+	if (page == ZERO_PAGE(addr))
+		goto out_set_pte;
+
 	/*
 	 * If it's a COW mapping, write protect it both
 	 * in the parent and the child
@@ -400,8 +419,9 @@ copy_one_pte(struct mm_struct *dst_mm, s
 	inc_mm_counter(dst_mm, rss);
 	if (PageAnon(page))
 		inc_mm_counter(dst_mm, anon_rss);
-	set_pte_at(dst_mm, addr, dst_pte, pte);
 	page_dup_rmap(page);
+out_set_pte:
+	set_pte_at(dst_mm, addr, dst_pte, pte);
 }
 
 static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
@@ -514,7 +534,8 @@ int copy_page_range(struct mm_struct *ds
 	return 0;
 }
 
-static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
+static void zap_pte_range(struct mmu_gather *tlb,
+				struct vm_area_struct *vma, pmd_t *pmd,
 				unsigned long addr, unsigned long end,
 				struct zap_details *details)
 {
@@ -528,10 +549,14 @@ static void zap_pte_range(struct mmu_gat
 		if (pte_present(ptent)) {
 			struct page *page = NULL;
 			unsigned long pfn = pte_pfn(ptent);
-			if (pfn_valid(pfn)) {
-				page = pfn_to_page(pfn);
-				if (PageReserved(page))
-					page = NULL;
+			if (!(vma->vm_flags & VM_RESERVED)) {
+				if (unlikely(!pfn_valid(pfn))) {
+					invalid_pfn(ptent, vma->vm_flags, addr);
+				} else {
+					page = pfn_to_page(pfn);
+					if (page == ZERO_PAGE(addr))
+						page = NULL;
+				}
 			}
 			if (unlikely(details) && page) {
 				/*
@@ -584,7 +609,8 @@ static void zap_pte_range(struct mmu_gat
 	pte_unmap(pte - 1);
 }
 
-static inline void zap_pmd_range(struct mmu_gather *tlb, pud_t *pud,
+static inline void zap_pmd_range(struct mmu_gather *tlb,
+				struct vm_area_struct *vma, pud_t *pud,
 				unsigned long addr, unsigned long end,
 				struct zap_details *details)
 {
@@ -596,11 +622,12 @@ static inline void zap_pmd_range(struct
 		next = pmd_addr_end(addr, end);
 		if (pmd_none_or_clear_bad(pmd))
 			continue;
-		zap_pte_range(tlb, pmd, addr, next, details);
+		zap_pte_range(tlb, vma, pmd, addr, next, details);
 	} while (pmd++, addr = next, addr != end);
 }
 
-static inline void zap_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
+static inline void zap_pud_range(struct mmu_gather *tlb,
+				struct vm_area_struct *vma, pgd_t *pgd,
 				unsigned long addr, unsigned long end,
 				struct zap_details *details)
{
@@ -612,7 +639,7 @@ static inline void zap_pud_range(struct
 		next = pud_addr_end(addr, end);
 		if (pud_none_or_clear_bad(pud))
 			continue;
-		zap_pmd_range(tlb, pud, addr, next, details);
+		zap_pmd_range(tlb, vma, pud, addr, next, details);
 	} while (pud++, addr = next, addr != end);
 }
 
@@ -633,7 +660,7 @@ static void unmap_page_range(struct mmu_
 		next = pgd_addr_end(addr, end);
 		if (pgd_none_or_clear_bad(pgd))
 			continue;
-		zap_pud_range(tlb, pgd, addr, next, details);
+		zap_pud_range(tlb, vma, pgd, addr, next, details);
 	} while (pgd++, addr = next, addr != end);
 	tlb_end_vma(tlb, vma);
 }
@@ -933,7 +960,7 @@ int get_user_pages(struct task_struct *t
 			continue;
 		}
 
-		if (!vma || (vma->vm_flags & VM_IO)
+		if (!vma || (vma->vm_flags & (VM_IO | VM_RESERVED))
 				|| !(flags & vma->vm_flags))
 			return i ? : -EFAULT;
 
@@ -993,8 +1020,7 @@ int get_user_pages(struct task_struct *t
 			if (pages) {
 				pages[i] = page;
 				flush_dcache_page(page);
-				if (!PageReserved(page))
-					page_cache_get(page);
+				page_cache_get(page);
 			}
 			if (vmas)
 				vmas[i] = vma;
@@ -1098,8 +1124,7 @@ static int remap_pte_range(struct mm_str
 		return -ENOMEM;
 	do {
 		BUG_ON(!pte_none(*pte));
-		if (!pfn_valid(pfn) || PageReserved(pfn_to_page(pfn)))
-			set_pte_at(mm, addr, pte, pfn_pte(pfn, prot));
+		set_pte_at(mm, addr, pte, pfn_pte(pfn, prot));
 		pfn++;
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 	pte_unmap(pte - 1);
@@ -1239,6 +1264,8 @@ static int do_wp_page(struct mm_struct *
 	pte_t entry;
 	int ret;
 
+	BUG_ON(vma->vm_flags & VM_RESERVED);
+
 	if (unlikely(!pfn_valid(pfn))) {
 		/*
 		 * This should really halt the system so it can be debugged or
@@ -1246,9 +1273,8 @@ static int do_wp_page(struct mm_struct *
 		 * data, but for the moment just pretend this is OOM.
 		 */
 		pte_unmap(page_table);
-		printk(KERN_ERR "do_wp_page: bogus page at address %08lx\n",
-				address);
 		spin_unlock(&mm->page_table_lock);
+		invalid_pfn(pte, vma->vm_flags, address);
 		return VM_FAULT_OOM;
 	}
 	old_page = pfn_to_page(pfn);
@@ -1273,13 +1299,16 @@ static int do_wp_page(struct mm_struct *
 	/*
 	 * Ok, we need to copy. Oh, well..
 	 */
-	if (!PageReserved(old_page))
+	if (old_page == ZERO_PAGE(address))
+		old_page = NULL;
+	else
 		page_cache_get(old_page);
+
 	spin_unlock(&mm->page_table_lock);
 
 	if (unlikely(anon_vma_prepare(vma)))
 		goto no_new_page;
-	if (old_page == ZERO_PAGE(address)) {
+	if (old_page == NULL) {
 		new_page = alloc_zeroed_user_highpage(vma, address);
 		if (!new_page)
 			goto no_new_page;
@@ -1296,12 +1325,13 @@ static int do_wp_page(struct mm_struct *
 	spin_lock(&mm->page_table_lock);
 	page_table = pte_offset_map(pmd, address);
 	if (likely(pte_same(*page_table, pte))) {
-		if (PageAnon(old_page))
-			dec_mm_counter(mm, anon_rss);
-		if (PageReserved(old_page))
+		if (old_page == NULL)
 			inc_mm_counter(mm, rss);
-		else
+		else {
 			page_remove_rmap(old_page);
+			if (PageAnon(old_page))
+				dec_mm_counter(mm, anon_rss);
+		}
 		flush_cache_page(vma, address, pfn);
 		break_cow(vma, new_page, address, page_table);
 		lru_cache_add_active(new_page);
@@ -1312,13 +1342,16 @@ static int do_wp_page(struct mm_struct *
 		ret |= VM_FAULT_WRITE;
 	}
 	pte_unmap(page_table);
-	page_cache_release(new_page);
-	page_cache_release(old_page);
+	if (old_page) {
+		page_cache_release(new_page);
+		page_cache_release(old_page);
+	}
 	spin_unlock(&mm->page_table_lock);
 	return ret;
 
no_new_page:
-	page_cache_release(old_page);
+	if (old_page)
+		page_cache_release(old_page);
 	return VM_FAULT_OOM;
 }
@@ -1755,7 +1788,7 @@ do_anonymous_page(struct mm_struct *mm,
 	struct page * page = ZERO_PAGE(addr);
 
 	/* Read-only mapping of ZERO_PAGE. */
-	entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot));
+	entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot));
 
 	/* ..except if it's a write access */
 	if (write_access) {
@@ -1894,9 +1927,6 @@ retry:
 	 */
 	/* Only go through if we didn't race with anybody else... */
 	if (pte_none(*page_table)) {
-		if (!PageReserved(new_page))
-			inc_mm_counter(mm, rss);
-
 		flush_icache_page(vma, new_page);
 		entry = mk_pte(new_page, vma->vm_page_prot);
 		if (write_access)
@@ -1905,8 +1935,10 @@ retry:
 		if (anon) {
 			lru_cache_add_active(new_page);
 			page_add_anon_rmap(new_page, vma, address);
-		} else
+		} else if (!(vma->vm_flags & VM_RESERVED)) {
 			page_add_file_rmap(new_page);
+			inc_mm_counter(mm, rss);
+		}
 		pte_unmap(page_table);
 	} else {
 		/* One of our sibling threads was faster, back out. */
Index: linux-2.6/mm/page_alloc.c
===================================================================
--- linux-2.6.orig/mm/page_alloc.c
+++ linux-2.6/mm/page_alloc.c
@@ -113,7 +113,8 @@ static void bad_page(const char *functio
 			1 << PG_reclaim |
 			1 << PG_slab    |
 			1 << PG_swapcache |
-			1 << PG_writeback);
+			1 << PG_writeback |
+			1 << PG_reserved );
 	set_page_count(page, 0);
 	reset_page_mapcount(page);
 	page->mapping = NULL;
@@ -243,7 +244,6 @@ static inline int page_is_buddy(struct p
 {
 	if (PagePrivate(page)           &&
 	    (page_order(page) == order) &&
-	    !PageReserved(page)         &&
 	    page_count(page) == 0)
		return 1;
	return 0;
@@ -326,7 +326,8 @@ static inline void free_pages_check(cons
 			1 << PG_reclaim |
 			1 << PG_slab    |
 			1 << PG_swapcache |
-			1 << PG_writeback )))
+			1 << PG_writeback |
+			1 << PG_reserved )))
 		bad_page(function, page);
 	if (PageDirty(page))
 		__ClearPageDirty(page);
@@ -454,7 +455,8 @@ static void prep_new_page(struct page *p
 			1 << PG_reclaim |
 			1 << PG_slab    |
 			1 << PG_swapcache |
-			1 << PG_writeback )))
+			1 << PG_writeback |
+			1 << PG_reserved )))
 		bad_page(__FUNCTION__, page);
 
 	page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
@@ -1011,7 +1013,7 @@ void __pagevec_free(struct pagevec *pvec
 
 fastcall void __free_pages(struct page *page, unsigned int order)
 {
-	if (!PageReserved(page) && put_page_testzero(page)) {
+	if (put_page_testzero(page)) {
 		if (order == 0)
 			free_hot_page(page);
 		else
@@ -1653,7 +1655,7 @@ void __init memmap_init_zone(unsigned lo
 			continue;
 		page = pfn_to_page(pfn);
 		set_page_links(page, zone, nid, pfn);
-		set_page_count(page, 0);
+		set_page_count(page, 1);
 		reset_page_mapcount(page);
 		SetPageReserved(page);
 		INIT_LIST_HEAD(&page->lru);
Index: linux-2.6/mm/swap.c
===================================================================
--- linux-2.6.orig/mm/swap.c
+++ linux-2.6/mm/swap.c
@@ -48,7 +48,7 @@ void put_page(struct page *page)
 		}
 		return;
 	}
-	if (!PageReserved(page) && put_page_testzero(page))
+	if (put_page_testzero(page))
 		__page_cache_release(page);
 }
 EXPORT_SYMBOL(put_page);
@@ -215,7 +215,7 @@ void release_pages(struct page **pages,
 		struct page *page = pages[i];
 		struct zone *pagezone;
 
-		if (PageReserved(page) || !put_page_testzero(page))
+		if (!put_page_testzero(page))
 			continue;
 
 		pagezone = page_zone(page);
Index: linux-2.6/mm/fremap.c
===================================================================
--- linux-2.6.orig/mm/fremap.c
+++ linux-2.6/mm/fremap.c
@@ -29,18 +29,21 @@ static inline void zap_pte(struct mm_str
 		return;
 	if (pte_present(pte)) {
 		unsigned long pfn = pte_pfn(pte);
+		struct page *page;
 
 		flush_cache_page(vma, addr, pfn);
 		pte = ptep_clear_flush(vma, addr, ptep);
-		if (pfn_valid(pfn)) {
-			struct page *page = pfn_to_page(pfn);
-			if (!PageReserved(page)) {
-				if (pte_dirty(pte))
-					set_page_dirty(page);
-				page_remove_rmap(page);
-				page_cache_release(page);
-				dec_mm_counter(mm, rss);
-			}
+		if (unlikely(!pfn_valid(pfn))) {
+			invalid_pfn(pte, vma->vm_flags, addr);
+			return;
+		}
+		page = pfn_to_page(pfn);
+		if (page != ZERO_PAGE(addr)) {
+			if (pte_dirty(pte))
+				set_page_dirty(page);
+			page_remove_rmap(page);
+			dec_mm_counter(mm, rss);
+			page_cache_release(page);
 		}
 	} else {
 		if (!pte_file(pte))
@@ -65,6 +68,8 @@ int install_page(struct mm_struct *mm, s
 	pgd_t *pgd;
 	pte_t pte_val;
 
+	BUG_ON(vma->vm_flags & VM_RESERVED);
+
 	pgd = pgd_offset(mm, addr);
 	spin_lock(&mm->page_table_lock);
 
@@ -122,6 +127,8 @@ int install_file_pte(struct mm_struct *m
 	pgd_t *pgd;
 	pte_t pte_val;
 
+	BUG_ON(vma->vm_flags & VM_RESERVED);
+
 	pgd = pgd_offset(mm, addr);
 	spin_lock(&mm->page_table_lock);
Index: linux-2.6/mm/msync.c
===================================================================
--- linux-2.6.orig/mm/msync.c
+++ linux-2.6/mm/msync.c
@@ -37,11 +37,11 @@ static void sync_pte_range(struct vm_are
 		if (!pte_maybe_dirty(*pte))
 			continue;
 		pfn = pte_pfn(*pte);
-		if (!pfn_valid(pfn))
+		if (unlikely(!pfn_valid(pfn))) {
+			invalid_pfn(*pte, vma->vm_flags, addr);
 			continue;
+		}
 		page = pfn_to_page(pfn);
-		if (PageReserved(page))
-			continue;
 
 		if (ptep_clear_flush_dirty(vma, addr, pte) ||
 		    page_test_and_clear_dirty(page))
@@ -149,6 +149,9 @@ static int msync_interval(struct vm_area
 	if ((flags & MS_INVALIDATE) && (vma->vm_flags & VM_LOCKED))
 		return -EBUSY;
 
+	if (vma->vm_flags & VM_RESERVED)
+		return -EINVAL;
+
 	if (file && (vma->vm_flags & VM_SHARED)) {
 		filemap_sync(vma, addr, end);
Index: linux-2.6/mm/rmap.c
===================================================================
--- linux-2.6.orig/mm/rmap.c
+++ linux-2.6/mm/rmap.c
@@ -442,8 +442,6 @@ int page_referenced(struct page *page, i
 void page_add_anon_rmap(struct page *page,
 	struct vm_area_struct *vma, unsigned long address)
 {
-	BUG_ON(PageReserved(page));
-
 	inc_mm_counter(vma->vm_mm, anon_rss);
 
 	if (atomic_inc_and_test(&page->_mapcount)) {
@@ -469,8 +467,7 @@ void page_add_anon_rmap(struct page *pag
 void page_add_file_rmap(struct page *page)
 {
 	BUG_ON(PageAnon(page));
-	if (!pfn_valid(page_to_pfn(page)) || PageReserved(page))
-		return;
+	BUG_ON(!pfn_valid(page_to_pfn(page)));
 
 	if (atomic_inc_and_test(&page->_mapcount))
 		inc_page_state(nr_mapped);
@@ -484,8 +481,6 @@ void page_add_file_rmap(struct page *pag
 */
 void page_remove_rmap(struct page *page)
 {
-	BUG_ON(PageReserved(page));
-
 	if (atomic_add_negative(-1, &page->_mapcount)) {
 		BUG_ON(page_mapcount(page) < 0);
 		/*
@@ -643,13 +638,13 @@ static void try_to_unmap_cluster(unsigne
 			continue;
 
 		pfn = pte_pfn(*pte);
-		if (!pfn_valid(pfn))
+		if (unlikely(!pfn_valid(pfn))) {
+			invalid_pfn(*pte, vma->vm_flags, address);
 			continue;
+		}
 
 		page = pfn_to_page(pfn);
 		BUG_ON(PageAnon(page));
-		if (PageReserved(page))
-			continue;
 
 		if (ptep_clear_flush_young(vma, address, pte))
 			continue;
@@ -812,7 +807,6 @@ int try_to_unmap(struct page *page)
 {
 	int ret;
 
-	BUG_ON(PageReserved(page));
 	BUG_ON(!PageLocked(page));
 
 	if (PageAnon(page))
Index: linux-2.6/drivers/scsi/sg.c
===================================================================
--- linux-2.6.orig/drivers/scsi/sg.c
+++ linux-2.6/drivers/scsi/sg.c
@@ -1887,13 +1887,17 @@ st_unmap_user_pages(struct scatterlist *
 	int i;
 
 	for (i=0; i < nr_pages; i++) {
-		if (dirtied && !PageReserved(sgl[i].page))
-			SetPageDirty(sgl[i].page);
-		/* unlock_page(sgl[i].page); */
+		struct page *page = sgl[i].page;
+
+		/* XXX: just for debug. Remove when PageReserved is removed */
+		BUG_ON(PageReserved(page));
+		if (dirtied)
+			SetPageDirty(page);
+		/* unlock_page(page); */
 		/* FIXME: cache flush missing for rw==READ
 		 * FIXME: call the correct reference counting function
 		 */
-		page_cache_release(sgl[i].page);
+		page_cache_release(page);
 	}
 
 	return 0;
Index: linux-2.6/drivers/scsi/st.c
===================================================================
--- linux-2.6.orig/drivers/scsi/st.c
+++ linux-2.6/drivers/scsi/st.c
@@ -4431,12 +4431,16 @@ static int sgl_unmap_user_pages(struct s
 	int i;
 
 	for (i=0; i < nr_pages; i++) {
-		if (dirtied && !PageReserved(sgl[i].page))
-			SetPageDirty(sgl[i].page);
+		struct page *page = sgl[i].page;
+
+		/* XXX: just for debug. Remove when PageReserved is removed */
+		BUG_ON(PageReserved(page));
+		if (dirtied)
+			SetPageDirty(page);
 		/* FIXME: cache flush missing for rw==READ
 		 * FIXME: call the correct reference counting function
 		 */
-		page_cache_release(sgl[i].page);
+		page_cache_release(page);
 	}
 
 	return 0;
Index: linux-2.6/sound/core/pcm_native.c
===================================================================
--- linux-2.6.orig/sound/core/pcm_native.c
+++ linux-2.6/sound/core/pcm_native.c
@@ -2944,8 +2944,7 @@ static struct page * snd_pcm_mmap_status
 		return NOPAGE_OOM;
 	runtime = substream->runtime;
 	page = virt_to_page(runtime->status);
-	if (!PageReserved(page))
-		get_page(page);
+	get_page(page);
 	if (type)
 		*type = VM_FAULT_MINOR;
 	return page;
@@ -2987,8 +2986,7 @@ static struct page * snd_pcm_mmap_contro
 		return NOPAGE_OOM;
 	runtime = substream->runtime;
 	page = virt_to_page(runtime->control);
-	if (!PageReserved(page))
-		get_page(page);
+	get_page(page);
 	if (type)
 		*type = VM_FAULT_MINOR;
 	return page;
@@ -3061,8 +3059,7 @@ static struct page *snd_pcm_mmap_data_no
 		vaddr = runtime->dma_area + offset;
 		page = virt_to_page(vaddr);
 	}
-	if (!PageReserved(page))
-		get_page(page);
+	get_page(page);
 	if (type)
 		*type = VM_FAULT_MINOR;
 	return page;
Index: linux-2.6/mm/mmap.c
===================================================================
--- linux-2.6.orig/mm/mmap.c
+++ linux-2.6/mm/mmap.c
@@ -1077,6 +1077,17 @@ munmap_back:
 		error = file->f_op->mmap(file, vma);
 		if (error)
 			goto unmap_and_free_vma;
+		if ((vma->vm_flags & (VM_SHARED | VM_WRITE | VM_RESERVED))
+				== (VM_WRITE | VM_RESERVED)) {
+			printk(KERN_WARNING "program %s is using MAP_PRIVATE, "
+				"PROT_WRITE mmap of VM_RESERVED memory, which "
+				"is deprecated. Please report this to "
+				"[EMAIL PROTECTED]\n", current->comm);
+			if (vma->vm_ops && vma->vm_ops->close)
+				vma->vm_ops->close(vma);
+			error = -EACCES;
+			goto unmap_and_free_vma;
+		}
 	} else if (vm_flags & VM_SHARED) {
 		error = shmem_zero_setup(vma);
 		if (error)
Index: linux-2.6/mm/mprotect.c
===================================================================
--- linux-2.6.orig/mm/mprotect.c
+++ linux-2.6/mm/mprotect.c
@@ -131,6 +131,14 @@ mprotect_fixup(struct vm_area_struct *vm
 				return -ENOMEM;
 			newflags |= VM_ACCOUNT;
 		}
+		if (oldflags & VM_RESERVED) {
+			BUG_ON(oldflags & VM_WRITE);
+			printk(KERN_WARNING "program %s is using MAP_PRIVATE, "
+				"PROT_WRITE mprotect of VM_RESERVED memory, "
+				"which is deprecated. Please report this to "
+				"[EMAIL PROTECTED]\n", current->comm);
+			return -EACCES;
+		}
 	}
 
 	newprot = protection_map[newflags & 0xf];
Index: linux-2.6/mm/bootmem.c
===================================================================
--- linux-2.6.orig/mm/bootmem.c
+++ linux-2.6/mm/bootmem.c
@@ -297,6 +297,7 @@ static unsigned long __init free_all_boo
 				if (j + 16 < BITS_PER_LONG)
 					prefetchw(page + j + 16);
 				__ClearPageReserved(page + j);
+				set_page_count(page + j, 0);
 			}
 			__free_pages(page, order);
 			i += BITS_PER_LONG;
Index: linux-2.6/mm/mempolicy.c
===================================================================
--- linux-2.6.orig/mm/mempolicy.c
+++ linux-2.6/mm/mempolicy.c
@@ -253,8 +253,10 @@ static int check_pte_range(struct mm_str
 		if (!pte_present(*pte))
 			continue;
 		pfn = pte_pfn(*pte);
-		if (!pfn_valid(pfn))
+		if (unlikely(!pfn_valid(pfn))) {
+			invalid_pfn(*pte, -1UL, addr);
 			continue;
+		}
 		nid = pfn_to_nid(pfn);
 		if (!test_bit(nid, nodes))
 			break;
@@ -326,6 +328,8 @@ check_range(struct mm_struct *mm, unsign
 	first = find_vma(mm, start);
 	if (!first)
 		return ERR_PTR(-EFAULT);
+	if (first->vm_flags & VM_RESERVED)
+		return ERR_PTR(-EACCES);
 	prev = NULL;
 	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
 		if (!vma->vm_next && vma->vm_end < end)
Index: linux-2.6/arch/ppc64/kernel/vdso.c
===================================================================
--- linux-2.6.orig/arch/ppc64/kernel/vdso.c
+++ linux-2.6/arch/ppc64/kernel/vdso.c
@@ -176,13 +176,13 @@ static struct page * vdso_vma_nopage(str
 		return NOPAGE_SIGBUS;
 
 	/*
-	 * Last page is systemcfg, special handling here, no get_page() a
-	 * this is a reserved page
+	 * Last page is systemcfg.
 	 */
 	if ((vma->vm_end - address) <= PAGE_SIZE)
-		return virt_to_page(systemcfg);
+		pg = virt_to_page(systemcfg);
+	else
+		pg = virt_to_page(vbase + offset);
 
-	pg = virt_to_page(vbase + offset);
 	get_page(pg);
 	DBG(" ->page count: %d\n", page_count(pg));
 
@@ -600,6 +600,8 @@ void __init vdso_init(void)
 		ClearPageReserved(pg);
 		get_page(pg);
 	}
+
+	get_page(virt_to_page(systemcfg));
 }
 
 int in_gate_area_no_task(unsigned long addr)
Index: linux-2.6/kernel/power/swsusp.c
===================================================================
--- linux-2.6.orig/kernel/power/swsusp.c
+++ linux-2.6/kernel/power/swsusp.c
@@ -434,15 +434,23 @@ static int save_highmem_zone(struct zone
 			continue;
 		page = pfn_to_page(pfn);
 		/*
-		 * This condition results from rvmalloc() sans vmalloc_32()
-		 * and architectural memory reservations. This should be
-		 * corrected eventually when the cases giving rise to this
-		 * are better understood.
+		 * PageReserved results from rvmalloc() sans vmalloc_32()
+		 * and architectural memory reservations.
+		 *
+		 * rvmalloc should not cause this, because all implementations
+		 * appear to always be using vmalloc_32 on architectures with
+		 * highmem. This is a good thing, because we would like to save
+		 * rvmalloc pages.
+		 *
+		 * It appears to be triggered by pages which do not point to
+		 * valid memory (see arch/i386/mm/init.c:one_highpage_init(),
+		 * which sets PageReserved if the page does not point to valid
+		 * RAM).
+		 *
+		 * XXX: must remove usage of PageReserved!
 		 */
-		if (PageReserved(page)) {
-			printk("highmem reserved page?!\n");
+		if (PageReserved(page))
 			continue;
-		}
 		BUG_ON(PageNosave(page));
 		if (PageNosaveFree(page))
 			continue;
@@ -528,10 +536,9 @@ static int saveable(struct zone * zone,
 		return 0;
 
 	page = pfn_to_page(pfn);
-	BUG_ON(PageReserved(page) && PageNosave(page));
 	if (PageNosave(page))
 		return 0;
-	if (PageReserved(page) && pfn_is_nosave(pfn)) {
+	if (pfn_is_nosave(pfn)) {
 		pr_debug("[nosave pfn 0x%lx]", pfn);
 		return 0;
 	}
Index: linux-2.6/mm/shmem.c
===================================================================
--- linux-2.6.orig/mm/shmem.c
+++ linux-2.6/mm/shmem.c
@@ -1523,7 +1523,8 @@ static void do_shmem_file_read(struct fi
 		index += offset >> PAGE_CACHE_SHIFT;
 		offset &= ~PAGE_CACHE_MASK;
 
-		page_cache_release(page);
+		if (page != ZERO_PAGE(0))
+			page_cache_release(page);
 		if (ret != nr || !desc->count)
 			break;
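
To illustrate what the deprecation in the mm/mmap.c and mm/mprotect.c
hunks means for userspace, here is the newly-refused pattern - again
only a sketch, with /dev/foo standing in for any driver whose mmap
handler sets VM_RESERVED:

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/dev/foo", O_RDWR);	/* stand-in VM_RESERVED driver */
	void *p;

	if (fd < 0)
		return 1;

	/* A shared writable mapping is still fine. */
	p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p != MAP_FAILED)
		munmap(p, 4096);

	/*
	 * MAP_PRIVATE + PROT_WRITE of a VM_RESERVED region is what this
	 * patch deprecates: the mmap now fails with EACCES (after the
	 * KERN_WARNING above), as does mprotect(..., PROT_WRITE) on an
	 * existing private mapping of it.
	 */
	p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
	if (p == MAP_FAILED)
		perror("private+write mmap of VM_RESERVED");	/* expected */

	close(fd);
	return 0;
}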