The reference counting in change_page_attr() broke in two cases: when the
function was called with the default attributes the direct mapping uses while
the page's attributes were already set to that default, and when a page's
attributes were changed from one non-default value to another. This led
either to a large page being restored prematurely or to the opportunity to
restore one being missed.
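
To illustrate the invariant the fix restores, here is a minimal user-space
sketch (hypothetical names such as set_pte_prot and deviating_ptes, and
made-up protection values; this is not the kernel code): each page-table page
backing a split large page keeps a count of 4k entries whose protection
deviates from the region's reference protection, and only when that count
reaches zero may the large page be put back. The two broken cases above are
exactly the transitions where the count must stay unchanged or decrease
rather than being incremented unconditionally.

/*
 * Minimal user-space sketch of the per-large-page reference counting that
 * change_page_attr() relies on.  All names and values here are illustrative
 * stand-ins, not kernel APIs or real protection bits.
 */
#include <stdio.h>

#define PTRS_PER_PTE	512	/* 4k entries backing one large page */

#define PROT_DEFAULT	0	/* stand-in for the direct mapping's default */
#define PROT_UNCACHED	1	/* stand-in for one non-default protection */
#define PROT_WRCOMB	2	/* stand-in for another non-default protection */

static int pte_prot[PTRS_PER_PTE];	/* protection of each 4k entry */
static int ref_prot = PROT_DEFAULT;	/* reference protection of the region */
static unsigned int deviating_ptes;	/* entries whose protection != ref_prot */

/* Change one entry's protection while keeping the deviation count correct. */
static void set_pte_prot(unsigned int idx, int prot)
{
	int old = pte_prot[idx];

	if (old == prot)
		return;			/* no change, count untouched */
	if (old == ref_prot)
		deviating_ptes++;	/* default -> non-default */
	else if (prot == ref_prot)
		deviating_ptes--;	/* non-default -> default */
	/* non-default -> non-default: the count must stay the same */
	pte_prot[idx] = prot;
}

int main(void)
{
	set_pte_prot(7, PROT_UNCACHED);	/* count becomes 1 */
	set_pte_prot(7, PROT_WRCOMB);	/* still 1, not 2 */
	set_pte_prot(7, PROT_DEFAULT);	/* back to 0 */

	if (deviating_ptes == 0)
		printf("large page could be restored\n");
	return 0;
}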
At the same time, make __PHYSICAL_MASK_SHIFT on 64-bits the value it
architecturally ought to have.

Signed-off-by: Jan Beulich <[EMAIL PROTECTED]>
Cc: Andi Kleen <[EMAIL PROTECTED]>

 arch/x86/mm/ioremap_64.c     |    4 +-
 arch/x86/mm/pageattr_32.c    |   84 +++++++++++++++++++++++++++++--------------
 arch/x86/mm/pageattr_64.c    |   57 +++++++++++++++++++----------
 include/asm-x86/page_32.h    |   10 +++++
 include/asm-x86/page_64.h    |    2 -
 include/asm-x86/pgtable_32.h |    3 +
 include/asm-x86/pgtable_64.h |    4 +-
 7 files changed, 114 insertions(+), 50 deletions(-)

--- linux-2.6.24-rc5/arch/x86/mm/ioremap_64.c	2007-12-12 11:28:18.000000000 +0100
+++ 2.6.24-rc5-x86-change_page_attr/arch/x86/mm/ioremap_64.c	2007-12-04 16:01:11.000000000 +0100
@@ -48,7 +48,7 @@ ioremap_change_attr(unsigned long phys_a
 	 * Must use a address here and not struct page because the phys addr
 	 * can be a in hole between nodes and not have an memmap entry.
 	 */
-	err = change_page_attr_addr(vaddr,npages,__pgprot(__PAGE_KERNEL|flags));
+	err = change_page_attr_addr(vaddr,npages,MAKE_GLOBAL(__PAGE_KERNEL|flags));
 	if (!err)
 		global_flush_tlb();
 }
@@ -199,7 +199,7 @@ void iounmap(volatile void __iomem *addr
 
 	/* Reset the direct mapping. Can block */
 	if (p->flags >> 20)
-		ioremap_change_attr(p->phys_addr, p->size, 0);
+		ioremap_change_attr(p->phys_addr, get_vm_area_size(p), 0);
 
 	/* Finally remove it */
 	o = remove_vm_area((void *)addr);
--- linux-2.6.24-rc5/arch/x86/mm/pageattr_32.c	2007-12-12 11:28:18.000000000 +0100
+++ 2.6.24-rc5-x86-change_page_attr/arch/x86/mm/pageattr_32.c	2007-12-04 16:01:11.000000000 +0100
@@ -116,24 +116,22 @@ static void set_pmd_pte(pte_t *kpte, uns
 	spin_unlock_irqrestore(&pgd_lock, flags);
 }
 
+static pgprot_t _ref_prot[KERNEL_PGD_PTRS * PTRS_PER_PMD];
+#define ref_prot(addr) _ref_prot[__pa(addr) >> PMD_SHIFT]
+
 /*
  * No more special protections in this 2/4MB area - revert to a
  * large page again.
  */
 static inline void revert_page(struct page *kpte_page, unsigned long address)
 {
-	pgprot_t ref_prot;
 	pte_t *linear;
 
-	ref_prot =
-	((address & LARGE_PAGE_MASK) < (unsigned long)&_etext)
-		? PAGE_KERNEL_LARGE_EXEC : PAGE_KERNEL_LARGE;
-
 	linear = (pte_t *)
 		pmd_offset(pud_offset(pgd_offset_k(address), address), address);
 	set_pmd_pte(linear, address,
-		    pfn_pte((__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT,
-			    ref_prot));
+		    pte_mkhuge(pfn_pte((__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT,
+				       ref_prot(address))));
 }
 
 static inline void save_page(struct page *kpte_page)
@@ -142,12 +140,22 @@ static inline void save_page(struct page
 		list_add(&kpte_page->lru, &df_list);
 }
 
+static inline int pgprot_match(pgprot_t prot1, pgprot_t prot2)
+{
+	return !((pgprot_val(prot1) ^ pgprot_val(prot2))
+#ifdef CONFIG_X86_PAE
+		 & __supported_pte_mask
+#endif
+		 & ~(_PAGE_ACCESSED|_PAGE_DIRTY));
+}
+
 static int
 __change_page_attr(struct page *page, pgprot_t prot)
 {
 	pte_t *kpte;
 	unsigned long address;
 	struct page *kpte_page;
+	pgprot_t old_prot, ref_prot;
 
 	BUG_ON(PageHighMem(page));
 	address = (unsigned long)page_address(page);
@@ -159,29 +167,31 @@ __change_page_attr(struct page *page, pg
 	BUG_ON(PageLRU(kpte_page));
 	BUG_ON(PageCompound(kpte_page));
 
-	if (pgprot_val(prot) != pgprot_val(PAGE_KERNEL)) {
+	old_prot = pte_pgprot(pte_clrhuge(*kpte));
+	ref_prot = ref_prot(address);
+	if (!pgprot_match(prot, ref_prot)) {
 		if (!pte_huge(*kpte)) {
 			set_pte_atomic(kpte, mk_pte(page, prot));
 		} else {
-			pgprot_t ref_prot;
-			struct page *split;
-
-			ref_prot =
-			((address & LARGE_PAGE_MASK) < (unsigned long)&_etext)
-				? PAGE_KERNEL_EXEC : PAGE_KERNEL;
-			split = split_large_page(address, prot, ref_prot);
-			if (!split)
+			BUG_ON(!pgprot_match(old_prot, ref_prot));
+			kpte_page = split_large_page(address, prot, ref_prot);
+			if (!kpte_page)
 				return -ENOMEM;
-			set_pmd_pte(kpte,address,mk_pte(split, ref_prot));
-			kpte_page = split;
+			set_pmd_pte(kpte, address,
+				    mk_pte(kpte_page, PAGE_KERNEL_EXEC));
+		}
+		if (!PageReserved(kpte_page)
+		    && pgprot_match(old_prot, ref_prot))
+			page_private(kpte_page)++;
+	} else if (!pgprot_match(ref_prot, old_prot)) {
+		BUG_ON(pte_huge(*kpte));
+		set_pte_atomic(kpte, mk_pte(page, ref_prot));
+		if (!PageReserved(kpte_page)) {
+			BUG_ON(page_private(kpte_page) == 0);
+			page_private(kpte_page)--;
 		}
-		page_private(kpte_page)++;
-	} else if (!pte_huge(*kpte)) {
-		set_pte_atomic(kpte, mk_pte(page, PAGE_KERNEL));
-		BUG_ON(page_private(kpte_page) == 0);
-		page_private(kpte_page)--;
 	} else
-		BUG();
+		return 0;
 
 	/*
 	 * If the pte was reserved, it means it was created at boot
@@ -190,8 +200,17 @@ __change_page_attr(struct page *page, pg
 	 */
 	save_page(kpte_page);
 
-	if (!PageReserved(kpte_page)) {
-		if (cpu_has_pse && (page_private(kpte_page) == 0)) {
+	if (!PageReserved(kpte_page) && cpu_has_pse) {
+		if (page_private(kpte_page) == PTRS_PER_PTE) {
+			unsigned i;
+
+			kpte = page_address(kpte_page);
+			for (i = 0; i < PTRS_PER_PTE; ++i, ++kpte)
+				if (pgprot_match(pte_pgprot(*kpte), prot))
+					page_private(kpte_page)--;
+			ref_prot(address) = prot;
+		}
+		if (page_private(kpte_page) == 0) {
 			paravirt_release_pt(page_to_pfn(kpte_page));
 			revert_page(kpte_page, address);
 		}
@@ -222,8 +241,21 @@ int change_page_attr(struct page *page,
 	int err = 0;
 	int i;
 	unsigned long flags;
+	static char first = 1;
 
 	spin_lock_irqsave(&cpa_lock, flags);
+
+	if (unlikely(first)) {
+		unsigned long addr = PAGE_OFFSET & PMD_MASK;
+
+		/* This must match is_kernel_text(). */
+		for (; addr <= (unsigned long)__init_end; addr += PMD_SIZE)
+			ref_prot(addr) = PAGE_KERNEL_EXEC;
+		for (; addr > PAGE_OFFSET; addr += PMD_SIZE)
+			ref_prot(addr) = PAGE_KERNEL;
+		first = 0;
+	}
+
 	for (i = 0; i < numpages; i++, page++) {
 		err = __change_page_attr(page, prot);
 		if (err)
--- linux-2.6.24-rc5/arch/x86/mm/pageattr_64.c	2007-12-12 11:28:18.000000000 +0100
+++ 2.6.24-rc5-x86-change_page_attr/arch/x86/mm/pageattr_64.c	2007-12-04 16:01:11.000000000 +0100
@@ -98,8 +98,14 @@ static inline void save_page(struct page
 	list_add(&fpage->lru, &deferred_pages);
 }
 
+/* protected by init_mm.mmap_sem */
+static pgprot_t kref_prot[] = {
+	[0 ... (KERNEL_TEXT_SIZE - 1) >> PMD_SHIFT] = PAGE_KERNEL_EXEC
+};
+#define kref_prot(kaddr) kref_prot[((kaddr) - __START_KERNEL_map) >> PMD_SHIFT]
+
 /*
- * No more special protections in this 2/4MB area - revert to a
+ * No more special protections in this 2MB area - revert to a
  * large page again.
  */
 static void revert_page(unsigned long address, pgprot_t ref_prot)
@@ -122,55 +128,68 @@ static void revert_page(unsigned long ad
 	set_pte((pte_t *)pmd, large_pte);
 }
 
+static inline int pgprot_match(pgprot_t prot1, pgprot_t prot2)
+{
+	return !((pgprot_val(prot1) ^ pgprot_val(prot2))
+		 & __supported_pte_mask & ~(_PAGE_ACCESSED|_PAGE_DIRTY));
+}
+
 static int
 __change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot,
 		   pgprot_t ref_prot)
 {
 	pte_t *kpte;
 	struct page *kpte_page;
-	pgprot_t ref_prot2;
+	pgprot_t old_prot;
 
 	kpte = lookup_address(address);
 	if (!kpte) return 0;
-	kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK);
+	kpte_page = virt_to_page(kpte);
 	BUG_ON(PageLRU(kpte_page));
 	BUG_ON(PageCompound(kpte_page));
-	if (pgprot_val(prot) != pgprot_val(ref_prot)) {
+	old_prot = pte_pgprot(pte_clrhuge(*kpte));
+	if (!pgprot_match(prot, ref_prot)) {
 		if (!pte_huge(*kpte)) {
 			set_pte(kpte, pfn_pte(pfn, prot));
 		} else {
-			/*
-			 * split_large_page will take the reference for this
-			 * change_page_attr on the split page.
-			 */
-			struct page *split;
-			ref_prot2 = pte_pgprot(pte_clrhuge(*kpte));
-			split = split_large_page(address, prot, ref_prot2);
-			if (!split)
+			BUG_ON(!pgprot_match(old_prot, ref_prot));
+			kpte_page = split_large_page(address, prot, ref_prot);
+			if (!kpte_page)
 				return -ENOMEM;
-			pgprot_val(ref_prot2) &= ~_PAGE_NX;
-			set_pte(kpte, mk_pte(split, ref_prot2));
-			kpte_page = split;
+			set_pte(kpte, mk_pte(kpte_page, PAGE_KERNEL_EXEC));
 		}
-		page_private(kpte_page)++;
-	} else if (!pte_huge(*kpte)) {
+		if (pgprot_match(old_prot, ref_prot))
+			page_private(kpte_page)++;
+	} else if (!pgprot_match(ref_prot, old_prot)) {
+		BUG_ON(pte_huge(*kpte));
 		set_pte(kpte, pfn_pte(pfn, ref_prot));
 		BUG_ON(page_private(kpte_page) == 0);
 		page_private(kpte_page)--;
 	} else
-		BUG();
+		return 0;
 
 	/* on x86-64 the direct mapping set at boot is not using 4k pages */
 	BUG_ON(PageReserved(kpte_page));
 
 	save_page(kpte_page);
+	if (page_private(kpte_page) == PTRS_PER_PTE
+	    && address >= __START_KERNEL_map
+	    && address < __START_KERNEL_map + KERNEL_TEXT_SIZE) {
+		unsigned i;
+
+		kpte = page_address(kpte_page);
+		for (i = 0; i < PTRS_PER_PTE; ++i, ++kpte)
+			if (pgprot_match(pte_pgprot(*kpte), prot))
+				page_private(kpte_page)--;
+		kref_prot(address) = ref_prot = prot;
+	}
 	if (page_private(kpte_page) == 0)
 		revert_page(address, ref_prot);
 	return 0;
 }
 
 /*
- * Change the page attributes of an page in the linear mapping.
+ * Change the page attributes of a page in the linear mapping.
  *
  * This should be used when a page is mapped with a different caching policy
  * than write-back somewhere - some CPUs do not like it when mappings with
--- linux-2.6.24-rc5/include/asm-x86/page_32.h	2007-12-12 11:29:30.000000000 +0100
+++ 2.6.24-rc5-x86-change_page_attr/include/asm-x86/page_32.h	2007-12-04 16:01:11.000000000 +0100
@@ -6,6 +6,16 @@
 #define PAGE_SIZE	(1UL << PAGE_SHIFT)
 #define PAGE_MASK	(~(PAGE_SIZE-1))
 
+#ifdef CONFIG_X86_PAE
+#define __PHYSICAL_MASK_SHIFT	52
+#define __PHYSICAL_MASK		((1ULL << __PHYSICAL_MASK_SHIFT) - 1)
+#define PHYSICAL_PAGE_MASK	(~(PAGE_SIZE - 1ULL) & __PHYSICAL_MASK)
+#else
+#define __PHYSICAL_MASK_SHIFT	32
+#define __PHYSICAL_MASK		(~0UL)
+#define PHYSICAL_PAGE_MASK	(PAGE_MASK & __PHYSICAL_MASK)
+#endif
+
 #define LARGE_PAGE_MASK (~(LARGE_PAGE_SIZE-1))
 #define LARGE_PAGE_SIZE (1UL << PMD_SHIFT)
 
--- linux-2.6.24-rc5/include/asm-x86/page_64.h	2007-12-12 11:29:30.000000000 +0100
+++ 2.6.24-rc5-x86-change_page_attr/include/asm-x86/page_64.h	2007-12-04 16:01:11.000000000 +0100
@@ -98,7 +98,7 @@ extern unsigned long phys_base;
 #define PAGE_ALIGN(addr)	(((addr)+PAGE_SIZE-1)&PAGE_MASK)
 
 /* See Documentation/x86_64/mm.txt for a description of the memory map. */
-#define __PHYSICAL_MASK_SHIFT	46
+#define __PHYSICAL_MASK_SHIFT	52
 #define __PHYSICAL_MASK		((_AC(1,UL) << __PHYSICAL_MASK_SHIFT) - 1)
 #define __VIRTUAL_MASK_SHIFT	48
 #define __VIRTUAL_MASK		((_AC(1,UL) << __VIRTUAL_MASK_SHIFT) - 1)
--- linux-2.6.24-rc5/include/asm-x86/pgtable_32.h	2007-12-12 11:29:30.000000000 +0100
+++ 2.6.24-rc5-x86-change_page_attr/include/asm-x86/pgtable_32.h	2007-12-04 16:01:11.000000000 +0100
@@ -228,11 +228,14 @@ static inline int pte_file(pte_t pte) {
 static inline pte_t pte_mkclean(pte_t pte)	{ (pte).pte_low &= ~_PAGE_DIRTY; return pte; }
 static inline pte_t pte_mkold(pte_t pte)	{ (pte).pte_low &= ~_PAGE_ACCESSED; return pte; }
 static inline pte_t pte_wrprotect(pte_t pte)	{ (pte).pte_low &= ~_PAGE_RW; return pte; }
+static inline pte_t pte_clrhuge(pte_t pte)	{ (pte).pte_low &= ~_PAGE_PSE; return pte; }
 static inline pte_t pte_mkdirty(pte_t pte)	{ (pte).pte_low |= _PAGE_DIRTY; return pte; }
 static inline pte_t pte_mkyoung(pte_t pte)	{ (pte).pte_low |= _PAGE_ACCESSED; return pte; }
 static inline pte_t pte_mkwrite(pte_t pte)	{ (pte).pte_low |= _PAGE_RW; return pte; }
 static inline pte_t pte_mkhuge(pte_t pte)	{ (pte).pte_low |= _PAGE_PSE; return pte; }
+#define pte_pgprot(pte) (__pgprot(pte_val(pte) & ~PHYSICAL_PAGE_MASK))
+
 
 #ifdef CONFIG_X86_PAE
 # include <asm/pgtable-3level.h>
 #else
--- linux-2.6.24-rc5/include/asm-x86/pgtable_64.h	2007-12-12 11:29:30.000000000 +0100
+++ 2.6.24-rc5-x86-change_page_attr/include/asm-x86/pgtable_64.h	2007-12-04 16:01:11.000000000 +0100
@@ -344,9 +344,9 @@ static inline int pmd_large(pmd_t pte) {
 #define pfn_pmd(nr,prot)	(__pmd(((nr) << PAGE_SHIFT) | pgprot_val(prot)))
 #define pmd_pfn(x)		((pmd_val(x) & __PHYSICAL_MASK) >> PAGE_SHIFT)
 
-#define pte_to_pgoff(pte) ((pte_val(pte) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT)
+#define pte_to_pgoff(pte) (pte_val(pte) >> PAGE_SHIFT)
 #define pgoff_to_pte(off) ((pte_t) { ((off) << PAGE_SHIFT) | _PAGE_FILE })
-#define PTE_FILE_MAX_BITS __PHYSICAL_MASK_SHIFT
+#define PTE_FILE_MAX_BITS (64 - PAGE_SHIFT)
 
 /* PTE - Level 1 access. */
 