On Tue, 2008-11-04 at 19:12 +0000, Ian Campbell wrote: > On Tue, 2008-11-04 at 17:43 +0100, Bastian Blank wrote: > > On Tue, Nov 04, 2008 at 02:26:33PM +0000, Ian Campbell wrote: > > > On Tue, 2008-11-04 at 14:02 +0100, Bastian Blank wrote: > > > > Maybe it's best to remove the workaround and instead cripple mprotect > > > > to not allow PROT_NONE for now. And then hope that this can't be > > > > triggered by mmap with PROT_NONE. > > > I was thinking of going down the path of removing the workaround then > > > fixing mprotect, so your suggestion would be a consistent first step I > > > think.
This patch makes mprotect work by (very skankily) hacking out large page support, which is unsupported on top of Xen anyway (I think so, currently anyway). I think I took out PAT as collateral damage too. A cleaned-up version without the PAT damage might be an acceptable fix for the mprotect issue. My suspicion is that one of the -xen.c or mach-xen/asm/ files has gotten out of sync with a fix to its native partner, since _PAGE_PSE is used for PROTNONE on native too, so they must get round it somehow. I'll have a scrobble through and see if I can see it. Ian. -- Ian Campbell Once I finally figured out all of life's answers, they changed the questions.
Index: sid-xen/mm/mprotect.c =================================================================== --- sid-xen.orig/mm/mprotect.c 2008-11-05 06:41:55.000000000 +0000 +++ sid-xen/mm/mprotect.c 2008-11-05 06:51:56.000000000 +0000 @@ -39,6 +39,7 @@ { pte_t *pte, oldpte; spinlock_t *ptl; + int debug = !strcmp(current->comm, "mprot"); pte = pte_offset_map_lock(mm, pmd, addr, &ptl); arch_enter_lazy_mmu_mode(); @@ -60,6 +61,9 @@ if (dirty_accountable && pte_dirty(ptent)) ptent = pte_mkwrite(ptent); set_pte_at(mm, addr, pte, ptent); + if (debug) + printk(KERN_CRIT "change present pte @ %p %#lx -> %#lx\n", + pte, oldpte.pte, ptent.pte); #ifdef CONFIG_MIGRATION } else if (!pte_file(oldpte)) { swp_entry_t entry = pte_to_swp_entry(oldpte); @@ -227,6 +231,7 @@ { unsigned long vm_flags, nstart, end, tmp, reqprot; struct vm_area_struct *vma, *prev; + int debug = !strcmp(current->comm, "mprot"); int error = -EINVAL; const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP); prot &= ~(PROT_GROWSDOWN|PROT_GROWSUP); @@ -280,6 +285,8 @@ if (start > vma->vm_start) prev = vma; + + for (nstart = start ; ; ) { unsigned long newflags; @@ -287,6 +294,10 @@ newflags = vm_flags | (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC)); + if (debug) + printk(KERN_CRIT "mprotect(%s) vma:%p %#lx-%#lx flags:%#lx->%#lx new prot %#lx\n", current->comm, + vma, vma->vm_start, vma->vm_end, vma->vm_flags, newflags, prot); + /* newflags >> 4 shift VM_MAY% in place of VM_% */ if ((newflags & ~(newflags >> 4)) & (VM_READ | VM_WRITE | VM_EXEC)) { error = -EACCES; Index: sid-xen/arch/x86/mm/dump_pagetables.c =================================================================== --- sid-xen.orig/arch/x86/mm/dump_pagetables.c 2008-11-05 06:41:22.000000000 +0000 +++ sid-xen/arch/x86/mm/dump_pagetables.c 2008-11-05 06:44:14.000000000 +0000 @@ -98,15 +98,15 @@ /* Bit 9 has a different meaning on level 3 vs 4 */ if (level <= 3) { - if (pr & _PAGE_PSE) - seq_printf(m, "PSE "); - else - seq_printf(m, " "); +// if (pr & 
_PAGE_PSE) +// seq_printf(m, "PSE "); +// else +// seq_printf(m, " "); } else { - if (pr & _PAGE_PAT) - seq_printf(m, "pat "); - else - seq_printf(m, " "); +// if (pr & _PAGE_PAT) +// seq_printf(m, "pat "); +// else +// seq_printf(m, " "); } if (pr & _PAGE_GLOBAL) seq_printf(m, "GLB "); Index: sid-xen/arch/x86/mm/ioremap-xen.c =================================================================== --- sid-xen.orig/arch/x86/mm/ioremap-xen.c 2008-11-05 06:41:22.000000000 +0000 +++ sid-xen/arch/x86/mm/ioremap-xen.c 2008-11-05 06:44:14.000000000 +0000 @@ -255,12 +255,12 @@ default: err = _set_memory_uc(vaddr, nrpages); break; - case _PAGE_CACHE_WC: - err = _set_memory_wc(vaddr, nrpages); - break; - case _PAGE_CACHE_WB: - err = _set_memory_wb(vaddr, nrpages); - break; + //case _PAGE_CACHE_WC: + //err = _set_memory_wc(vaddr, nrpages); + //break; + //case _PAGE_CACHE_WB: + //err = _set_memory_wb(vaddr, nrpages); + //break; } return err; @@ -340,7 +340,7 @@ * - request is uc-, return cannot be write-combine * - request is write-combine, return cannot be write-back */ - if ((prot_val == _PAGE_CACHE_UC_MINUS && +/* if ((prot_val == _PAGE_CACHE_UC_MINUS && (new_prot_val == _PAGE_CACHE_WB || new_prot_val == _PAGE_CACHE_WC)) || (prot_val == _PAGE_CACHE_WC && @@ -353,6 +353,7 @@ free_memtype(phys_addr, phys_addr + size); return NULL; } +*/ prot_val = new_prot_val; } @@ -364,12 +365,12 @@ case _PAGE_CACHE_UC_MINUS: prot = PAGE_KERNEL_UC_MINUS; break; - case _PAGE_CACHE_WC: - prot = PAGE_KERNEL_WC; - break; - case _PAGE_CACHE_WB: - prot = PAGE_KERNEL; - break; +// case _PAGE_CACHE_WC: +// prot = PAGE_KERNEL_WC; +// break; +// case _PAGE_CACHE_WB: +// prot = PAGE_KERNEL; +// break; } /* @@ -446,8 +447,9 @@ void __iomem *ioremap_wc(unsigned long phys_addr, unsigned long size) { if (pat_wc_enabled) - return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WC, - __builtin_return_address(0)); + BUG(); + //return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WC, + // 
__builtin_return_address(0)); else return ioremap_nocache(phys_addr, size); } Index: sid-xen/arch/x86/mm/pageattr-xen.c =================================================================== --- sid-xen.orig/arch/x86/mm/pageattr-xen.c 2008-11-05 06:41:22.000000000 +0000 +++ sid-xen/arch/x86/mm/pageattr-xen.c 2008-11-05 06:44:14.000000000 +0000 @@ -504,8 +504,9 @@ #ifdef CONFIG_X86_64 if (level == PG_LEVEL_1G) { + BUG(); mfninc = PMD_PAGE_SIZE >> PAGE_SHIFT; - pgprot_val(ref_prot) |= _PAGE_PSE; +// pgprot_val(ref_prot) |= _PAGE_PSE; } #endif @@ -714,7 +715,7 @@ static inline int cache_attr(pgprot_t attr) { return pgprot_val(attr) & - (_PAGE_PAT | _PAGE_PAT_LARGE | _PAGE_PWT | _PAGE_PCD); + (/*_PAGE_PAT | _PAGE_PAT_LARGE |*/ _PAGE_PWT | _PAGE_PCD); } static int change_page_attr_set_clr(unsigned long addr, int numpages, @@ -819,18 +820,21 @@ int _set_memory_wc(unsigned long addr, int numpages) { - return change_page_attr_set(addr, numpages, - __pgprot(_PAGE_CACHE_WC)); + BUG(); + return 0; + //return change_page_attr_set(addr, numpages, + // __pgprot(_PAGE_CACHE_WC)); } int set_memory_wc(unsigned long addr, int numpages) { - if (!pat_wc_enabled) - return set_memory_uc(addr, numpages); - - if (reserve_memtype(addr, addr + numpages * PAGE_SIZE, - _PAGE_CACHE_WC, NULL)) - return -EINVAL; + BUG(); +// if (!pat_wc_enabled) +// return set_memory_uc(addr, numpages); +// +// aif (reserve_memtype(addr, addr + numpages * PAGE_SIZE, +// _PAGE_CACHE_WC, NULL)) +// return -EINVAL; return _set_memory_wc(addr, numpages); } Index: sid-xen/arch/x86/mm/pat-xen.c =================================================================== --- sid-xen.orig/arch/x86/mm/pat-xen.c 2008-11-05 06:41:22.000000000 +0000 +++ sid-xen/arch/x86/mm/pat-xen.c 2008-11-05 06:44:14.000000000 +0000 @@ -116,9 +116,9 @@ case _PAGE_CACHE_UC: return "uncached"; case _PAGE_CACHE_UC_MINUS: return "uncached-minus"; case _PAGE_CACHE_WB: return "write-back"; - case _PAGE_CACHE_WC: return "write-combining"; - case 
_PAGE_CACHE_WP: return "write-protected"; - case _PAGE_CACHE_WT: return "write-through"; + // case _PAGE_CACHE_WC: return "write-combining"; + //case _PAGE_CACHE_WP: return "write-protected"; + //case _PAGE_CACHE_WT: return "write-through"; default: return "broken"; } } @@ -172,16 +172,16 @@ * Consistency checks with other PAT requests is done later * while going through memtype list. */ - if (pat_type == _PAGE_CACHE_WC) { - *ret_prot = prot | _PAGE_CACHE_WC; - return 0; - } else if (pat_type == _PAGE_CACHE_UC_MINUS) { - *ret_prot = prot | _PAGE_CACHE_UC_MINUS; - return 0; - } else if (pat_type == _PAGE_CACHE_UC) { - *ret_prot = prot | _PAGE_CACHE_UC; - return 0; - } +// if (pat_type == _PAGE_CACHE_WC) { +// *ret_prot = prot | _PAGE_CACHE_WC; +// return 0; +// } else if (pat_type == _PAGE_CACHE_UC_MINUS) { +// *ret_prot = prot | _PAGE_CACHE_UC_MINUS; +// return 0; +// } else if (pat_type == _PAGE_CACHE_UC) { +// *ret_prot = prot | _PAGE_CACHE_UC; +// return 0; +// } /* * Look for MTRR hint to get the effective type in case where PAT @@ -192,7 +192,8 @@ if (mtrr_type == MTRR_TYPE_UNCACHABLE) { *ret_prot = prot | _PAGE_CACHE_UC; } else if (mtrr_type == MTRR_TYPE_WRCOMB) { - *ret_prot = prot | _PAGE_CACHE_WC; + //*ret_prot = prot | _PAGE_CACHE_WC; + BUG(); } else { *ret_prot = prot | _PAGE_CACHE_WB; } Index: sid-xen/arch/x86/xen/mmu.c =================================================================== --- sid-xen.orig/arch/x86/xen/mmu.c 2008-11-05 06:41:22.000000000 +0000 +++ sid-xen/arch/x86/xen/mmu.c 2008-11-05 06:44:14.000000000 +0000 @@ -156,6 +156,7 @@ void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pteval) { + BUG(); /* updates to init_mm may be done without lock */ if (mm == &init_mm) preempt_disable(); Index: sid-xen/mm/memory.c =================================================================== --- sid-xen.orig/mm/memory.c 2008-11-05 06:41:41.000000000 +0000 +++ sid-xen/mm/memory.c 2008-11-05 06:44:14.000000000 +0000 @@ 
-2327,6 +2327,7 @@ struct page *page; spinlock_t *ptl; pte_t entry; + int debug = !strcmp(current->comm, "mprot"); /* Allocate our own private page. */ pte_unmap(page_table); @@ -2342,7 +2343,9 @@ goto oom_free_page; entry = mk_pte(page, vma->vm_page_prot); + if (debug) printk("entry %#lx\n", entry.pte); entry = maybe_mkwrite(pte_mkdirty(entry), vma); + if (debug) printk("maybe_mkwrite %#lx\n", entry.pte); page_table = pte_offset_map_lock(mm, pmd, address, &ptl); if (!pte_none(*page_table)) @@ -2350,8 +2353,9 @@ inc_mm_counter(mm, anon_rss); lru_cache_add_active(page); page_add_new_anon_rmap(page, vma, address); + if (debug) printk("set pte at %#lx %p %#lx\n", address, page_table, entry.pte); set_pte_at(mm, address, page_table, entry); - + if (debug) printk("set pte ok\n"); /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, address, entry); unlock: @@ -2645,18 +2649,33 @@ { pte_t entry; spinlock_t *ptl; + int debug = !strcmp(current->comm, "mprot"); entry = *pte; + if (debug) { + printk(KERN_CRIT "pte fault on %#lx @ %p\n", entry.pte, pte); + printk(KERN_CRIT "vma %p: %#lx-%#lx %#lx\n", vma, vma->vm_start, vma->vm_end, vma->vm_flags); + } if (!pte_present(entry)) { + if (debug) printk(KERN_CRIT "pte not present\n"); if (pte_none(entry)) { + if (debug) printk(KERN_CRIT "pte is none\n"); if (vma->vm_ops) { - if (likely(vma->vm_ops->fault)) + if (likely(vma->vm_ops->fault)) { + if (debug) { + printk(KERN_CRIT "handle via vm_ops->fault %pF\n", vma->vm_ops->fault); + } return do_linear_fault(mm, vma, address, pte, pmd, write_access, entry); + } if (unlikely(vma->vm_ops->nopfn)) + if (debug) { + printk(KERN_CRIT "handle via vm_ops->nopfn %pF\n\n", vma->vm_ops->nopfn); + } return do_no_pfn(mm, vma, address, pte, pmd, write_access); } + if (debug) printk(KERN_CRIT "handle as anonymous page\n"); return do_anonymous_page(mm, vma, address, pte, pmd, write_access); } @@ -2665,6 +2684,8 @@ pte, pmd, write_access, entry); return do_swap_page(mm, vma, 
address, pte, pmd, write_access, entry); + } else { + if (debug) printk(KERN_CRIT "pte is present\n"); } ptl = pte_lockptr(mm, pmd); Index: sid-xen/include/asm-x86/mach-xen/asm/pgtable.h =================================================================== --- sid-xen.orig/include/asm-x86/mach-xen/asm/pgtable.h 2008-11-05 06:54:23.000000000 +0000 +++ sid-xen/include/asm-x86/mach-xen/asm/pgtable.h 2008-11-05 06:55:53.000000000 +0000 @@ -11,14 +11,14 @@ #define _PAGE_BIT_ACCESSED 5 /* was accessed (raised by CPU) */ #define _PAGE_BIT_DIRTY 6 /* was written to (raised by CPU) */ #define _PAGE_BIT_FILE 6 -#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */ -#define _PAGE_BIT_PAT 7 /* on 4KB pages */ +#define _PAGE_BIT_PSE_ 7 /* 4 MB (or 2MB) page */ +#define _PAGE_BIT_PAT_ 7 /* on 4KB pages */ #define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */ #define _PAGE_BIT_IO 9 /* Mapped page is I/O or foreign and * has no associated page struct. */ #define _PAGE_BIT_UNUSED2 10 /* available for programmer */ #define _PAGE_BIT_UNUSED3 11 -#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */ +#define _PAGE_BIT_PAT_LARGE_ 12 /* On 2MB or 1GB pages */ #define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ /* @@ -33,13 +33,13 @@ #define _PAGE_PCD (_AC(1, L)<<_PAGE_BIT_PCD) #define _PAGE_ACCESSED (_AC(1, L)<<_PAGE_BIT_ACCESSED) #define _PAGE_DIRTY (_AC(1, L)<<_PAGE_BIT_DIRTY) -#define _PAGE_PSE (_AC(1, L)<<_PAGE_BIT_PSE) /* 2MB page */ +#define _PAGE_PSE_ (_AC(1, L)<<_PAGE_BIT_PSE_) /* 2MB page */ #define _PAGE_GLOBAL (_AC(1, L)<<_PAGE_BIT_GLOBAL) /* Global TLB entry */ #define _PAGE_IO (_AC(1, L)<<_PAGE_BIT_IO) #define _PAGE_UNUSED2 (_AC(1, L)<<_PAGE_BIT_UNUSED2) #define _PAGE_UNUSED3 (_AC(1, L)<<_PAGE_BIT_UNUSED3) -#define _PAGE_PAT (_AC(1, L)<<_PAGE_BIT_PAT) -#define _PAGE_PAT_LARGE (_AC(1, L)<<_PAGE_BIT_PAT_LARGE) +#define _PAGE_PAT_ (_AC(1, L)<<_PAGE_BIT_PAT_) +#define _PAGE_PAT_LARGE_ (_AC(1, L)<<_PAGE_BIT_PAT_LARGE_) #if defined(CONFIG_X86_64) || 
defined(CONFIG_X86_PAE) #define _PAGE_NX (_AC(1, ULL) << _PAGE_BIT_NX) @@ -50,7 +50,7 @@ /* If _PAGE_PRESENT is clear, we use these: */ #define _PAGE_FILE _PAGE_DIRTY /* nonlinear file mapping, * saved PTE; unset:swap */ -#define _PAGE_PROTNONE _PAGE_PSE /* if the user mapped it with PROT_NONE; +#define _PAGE_PROTNONE _PAGE_PSE_ /* if the user mapped it with PROT_NONE; pte_present gives true */ #ifndef __ASSEMBLY__ @@ -74,11 +74,11 @@ * PAT settings are part of the hypervisor interface, which sets the * MSR to 0x050100070406 (i.e. WB, WT, UC-, UC, WC, WP [, UC, UC]). */ -#define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT | _PAGE_PAT) +#define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT /*| _PAGE_PAT*/) #define _PAGE_CACHE_WB (0) #define _PAGE_CACHE_WT (_PAGE_PWT) -#define _PAGE_CACHE_WC (_PAGE_PAT) -#define _PAGE_CACHE_WP (_PAGE_PAT | _PAGE_PWT) +//#define _PAGE_CACHE_WC (_PAGE_PAT) +//#define _PAGE_CACHE_WP (_PAGE_PAT | _PAGE_PWT) #define _PAGE_CACHE_UC_MINUS (_PAGE_PCD) #define _PAGE_CACHE_UC (_PAGE_PCD | _PAGE_PWT) @@ -120,8 +120,8 @@ #define __PAGE_KERNEL_UC_MINUS (__PAGE_KERNEL | _PAGE_PCD) #define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER) #define __PAGE_KERNEL_VSYSCALL_NOCACHE (__PAGE_KERNEL_VSYSCALL | _PAGE_PCD | _PAGE_PWT) -#define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE) -#define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE) +#define __PAGE_KERNEL_LARGE (__PAGE_KERNEL /*| _PAGE_PSE*/) +#define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC /*| _PAGE_PSE*/) /* * We don't support GLOBAL page in xenolinux64 @@ -198,7 +198,8 @@ static inline int pte_huge(pte_t pte) { - return __pte_val(pte) & _PAGE_PSE; + return 0; + //return __pte_val(pte) & _PAGE_PSE; } static inline int pte_global(pte_t pte) @@ -218,8 +219,9 @@ static inline int pmd_large(pmd_t pte) { - return (__pmd_val(pte) & (_PAGE_PSE | _PAGE_PRESENT)) == - (_PAGE_PSE | _PAGE_PRESENT); + return 0; + //return (__pmd_val(pte) & (_PAGE_PSE | _PAGE_PRESENT)) == + //x (_PAGE_PSE | 
_PAGE_PRESENT); } static inline pte_t pte_mkclean(pte_t pte) @@ -259,12 +261,16 @@ static inline pte_t pte_mkhuge(pte_t pte) { - return __pte_ma(__pte_val(pte) | _PAGE_PSE); + BUG(); + return pte; + //return __pte_ma(__pte_val(pte) | _PAGE_PSE); } static inline pte_t pte_clrhuge(pte_t pte) { - return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_PSE); + BUG(); + return pte; + //return __pte_ma(__pte_val(pte) & ~(pteval_t)_PAGE_PSE); } static inline pte_t pte_mkglobal(pte_t pte)
signature.asc
Description: This is a digitally signed message part