On Mon, 2015-03-23 at 20:30 +0530, Aneesh Kumar K.V wrote:

> -static inline pte_t *lookup_linux_ptep(pgd_t *pgdir, unsigned long hva,
> +static inline pte_t lookup_linux_pte(pgd_t *pgdir, unsigned long hva,
>  				     unsigned long *pte_sizep)
>  {
>  	pte_t *ptep;
>  	unsigned long ps = *pte_sizep;
>  	unsigned int shift;
> +	unsigned long flags;
>  
> +	local_irq_save(flags);
>  	ptep = find_linux_pte_or_hugepte(pgdir, hva, &shift);
>  	if (!ptep)
> -		return NULL;
> +		goto err_out;
>  	if (shift)
>  		*pte_sizep = 1ul << shift;
>  	else
>  		*pte_sizep = PAGE_SIZE;
>  
>  	if (ps > *pte_sizep)
> -		return NULL;
> +		goto err_out;
>  
> -	return ptep;
> +	local_irq_restore(flags);
> +	return *ptep;
> +err_out:
> +	local_irq_restore(flags);
> +	return __pte(0);
>  }

Doesn't the above go backwards vs. your explanation? I.e. you capture the
PTE *value* inside the lock, then drop the lock with no check for splitting
or anything, so you can potentially return a stale PTE. Do you assume the
caller will check for splitting? It doesn't...
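If the caller is expected to cope with that, I'd want to see something
like this around the lookup (rough sketch only, along the lines of what
kvmppc_do_h_enter already relies on via mmu_notifier_retry; the names are
from your e500 hunk and the retry label is hypothetical):

	mmu_seq = kvm->mmu_notifier_seq;
	smp_rmb();

	/* the value we get back can go stale the moment irqs are back on */
	pte = lookup_linux_pte(pgdir, hva, &pte_size);

	/* ... compute wimg/pfn etc. from the snapshot ... */

	/* revalidate under the mmu_lock before the result is used */
	spin_lock(&kvm->mmu_lock);
	if (mmu_notifier_retry(kvm, mmu_seq)) {
		/* a split/collapse or unmap ran in between, redo the lookup */
		spin_unlock(&kvm->mmu_lock);
		goto retry;
	}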
>  #endif /* __ASSEMBLY__ */
>  
> diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
> index 3b2252e7731b..c2b44f250824 100644
> --- a/arch/powerpc/kernel/eeh.c
> +++ b/arch/powerpc/kernel/eeh.c
> @@ -326,17 +326,21 @@ void eeh_slot_error_detail(struct eeh_pe *pe, int severity)
>  static inline unsigned long eeh_token_to_phys(unsigned long token)
>  {
>  	pte_t *ptep;
> -	unsigned long pa;
> +	unsigned long pa, flags;
>  	int hugepage_shift;
>  
>  	/*
>  	 * We won't find hugepages here, iomem
>  	 */
> +	local_irq_save(flags);
>  	ptep = find_linux_pte_or_hugepte(init_mm.pgd, token, &hugepage_shift);
> -	if (!ptep)
> +	if (!ptep) {
> +		local_irq_restore(flags);
>  		return token;
> +	}
>  	WARN_ON(hugepage_shift);
>  	pa = pte_pfn(*ptep) << PAGE_SHIFT;
> +	local_irq_restore(flags);

Isn't this the same as lookup_linux_pte? Can't you just use that and keep a
WARN_ON(psize != PAGE_SIZE)? (See the sketch after these quoted hunks.)

>  	return pa | (token & (PAGE_SIZE-1));
>  }

> diff --git a/arch/powerpc/kernel/io-workarounds.c b/arch/powerpc/kernel/io-workarounds.c
> index 24b968f8e4d8..bd958d93fd38 100644
> --- a/arch/powerpc/kernel/io-workarounds.c
> +++ b/arch/powerpc/kernel/io-workarounds.c
> @@ -65,6 +65,7 @@ struct iowa_bus *iowa_mem_find_bus(const PCI_IO_ADDR addr)
>  	if (token && token <= iowa_bus_count)
>  		bus = &iowa_busses[token - 1];
>  	else {
> +		unsigned long flags;
>  		unsigned long vaddr, paddr;
>  		pte_t *ptep;
>  
> @@ -72,6 +73,7 @@ struct iowa_bus *iowa_mem_find_bus(const PCI_IO_ADDR addr)
>  		if (vaddr < PHB_IO_BASE || vaddr >= PHB_IO_END)
>  			return NULL;
>  
> +		local_irq_save(flags);
>  		ptep = find_linux_pte_or_hugepte(init_mm.pgd, vaddr,
>  						 &hugepage_shift);
>  		if (ptep == NULL)
> @@ -83,6 +85,7 @@ struct iowa_bus *iowa_mem_find_bus(const PCI_IO_ADDR addr)
>  			WARN_ON(hugepage_shift);
>  			paddr = pte_pfn(*ptep) << PAGE_SHIFT;
>  		}
> +		local_irq_restore(flags);
>  		bus = iowa_pci_find(vaddr, paddr);

Same.

>  		if (bus == NULL)

> diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
> index 534acb3c6c3d..2dccaa6a1113 100644
> --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
> +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
> @@ -537,14 +537,15 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
>  	}
>  	/* if the guest wants write access, see if that is OK */
>  	if (!writing && hpte_is_writable(r)) {
> +		unsigned long flags;
>  		unsigned int hugepage_shift;
>  		pte_t *ptep, pte;
>  
>  		/*
>  		 * We need to protect against page table destruction
> -		 * while looking up and updating the pte.
> +		 * hugepage split and collapse.
>  		 */
> -		rcu_read_lock_sched();
> +		local_irq_save(flags);
>  		ptep = find_linux_pte_or_hugepte(current->mm->pgd,
>  						 hva, &hugepage_shift);
>  		if (ptep) {
> @@ -553,7 +554,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
>  			if (pte_write(pte))
>  				write_ok = 1;
>  		}
> -		rcu_read_unlock_sched();
> +		local_irq_restore(flags);
>  	}
>  }

Ack.
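To be concrete about the eeh case, something along these lines is what I
have in mind (untested sketch; it assumes lookup_linux_pte ends up somewhere
eeh.c can see it, and that it keeps the in/out pte_sizep argument from your
first hunk):

	static inline unsigned long eeh_token_to_phys(unsigned long token)
	{
		unsigned long pa;
		unsigned long pte_size = PAGE_SIZE;
		pte_t pte;

		/* lookup_linux_pte does the irq save/restore for us */
		pte = lookup_linux_pte(init_mm.pgd, token, &pte_size);
		if (!pte_present(pte))
			return token;
		/* we won't find hugepages here, iomem */
		WARN_ON(pte_size != PAGE_SIZE);
		pa = pte_pfn(pte) << PAGE_SHIFT;
		return pa | (token & (PAGE_SIZE - 1));
	}

Same deal for iowa_mem_find_bus().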
> diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
> index 625407e4d3b0..b0ad673b35cf 100644
> --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
> +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
> @@ -24,14 +24,19 @@
>  /* Translate address of a vmalloc'd thing to a linear map address */
>  static void *real_vmalloc_addr(void *x)
>  {
> +	unsigned long flags;
>  	unsigned long addr = (unsigned long) x;
>  	pte_t *p;
>  
> +	local_irq_save(flags);
>  	p = find_linux_pte_or_hugepte(swapper_pg_dir, addr, NULL);
> -	if (!p || !pte_present(*p))
> +	if (!p || !pte_present(*p)) {
> +		local_irq_restore(flags);
>  		return NULL;
> +	}
>  	/* assume we don't have huge pages in vmalloc space... */
>  	addr = (pte_pfn(*p) << PAGE_SHIFT) | (addr & ~PAGE_MASK);
> +	local_irq_restore(flags);
>  	return __va(addr);
>  }

This is called in real mode; I don't like the debug stuff that can happen
inside local_irq_* in that context. Can you either bounce that up to the
higher level callers (and so not do it in real mode at all) or use a lower
level variant? I prefer going for the callers. (Sketch of the lower level
option after the quoted hunks below.)

> @@ -134,20 +139,27 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index,
>  static pte_t lookup_linux_pte_and_update(pgd_t *pgdir, unsigned long hva,
>  			      int writing, unsigned long *pte_sizep)
>  {
> -	pte_t *ptep;
> +	pte_t *ptep, pte;
> +	unsigned long flags;
>  	unsigned long ps = *pte_sizep;
>  	unsigned int hugepage_shift;
>  
> +	local_irq_save(flags);
>  	ptep = find_linux_pte_or_hugepte(pgdir, hva, &hugepage_shift);
>  	if (!ptep)
> -		return __pte(0);
> +		goto err_out;
>  	if (hugepage_shift)
>  		*pte_sizep = 1ul << hugepage_shift;
>  	else
>  		*pte_sizep = PAGE_SIZE;
>  	if (ps > *pte_sizep)
> -		return __pte(0);
> -	return kvmppc_read_update_linux_pte(ptep, writing, hugepage_shift);
> +		goto err_out;
> +	pte = kvmppc_read_update_linux_pte(ptep, writing, hugepage_shift);
> +	local_irq_restore(flags);
> +	return pte;
> +err_out:
> +	local_irq_restore(flags);
> +	return __pte(0);
>  }

Ditto.

>  static inline void unlock_hpte(__be64 *hpte, unsigned long hpte_v)
> @@ -211,6 +223,10 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
>  
>  	/* Look up the Linux PTE for the backing page */
>  	pte_size = psize;
> +	/*
> +	 * If we had a page table change after lookup, we would
> +	 * retry via mmu_notifier_retry.
> +	 */
>  	pte = lookup_linux_pte_and_update(pgdir, hva, writing, &pte_size);
>  	if (pte_present(pte) && !pte_protnone(pte)) {
>  		if (writing && !pte_write(pte))

> diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c
> index cc536d4a75ef..b50778eee650 100644
> --- a/arch/powerpc/kvm/e500_mmu_host.c
> +++ b/arch/powerpc/kvm/e500_mmu_host.c
> @@ -335,7 +335,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
>  	unsigned long mmu_seq;
>  	struct kvm *kvm = vcpu_e500->vcpu.kvm;
>  	unsigned long tsize_pages = 0;
> -	pte_t *ptep;
> +	pte_t pte;
>  	unsigned int wimg = 0;
>  	pgd_t *pgdir;
>  
> @@ -468,9 +468,16 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
>  
>  
>  	pgdir = vcpu_e500->vcpu.arch.pgdir;
> -	ptep = lookup_linux_ptep(pgdir, hva, &tsize_pages);
> -	if (pte_present(*ptep))
> -		wimg = (*ptep >> PTE_WIMGE_SHIFT) & MAS2_WIMGE_MASK;
> +	/*
> +	 * We are just looking at the wimg bits, so we don't
> +	 * care much about the trans splitting bit.
> +	 * How about hugepage collapse? The pfn will change,
> +	 * maybe we should hold mmap_sem? We also don't
> +	 * have a page reference count here.
> +	 */
> +	pte = lookup_linux_pte(pgdir, hva, &tsize_pages);
> +	if (pte_present(pte))
> +		wimg = (pte >> PTE_WIMGE_SHIFT) & MAS2_WIMGE_MASK;
>  	else {
>  		if (printk_ratelimit())
>  			pr_err("%s: pte not present: gfn %lx, pfn %lx\n",
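For the lower level variant I mentioned for real_vmalloc_addr, this is the
sort of thing I mean (rough sketch; the raw_ flavour skips the lockdep and
irq-tracing hooks, which is exactly the debug stuff I'm worried about in
real mode):

	/* Translate address of a vmalloc'd thing to a linear map address */
	static void *real_vmalloc_addr(void *x)
	{
		unsigned long addr = (unsigned long) x;
		unsigned long flags;
		pte_t *p;

		/* raw_ variant: no lockdep/tracing, usable in real mode */
		raw_local_irq_save(flags);
		p = find_linux_pte_or_hugepte(swapper_pg_dir, addr, NULL);
		if (!p || !pte_present(*p)) {
			raw_local_irq_restore(flags);
			return NULL;
		}
		/* assume we don't have huge pages in vmalloc space... */
		addr = (pte_pfn(*p) << PAGE_SHIFT) | (addr & ~PAGE_MASK);
		raw_local_irq_restore(flags);
		return __va(addr);
	}

That said, I'd still rather the callers disabled interrupts before we enter
real mode, so that none of this happens down there at all.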
> diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
> index 7e408bfc7948..92f01649450a 100644
> --- a/arch/powerpc/mm/hugetlbpage.c
> +++ b/arch/powerpc/mm/hugetlbpage.c
> @@ -108,8 +108,17 @@ int pgd_huge(pgd_t pgd)
>  
>  pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
>  {
> -	/* Only called for hugetlbfs pages, hence can ignore THP */
> -	return find_linux_pte_or_hugepte(mm->pgd, addr, NULL);
> +	unsigned long flags;
> +	pte_t *ptep;
> +	/*
> +	 * Only called for hugetlbfs pages, hence can ignore THP.
> +	 * The save and restore are there only for avoiding the warning
> +	 * in find_linux_pte_or_hugepte().
> +	 */
> +	local_irq_save(flags);
> +	ptep = find_linux_pte_or_hugepte(mm->pgd, addr, NULL);
> +	local_irq_restore(flags);
> +	return ptep;
>  }
>  
>  static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
> @@ -681,28 +690,35 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb,
>  	} while (addr = next, addr != end);
>  }
>  
> +/*
> + * We are holding mmap_sem, so a parallel huge page collapse cannot run.
> + * To prevent hugepage split, disable irq.
> + */
>  struct page *
>  follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
>  {
>  	pte_t *ptep;
>  	struct page *page;
>  	unsigned shift;
> -	unsigned long mask;
> +	unsigned long mask, flags;
>  	/*
>  	 * Transparent hugepages are handled by generic code. We can skip them
>  	 * here.
>  	 */
> +	local_irq_save(flags);
>  	ptep = find_linux_pte_or_hugepte(mm->pgd, address, &shift);
>  
>  	/* Verify it is a huge page else bail. */
> -	if (!ptep || !shift || pmd_trans_huge(*(pmd_t *)ptep))
> +	if (!ptep || !shift || pmd_trans_huge(*(pmd_t *)ptep)) {
> +		local_irq_restore(flags);
>  		return ERR_PTR(-EINVAL);
> -
> +	}
>  	mask = (1UL << shift) - 1;
>  	page = pte_page(*ptep);
>  	if (page)
>  		page += (address & mask) / PAGE_SIZE;
>  
> +	local_irq_restore(flags);
>  	return page;
>  }
>  
> @@ -949,6 +965,8 @@ void flush_dcache_icache_hugepage(struct page *page)
>   *
>   * So long as we atomically load page table pointers we are safe against teardown,
>   * we can follow the address down to the page and take a ref on it.
> + * Should be called with irq disabled. If it is a splitting hugepage, we will
> + * return NULL.
>   */
>  
>  pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift)
> @@ -963,6 +981,11 @@ pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift)
>  	if (shift)
>  		*shift = 0;
>  
> +	if (!irqs_disabled()) {
> +		pr_info("%s called with irq enabled\n", __func__);
> +		dump_stack();
> +	}
> +

Might want to use the arch_* variant for similar "beware of generic debug
stuff in real mode" reasons...
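I.e. something like this (sketch; arch_irqs_disabled() doesn't go through
the generic irq-tracing/lockdep machinery):

	/* arch_ variant avoids the generic debug hooks in real mode */
	if (!arch_irqs_disabled()) {
		pr_info("%s called with irq enabled\n", __func__);
		dump_stack();
	}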
>  	pgdp = pgdir + pgd_index(ea);
>  	pgd  = ACCESS_ONCE(*pgdp);
>  	/*

> diff --git a/arch/powerpc/perf/callchain.c b/arch/powerpc/perf/callchain.c
> index 2396dda282cd..173e4abc0c56 100644
> --- a/arch/powerpc/perf/callchain.c
> +++ b/arch/powerpc/perf/callchain.c
> @@ -125,6 +125,7 @@ static int read_user_stack_slow(void __user *ptr, void *ret, int nb)
>  	if (!pgdir)
>  		return -EFAULT;
>  
> +	/* FIXME!! I guess we come with irq disabled here ? */

Not that I know of, that's not guaranteed.

>  	ptep = find_linux_pte_or_hugepte(pgdir, addr, &shift);
>  	if (!shift)
>  		shift = PAGE_SHIFT;
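So rather than that FIXME, read_user_stack_slow should probably do its own
save/restore around the walk. Something like this (rough sketch only;
assumes an unsigned long flags local):

	local_irq_save(flags);
	ptep = find_linux_pte_or_hugepte(pgdir, addr, &shift);
	if (!ptep) {
		local_irq_restore(flags);
		return -EFAULT;
	}
	if (!shift)
		shift = PAGE_SHIFT;
	/* ... read the value out of the page ... */
	local_irq_restore(flags);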