CPU 1                           CPU 2                                   CPU 3

mremap(old_addr, new_addr)      page_shrinker/try_to_unmap_one

mmap_write_lock_killable()

                                addr = old_addr
                                lock(pte_ptl)
lock(pmd_ptl)
pmd = *old_pmd
pmd_clear(old_pmd)
flush_tlb_range(old_addr)

*new_pmd = pmd
                                                                        
                                                                        *new_addr = 10; and fills
                                                                        TLB with new addr
                                                                        and old pfn

unlock(pmd_ptl)
                                ptep_clear_flush()
                                old pfn is free.
                                                                        Stale TLB entry

Fix this race by taking the pmd lock in the pageout path (page_vma_mapped_walk()) as well.
This still doesn't handle the race between MOVE_PUD and pageout.
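
For reference, the mremap() side of the race is the MOVE_PMD fast path added by the
Fixes commit. Below is a simplified sketch of that path (not part of this patch;
paraphrased from move_normal_pmd() in mm/mremap.c with argument checks and error
handling trimmed, and ordered as in the diagram above) showing why only the pmd lock
serializes against it:

	static bool move_normal_pmd_sketch(struct vm_area_struct *vma,
			unsigned long old_addr, unsigned long new_addr,
			pmd_t *old_pmd, pmd_t *new_pmd)
	{
		struct mm_struct *mm = vma->vm_mm;
		spinlock_t *old_ptl, *new_ptl;
		pmd_t pmd;

		/* CPU 1: lock(pmd_ptl) in the diagram above */
		old_ptl = pmd_lock(mm, old_pmd);
		new_ptl = pmd_lockptr(mm, new_pmd);
		if (new_ptl != old_ptl)
			spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);

		/* CPU 1: pmd = *old_pmd; pmd_clear(old_pmd); flush_tlb_range(old_addr) */
		pmd = *old_pmd;
		pmd_clear(old_pmd);
		flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE);

		/*
		 * CPU 1: *new_pmd = pmd.  From this point CPU 3 can fault on
		 * new_addr and cache a TLB entry for the old pfn, while CPU 2's
		 * rmap walk, holding only pte_ptl, never observes the move.
		 */
		set_pmd_at(mm, new_addr, new_pmd, pmd);

		if (new_ptl != old_ptl)
			spin_unlock(new_ptl);
		spin_unlock(old_ptl);		/* CPU 1: unlock(pmd_ptl) */
		return true;
	}

With page_vma_mapped_walk() also taking the pmd lock, the rmap walk on CPU 2 can no
longer run concurrently with the pmd move on CPU 1, closing the window shown above.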

Fixes: 2c91bd4a4e2e ("mm: speed up mremap by 20x on large regions")
Link: https://lore.kernel.org/linux-mm/CAHk-=wgxvr04ebntxqfevontwnp6fdm+oj5vauqxp3s-huw...@mail.gmail.com
Signed-off-by: Aneesh Kumar K.V <aneesh.ku...@linux.ibm.com>
---
 include/linux/rmap.h |  9 ++++++---
 mm/page_vma_mapped.c | 36 ++++++++++++++++++------------------
 2 files changed, 24 insertions(+), 21 deletions(-)

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index def5c62c93b3..272ab0c2b60b 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -207,7 +207,8 @@ struct page_vma_mapped_walk {
        unsigned long address;
        pmd_t *pmd;
        pte_t *pte;
-       spinlock_t *ptl;
+       spinlock_t *pte_ptl;
+       spinlock_t *pmd_ptl;
        unsigned int flags;
 };
 
@@ -216,8 +217,10 @@ static inline void page_vma_mapped_walk_done(struct page_vma_mapped_walk *pvmw)
        /* HugeTLB pte is set to the relevant page table entry without pte_mapped. */
        if (pvmw->pte && !PageHuge(pvmw->page))
                pte_unmap(pvmw->pte);
-       if (pvmw->ptl)
-               spin_unlock(pvmw->ptl);
+       if (pvmw->pte_ptl)
+               spin_unlock(pvmw->pte_ptl);
+       if (pvmw->pmd_ptl)
+               spin_unlock(pvmw->pmd_ptl);
 }
 
 bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw);
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index 2cf01d933f13..87a2c94c7e27 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -47,8 +47,10 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw)
                                return false;
                }
        }
-       pvmw->ptl = pte_lockptr(pvmw->vma->vm_mm, pvmw->pmd);
-       spin_lock(pvmw->ptl);
+       if (USE_SPLIT_PTE_PTLOCKS) {
+               pvmw->pte_ptl = pte_lockptr(pvmw->vma->vm_mm, pvmw->pmd);
+               spin_lock(pvmw->pte_ptl);
+       }
        return true;
 }
 
@@ -162,8 +164,8 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
                if (!pvmw->pte)
                        return false;
 
-               pvmw->ptl = huge_pte_lockptr(page_hstate(page), mm, pvmw->pte);
-               spin_lock(pvmw->ptl);
+               pvmw->pte_ptl = huge_pte_lockptr(page_hstate(page), mm, pvmw->pte);
+               spin_lock(pvmw->pte_ptl);
                if (!check_pte(pvmw))
                        return not_found(pvmw);
                return true;
@@ -179,6 +181,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
        if (!pud_present(*pud))
                return false;
        pvmw->pmd = pmd_offset(pud, pvmw->address);
+       pvmw->pmd_ptl = pmd_lock(mm, pvmw->pmd);
        /*
         * Make sure the pmd value isn't cached in a register by the
         * compiler and used as a stale value after we've observed a
@@ -186,7 +189,6 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
         */
        pmde = READ_ONCE(*pvmw->pmd);
        if (pmd_trans_huge(pmde) || is_pmd_migration_entry(pmde)) {
-               pvmw->ptl = pmd_lock(mm, pvmw->pmd);
                if (likely(pmd_trans_huge(*pvmw->pmd))) {
                        if (pvmw->flags & PVMW_MIGRATION)
                                return not_found(pvmw);
@@ -206,14 +208,10 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
                                }
                        }
                        return not_found(pvmw);
-               } else {
-                       /* THP pmd was split under us: handle on pte level */
-                       spin_unlock(pvmw->ptl);
-                       pvmw->ptl = NULL;
                }
-       } else if (!pmd_present(pmde)) {
-               return false;
-       }
+       } else if (!pmd_present(pmde))
+               return not_found(pvmw);
+
        if (!map_pte(pvmw))
                goto next_pte;
        while (1) {
@@ -233,19 +231,21 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
                        /* Did we cross page table boundary? */
                        if (pvmw->address % PMD_SIZE == 0) {
                                pte_unmap(pvmw->pte);
-                               if (pvmw->ptl) {
-                                       spin_unlock(pvmw->ptl);
-                                       pvmw->ptl = NULL;
+                               if (pvmw->pte_ptl) {
+                                       spin_unlock(pvmw->pte_ptl);
+                                       pvmw->pte_ptl = NULL;
                                }
+                               spin_unlock(pvmw->pmd_ptl);
+                               pvmw->pmd_ptl = NULL;
                                goto restart;
                        } else {
                                pvmw->pte++;
                        }
                } while (pte_none(*pvmw->pte));
 
-               if (!pvmw->ptl) {
-                       pvmw->ptl = pte_lockptr(mm, pvmw->pmd);
-                       spin_lock(pvmw->ptl);
+               if (USE_SPLIT_PTE_PTLOCKS && !pvmw->pte_ptl) {
+                       pvmw->pte_ptl = pte_lockptr(mm, pvmw->pmd);
+                       spin_lock(pvmw->pte_ptl);
                }
        }
 }
-- 
2.31.1
