Ever since commit b756a3b5e7ea ("mm: device exclusive memory access")
we can return with a device-exclusive entry from page_vma_mapped_walk().

try_to_unmap_one() is not prepared for that, so teach it about these
PFN swap PTEs. Note that device-private entries are not applicable on
that path so far: when it comes to the RMAP, we currently expect
ZONE_DEVICE pages only in the migration code.
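
To make that concrete, here is a minimal sketch of the PTE handling
that the diff below open-codes in try_to_unmap_one(). The helper name
pvmw_pte_to_pfn() is made up for illustration and is not part of the
patch; the calls themselves are the existing pgtable/swapops helpers:

#include <linux/pgtable.h>
#include <linux/swapops.h>

/*
 * Hypothetical helper, mirroring what the hunk below open-codes:
 * page_vma_mapped_walk() may hand us a non-present PFN swap PTE
 * (e.g., device-exclusive) that still maps a page.
 */
static unsigned long pvmw_pte_to_pfn(pte_t pteval)
{
        if (likely(pte_present(pteval)))
                return pte_pfn(pteval);
        /* Non-present, but still mapping a page: decode the swap entry. */
        return swp_offset_pfn(pte_to_swp_entry(pteval));
}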

Note that we can currently run into this case only with
device-exclusive entries on THPs. We still adjust the mapcount on
conversion to device-exclusive; this makes the rmap walk
abort early for small folios, because we'll always have
!folio_mapped() with a single device-exclusive entry. We'll adjust the
mapcount logic once all page_vma_mapped_walk() users can properly
handle device-exclusive entries.
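
As an illustration of that early abort (a sketch only; the real check
is the folio_not_mapped() done-callback used by try_to_unmap()'s rmap
walk, and the helper name here is made up):

#include <linux/mm.h>

/*
 * Sketch of the rmap-walk termination check (cf. folio_not_mapped() in
 * mm/rmap.c): a small folio whose only mapping was converted to a
 * device-exclusive entry currently has a mapcount of 0, so the walk
 * stops right away and this change only matters for PTE-mapped THPs.
 */
static bool unmap_walk_done_early(struct folio *folio)
{
        return !folio_mapped(folio);
}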

Further note that try_to_unmap() calls MMU notifiers and holds the
folio lock, so any device-exclusive users should be properly prepared
for a device-exclusive PTE to "vanish".
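
For completeness, a hedged sketch of the driver-side pattern this
implies (the mmu_interval_notifier calls are the existing API from
include/linux/mmu_notifier.h; the driver function and lock are
hypothetical): the device may only be programmed after revalidating
that the exclusive entry was not zapped concurrently.

#include <linux/mmu_notifier.h>
#include <linux/mutex.h>

/*
 * Hypothetical driver-side sketch: users of device-exclusive entries
 * must revalidate against their interval notifier, because e.g.
 * try_to_unmap() may have zapped the entry in the meantime.
 */
static void driver_map_exclusive(struct mmu_interval_notifier *mni,
                                 struct mutex *driver_lock)
{
        unsigned long seq;

        do {
                seq = mmu_interval_read_begin(mni);
                /* (re)establish the device-exclusive entry here ... */
                mutex_lock(driver_lock);
                if (!mmu_interval_read_retry(mni, seq))
                        break;                  /* still valid, keep the lock */
                mutex_unlock(driver_lock);      /* it "vanished": retry */
        } while (true);

        /* ... program the device while holding driver_lock ... */
        mutex_unlock(driver_lock);
}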

Fixes: b756a3b5e7ea ("mm: device exclusive memory access")
Signed-off-by: David Hildenbrand <da...@redhat.com>
---
 mm/rmap.c | 52 +++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 39 insertions(+), 13 deletions(-)

diff --git a/mm/rmap.c b/mm/rmap.c
index 1129ed132af94..47142a656ae51 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1648,9 +1648,9 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 {
        struct mm_struct *mm = vma->vm_mm;
        DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
+       bool anon_exclusive, ret = true;
        pte_t pteval;
        struct page *subpage;
-       bool anon_exclusive, ret = true;
        struct mmu_notifier_range range;
        enum ttu_flags flags = (enum ttu_flags)(long)arg;
        unsigned long pfn;
@@ -1722,7 +1722,18 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
                /* Unexpected PMD-mapped THP? */
                VM_BUG_ON_FOLIO(!pvmw.pte, folio);
 
-               pfn = pte_pfn(ptep_get(pvmw.pte));
+               /*
+                * Handle PFN swap PTEs, such as device-exclusive ones, that
+                * actually map pages.
+                */
+               pteval = ptep_get(pvmw.pte);
+               if (likely(pte_present(pteval))) {
+                       pfn = pte_pfn(pteval);
+               } else {
+                       pfn = swp_offset_pfn(pte_to_swp_entry(pteval));
+                       VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
+               }
+
                subpage = folio_page(folio, pfn - folio_pfn(folio));
                address = pvmw.address;
                anon_exclusive = folio_test_anon(folio) &&
@@ -1778,7 +1789,9 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
                                hugetlb_vma_unlock_write(vma);
                        }
                        pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
-               } else {
+                       if (pte_dirty(pteval))
+                               folio_mark_dirty(folio);
+               } else if (likely(pte_present(pteval))) {
                        flush_cache_page(vma, address, pfn);
                        /* Nuke the page table entry. */
                        if (should_defer_flush(mm, flags)) {
@@ -1796,6 +1809,10 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
                        } else {
                                pteval = ptep_clear_flush(vma, address, pvmw.pte);
                        }
+                       if (pte_dirty(pteval))
+                               folio_mark_dirty(folio);
+               } else {
+                       pte_clear(mm, address, pvmw.pte);
                }
 
                /*
@@ -1805,10 +1822,6 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
                 */
                pte_install_uffd_wp_if_needed(vma, address, pvmw.pte, pteval);
 
-               /* Set the dirty flag on the folio now the pte is gone. */
-               if (pte_dirty(pteval))
-                       folio_mark_dirty(folio);
-
                /* Update high watermark before we lower rss */
                update_hiwater_rss(mm);
 
@@ -1822,8 +1835,8 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
                                dec_mm_counter(mm, mm_counter(folio));
                                set_pte_at(mm, address, pvmw.pte, pteval);
                        }
-
-               } else if (pte_unused(pteval) && !userfaultfd_armed(vma)) {
+               } else if (likely(pte_present(pteval)) && pte_unused(pteval) &&
+                          !userfaultfd_armed(vma)) {
                        /*
                         * The guest indicated that the page content is of no
                         * interest anymore. Simply discard the pte, vmscan
@@ -1902,6 +1915,12 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
                                set_pte_at(mm, address, pvmw.pte, pteval);
                                goto walk_abort;
                        }
+
+                       /*
+                        * arch_unmap_one() is expected to be a NOP on
+                        * architectures where we could have PFN swap PTEs,
+                        * so we'll not check/care.
+                        */
                        if (arch_unmap_one(mm, vma, address, pteval) < 0) {
                                swap_free(entry);
                                set_pte_at(mm, address, pvmw.pte, pteval);
@@ -1926,10 +1945,17 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
                        swp_pte = swp_entry_to_pte(entry);
                        if (anon_exclusive)
                                swp_pte = pte_swp_mkexclusive(swp_pte);
-                       if (pte_soft_dirty(pteval))
-                               swp_pte = pte_swp_mksoft_dirty(swp_pte);
-                       if (pte_uffd_wp(pteval))
-                               swp_pte = pte_swp_mkuffd_wp(swp_pte);
+                       if (likely(pte_present(pteval))) {
+                               if (pte_soft_dirty(pteval))
+                                       swp_pte = pte_swp_mksoft_dirty(swp_pte);
+                               if (pte_uffd_wp(pteval))
+                                       swp_pte = pte_swp_mkuffd_wp(swp_pte);
+                       } else {
+                               if (pte_swp_soft_dirty(pteval))
+                                       swp_pte = pte_swp_mksoft_dirty(swp_pte);
+                               if (pte_swp_uffd_wp(pteval))
+                                       swp_pte = pte_swp_mkuffd_wp(swp_pte);
+                       }
                        set_pte_at(mm, address, pvmw.pte, swp_pte);
                } else {
                        /*
-- 
2.48.1
