Ever since commit b756a3b5e7ea ("mm: device exclusive memory access")
we can return with a device-exclusive entry from page_vma_mapped_walk().

try_to_migrate_one() is not prepared for that, so teach it about these
non-present non-swap PTEs. We already handle device-private entries by
specializing on the folio, so we can reshuffle that code to make it
work on the non-present non-swap PTEs instead.
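
For reference, here is a condensed sketch of the new pfn/subpage
derivation (taken from the hunks below; locals and surrounding context
are omitted, so it is illustrative rather than compilable on its own):

    pte_t pteval = ptep_get(pvmw.pte);
    unsigned long pfn;

    if (likely(pte_present(pteval))) {
            pfn = pte_pfn(pteval);
    } else {
            /* Device-private and device-exclusive entries encode a pfn. */
            pfn = swp_offset_pfn(pte_to_swp_entry(pteval));
            VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
    }
    subpage = folio_page(folio, pfn - folio_pfn(folio));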

Get rid of most of the folio_is_device_private() handling, except in
the HWPoison case, where it's unclear what the right thing to do is.

Note that we could currently only run into this case with
device-exclusive entries on THPs; but as we have a refcount vs. mapcount
imbalance, folio splitting etc. will just bail out early and not even
try migrating. For order-0 folios, we still adjust the mapcount on
conversion to device-exclusive, making the rmap walk abort early
(folio_mapcount() == 0 and breaking swapout). We'll fix
that next, now that try_to_migrate_one() can handle it.

Further note that try_to_migrate() calls MMU notifiers and holds the
folio lock, so any device-exclusive users should be properly prepared
for this device-exclusive PTE to "vanish".
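
To illustrate, a rough sketch of that notification (not part of this
diff; the exact event type and range setup in the real function may
differ, and locals are omitted):

    struct mmu_notifier_range range;

    mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
                            address, address + folio_size(folio));
    mmu_notifier_invalidate_range_start(&range);

    /* ... walk PTEs, replace (device-exclusive) entries by migration entries ... */

    mmu_notifier_invalidate_range_end(&range);

So a device-exclusive user relying on MMU notifiers is told about the
invalidation before its entry gets replaced.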

Fixes: b756a3b5e7ea ("mm: device exclusive memory access")
Signed-off-by: David Hildenbrand <da...@redhat.com>
---
 mm/rmap.c | 125 ++++++++++++++++++++++--------------------------------
 1 file changed, 51 insertions(+), 74 deletions(-)

diff --git a/mm/rmap.c b/mm/rmap.c
index 12900f367a2a..903a78e60781 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -2040,9 +2040,9 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
 {
        struct mm_struct *mm = vma->vm_mm;
        DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
+       bool anon_exclusive, writable, ret = true;
        pte_t pteval;
        struct page *subpage;
-       bool anon_exclusive, ret = true;
        struct mmu_notifier_range range;
        enum ttu_flags flags = (enum ttu_flags)(long)arg;
        unsigned long pfn;
@@ -2109,24 +2109,20 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
                /* Unexpected PMD-mapped THP? */
                VM_BUG_ON_FOLIO(!pvmw.pte, folio);
 
-               pfn = pte_pfn(ptep_get(pvmw.pte));
-
-               if (folio_is_zone_device(folio)) {
-                       /*
-                        * Our PTE is a non-present device exclusive entry and
-                        * calculating the subpage as for the common case would
-                        * result in an invalid pointer.
-                        *
-                        * Since only PAGE_SIZE pages can currently be
-                        * migrated, just set it to page. This will need to be
-                        * changed when hugepage migrations to device private
-                        * memory are supported.
-                        */
-                       VM_BUG_ON_FOLIO(folio_nr_pages(folio) > 1, folio);
-                       subpage = &folio->page;
+               /*
+                * We can end up here with selected non-swap entries that
+                * actually map pages similar to PROT_NONE; see
+                * page_vma_mapped_walk()->check_pte().
+                */
+               pteval = ptep_get(pvmw.pte);
+               if (likely(pte_present(pteval))) {
+                       pfn = pte_pfn(pteval);
                } else {
-                       subpage = folio_page(folio, pfn - folio_pfn(folio));
+                       pfn = swp_offset_pfn(pte_to_swp_entry(pteval));
+                       VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
                }
+
+               subpage = folio_page(folio, pfn - folio_pfn(folio));
                address = pvmw.address;
                anon_exclusive = folio_test_anon(folio) &&
                                 PageAnonExclusive(subpage);
@@ -2182,7 +2178,10 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
                        }
                        /* Nuke the hugetlb page table entry */
                        pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
-               } else {
+                       if (pte_dirty(pteval))
+                               folio_mark_dirty(folio);
+                       writable = pte_write(pteval);
+               } else if (likely(pte_present(pteval))) {
                        flush_cache_page(vma, address, pfn);
                        /* Nuke the page table entry. */
                        if (should_defer_flush(mm, flags)) {
@@ -2200,54 +2199,21 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
                        } else {
                                pteval = ptep_clear_flush(vma, address, pvmw.pte);
                        }
+                       if (pte_dirty(pteval))
+                               folio_mark_dirty(folio);
+                       writable = pte_write(pteval);
+               } else {
+                       pte_clear(mm, address, pvmw.pte);
+                       writable = is_writable_device_private_entry(pte_to_swp_entry(pteval));
                }
 
-               /* Set the dirty flag on the folio now the pte is gone. */
-               if (pte_dirty(pteval))
-                       folio_mark_dirty(folio);
+               VM_WARN_ON_FOLIO(writable && folio_test_anon(folio) &&
+                               !anon_exclusive, folio);
 
                /* Update high watermark before we lower rss */
                update_hiwater_rss(mm);
 
-               if (folio_is_device_private(folio)) {
-                       unsigned long pfn = folio_pfn(folio);
-                       swp_entry_t entry;
-                       pte_t swp_pte;
-
-                       if (anon_exclusive)
-                               WARN_ON_ONCE(folio_try_share_anon_rmap_pte(folio,
-                                                                          subpage));
-
-                       /*
-                        * Store the pfn of the page in a special migration
-                        * pte. do_swap_page() will wait until the migration
-                        * pte is removed and then restart fault handling.
-                        */
-                       entry = pte_to_swp_entry(pteval);
-                       if (is_writable_device_private_entry(entry))
-                               entry = make_writable_migration_entry(pfn);
-                       else if (anon_exclusive)
-                               entry = make_readable_exclusive_migration_entry(pfn);
-                       else
-                               entry = make_readable_migration_entry(pfn);
-                       swp_pte = swp_entry_to_pte(entry);
-
-                       /*
-                        * pteval maps a zone device page and is therefore
-                        * a swap pte.
-                        */
-                       if (pte_swp_soft_dirty(pteval))
-                               swp_pte = pte_swp_mksoft_dirty(swp_pte);
-                       if (pte_swp_uffd_wp(pteval))
-                               swp_pte = pte_swp_mkuffd_wp(swp_pte);
-                       set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
-                       trace_set_migration_pte(pvmw.address, pte_val(swp_pte),
-                                               folio_order(folio));
-                       /*
-                        * No need to invalidate here it will synchronize on
-                        * against the special swap migration pte.
-                        */
-               } else if (PageHWPoison(subpage)) {
+               if (PageHWPoison(subpage) && !folio_is_device_private(folio)) {
                        pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
                        if (folio_test_hugetlb(folio)) {
                                hugetlb_count_sub(folio_nr_pages(folio), mm);
@@ -2257,8 +2223,8 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
                                dec_mm_counter(mm, mm_counter(folio));
                                set_pte_at(mm, address, pvmw.pte, pteval);
                        }
-
-               } else if (pte_unused(pteval) && !userfaultfd_armed(vma)) {
+               } else if (likely(pte_present(pteval)) && pte_unused(pteval) &&
+                          !userfaultfd_armed(vma)) {
                        /*
                         * The guest indicated that the page content is of no
                         * interest anymore. Simply discard the pte, vmscan
@@ -2274,6 +2240,11 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
                        swp_entry_t entry;
                        pte_t swp_pte;
 
+                       /*
+                        * arch_unmap_one() is expected to be a NOP on
+                        * architectures where we could have non-swp entries
+                        * here.
+                        */
                        if (arch_unmap_one(mm, vma, address, pteval) < 0) {
                                if (folio_test_hugetlb(folio))
                                        set_huge_pte_at(mm, address, pvmw.pte,
@@ -2284,8 +2255,6 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
                                page_vma_mapped_walk_done(&pvmw);
                                break;
                        }
-                       VM_BUG_ON_PAGE(pte_write(pteval) && folio_test_anon(folio) &&
-                                      !anon_exclusive, subpage);
 
                        /* See folio_try_share_anon_rmap_pte(): clear PTE first. */
                        if (folio_test_hugetlb(folio)) {
@@ -2310,7 +2279,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
                         * pte. do_swap_page() will wait until the migration
                         * pte is removed and then restart fault handling.
                         */
-                       if (pte_write(pteval))
+                       if (writable)
                                entry = make_writable_migration_entry(
                                                        page_to_pfn(subpage));
                        else if (anon_exclusive)
@@ -2319,15 +2288,23 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
                        else
                                entry = make_readable_migration_entry(
                                                        page_to_pfn(subpage));
-                       if (pte_young(pteval))
-                               entry = make_migration_entry_young(entry);
-                       if (pte_dirty(pteval))
-                               entry = make_migration_entry_dirty(entry);
-                       swp_pte = swp_entry_to_pte(entry);
-                       if (pte_soft_dirty(pteval))
-                               swp_pte = pte_swp_mksoft_dirty(swp_pte);
-                       if (pte_uffd_wp(pteval))
-                               swp_pte = pte_swp_mkuffd_wp(swp_pte);
+                       if (likely(pte_present(pteval))) {
+                               if (pte_young(pteval))
+                                       entry = make_migration_entry_young(entry);
+                               if (pte_dirty(pteval))
+                                       entry = make_migration_entry_dirty(entry);
+                               swp_pte = swp_entry_to_pte(entry);
+                               if (pte_soft_dirty(pteval))
+                                       swp_pte = pte_swp_mksoft_dirty(swp_pte);
+                               if (pte_uffd_wp(pteval))
+                                       swp_pte = pte_swp_mkuffd_wp(swp_pte);
+                       } else {
+                               swp_pte = swp_entry_to_pte(entry);
+                               if (pte_swp_soft_dirty(pteval))
+                                       swp_pte = pte_swp_mksoft_dirty(swp_pte);
+                               if (pte_swp_uffd_wp(pteval))
+                                       swp_pte = pte_swp_mkuffd_wp(swp_pte);
+                       }
                        if (folio_test_hugetlb(folio))
                                set_huge_pte_at(mm, address, pvmw.pte, swp_pte,
                                                hsz);
-- 
2.48.1
