We require a writable PTE and only support anonymous folios: there can
be exactly one PTE pointing at that page, which we can simply look up
using a folio walk, avoiding the rmap walk and the anon VMA lock.

So let's stop doing an rmap walk and perform a folio walk instead, which
lets us modify a single PTE directly and avoid relying on rmap/mapcounts.

We now effectively work on a single PTE instead of multiple PTEs of
a large folio, allowing for conversion of individual PTEs from
non-exclusive to device-exclusive -- note that the opposite direction
always works on single PTEs: restore_exclusive_pte().
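
To illustrate, here is a condensed sketch of the folio-walk based lookup
this patch introduces (simplified from the diff below; MMU notifier
setup, folio locking and the actual PTE conversion are omitted):

	struct folio_walk fw;
	struct folio *fw_folio;

	/*
	 * Walk the page tables to locate the single PTE mapping this
	 * page, and verify it is still mapped writable at the expected
	 * address.
	 */
	fw_folio = folio_walk_start(&fw, vma, addr, 0);
	if (fw_folio != folio || fw.page != page ||
	    fw.level != FW_LEVEL_PTE || !pte_write(fw.pte)) {
		if (fw_folio)
			folio_walk_end(&fw, vma);
		return ERR_PTR(-EBUSY);
	}
	/* ... replace *fw.ptep with a device-exclusive entry ... */
	folio_walk_end(&fw, vma);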

With this change, device-exclusive handling is fully compatible with THPs /
large folios. We still require PMD-sized THPs to get PTE-mapped, and
supporting PMD-mapped THP (without the PTE-remapping) is a different
endeavour that might not be worth it at this point: it might even have
negative side-effects [1].

This gets rid of the "folio_mapcount()" usage and lets us fix ordinary
rmap walks (migration/swapout) next. Spell out that messing with the
mapcount is wrong and must be fixed.

[1] https://lkml.kernel.org/r/Z5tI-cOSyzdLjoe_@phenom.ffwll.local
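
For context, a hypothetical driver-side caller after this change might
look as follows; driver_make_addr_exclusive() and the device-programming
step are illustrative only, not part of this patch:

	static int driver_make_addr_exclusive(struct mm_struct *mm,
			unsigned long addr, void *owner)
	{
		struct folio *folio;
		struct page *page;

		mmap_read_lock(mm);
		page = make_device_exclusive(mm, addr, owner, &folio);
		mmap_read_unlock(mm);
		if (IS_ERR(page))
			return PTR_ERR(page);

		/*
		 * Program the device mapping here. The driver's MMU
		 * notifier must filter out MMU_NOTIFY_EXCLUSIVE events
		 * with a matching owner to avoid livelocking against
		 * itself.
		 */

		/* On success, the folio is returned locked + referenced. */
		folio_unlock(folio);
		folio_put(folio);
		return 0;
	}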

Signed-off-by: David Hildenbrand <da...@redhat.com>
---
 mm/rmap.c | 200 ++++++++++++++++++------------------------------------
 1 file changed, 67 insertions(+), 133 deletions(-)

diff --git a/mm/rmap.c b/mm/rmap.c
index 7ccf850565d33..0cd2a2d3de00d 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -2375,131 +2375,6 @@ void try_to_migrate(struct folio *folio, enum ttu_flags flags)
 }
 
 #ifdef CONFIG_DEVICE_PRIVATE
-struct make_exclusive_args {
-       struct mm_struct *mm;
-       unsigned long address;
-       void *owner;
-       bool valid;
-};
-
-static bool page_make_device_exclusive_one(struct folio *folio,
-               struct vm_area_struct *vma, unsigned long address, void *priv)
-{
-       struct mm_struct *mm = vma->vm_mm;
-       DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
-       struct make_exclusive_args *args = priv;
-       pte_t pteval;
-       struct page *subpage;
-       bool ret = true;
-       struct mmu_notifier_range range;
-       swp_entry_t entry;
-       pte_t swp_pte;
-       pte_t ptent;
-
-       mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0,
-                                     vma->vm_mm, address, min(vma->vm_end,
-                                     address + folio_size(folio)),
-                                     args->owner);
-       mmu_notifier_invalidate_range_start(&range);
-
-       while (page_vma_mapped_walk(&pvmw)) {
-               /* Unexpected PMD-mapped THP? */
-               VM_BUG_ON_FOLIO(!pvmw.pte, folio);
-
-               ptent = ptep_get(pvmw.pte);
-               if (!pte_present(ptent)) {
-                       ret = false;
-                       page_vma_mapped_walk_done(&pvmw);
-                       break;
-               }
-
-               subpage = folio_page(folio,
-                               pte_pfn(ptent) - folio_pfn(folio));
-               address = pvmw.address;
-
-               /* Nuke the page table entry. */
-               flush_cache_page(vma, address, pte_pfn(ptent));
-               pteval = ptep_clear_flush(vma, address, pvmw.pte);
-
-               /* Set the dirty flag on the folio now the pte is gone. */
-               if (pte_dirty(pteval))
-                       folio_mark_dirty(folio);
-
-               /*
-                * Check that our target page is still mapped at the expected
-                * address.
-                */
-               if (args->mm == mm && args->address == address &&
-                   pte_write(pteval))
-                       args->valid = true;
-
-               /*
-                * Store the pfn of the page in a special migration
-                * pte. do_swap_page() will wait until the migration
-                * pte is removed and then restart fault handling.
-                */
-               if (pte_write(pteval))
-                       entry = make_writable_device_exclusive_entry(
-                                                       page_to_pfn(subpage));
-               else
-                       entry = make_readable_device_exclusive_entry(
-                                                       page_to_pfn(subpage));
-               swp_pte = swp_entry_to_pte(entry);
-               if (pte_soft_dirty(pteval))
-                       swp_pte = pte_swp_mksoft_dirty(swp_pte);
-               if (pte_uffd_wp(pteval))
-                       swp_pte = pte_swp_mkuffd_wp(swp_pte);
-
-               set_pte_at(mm, address, pvmw.pte, swp_pte);
-
-               /*
-                * There is a reference on the page for the swap entry which has
-                * been removed, so shouldn't take another.
-                */
-               folio_remove_rmap_pte(folio, subpage, vma);
-       }
-
-       mmu_notifier_invalidate_range_end(&range);
-
-       return ret;
-}
-
-/**
- * folio_make_device_exclusive - Mark the folio exclusively owned by a device.
- * @folio: The folio to replace page table entries for.
- * @mm: The mm_struct where the folio is expected to be mapped.
- * @address: Address where the folio is expected to be mapped.
- * @owner: passed to MMU_NOTIFY_EXCLUSIVE range notifier callbacks
- *
- * Tries to remove all the page table entries which are mapping this
- * folio and replace them with special device exclusive swap entries to
- * grant a device exclusive access to the folio.
- *
- * Context: Caller must hold the folio lock.
- * Return: false if the page is still mapped, or if it could not be unmapped
- * from the expected address. Otherwise returns true (success).
- */
-static bool folio_make_device_exclusive(struct folio *folio,
-               struct mm_struct *mm, unsigned long address, void *owner)
-{
-       struct make_exclusive_args args = {
-               .mm = mm,
-               .address = address,
-               .owner = owner,
-               .valid = false,
-       };
-       struct rmap_walk_control rwc = {
-               .rmap_one = page_make_device_exclusive_one,
-               .done = folio_not_mapped,
-               .anon_lock = folio_lock_anon_vma_read,
-               .arg = &args,
-       };
-
-       rmap_walk(folio, &rwc);
-
-       return args.valid && !folio_mapcount(folio);
-}
-
 /**
  * make_device_exclusive() - Mark a page for exclusive use by a device
  * @mm: mm_struct of associated target process
@@ -2541,22 +2416,31 @@ static bool folio_make_device_exclusive(struct folio *folio,
 struct page *make_device_exclusive(struct mm_struct *mm, unsigned long addr,
                void *owner, struct folio **foliop)
 {
-       struct folio *folio;
+       struct mmu_notifier_range range;
+       struct folio *folio, *fw_folio;
+       struct vm_area_struct *vma;
+       struct folio_walk fw;
        struct page *page;
-       long npages;
+       swp_entry_t entry;
+       pte_t swp_pte;
 
        mmap_assert_locked(mm);
+       addr = PAGE_ALIGN_DOWN(addr);
 
        /*
         * Fault in the page writable and try to lock it; note that if the
         * address would already be marked for exclusive use by a device,
         * the GUP call would undo that first by triggering a fault.
+        *
+        * If any other device would already map this page exclusively, the
+        * fault will trigger a conversion to an ordinary
+        * (non-device-exclusive) PTE and issue a MMU_NOTIFY_EXCLUSIVE.
         */
-       npages = get_user_pages_remote(mm, addr, 1,
-                                      FOLL_GET | FOLL_WRITE | FOLL_SPLIT_PMD,
-                                      &page, NULL);
-       if (npages != 1)
-               return ERR_PTR(npages);
+       page = get_user_page_vma_remote(mm, addr,
+                                       FOLL_GET | FOLL_WRITE | FOLL_SPLIT_PMD,
+                                       &vma);
+       if (IS_ERR(page))
+               return page;
        folio = page_folio(page);
 
        if (!folio_test_anon(folio) || folio_test_hugetlb(folio)) {
@@ -2569,11 +2453,61 @@ struct page *make_device_exclusive(struct mm_struct *mm, unsigned long addr,
                return ERR_PTR(-EBUSY);
        }
 
-       if (!folio_make_device_exclusive(folio, mm, addr, owner)) {
+       /*
+        * Inform secondary MMUs that we are going to convert this PTE to
+        * device-exclusive, such that they unmap it now. Note that the
+        * caller must filter this event out to prevent livelocks.
+        */
+       mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0,
+                                     mm, addr, addr + PAGE_SIZE, owner);
+       mmu_notifier_invalidate_range_start(&range);
+
+       /*
+        * Let's do a second walk and make sure we still find the same page
+        * mapped writable. Note that any page of an anonymous folio can
+        * only be mapped writable using exactly one PTE ("exclusive"), so
+        * there cannot be other mappings.
+        */
+       fw_folio = folio_walk_start(&fw, vma, addr, 0);
+       if (fw_folio != folio || fw.page != page ||
+           fw.level != FW_LEVEL_PTE || !pte_write(fw.pte)) {
+               if (fw_folio)
+                       folio_walk_end(&fw, vma);
+               mmu_notifier_invalidate_range_end(&range);
                folio_unlock(folio);
                folio_put(folio);
                return ERR_PTR(-EBUSY);
        }
+
+       /* Nuke the page table entry so we get the uptodate dirty bit. */
+       flush_cache_page(vma, addr, page_to_pfn(page));
+       fw.pte = ptep_clear_flush(vma, addr, fw.ptep);
+
+       /* Set the dirty flag on the folio now the PTE is gone. */
+       if (pte_dirty(fw.pte))
+               folio_mark_dirty(folio);
+
+       /*
+        * Store the pfn of the page in a special device-exclusive PFN swap PTE.
+        * do_swap_page() will trigger the conversion back while holding the
+        * folio lock.
+        */
+       entry = make_writable_device_exclusive_entry(page_to_pfn(page));
+       swp_pte = swp_entry_to_pte(entry);
+       if (pte_soft_dirty(fw.pte))
+               swp_pte = pte_swp_mksoft_dirty(swp_pte);
+       /* The pte is writable, uffd-wp does not apply. */
+       set_pte_at(mm, addr, fw.ptep, swp_pte);
+
+       /*
+        * TODO: The device-exclusive PFN swap PTE holds a folio reference but
+        * does not count as a mapping (mapcount), which is wrong and must be
+        * fixed, otherwise RMAP walks don't behave as expected.
+        */
+       folio_remove_rmap_pte(folio, page, vma);
+
+       folio_walk_end(&fw, vma);
+       mmu_notifier_invalidate_range_end(&range);
        *foliop = folio;
        return page;
 }
-- 
2.48.1
