Wire the fault side of read-write protection tracking and turn the
userspace interface on.

An RWP-protected PTE is PAGE_NONE with the uffd bit set. PROT_NONE
triggers a fault on any access; the uffd bit distinguishes it from
plain mprotect(PROT_NONE) or NUMA hinting.

Fault dispatch, per level:

  PTE     handle_pte_fault()    -> do_uffd_rwp()
  PMD     __handle_mm_fault()   -> do_huge_pmd_uffd_rwp()
  hugetlb hugetlb_fault()       -> hugetlb_handle_userfault()

The RWP branches gate on userfaultfd_pte_rwp() / userfaultfd_huge_pmd_rwp()
(VM_UFFD_RWP plus the uffd bit) and fall through to do_numa_page() /
do_huge_pmd_numa_page() otherwise. Each delivers a
UFFD_PAGEFAULT_FLAG_RWP message through handle_userfault(); the handler
resolves it with UFFDIO_RWPROTECT clearing MODE_RWP.

userfaultfd_must_wait() and userfaultfd_huge_must_wait() add matching
protnone+uffd waiters so sync-mode fault handlers block correctly.

Expose the UAPI:

  UFFDIO_REGISTER_MODE_RWP   -> UFFD_API_REGISTER_MODES
  UFFD_FEATURE_RWP           -> UFFD_API_FEATURES
  _UFFDIO_RWPROTECT          -> UFFD_API_RANGE_IOCTLS
                                UFFD_API_RANGE_IOCTLS_BASIC

UFFD_FEATURE_RWP is masked out at UFFDIO_API time when VM_UFFD_RWP
aliases VM_NONE (compile-time unavailability, e.g. 32-bit) or when
pgtable_supports_uffd() reports the uffd PTE bit is unusable at
runtime, so userspace never sees an advertised-but-broken feature.

Works on anonymous, shmem, and hugetlb memory.

Signed-off-by: Kiryl Shutsemau <[email protected]>
Assisted-by: Claude:claude-opus-4-6
---
 fs/userfaultfd.c                 | 33 ++++++++++++++++++++++++++++++--
 include/linux/huge_mm.h          |  7 +++++++
 include/linux/userfaultfd_k.h    | 24 +++++++++++++++++++++++
 include/uapi/linux/userfaultfd.h | 12 ++++++++----
 mm/huge_memory.c                 |  5 +++++
 mm/hugetlb.c                     | 11 +++++++++++
 mm/memory.c                      | 21 ++++++++++++++++++--
 7 files changed, 105 insertions(+), 8 deletions(-)

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index f2097c558165..6e577c4ac4dd 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -261,6 +261,12 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
         */
        if (!huge_pte_write(pte) && (reason & VM_UFFD_WP))
                return true;
+       /*
+        * PTE is still RW-protected (protnone with uffd bit), wait for
+        * resolution. Plain PROT_NONE without the marker is not an RWP fault.
+        */
+       if (pte_protnone(pte) && huge_pte_uffd(pte) && (reason & VM_UFFD_RWP))
+               return true;
 
        return false;
 }
@@ -321,8 +327,14 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
        if (!pmd_present(_pmd))
                return false;
 
-       if (pmd_trans_huge(_pmd))
-               return !pmd_write(_pmd) && (reason & VM_UFFD_WP);
+       if (pmd_trans_huge(_pmd)) {
+               if (!pmd_write(_pmd) && (reason & VM_UFFD_WP))
+                       return true;
+               if (pmd_protnone(_pmd) && pmd_uffd(_pmd) &&
+                   (reason & VM_UFFD_RWP))
+                       return true;
+               return false;
+       }
 
        pte = pte_offset_map(pmd, address);
        if (!pte)
@@ -347,6 +359,14 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
         */
        if (!pte_write(ptent) && (reason & VM_UFFD_WP))
                goto out;
+       /*
+        * PTE is still RW-protected (protnone with uffd bit), wait for
+        * userspace to resolve. Plain PROT_NONE without the marker is not
+        * an RWP fault.
+        */
+       if (pte_protnone(ptent) && pte_uffd(ptent) &&
+           (reason & VM_UFFD_RWP))
+               goto out;
 
        ret = false;
 out:
@@ -2086,6 +2106,15 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
                uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED;
                uffdio_api.features &= ~UFFD_FEATURE_WP_ASYNC;
        }
+       /*
+        * RWP needs both PROT_NONE support and the uffd-wp PTE bit. The
+        * VM_UFFD_RWP check covers compile-time unavailability; the
+        * pgtable_supports_uffd() check covers runtime (e.g. riscv
+        * without the SVRSW60T59B extension) where the PTE bit is declared
+        * but not actually usable.
+        */
+       if (VM_UFFD_RWP == VM_NONE || !pgtable_supports_uffd())
+               uffdio_api.features &= ~UFFD_FEATURE_RWP;
 
        ret = -EINVAL;
        if (features & ~uffdio_api.features)
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 2949e5acff35..e980909ee49e 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -520,6 +520,8 @@ static inline bool folio_test_pmd_mappable(struct folio *folio)
 
 vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf);
 
+vm_fault_t do_huge_pmd_uffd_rwp(struct vm_fault *vmf);
+
 vm_fault_t do_huge_pmd_device_private(struct vm_fault *vmf);
 
 extern struct folio *huge_zero_folio;
@@ -702,6 +704,11 @@ static inline spinlock_t *pud_trans_huge_lock(pud_t *pud,
        return NULL;
 }
 
+static inline vm_fault_t do_huge_pmd_uffd_rwp(struct vm_fault *vmf)
+{
+       return 0;
+}
+
 static inline vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
 {
        return 0;
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index 3dfcdc3a9b98..37e8d0d29353 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -247,6 +247,18 @@ static inline bool userfaultfd_huge_pmd_wp(struct vm_area_struct *vma,
        return userfaultfd_wp(vma) && pmd_uffd(pmd);
 }
 
+static inline bool userfaultfd_pte_rwp(struct vm_area_struct *vma,
+                                      pte_t pte)
+{
+       return userfaultfd_rwp(vma) && pte_uffd(pte);
+}
+
+static inline bool userfaultfd_huge_pmd_rwp(struct vm_area_struct *vma,
+                                           pmd_t pmd)
+{
+       return userfaultfd_rwp(vma) && pmd_uffd(pmd);
+}
+
 static inline bool userfaultfd_armed(struct vm_area_struct *vma)
 {
        return vma->vm_flags & __VM_UFFD_FLAGS;
@@ -399,6 +411,18 @@ static inline bool userfaultfd_huge_pmd_wp(struct vm_area_struct *vma,
        return false;
 }
 
+static inline bool userfaultfd_pte_rwp(struct vm_area_struct *vma,
+                                      pte_t pte)
+{
+       return false;
+}
+
+static inline bool userfaultfd_huge_pmd_rwp(struct vm_area_struct *vma,
+                                           pmd_t pmd)
+{
+       return false;
+}
+
 static inline bool userfaultfd_armed(struct vm_area_struct *vma)
 {
        return false;
diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
index 7b78aa3b5318..d803e76d47ad 100644
--- a/include/uapi/linux/userfaultfd.h
+++ b/include/uapi/linux/userfaultfd.h
@@ -25,7 +25,8 @@
 #define UFFD_API ((__u64)0xAA)
 #define UFFD_API_REGISTER_MODES (UFFDIO_REGISTER_MODE_MISSING |        \
                                 UFFDIO_REGISTER_MODE_WP |      \
-                                UFFDIO_REGISTER_MODE_MINOR)
+                                UFFDIO_REGISTER_MODE_MINOR |   \
+                                UFFDIO_REGISTER_MODE_RWP)
 #define UFFD_API_FEATURES (UFFD_FEATURE_PAGEFAULT_FLAG_WP |    \
                           UFFD_FEATURE_EVENT_FORK |            \
                           UFFD_FEATURE_EVENT_REMAP |           \
@@ -42,7 +43,8 @@
                           UFFD_FEATURE_WP_UNPOPULATED |        \
                           UFFD_FEATURE_POISON |                \
                           UFFD_FEATURE_WP_ASYNC |              \
-                          UFFD_FEATURE_MOVE)
+                          UFFD_FEATURE_MOVE |                  \
+                          UFFD_FEATURE_RWP)
 #define UFFD_API_IOCTLS                                \
        ((__u64)1 << _UFFDIO_REGISTER |         \
         (__u64)1 << _UFFDIO_UNREGISTER |       \
@@ -54,13 +56,15 @@
         (__u64)1 << _UFFDIO_MOVE |             \
         (__u64)1 << _UFFDIO_WRITEPROTECT |     \
         (__u64)1 << _UFFDIO_CONTINUE |         \
-        (__u64)1 << _UFFDIO_POISON)
+        (__u64)1 << _UFFDIO_POISON |           \
+        (__u64)1 << _UFFDIO_RWPROTECT)
 #define UFFD_API_RANGE_IOCTLS_BASIC            \
        ((__u64)1 << _UFFDIO_WAKE |             \
         (__u64)1 << _UFFDIO_COPY |             \
         (__u64)1 << _UFFDIO_WRITEPROTECT |     \
         (__u64)1 << _UFFDIO_CONTINUE |         \
-        (__u64)1 << _UFFDIO_POISON)
+        (__u64)1 << _UFFDIO_POISON |           \
+        (__u64)1 << _UFFDIO_RWPROTECT)
 
 /*
  * Valid ioctl command number range with this API is from 0x00 to
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 2aecb6d01c44..631e0355919f 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2264,6 +2264,11 @@ static inline bool can_change_pmd_writable(struct vm_area_struct *vma,
        return pmd_dirty(pmd);
 }
 
+vm_fault_t do_huge_pmd_uffd_rwp(struct vm_fault *vmf)
+{
+       return handle_userfault(vmf, VM_UFFD_RWP);
+}
+
 /* NUMA hinting page fault entry point for trans huge pmds */
 vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
 {
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index f2f1eb6cf66e..bac9aa852f6b 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -6067,6 +6067,17 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                goto out_mutex;
        }
 
+       /*
+        * Protnone hugetlb PTEs with the uffd bit are used by
+        * userfaultfd RWP for access tracking. Plain PROT_NONE (without the
+        * marker) is not an RWP fault and is not expected on hugetlb (no
+        * NUMA hinting), so let normal hugetlb fault handling proceed.
+        */
+       if (pte_protnone(vmf.orig_pte) && vma_is_accessible(vma) &&
+           userfaultfd_rwp(vma) && huge_pte_uffd(vmf.orig_pte)) {
+               return hugetlb_handle_userfault(&vmf, mapping, VM_UFFD_RWP);
+       }
+
        /*
         * If we are going to COW/unshare the mapping later, we examine the
         * pending reservations for this page now. This will ensure that any
diff --git a/mm/memory.c b/mm/memory.c
index ea9616e3dbaf..e0dcf2c28d9d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -6172,6 +6172,12 @@ static void numa_rebuild_large_mapping(struct vm_fault *vmf, struct vm_area_stru
        }
 }
 
+static vm_fault_t do_uffd_rwp(struct vm_fault *vmf)
+{
+       pte_unmap(vmf->pte);
+       return handle_userfault(vmf, VM_UFFD_RWP);
+}
+
 static vm_fault_t do_numa_page(struct vm_fault *vmf)
 {
        struct vm_area_struct *vma = vmf->vma;
@@ -6446,8 +6452,16 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
        if (!pte_present(vmf->orig_pte))
                return do_swap_page(vmf);
 
-       if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma))
+       if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma)) {
+               /*
+                * RWP-protected PTEs are protnone plus the uffd bit. On a
+                * VM_UFFD_RWP VMA, a protnone PTE without the uffd bit is
+                * NUMA hinting and must still fall through to do_numa_page().
+                */
+               if (userfaultfd_pte_rwp(vmf->vma, vmf->orig_pte))
+                       return do_uffd_rwp(vmf);
                return do_numa_page(vmf);
+       }
 
        spin_lock(vmf->ptl);
        entry = vmf->orig_pte;
@@ -6561,8 +6575,11 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
                return 0;
        }
        if (pmd_trans_huge(vmf.orig_pmd)) {
-               if (pmd_protnone(vmf.orig_pmd) && vma_is_accessible(vma))
+               if (pmd_protnone(vmf.orig_pmd) && vma_is_accessible(vma)) {
+                       if (userfaultfd_huge_pmd_rwp(vma, vmf.orig_pmd))
+                               return do_huge_pmd_uffd_rwp(&vmf);
                        return do_huge_pmd_numa_page(&vmf);
+               }
 
                if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) &&
                    !pmd_write(vmf.orig_pmd)) {
-- 
2.51.2


Reply via email to