+cc Suren, who has worked HEAVILY on VMA field manipulation and such :)

Suren - David is proposing adding a new field to struct vm_area_struct. AFAICT this does not push the struct onto a new cache line, so I think we're all good. But FYI!
On Fri, Apr 25, 2025 at 10:17:09AM +0200, David Hildenbrand wrote:
> Let's use our new interface. In remap_pfn_range(), we'll now decide
> whether we have to track (full VMA covered) or only sanitize the pgprot
> (partial VMA covered).
> 
> Remember what we have to untrack by linking it from the VMA. When
> duplicating VMAs (e.g., splitting, mremap, fork), we'll handle it similar
> to anon VMA names, and use a kref to share the tracking.
> 
> Once the last VMA un-refs our tracking data, we'll do the untracking,
> which simplifies things a lot and should sort our various issues we saw
> recently, for example, when partially unmapping/zapping a tracked VMA.
> 
> This change implies that we'll keep tracking the original PFN range even
> after splitting + partially unmapping it: not too bad, because it was
> not working reliably before. The only thing that kind-of worked before
> was shrinking such a mapping using mremap(): we managed to adjust the
> reservation in a hacky way, now we won't adjust the reservation but
> leave it around until all involved VMAs are gone.
> 
> Signed-off-by: David Hildenbrand <da...@redhat.com>
> ---
>  include/linux/mm_inline.h |  2 +
>  include/linux/mm_types.h  | 11 ++++++
>  kernel/fork.c             | 54 ++++++++++++++++++++++++--
>  mm/memory.c               | 81 +++++++++++++++++++++++++++++++--------
>  mm/mremap.c               |  4 --
>  5 files changed, 128 insertions(+), 24 deletions(-)
> 
> diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
> index f9157a0c42a5c..89b518ff097e6 100644
> --- a/include/linux/mm_inline.h
> +++ b/include/linux/mm_inline.h
> @@ -447,6 +447,8 @@ static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1,
>  
>  #endif /* CONFIG_ANON_VMA_NAME */
>  
> +void pfnmap_track_ctx_release(struct kref *ref);
> +
>  static inline void init_tlb_flush_pending(struct mm_struct *mm)
>  {
>  	atomic_set(&mm->tlb_flush_pending, 0);
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index 56d07edd01f91..91124761cfda8 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -764,6 +764,14 @@ struct vma_numab_state {
>  	int prev_scan_seq;
>  };
>  
> +#ifdef __HAVE_PFNMAP_TRACKING
> +struct pfnmap_track_ctx {
> +	struct kref kref;
> +	unsigned long pfn;
> +	unsigned long size;
> +};
> +#endif
> +
>  /*
>   * This struct describes a virtual memory area. There is one of these
>   * per VM-area/task. A VM area is any part of the process virtual memory
> @@ -877,6 +885,9 @@ struct vm_area_struct {
>  	struct anon_vma_name *anon_name;
>  #endif
>  	struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
> +#ifdef __HAVE_PFNMAP_TRACKING
> +	struct pfnmap_track_ctx *pfnmap_track_ctx;
> +#endif
>  } __randomize_layout;
>  
>  #ifdef CONFIG_NUMA
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 168681fc4b25a..ae518b8fe752c 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -481,7 +481,51 @@ static void vm_area_init_from(const struct vm_area_struct *src,
>  #ifdef CONFIG_NUMA
>  	dest->vm_policy = src->vm_policy;
>  #endif
> +#ifdef __HAVE_PFNMAP_TRACKING
> +	dest->pfnmap_track_ctx = NULL;
> +#endif
> +}
> +
> +#ifdef __HAVE_PFNMAP_TRACKING
> +static inline int vma_pfnmap_track_ctx_dup(struct vm_area_struct *orig,
> +		struct vm_area_struct *new)
> +{
> +	struct pfnmap_track_ctx *ctx = orig->pfnmap_track_ctx;
> +
> +	if (likely(!ctx))
> +		return 0;
> +
> +	/*
> +	 * We don't expect to ever hit this. If ever required, we would have
> +	 * to duplicate the tracking.
> +	 */
> +	if (unlikely(kref_read(&ctx->kref) >= REFCOUNT_MAX))
> +		return -ENOMEM;
> +	kref_get(&ctx->kref);
> +	new->pfnmap_track_ctx = ctx;
> +	return 0;
> +}
> +
> +static inline void vma_pfnmap_track_ctx_release(struct vm_area_struct *vma)
> +{
> +	struct pfnmap_track_ctx *ctx = vma->pfnmap_track_ctx;
> +
> +	if (likely(!ctx))
> +		return;
> +
> +	kref_put(&ctx->kref, pfnmap_track_ctx_release);
> +	vma->pfnmap_track_ctx = NULL;
> +}
> +#else
> +static inline int vma_pfnmap_track_ctx_dup(struct vm_area_struct *orig,
> +		struct vm_area_struct *new)
> +{
> +	return 0;
>  }
> +static inline void vma_pfnmap_track_ctx_release(struct vm_area_struct *vma)
> +{
> +}
> +#endif
>  
>  struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
>  {
> @@ -493,6 +537,11 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
>  	ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
>  	ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
>  	vm_area_init_from(orig, new);
> +
> +	if (vma_pfnmap_track_ctx_dup(orig, new)) {
> +		kmem_cache_free(vm_area_cachep, new);
> +		return NULL;
> +	}
>  	vma_lock_init(new, true);
>  	INIT_LIST_HEAD(&new->anon_vma_chain);
>  	vma_numab_state_init(new);
> @@ -507,6 +556,7 @@ void vm_area_free(struct vm_area_struct *vma)
>  	vma_assert_detached(vma);
>  	vma_numab_state_free(vma);
>  	free_anon_vma_name(vma);
> +	vma_pfnmap_track_ctx_release(vma);
>  	kmem_cache_free(vm_area_cachep, vma);
>  }
>  
> @@ -669,10 +719,6 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
>  		if (!tmp)
>  			goto fail_nomem;
>  
> -		/* track_pfn_copy() will later take care of copying internal state. */
> -		if (unlikely(tmp->vm_flags & VM_PFNMAP))
> -			untrack_pfn_clear(tmp);
> -
>  		retval = vma_dup_policy(mpnt, tmp);
>  		if (retval)
>  			goto fail_nomem_policy;
> diff --git a/mm/memory.c b/mm/memory.c
> index c737a8625866a..eb2b3f10a97ec 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -1370,7 +1370,7 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
>  	struct mm_struct *dst_mm = dst_vma->vm_mm;
>  	struct mm_struct *src_mm = src_vma->vm_mm;
>  	struct mmu_notifier_range range;
> -	unsigned long next, pfn = 0;
> +	unsigned long next;
>  	bool is_cow;
>  	int ret;
>  
> @@ -1380,12 +1380,6 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
>  	if (is_vm_hugetlb_page(src_vma))
>  		return copy_hugetlb_page_range(dst_mm, src_mm, dst_vma, src_vma);
>  
> -	if (unlikely(src_vma->vm_flags & VM_PFNMAP)) {
> -		ret = track_pfn_copy(dst_vma, src_vma, &pfn);
> -		if (ret)
> -			return ret;
> -	}
> -
>  	/*
>  	 * We need to invalidate the secondary MMU mappings only when
>  	 * there could be a permission downgrade on the ptes of the
> @@ -1427,8 +1421,6 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
>  		raw_write_seqcount_end(&src_mm->write_protect_seq);
>  		mmu_notifier_invalidate_range_end(&range);
>  	}
> -	if (ret && unlikely(src_vma->vm_flags & VM_PFNMAP))
> -		untrack_pfn_copy(dst_vma, pfn);
>  	return ret;
>  }
>  
> @@ -1923,9 +1915,6 @@ static void unmap_single_vma(struct mmu_gather *tlb,
>  	if (vma->vm_file)
>  		uprobe_munmap(vma, start, end);
>  
> -	if (unlikely(vma->vm_flags & VM_PFNMAP))
> -		untrack_pfn(vma, 0, 0, mm_wr_locked);
> -
>  	if (start != end) {
>  		if (unlikely(is_vm_hugetlb_page(vma))) {
>  			/*
> @@ -2871,6 +2860,36 @@ int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
>  	return error;
>  }
>  
> +#ifdef __HAVE_PFNMAP_TRACKING
> +static inline struct pfnmap_track_ctx *pfnmap_track_ctx_alloc(unsigned long pfn,
> +		unsigned long size, pgprot_t *prot)
> +{
> +	struct pfnmap_track_ctx *ctx;
> +
> +	if (pfnmap_track(pfn, size, prot))
> +		return ERR_PTR(-EINVAL);
> +
> +	ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
> +	if (unlikely(!ctx)) {
> +		pfnmap_untrack(pfn, size);
> +		return ERR_PTR(-ENOMEM);
> +	}
> +
> +	ctx->pfn = pfn;
> +	ctx->size = size;
> +	kref_init(&ctx->kref);
> +	return ctx;
> +}
> +
> +void pfnmap_track_ctx_release(struct kref *ref)
> +{
> +	struct pfnmap_track_ctx *ctx = container_of(ref, struct pfnmap_track_ctx, kref);
> +
> +	pfnmap_untrack(ctx->pfn, ctx->size);
> +	kfree(ctx);
> +}
> +#endif /* __HAVE_PFNMAP_TRACKING */
> +
>  /**
>   * remap_pfn_range - remap kernel memory to userspace
>   * @vma: user vma to map to
> @@ -2883,20 +2902,50 @@ int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
>   *
>   * Return: %0 on success, negative error code otherwise.
>   */
> +#ifdef __HAVE_PFNMAP_TRACKING
>  int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
>  		    unsigned long pfn, unsigned long size, pgprot_t prot)
>  {
> +	struct pfnmap_track_ctx *ctx = NULL;
>  	int err;
>  
> -	err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size));
> -	if (err)
> +	size = PAGE_ALIGN(size);
> +
> +	/*
> +	 * If we cover the full VMA, we'll perform actual tracking, and
> +	 * remember to untrack when the last reference to our tracking
> +	 * context from a VMA goes away.
> +	 *
> +	 * If we only cover parts of the VMA, we'll only sanitize the
> +	 * pgprot.
> +	 */
> +	if (addr == vma->vm_start && addr + size == vma->vm_end) {
> +		if (vma->pfnmap_track_ctx)
> +			return -EINVAL;
> +		ctx = pfnmap_track_ctx_alloc(pfn, size, &prot);
> +		if (IS_ERR(ctx))
> +			return PTR_ERR(ctx);
> +	} else if (pfnmap_sanitize_pgprot(pfn, size, &prot)) {
>  		return -EINVAL;
> +	}
>  
>  	err = remap_pfn_range_notrack(vma, addr, pfn, size, prot);
> -	if (err)
> -		untrack_pfn(vma, pfn, PAGE_ALIGN(size), true);
> +	if (ctx) {
> +		if (err)
> +			kref_put(&ctx->kref, pfnmap_track_ctx_release);
> +		else
> +			vma->pfnmap_track_ctx = ctx;
> +	}
>  	return err;
>  }
> +
> +#else
> +int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
> +		unsigned long pfn, unsigned long size, pgprot_t prot)
> +{
> +	return remap_pfn_range_notrack(vma, addr, pfn, size, prot);
> +}
> +#endif
>  EXPORT_SYMBOL(remap_pfn_range);
>  
>  /**
> diff --git a/mm/mremap.c b/mm/mremap.c
> index 7db9da609c84f..6e78e02f74bd3 100644
> --- a/mm/mremap.c
> +++ b/mm/mremap.c
> @@ -1191,10 +1191,6 @@ static int copy_vma_and_data(struct vma_remap_struct *vrm,
>  	if (is_vm_hugetlb_page(vma))
>  		clear_vma_resv_huge_pages(vma);
>  
> -	/* Tell pfnmap has moved from this vma */
> -	if (unlikely(vma->vm_flags & VM_PFNMAP))
> -		untrack_pfn_clear(vma);
> -
>  	*new_vma_ptr = new_vma;
>  	return err;
>  }
> --
> 2.49.0
> 
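
To make the lifecycle concrete for anyone skimming: the tracking context is
allocated once in remap_pfn_range() when the range covers the whole VMA, every
VMA duplication (split, mremap, fork) just takes another kref on it, and
pfnmap_untrack() only runs when the last VMA holding a reference is freed.
Below is a rough userspace model of that sharing - simplified types, a plain
int instead of a kref, no locking, made-up names (track_ctx, fake_vma), and
obviously not the kernel code itself - just to illustrate when the untrack
actually happens:

#include <stdio.h>
#include <stdlib.h>

/* Simplified stand-in for the patch's pfnmap_track_ctx. */
struct track_ctx {
	int refcount;			/* models the kref */
	unsigned long pfn;
	unsigned long size;
};

/* Only the bit of the VMA we care about here. */
struct fake_vma {
	struct track_ctx *ctx;		/* models vma->pfnmap_track_ctx */
};

static struct track_ctx *ctx_alloc(unsigned long pfn, unsigned long size)
{
	struct track_ctx *ctx = malloc(sizeof(*ctx));

	ctx->refcount = 1;		/* kref_init() */
	ctx->pfn = pfn;
	ctx->size = size;
	printf("track   pfn=%lx size=%lx\n", pfn, size);	/* pfnmap_track() */
	return ctx;
}

static void vma_ctx_dup(struct fake_vma *orig, struct fake_vma *new)
{
	if (orig->ctx)
		orig->ctx->refcount++;	/* kref_get() on split/mremap/fork */
	new->ctx = orig->ctx;
}

static void vma_free(struct fake_vma *vma)
{
	/* kref_put(): only the last VMA referencing the ctx untracks. */
	if (vma->ctx && --vma->ctx->refcount == 0) {
		printf("untrack pfn=%lx size=%lx\n", vma->ctx->pfn, vma->ctx->size);
		free(vma->ctx);
	}
	vma->ctx = NULL;
}

int main(void)
{
	struct fake_vma a = { .ctx = ctx_alloc(0x1000, 0x4000) };
	struct fake_vma b = { 0 };

	vma_ctx_dup(&a, &b);	/* e.g. a VMA split: both halves share the ctx */
	vma_free(&a);		/* partial unmap: nothing is untracked yet */
	vma_free(&b);		/* last reference gone: untrack happens here */
	return 0;
}

Which matches the commit message: partially unmapping or shrinking via
mremap() no longer adjusts the reservation, it simply stays around until the
last piece of the original VMA is gone.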