On Thu, 12 Mar 2026 20:27:19 +0000 "Lorenzo Stoakes (Oracle)" <[email protected]> wrote:
> Previously, when a driver needed to do something like establish a reference > count, it could do so in the mmap hook in the knowledge that the mapping > would succeed. > > With the introduction of f_op->mmap_prepare this is no longer the case, as > it is invoked prior to actually establishing the mapping. > > To take this into account, introduce a new vm_ops->mapped callback which is > invoked when the VMA is first mapped (though notably - not when it is > merged - which is correct and mirrors existing mmap/open/close behaviour). > > We do better than vm_ops->open() here, as this callback can return an > error, at which point the VMA will be unmapped. > > Note that vm_ops->mapped() is invoked after any mmap action is > complete (such as I/O remapping). > > We intentionally do not expose the VMA at this point, exposing only the > fields that could be used, and an output parameter in case the operation > needs to update the vma->vm_private_data field. > > In order to deal with stacked filesystems which invoke an inner filesystem's > mmap(), add __compat_vma_mapped() and invoke it on > vfs_mmap() (via compat_vma_mmap()) to ensure that the mapped callback is > handled when an mmap() caller invokes a nested filesystem's mmap_prepare() > callback. > > We can now also remove call_action_complete() and invoke > mmap_action_complete() directly, as we separate out the rmap lock logic to > be called in __mmap_region() instead via maybe_drop_file_rmap_lock(). > > We also abstract unmapping of a VMA on mmap action completion into its own > helper function, unmap_vma_locked(). > > Additionally, update VMA userland test headers to reflect the change. 
> > Signed-off-by: Lorenzo Stoakes (Oracle) <[email protected]> > --- > include/linux/fs.h | 9 +++- > include/linux/mm.h | 17 +++++++ > mm/internal.h | 10 ++++ > mm/util.c | 86 ++++++++++++++++++++++++--------- > mm/vma.c | 41 +++++++++++----- > tools/testing/vma/include/dup.h | 34 ++++++++++++- > 6 files changed, 158 insertions(+), 39 deletions(-) > > diff --git a/include/linux/fs.h b/include/linux/fs.h > index a2628a12bd2b..c390f5c667e3 100644 > --- a/include/linux/fs.h > +++ b/include/linux/fs.h > @@ -2059,13 +2059,20 @@ static inline bool can_mmap_file(struct file *file) > } > > int compat_vma_mmap(struct file *file, struct vm_area_struct *vma); > +int __vma_check_mmap_hook(struct vm_area_struct *vma); > > static inline int vfs_mmap(struct file *file, struct vm_area_struct *vma) > { > + int err; > + > if (file->f_op->mmap_prepare) > return compat_vma_mmap(file, vma); > > - return file->f_op->mmap(file, vma); > + err = file->f_op->mmap(file, vma); > + if (err) > + return err; > + > + return __vma_check_mmap_hook(vma); > } > > static inline int vfs_mmap_prepare(struct file *file, struct vm_area_desc > *desc) > diff --git a/include/linux/mm.h b/include/linux/mm.h > index 12a0b4c63736..7333d5db1221 100644 > --- a/include/linux/mm.h > +++ b/include/linux/mm.h > @@ -759,6 +759,23 @@ struct vm_operations_struct { > * Context: User context. May sleep. Caller holds mmap_lock. > */ > void (*close)(struct vm_area_struct *vma); > + /** > + * @mapped: Called when the VMA is first mapped in the MM. Not called if > + * the new VMA is merged with an adjacent VMA. > + * > + * The @vm_private_data field is an output field allowing the user to > + * modify vma->vm_private_data as necessary. > + * > + * ONLY valid if set from f_op->mmap_prepare. Will result in an error if > + * set from f_op->mmap. > + * > + * Returns %0 on success, or an error otherwise. On error, the VMA will > + * be unmapped. > + * > + * Context: User context. May sleep. Caller holds mmap_lock. 
> + */ > + int (*mapped)(unsigned long start, unsigned long end, pgoff_t pgoff, > + const struct file *file, void **vm_private_data); > /* Called any time before splitting to check if it's allowed */ > int (*may_split)(struct vm_area_struct *vma, unsigned long addr); > int (*mremap)(struct vm_area_struct *vma); > diff --git a/mm/internal.h b/mm/internal.h > index 7bfa85b5e78b..f0f2cf1caa36 100644 > --- a/mm/internal.h > +++ b/mm/internal.h > @@ -158,6 +158,8 @@ static inline void *folio_raw_mapping(const struct folio > *folio) > * mmap hook and safely handle error conditions. On error, VMA hooks will be > * mutated. > * > + * IMPORTANT: f_op->mmap() is deprecated, prefer f_op->mmap_prepare(). > + * > * @file: File which backs the mapping. > * @vma: VMA which we are mapping. > * > @@ -201,6 +203,14 @@ static inline void vma_close(struct vm_area_struct *vma) > /* unmap_vmas is in mm/memory.c */ > void unmap_vmas(struct mmu_gather *tlb, struct unmap_desc *unmap); > > +static inline void unmap_vma_locked(struct vm_area_struct *vma) > +{ > + const size_t len = vma_pages(vma) << PAGE_SHIFT; > + > + mmap_assert_locked(vma->vm_mm); > + do_munmap(vma->vm_mm, vma->vm_start, len, NULL); > +} > + > #ifdef CONFIG_MMU > > static inline void get_anon_vma(struct anon_vma *anon_vma) > diff --git a/mm/util.c b/mm/util.c > index dba1191725b6..2b0ed54008d6 100644 > --- a/mm/util.c > +++ b/mm/util.c > @@ -1163,6 +1163,55 @@ void flush_dcache_folio(struct folio *folio) > EXPORT_SYMBOL(flush_dcache_folio); > #endif > > +static int __compat_vma_mmap(struct file *file, struct vm_area_struct *vma) > +{ > + struct vm_area_desc desc = { > + .mm = vma->vm_mm, > + .file = file, > + .start = vma->vm_start, > + .end = vma->vm_end, > + > + .pgoff = vma->vm_pgoff, > + .vm_file = vma->vm_file, > + .vma_flags = vma->flags, > + .page_prot = vma->vm_page_prot, > + > + .action.type = MMAP_NOTHING, /* Default */ > + }; > + int err; > + > + err = vfs_mmap_prepare(file, &desc); > + if (err) > + return err; 
> + > + err = mmap_action_prepare(&desc, &desc.action); > + if (err) > + return err; > + > + set_vma_from_desc(vma, &desc); > + return mmap_action_complete(vma, &desc.action); > +} > + > +static int __compat_vma_mapped(struct file *file, struct vm_area_struct *vma) > +{ > + const struct vm_operations_struct *vm_ops = vma->vm_ops; > + void *vm_private_data = vma->vm_private_data; > + int err; > + > + if (!vm_ops->mapped) > + return 0; > + Hello! Can vm_ops be NULL here? __compat_vma_mapped() is called from compat_vma_mmap(), which is reached when a filesystem provides mmap_prepare. If the mmap_prepare hook does not set desc->vm_ops, vma->vm_ops will be NULL and this dereferences a NULL pointer. For example, in drivers/char/mem.c, mmap_zero_prepare() would trigger a NULL pointer dereference here. This check should instead be: if (!vm_ops || !vm_ops->mapped) return 0; > + err = vm_ops->mapped(vma->vm_start, vma->vm_end, vma->vm_pgoff, file, > + &vm_private_data); > + if (err) > + unmap_vma_locked(vma); When mapped() returns an error, unmap_vma_locked(vma) is called but execution continues into the vm_private_data update below. After unmap_vma_locked() the VMA may be freed (do_munmap can remove the VMA entirely), so accessing vma->vm_private_data after that is a use-after-free. This should instead be: if (err) { unmap_vma_locked(vma); return err; } > + /* Update private data if changed. */ > + if (vm_private_data != vma->vm_private_data) > + vma->vm_private_data = vm_private_data; > + > + return err; > +} > + > /** > * compat_vma_mmap() - Apply the file's .mmap_prepare() hook to an > * existing VMA and execute any requested actions. 
> @@ -1191,34 +1240,26 @@ EXPORT_SYMBOL(flush_dcache_folio); > */ > int compat_vma_mmap(struct file *file, struct vm_area_struct *vma) > { > - struct vm_area_desc desc = { > - .mm = vma->vm_mm, > - .file = file, > - .start = vma->vm_start, > - .end = vma->vm_end, > - > - .pgoff = vma->vm_pgoff, > - .vm_file = vma->vm_file, > - .vma_flags = vma->flags, > - .page_prot = vma->vm_page_prot, > - > - .action.type = MMAP_NOTHING, /* Default */ > - }; > int err; > > - err = vfs_mmap_prepare(file, &desc); > - if (err) > - return err; > - > - err = mmap_action_prepare(&desc, &desc.action); > + err = __compat_vma_mmap(file, vma); > if (err) > return err; > > - set_vma_from_desc(vma, &desc); > - return mmap_action_complete(vma, &desc.action); > + return __compat_vma_mapped(file, vma); > } > EXPORT_SYMBOL(compat_vma_mmap); > > +int __vma_check_mmap_hook(struct vm_area_struct *vma) > +{ > + /* vm_ops->mapped is not valid if mmap() is specified. */ > + if (WARN_ON_ONCE(vma->vm_ops->mapped)) > + return -EINVAL; I think vma->vm_ops can be NULL here. Should be: if (vma->vm_ops && WARN_ON_ONCE(vma->vm_ops->mapped)) return -EINVAL; > + > + return 0; > +} > +EXPORT_SYMBOL(__vma_check_mmap_hook); > + > static void set_ps_flags(struct page_snapshot *ps, const struct folio *folio, > const struct page *page) > { > @@ -1316,10 +1357,7 @@ static int mmap_action_finish(struct vm_area_struct > *vma, > * invoked if we do NOT merge, so we only clean up the VMA we created. > */ > if (err) { > - const size_t len = vma_pages(vma) << PAGE_SHIFT; > - > - do_munmap(current->mm, vma->vm_start, len, NULL); > - > + unmap_vma_locked(vma); > if (action->error_hook) { > /* We may want to filter the error. 
*/ > err = action->error_hook(err); > diff --git a/mm/vma.c b/mm/vma.c > index 054cf1d262fb..ef9f5a5365d1 100644 > --- a/mm/vma.c > +++ b/mm/vma.c > @@ -2705,21 +2705,35 @@ static bool can_set_ksm_flags_early(struct mmap_state > *map) > return false; > } > > -static int call_action_complete(struct mmap_state *map, > - struct mmap_action *action, > - struct vm_area_struct *vma) > +static int call_mapped_hook(struct vm_area_struct *vma) > { > - int ret; > + const struct vm_operations_struct *vm_ops = vma->vm_ops; > + void *vm_private_data = vma->vm_private_data; > + int err; > > - ret = mmap_action_complete(vma, action); > + if (!vm_ops || !vm_ops->mapped) > + return 0; > + err = vm_ops->mapped(vma->vm_start, vma->vm_end, vma->vm_pgoff, > + vma->vm_file, &vm_private_data); > + if (err) { > + unmap_vma_locked(vma); > + return err; > + } > + /* Update private data if changed. */ > + if (vm_private_data != vma->vm_private_data) > + vma->vm_private_data = vm_private_data; > + return 0; > +} > > - /* If we held the file rmap we need to release it. 
*/ > - if (map->hold_file_rmap_lock) { > - struct file *file = vma->vm_file; > +static void maybe_drop_file_rmap_lock(struct mmap_state *map, > + struct vm_area_struct *vma) > +{ > + struct file *file; > > - i_mmap_unlock_write(file->f_mapping); > - } > - return ret; > + if (!map->hold_file_rmap_lock) > + return; > + file = vma->vm_file; > + i_mmap_unlock_write(file->f_mapping); > } > > static unsigned long __mmap_region(struct file *file, unsigned long addr, > @@ -2773,8 +2787,11 @@ static unsigned long __mmap_region(struct file *file, > unsigned long addr, > __mmap_complete(&map, vma); > > if (have_mmap_prepare && allocated_new) { > - error = call_action_complete(&map, &desc.action, vma); > + error = mmap_action_complete(vma, &desc.action); > + if (!error) > + error = call_mapped_hook(vma); > > + maybe_drop_file_rmap_lock(&map, vma); > if (error) > return error; > } > diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h > index 908beb263307..47d8db809f31 100644 > --- a/tools/testing/vma/include/dup.h > +++ b/tools/testing/vma/include/dup.h > @@ -606,12 +606,34 @@ struct vm_area_struct { > } __randomize_layout; > > struct vm_operations_struct { > - void (*open)(struct vm_area_struct * area); > + /** > + * @open: Called when a VMA is remapped or split. Not called upon first > + * mapping a VMA. > + * Context: User context. May sleep. Caller holds mmap_lock. > + */ > + void (*open)(struct vm_area_struct *vma); > /** > * @close: Called when the VMA is being removed from the MM. > * Context: User context. May sleep. Caller holds mmap_lock. > */ > - void (*close)(struct vm_area_struct * area); > + void (*close)(struct vm_area_struct *vma); > + /** > + * @mapped: Called when the VMA is first mapped in the MM. Not called if > + * the new VMA is merged with an adjacent VMA. > + * > + * The @vm_private_data field is an output field allowing the user to > + * modify vma->vm_private_data as necessary. 
> + * > + * ONLY valid if set from f_op->mmap_prepare. Will result in an error if > + * set from f_op->mmap. > + * > + * Returns %0 on success, or an error otherwise. On error, the VMA will > + * be unmapped. > + * > + * Context: User context. May sleep. Caller holds mmap_lock. > + */ > + int (*mapped)(unsigned long start, unsigned long end, pgoff_t pgoff, > + const struct file *file, void **vm_private_data); > /* Called any time before splitting to check if it's allowed */ > int (*may_split)(struct vm_area_struct *area, unsigned long addr); > int (*mremap)(struct vm_area_struct *area); > @@ -1345,3 +1367,11 @@ static inline void vma_set_file(struct vm_area_struct > *vma, struct file *file) > swap(vma->vm_file, file); > fput(file); > } > + > +static inline void unmap_vma_locked(struct vm_area_struct *vma) > +{ > + const size_t len = vma_pages(vma) << PAGE_SHIFT; > + > + mmap_assert_locked(vma->vm_mm); > + do_munmap(vma->vm_mm, vma->vm_start, len, NULL); > +} > -- > 2.53.0 > >

