On Mon, Jul 7, 2025 at 11:21 AM Liam R. Howlett <liam.howl...@oracle.com> wrote:
>
> * Suren Baghdasaryan <sur...@google.com> [250704 02:07]:
> > With maple_tree supporting vma tree traversal under RCU and per-vma
> > locks, /proc/pid/maps can be read while holding individual vma locks
> > instead of locking the entire address space.
> > Completely lockless approach (walking vma tree under RCU) would be quite
> > complex with the main issue being get_vma_name() using callbacks which
> > might not work correctly with a stable vma copy, requiring original
> > (unstable) vma - see special_mapping_name() for an example.
> > When per-vma lock acquisition fails, we take the mmap_lock for reading,
> > lock the vma, release the mmap_lock and continue. This fallback to mmap
> > read lock guarantees the reader to make forward progress even during
> > lock contention. This will interfere with the writer but for a very
> > short time while we are acquiring the per-vma lock and only when there
> > was contention on the vma reader is interested in. We shouldn't see a
> > repeated fallback to mmap read locks in practice, as this require a
> > very unlikely series of lock contentions (for instance due to repeated
> > vma split operations). However even if this did somehow happen, we would
> > still progress.
> > One case requiring special handling is when vma changes between the
> > time it was found and the time it got locked. A problematic case would
> > be if vma got shrunk so that it's start moved higher in the address
> > space and a new vma was installed at the beginning:
> >
> > reader found:               |--------VMA A--------|
> > VMA is modified:            |-VMA B-|----VMA A----|
> > reader locks modified VMA A
> > reader reports VMA A:       |  gap  |----VMA A----|
> >
> > This would result in reporting a gap in the address space that does not
> > exist. To prevent this we retry the lookup after locking the vma, however
> > we do that only when we identify a gap and detect that the address space
> > was changed after we found the vma.
> > This change is designed to reduce mmap_lock contention and prevent a
> > process reading /proc/pid/maps files (often a low priority task, such
> > as monitoring/data collection services) from blocking address space
> > updates. Note that this change has a userspace visible disadvantage:
> > it allows for sub-page data tearing as opposed to the previous mechanism
> > where data tearing could happen only between pages of generated output
> > data. Since current userspace considers data tearing between pages to be
> > acceptable, we assume is will be able to handle sub-page data tearing
> > as well.
> >
> > Signed-off-by: Suren Baghdasaryan <sur...@google.com>
>
> Reviewed-by: Liam R. Howlett <liam.howl...@oracle.com>

Thanks!
I'll update addressing Lorenzo's nits and will repost in a couple
days. Hopefully by then I can get some reviews for the tests in the
series.

>
> > ---
> >  fs/proc/internal.h        |   5 ++
> >  fs/proc/task_mmu.c        | 118 ++++++++++++++++++++++++++++++++++----
> >  include/linux/mmap_lock.h |  11 ++++
> >  mm/madvise.c              |   3 +-
> >  mm/mmap_lock.c            |  88 ++++++++++++++++++++++++++++
> >  5 files changed, 214 insertions(+), 11 deletions(-)
> >
> > diff --git a/fs/proc/internal.h b/fs/proc/internal.h
> > index 3d48ffe72583..7c235451c5ea 100644
> > --- a/fs/proc/internal.h
> > +++ b/fs/proc/internal.h
> > @@ -384,6 +384,11 @@ struct proc_maps_private {
> >       struct task_struct *task;
> >       struct mm_struct *mm;
> >       struct vma_iterator iter;
> > +     loff_t last_pos;
> > +#ifdef CONFIG_PER_VMA_LOCK
> > +     bool mmap_locked;
> > +     struct vm_area_struct *locked_vma;
> > +#endif
> >  #ifdef CONFIG_NUMA
> >       struct mempolicy *task_mempolicy;
> >  #endif
> > diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> > index b8bc06d05a72..ff3fe488ce51 100644
> > --- a/fs/proc/task_mmu.c
> > +++ b/fs/proc/task_mmu.c
> > @@ -127,15 +127,107 @@ static void release_task_mempolicy(struct 
> > proc_maps_private *priv)
> >  }
> >  #endif
> >
> > -static struct vm_area_struct *proc_get_vma(struct proc_maps_private *priv,
> > -                                             loff_t *ppos)
> > +#ifdef CONFIG_PER_VMA_LOCK
> > +
> > +static void unlock_vma(struct proc_maps_private *priv)
> > +{
> > +     if (priv->locked_vma) {
> > +             vma_end_read(priv->locked_vma);
> > +             priv->locked_vma = NULL;
> > +     }
> > +}
> > +
> > +static const struct seq_operations proc_pid_maps_op;
> > +
> > +static inline bool lock_vma_range(struct seq_file *m,
> > +                               struct proc_maps_private *priv)
> > +{
> > +     /*
> > +      * smaps and numa_maps perform page table walk, therefore require
> > +      * mmap_lock but maps can be read with locking just the vma.
> > +      */
> > +     if (m->op != &proc_pid_maps_op) {
> > +             if (mmap_read_lock_killable(priv->mm))
> > +                     return false;
> > +
> > +             priv->mmap_locked = true;
> > +     } else {
> > +             rcu_read_lock();
> > +             priv->locked_vma = NULL;
> > +             priv->mmap_locked = false;
> > +     }
> > +
> > +     return true;
> > +}
> > +
> > +static inline void unlock_vma_range(struct proc_maps_private *priv)
> > +{
> > +     if (priv->mmap_locked) {
> > +             mmap_read_unlock(priv->mm);
> > +     } else {
> > +             unlock_vma(priv);
> > +             rcu_read_unlock();
> > +     }
> > +}
> > +
> > +static struct vm_area_struct *get_next_vma(struct proc_maps_private *priv,
> > +                                        loff_t last_pos)
> > +{
> > +     struct vm_area_struct *vma;
> > +
> > +     if (priv->mmap_locked)
> > +             return vma_next(&priv->iter);
> > +
> > +     unlock_vma(priv);
> > +     vma = lock_next_vma(priv->mm, &priv->iter, last_pos);
> > +     if (!IS_ERR_OR_NULL(vma))
> > +             priv->locked_vma = vma;
> > +
> > +     return vma;
> > +}
> > +
> > +#else /* CONFIG_PER_VMA_LOCK */
> > +
> > +static inline bool lock_vma_range(struct seq_file *m,
> > +                               struct proc_maps_private *priv)
> >  {
> > -     struct vm_area_struct *vma = vma_next(&priv->iter);
> > +     return mmap_read_lock_killable(priv->mm) == 0;
> > +}
> > +
> > +static inline void unlock_vma_range(struct proc_maps_private *priv)
> > +{
> > +     mmap_read_unlock(priv->mm);
> > +}
> > +
> > +static struct vm_area_struct *get_next_vma(struct proc_maps_private *priv,
> > +                                        loff_t last_pos)
> > +{
> > +     return vma_next(&priv->iter);
> > +}
> >
> > +#endif /* CONFIG_PER_VMA_LOCK */
> > +
> > +static struct vm_area_struct *proc_get_vma(struct seq_file *m, loff_t 
> > *ppos)
> > +{
> > +     struct proc_maps_private *priv = m->private;
> > +     struct vm_area_struct *vma;
> > +
> > +     vma = get_next_vma(priv, *ppos);
> > +     /* EINTR is possible */
> > +     if (IS_ERR(vma))
> > +             return vma;
> > +
> > +     /* Store previous position to be able to restart if needed */
> > +     priv->last_pos = *ppos;
> >       if (vma) {
> > -             *ppos = vma->vm_start;
> > +             /*
> > +              * Track the end of the reported vma to ensure position 
> > changes
> > +              * even if previous vma was merged with the next vma and we
> > +              * found the extended vma with the same vm_start.
> > +              */
> > +             *ppos = vma->vm_end;
> >       } else {
> > -             *ppos = -2;
> > +             *ppos = -2; /* -2 indicates gate vma */
> >               vma = get_gate_vma(priv->mm);
> >       }
> >
> > @@ -163,28 +255,34 @@ static void *m_start(struct seq_file *m, loff_t *ppos)
> >               return NULL;
> >       }
> >
> > -     if (mmap_read_lock_killable(mm)) {
> > +     if (!lock_vma_range(m, priv)) {
> >               mmput(mm);
> >               put_task_struct(priv->task);
> >               priv->task = NULL;
> >               return ERR_PTR(-EINTR);
> >       }
> >
> > +     /*
> > +      * Reset current position if last_addr was set before
> > +      * and it's not a sentinel.
> > +      */
> > +     if (last_addr > 0)
> > +             *ppos = last_addr = priv->last_pos;
> >       vma_iter_init(&priv->iter, mm, (unsigned long)last_addr);
> >       hold_task_mempolicy(priv);
> >       if (last_addr == -2)
> >               return get_gate_vma(mm);
> >
> > -     return proc_get_vma(priv, ppos);
> > +     return proc_get_vma(m, ppos);
> >  }
> >
> >  static void *m_next(struct seq_file *m, void *v, loff_t *ppos)
> >  {
> >       if (*ppos == -2) {
> > -             *ppos = -1;
> > +             *ppos = -1; /* -1 indicates no more vmas */
> >               return NULL;
> >       }
> > -     return proc_get_vma(m->private, ppos);
> > +     return proc_get_vma(m, ppos);
> >  }
> >
> >  static void m_stop(struct seq_file *m, void *v)
> > @@ -196,7 +294,7 @@ static void m_stop(struct seq_file *m, void *v)
> >               return;
> >
> >       release_task_mempolicy(priv);
> > -     mmap_read_unlock(mm);
> > +     unlock_vma_range(priv);
> >       mmput(mm);
> >       put_task_struct(priv->task);
> >       priv->task = NULL;
> > diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h
> > index 5da384bd0a26..1f4f44951abe 100644
> > --- a/include/linux/mmap_lock.h
> > +++ b/include/linux/mmap_lock.h
> > @@ -309,6 +309,17 @@ void vma_mark_detached(struct vm_area_struct *vma);
> >  struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
> >                                         unsigned long address);
> >
> > +/*
> > + * Locks next vma pointed by the iterator. Confirms the locked vma has not
> > + * been modified and will retry under mmap_lock protection if modification
> > + * was detected. Should be called from read RCU section.
> > + * Returns either a valid locked VMA, NULL if no more VMAs or -EINTR if the
> > + * process was interrupted.
> > + */
> > +struct vm_area_struct *lock_next_vma(struct mm_struct *mm,
> > +                                  struct vma_iterator *iter,
> > +                                  unsigned long address);
> > +
> >  #else /* CONFIG_PER_VMA_LOCK */
> >
> >  static inline void mm_lock_seqcount_init(struct mm_struct *mm) {}
> > diff --git a/mm/madvise.c b/mm/madvise.c
> > index a34c2c89a53b..e61e32b2cd91 100644
> > --- a/mm/madvise.c
> > +++ b/mm/madvise.c
> > @@ -108,7 +108,8 @@ void anon_vma_name_free(struct kref *kref)
> >
> >  struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma)
> >  {
> > -     mmap_assert_locked(vma->vm_mm);
> > +     if (!rwsem_is_locked(&vma->vm_mm->mmap_lock))
> > +             vma_assert_locked(vma);
> >
> >       return vma->anon_name;
> >  }
> > diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c
> > index 5f725cc67334..ed0e5e2171cd 100644
> > --- a/mm/mmap_lock.c
> > +++ b/mm/mmap_lock.c
> > @@ -178,6 +178,94 @@ struct vm_area_struct *lock_vma_under_rcu(struct 
> > mm_struct *mm,
> >       count_vm_vma_lock_event(VMA_LOCK_ABORT);
> >       return NULL;
> >  }
> > +
> > +static struct vm_area_struct *lock_vma_under_mmap_lock(struct mm_struct 
> > *mm,
> > +                                                    struct vma_iterator 
> > *iter,
> > +                                                    unsigned long address)
> > +{
> > +     struct vm_area_struct *vma;
> > +     int ret;
> > +
> > +     ret = mmap_read_lock_killable(mm);
> > +     if (ret)
> > +             return ERR_PTR(ret);
> > +
> > +     /* Lookup the vma at the last position again under mmap_read_lock */
> > +     vma_iter_init(iter, mm, address);
> > +     vma = vma_next(iter);
> > +     if (vma)
> > +             vma_start_read_locked(vma);
> > +
> > +     mmap_read_unlock(mm);
> > +
> > +     return vma;
> > +}
> > +
> > +struct vm_area_struct *lock_next_vma(struct mm_struct *mm,
> > +                                  struct vma_iterator *iter,
> > +                                  unsigned long address)
> > +{
> > +     struct vm_area_struct *vma;
> > +     unsigned int mm_wr_seq;
> > +     bool mmap_unlocked;
> > +
> > +     RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu read lock held");
> > +retry:
> > +     /* Start mmap_lock speculation in case we need to verify the vma 
> > later */
> > +     mmap_unlocked = mmap_lock_speculate_try_begin(mm, &mm_wr_seq);
> > +     vma = vma_next(iter);
> > +     if (!vma)
> > +             return NULL;
> > +
> > +     vma = vma_start_read(mm, vma);
> > +
> > +     if (IS_ERR_OR_NULL(vma)) {
> > +             /*
> > +              * Retry immediately if the vma gets detached from under us.
> > +              * Infinite loop should not happen because the vma we find 
> > will
> > +              * have to be constantly knocked out from under us.
> > +              */
> > +             if (PTR_ERR(vma) == -EAGAIN) {
> > +                     vma_iter_init(iter, mm, address);
> > +                     goto retry;
> > +             }
> > +
> > +             goto out;
> > +     }
> > +
> > +     /*
> > +      * Verify the vma we locked belongs to the same address space and it's
> > +      * not behind of the last search position.
> > +      */
> > +     if (unlikely(vma->vm_mm != mm || address >= vma->vm_end))
> > +             goto out_unlock;
> > +
> > +     /*
> > +      * vma can be ahead of the last search position but we need to verify
> > +      * it was not shrunk after we found it and another vma has not been
> > +      * installed ahead of it. Otherwise we might observe a gap that should
> > +      * not be there.
> > +      */
> > +     if (address < vma->vm_start) {
> > +             /* Verify only if the address space might have changed since 
> > vma lookup. */
> > +             if (!mmap_unlocked || mmap_lock_speculate_retry(mm, 
> > mm_wr_seq)) {
> > +                     vma_iter_init(iter, mm, address);
> > +                     if (vma != vma_next(iter))
> > +                             goto out_unlock;
> > +             }
> > +     }
> > +
> > +     return vma;
> > +
> > +out_unlock:
> > +     vma_end_read(vma);
> > +out:
> > +     rcu_read_unlock();
> > +     vma = lock_vma_under_mmap_lock(mm, iter, address);
> > +     rcu_read_lock();
> > +
> > +     return vma;
> > +}
> >  #endif /* CONFIG_PER_VMA_LOCK */
> >
> >  #ifdef CONFIG_LOCK_MM_AND_FIND_VMA
> > --
> > 2.50.0.727.gbf7dc18ff4-goog
> >

Reply via email to