On Tue, 2025-08-19 at 20:03 +1000, Alistair Popple wrote: > On Sat, Aug 09, 2025 at 03:51:32PM +0200, Thomas Hellström wrote: > > GPU use-cases for mmu_interval_notifiers with hmm often involve > > starting a gpu operation and then waiting for it to complete. > > These operations are typically context preemption or TLB flushing. > > > > With single-pass notifiers per GPU this doesn't scale in > > multi-gpu scenarios. In those scenarios we'd want to first start > > preemption- or TLB flushing on all GPUs and as a second pass wait > > for them to complete on all gpus. > > > > One can do this on per-driver basis multiplexing per-driver > > notifiers but that would mean sharing the notifier "user" lock > > across all GPUs and that doesn't scale well either, so adding > > support > > for multi-pass in the core appears like the right choice. > > > > Implement multi-pass capability in the mmu_interval_notifier. Use a > > linked list for the additional passes to minimize the impact for > > use-cases that don't need the multi-pass functionality. 
> > > > Cc: Jason Gunthorpe <j...@ziepe.ca> > > Cc: Andrew Morton <a...@linux-foundation.org> > > Cc: Simona Vetter <simona.vet...@ffwll.ch> > > Cc: Dave Airlie <airl...@gmail.com> > > Cc: <dri-devel@lists.freedesktop.org> > > Cc: <linux...@kvack.org> > > Cc: <linux-ker...@vger.kernel.org> > > > > Signed-off-by: Thomas Hellström <thomas.hellst...@linux.intel.com> > > --- > > include/linux/mmu_notifier.h | 30 ++++++++++++++++ > > mm/mmu_notifier.c | 67 +++++++++++++++++++++++++++++++- > > ---- > > 2 files changed, 88 insertions(+), 9 deletions(-) > > > > diff --git a/include/linux/mmu_notifier.h > > b/include/linux/mmu_notifier.h > > index d1094c2d5fb6..1107a8eafd8a 100644 > > --- a/include/linux/mmu_notifier.h > > +++ b/include/linux/mmu_notifier.h > > @@ -233,6 +233,32 @@ struct mmu_notifier { > > unsigned int users; > > }; > > > > +/** > > + * struct mmu_interval_notifier_pass - mmu_interval_notifier > > multi-pass abstraction > > + * @link: List link for the notifiers pending pass list > > + * > > + * Allocate, typically using GFP_NOWAIT in the interval notifier's > > first pass. > > + * If allocation fails (which is not unlikely under memory > > pressure), fall back > > + * to single-pass operation. > > + */ > > +struct mmu_interval_notifier_pass { > > If we limit the number of passes to two maybe call this > `mmu_interval_notifier_finish()`? ... > > > + struct list_head link; > > + /** > > + * @pass: Driver callback for additional pass. > > + * @additional_pass: Pointer to the > > mmu_interval_notifier_pass structure. > > + * @range: The mmu_notifier_range. > > + * @cur_seq: The current sequence set by the first pass. > > + * > > + * Return: Either a pointer to a valid > > mmu_interval_notifier_pass for > > + * another pass to be called, or %NULL if processing is > > complete for this > > + * notifier. There is no error reporting mechanism for > > additional passes.
> > + */ > > + struct mmu_interval_notifier_pass * > > + (*pass) (struct mmu_interval_notifier_pass > > *additional_pass, >
> > > + const struct mmu_notifier_range *range, > > + unsigned long cur_seq); > > +}; > > + > > /** > > * struct mmu_interval_notifier_ops > > * @invalidate: Upon return the caller must stop using any SPTEs > > within this > > @@ -243,6 +269,10 @@ struct mmu_interval_notifier_ops { > > bool (*invalidate)(struct mmu_interval_notifier > > *interval_sub, > > const struct mmu_notifier_range *range, > > unsigned long cur_seq); > > + bool (*invalidate_multipass)(struct mmu_interval_notifier > > *interval_sub, > > ... and then this could be called `invalidate_start()`. That might > address some > of the concerns with naming. Makes sense. I'll have a look at that. /Thomas > > > + const struct > > mmu_notifier_range *range, > > + unsigned long cur_seq, > > + struct > > mmu_interval_notifier_pass **pass); > > }; > > > > struct mmu_interval_notifier { > > diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c > > index 8e0125dc0522..dd6af87db103 100644 > > --- a/mm/mmu_notifier.c > > +++ b/mm/mmu_notifier.c > > @@ -260,6 +260,22 @@ mmu_interval_read_begin(struct > > mmu_interval_notifier *interval_sub) > > } > > EXPORT_SYMBOL_GPL(mmu_interval_read_begin); > > > > +static void mn_itree_additional_passes(struct list_head > > *additional_passes, > > + const struct > > mmu_notifier_range *range, > > + unsigned long cur_seq) > > +{ > > + struct mmu_interval_notifier_pass *p, *next; > > + > > + while (!list_empty(additional_passes)) { > > + list_for_each_entry_safe(p, next, > > additional_passes, link) { > > + list_del_init(&p->link); > > + p = p->pass(p, range, cur_seq); > > + if (p) > > + list_add_tail(&p->link, > > additional_passes); > > + } > > + } > > +} > > + > > static void mn_itree_release(struct mmu_notifier_subscriptions > > *subscriptions, > > struct mm_struct *mm) > > { > > @@ -272,17 +288,32 @@ static void mn_itree_release(struct > > mmu_notifier_subscriptions *subscriptions, > > }; > > struct mmu_interval_notifier *interval_sub; > > unsigned long cur_seq; > > + 
LIST_HEAD(additional_passes); > > bool ret; > > > > for (interval_sub = > > mn_itree_inv_start_range(subscriptions, > > &range, &cur_seq); > > interval_sub; > > interval_sub = mn_itree_inv_next(interval_sub, > > &range)) { > > - ret = interval_sub->ops->invalidate(interval_sub, > > &range, > > - cur_seq); > > + if (interval_sub->ops->invalidate_multipass) { > > + struct mmu_interval_notifier_pass *second > > = NULL; > > + > > + ret = interval_sub->ops- > > >invalidate_multipass(interval_sub, > > + > > &range, > > + > > cur_seq, > > + > > &second); > > + if (ret && second) > > + list_add_tail(&second->link, > > &additional_passes); > > + > > + } else { > > + ret = interval_sub->ops- > > >invalidate(interval_sub, > > + > > &range, > > + > > cur_seq); > > + } > > WARN_ON(!ret); > > } > > > > + mn_itree_additional_passes(&additional_passes, &range, > > cur_seq); > > mn_itree_inv_end(subscriptions); > > } > > > > @@ -431,6 +462,8 @@ static int mn_itree_invalidate(struct > > mmu_notifier_subscriptions *subscriptions, > > { > > struct mmu_interval_notifier *interval_sub; > > unsigned long cur_seq; > > + LIST_HEAD(additional_passes); > > + int err = 0; > > > > for (interval_sub = > > mn_itree_inv_start_range(subscriptions, > > range, &cur_seq); > > @@ -438,23 +471,39 @@ static int mn_itree_invalidate(struct > > mmu_notifier_subscriptions *subscriptions, > > interval_sub = mn_itree_inv_next(interval_sub, > > range)) { > > bool ret; > > > > - ret = interval_sub->ops->invalidate(interval_sub, > > range, > > - cur_seq); > > + if (interval_sub->ops->invalidate_multipass) { > > + struct mmu_interval_notifier_pass *second > > = NULL; > > + > > + ret = interval_sub->ops- > > >invalidate_multipass(interval_sub, > > + > > range, > > + > > cur_seq, > > + > > &second); > > + if (ret && second) > > + list_add_tail(&second->link, > > &additional_passes); > > + > > + } else { > > + ret = interval_sub->ops- > > >invalidate(interval_sub, > > + range, > > + > > cur_seq); > > + } > > if 
(!ret) { > > if > > (WARN_ON(mmu_notifier_range_blockable(range))) > > continue; > > - goto out_would_block; > > + err = -EAGAIN; > > + break; > > } > > } > > - return 0; > > > > -out_would_block: > > + mn_itree_additional_passes(&additional_passes, range, > > cur_seq); > > + > > /* > > * On -EAGAIN the non-blocking caller is not allowed to > > call > > * invalidate_range_end() > > */ > > - mn_itree_inv_end(subscriptions); > > - return -EAGAIN; > > + if (err) > > + mn_itree_inv_end(subscriptions); > > + > > + return err; > > } > > > > static int mn_hlist_invalidate_range_start( > > -- > > 2.50.1 > > > >