The branch main has been updated by markj: URL: https://cgit.FreeBSD.org/src/commit/?id=c2230ad3b121bd1e57fdb2c466f5b826aad5f730
commit c2230ad3b121bd1e57fdb2c466f5b826aad5f730 Author: Mark Johnston <ma...@freebsd.org> AuthorDate: 2025-08-01 18:16:17 +0000 Commit: Mark Johnston <ma...@freebsd.org> CommitDate: 2025-08-01 21:44:25 +0000 inotify: Avoid calling vrele() with a namecache mutex held In cache_vop_inotify(), we call inotify_log() with a namecache hash lock held. inotify_log() looks at all watches registered with the vnode to see if any of them are interested in the event. In some cases, we have to detach and free the watch after logging the event. This means we must vrele() the watched vnode, and this must not be done while a non-sleepable lock held. Previously, I deferred the vrele() to until the inotify softc and vnode pollinfo locks were dropped. However, this is not enough since we may still be holding the aforementioned namecache lock. Go further and use a taskqueue thread to release vnode references. Introduce a set of detached watches, and queue a threaded task which releases the vnode reference. Reported by: syzbot+c128f121cb22df955...@syzkaller.appspotmail.com Reviewed by: kib Fixes: f1f230439fa4 ("vfs: Initial revision of inotify") Differential Revision: https://reviews.freebsd.org/D51685 --- sys/kern/vfs_inotify.c | 71 ++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 52 insertions(+), 19 deletions(-) diff --git a/sys/kern/vfs_inotify.c b/sys/kern/vfs_inotify.c index d3cd0d1f9832..746a5a39208e 100644 --- a/sys/kern/vfs_inotify.c +++ b/sys/kern/vfs_inotify.c @@ -34,6 +34,7 @@ #include <sys/sysent.h> #include <sys/syslimits.h> #include <sys/sysproto.h> +#include <sys/taskqueue.h> #include <sys/tree.h> #include <sys/user.h> #include <sys/vnode.h> @@ -168,6 +169,8 @@ struct inotify_softc { size_t nbpending; /* bytes available to read */ uint64_t ino; /* unique identifier */ struct inotify_watch_tree watches; /* active watches */ + TAILQ_HEAD(, inotify_watch) deadwatches; /* watches pending vrele() */ + struct task reaptask; /* task to reap dead watches */ struct selinfo sel; /* select/poll/kevent info */ struct ucred *cred; /* credential ref */ }; @@ -374,6 +377,13 @@ inotify_unlink_watch_locked(struct inotify_softc *sc, struct inotify_watch *watc vn_irflag_unset(vp, VIRF_INOTIFY); } +static void +inotify_free_watch(struct inotify_watch *watch) +{ + vrele(watch->vp); + free(watch, M_INOTIFY); +} + /* * Assumes that the watch has already been removed from its softc. */ @@ -389,9 +399,24 @@ inotify_remove_watch(struct inotify_watch *watch) mtx_lock(&vp->v_pollinfo->vpi_lock); inotify_unlink_watch_locked(sc, watch); mtx_unlock(&vp->v_pollinfo->vpi_lock); + inotify_free_watch(watch); +} - vrele(vp); - free(watch, M_INOTIFY); +static void +inotify_reap(void *arg, int pending) +{ + struct inotify_softc *sc; + struct inotify_watch *watch; + + sc = arg; + mtx_lock(&sc->lock); + while ((watch = TAILQ_FIRST(&sc->deadwatches)) != NULL) { + TAILQ_REMOVE(&sc->deadwatches, watch, vlink); + mtx_unlock(&sc->lock); + inotify_free_watch(watch); + mtx_lock(&sc->lock); + } + mtx_unlock(&sc->lock); } static int @@ -403,6 +428,7 @@ inotify_close(struct file *fp, struct thread *td) sc = fp->f_data; + /* Detach watches from their vnodes. */ mtx_lock(&sc->lock); (void)chginotifycnt(sc->cred->cr_ruidinfo, -1, 0); while ((watch = RB_MIN(inotify_watch_tree, &sc->watches)) != NULL) { @@ -411,6 +437,17 @@ inotify_close(struct file *fp, struct thread *td) inotify_remove_watch(watch); mtx_lock(&sc->lock); } + + /* Make sure that any asynchronous vrele() calls are done. */ + mtx_unlock(&sc->lock); + taskqueue_drain(taskqueue_thread, &sc->reaptask); + mtx_lock(&sc->lock); + KASSERT(RB_EMPTY(&sc->watches), + ("%s: watches not empty in %p", __func__, sc)); + KASSERT(TAILQ_EMPTY(&sc->deadwatches), + ("%s: deadwatches not empty in %p", __func__, sc)); + + /* Drop pending events. */ while (!STAILQ_EMPTY(&sc->pending)) { rec = inotify_dequeue(sc); if (rec != &sc->overflow) @@ -456,6 +493,8 @@ inotify_create_file(struct thread *td, struct file *fp, int flags, int *fflagsp) sc->nextwatch = 1; /* Required for compatibility. */ STAILQ_INIT(&sc->pending); RB_INIT(&sc->watches); + TAILQ_INIT(&sc->deadwatches); + TASK_INIT(&sc->reaptask, 0, inotify_reap, sc); mtx_init(&sc->lock, "inotify", NULL, MTX_DEF); knlist_init_mtx(&sc->sel.si_note, &sc->lock); sc->cred = crhold(td->td_ucred); @@ -560,17 +599,16 @@ inotify_queue_record(struct inotify_softc *sc, struct inotify_record *rec) return (true); } -static int +static void inotify_log_one(struct inotify_watch *watch, const char *name, size_t namelen, int event, uint32_t cookie) { struct inotify_watch key; struct inotify_softc *sc; struct inotify_record *rec; - int relecount; bool allocfail; - relecount = 0; + mtx_assert(&watch->vp->v_pollinfo->vpi_lock, MA_OWNED); sc = watch->sc; rec = inotify_alloc_record(watch->wd, name, namelen, event, cookie, @@ -599,20 +637,22 @@ inotify_log_one(struct inotify_watch *watch, const char *name, size_t namelen, /* * Remove the watch, taking care to handle races with - * inotify_close(). + * inotify_close(). The thread that removes the watch is + * responsible for freeing it. */ key.wd = watch->wd; if (RB_FIND(inotify_watch_tree, &sc->watches, &key) != NULL) { RB_REMOVE(inotify_watch_tree, &sc->watches, watch); inotify_unlink_watch_locked(sc, watch); - free(watch, M_INOTIFY); - /* Defer vrele() to until locks are dropped. */ - relecount++; + /* + * Defer the vrele() to a sleepable thread context. + */ + TAILQ_INSERT_TAIL(&sc->deadwatches, watch, vlink); + taskqueue_enqueue(taskqueue_thread, &sc->reaptask); } } mtx_unlock(&sc->lock); - return (relecount); } void @@ -620,25 +660,18 @@ inotify_log(struct vnode *vp, const char *name, size_t namelen, int event, uint32_t cookie) { struct inotify_watch *watch, *tmp; - int relecount; KASSERT((event & ~(IN_ALL_EVENTS | IN_ISDIR | IN_UNMOUNT)) == 0, ("inotify_log: invalid event %#x", event)); - relecount = 0; mtx_lock(&vp->v_pollinfo->vpi_lock); TAILQ_FOREACH_SAFE(watch, &vp->v_pollinfo->vpi_inotify, vlink, tmp) { KASSERT(watch->vp == vp, ("inotify_log: watch %p vp != vp", watch)); - if ((watch->mask & event) != 0 || event == IN_UNMOUNT) { - relecount += inotify_log_one(watch, name, namelen, event, - cookie); - } + if ((watch->mask & event) != 0 || event == IN_UNMOUNT) + inotify_log_one(watch, name, namelen, event, cookie); } mtx_unlock(&vp->v_pollinfo->vpi_lock); - - for (int i = 0; i < relecount; i++) - vrele(vp); } /*