When I changed vnode kqueue events to be posted from common code at the VFS 
layer, I introduced a regression in sending those events on stacked file 
systems like nullfs.  The root cause is that knote attachment (by way of the 
VOP bypass mechanism in layerfs) drills all the way down to the bottom of the 
vnode stack and attaches the knote there, as intended, but now that event 
delivery is handled in vnode_if.c, the event is posted on the upper vnode, 
which has no knotes attached.
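
To make the failure mode concrete, here is a rough sketch (illustrative only, 
not code from the tree; "uppervp" and "lowervp" stand for the two layers of a 
nullfs stack):

	/* The attach operation is bypassed down by layerfs, so the
	 * knote lands on the bottom vnode's klist. */
	vn_knote_attach(lowervp, kn);

	/* But the common code posts against the vnode it was handed --
	 * the upper one -- whose own klist is empty, so the watcher
	 * never sees the event. */
	knote(&uppervp->v_klist, NOTE_WRITE);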

These stacked vnodes already share some state with the bottom layer (locks, 
etc.), so my solution is to let a stacked vnode also refer to the bottom 
vnode's klist.  Each vnode starts out referring to its own klist, but as a 
layerfs vnode is constructed, it is told to refer to the lower vnode's klist 
(just as it refers to the lower vnode's interlock and uvm_obj lock).  Various 
assertions ensure that knotes are never attached to a vnode that refers to 
another vnode's klist; they should always be attached to the bottom-most 
vnode.
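
Concretely, a layered vnode's construction path now looks roughly like this 
(a sketch of the intent only; the real change is in the layer_vfsops.c hunk 
below):

	/* Share lock state with the lower vnode, as before... */
	vshareilock(vp, lowervp);
	rw_obj_hold(lowervp->v_uobj.vmobjlock);
	uvm_obj_setlock(&vp->v_uobj, lowervp->v_uobj.vmobjlock);

	/* ...and now also refer to the lower vnode's klist, so that an
	 * event posted on the upper vnode reaches the knotes attached
	 * at the bottom of the stack. */
	vshareklist(vp, lowervp);

	/* Event delivery in the generated vnode_if.c then goes through
	 * the shared pointer: */
	knote(&vp->v_klist->vk_klist, hint);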

(After applying the patch, vnode_if.c must be rebuilt.)

Index: sys/fs/union/union_subr.c
===================================================================
RCS file: /cvsroot/src/sys/fs/union/union_subr.c,v
retrieving revision 1.81
diff -u -p -r1.81 union_subr.c
--- sys/fs/union/union_subr.c   19 Mar 2022 13:53:32 -0000      1.81
+++ sys/fs/union/union_subr.c   12 Jul 2022 00:17:28 -0000
@@ -232,10 +232,11 @@ union_newupper(struct union_node *un, st
        unlock_ap.a_desc = VDESC(vop_unlock);
        unlock_ap.a_vp = UNIONTOV(un);
        genfs_unlock(&unlock_ap);
-       /* Update union vnode interlock & vmobjlock. */
+       /* Update union vnode interlock, vmobjlock, & klist. */
        vshareilock(UNIONTOV(un), uppervp);
        rw_obj_hold(uppervp->v_uobj.vmobjlock);
        uvm_obj_setlock(&UNIONTOV(un)->v_uobj, uppervp->v_uobj.vmobjlock);
+       vshareklist(UNIONTOV(un), uppervp);
        mutex_exit(&un->un_lock);
        if (ohash != nhash) {
                LIST_INSERT_HEAD(&uhashtbl[nhash], un, un_cache);
@@ -577,6 +578,7 @@ union_loadvnode(struct mount *mp, struct
        vshareilock(vp, svp);
        rw_obj_hold(svp->v_uobj.vmobjlock);
        uvm_obj_setlock(&vp->v_uobj, svp->v_uobj.vmobjlock);
+       vshareklist(vp, svp);
 
        /* detect the root vnode (and aliases) */
        if ((un->un_uppervp == um->um_uppervp) &&
Index: sys/miscfs/genfs/layer_vfsops.c
===================================================================
RCS file: /cvsroot/src/sys/miscfs/genfs/layer_vfsops.c,v
retrieving revision 1.54
diff -u -p -r1.54 layer_vfsops.c
--- sys/miscfs/genfs/layer_vfsops.c     23 Feb 2020 15:46:41 -0000      1.54
+++ sys/miscfs/genfs/layer_vfsops.c     12 Jul 2022 00:17:28 -0000
@@ -205,10 +205,11 @@ layerfs_loadvnode(struct mount *mp, stru
 
        xp = kmem_alloc(lmp->layerm_size, KM_SLEEP);
 
-       /* Share the interlock and vmobjlock with the lower node. */
+       /* Share the interlock, vmobjlock, and klist with the lower node. */
        vshareilock(vp, lowervp);
        rw_obj_hold(lowervp->v_uobj.vmobjlock);
        uvm_obj_setlock(&vp->v_uobj, lowervp->v_uobj.vmobjlock);
+       vshareklist(vp, lowervp);
 
        vp->v_tag = lmp->layerm_tag;
        vp->v_type = lowervp->v_type;
Index: sys/kern/vfs_vnode.c
===================================================================
RCS file: /cvsroot/src/sys/kern/vfs_vnode.c,v
retrieving revision 1.143
diff -u -p -r1.143 vfs_vnode.c
--- sys/kern/vfs_vnode.c        9 Apr 2022 23:45:45 -0000       1.143
+++ sys/kern/vfs_vnode.c        12 Jul 2022 00:17:28 -0000
@@ -457,7 +457,8 @@ vnalloc_marker(struct mount *mp)
        vp->v_mount = mp;
        vp->v_type = VBAD;
        vp->v_interlock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
-       klist_init(&vp->v_klist);
+       klist_init(&vip->vi_klist.vk_klist);
+       vp->v_klist = &vip->vi_klist;
        vip->vi_state = VS_MARKER;
 
        return vp;
@@ -475,7 +476,7 @@ vnfree_marker(vnode_t *vp)
        KASSERT(vip->vi_state == VS_MARKER);
        mutex_obj_free(vp->v_interlock);
        uvm_obj_destroy(&vp->v_uobj, true);
-       klist_fini(&vp->v_klist);
+       klist_fini(&vip->vi_klist.vk_klist);
        pool_cache_put(vcache_pool, vip);
 }
 
@@ -1391,7 +1392,8 @@ vcache_alloc(void)
        vp->v_interlock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
 
        uvm_obj_init(&vp->v_uobj, &uvm_vnodeops, true, 1);
-       klist_init(&vp->v_klist);
+       klist_init(&vip->vi_klist.vk_klist);
+       vp->v_klist = &vip->vi_klist;
        cv_init(&vp->v_cv, "vnode");
        cache_vnode_init(vp);
 
@@ -1453,7 +1455,9 @@ vcache_free(vnode_impl_t *vip)
        mutex_obj_free(vp->v_interlock);
        rw_destroy(&vip->vi_lock);
        uvm_obj_destroy(&vp->v_uobj, true);
-       klist_fini(&vp->v_klist);
+       KASSERT(vp->v_klist == &vip->vi_klist ||
+               SLIST_EMPTY(&vip->vi_klist.vk_klist));
+       klist_fini(&vip->vi_klist.vk_klist);
        cv_destroy(&vp->v_cv);
        cache_vnode_fini(vp);
        pool_cache_put(vcache_pool, vip);
@@ -1916,7 +1920,7 @@ vcache_reclaim(vnode_t *vp)
         * Don't check for interest in NOTE_REVOKE; it's always posted
         * because it sets EV_EOF.
         */
-       KNOTE(&vp->v_klist, NOTE_REVOKE);
+       KNOTE(&vp->v_klist->vk_klist, NOTE_REVOKE);
        mutex_exit(vp->v_interlock);
 
        /*
@@ -2095,3 +2099,28 @@ vshareilock(vnode_t *tvp, vnode_t *fvp)
        tvp->v_interlock = fvp->v_interlock;
        mutex_obj_free(oldlock);
 }
+
+void
+vshareklist(vnode_t *tvp, vnode_t *fvp)
+{
+       /*
+        * If two vnodes share klist state, they must also share
+        * an interlock.
+        */
+       KASSERT(tvp->v_interlock == fvp->v_interlock);
+
+       /*
+        * We make the following assumptions:
+        *
+        * ==> Some other synchronization is happening outside of
+        *     our view to make this safe.
+        *
+        * ==> That the "to" vnode will have the necessary references
+        *     on the "from" vnode so that the storage for the klist
+        *     won't be yanked out from beneath us (the vnode_impl).
+        *
+        * ==> If "from" is also sharing, we then assume that "from"
+        *     has the necessary references, and so on.
+        */
+       tvp->v_klist = fvp->v_klist;
+}
Index: sys/kern/vfs_vnops.c
===================================================================
RCS file: /cvsroot/src/sys/kern/vfs_vnops.c,v
retrieving revision 1.233
diff -u -p -r1.233 vfs_vnops.c
--- sys/kern/vfs_vnops.c        6 Jul 2022 13:52:24 -0000       1.233
+++ sys/kern/vfs_vnops.c        12 Jul 2022 00:17:28 -0000
@@ -79,7 +79,7 @@ __KERNEL_RCSID(0, "$NetBSD: vfs_vnops.c,
 #include <sys/proc.h>
 #include <sys/mount.h>
 #include <sys/namei.h>
-#include <sys/vnode.h>
+#include <sys/vnode_impl.h>
 #include <sys/ioctl.h>
 #include <sys/tty.h>
 #include <sys/poll.h>
@@ -1428,9 +1428,17 @@ vn_knote_to_interest(const struct knote 
 void
 vn_knote_attach(struct vnode *vp, struct knote *kn)
 {
+       struct vnode_klist *vk = vp->v_klist;
        long interest = 0;
 
        /*
+        * In the case of layered / stacked file systems, knotes
+        * should only ever be associated with the base vnode.
+        */
+       KASSERT(kn->kn_hook == vp);
+       KASSERT(vp->v_klist == &VNODE_TO_VIMPL(vp)->vi_klist);
+
+       /*
         * We maintain a bitmask of the kevents that there is interest in,
         * to minimize the impact of having watchers.  It's silly to have
         * to traverse vn_klist every time a read or write happens simply
@@ -1439,18 +1447,23 @@ vn_knote_attach(struct vnode *vp, struct
         */
 
        mutex_enter(vp->v_interlock);
-       SLIST_INSERT_HEAD(&vp->v_klist, kn, kn_selnext);
-       SLIST_FOREACH(kn, &vp->v_klist, kn_selnext) {
+       SLIST_INSERT_HEAD(&vk->vk_klist, kn, kn_selnext);
+       SLIST_FOREACH(kn, &vk->vk_klist, kn_selnext) {
                interest |= vn_knote_to_interest(kn);
        }
-       vp->v_klist_interest = interest;
+       vk->vk_interest = interest;
        mutex_exit(vp->v_interlock);
 }
 
 void
 vn_knote_detach(struct vnode *vp, struct knote *kn)
 {
-       int interest = 0;
+       struct vnode_klist *vk = vp->v_klist;
+       long interest = 0;
+
+       /* See above. */
+       KASSERT(kn->kn_hook == vp);
+       KASSERT(vp->v_klist == &VNODE_TO_VIMPL(vp)->vi_klist);
 
        /*
         * We special case removing the head of the list, because:
@@ -1464,16 +1477,16 @@ vn_knote_detach(struct vnode *vp, struct
         */
 
        mutex_enter(vp->v_interlock);
-       if (__predict_true(kn == SLIST_FIRST(&vp->v_klist))) {
-               SLIST_REMOVE_HEAD(&vp->v_klist, kn_selnext);
-               SLIST_FOREACH(kn, &vp->v_klist, kn_selnext) {
+       if (__predict_true(kn == SLIST_FIRST(&vk->vk_klist))) {
+               SLIST_REMOVE_HEAD(&vk->vk_klist, kn_selnext);
+               SLIST_FOREACH(kn, &vk->vk_klist, kn_selnext) {
                        interest |= vn_knote_to_interest(kn);
                }
-               vp->v_klist_interest = interest;
+               vk->vk_interest = interest;
        } else {
                struct knote *thiskn, *nextkn, *prevkn = NULL;
 
-               SLIST_FOREACH_SAFE(thiskn, &vp->v_klist, kn_selnext, nextkn) {
+               SLIST_FOREACH_SAFE(thiskn, &vk->vk_klist, kn_selnext, nextkn) {
                        if (thiskn == kn) {
                                KASSERT(kn != NULL);
                                KASSERT(prevkn != NULL);
@@ -1484,7 +1497,7 @@ vn_knote_detach(struct vnode *vp, struct
                                prevkn = thiskn;
                        }
                }
-               vp->v_klist_interest = interest;
+               vk->vk_interest = interest;
        }
        mutex_exit(vp->v_interlock);
 }
Index: sys/kern/vnode_if.sh
===================================================================
RCS file: /cvsroot/src/sys/kern/vnode_if.sh,v
retrieving revision 1.75
diff -u -p -r1.75 vnode_if.sh
--- sys/kern/vnode_if.sh        3 May 2022 13:54:18 -0000       1.75
+++ sys/kern/vnode_if.sh        12 Jul 2022 00:17:28 -0000
@@ -444,7 +444,7 @@ do {                                                        \\
         */                                                             \\
        mutex_enter((thisvp)->v_interlock);                             \\
        if (__predict_true((e) == 0)) {                                 \\
-               knote(&(thisvp)->v_klist, (n));                         \\
+               knote(&(thisvp)->v_klist->vk_klist, (n));               \\
        }                                                               \\
        holdrelel((thisvp));                                            \\
        mutex_exit((thisvp)->v_interlock);                              \\
@@ -557,7 +557,7 @@ do {                                                        \\
                 * meaningless from the watcher's perspective.          \\
                 */                                                     \\
                if (__predict_true(thisvp->v_op != dead_vnodeop_p)) {   \\
-                       knote(&thisvp->v_klist,                         \\
+                       knote(&thisvp->v_klist->vk_klist,               \\
                            ((ap)->a_fflag & FWRITE)                    \\
                            ? NOTE_CLOSE_WRITE : NOTE_CLOSE);           \\
                }                                                       \\
Index: sys/sys/param.h
===================================================================
RCS file: /cvsroot/src/sys/sys/param.h,v
retrieving revision 1.711
diff -u -p -r1.711 param.h
--- sys/sys/param.h     20 Jun 2022 08:38:56 -0000      1.711
+++ sys/sys/param.h     12 Jul 2022 00:17:28 -0000
@@ -67,7 +67,7 @@
  *     2.99.9          (299000900)
  */
 
-#define        __NetBSD_Version__      999009800       /* NetBSD 9.99.98 */
+#define        __NetBSD_Version__      999009900       /* NetBSD 9.99.99 */
 
 #define __NetBSD_Prereq__(M,m,p) (((((M) * 100000000) + \
     (m) * 1000000) + (p) * 100) <= __NetBSD_Version__)
Index: sys/sys/vnode.h
===================================================================
RCS file: /cvsroot/src/sys/sys/vnode.h,v
retrieving revision 1.301
diff -u -p -r1.301 vnode.h
--- sys/sys/vnode.h     25 Mar 2022 08:56:36 -0000      1.301
+++ sys/sys/vnode.h     12 Jul 2022 00:17:28 -0000
@@ -179,8 +179,8 @@ struct vnode {
        enum vtype      v_type;                 /* -   vnode type */
        enum vtagtype   v_tag;                  /* -   type of underlying data */
        void            *v_data;                /* -   private data for fs */
-       struct klist    v_klist;                /* i   notes attached to vnode */
-       long            v_klist_interest;       /* i   what the notes are interested in */
+       struct vnode_klist *v_klist;            /* i   kevent / knote info */
+
        void            *v_segvguard;           /* e   for PAX_SEGVGUARD */
 };
 #define        v_mountedhere   v_un.vu_mountedhere
@@ -190,6 +190,19 @@ struct vnode {
 #define        v_ractx         v_un.vu_ractx
 
 typedef struct vnode vnode_t;
+
+/*
+ * Structure that encompasses the kevent state for a vnode.  This is
+ * carved out as a separate structure because some vnodes may share
+ * this state with one another.
+ *
+ * N.B. if two vnodes share a vnode_klist, then they must also share
+ * v_interlock.
+ */
+struct vnode_klist {
+       struct klist    vk_klist;       /* i   notes attached to vnode */
+       long            vk_interest;    /* i   what the notes are interested in */
+};
 #endif
 
 /*
@@ -415,7 +428,7 @@ void vref(struct vnode *);
  * Macro to determine kevent interest on a vnode.
  */
 #define        VN_KEVENT_INTEREST(vp, n)                                       \
-       ((vp)->v_klist_interest != 0)
+       (((vp)->v_klist->vk_interest & (n)) != 0)
 
 static inline void
 VN_KNOTE(struct vnode *vp, long hint)
@@ -429,7 +442,7 @@ VN_KNOTE(struct vnode *vp, long hint)
         */
        if (__predict_false(VN_KEVENT_INTEREST(vp, hint))) {
                mutex_enter(vp->v_interlock);
-               knote(&vp->v_klist, hint);
+               knote(&vp->v_klist->vk_klist, hint);
                mutex_exit(vp->v_interlock);
        }
 }
@@ -594,6 +607,7 @@ int vdead_check(struct vnode *, int);
 void   vrevoke(struct vnode *);
 void   vremfree(struct vnode *);
 void   vshareilock(struct vnode *, struct vnode *);
+void   vshareklist(struct vnode *, struct vnode *);
 int    vrefcnt(struct vnode *);
 int    vcache_get(struct mount *, const void *, size_t, struct vnode **);
 int    vcache_new(struct mount *, struct vnode *,
Index: sys/sys/vnode_impl.h
===================================================================
RCS file: /cvsroot/src/sys/sys/vnode_impl.h,v
retrieving revision 1.23
diff -u -p -r1.23 vnode_impl.h
--- sys/sys/vnode_impl.h        22 Mar 2020 14:38:37 -0000      1.23
+++ sys/sys/vnode_impl.h        12 Jul 2022 00:17:28 -0000
@@ -77,6 +77,12 @@ struct vnode_impl {
        struct vcache_key vi_key;               /* c   vnode cache key */
 
        /*
+        * The vnode klist is accessed frequently, but rarely
+        * modified.
+        */
+       struct vnode_klist vi_klist;            /* i   kevent / knote state */
+
+       /*
         * vnode cache, LRU and syncer.  This all changes with some
         * regularity so keep it together.
         */
Index: tests/lib/libc/kevent_nullmnt/t_nullmnt.sh
===================================================================
RCS file: /cvsroot/src/tests/lib/libc/kevent_nullmnt/t_nullmnt.sh,v
retrieving revision 1.5
diff -u -p -r1.5 t_nullmnt.sh
--- tests/lib/libc/kevent_nullmnt/t_nullmnt.sh  4 Jun 2022 20:32:49 -0000       1.5
+++ tests/lib/libc/kevent_nullmnt/t_nullmnt.sh  12 Jul 2022 00:17:28 -0000
@@ -33,7 +33,6 @@ nullmnt_upper_lower_head()
 }
 nullmnt_upper_lower_body()
 {
-       atf_expect_fail "PR kern/56713"
        nullmnt_common lower_dir upper_dir
 } 
 nullmnt_upper_lower_cleanup()
@@ -48,7 +47,6 @@ nullmnt_upper_upper_head()
 }
 nullmnt_upper_upper_body()
 {
-       atf_expect_fail "PR kern/56713"
        nullmnt_common upper_dir upper_dir
 } 
 nullmnt_upper_upper_cleanup()
-- thorpej

