Hello,

I've mostly merged the nullfs improvements from -current into the RELENG_4 branch. Operations on mmapped files were fixed earlier; this patch focuses on proper vnode locking.

Unfortunately, the -current code requires changes to the layout of the vnode structure. It is possible to avoid breaking binary compatibility by changing the behavior of the vop_sharedlock() function and using the v_vnlock field the same way -current does. This requires changes only to the NFS code: the lock structure is now included in the nfsnode structure, where the original code allocated it separately for each nfsnode. The size and layout of the vnode structure are left unchanged. I hope this will not break any third-party file systems, because all new filesystems should use the vop_std*lock() VOPs or roll their own locking.

A patch against a recent RELENG_4 is attached. If there are no serious objections, I plan to commit it next week.

--
Boris Popov
http://www.butya.kz/~bp/
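To make the compatibility trick easier to review before reading the diff, here is a sketch of the idea (a simplified fragment paraphrasing the hunks below, not the literal code): the vnode lock moves out of a separately MALLOC'ed vp->v_vnlock into the fs-private node, and vop_sharedlock() now finds it through vp->v_data. This works because the patch places the struct lock first in the nfsnode (and in the null_node), so the v_data pointer doubles as a pointer to the lock:

	/*
	 * Sketch only -- the real code is in the patch below.
	 * Embedding the lock keeps sizeof(struct vnode) and its field
	 * layout intact, which is what preserves RELENG_4 binary
	 * compatibility.
	 */
	struct nfsnode {
		struct lock	n_lock;	/* must stay the first member */
		/* ... the rest of the nfsnode is unchanged ... */
	};

	/* In vop_sharedlock(), the lock is now derived from v_data: */
	struct lock *l = (struct lock *)vp->v_data;

	if (l == NULL) {
		/* fs exports no lock; just release the interlock */
		if (ap->a_flags & LK_INTERLOCK)
			simple_unlock(&vp->v_interlock);
		return (0);
	}
	return (lockmgr(l, vnflags, &vp->v_interlock, ap->a_p));

The same pattern is used for nullfs, where null_lock is likewise placed at the top of struct null_node.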
Index: kern/vfs_default.c
===================================================================
RCS file: /home/ncvs/src/sys/kern/vfs_default.c,v
retrieving revision 1.28.2.1
diff -u -r1.28.2.1 vfs_default.c
--- kern/vfs_default.c	2001/05/18 09:58:43	1.28.2.1
+++ kern/vfs_default.c	2001/06/06 08:53:49
@@ -360,14 +360,13 @@
 	 * to be handled in intermediate layers.
 	 */
 	struct vnode *vp = ap->a_vp;
+	struct lock *l = (struct lock *)vp->v_data;
 	int vnflags, flags = ap->a_flags;
 
-	if (vp->v_vnlock == NULL) {
-		if ((flags & LK_TYPE_MASK) == LK_DRAIN)
-			return (0);
-		MALLOC(vp->v_vnlock, struct lock *, sizeof(struct lock),
-		    M_VNODE, M_WAITOK);
-		lockinit(vp->v_vnlock, PVFS, "vnlock", 0, LK_NOPAUSE);
+	if (l == NULL) {
+		if (ap->a_flags & LK_INTERLOCK)
+			simple_unlock(&ap->a_vp->v_interlock);
+		return 0;
 	}
 	switch (flags & LK_TYPE_MASK) {
 	case LK_DRAIN:
@@ -396,9 +395,9 @@
 	if (flags & LK_INTERLOCK)
 		vnflags |= LK_INTERLOCK;
 #ifndef	DEBUG_LOCKS
-	return (lockmgr(vp->v_vnlock, vnflags, &vp->v_interlock, ap->a_p));
+	return (lockmgr(l, vnflags, &vp->v_interlock, ap->a_p));
 #else
-	return (debuglockmgr(vp->v_vnlock, vnflags, &vp->v_interlock, ap->a_p,
+	return (debuglockmgr(l, vnflags, &vp->v_interlock, ap->a_p,
 	    "vop_sharedlock", vp->filename, vp->line));
 #endif
 }
@@ -435,13 +434,6 @@
 	struct vnode *vp = ap->a_vp;
 	int vnflags, flags = ap->a_flags;
 
-	if (vp->v_vnlock == NULL) {
-		if ((flags & LK_TYPE_MASK) == LK_DRAIN)
-			return (0);
-		MALLOC(vp->v_vnlock, struct lock *, sizeof(struct lock),
-		    M_VNODE, M_WAITOK);
-		lockinit(vp->v_vnlock, PVFS, "vnlock", 0, LK_NOPAUSE);
-	}
 	switch (flags & LK_TYPE_MASK) {
 	case LK_DRAIN:
 		vnflags = LK_DRAIN;
@@ -485,13 +477,9 @@
 {
 	struct vnode *vp = ap->a_vp;
 
-	if (vp->v_vnlock == NULL) {
-		if (ap->a_flags & LK_INTERLOCK)
-			simple_unlock(&ap->a_vp->v_interlock);
-		return (0);
-	}
-	return (lockmgr(vp->v_vnlock, LK_RELEASE | ap->a_flags,
-	    &ap->a_vp->v_interlock, ap->a_p));
+	if (ap->a_flags & LK_INTERLOCK)
+		simple_unlock(&vp->v_interlock);
+	return (0);
 }
 
 /*
@@ -505,10 +493,11 @@
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
+	struct lock *l = (struct lock *)vp->v_data;
 
-	if (vp->v_vnlock == NULL)
+	if (l == NULL)
 		return (0);
-	return (lockstatus(vp->v_vnlock, ap->a_p));
+	return (lockstatus(l, ap->a_p));
 }
 
 int
Index: kern/vfs_subr.c
===================================================================
RCS file: /home/ncvs/src/sys/kern/vfs_subr.c,v
retrieving revision 1.249.2.8
diff -u -r1.249.2.8 vfs_subr.c
--- kern/vfs_subr.c	2001/05/18 09:58:43	1.249.2.8
+++ kern/vfs_subr.c	2001/06/06 08:53:49
@@ -1707,10 +1707,7 @@
 	}
 
 	cache_purge(vp);
-	if (vp->v_vnlock) {
-		FREE(vp->v_vnlock, M_VNODE);
-		vp->v_vnlock = NULL;
-	}
+	vp->v_vnlock = NULL;
 
 	if (VSHOULDFREE(vp))
 		vfree(vp);
Index: nfs/nfs_node.c
===================================================================
RCS file: /home/ncvs/src/sys/nfs/nfs_node.c,v
retrieving revision 1.36.2.1
diff -u -r1.36.2.1 nfs_node.c
--- nfs/nfs_node.c	2001/03/21 10:50:59	1.36.2.1
+++ nfs/nfs_node.c	2001/06/06 08:53:49
@@ -175,6 +175,7 @@
 	bcopy((caddr_t)fhp, (caddr_t)np->n_fhp, fhsize);
 	np->n_fhsize = fhsize;
 	lockinit(&np->n_rslock, PVFS | rsflags, "nfrslk", 0, LK_NOPAUSE);
+	lockinit(&np->n_lock, PVFS, "nfsnlk", 0, LK_NOPAUSE);
 	*npp = np;
 
 	if (nfs_node_hash_lock < 0)
Index: nfs/nfs_vnops.c
===================================================================
RCS file: /home/ncvs/src/sys/nfs/nfs_vnops.c,v
retrieving revision 1.150.2.2
diff -u -r1.150.2.2 nfs_vnops.c
--- nfs/nfs_vnops.c	2001/03/02 16:45:12	1.150.2.2
+++ nfs/nfs_vnops.c	2001/06/06 08:53:49
@@ -149,6 +149,7 @@
 	{ &vop_getpages_desc,		(vop_t *) nfs_getpages },
 	{ &vop_putpages_desc,		(vop_t *) nfs_putpages },
 	{ &vop_inactive_desc,		(vop_t *) nfs_inactive },
+	{ &vop_islocked_desc,		(vop_t *) vop_stdislocked },
 	{ &vop_lease_desc,		(vop_t *) vop_null },
 	{ &vop_link_desc,		(vop_t *) nfs_link },
 	{ &vop_lock_desc,		(vop_t *) vop_sharedlock },
@@ -169,6 +170,7 @@
 	{ &vop_setattr_desc,		(vop_t *) nfs_setattr },
 	{ &vop_strategy_desc,		(vop_t *) nfs_strategy },
 	{ &vop_symlink_desc,		(vop_t *) nfs_symlink },
+	{ &vop_unlock_desc,		(vop_t *) vop_stdunlock },
 	{ &vop_write_desc,		(vop_t *) nfs_write },
 	{ NULL, NULL }
 };
@@ -187,11 +189,13 @@
 	{ &vop_fsync_desc,		(vop_t *) nfs_fsync },
 	{ &vop_getattr_desc,		(vop_t *) nfs_getattr },
 	{ &vop_inactive_desc,		(vop_t *) nfs_inactive },
+	{ &vop_islocked_desc,		(vop_t *) vop_stdislocked },
 	{ &vop_lock_desc,		(vop_t *) vop_sharedlock },
 	{ &vop_print_desc,		(vop_t *) nfs_print },
 	{ &vop_read_desc,		(vop_t *) nfsspec_read },
 	{ &vop_reclaim_desc,		(vop_t *) nfs_reclaim },
 	{ &vop_setattr_desc,		(vop_t *) nfs_setattr },
+	{ &vop_unlock_desc,		(vop_t *) vop_stdunlock },
 	{ &vop_write_desc,		(vop_t *) nfsspec_write },
 	{ NULL, NULL }
 };
@@ -207,11 +211,13 @@
 	{ &vop_fsync_desc,		(vop_t *) nfs_fsync },
 	{ &vop_getattr_desc,		(vop_t *) nfs_getattr },
 	{ &vop_inactive_desc,		(vop_t *) nfs_inactive },
+	{ &vop_islocked_desc,		(vop_t *) vop_stdislocked },
 	{ &vop_lock_desc,		(vop_t *) vop_sharedlock },
 	{ &vop_print_desc,		(vop_t *) nfs_print },
 	{ &vop_read_desc,		(vop_t *) nfsfifo_read },
 	{ &vop_reclaim_desc,		(vop_t *) nfs_reclaim },
 	{ &vop_setattr_desc,		(vop_t *) nfs_setattr },
+	{ &vop_unlock_desc,		(vop_t *) vop_stdunlock },
 	{ &vop_write_desc,		(vop_t *) nfsfifo_write },
 	{ NULL, NULL }
 };
Index: nfs/nfsnode.h
===================================================================
RCS file: /home/ncvs/src/sys/nfs/nfsnode.h,v
retrieving revision 1.32
diff -u -r1.32 nfsnode.h
--- nfs/nfsnode.h	1999/12/29 04:54:55	1.32
+++ nfs/nfsnode.h	2001/06/06 08:53:49
@@ -86,6 +86,7 @@
  * be well aligned and, therefore, tightly packed.
  */
 struct nfsnode {
+	struct lock		n_lock;
 	LIST_ENTRY(nfsnode)	n_hash;		/* Hash chain */
 	CIRCLEQ_ENTRY(nfsnode)	n_timer;	/* Nqnfs timer chain */
 	u_quad_t		n_size;		/* Current size of file */
Index: miscfs/nullfs/null.h
===================================================================
RCS file: /home/ncvs/src/sys/miscfs/nullfs/Attic/null.h,v
retrieving revision 1.11.2.2
diff -u -r1.11.2.2 null.h
--- miscfs/nullfs/null.h	2000/10/25 04:26:30	1.11.2.2
+++ miscfs/nullfs/null.h	2001/06/06 08:53:49
@@ -52,6 +52,8 @@
  * A cache of vnode references
 */
 struct null_node {
+	struct lock	null_lock;	/* Lock for this vnode. MBF */
+	struct lock	*null_vnlock;	/* lock of lower vnode in the stack */
 	LIST_ENTRY(null_node)	null_hash;	/* Hash list */
 	struct vnode	        *null_lowervp;	/* VREFed once */
 	struct vnode		*null_vnode;	/* Back pointer */
Index: miscfs/nullfs/null_subr.c
===================================================================
RCS file: /home/ncvs/src/sys/miscfs/nullfs/Attic/null_subr.c,v
retrieving revision 1.21.2.3
diff -u -r1.21.2.3 null_subr.c
--- miscfs/nullfs/null_subr.c	2001/05/30 09:47:02	1.21.2.3
+++ miscfs/nullfs/null_subr.c	2001/06/06 08:53:49
@@ -130,10 +130,11 @@
 			 * stuff, but we don't want to lock
 			 * the lower node.
 			 */
-			if (vget(vp, 0, p)) {
+			if (vget(vp, LK_EXCLUSIVE | LK_CANRECURSE, p)) {
 				printf ("null_node_find: vget failed.\n");
 				goto loop;
-			};
+			}
+			VOP_UNLOCK(lowervp, 0, p);
 			return (vp);
 		}
 	}
@@ -176,6 +177,7 @@
 	vp = *vpp;
 
 	vp->v_type = lowervp->v_type;
+	lockinit(&xp->null_lock, PINOD, "nullnode", 0, LK_CANRECURSE);
 	xp->null_vnode = vp;
 	vp->v_data = xp;
 	xp->null_lowervp = lowervp;
@@ -192,9 +194,24 @@
 		vrele(vp);
 		*vpp = othervp;
 		return 0;
-	};
-	VREF(lowervp);	/* Extra VREF will be vrele'd in null_node_create */
+	}
+
+	/*
+	 * From NetBSD:
+	 * Now lock the new node. We rely on the fact that we were passed
+	 * a locked vnode. If the lower node is exporting a struct lock
+	 * (v_vnlock != NULL) then we just set the upper v_vnlock to the
+	 * lower one, and both are now locked. If the lower node is exporting
+	 * NULL, then we copy that up and manually lock the new vnode.
+	 */
+	lockmgr(&null_hashlock, LK_EXCLUSIVE, NULL, p);
+	vp->v_vnlock = lowervp->v_vnlock;
+	error = VOP_LOCK(vp, LK_EXCLUSIVE | LK_THISLAYER, p);
+	if (error)
+		panic("null_node_alloc: can't lock new vnode\n");
+
+	VREF(lowervp);
 
 	hd = NULL_NHASH(lowervp);
 	LIST_INSERT_HEAD(hd, xp, null_hash);
 	lockmgr(&null_hashlock, LK_RELEASE, NULL, p);
@@ -221,10 +238,10 @@
 		 * null_node_find has taken another reference
 		 * to the alias vnode.
 		 */
+		vrele(lowervp);
 #ifdef NULLFS_DEBUG
 		vprint("null_node_create: exists", aliasvp);
 #endif
-		/* VREF(aliasvp); --- done in null_node_find */
 	} else {
 		int error;
 
@@ -244,8 +261,6 @@
 	 * aliasvp is already VREF'd by getnewvnode()
 	 */
 	}
-
-	vrele(lowervp);
 
 #ifdef DIAGNOSTIC
 	if (lowervp->v_usecount < 1) {
Index: miscfs/nullfs/null_vnops.c
===================================================================
RCS file: /home/ncvs/src/sys/miscfs/nullfs/Attic/null_vnops.c,v
retrieving revision 1.38.2.4
diff -u -r1.38.2.4 null_vnops.c
--- miscfs/nullfs/null_vnops.c	2001/05/30 09:47:02	1.38.2.4
+++ miscfs/nullfs/null_vnops.c	2001/06/06 08:53:49
@@ -195,6 +195,7 @@
 static int	null_getattr(struct vop_getattr_args *ap);
 static int	null_getvobject(struct vop_getvobject_args *ap);
 static int	null_inactive(struct vop_inactive_args *ap);
+static int	null_islocked(struct vop_islocked_args *ap);
 static int	null_lock(struct vop_lock_args *ap);
 static int	null_lookup(struct vop_lookup_args *ap);
 static int	null_open(struct vop_open_args *ap);
@@ -359,44 +360,43 @@
 	} */ *ap;
 {
 	struct componentname *cnp = ap->a_cnp;
+	struct vnode *dvp = ap->a_dvp;
 	struct proc *p = cnp->cn_proc;
 	int flags = cnp->cn_flags;
-	struct vop_lock_args lockargs;
-	struct vop_unlock_args unlockargs;
-	struct vnode *dvp, *vp;
+	struct vnode *vp, *ldvp, *lvp;
 	int error;
 
-	if ((flags & ISLASTCN) && (ap->a_dvp->v_mount->mnt_flag & MNT_RDONLY) &&
+	if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
 	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
 		return (EROFS);
-	error = null_bypass((struct vop_generic_args *)ap);
+	/*
+	 * Although it is possible to call null_bypass(), we'll do
+	 * a direct call to reduce overhead
+	 */
+	ldvp = NULLVPTOLOWERVP(dvp);
+	vp = lvp = NULL;
+	error = VOP_LOOKUP(ldvp, &lvp, cnp);
 	if (error == EJUSTRETURN && (flags & ISLASTCN) &&
-	    (ap->a_dvp->v_mount->mnt_flag & MNT_RDONLY) &&
+	    (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
 	    (cnp->cn_nameiop == CREATE || cnp->cn_nameiop == RENAME))
 		error = EROFS;
+
 	/*
-	 * We must do the same locking and unlocking at this layer as
-	 * is done in the layers below us. We could figure this out
-	 * based on the error return and the LASTCN, LOCKPARENT, and
-	 * LOCKLEAF flags. However, it is more expidient to just find
-	 * out the state of the lower level vnodes and set ours to the
-	 * same state.
+	 * Rely only on the PDIRUNLOCK flag which should be carefully
+	 * tracked by underlying filesystem.
 	 */
-	dvp = ap->a_dvp;
-	vp = *ap->a_vpp;
-	if (dvp == vp)
-		return (error);
-	if (!VOP_ISLOCKED(dvp, NULL)) {
-		unlockargs.a_vp = dvp;
-		unlockargs.a_flags = 0;
-		unlockargs.a_p = p;
-		vop_nounlock(&unlockargs);
-	}
-	if (vp != NULLVP && VOP_ISLOCKED(vp, NULL)) {
-		lockargs.a_vp = vp;
-		lockargs.a_flags = LK_SHARED;
-		lockargs.a_p = p;
-		vop_nolock(&lockargs);
+	if (cnp->cn_flags & PDIRUNLOCK)
+		VOP_UNLOCK(dvp, LK_THISLAYER, p);
+	if ((error == 0 || error == EJUSTRETURN) && lvp != NULL) {
+		if (ldvp == lvp) {
+			*ap->a_vpp = dvp;
+			VREF(dvp);
+			vrele(lvp);
+		} else {
+			error = null_node_create(dvp->v_mount, lvp, &vp);
+			if (error == 0)
+				*ap->a_vpp = vp;
+		}
 	}
 	return (error);
 }
@@ -464,6 +464,8 @@
 
 	if ((error = null_bypass((struct vop_generic_args *)ap)) != 0)
 		return (error);
+
+	ap->a_vap->va_fsid = ap->a_vp->v_mount->mnt_stat.f_fsid.val[0];
 	return (0);
 }
 
@@ -575,12 +577,64 @@
 	struct proc *a_p;
 	} */ *ap;
 {
+	struct vnode *vp = ap->a_vp;
+	int flags = ap->a_flags;
+	struct proc *p = ap->a_p;
+	struct null_node *np = VTONULL(vp);
+	struct vnode *lvp;
+	int error;
 
-	vop_nolock(ap);
-	if ((ap->a_flags & LK_TYPE_MASK) == LK_DRAIN)
-		return (0);
-	ap->a_flags &= ~LK_INTERLOCK;
-	return (null_bypass((struct vop_generic_args *)ap));
+	if (flags & LK_THISLAYER) {
+		if (vp->v_vnlock != NULL)
+			return 0;	/* lock is shared across layers */
+		error = lockmgr(&np->null_lock, flags & ~LK_THISLAYER,
+		    &vp->v_interlock, p);
+		return (error);
+	}
+
+	if (vp->v_vnlock != NULL) {
+		/*
+		 * The lower level has exported a struct lock to us. Use
+		 * it so that all vnodes in the stack lock and unlock
+		 * simultaneously. Note: we don't DRAIN the lock as DRAIN
+		 * decommissions the lock - just because our vnode is
+		 * going away doesn't mean the struct lock below us is.
+		 * LK_EXCLUSIVE is fine.
+		 */
+		if ((flags & LK_TYPE_MASK) == LK_DRAIN) {
+			NULLFSDEBUG("null_lock: avoiding LK_DRAIN\n");
+			return(lockmgr(vp->v_vnlock,
+				(flags & ~LK_TYPE_MASK) | LK_EXCLUSIVE,
+				&vp->v_interlock, p));
+		}
+		return(lockmgr(vp->v_vnlock, flags, &vp->v_interlock, p));
+	}
+	/*
+	 * To prevent race conditions involving doing a lookup
+	 * on "..", we have to lock the lower node, then lock our
+	 * node. Most of the time it won't matter that we lock our
+	 * node (as any locking would need the lower one locked
+	 * first). But we can LK_DRAIN the upper lock as a step
+	 * towards decomissioning it.
+	 */
+	lvp = NULLVPTOLOWERVP(vp);
+	if (lvp == NULL)
+		return (lockmgr(&np->null_lock, flags, &vp->v_interlock, p));
+	if (flags & LK_INTERLOCK) {
+		VI_UNLOCK(vp);
+		flags &= ~LK_INTERLOCK;
+	}
+	if ((flags & LK_TYPE_MASK) == LK_DRAIN) {
+		error = VOP_LOCK(lvp,
+			(flags & ~LK_TYPE_MASK) | LK_EXCLUSIVE, p);
+	} else
+		error = VOP_LOCK(lvp, flags, p);
+	if (error)
+		return (error);
+	error = lockmgr(&np->null_lock, flags, &vp->v_interlock, p);
+	if (error)
+		VOP_UNLOCK(lvp, 0, p);
+	return (error);
 }
 
 /*
@@ -596,12 +650,56 @@
 	struct proc *a_p;
 	} */ *ap;
 {
-	vop_nounlock(ap);
-	ap->a_flags &= ~LK_INTERLOCK;
-	return (null_bypass((struct vop_generic_args *)ap));
+	struct vnode *vp = ap->a_vp;
+	int flags = ap->a_flags;
+	struct proc *p = ap->a_p;
+	struct null_node *np = VTONULL(vp);
+	struct vnode *lvp;
+
+	if (vp->v_vnlock != NULL) {
+		if (flags & LK_THISLAYER)
+			return 0;	/* the lock is shared across layers */
+		flags &= ~LK_THISLAYER;
+		return (lockmgr(vp->v_vnlock, flags | LK_RELEASE,
+			&vp->v_interlock, p));
+	}
+	lvp = NULLVPTOLOWERVP(vp);
+	if (lvp == NULL)
+		return (lockmgr(&np->null_lock, flags | LK_RELEASE, &vp->v_interlock, p));
+	if ((flags & LK_THISLAYER) == 0) {
+		if (flags & LK_INTERLOCK) {
+			VI_UNLOCK(vp);
+			flags &= ~LK_INTERLOCK;
+		}
+		VOP_UNLOCK(lvp, flags, p);
+	} else
+		flags &= ~LK_THISLAYER;
+	ap->a_flags = flags;
+	return (lockmgr(&np->null_lock, flags | LK_RELEASE, &vp->v_interlock, p));
 }
 
 static int
+null_islocked(ap)
+	struct vop_islocked_args /* {
+		struct vnode *a_vp;
+		struct proc *a_p;
+	} */ *ap;
+{
+	struct vnode *vp = ap->a_vp;
+	struct proc *p = ap->a_p;
+
+	if (vp->v_vnlock != NULL)
+		return (lockstatus(vp->v_vnlock, p));
+	return (lockstatus(&VTONULL(vp)->null_lock, p));
+}
+
+
+/*
+ * There is no way to tell that someone issued remove/rmdir operation
+ * on the underlying filesystem. For now we just have to release lowevrp
+ * as soon as possible.
+ */
+static int
 null_inactive(ap)
	struct vop_inactive_args /* {
		struct vnode *a_vp;
@@ -609,27 +707,34 @@
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
+	struct proc *p = ap->a_p;
 	struct null_node *xp = VTONULL(vp);
 	struct vnode *lowervp = xp->null_lowervp;
+
+	lockmgr(&null_hashlock, LK_EXCLUSIVE, NULL, p);
+	LIST_REMOVE(xp, null_hash);
+	lockmgr(&null_hashlock, LK_RELEASE, NULL, p);
+
+	xp->null_lowervp = NULLVP;
+	if (vp->v_vnlock != NULL) {
+		vp->v_vnlock = &xp->null_lock;	/* we no longer share the lock */
+	} else
+		VOP_UNLOCK(vp, LK_THISLAYER, p);
+
+	vput(lowervp);
 	/*
-	 * Do nothing (and _don't_ bypass).
-	 * Wait to vrele lowervp until reclaim,
-	 * so that until then our null_node is in the
-	 * cache and reusable.
-	 * We still have to tell the lower layer the vnode
-	 * is now inactive though.
-	 *
-	 * NEEDSWORK: Someday, consider inactive'ing
-	 * the lowervp and then trying to reactivate it
-	 * with capabilities (v_id)
-	 * like they do in the name lookup cache code.
-	 * That's too much work for now.
+	 * Now it is safe to drop references to the lower vnode.
+	 * VOP_INACTIVE() will be called by vrele() if necessary.
 	 */
-	VOP_INACTIVE(lowervp, ap->a_p);
-	VOP_UNLOCK(ap->a_vp, 0, ap->a_p);
+	vrele (lowervp);
+
 	return (0);
 }
 
+/*
+ * We can free memory in null_inactive, but we do this
+ * here. (Possible to guard vp->v_data to point somewhere)
+ */
 static int
 null_reclaim(ap)
 	struct vop_reclaim_args /* {
@@ -638,22 +743,11 @@
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
-	struct proc *p = ap->a_p;
-	struct null_node *xp = VTONULL(vp);
-	struct vnode *lowervp = xp->null_lowervp;
+	void *vdata = vp->v_data;
 
-	/*
-	 * Note: in vop_reclaim, vp->v_op == dead_vnodeop_p,
-	 * so we can't call VOPs on ourself.
-	 */
-	/* After this assignment, this node will not be re-used. */
-	xp->null_lowervp = NULLVP;
-	lockmgr(&null_hashlock, LK_EXCLUSIVE, NULL, p);
-	LIST_REMOVE(xp, null_hash);
-	lockmgr(&null_hashlock, LK_RELEASE, NULL, p);
 	vp->v_data = NULL;
-	FREE(xp, M_NULLFSNODE);
-	vrele (lowervp);
+	FREE(vdata, M_NULLFSNODE);
+
 	return (0);
 }
 
@@ -733,6 +827,7 @@
 	{ &vop_getattr_desc,		(vop_t *) null_getattr },
 	{ &vop_getvobject_desc,		(vop_t *) null_getvobject },
 	{ &vop_inactive_desc,		(vop_t *) null_inactive },
+	{ &vop_islocked_desc,		(vop_t *) null_islocked },
 	{ &vop_lock_desc,		(vop_t *) null_lock },
 	{ &vop_lookup_desc,		(vop_t *) null_lookup },
 	{ &vop_open_desc,		(vop_t *) null_open },
Index: sys/lock.h
===================================================================
RCS file: /home/ncvs/src/sys/sys/lock.h,v
retrieving revision 1.17.2.1
diff -u -r1.17.2.1 lock.h
--- sys/lock.h	2000/09/30 02:49:37	1.17.2.1
+++ sys/lock.h	2001/06/06 08:53:49
@@ -141,6 +141,7 @@
 					   getting lk_interlock */
 #define LK_RETRY	0x00020000	/* vn_lock: retry until locked */
 #define LK_NOOBJ	0x00040000	/* vget: don't create object */
+#define LK_THISLAYER	0x00080000	/* vn_lock: lock/unlock only current layer */
 
 /*
  * Internal state flags corresponding to lk_sharecount, and lk_waitcount
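
A note on LK_THISLAYER for reviewers: it is the only new flag, and its meaning is exactly what null_lock()/null_unlock() above implement - the lock or unlock request is confined to the current layer of the vnode stack instead of being passed down to the lower vnode. A small usage sketch (the surrounding declarations are mine for illustration; the two calls mirror what null_inactive() and null_node_alloc() do in the patch):

	struct vnode *vp;	/* a nullfs vnode, currently locked */
	struct proc *p;		/* the current process */
	int error;

	/* Drop only the nullfs layer's lock; the lower vnode stays locked. */
	VOP_UNLOCK(vp, LK_THISLAYER, p);

	/* Take only this layer's lock; the lower fs is not consulted. */
	error = VOP_LOCK(vp, LK_EXCLUSIVE | LK_THISLAYER, p);

When the lower filesystem exports a struct lock through v_vnlock, LK_THISLAYER requests on the nullfs vnode are no-ops, since the whole stack shares one lock.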