Module Name:	src
Committed By:	hannken
Date:		Mon Nov 27 10:03:40 UTC 2023
Modified Files:
	src/sys/kern: vfs_vnode.c

Log Message:
Implement and use an iterator over LRU lists.

Replace the vdrain kernel thread with two threadpool jobs, one to process
deferred vrele and one to keep the number of allocated vnodes below limit.


To generate a diff of this commit:
cvs rdiff -u -r1.151 -r1.152 src/sys/kern/vfs_vnode.c

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.
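[Editorial sketch, not part of the original commit mail.]  For readers who
have not used threadpool(9) before, the conversion in this commit follows
its standard pattern: a job is bound to an interlock when it is initialized,
scheduled on demand while that lock is held, and marks itself done under the
same lock from inside the task function.  A minimal sketch of that pattern
follows; the names pool, job, job_lock, job_task, job_setup and job_kick are
illustrative only and do not appear in vfs_vnode.c:

#include <sys/param.h>
#include <sys/mutex.h>
#include <sys/systm.h>
#include <sys/threadpool.h>

static struct threadpool *pool;
static struct threadpool_job job;
static kmutex_t job_lock;	/* interlock shared between job and schedulers */

static void
job_task(struct threadpool_job *self)
{

	/* ... perform the deferred work, taking job_lock as needed ... */

	mutex_enter(&job_lock);
	threadpool_job_done(self);	/* mark idle; requires the interlock */
	mutex_exit(&job_lock);
}

static int
job_setup(void)
{
	int error;

	mutex_init(&job_lock, MUTEX_DEFAULT, IPL_NONE);
	error = threadpool_get(&pool, PRI_NONE);	/* shared, unbound pool */
	if (error != 0)
		return error;
	threadpool_job_init(&job, job_task, &job_lock, "examplejob");
	return 0;
}

static void
job_kick(void)
{

	mutex_enter(&job_lock);
	/* Scheduling an already-scheduled job has no further effect. */
	threadpool_schedule_job(pool, &job);
	mutex_exit(&job_lock);
}

This mirrors how the commit uses vdrain_lock: lru_requeue() schedules
vdrain_job or vrele_job while already holding vdrain_lock, and the two task
functions call threadpool_job_done() under that same lock before returning.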
Modified files:

Index: src/sys/kern/vfs_vnode.c
diff -u src/sys/kern/vfs_vnode.c:1.151 src/sys/kern/vfs_vnode.c:1.152
--- src/sys/kern/vfs_vnode.c:1.151	Wed Nov 22 13:19:50 2023
+++ src/sys/kern/vfs_vnode.c	Mon Nov 27 10:03:40 2023
@@ -1,4 +1,4 @@
-/*	$NetBSD: vfs_vnode.c,v 1.151 2023/11/22 13:19:50 riastradh Exp $	*/
+/*	$NetBSD: vfs_vnode.c,v 1.152 2023/11/27 10:03:40 hannken Exp $	*/
 
 /*-
  * Copyright (c) 1997-2011, 2019, 2020 The NetBSD Foundation, Inc.
@@ -148,7 +148,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: vfs_vnode.c,v 1.151 2023/11/22 13:19:50 riastradh Exp $");
+__KERNEL_RCSID(0, "$NetBSD: vfs_vnode.c,v 1.152 2023/11/27 10:03:40 hannken Exp $");
 
 #ifdef _KERNEL_OPT
 #include "opt_pax.h"
@@ -164,7 +164,6 @@ __KERNEL_RCSID(0, "$NetBSD: vfs_vnode.c,
 #include <sys/hash.h>
 #include <sys/kauth.h>
 #include <sys/kmem.h>
-#include <sys/kthread.h>
 #include <sys/module.h>
 #include <sys/mount.h>
 #include <sys/namei.h>
@@ -172,6 +171,7 @@ __KERNEL_RCSID(0, "$NetBSD: vfs_vnode.c,
 #include <sys/syscallargs.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
+#include <sys/threadpool.h>
 #include <sys/vnode_impl.h>
 #include <sys/wapbl.h>
 #include <sys/fstrans.h>
@@ -198,14 +198,17 @@ __KERNEL_RCSID(0, "$NetBSD: vfs_vnode.c,
  * private cache line as vnodes migrate between them while under the same
  * lock (vdrain_lock).
  */
+
+typedef struct {
+	vnode_impl_t *li_marker;
+} lru_iter_t;
+
 u_int numvnodes __cacheline_aligned;
 static vnodelst_t lru_list[LRU_COUNT] __cacheline_aligned;
+static struct threadpool *threadpool;
+static struct threadpool_job vdrain_job;
+static struct threadpool_job vrele_job;
 static kmutex_t vdrain_lock __cacheline_aligned;
-static kcondvar_t vdrain_cv;
-static int vdrain_gen;
-static kcondvar_t vdrain_gen_cv;
-static bool vdrain_retry;
-static lwp_t * vdrain_lwp;
 SLIST_HEAD(hashhead, vnode_impl);
 static kmutex_t vcache_lock __cacheline_aligned;
 static kcondvar_t vcache_cv;
@@ -215,16 +218,22 @@ static struct hashhead *vcache_hashtab;
 static pool_cache_t vcache_pool;
 static void lru_requeue(vnode_t *, vnodelst_t *);
 static vnodelst_t * lru_which(vnode_t *);
+static vnode_impl_t * lru_iter_first(int, lru_iter_t *);
+static vnode_impl_t * lru_iter_next(lru_iter_t *);
+static void lru_iter_release(lru_iter_t *);
 static vnode_impl_t * vcache_alloc(void);
 static void vcache_dealloc(vnode_impl_t *);
 static void vcache_free(vnode_impl_t *);
 static void vcache_init(void);
 static void vcache_reinit(void);
 static void vcache_reclaim(vnode_t *);
+static void vrele_deferred(vnode_impl_t *);
 static void vrelel(vnode_t *, int, int);
-static void vdrain_thread(void *);
 static void vnpanic(vnode_t *, const char *, ...)
     __printflike(2, 3);
+static bool vdrain_one(u_int);
+static void vdrain_task(struct threadpool_job *);
+static void vrele_task(struct threadpool_job *);
 
 /* Routines having to do with the management of the vnode table.
  */
@@ -424,11 +433,10 @@ vfs_vnode_sysinit(void)
 	}
 	vcache_init();
 
-	cv_init(&vdrain_cv, "vdrain");
-	cv_init(&vdrain_gen_cv, "vdrainwt");
-	error = kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, vdrain_thread,
-	    NULL, &vdrain_lwp, "vdrain");
-	KASSERTMSG((error == 0), "kthread_create(vdrain) failed: %d", error);
+	error = threadpool_get(&threadpool, PRI_NONE);
+	KASSERTMSG((error == 0), "threadpool_get failed: %d", error);
+	threadpool_job_init(&vdrain_job, vdrain_task, &vdrain_lock, "vdrain");
+	threadpool_job_init(&vrele_job, vrele_task, &vdrain_lock, "vrele");
 }
 
 /*
@@ -536,189 +544,208 @@ lru_requeue(vnode_t *vp, vnodelst_t *lis
 		 */
 		numvnodes += d;
 	}
-	if ((d > 0 && numvnodes > desiredvnodes) ||
-	    listhd == &lru_list[LRU_VRELE])
-		cv_signal(&vdrain_cv);
+	if (listhd == &lru_list[LRU_VRELE])
+		threadpool_schedule_job(threadpool, &vrele_job);
+	if (d > 0 && numvnodes > desiredvnodes)
+		threadpool_schedule_job(threadpool, &vdrain_job);
 	if (d > 0 && numvnodes > desiredvnodes + desiredvnodes / 16)
 		kpause("vnfull", false, MAX(1, mstohz(10)), &vdrain_lock);
 	mutex_exit(&vdrain_lock);
 }
 
 /*
- * Release deferred vrele vnodes for this mount.
- * Called with file system suspended.
+ * LRU list iterator.
+ * Caller holds vdrain_lock.
  */
-void
-vrele_flush(struct mount *mp)
+static vnode_impl_t *
+lru_iter_first(int idx, lru_iter_t *iterp)
 {
-	vnode_impl_t *vip, *marker;
-	vnode_t *vp;
-	int when = 0;
+	vnode_impl_t *marker;
 
-	KASSERT(fstrans_is_owner(mp));
+	KASSERT(mutex_owned(&vdrain_lock));
 
+	mutex_exit(&vdrain_lock);
 	marker = VNODE_TO_VIMPL(vnalloc_marker(NULL));
-
 	mutex_enter(&vdrain_lock);
-	TAILQ_INSERT_HEAD(&lru_list[LRU_VRELE], marker, vi_lrulist);
+	marker->vi_lrulisthd = &lru_list[idx];
+	iterp->li_marker = marker;
 
-	while ((vip = TAILQ_NEXT(marker, vi_lrulist))) {
-		TAILQ_REMOVE(&lru_list[LRU_VRELE], marker, vi_lrulist);
-		TAILQ_INSERT_AFTER(&lru_list[LRU_VRELE], vip, marker,
-		    vi_lrulist);
-		vp = VIMPL_TO_VNODE(vip);
-		if (vnis_marker(vp))
-			continue;
+	TAILQ_INSERT_HEAD(marker->vi_lrulisthd, marker, vi_lrulist);
 
-		KASSERT(vip->vi_lrulisthd == &lru_list[LRU_VRELE]);
-		TAILQ_REMOVE(vip->vi_lrulisthd, vip, vi_lrulist);
-		vip->vi_lrulisthd = &lru_list[LRU_HOLD];
-		vip->vi_lrulisttm = getticks();
-		TAILQ_INSERT_TAIL(vip->vi_lrulisthd, vip, vi_lrulist);
-		mutex_exit(&vdrain_lock);
+	return lru_iter_next(iterp);
+}
 
-		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
-		mutex_enter(vp->v_interlock);
-		vrelel(vp, 0, LK_EXCLUSIVE);
+static vnode_impl_t *
+lru_iter_next(lru_iter_t *iter)
+{
+	vnode_impl_t *vip, *marker;
+	vnodelst_t *listhd;
 
-		if (getticks() > when) {
-			yield();
-			when = getticks() + hz / 10;
-		}
+	KASSERT(mutex_owned(&vdrain_lock));
+
+	marker = iter->li_marker;
+	listhd = marker->vi_lrulisthd;
 
-		mutex_enter(&vdrain_lock);
+	while ((vip = TAILQ_NEXT(marker, vi_lrulist))) {
+		TAILQ_REMOVE(listhd, marker, vi_lrulist);
+		TAILQ_INSERT_AFTER(listhd, vip, marker, vi_lrulist);
+		if (!vnis_marker(VIMPL_TO_VNODE(vip)))
+			break;
 	}
 
-	TAILQ_REMOVE(&lru_list[LRU_VRELE], marker, vi_lrulist);
-	mutex_exit(&vdrain_lock);
+	return vip;
+}
+
+static void
+lru_iter_release(lru_iter_t *iter)
+{
+	vnode_impl_t *marker;
+
+	KASSERT(mutex_owned(&vdrain_lock));
+
+	marker = iter->li_marker;
+	TAILQ_REMOVE(marker->vi_lrulisthd, marker, vi_lrulist);
+
+	mutex_exit(&vdrain_lock);
 	vnfree_marker(VIMPL_TO_VNODE(marker));
+	mutex_enter(&vdrain_lock);
 }
 
 /*
- * Reclaim a cached vnode.  Used from vdrain_thread only.
+ * Release deferred vrele vnodes for this mount.
+ * Called with file system suspended.
  */
-static __inline void
-vdrain_remove(vnode_t *vp)
+void
+vrele_flush(struct mount *mp)
 {
-	struct mount *mp;
+	lru_iter_t iter;
+	vnode_impl_t *vip;
 
-	KASSERT(mutex_owned(&vdrain_lock));
+	KASSERT(fstrans_is_owner(mp));
 
-	/* Probe usecount (unlocked). */
-	if (vrefcnt(vp) > 0)
-		return;
-	/* Try v_interlock -- we lock the wrong direction! */
-	if (!mutex_tryenter(vp->v_interlock))
-		return;
-	/* Probe usecount and state. */
-	if (vrefcnt(vp) > 0 || VSTATE_GET(vp) != VS_LOADED) {
-		mutex_exit(vp->v_interlock);
-		return;
-	}
-	mp = vp->v_mount;
-	if (fstrans_start_nowait(mp) != 0) {
-		mutex_exit(vp->v_interlock);
-		return;
+	mutex_enter(&vdrain_lock);
+	for (vip = lru_iter_first(LRU_VRELE, &iter); vip != NULL;
+	    vip = lru_iter_next(&iter)) {
+		if (VIMPL_TO_VNODE(vip)->v_mount != mp)
+			continue;
+		vrele_deferred(vip);
 	}
-	vdrain_retry = true;
+	lru_iter_release(&iter);
 	mutex_exit(&vdrain_lock);
-
-	if (vcache_vget(vp) == 0) {
-		if (!vrecycle(vp)) {
-			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
-			mutex_enter(vp->v_interlock);
-			vrelel(vp, 0, LK_EXCLUSIVE);
-		}
-	}
-	fstrans_done(mp);
-
-	mutex_enter(&vdrain_lock);
 }
 
 /*
- * Release a cached vnode.  Used from vdrain_thread only.
+ * One pass through the LRU lists to keep the number of allocated
+ * vnodes below target.  Returns true if target met.
  */
-static __inline void
-vdrain_vrele(vnode_t *vp)
+static bool
+vdrain_one(u_int target)
{
-	vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
+	int ix, lists[] = { LRU_FREE, LRU_HOLD };
+	lru_iter_t iter;
+	vnode_impl_t *vip;
+	vnode_t *vp;
 	struct mount *mp;
 
 	KASSERT(mutex_owned(&vdrain_lock));
 
-	mp = vp->v_mount;
-	if (fstrans_start_nowait(mp) != 0)
-		return;
+	for (ix = 0; ix < __arraycount(lists); ix++) {
+		for (vip = lru_iter_first(lists[ix], &iter); vip != NULL;
+		    vip = lru_iter_next(&iter)) {
+			if (numvnodes < target) {
+				lru_iter_release(&iter);
+				return true;
+			}
 
-	/*
-	 * First remove the vnode from the vrele list.
-	 * Put it on the last lru list, the last vrele()
-	 * will put it back onto the right list before
-	 * its usecount reaches zero.
-	 */
-	KASSERT(vip->vi_lrulisthd == &lru_list[LRU_VRELE]);
-	TAILQ_REMOVE(vip->vi_lrulisthd, vip, vi_lrulist);
-	vip->vi_lrulisthd = &lru_list[LRU_HOLD];
-	vip->vi_lrulisttm = getticks();
-	TAILQ_INSERT_TAIL(vip->vi_lrulisthd, vip, vi_lrulist);
+			vp = VIMPL_TO_VNODE(vip);
 
-	vdrain_retry = true;
-	mutex_exit(&vdrain_lock);
+			/* Probe usecount (unlocked). */
+			if (vrefcnt(vp) > 0)
+				continue;
+			/* Try v_interlock -- we lock the wrong direction! */
+			if (!mutex_tryenter(vp->v_interlock))
+				continue;
+			/* Probe usecount and state. */
+			if (vrefcnt(vp) > 0 || VSTATE_GET(vp) != VS_LOADED) {
+				mutex_exit(vp->v_interlock);
+				continue;
+			}
+			mutex_exit(&vdrain_lock);
 
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
-	mutex_enter(vp->v_interlock);
-	vrelel(vp, 0, LK_EXCLUSIVE);
-	fstrans_done(mp);
+			mp = vp->v_mount;
+			if (fstrans_start_nowait(mp) != 0) {
+				mutex_exit(vp->v_interlock);
+				mutex_enter(&vdrain_lock);
+				continue;
+			}
 
-	mutex_enter(&vdrain_lock);
+			if (vcache_vget(vp) == 0) {
+				if (!vrecycle(vp)) {
+					vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+					mutex_enter(vp->v_interlock);
+					vrelel(vp, 0, LK_EXCLUSIVE);
+				}
+			}
+			fstrans_done(mp);
+
+			mutex_enter(&vdrain_lock);
+		}
+		lru_iter_release(&iter);
+	}
+
+	return false;
 }
 
 /*
- * Helper thread to keep the number of vnodes below desiredvnodes
- * and release vnodes from asynchronous vrele.
+ * threadpool task to keep the number of vnodes below desiredvnodes.
  */
 static void
-vdrain_thread(void *cookie)
+vdrain_task(struct threadpool_job *job)
 {
-	int i;
 	u_int target;
-	vnode_impl_t *vip, *marker;
 
-	marker = VNODE_TO_VIMPL(vnalloc_marker(NULL));
+	target = desiredvnodes - desiredvnodes / 16;
 
 	mutex_enter(&vdrain_lock);
-
-	for (;;) {
-		vdrain_retry = false;
-		target = desiredvnodes - desiredvnodes / 16;
-
-		for (i = 0; i < LRU_COUNT; i++) {
-			TAILQ_INSERT_HEAD(&lru_list[i], marker, vi_lrulist);
-			while ((vip = TAILQ_NEXT(marker, vi_lrulist))) {
-				TAILQ_REMOVE(&lru_list[i], marker, vi_lrulist);
-				TAILQ_INSERT_AFTER(&lru_list[i], vip, marker,
-				    vi_lrulist);
-				if (vnis_marker(VIMPL_TO_VNODE(vip)))
-					continue;
-				if (i == LRU_VRELE)
-					vdrain_vrele(VIMPL_TO_VNODE(vip));
-				else if (numvnodes < target)
-					break;
-				else
-					vdrain_remove(VIMPL_TO_VNODE(vip));
+	while (!vdrain_one(target))
+		;
+
+	threadpool_job_done(job);
+	mutex_exit(&vdrain_lock);
+}
+
+/*
+ * threadpool task to process asynchronous vrele.
+ */
+static void
+vrele_task(struct threadpool_job *job)
+{
+	int skipped;
+	lru_iter_t iter;
+	vnode_impl_t *vip;
+	struct mount *mp;
+
+	mutex_enter(&vdrain_lock);
+	while ((vip = lru_iter_first(LRU_VRELE, &iter)) != NULL) {
+		for (skipped = 0; vip != NULL; vip = lru_iter_next(&iter)) {
+			mp = VIMPL_TO_VNODE(vip)->v_mount;
+			if (fstrans_start_nowait(mp) == 0) {
+				vrele_deferred(vip);
+				fstrans_done(mp);
+			} else {
+				skipped++;
 			}
-			TAILQ_REMOVE(&lru_list[i], marker, vi_lrulist);
 		}
-
-		if (vdrain_retry) {
-			kpause("vdrainrt", false, 1, &vdrain_lock);
-		} else {
-			vdrain_gen++;
-			cv_broadcast(&vdrain_gen_cv);
-			cv_wait(&vdrain_cv, &vdrain_lock);
-		}
+		lru_iter_release(&iter);
+		if (skipped)
+			kpause("vrele", false, MAX(1, mstohz(10)), &vdrain_lock);
 	}
+
+	threadpool_job_done(job);
+	lru_iter_release(&iter);
+	mutex_exit(&vdrain_lock);
 }
 
 /*
@@ -773,6 +800,39 @@ vput(vnode_t *vp)
 }
 
 /*
+ * Release a vnode from the deferred list.
+ */
+static void
+vrele_deferred(vnode_impl_t *vip)
+{
+	vnode_t *vp;
+
+	KASSERT(mutex_owned(&vdrain_lock));
+	KASSERT(vip->vi_lrulisthd == &lru_list[LRU_VRELE]);
+
+	vp = VIMPL_TO_VNODE(vip);
+
+	/*
+	 * First remove the vnode from the vrele list.
+	 * Put it on the last lru list, the last vrele()
+	 * will put it back onto the right list before
+	 * its usecount reaches zero.
+	 */
+	TAILQ_REMOVE(vip->vi_lrulisthd, vip, vi_lrulist);
+	vip->vi_lrulisthd = &lru_list[LRU_HOLD];
+	vip->vi_lrulisttm = getticks();
+	TAILQ_INSERT_TAIL(vip->vi_lrulisthd, vip, vi_lrulist);
+
+	mutex_exit(&vdrain_lock);
+
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+	mutex_enter(vp->v_interlock);
+	vrelel(vp, 0, LK_EXCLUSIVE);
+
+	mutex_enter(&vdrain_lock);
+}
+
+/*
  * Vnode release.  If reference count drops to zero, call inactive
  * routine and either return to freelist or free to the pool.
  */
@@ -860,7 +920,7 @@ retry:
 
 	/*
 	 * First try to get the vnode locked for VOP_INACTIVE().
-	 * Defer vnode release to vdrain_thread if caller requests
+	 * Defer vnode release to vrele task if caller requests
 	 * it explicitly, is the pagedaemon or the lock failed.
 	 */
 	defer = false;
@@ -886,7 +946,7 @@ retry:
 	KASSERT(mutex_owned(vp->v_interlock));
 	if (defer) {
 		/*
-		 * Defer reclaim to the kthread; it's not safe to
+		 * Defer reclaim to the vrele task; it's not safe to
		 * clean it here.  We donate it our last reference.
 		 */
 		if (lktype != LK_NONE) {
@@ -2046,20 +2106,15 @@ vdead_check(struct vnode *vp, int flags)
 int
 vfs_drainvnodes(void)
 {
-	int i, gen;
 
 	mutex_enter(&vdrain_lock);
-	for (i = 0; i < 2; i++) {
-		gen = vdrain_gen;
-		while (gen == vdrain_gen) {
-			cv_broadcast(&vdrain_cv);
-			cv_wait(&vdrain_gen_cv, &vdrain_lock);
-		}
-	}
-	mutex_exit(&vdrain_lock);
-
-	if (numvnodes >= desiredvnodes)
+	if (!vdrain_one(desiredvnodes)) {
+		mutex_exit(&vdrain_lock);
 		return EBUSY;
+	}
+
+	mutex_exit(&vdrain_lock);
 
 	if (vcache_hashsize != desiredvnodes)
 		vcache_reinit();
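
[Editorial sketch, not part of the original commit mail.]  The
lru_iter_first()/lru_iter_next()/lru_iter_release() trio added above is the
familiar marker-node idiom for walking a TAILQ when the list lock may be
dropped between elements: a dummy entry stays on the list and records the
position, so concurrent insertions and removals cannot invalidate the walk.
Reduced to its core, and with illustrative type and function names rather
than the ones in vfs_vnode.c, the idea looks like this:

#include <sys/queue.h>
#include <stdbool.h>
#include <stddef.h>

struct entry {
	TAILQ_ENTRY(entry) e_list;
	bool e_is_marker;	/* true for iterator marker nodes */
	/* ... payload ... */
};
TAILQ_HEAD(entrylist, entry);

/*
 * Advance the marker past the next real entry and return that entry,
 * or NULL at the end of the list.  The caller holds the list lock;
 * because the marker stays on the list, the lock may be dropped and
 * re-taken between successive calls without losing the position.
 */
static struct entry *
iter_next(struct entrylist *head, struct entry *marker)
{
	struct entry *e;

	while ((e = TAILQ_NEXT(marker, e_list)) != NULL) {
		TAILQ_REMOVE(head, marker, e_list);
		TAILQ_INSERT_AFTER(head, e, marker, e_list);
		if (!e->e_is_marker)
			break;	/* skip markers owned by other iterators */
	}
	return e;
}

vrele_flush(), vdrain_one() and vrele_task() all use this loop shape,
dropping vdrain_lock inside the body and relying on the marker to resume
the traversal afterwards.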