Author: mjg
Date: Thu Jan 16 10:44:02 2020
New Revision: 356786
URL: https://svnweb.freebsd.org/changeset/base/356786

Log:
  vfs: reimplement vlrureclaim to actually use LRU
  
  Take advantage of global ordering introduced in r356672.
  
  Reviewed by:  mckusick (previous version)
  Differential Revision:        https://reviews.freebsd.org/D23067

Modified:
  head/sys/kern/vfs_subr.c

Modified: head/sys/kern/vfs_subr.c
==============================================================================
--- head/sys/kern/vfs_subr.c    Thu Jan 16 10:35:47 2020        (r356785)
+++ head/sys/kern/vfs_subr.c    Thu Jan 16 10:44:02 2020        (r356786)
@@ -166,6 +166,7 @@ int vttoif_tab[10] = {
  */
 static TAILQ_HEAD(freelst, vnode) vnode_list;
 static struct vnode *vnode_list_free_marker;
+static struct vnode *vnode_list_reclaim_marker;
 
 /*
  * "Free" vnode target.  Free vnodes are rarely completely free, but are
@@ -653,6 +654,8 @@ vntblinit(void *dummy __unused)
        mtx_unlock(&vnode_list_mtx);
        vnode_list_free_marker = vn_alloc_marker(NULL);
        TAILQ_INSERT_HEAD(&vnode_list, vnode_list_free_marker, v_vnodelist);
+       vnode_list_reclaim_marker = vn_alloc_marker(NULL);
+       TAILQ_INSERT_HEAD(&vnode_list, vnode_list_reclaim_marker, v_vnodelist);
        vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL,
            vnode_init, vnode_fini, UMA_ALIGN_PTR, 0);
        vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo),
@@ -1057,6 +1060,17 @@ vattr_null(struct vattr *vap)
 }
 
 /*
+ * Try to reduce the total number of vnodes.
+ *
+ * This routine (and its user) are buggy in at least the following ways:
+ * - all parameters were picked years ago when RAM sizes were significantly
+ *   smaller
+ * - it can pick vnodes based on pages used by the vm object, but filesystems
+ *   like ZFS don't use it making the pick broken
+ * - since ZFS has its own aging policy it gets partially combated by this one
+ * - a dedicated method should be provided for filesystems to let them decide
+ *   whether the vnode should be recycled
+ *
  * This routine is called when we have too many vnodes.  It attempts
  * to free <count> vnodes and will potentially free vnodes that still
  * have VM backing store (VM backing store is typically the cause
@@ -1071,118 +1085,116 @@ vattr_null(struct vattr *vap)
  * number of vnodes to reach some minimum value regardless of what
  * you set kern.maxvnodes to.  Do not set kern.maxvnodes too low.
  *
- * @param mp            Try to reclaim vnodes from this mountpoint
  * @param reclaim_nc_src Only reclaim directories with outgoing namecache
  *                      entries if this argument is strue
  * @param trigger       Only reclaim vnodes with fewer than this many resident
  *                      pages.
+ * @param target        How many vnodes to reclaim.
  * @return              The number of vnodes that were reclaimed.
  */
 static int
-vlrureclaim(struct mount *mp, bool reclaim_nc_src, int trigger)
+vlrureclaim(bool reclaim_nc_src, int trigger, u_long target)
 {
-       struct vnode *vp;
-       int count, done, target;
+       struct vnode *vp, *mvp;
+       struct mount *mp;
+       u_long done;
+       bool retried;
 
+       mtx_assert(&vnode_list_mtx, MA_OWNED);
+
+       retried = false;
        done = 0;
-       vn_start_write(NULL, &mp, V_WAIT);
-       MNT_ILOCK(mp);
-       count = mp->mnt_nvnodelistsize;
-       target = count * (int64_t)gapvnodes / imax(desiredvnodes, 1);
-       target = target / 10 + 1;
-       while (count != 0 && done < target) {
-               vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
-               while (vp != NULL && vp->v_type == VMARKER)
-                       vp = TAILQ_NEXT(vp, v_nmntvnodes);
-               if (vp == NULL)
+
+       mvp = vnode_list_reclaim_marker;
+restart:
+       vp = mvp;
+       while (done < target) {
+               vp = TAILQ_NEXT(vp, v_vnodelist);
+               if (__predict_false(vp == NULL))
                        break;
+
+               if (__predict_false(vp->v_type == VMARKER))
+                       continue;
+
                /*
-                * XXX LRU is completely broken for non-free vnodes.  First
-                * by calling here in mountpoint order, then by moving
-                * unselected vnodes to the end here, and most grossly by
-                * removing the vlruvp() function that was supposed to
-                * maintain the order.  (This function was born broken
-                * since syncer problems prevented it doing anything.)  The
-                * order is closer to LRC (C = Created).
-                *
-                * LRU reclaiming of vnodes seems to have last worked in
-                * FreeBSD-3 where LRU wasn't mentioned under any spelling.
-                * Then there was no hold count, and inactive vnodes were
-                * simply put on the free list in LRU order.  The separate
-                * lists also break LRU.  We prefer to reclaim from the
-                * free list for technical reasons.  This tends to thrash
-                * the free list to keep very unrecently used held vnodes.
-                * The problem is mitigated by keeping the free list large.
-                */
-               TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
-               TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
-               --count;
-               if (!VI_TRYLOCK(vp))
-                       goto next_iter;
-               /*
                 * If it's been deconstructed already, it's still
                 * referenced, or it exceeds the trigger, skip it.
                 * Also skip free vnodes.  We are trying to make space
                 * to expand the free list, not reduce it.
                 */
-               if (vp->v_usecount ||
+               if (vp->v_usecount > 0 || vp->v_holdcnt == 0 ||
+                   (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)))
+                       goto next_iter;
+
+               if (vp->v_type == VBAD || vp->v_type == VNON)
+                       goto next_iter;
+
+               if (!VI_TRYLOCK(vp))
+                       goto next_iter;
+
+               if (vp->v_usecount > 0 || vp->v_holdcnt == 0 ||
                    (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) ||
-                   vp->v_holdcnt == 0 ||
-                   VN_IS_DOOMED(vp) || (vp->v_object != NULL &&
+                   vp->v_type == VBAD || vp->v_type == VNON ||
+                   (vp->v_object != NULL &&
                    vp->v_object->resident_page_count > trigger)) {
                        VI_UNLOCK(vp);
                        goto next_iter;
                }
-               MNT_IUNLOCK(mp);
                vholdl(vp);
-               if (VOP_LOCK(vp, LK_INTERLOCK|LK_EXCLUSIVE|LK_NOWAIT)) {
+               VI_UNLOCK(vp);
+               TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
+               TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist);
+               mtx_unlock(&vnode_list_mtx);
+
+               if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
                        vdrop(vp);
-                       goto next_iter_mntunlocked;
+                       goto next_iter_unlocked;
                }
+               if (VOP_LOCK(vp, LK_EXCLUSIVE|LK_NOWAIT) != 0) {
+                       vdrop(vp);
+                       vn_finished_write(mp);
+                       goto next_iter_unlocked;
+               }
+
                VI_LOCK(vp);
-               /*
-                * v_usecount may have been bumped after VOP_LOCK() dropped
-                * the vnode interlock and before it was locked again.
-                *
-                * It is not necessary to recheck VIRF_DOOMED because it can
-                * only be set by another thread that holds both the vnode
-                * lock and vnode interlock.  If another thread has the
-                * vnode lock before we get to VOP_LOCK() and obtains the
-                * vnode interlock after VOP_LOCK() drops the vnode
-                * interlock, the other thread will be unable to drop the
-                * vnode lock before our VOP_LOCK() call fails.
-                */
-               if (vp->v_usecount ||
+               if (vp->v_usecount > 0 ||
                    (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) ||
                    (vp->v_object != NULL &&
                    vp->v_object->resident_page_count > trigger)) {
                        VOP_UNLOCK(vp);
                        vdropl(vp);
-                       goto next_iter_mntunlocked;
+                       vn_finished_write(mp);
+                       goto next_iter_unlocked;
                }
-               KASSERT(!VN_IS_DOOMED(vp),
-                   ("VIRF_DOOMED unexpectedly detected in vlrureclaim()"));
                counter_u64_add(recycles_count, 1);
                vgonel(vp);
                VOP_UNLOCK(vp);
                vdropl(vp);
+               vn_finished_write(mp);
                done++;
-next_iter_mntunlocked:
-               if (!should_yield())
-                       goto relock_mnt;
-               goto yield;
+next_iter_unlocked:
+               if (should_yield())
+                       kern_yield(PRI_USER);
+               mtx_lock(&vnode_list_mtx);
+               goto restart;
 next_iter:
+               MPASS(vp->v_type != VMARKER);
                if (!should_yield())
                        continue;
-               MNT_IUNLOCK(mp);
-yield:
+               TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
+               TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist);
+               mtx_unlock(&vnode_list_mtx);
                kern_yield(PRI_USER);
-relock_mnt:
-               MNT_ILOCK(mp);
+               mtx_lock(&vnode_list_mtx);
+               goto restart;
        }
-       MNT_IUNLOCK(mp);
-       vn_finished_write(mp);
-       return done;
+       if (done == 0 && !retried) {
+               TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
+               TAILQ_INSERT_HEAD(&vnode_list, mvp, v_vnodelist);
+               retried = true;
+               goto restart;
+       }
+       return (done);
 }
 
 static int max_vnlru_free = 10000; /* limit on vnode free requests per call */
@@ -1291,8 +1303,7 @@ static int vnlruproc_sig;
 static void
 vnlru_proc(void)
 {
-       u_long rnumvnodes, rfreevnodes;
-       struct mount *mp, *nmp;
+       u_long rnumvnodes, rfreevnodes, target;
        unsigned long onumvnodes;
        int done, force, trigger, usevnodes, vsp;
        bool reclaim_nc_src;
@@ -1331,9 +1342,6 @@ vnlru_proc(void)
                            PVFS|PDROP, "vlruwt", hz);
                        continue;
                }
-               mtx_unlock(&vnode_list_mtx);
-               done = 0;
-               rnumvnodes = atomic_load_long(&numvnodes);
                rfreevnodes = atomic_load_long(&freevnodes);
 
                onumvnodes = rnumvnodes;
@@ -1362,18 +1370,10 @@ vnlru_proc(void)
                if (force < 2)
                        trigger = vsmalltrigger;
                reclaim_nc_src = force >= 3;
-               mtx_lock(&mountlist_mtx);
-               for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
-                       if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
-                               nmp = TAILQ_NEXT(mp, mnt_list);
-                               continue;
-                       }
-                       done += vlrureclaim(mp, reclaim_nc_src, trigger);
-                       mtx_lock(&mountlist_mtx);
-                       nmp = TAILQ_NEXT(mp, mnt_list);
-                       vfs_unbusy(mp);
-               }
-               mtx_unlock(&mountlist_mtx);
+               target = rnumvnodes * (int64_t)gapvnodes / imax(desiredvnodes, 1);
+               target = target / 10 + 1;
+               done = vlrureclaim(reclaim_nc_src, trigger, target);
+               mtx_unlock(&vnode_list_mtx);
                if (onumvnodes > desiredvnodes && numvnodes <= desiredvnodes)
                        uma_reclaim(UMA_RECLAIM_DRAIN);
                if (done == 0) {
_______________________________________________
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscribe@freebsd.org"

Reply via email to