The branch stable/14 has been updated by mjg:

URL: https://cgit.FreeBSD.org/src/commit/?id=cfbc3927613a8455498db5ff3e67046a4dab15a1

commit cfbc3927613a8455498db5ff3e67046a4dab15a1
Author:     Mateusz Guzik <m...@freebsd.org>
AuthorDate: 2023-09-14 19:08:40 +0000
Commit:     Mateusz Guzik <m...@freebsd.org>
CommitDate: 2023-10-13 23:41:47 +0000

    vfs: prefix regular vnlru with a special case for free vnodes
    
    Works around severe performance problems in certain corner cases; see
    the commentary added.
    
    Modifying vnlru logic has proven rather error prone in the past and a
    release is near, thus take the easy way out and fix it without having to
    dig into the current machinery.
    
    (cherry picked from commit 90a008e94bb205e5b8f3c41d57e155b59a6be95d)
---
 sys/kern/vfs_subr.c | 117 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 113 insertions(+), 4 deletions(-)

diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c
index 5e39a149ef36..5834feff080c 100644
--- a/sys/kern/vfs_subr.c
+++ b/sys/kern/vfs_subr.c
@@ -1427,6 +1427,14 @@ vnlru_free_locked(int count)
        return (ret);
 }
 
+static int
+vnlru_free(int count)
+{
+
+       mtx_lock(&vnode_list_mtx);
+       return (vnlru_free_locked(count));
+}
+
 void
 vnlru_free_vfsops(int count, struct vfsops *mnt_op, struct vnode *mvp)
 {
@@ -1594,6 +1602,106 @@ vnlru_kick_cond(void)
        mtx_unlock(&vnode_list_mtx);
 }
 
+static void
+vnlru_proc_sleep(void)
+{
+
+       if (vnlruproc_sig) {
+               vnlruproc_sig = 0;
+               wakeup(&vnlruproc_sig);
+       }
+       msleep(vnlruproc, &vnode_list_mtx, PVFS|PDROP, "vlruwt", hz);
+}
+
+/*
+ * A lighter version of the machinery below.
+ *
+ * Tries to reach goals only by recycling free vnodes and does not invoke
+ * uma_reclaim(UMA_RECLAIM_DRAIN).
+ *
+ * This works around pathological behavior in vnlru in the presence of tons of
+ * free vnodes, but without having to rewrite the machinery at this time. Said
+ * behavior boils down to continuously trying to reclaim all kinds of vnodes
+ * (cycling through all levels of "force") when the count is transiently above
+ * limit. This happens a lot when all vnodes are used up and vn_alloc
+ * speculatively increments the counter.
+ *
+ * Sample testcase: vnode limit 8388608, 20 separate directory trees each with
+ * 1 million files in total and 20 find(1) processes stating them in parallel
+ * (one per tree).
+ *
+ * On a kernel with only stock machinery this needs anywhere between 60 and 120
+ * seconds to execute (time varies *wildly* between runs). With the workaround
+ * it consistently stays around 20 seconds.
+ *
+ * That is to say the entire thing needs a fundamental redesign (most notably
+ * to accommodate faster recycling), the above only tries to get it out of the way.
+ *
+ * Return values are:
+ * -1 -- fallback to regular vnlru loop
+ *  0 -- do nothing, go to sleep
+ * >0 -- recycle this many vnodes
+ */
+static long
+vnlru_proc_light_pick(void)
+{
+       u_long rnumvnodes, rfreevnodes;
+
+       if (vstir || vnlruproc_sig == 1)
+               return (-1);
+
+       rnumvnodes = atomic_load_long(&numvnodes);
+       rfreevnodes = vnlru_read_freevnodes();
+
+       /*
+        * vnode limit might have changed and now we may be at a significant
+        * excess. Bail if we can't sort it out with free vnodes.
+        */
+       if (rnumvnodes > desiredvnodes) {
+               if (rnumvnodes - rfreevnodes >= desiredvnodes ||
+                   rfreevnodes <= wantfreevnodes) {
+                       return (-1);
+               }
+
+               return (rnumvnodes - desiredvnodes);
+       }
+
+       /*
+        * Don't try to reach wantfreevnodes target if there are too few vnodes
+        * to begin with.
+        */
+       if (rnumvnodes < wantfreevnodes) {
+               return (0);
+       }
+
+       if (rfreevnodes < wantfreevnodes) {
+               return (-1);
+       }
+
+       return (0);
+}
+
+static bool
+vnlru_proc_light(void)
+{
+       long freecount;
+
+       mtx_assert(&vnode_list_mtx, MA_NOTOWNED);
+
+       freecount = vnlru_proc_light_pick();
+       if (freecount == -1)
+               return (false);
+
+       if (freecount != 0) {
+               vnlru_free(freecount);
+       }
+
+       mtx_lock(&vnode_list_mtx);
+       vnlru_proc_sleep();
+       mtx_assert(&vnode_list_mtx, MA_NOTOWNED);
+       return (true);
+}
+
 static void
 vnlru_proc(void)
 {
@@ -1609,6 +1717,10 @@ vnlru_proc(void)
        want_reread = false;
        for (;;) {
                kproc_suspend_check(vnlruproc);
+
+               if (force == 0 && vnlru_proc_light())
+                       continue;
+
                mtx_lock(&vnode_list_mtx);
                rnumvnodes = atomic_load_long(&numvnodes);
 
@@ -1639,10 +1751,7 @@ vnlru_proc(void)
                        vstir = false;
                }
                if (force == 0 && !vnlru_under(rnumvnodes, vlowat)) {
-                       vnlruproc_sig = 0;
-                       wakeup(&vnlruproc_sig);
-                       msleep(vnlruproc, &vnode_list_mtx,
-                           PVFS|PDROP, "vlruwt", hz);
+                       vnlru_proc_sleep();
                        continue;
                }
                rfreevnodes = vnlru_read_freevnodes();
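
For readers who want to trace the new decision table without building a
kernel, here is a small userspace sketch of the vnlru_proc_light_pick()
logic from the hunk above. It is an approximation for illustration only:
the vstir/vnlruproc_sig early bail is omitted, the pick() helper name is
ours, the globals are plain longs rather than kernel state, and
wantfreevnodes is assumed to be desiredvnodes / 4 as in the stock
vfs_subr.c default.

/*
 * Userspace sketch of the free-vnode fast-path decision.
 * Illustrative stand-in, not kernel code.
 */
#include <stdio.h>

static long desiredvnodes = 8388608;       /* vnode limit from the sample testcase */
static long wantfreevnodes = 8388608 / 4;  /* assumed desiredvnodes / 4 default */

/*
 * Same return contract as documented in the diff:
 *  -1 -- fall back to the regular vnlru loop
 *   0 -- nothing to do, go to sleep
 *  >0 -- recycle this many free vnodes
 */
static long
pick(long rnumvnodes, long rfreevnodes)
{

        /* Over the limit: only handle it if free vnodes alone suffice. */
        if (rnumvnodes > desiredvnodes) {
                if (rnumvnodes - rfreevnodes >= desiredvnodes ||
                    rfreevnodes <= wantfreevnodes)
                        return (-1);
                return (rnumvnodes - desiredvnodes);
        }

        /* Too few vnodes overall to chase the free-vnode target. */
        if (rnumvnodes < wantfreevnodes)
                return (0);

        /* Below the free-vnode target: defer to the full machinery. */
        if (rfreevnodes < wantfreevnodes)
                return (-1);

        return (0);
}

int
main(void)
{

        /* Transient excess fully covered by free vnodes: prints 1000. */
        printf("%ld\n", pick(8388608 + 1000, 3000000));
        /* Excess, but in-use vnodes already reach the limit: prints -1. */
        printf("%ld\n", pick(8388608 + 1000, 1000));
        /* Under the limit with plenty of free vnodes: prints 0. */
        printf("%ld\n", pick(8000000, 3000000));
        return (0);
}

Compiled with cc and run, this prints 1000, -1 and 0, matching the three
return classes: the light pass recycles just the excess when free vnodes
can cover it, falls back to the heavyweight loop when they cannot, and
goes back to sleep otherwise.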
