Author: mjg
Date: Sat Oct 17 08:48:58 2020
New Revision: 366785
URL: https://svnweb.freebsd.org/changeset/base/366785

Log:
  cache: rework parts of negative entry management
  
  - declutter sysctl vfs.cache by moving relevant entries into
  vfs.cache.neg
  - add a little more parallelism to eviction by replacing the
  global lock with an atomically modified counter
  - track more statistics
  
  The code needs further effort.
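
  The heart of the eviction change is easiest to see in isolation.
  Below is a minimal userspace analogue of the new selection scheme,
  with C11 atomics and pthreads standing in for the kernel primitives;
  this is an illustrative sketch loosely following the diff, not the
  committed code:

  #include <pthread.h>
  #include <stdatomic.h>

  #define NUMNEGLISTS 4

  struct neglist {
          pthread_mutex_t nl_evict_lock;  /* at most one evictor per list */
          /* per-list entry queues and their nl_lock elided */
  };

  static struct neglist neglists[NUMNEGLISTS];
  static atomic_uint neg_cycle;

  static void
  neglists_init(void)
  {
          int i;

          for (i = 0; i < NUMNEGLISTS; i++)
                  pthread_mutex_init(&neglists[i].nl_evict_lock, NULL);
  }

  /*
   * Pick a list round-robin with a lock-free counter bump; if another
   * thread already holds that list's eviction lock, bail out instead
   * of blocking, as cache_neg_evict_select() does in the diff below.
   */
  static struct neglist *
  neg_evict_select(void)
  {
          struct neglist *nl;
          unsigned int c;

          c = atomic_fetch_add(&neg_cycle, 1) + 1;
          nl = &neglists[c % NUMNEGLISTS];
          if (pthread_mutex_trylock(&nl->nl_evict_lock) != 0)
                  return (NULL);  /* contended; skip this round */
          return (nl);
  }

  A NULL return means the caller simply skips eviction for that round;
  the diff accounts for such cases in neg_evict_skipped_contended
  instead of queueing every thread behind the old global
  ncneg_shrink_lock.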

Modified:
  head/sys/kern/vfs_cache.c

Modified: head/sys/kern/vfs_cache.c
==============================================================================
--- head/sys/kern/vfs_cache.c   Sat Oct 17 08:48:32 2020        (r366784)
+++ head/sys/kern/vfs_cache.c   Sat Oct 17 08:48:58 2020        (r366785)
@@ -113,7 +113,7 @@ SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct v
     "struct vnode *");
 SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *",
     "char *");
-SDT_PROBE_DEFINE2(vfs, namecache, shrink_negative, done, "struct vnode *",
+SDT_PROBE_DEFINE2(vfs, namecache, evict_negative, done, "struct vnode *",
     "char *");
 
 SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool");
@@ -251,9 +251,6 @@ cache_ncp_canuse(struct namecache *ncp)
  * bucketlock  mtx     for access to given set of hash buckets
  * neglist     mtx     negative entry LRU management
  *
- * Additionally, ncneg_shrink_lock mtx is used to have at most one thread
- * shrinking the LRU list.
- *
  * It is legal to take multiple vnodelock and bucketlock locks. The locking
  * order is lower address first. Both are recursive.
  *
@@ -305,13 +302,14 @@ static bool __read_frequently cache_fast_revlookup = t
 SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_revlookup, CTLFLAG_RW,
     &cache_fast_revlookup, 0, "");
 
-static struct mtx __exclusive_cache_line       ncneg_shrink_lock;
+static u_int __exclusive_cache_line neg_cycle;
 
 #define ncneghash      3
 #define        numneglists     (ncneghash + 1)
 
 struct neglist {
-       struct mtx              nl_lock;
+       struct mtx              nl_evict_lock;
+       struct mtx              nl_lock __aligned(CACHE_LINE_SIZE);
        TAILQ_HEAD(, namecache) nl_list;
        TAILQ_HEAD(, namecache) nl_hotlist;
        u_long                  nl_hotnum;
@@ -473,10 +471,6 @@ static long zap_and_exit_bucket_fail2; STATNODE_ULONG(
 static long cache_lock_vnodes_cel_3_failures;
 STATNODE_ULONG(cache_lock_vnodes_cel_3_failures,
     "Number of times 3-way vnode locking failed");
-STATNODE_COUNTER(numneg_evicted,
-    "Number of negative entries evicted when adding a new entry");
-STATNODE_COUNTER(shrinking_skipped,
-    "Number of times shrinking was already in progress");
 
 static void cache_zap_locked(struct namecache *ncp);
 static int vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf,
@@ -683,21 +677,6 @@ SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OP
     CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU",
     "VFS cache effectiveness statistics");
 
-static int
-sysctl_hotnum(SYSCTL_HANDLER_ARGS)
-{
-       int i, out;
-
-       out = 0;
-       for (i = 0; i < numneglists; i++)
-               out += neglists[i].nl_hotnum;
-
-       return (SYSCTL_OUT(req, &out, sizeof(out)));
-}
-SYSCTL_PROC(_vfs_cache, OID_AUTO, hotnum, CTLTYPE_INT | CTLFLAG_RD |
-    CTLFLAG_MPSAFE, 0, 0, sysctl_hotnum, "I",
-    "Number of hot negative entries");
-
 #ifdef DIAGNOSTIC
 /*
  * Grab an atomic snapshot of the name cache hash chain lengths
@@ -792,27 +771,77 @@ SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE
 /*
  * Negative entries management
  *
- * A variation of LRU scheme is used. New entries are hashed into one of
- * numneglists cold lists. Entries get promoted to the hot list on first hit.
+ * Various workloads create plenty of negative entries and barely use them
+ * afterwards. Moreover, malicious users can keep performing bogus lookups,
+ * adding even more entries. For example, "make tinderbox" as of this writing
+ * ends up with 2.6M namecache entries in total, 1.2M of which are negative.
  *
- * The shrinker will demote hot list head and evict from the cold list in a
- * round-robin manner.
+ * As such, a rather aggressive eviction method is needed. The currently
+ * employed method is a placeholder.
+ *
+ * Entries are split over numneglists separate lists, each of which is further
+ * split into hot and cold entries. Entries get promoted after getting a hit.
+ * Eviction happens on addition of a new entry.
  */
+static SYSCTL_NODE(_vfs_cache, OID_AUTO, neg, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
+    "Name cache negative entry statistics");
+
+SYSCTL_ULONG(_vfs_cache_neg, OID_AUTO, count, CTLFLAG_RD, &numneg, 0,
+    "Number of negative cache entries");
+
+static COUNTER_U64_DEFINE_EARLY(neg_created);
+SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, created, CTLFLAG_RD, &neg_created,
+    "Number of created negative entries");
+
+static COUNTER_U64_DEFINE_EARLY(neg_evicted);
+SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evicted, CTLFLAG_RD, &neg_evicted,
+    "Number of evicted negative entries");
+
+static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_empty);
+SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_empty, CTLFLAG_RD,
+    &neg_evict_skipped_empty,
+    "Number of times evicting failed due to lack of entries");
+
+static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_contended);
+SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_contended, CTLFLAG_RD,
+    &neg_evict_skipped_contended,
+    "Number of times evicting failed due to contention");
+
+SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, hits, CTLFLAG_RD, &numneghits,
+    "Number of cache hits (negative)");
+
+static int
+sysctl_neg_hot(SYSCTL_HANDLER_ARGS)
+{
+       int i, out;
+
+       out = 0;
+       for (i = 0; i < numneglists; i++)
+               out += neglists[i].nl_hotnum;
+
+       return (SYSCTL_OUT(req, &out, sizeof(out)));
+}
+SYSCTL_PROC(_vfs_cache_neg, OID_AUTO, hot, CTLTYPE_INT | CTLFLAG_RD |
+    CTLFLAG_MPSAFE, 0, 0, sysctl_neg_hot, "I",
+    "Number of hot negative entries");
+
 static void
-cache_negative_init(struct namecache *ncp)
+cache_neg_init(struct namecache *ncp)
 {
        struct negstate *ns;
 
        ncp->nc_flag |= NCF_NEGATIVE;
        ns = NCP2NEGSTATE(ncp);
        ns->neg_flag = 0;
+       counter_u64_add(neg_created, 1);
 }
 
 /*
  * Move a negative entry to the hot list.
  */
 static void
-cache_negative_promote(struct namecache *ncp)
+cache_neg_promote(struct namecache *ncp)
 {
        struct neglist *nl;
        struct negstate *ns;
@@ -838,7 +867,7 @@ cache_negative_promote(struct namecache *ncp)
  * up again.
  */
 static bool
-cache_negative_promote_cond(struct vnode *dvp, struct componentname *cnp,
+cache_neg_promote_cond(struct vnode *dvp, struct componentname *cnp,
     struct namecache *oncp, uint32_t hash)
 {
        struct namecache *ncp;
@@ -896,7 +925,7 @@ cache_negative_promote_cond(struct vnode *dvp, struct 
                goto out_abort;
        }
 
-       cache_negative_promote(ncp);
+       cache_neg_promote(ncp);
 
        SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name);
        counter_u64_add(numneghits, 1);
@@ -910,7 +939,7 @@ out_abort:
 }
 
 static void
-cache_negative_hit(struct namecache *ncp)
+cache_neg_hit(struct namecache *ncp)
 {
        struct neglist *nl;
        struct negstate *ns;
@@ -920,12 +949,12 @@ cache_negative_hit(struct namecache *ncp)
                return;
        nl = NCP2NEGLIST(ncp);
        mtx_lock(&nl->nl_lock);
-       cache_negative_promote(ncp);
+       cache_neg_promote(ncp);
        mtx_unlock(&nl->nl_lock);
 }
 
 static void
-cache_negative_insert(struct namecache *ncp)
+cache_neg_insert(struct namecache *ncp)
 {
        struct neglist *nl;
 
@@ -939,7 +968,7 @@ cache_negative_insert(struct namecache *ncp)
 }
 
 static void
-cache_negative_remove(struct namecache *ncp)
+cache_neg_remove(struct namecache *ncp)
 {
        struct neglist *nl;
        struct negstate *ns;
@@ -959,30 +988,22 @@ cache_negative_remove(struct namecache *ncp)
 }
 
 static struct neglist *
-cache_negative_shrink_select(void)
+cache_neg_evict_select(void)
 {
        struct neglist *nl;
-       static u_int cycle;
-       u_int i;
+       u_int c;
 
-       cycle++;
-       for (i = 0; i < numneglists; i++) {
-               nl = &neglists[(cycle + i) % numneglists];
-               if (TAILQ_FIRST(&nl->nl_list) == NULL &&
-                   TAILQ_FIRST(&nl->nl_hotlist) == NULL)
-                       continue;
-               mtx_lock(&nl->nl_lock);
-               if (TAILQ_FIRST(&nl->nl_list) != NULL ||
-                   TAILQ_FIRST(&nl->nl_hotlist) != NULL)
-                       return (nl);
-               mtx_unlock(&nl->nl_lock);
+       c = atomic_fetchadd_int(&neg_cycle, 1) + 1;
+       nl = &neglists[c % numneglists];
+       if (!mtx_trylock(&nl->nl_evict_lock)) {
+               counter_u64_add(neg_evict_skipped_contended, 1);
+               return (NULL);
        }
-
-       return (NULL);
+       return (nl);
 }
 
 static void
-cache_negative_zap_one(void)
+cache_neg_evict(void)
 {
        struct namecache *ncp, *ncp2;
        struct neglist *nl;
@@ -990,18 +1011,12 @@ cache_negative_zap_one(void)
        struct mtx *dvlp;
        struct mtx *blp;
 
-       if (mtx_owner(&ncneg_shrink_lock) != NULL ||
-           !mtx_trylock(&ncneg_shrink_lock)) {
-               counter_u64_add(shrinking_skipped, 1);
-               return;
-       }
-
-       nl = cache_negative_shrink_select();
-       mtx_unlock(&ncneg_shrink_lock);
+       nl = cache_neg_evict_select();
        if (nl == NULL) {
                return;
        }
 
+       mtx_lock(&nl->nl_lock);
        ncp = TAILQ_FIRST(&nl->nl_hotlist);
        if (ncp != NULL) {
                ns = NCP2NEGSTATE(ncp);
@@ -1011,11 +1026,17 @@ cache_negative_zap_one(void)
                ns->neg_flag &= ~NEG_HOT;
        }
        ncp = TAILQ_FIRST(&nl->nl_list);
-       MPASS(ncp != NULL);
+       if (ncp == NULL) {
+               counter_u64_add(neg_evict_skipped_empty, 1);
+               mtx_unlock(&nl->nl_lock);
+               mtx_unlock(&nl->nl_evict_lock);
+               return;
+       }
        ns = NCP2NEGSTATE(ncp);
        dvlp = VP2VNODELOCK(ncp->nc_dvp);
        blp = NCP2BUCKETLOCK(ncp);
        mtx_unlock(&nl->nl_lock);
+       mtx_unlock(&nl->nl_evict_lock);
        mtx_lock(dvlp);
        mtx_lock(blp);
        /*
@@ -1031,10 +1052,10 @@ cache_negative_zap_one(void)
                ncp = NULL;
        } else {
                vfs_smr_exit();
-               SDT_PROBE2(vfs, namecache, shrink_negative, done, ncp->nc_dvp,
+               SDT_PROBE2(vfs, namecache, evict_negative, done, ncp->nc_dvp,
                    ncp->nc_name);
                cache_zap_locked(ncp);
-               counter_u64_add(numneg_evicted, 1);
+               counter_u64_add(neg_evicted, 1);
        }
        mtx_unlock(blp);
        mtx_unlock(dvlp);
@@ -1074,7 +1095,7 @@ cache_zap_locked(struct namecache *ncp)
        } else {
                SDT_PROBE2(vfs, namecache, zap_negative, done, ncp->nc_dvp,
                    ncp->nc_name);
-               cache_negative_remove(ncp);
+               cache_neg_remove(ncp);
        }
        if (ncp->nc_flag & NCF_ISDOTDOT) {
                if (ncp == ncp->nc_dvp->v_cache_dd) {
@@ -1414,7 +1435,7 @@ negative_success:
        cache_out_ts(ncp, tsp, ticksp);
        counter_u64_add(numneghits, 1);
        whiteout = (ncp->nc_flag & NCF_WHITE);
-       cache_negative_hit(ncp);
+       cache_neg_hit(ncp);
        mtx_unlock(dvlp);
        if (whiteout)
                cnp->cn_flags |= ISWHITEOUT;
@@ -1525,7 +1546,7 @@ negative_success:
        cache_out_ts(ncp, tsp, ticksp);
        counter_u64_add(numneghits, 1);
        whiteout = (ncp->nc_flag & NCF_WHITE);
-       cache_negative_hit(ncp);
+       cache_neg_hit(ncp);
        mtx_unlock(blp);
        if (whiteout)
                cnp->cn_flags |= ISWHITEOUT;
@@ -1628,7 +1649,7 @@ negative_success:
        }
        if (!neg_hot) {
                vfs_smr_exit();
-               if (!cache_negative_promote_cond(dvp, cnp, ncp, hash))
+               if (!cache_neg_promote_cond(dvp, cnp, ncp, hash))
                        goto out_fallback;
        } else {
                SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name);
@@ -1927,15 +1948,15 @@ cache_enter_time(struct vnode *dvp, struct vnode *vp, 
         * Bugs:
         * 1. filesystems may end up trying to add an already existing entry
         * (for example this can happen after a cache miss during concurrent
-        * lookup), in which case we will call cache_negative_zap_one despite
-        * not adding anything.
+        * lookup), in which case we will call cache_neg_evict despite not
+        * adding anything.
         * 2. the routine may fail to free anything and no provisions are made
         * to make it try harder (see the inside for failure modes)
         * 3. it only ever looks at negative entries.
         */
        lnumcache = atomic_fetchadd_long(&numcache, 1) + 1;
        if (numneg * ncnegfactor > lnumcache) {
-               cache_negative_zap_one();
+               cache_neg_evict();
                lnumcache = atomic_load_long(&numcache);
        }
        if (__predict_false(lnumcache >= ncsize)) {
@@ -1956,7 +1977,7 @@ cache_enter_time(struct vnode *dvp, struct vnode *vp, 
        ncp->nc_flag = flag | NCF_WIP;
        ncp->nc_vp = vp;
        if (vp == NULL)
-               cache_negative_init(ncp);
+               cache_neg_init(ncp);
        ncp->nc_dvp = dvp;
        if (tsp != NULL) {
                ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
@@ -2081,7 +2102,7 @@ cache_enter_time(struct vnode *dvp, struct vnode *vp, 
        } else {
                if (cnp->cn_flags & ISWHITEOUT)
                        ncp->nc_flag |= NCF_WHITE;
-               cache_negative_insert(ncp);
+               cache_neg_insert(ncp);
                SDT_PROBE2(vfs, namecache, enter_negative, done, dvp,
                    ncp->nc_name);
        }
@@ -2183,12 +2204,11 @@ nchinit(void *dummy __unused)
                mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE);
 
        for (i = 0; i < numneglists; i++) {
+               mtx_init(&neglists[i].nl_evict_lock, "ncnege", NULL, MTX_DEF);
                mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF);
                TAILQ_INIT(&neglists[i].nl_list);
                TAILQ_INIT(&neglists[i].nl_hotlist);
        }
-
-       mtx_init(&ncneg_shrink_lock, "ncnegs", NULL, MTX_DEF);
 }
 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL);
 
@@ -3485,7 +3505,7 @@ cache_fplookup_negative_promote(struct cache_fpl *fpl,
        dvp = fpl->dvp;
 
        cache_fpl_smr_exit(fpl);
-       if (cache_negative_promote_cond(dvp, cnp, oncp, hash))
+       if (cache_neg_promote_cond(dvp, cnp, oncp, hash))
                return (cache_fpl_handled(fpl, ENOENT));
        else
                return (cache_fpl_aborted(fpl));
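
  For reference, the statistics moved and added by this commit live
  under the new vfs.cache.neg node. Based on the descriptions in the
  diff, "sysctl -d vfs.cache.neg" should report roughly the following
  (exact ordering may differ):

  vfs.cache.neg: Name cache negative entry statistics
  vfs.cache.neg.count: Number of negative cache entries
  vfs.cache.neg.created: Number of created negative entries
  vfs.cache.neg.evicted: Number of evicted negative entries
  vfs.cache.neg.evict_skipped_empty: Number of times evicting failed due to lack of entries
  vfs.cache.neg.evict_skipped_contended: Number of times evicting failed due to contention
  vfs.cache.neg.hits: Number of cache hits (negative)
  vfs.cache.neg.hot: Number of hot negative entries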