Author: jeff
Date: Sat Mar 17 18:14:49 2018
New Revision: 331106
URL: https://svnweb.freebsd.org/changeset/base/331106

Log:
  Move the dirty queues inside the per-domain structure.  This resolves a bug
  where a single queue could be starved for space by dirty buffers even though
  the global dirty limits had not been hit.  A single buf_daemon is maintained
  for now.
  
  Add a bd_speedup() when we are low on bufspace.  This can happen due to SUJ
  keeping many bufs locked until a cg block is written.  Document this with
  a comment.
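  When the bufspace daemon cannot recycle or flush anything in its domain it
  now calls bd_speedup(), so the buf daemon flushes each domain down to half
  of its current dirty count instead of stopping at bd_lodirtybuffers.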
  
  Fix sysctls to work with per-domain variables.  Add more ddb debugging.
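  The new handlers split a value written to the global knob evenly among the
  domains; for example, with four buffer domains a hidirtybuffers value of
  4000 leaves each bdomain[i].bd_hidirtybuffers at 1000, while reading
  vfs.numdirtybuffers sums bd_numdirtybuffers across all domains.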
  
  Reported by:  pho
  Reviewed by:  kib
  Tested by:    pho
  Sponsored by: Netflix, Dell/EMC Isilon
  Differential Revision:        https://reviews.freebsd.org/D14705

Modified:
  head/sys/kern/vfs_bio.c

Modified: head/sys/kern/vfs_bio.c
==============================================================================
--- head/sys/kern/vfs_bio.c     Sat Mar 17 17:56:08 2018        (r331105)
+++ head/sys/kern/vfs_bio.c     Sat Mar 17 18:14:49 2018        (r331106)
@@ -50,6 +50,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bio.h>
+#include <sys/bitset.h>
 #include <sys/conf.h>
 #include <sys/counter.h>
 #include <sys/buf.h>
@@ -100,6 +101,52 @@ struct     buf_ops buf_ops_bio = {
        .bop_bdflush    =       bufbdflush,
 };
 
+struct bufqueue {
+       struct mtx_padalign     bq_lock;
+       TAILQ_HEAD(, buf)       bq_queue;
+       uint8_t                 bq_index;
+       uint16_t                bq_subqueue;
+       int                     bq_len;
+} __aligned(CACHE_LINE_SIZE);
+
+#define        BQ_LOCKPTR(bq)          (&(bq)->bq_lock)
+#define        BQ_LOCK(bq)             mtx_lock(BQ_LOCKPTR((bq)))
+#define        BQ_UNLOCK(bq)           mtx_unlock(BQ_LOCKPTR((bq)))
+#define        BQ_ASSERT_LOCKED(bq)    mtx_assert(BQ_LOCKPTR((bq)), MA_OWNED)
+
+struct bufdomain {
+       struct bufqueue bd_subq[MAXCPU + 1]; /* Per-cpu sub queues + global */
+       struct bufqueue bd_dirtyq;
+       struct bufqueue *bd_cleanq;
+       struct mtx_padalign bd_run_lock;
+       /* Constants */
+       long            bd_maxbufspace;
+       long            bd_hibufspace;
+       long            bd_lobufspace;
+       long            bd_bufspacethresh;
+       int             bd_hifreebuffers;
+       int             bd_lofreebuffers;
+       int             bd_hidirtybuffers;
+       int             bd_lodirtybuffers;
+       int             bd_dirtybufthresh;
+       int             bd_lim;
+       /* atomics */
+       int             bd_wanted;
+       int __aligned(CACHE_LINE_SIZE)  bd_numdirtybuffers;
+       int __aligned(CACHE_LINE_SIZE)  bd_running;
+       long __aligned(CACHE_LINE_SIZE) bd_bufspace;
+       int __aligned(CACHE_LINE_SIZE)  bd_freebuffers;
+} __aligned(CACHE_LINE_SIZE);
+
+#define        BD_LOCKPTR(bd)          (&(bd)->bd_cleanq->bq_lock)
+#define        BD_LOCK(bd)             mtx_lock(BD_LOCKPTR((bd)))
+#define        BD_UNLOCK(bd)           mtx_unlock(BD_LOCKPTR((bd)))
+#define        BD_ASSERT_LOCKED(bd)    mtx_assert(BD_LOCKPTR((bd)), MA_OWNED)
+#define        BD_RUN_LOCKPTR(bd)      (&(bd)->bd_run_lock)
+#define        BD_RUN_LOCK(bd)         mtx_lock(BD_RUN_LOCKPTR((bd)))
+#define        BD_RUN_UNLOCK(bd)       mtx_unlock(BD_RUN_LOCKPTR((bd)))
+#define        BD_DOMAIN(bd)           (bd - bdomain)
+
 static struct buf *buf;                /* buffer header pool */
 extern struct buf *swbuf;      /* Swap buffer header pool. */
 caddr_t unmapped_buf;
@@ -123,8 +170,8 @@ static int vfs_bio_clcheck(struct vnode *vp, int size,
                daddr_t lblkno, daddr_t blkno);
 static void breada(struct vnode *, daddr_t *, int *, int, struct ucred *, int,
                void (*)(struct buf *));
-static int buf_flush(struct vnode *vp, int);
-static int flushbufqueues(struct vnode *, int, int);
+static int buf_flush(struct vnode *vp, struct bufdomain *, int);
+static int flushbufqueues(struct vnode *, struct bufdomain *, int, int);
 static void buf_daemon(void);
 static __inline void bd_wakeup(void);
 static int sysctl_runningspace(SYSCTL_HANDLER_ARGS);
@@ -133,6 +180,16 @@ static void bufkva_free(struct buf *);
 static int buf_import(void *, void **, int, int, int);
 static void buf_release(void *, void **, int);
 static void maxbcachebuf_adjust(void);
+static inline struct bufdomain *bufdomain(struct buf *);
+static void bq_remove(struct bufqueue *bq, struct buf *bp);
+static void bq_insert(struct bufqueue *bq, struct buf *bp, bool unlock);
+static int buf_recycle(struct bufdomain *, bool kva);
+static void bq_init(struct bufqueue *bq, int qindex, int cpu,
+           const char *lockname);
+static void bd_init(struct bufdomain *bd);
+static int bd_flushall(struct bufdomain *bd);
+static int sysctl_bufdomain_long(SYSCTL_HANDLER_ARGS);
+static int sysctl_bufdomain_int(SYSCTL_HANDLER_ARGS);
 
 static int sysctl_bufspace(SYSCTL_HANDLER_ARGS);
 int vmiodirenable = TRUE;
@@ -147,7 +204,9 @@ static counter_u64_t bufkvaspace;
 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, bufkvaspace, CTLFLAG_RD, &bufkvaspace,
     "Kernel virtual memory used for buffers");
 static long maxbufspace;
-SYSCTL_LONG(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RW, &maxbufspace, 0,
+SYSCTL_PROC(_vfs, OID_AUTO, maxbufspace,
+    CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &maxbufspace,
+    __offsetof(struct bufdomain, bd_maxbufspace), sysctl_bufdomain_long, "L",
     "Maximum allowed value of bufspace (including metadata)");
 static long bufmallocspace;
 SYSCTL_LONG(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, &bufmallocspace, 0,
@@ -156,14 +215,20 @@ static long maxbufmallocspace;
 SYSCTL_LONG(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace,
     0, "Maximum amount of malloced memory for buffers");
 static long lobufspace;
-SYSCTL_LONG(_vfs, OID_AUTO, lobufspace, CTLFLAG_RW, &lobufspace, 0,
+SYSCTL_PROC(_vfs, OID_AUTO, lobufspace,
+    CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &lobufspace,
+    __offsetof(struct bufdomain, bd_lobufspace), sysctl_bufdomain_long, "L",
     "Minimum amount of buffers we want to have");
 long hibufspace;
-SYSCTL_LONG(_vfs, OID_AUTO, hibufspace, CTLFLAG_RW, &hibufspace, 0,
+SYSCTL_PROC(_vfs, OID_AUTO, hibufspace,
+    CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &hibufspace,
+    __offsetof(struct bufdomain, bd_hibufspace), sysctl_bufdomain_long, "L",
     "Maximum allowed value of bufspace (excluding metadata)");
 long bufspacethresh;
-SYSCTL_LONG(_vfs, OID_AUTO, bufspacethresh, CTLFLAG_RW, &bufspacethresh,
-    0, "Bufspace consumed before waking the daemon to free some");
+SYSCTL_PROC(_vfs, OID_AUTO, bufspacethresh,
+    CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &bufspacethresh,
+    __offsetof(struct bufdomain, bd_bufspacethresh), sysctl_bufdomain_long, "L",
+    "Bufspace consumed before waking the daemon to free some");
 static counter_u64_t buffreekvacnt;
 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt,
     "Number of times we have freed the KVA space from some buffer");
@@ -190,26 +255,37 @@ SYSCTL_INT(_vfs, OID_AUTO, altbufferflushes, CTLFLAG_R
 static int recursiveflushes;
 SYSCTL_INT(_vfs, OID_AUTO, recursiveflushes, CTLFLAG_RW, &recursiveflushes,
     0, "Number of flushes skipped due to being recursive");
-static int numdirtybuffers;
-SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD, &numdirtybuffers, 0,
+static int sysctl_numdirtybuffers(SYSCTL_HANDLER_ARGS);
+SYSCTL_PROC(_vfs, OID_AUTO, numdirtybuffers,
+    CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_RD, NULL, 0, sysctl_numdirtybuffers, "I",
     "Number of buffers that are dirty (has unwritten changes) at the moment");
 static int lodirtybuffers;
-SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW, &lodirtybuffers, 0,
+SYSCTL_PROC(_vfs, OID_AUTO, lodirtybuffers,
+    CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &lodirtybuffers,
+    __offsetof(struct bufdomain, bd_lodirtybuffers), sysctl_bufdomain_int, "L",
     "How many buffers we want to have free before bufdaemon can sleep");
 static int hidirtybuffers;
-SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW, &hidirtybuffers, 0,
+SYSCTL_PROC(_vfs, OID_AUTO, hidirtybuffers,
+    CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &hidirtybuffers,
+    __offsetof(struct bufdomain, bd_hidirtybuffers), sysctl_bufdomain_int, "L",
     "When the number of dirty buffers is considered severe");
 int dirtybufthresh;
-SYSCTL_INT(_vfs, OID_AUTO, dirtybufthresh, CTLFLAG_RW, &dirtybufthresh,
-    0, "Number of bdwrite to bawrite conversions to clear dirty buffers");
+SYSCTL_PROC(_vfs, OID_AUTO, dirtybufthresh,
+    CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &dirtybufthresh,
+    __offsetof(struct bufdomain, bd_dirtybufthresh), sysctl_bufdomain_int, "L",
+    "Number of bdwrite to bawrite conversions to clear dirty buffers");
 static int numfreebuffers;
 SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD, &numfreebuffers, 0,
     "Number of free buffers");
 static int lofreebuffers;
-SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW, &lofreebuffers, 0,
+SYSCTL_PROC(_vfs, OID_AUTO, lofreebuffers,
+    CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &lofreebuffers,
+    __offsetof(struct bufdomain, bd_lofreebuffers), sysctl_bufdomain_int, "L",
    "Target number of free buffers");
 static int hifreebuffers;
-SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW, &hifreebuffers, 0,
+SYSCTL_PROC(_vfs, OID_AUTO, hifreebuffers,
+    CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &hifreebuffers,
+    __offsetof(struct bufdomain, bd_hifreebuffers), sysctl_bufdomain_int, "L",
    "Threshold for clean buffer recycling");
 static counter_u64_t getnewbufcalls;
 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RD,
@@ -294,66 +370,19 @@ static int bdirtywait;
 #define QUEUE_CLEAN    3       /* non-B_DELWRI buffers */
 #define QUEUE_SENTINEL 4       /* not an queue index, but mark for sentinel */
 
-struct bufqueue {
-       struct mtx_padalign     bq_lock;
-       TAILQ_HEAD(, buf)       bq_queue;
-       uint8_t                 bq_index;
-       uint16_t                bq_subqueue;
-       int                     bq_len;
-} __aligned(CACHE_LINE_SIZE);
+/* Maximum number of buffer domains. */
+#define        BUF_DOMAINS     8
 
-#define        BQ_LOCKPTR(bq)          (&(bq)->bq_lock)
-#define        BQ_LOCK(bq)             mtx_lock(BQ_LOCKPTR((bq)))
-#define        BQ_UNLOCK(bq)           mtx_unlock(BQ_LOCKPTR((bq)))
-#define        BQ_ASSERT_LOCKED(bq)    mtx_assert(BQ_LOCKPTR((bq)), MA_OWNED)
+struct bufdomainset bdlodirty;         /* Domains > lodirty */
+struct bufdomainset bdhidirty;         /* Domains > hidirty */
 
-struct bufqueue __exclusive_cache_line bqempty;
-struct bufqueue __exclusive_cache_line bqdirty;
-
-struct bufdomain {
-       struct bufqueue bd_subq[MAXCPU + 1]; /* Per-cpu sub queues + global */
-       struct bufqueue *bd_cleanq;
-       struct mtx_padalign bd_run_lock;
-       /* Constants */
-       long            bd_maxbufspace;
-       long            bd_hibufspace;
-       long            bd_lobufspace;
-       long            bd_bufspacethresh;
-       int             bd_hifreebuffers;
-       int             bd_lofreebuffers;
-       int             bd_lim;
-       /* atomics */
-       int             bd_wanted;
-       int  __aligned(CACHE_LINE_SIZE) bd_running;
-       long __aligned(CACHE_LINE_SIZE) bd_bufspace;
-       int __aligned(CACHE_LINE_SIZE)  bd_freebuffers;
-} __aligned(CACHE_LINE_SIZE);
-
-#define        BD_LOCKPTR(bd)          (&(bd)->bd_cleanq->bq_lock)
-#define        BD_LOCK(bd)             mtx_lock(BD_LOCKPTR((bd)))
-#define        BD_UNLOCK(bd)           mtx_unlock(BD_LOCKPTR((bd)))
-#define        BD_ASSERT_LOCKED(bd)    mtx_assert(BD_LOCKPTR((bd)), MA_OWNED)
-#define        BD_RUN_LOCKPTR(bd)      (&(bd)->bd_run_lock)
-#define        BD_RUN_LOCK(bd)         mtx_lock(BD_RUN_LOCKPTR((bd)))
-#define        BD_RUN_UNLOCK(bd)       mtx_unlock(BD_RUN_LOCKPTR((bd)))
-#define        BD_DOMAIN(bd)           (bd - bdclean)
-
-/* Maximum number of clean buffer domains. */
-#define        CLEAN_DOMAINS   8
-
 /* Configured number of clean queues. */
-static int __read_mostly clean_domains;
+static int __read_mostly buf_domains;
 
-struct bufdomain __exclusive_cache_line bdclean[CLEAN_DOMAINS];
+BITSET_DEFINE(bufdomainset, BUF_DOMAINS);
+struct bufdomain __exclusive_cache_line bdomain[BUF_DOMAINS];
+struct bufqueue __exclusive_cache_line bqempty;
 
-static void bq_remove(struct bufqueue *bq, struct buf *bp);
-static void bq_insert(struct bufqueue *bq, struct buf *bp, bool unlock);
-static int buf_recycle(struct bufdomain *, bool kva);
-static void bq_init(struct bufqueue *bq, int qindex, int cpu,
-           const char *lockname);
-static void bd_init(struct bufdomain *bd);
-static int bd_flushall(struct bufdomain *bd);
-
 /*
  * per-cpu empty buffer cache.
  */
@@ -393,6 +422,44 @@ sysctl_runningspace(SYSCTL_HANDLER_ARGS)
        return (error);
 }
 
+static int
+sysctl_bufdomain_int(SYSCTL_HANDLER_ARGS)
+{
+       int error;
+       int value;
+       int i;
+
+       value = *(int *)arg1;
+       error = sysctl_handle_int(oidp, &value, 0, req);
+       if (error != 0 || req->newptr == NULL)
+               return (error);
+       *(int *)arg1 = value;
+       for (i = 0; i < buf_domains; i++)
+               *(int *)(((uintptr_t)&bdomain[i]) + arg2) =
+                   value / buf_domains;
+
+       return (error);
+}
+
+static int
+sysctl_bufdomain_long(SYSCTL_HANDLER_ARGS)
+{
+       long value;
+       int error;
+       int i;
+
+       value = *(long *)arg1;
+       error = sysctl_handle_long(oidp, &value, 0, req);
+       if (error != 0 || req->newptr == NULL)
+               return (error);
+       *(long *)arg1 = value;
+       for (i = 0; i < buf_domains; i++)
+               *(long *)(((uintptr_t)&bdomain[i]) + arg2) =
+                   value / buf_domains;
+
+       return (error);
+}
+
 #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
     defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
 static int
@@ -403,8 +470,8 @@ sysctl_bufspace(SYSCTL_HANDLER_ARGS)
        int i;
 
        lvalue = 0;
-       for (i = 0; i < clean_domains; i++)
-               lvalue += bdclean[i].bd_bufspace;
+       for (i = 0; i < buf_domains; i++)
+               lvalue += bdomain[i].bd_bufspace;
        if (sizeof(int) == sizeof(long) || req->oldlen >= sizeof(long))
                return (sysctl_handle_long(oidp, &lvalue, 0, req));
        if (lvalue > INT_MAX)
@@ -421,12 +488,24 @@ sysctl_bufspace(SYSCTL_HANDLER_ARGS)
        int i;
 
        lvalue = 0;
-       for (i = 0; i < clean_domains; i++)
-               lvalue += bdclean[i].bd_bufspace;
+       for (i = 0; i < buf_domains; i++)
+               lvalue += bdomain[i].bd_bufspace;
        return (sysctl_handle_long(oidp, &lvalue, 0, req));
 }
 #endif
 
+static int
+sysctl_numdirtybuffers(SYSCTL_HANDLER_ARGS)
+{
+       int value;
+       int i;
+
+       value = 0;
+       for (i = 0; i < buf_domains; i++)
+               value += bdomain[i].bd_numdirtybuffers;
+       return (sysctl_handle_int(oidp, &value, 0, req));
+}
+
 /*
  *     bdirtywakeup:
  *
@@ -444,18 +523,59 @@ bdirtywakeup(void)
 }
 
 /*
+ *     bd_clear:
+ *
+ *     Clear a domain from the appropriate bitsets when dirtybuffers
+ *     is decremented.
+ */
+static void
+bd_clear(struct bufdomain *bd)
+{
+
+       mtx_lock(&bdirtylock);
+       if (bd->bd_numdirtybuffers <= bd->bd_lodirtybuffers)
+               BIT_CLR(BUF_DOMAINS, BD_DOMAIN(bd), &bdlodirty);
+       if (bd->bd_numdirtybuffers <= bd->bd_hidirtybuffers)
+               BIT_CLR(BUF_DOMAINS, BD_DOMAIN(bd), &bdhidirty);
+       mtx_unlock(&bdirtylock);
+}
+
+/*
+ *     bd_set:
+ *
+ *     Set a domain in the appropriate bitsets when dirtybuffers
+ *     is incremented.
+ */
+static void
+bd_set(struct bufdomain *bd)
+{
+
+       mtx_lock(&bdirtylock);
+       if (bd->bd_numdirtybuffers > bd->bd_lodirtybuffers)
+               BIT_SET(BUF_DOMAINS, BD_DOMAIN(bd), &bdlodirty);
+       if (bd->bd_numdirtybuffers > bd->bd_hidirtybuffers)
+               BIT_SET(BUF_DOMAINS, BD_DOMAIN(bd), &bdhidirty);
+       mtx_unlock(&bdirtylock);
+}
+
+/*
  *     bdirtysub:
  *
  *     Decrement the numdirtybuffers count by one and wakeup any
  *     threads blocked in bwillwrite().
  */
 static void
-bdirtysub(void)
+bdirtysub(struct buf *bp)
 {
+       struct bufdomain *bd;
+       int num;
 
-       if (atomic_fetchadd_int(&numdirtybuffers, -1) ==
-           (lodirtybuffers + hidirtybuffers) / 2)
+       bd = bufdomain(bp);
+       num = atomic_fetchadd_int(&bd->bd_numdirtybuffers, -1);
+       if (num == (bd->bd_lodirtybuffers + bd->bd_hidirtybuffers) / 2)
                bdirtywakeup();
+       if (num == bd->bd_lodirtybuffers || num == bd->bd_hidirtybuffers)
+               bd_clear(bd);
 }
 
 /*
@@ -465,16 +585,21 @@ bdirtysub(void)
  *     daemon if needed.
  */
 static void
-bdirtyadd(void)
+bdirtyadd(struct buf *bp)
 {
+       struct bufdomain *bd;
+       int num;
 
        /*
         * Only do the wakeup once as we cross the boundary.  The
         * buf daemon will keep running until the condition clears.
         */
-       if (atomic_fetchadd_int(&numdirtybuffers, 1) ==
-           (lodirtybuffers + hidirtybuffers) / 2)
+       bd = bufdomain(bp);
+       num = atomic_fetchadd_int(&bd->bd_numdirtybuffers, 1);
+       if (num == (bd->bd_lodirtybuffers + bd->bd_hidirtybuffers) / 2)
                bd_wakeup();
+       if (num == bd->bd_lodirtybuffers || num == bd->bd_hidirtybuffers)
+               bd_set(bd);
 }
 
 /*
@@ -539,11 +664,11 @@ bufspace_adjust(struct buf *bp, int bufsize)
 
        KASSERT((bp->b_flags & B_MALLOC) == 0,
            ("bufspace_adjust: malloc buf %p", bp));
-       bd = &bdclean[bp->b_domain];
+       bd = bufdomain(bp);
        diff = bufsize - bp->b_bufsize;
        if (diff < 0) {
                atomic_subtract_long(&bd->bd_bufspace, -diff);
-       } else {
+       } else if (diff > 0) {
                space = atomic_fetchadd_long(&bd->bd_bufspace, diff);
                /* Wake up the daemon on the transition. */
                if (space < bd->bd_bufspacethresh &&
@@ -638,7 +763,7 @@ bufspace_wait(struct bufdomain *bd, struct vnode *vp, 
                         * recursion.
                         */
                        td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF;
-                       fl = buf_flush(vp, flushbufqtarget);
+                       fl = buf_flush(vp, bd, flushbufqtarget);
                        td->td_pflags &= norunbuf;
                        BD_LOCK(bd);
                        if (fl != 0)
@@ -700,6 +825,15 @@ bufspace_daemon(void *arg)
                        if (buf_recycle(bd, false) != 0) {
                                if (bd_flushall(bd))
                                        continue;
+                               /*
+                                * Speedup dirty if we've run out of clean
+                                * buffers.  This is possible in particular
+                                * because softdep may hold many bufs locked
+                                * pending writes to other bufs which are
+                                * marked for delayed write, exhausting
+                                * clean space until they are written.
+                                */
+                               bd_speedup();
                                BD_LOCK(bd);
                                if (bd->bd_wanted) {
                                        msleep(&bd->bd_wanted, BD_LOCKPTR(bd),
@@ -1025,7 +1159,6 @@ bufinit(void)
            ("maxbcachebuf (%d) must be >= MAXBSIZE (%d)\n", maxbcachebuf,
            MAXBSIZE));
        bq_init(&bqempty, QUEUE_EMPTY, -1, "bufq empty lock");
-       bq_init(&bqdirty, QUEUE_DIRTY, -1, "bufq dirty lock");
        mtx_init(&rbreqlock, "runningbufspace lock", NULL, MTX_DEF);
        mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF);
        mtx_init(&bdirtylock, "dirty buf lock", NULL, MTX_DEF);
@@ -1093,7 +1226,6 @@ bufinit(void)
         */
        hidirtybuffers = nbuf / 4 + 20;
        dirtybufthresh = hidirtybuffers * 9 / 10;
-       numdirtybuffers = 0;
        /*
         * To support extreme low-memory systems, make sure hidirtybuffers
         * cannot eat up all available buffer space.  This occurs when our
@@ -1128,22 +1260,26 @@ bufinit(void)
         * One queue per-256mb up to the max.  More queues gives better
         * concurrency but less accurate LRU.
         */
-       clean_domains = MIN(howmany(maxbufspace, 256*1024*1024), CLEAN_DOMAINS);
-       for (i = 0 ; i < clean_domains; i++) {
+       buf_domains = MIN(howmany(maxbufspace, 256*1024*1024), BUF_DOMAINS);
+       for (i = 0 ; i < buf_domains; i++) {
                struct bufdomain *bd;
 
-               bd = &bdclean[i];
+               bd = &bdomain[i];
                bd_init(bd);
-               bd->bd_freebuffers = nbuf / clean_domains;
-               bd->bd_hifreebuffers = hifreebuffers / clean_domains;
-               bd->bd_lofreebuffers = lofreebuffers / clean_domains;
+               bd->bd_freebuffers = nbuf / buf_domains;
+               bd->bd_hifreebuffers = hifreebuffers / buf_domains;
+               bd->bd_lofreebuffers = lofreebuffers / buf_domains;
                bd->bd_bufspace = 0;
-               bd->bd_maxbufspace = maxbufspace / clean_domains;
-               bd->bd_hibufspace = hibufspace / clean_domains;
-               bd->bd_lobufspace = lobufspace / clean_domains;
-               bd->bd_bufspacethresh = bufspacethresh / clean_domains;
+               bd->bd_maxbufspace = maxbufspace / buf_domains;
+               bd->bd_hibufspace = hibufspace / buf_domains;
+               bd->bd_lobufspace = lobufspace / buf_domains;
+               bd->bd_bufspacethresh = bufspacethresh / buf_domains;
+               bd->bd_numdirtybuffers = 0;
+               bd->bd_hidirtybuffers = hidirtybuffers / buf_domains;
+               bd->bd_lodirtybuffers = lodirtybuffers / buf_domains;
+               bd->bd_dirtybufthresh = dirtybufthresh / buf_domains;
                /* Don't allow more than 2% of bufs in the per-cpu caches. */
-               bd->bd_lim = nbuf / clean_domains / 50 / mp_ncpus;
+               bd->bd_lim = nbuf / buf_domains / 50 / mp_ncpus;
        }
        getnewbufcalls = counter_u64_alloc(M_WAITOK);
        getnewbufrestarts = counter_u64_alloc(M_WAITOK);
@@ -1327,6 +1463,13 @@ bpmap_qenter(struct buf *bp)
            (vm_offset_t)(bp->b_offset & PAGE_MASK));
 }
 
+static inline struct bufdomain *
+bufdomain(struct buf *bp)
+{
+
+       return (&bdomain[bp->b_domain]);
+}
+
 static struct bufqueue *
 bufqueue(struct buf *bp)
 {
@@ -1339,9 +1482,9 @@ bufqueue(struct buf *bp)
        case QUEUE_EMPTY:
                return (&bqempty);
        case QUEUE_DIRTY:
-               return (&bqdirty);
+               return (&bufdomain(bp)->bd_dirtyq);
        case QUEUE_CLEAN:
-               return (&bdclean[bp->b_domain].bd_subq[bp->b_subqueue]);
+               return (&bufdomain(bp)->bd_subq[bp->b_subqueue]);
        default:
                break;
        }
@@ -1404,14 +1547,14 @@ binsfree(struct buf *bp, int qindex)
                bq_remove(bq, bp);
                BQ_UNLOCK(bq);
        }
+       bd = bufdomain(bp);
        if (qindex == QUEUE_CLEAN) {
-               bd = &bdclean[bp->b_domain];
                if (bd->bd_lim != 0)
                        bq = &bd->bd_subq[PCPU_GET(cpuid)];
                else
                        bq = bd->bd_cleanq;
        } else
-               bq = &bqdirty;
+               bq = &bd->bd_dirtyq;
        bq_insert(bq, bp, true);
 }
 
@@ -1439,7 +1582,7 @@ buf_free(struct buf *bp)
        if (!LIST_EMPTY(&bp->b_dep))
                buf_deallocate(bp);
        bufkva_free(bp);
-       atomic_add_int(&bdclean[bp->b_domain].bd_freebuffers, 1);
+       atomic_add_int(&bufdomain(bp)->bd_freebuffers, 1);
        BUF_UNLOCK(bp);
        uma_zfree(buf_zone, bp);
 }
@@ -1715,9 +1858,10 @@ bd_init(struct bufdomain *bd)
        int domain;
        int i;
 
-       domain = bd - bdclean;
+       domain = bd - bdomain;
        bd->bd_cleanq = &bd->bd_subq[mp_maxid + 1];
        bq_init(bd->bd_cleanq, QUEUE_CLEAN, mp_maxid + 1, "bufq clean lock");
+       bq_init(&bd->bd_dirtyq, QUEUE_DIRTY, -1, "bufq dirty lock");
        for (i = 0; i <= mp_maxid; i++)
                bq_init(&bd->bd_subq[i], QUEUE_CLEAN, i,
                    "bufq clean subqueue lock");
@@ -1809,7 +1953,7 @@ bq_insert(struct bufqueue *bq, struct buf *bp, bool un
        if (bp->b_qindex != QUEUE_NONE)
                panic("bq_insert: free buffer %p onto another queue?", bp);
 
-       bd = &bdclean[bp->b_domain];
+       bd = bufdomain(bp);
        if (bp->b_flags & B_AGE) {
                /* Place this buf directly on the real queue. */
                if (bq->bq_index == QUEUE_CLEAN)
@@ -1926,8 +2070,8 @@ bufkva_reclaim(vmem_t *vmem, int flags)
 
        done = false;
        for (i = 0; i < 5; i++) {
-               for (q = 0; q < clean_domains; q++)
-                       if (buf_recycle(&bdclean[q], true) != 0)
+               for (q = 0; q < buf_domains; q++)
+                       if (buf_recycle(&bdomain[q], true) != 0)
                                done = true;
                if (done)
                        break;
@@ -2319,7 +2463,7 @@ bdirty(struct buf *bp)
        if ((bp->b_flags & B_DELWRI) == 0) {
                bp->b_flags |= /* XXX B_DONE | */ B_DELWRI;
                reassignbuf(bp);
-               bdirtyadd();
+               bdirtyadd(bp);
        }
 }
 
@@ -2347,7 +2491,7 @@ bundirty(struct buf *bp)
        if (bp->b_flags & B_DELWRI) {
                bp->b_flags &= ~B_DELWRI;
                reassignbuf(bp);
-               bdirtysub();
+               bdirtysub(bp);
        }
        /*
         * Since it is now being written, we can clear its deferred write flag.
@@ -2419,9 +2563,9 @@ void
 bwillwrite(void)
 {
 
-       if (numdirtybuffers >= hidirtybuffers) {
+       if (buf_dirty_count_severe()) {
                mtx_lock(&bdirtylock);
-               while (numdirtybuffers >= hidirtybuffers) {
+               while (buf_dirty_count_severe()) {
                        bdirtywait = 1;
                        msleep(&bdirtywait, &bdirtylock, (PRIBIO + 4),
                            "flswai", 0);
@@ -2437,7 +2581,7 @@ int
 buf_dirty_count_severe(void)
 {
 
-       return(numdirtybuffers >= hidirtybuffers);
+       return (!BIT_EMPTY(BUF_DOMAINS, &bdhidirty));
 }
 
 /*
@@ -2523,7 +2667,7 @@ brelse(struct buf *bp)
                if (!LIST_EMPTY(&bp->b_dep))
                        buf_deallocate(bp);
                if (bp->b_flags & B_DELWRI)
-                       bdirtysub();
+                       bdirtysub(bp);
                bp->b_flags &= ~(B_DELWRI | B_CACHE);
                if ((bp->b_flags & B_VMIO) == 0) {
                        allocbuf(bp, 0);
@@ -3138,9 +3282,9 @@ getnewbuf(struct vnode *vp, int slpflag, int slptimeo,
        else
                metadata = false;
        if (vp == NULL)
-               bd = &bdclean[0];
+               bd = &bdomain[0];
        else
-               bd = &bdclean[vp->v_bufobj.bo_domain];
+               bd = &bdomain[vp->v_bufobj.bo_domain];
 
        counter_u64_add(getnewbufcalls, 1);
        reserved = false;
@@ -3186,11 +3330,11 @@ static struct kproc_desc buf_kp = {
 SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp);
 
 static int
-buf_flush(struct vnode *vp, int target)
+buf_flush(struct vnode *vp, struct bufdomain *bd, int target)
 {
        int flushed;
 
-       flushed = flushbufqueues(vp, target, 0);
+       flushed = flushbufqueues(vp, bd, target, 0);
        if (flushed == 0) {
                /*
                 * Could not find any buffers without rollback
@@ -3199,7 +3343,7 @@ buf_flush(struct vnode *vp, int target)
                 */
                if (vp != NULL && target > 2)
                        target /= 2;
-               flushbufqueues(vp, target, 1);
+               flushbufqueues(vp, bd, target, 1);
        }
        return (flushed);
 }
@@ -3207,6 +3351,8 @@ buf_flush(struct vnode *vp, int target)
 static void
 buf_daemon()
 {
+       struct bufdomain *bd;
+       int speedupreq;
        int lodirty;
        int i;
 
@@ -3219,11 +3365,11 @@ buf_daemon()
        /*
         * Start the buf clean daemons as children threads.
         */
-       for (i = 0 ; i < clean_domains; i++) {
+       for (i = 0 ; i < buf_domains; i++) {
                int error;
 
                error = kthread_add((void (*)(void *))bufspace_daemon,
-                   &bdclean[i], curproc, NULL, 0, 0, "bufspacedaemon-%d", i);
+                   &bdomain[i], curproc, NULL, 0, 0, "bufspacedaemon-%d", i);
                if (error)
                        panic("error %d spawning bufspace daemon", error);
        }
@@ -3238,20 +3384,30 @@ buf_daemon()
                mtx_unlock(&bdlock);
 
                kproc_suspend_check(bufdaemonproc);
-               lodirty = lodirtybuffers;
-               if (bd_speedupreq) {
-                       lodirty = numdirtybuffers / 2;
-                       bd_speedupreq = 0;
-               }
+
                /*
-                * Do the flush.  Limit the amount of in-transit I/O we
-                * allow to build up, otherwise we would completely saturate
-                * the I/O system.
+                * Save speedupreq for this pass and reset to capture new
+                * requests.
                 */
-               while (numdirtybuffers > lodirty) {
-                       if (buf_flush(NULL, numdirtybuffers - lodirty) == 0)
-                               break;
-                       kern_yield(PRI_USER);
+               speedupreq = bd_speedupreq;
+               bd_speedupreq = 0;
+
+               /*
+                * Flush each domain sequentially according to its level and
+                * the speedup request.
+                */
+               for (i = 0; i < buf_domains; i++) {
+                       bd = &bdomain[i];
+                       if (speedupreq)
+                               lodirty = bd->bd_numdirtybuffers / 2;
+                       else
+                               lodirty = bd->bd_lodirtybuffers;
+                       while (bd->bd_numdirtybuffers > lodirty) {
+                               if (buf_flush(NULL, bd,
+                                   bd->bd_numdirtybuffers - lodirty) == 0)
+                                       break;
+                               kern_yield(PRI_USER);
+                       }
                }
 
                /*
@@ -3265,7 +3421,7 @@ buf_daemon()
                 * to avoid endless loops on unlockable buffers.
                 */
                mtx_lock(&bdlock);
-               if (numdirtybuffers <= lodirtybuffers) {
+               if (!BIT_EMPTY(BUF_DOMAINS, &bdlodirty)) {
                        /*
                         * We reached our low water mark, reset the
                         * request and sleep until we are needed again.
@@ -3304,7 +3460,8 @@ SYSCTL_INT(_vfs, OID_AUTO, flushwithdeps, CTLFLAG_RW, 
     0, "Number of buffers flushed with dependecies that require rollbacks");
 
 static int
-flushbufqueues(struct vnode *lvp, int target, int flushdeps)
+flushbufqueues(struct vnode *lvp, struct bufdomain *bd, int target,
+    int flushdeps)
 {
        struct bufqueue *bq;
        struct buf *sentinel;
@@ -3317,7 +3474,7 @@ flushbufqueues(struct vnode *lvp, int target, int flus
        bool unlock;
 
        flushed = 0;
-       bq = &bqdirty;
+       bq = &bd->bd_dirtyq;
        bp = NULL;
        sentinel = malloc(sizeof(struct buf), M_TEMP, M_WAITOK | M_ZERO);
        sentinel->b_qindex = QUEUE_SENTINEL;
@@ -3653,7 +3810,7 @@ bp_unmapped_get_kva(struct buf *bp, daddr_t blkno, int
                        panic("GB_NOWAIT_BD and GB_UNMAPPED %p", bp);
                }
                counter_u64_add(mappingrestarts, 1);
-               bufspace_wait(&bdclean[bp->b_domain], bp->b_vp, gbflags, 0, 0);
+               bufspace_wait(bufdomain(bp), bp->b_vp, gbflags, 0, 0);
        }
 has_addr:
        if (need_mapping) {
@@ -3851,7 +4008,7 @@ loop:
                 */
                if (flags & GB_NOCREAT)
                        return NULL;
-               if (bdclean[bo->bo_domain].bd_freebuffers == 0 &&
+               if (bdomain[bo->bo_domain].bd_freebuffers == 0 &&
                    TD_IS_IDLETHREAD(curthread))
                        return NULL;
 
@@ -3908,7 +4065,7 @@ loop:
                if (gbincore(bo, blkno)) {
                        BO_UNLOCK(bo);
                        bp->b_flags |= B_INVAL;
-                       bufspace_release(&bdclean[bp->b_domain], maxsize);
+                       bufspace_release(bufdomain(bp), maxsize);
                        brelse(bp);
                        goto loop;
                }
@@ -3943,7 +4100,7 @@ loop:
                }
 
                allocbuf(bp, size);
-               bufspace_release(&bdclean[bp->b_domain], maxsize);
+               bufspace_release(bufdomain(bp), maxsize);
                bp->b_flags &= ~B_DONE;
        }
        CTR4(KTR_BUF, "getblk(%p, %ld, %d) = %p", vp, (long)blkno, size, bp);
@@ -3972,7 +4129,7 @@ geteblk(int size, int flags)
                        return (NULL);
        }
        allocbuf(bp, size);
-       bufspace_release(&bdclean[bp->b_domain], maxsize);
+       bufspace_release(bufdomain(bp), maxsize);
        bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */
        BUF_ASSERT_HELD(bp);
        return (bp);
@@ -4841,7 +4998,7 @@ bufobj_init(struct bufobj *bo, void *private)
        static volatile int bufobj_cleanq;
 
         bo->bo_domain =
-            atomic_fetchadd_int(&bufobj_cleanq, 1) % clean_domains;
+            atomic_fetchadd_int(&bufobj_cleanq, 1) % buf_domains;
         rw_init(BO_LOCKPTR(bo), "bufobj interlock");
         bo->bo_private = private;
         TAILQ_INIT(&bo->bo_clean.bv_hd);
@@ -5164,6 +5321,7 @@ DB_SHOW_COMMAND(buffer, db_show_buffer)
                }
                db_printf("\n");
        }
+       BUF_LOCKPRINTINFO(bp);
 #if defined(FULL_BUF_TRACKING)
        db_printf("b_io_tracking: b_io_tcnt = %u\n", bp->b_io_tcnt);
 
@@ -5178,19 +5336,19 @@ DB_SHOW_COMMAND(buffer, db_show_buffer)
        db_printf("b_io_tracking: %s\n", bp->b_io_tracking);
 #endif
        db_printf(" ");
-       BUF_LOCKPRINTINFO(bp);
 }
 
 DB_SHOW_COMMAND(bufqueues, bufqueues)
 {
        struct bufdomain *bd;
-       int i, j;
+       struct buf *bp;
+       long total;
+       int i, j, cnt;
 
        db_printf("bqempty: %d\n", bqempty.bq_len);
-       db_printf("bqdirty: %d\n", bqdirty.bq_len);
 
-       for (i = 0; i < clean_domains; i++) {
-               bd = &bdclean[i];
+       for (i = 0; i < buf_domains; i++) {
+               bd = &bdomain[i];
                db_printf("Buf domain %d\n", i);
                db_printf("\tfreebufs\t%d\n", bd->bd_freebuffers);
                db_printf("\tlofreebufs\t%d\n", bd->bd_lofreebuffers);
@@ -5202,13 +5360,43 @@ DB_SHOW_COMMAND(bufqueues, bufqueues)
                db_printf("\tlobufspace\t%ld\n", bd->bd_lobufspace);
                db_printf("\tbufspacethresh\t%ld\n", bd->bd_bufspacethresh);
                db_printf("\n");
-               db_printf("\tcleanq count\t%d\n", bd->bd_cleanq->bq_len);
+               db_printf("\tnumdirtybuffers\t%d\n", bd->bd_numdirtybuffers);
+               db_printf("\tlodirtybuffers\t%d\n", bd->bd_lodirtybuffers);
+               db_printf("\thidirtybuffers\t%d\n", bd->bd_hidirtybuffers);
+               db_printf("\tdirtybufthresh\t%d\n", bd->bd_dirtybufthresh);
+               db_printf("\n");
+               total = 0;
+               TAILQ_FOREACH(bp, &bd->bd_cleanq->bq_queue, b_freelist)
+                       total += bp->b_bufsize;
+               db_printf("\tcleanq count\t%d (%ld)\n",
+                   bd->bd_cleanq->bq_len, total);
+               total = 0;
+               TAILQ_FOREACH(bp, &bd->bd_dirtyq.bq_queue, b_freelist)
+                       total += bp->b_bufsize;
+               db_printf("\tdirtyq count\t%d (%ld)\n",
+                   bd->bd_dirtyq.bq_len, total);
                db_printf("\twakeup\t\t%d\n", bd->bd_wanted);
                db_printf("\tlim\t\t%d\n", bd->bd_lim);
                db_printf("\tCPU ");
                for (j = 0; j <= mp_maxid; j++)
                        db_printf("%d, ", bd->bd_subq[j].bq_len);
                db_printf("\n");
+               cnt = 0;
+               total = 0;
+               for (j = 0; j < nbuf; j++)
+                       if (buf[j].b_domain == i && BUF_ISLOCKED(&buf[j])) {
+                               cnt++;
+                               total += buf[j].b_bufsize;
+                       }
+               db_printf("\tLocked buffers: %d space %ld\n", cnt, total);
+               cnt = 0;
+               total = 0;
+               for (j = 0; j < nbuf; j++)
+                       if (buf[j].b_domain == i) {
+                               cnt++;
+                               total += buf[j].b_bufsize;
+                       }
+               db_printf("\tTotal buffers: %d space %ld\n", cnt, total);
        }
 }
 