Author: kib
Date: Mon Mar 16 15:39:46 2009
New Revision: 189878
URL: http://svn.freebsd.org/changeset/base/189878

Log:
  Fix two issues with bufdaemon, often causing the processes to hang in
  the "nbufkv" sleep.
  
  First, ffs background cg group block write requests a new buffer for
  the shadow copy. When ffs_bufwrite() is called from the bufdaemon due
  to buffer shortage, requesting the buffer deadlocks the bufdaemon.
  Introduce a new flag for getnewbuf(), GB_NOWAIT_BD, to request getblk
  to not block while allocating the buffer, and return failure
  instead. Add a flags argument to geteblk() to allow passing the flags
  to getblk(). Do not repeat the getnewbuf() call from geteblk if buffer
  allocation failed and either GB_NOWAIT_BD is specified, or geteblk()
  is called from bufdaemon (or its helper, see below). In
  ffs_bufwrite(), fall back to synchronous cg block write if shadow
  block allocation failed.
  
  Since r107847, buffer write assumes that vnode owning the buffer is
  locked. The second problem is that buffer cache may accumulate many
  buffers belonging to limited number of vnodes. With such workload,
  quite often threads that own the mentioned vnode locks are trying to
  read another block from the vnodes, and, due to buffer cache
  exhaustion, are asking bufdaemon for help. Bufdaemon is unable to make
  any substantial progress because the vnodes are locked.
  
  Allow the threads owning vnode locks to help the bufdaemon by doing
  the flush pass over the buffer cache before getnewbuf() is going to
  uninterruptible sleep. Move the flushing code from buf_daemon() to new
  helper function buf_do_flush(), that is called from getnewbuf().  The
  number of buffers flushed by single call to buf_do_flush() from
  getnewbuf() is limited by new sysctl vfs.flushbufqtarget.  Prevent
  recursive calls to buf_do_flush() by marking the bufdaemon and threads
  that temporarily help bufdaemon by TDP_BUFNEED flag.
  
  In collaboration with:        pho
  Reviewed by:   tegge (previous version)
  Tested by:     glebius, yandex ...
  MFC after:     3 weeks

Modified:
  head/sys/gnu/fs/xfs/FreeBSD/xfs_buf.c
  head/sys/kern/vfs_bio.c
  head/sys/sys/buf.h
  head/sys/sys/proc.h
  head/sys/ufs/ffs/ffs_vfsops.c

Modified: head/sys/gnu/fs/xfs/FreeBSD/xfs_buf.c
==============================================================================
--- head/sys/gnu/fs/xfs/FreeBSD/xfs_buf.c       Mon Mar 16 15:09:47 2009        
(r189877)
+++ head/sys/gnu/fs/xfs/FreeBSD/xfs_buf.c       Mon Mar 16 15:39:46 2009        
(r189878)
@@ -81,7 +81,7 @@ xfs_buf_get_empty(size_t size,  xfs_buft
 {
        struct buf *bp;
 
-       bp = geteblk(0);
+       bp = geteblk(0, 0);
        if (bp != NULL) {
                bp->b_bufsize = size;
                bp->b_bcount = size;
@@ -100,7 +100,7 @@ xfs_buf_get_noaddr(size_t len, xfs_bufta
        if (len >= MAXPHYS)
                return (NULL);
 
-       bp = geteblk(len);
+       bp = geteblk(len, 0);
        if (bp != NULL) {
                BUF_ASSERT_HELD(bp);
 

Modified: head/sys/kern/vfs_bio.c
==============================================================================
--- head/sys/kern/vfs_bio.c     Mon Mar 16 15:09:47 2009        (r189877)
+++ head/sys/kern/vfs_bio.c     Mon Mar 16 15:39:46 2009        (r189878)
@@ -106,7 +106,8 @@ static void vfs_setdirty_locked_object(s
 static void vfs_vmio_release(struct buf *bp);
 static int vfs_bio_clcheck(struct vnode *vp, int size,
                daddr_t lblkno, daddr_t blkno);
-static int flushbufqueues(int, int);
+static int buf_do_flush(struct vnode *vp);
+static int flushbufqueues(struct vnode *, int, int);
 static void buf_daemon(void);
 static void bremfreel(struct buf *bp);
 #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
@@ -198,6 +199,9 @@ SYSCTL_INT(_vfs, OID_AUTO, getnewbufcall
 static int getnewbufrestarts;
 SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RW, &getnewbufrestarts, 
0,
     "Number of times getnewbuf has had to restart a buffer aquisition");
+static int flushbufqtarget = 100;
+SYSCTL_INT(_vfs, OID_AUTO, flushbufqtarget, CTLFLAG_RW, &flushbufqtarget, 0,
+    "Amount of work to do in flushbufqueues when helping bufdaemon");
 
 /*
  * Wakeup point for bufdaemon, as well as indicator of whether it is already
@@ -258,6 +262,7 @@ static struct mtx nblock;
 #define QUEUE_DIRTY_GIANT 3    /* B_DELWRI buffers that need giant */
 #define QUEUE_EMPTYKVA 4       /* empty buffer headers w/KVA assignment */
 #define QUEUE_EMPTY    5       /* empty buffer headers */
+#define QUEUE_SENTINEL 1024    /* not an queue index, but mark for sentinel */
 
 /* Queues for free buffers with various properties */
 static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES] = { { 0 } };
@@ -1710,21 +1715,23 @@ vfs_bio_awrite(struct buf *bp)
  */
 
 static struct buf *
-getnewbuf(int slpflag, int slptimeo, int size, int maxsize)
+getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int size, int maxsize,
+    int gbflags)
 {
+       struct thread *td;
        struct buf *bp;
        struct buf *nbp;
        int defrag = 0;
        int nqindex;
        static int flushingbufs;
 
+       td = curthread;
        /*
         * We can't afford to block since we might be holding a vnode lock,
         * which may prevent system daemons from running.  We deal with
         * low-memory situations by proactively returning memory and running
         * async I/O rather then sync I/O.
         */
-
        atomic_add_int(&getnewbufcalls, 1);
        atomic_subtract_int(&getnewbufrestarts, 1);
 restart:
@@ -1956,8 +1963,9 @@ restart:
         */
 
        if (bp == NULL) {
-               int flags;
+               int flags, norunbuf;
                char *waitmsg;
+               int fl;
 
                if (defrag) {
                        flags = VFS_BIO_NEED_BUFSPACE;
@@ -1975,9 +1983,35 @@ restart:
                mtx_unlock(&bqlock);
 
                bd_speedup();   /* heeeelp */
+               if (gbflags & GB_NOWAIT_BD)
+                       return (NULL);
 
                mtx_lock(&nblock);
                while (needsbuffer & flags) {
+                       if (vp != NULL && (td->td_pflags & TDP_BUFNEED) == 0) {
+                               mtx_unlock(&nblock);
+                               /*
+                                * getblk() is called with a vnode
+                                * locked, and some majority of the
+                                * dirty buffers may as well belong to
+                                * the vnode. Flushing the buffers
+                                * there would make a progress that
+                                * cannot be achieved by the
+                                * buf_daemon, that cannot lock the
+                                * vnode.
+                                */
+                               norunbuf = ~(TDP_BUFNEED | TDP_NORUNNINGBUF) |
+                                   (td->td_pflags & TDP_NORUNNINGBUF);
+                               /* play bufdaemon */
+                               td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF;
+                               fl = buf_do_flush(vp);
+                               td->td_pflags &= norunbuf;
+                               mtx_lock(&nblock);
+                               if (fl != 0)
+                                       continue;
+                               if ((needsbuffer & flags) == 0)
+                                       break;
+                       }
                        if (msleep(&needsbuffer, &nblock,
                            (PRIBIO + 4) | slpflag, waitmsg, slptimeo)) {
                                mtx_unlock(&nblock);
@@ -2046,6 +2080,35 @@ static struct kproc_desc buf_kp = {
 };
 SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp);
 
+static int
+buf_do_flush(struct vnode *vp)
+{
+       int flushed;
+
+       flushed = flushbufqueues(vp, QUEUE_DIRTY, 0);
+       /* The list empty check here is slightly racy */
+       if (!TAILQ_EMPTY(&bufqueues[QUEUE_DIRTY_GIANT])) {
+               mtx_lock(&Giant);
+               flushed += flushbufqueues(vp, QUEUE_DIRTY_GIANT, 0);
+               mtx_unlock(&Giant);
+       }
+       if (flushed == 0) {
+               /*
+                * Could not find any buffers without rollback
+                * dependencies, so just write the first one
+                * in the hopes of eventually making progress.
+                */
+               flushbufqueues(vp, QUEUE_DIRTY, 1);
+               if (!TAILQ_EMPTY(
+                           &bufqueues[QUEUE_DIRTY_GIANT])) {
+                       mtx_lock(&Giant);
+                       flushbufqueues(vp, QUEUE_DIRTY_GIANT, 1);
+                       mtx_unlock(&Giant);
+               }
+       }
+       return (flushed);
+}
+
 static void
 buf_daemon()
 {
@@ -2059,7 +2122,7 @@ buf_daemon()
        /*
         * This process is allowed to take the buffer cache to the limit
         */
-       curthread->td_pflags |= TDP_NORUNNINGBUF;
+       curthread->td_pflags |= TDP_NORUNNINGBUF | TDP_BUFNEED;
        mtx_lock(&bdlock);
        for (;;) {
                bd_request = 0;
@@ -2074,30 +2137,8 @@ buf_daemon()
                 * normally would so they can run in parallel with our drain.
                 */
                while (numdirtybuffers > lodirtybuffers) {
-                       int flushed;
-
-                       flushed = flushbufqueues(QUEUE_DIRTY, 0);
-                       /* The list empty check here is slightly racy */
-                       if (!TAILQ_EMPTY(&bufqueues[QUEUE_DIRTY_GIANT])) {
-                               mtx_lock(&Giant);
-                               flushed += flushbufqueues(QUEUE_DIRTY_GIANT, 0);
-                               mtx_unlock(&Giant);
-                       }
-                       if (flushed == 0) {
-                               /*
-                                * Could not find any buffers without rollback
-                                * dependencies, so just write the first one
-                                * in the hopes of eventually making progress.
-                                */
-                               flushbufqueues(QUEUE_DIRTY, 1);
-                               if (!TAILQ_EMPTY(
-                                   &bufqueues[QUEUE_DIRTY_GIANT])) {
-                                       mtx_lock(&Giant);
-                                       flushbufqueues(QUEUE_DIRTY_GIANT, 1);
-                                       mtx_unlock(&Giant);
-                               }
+                       if (buf_do_flush(NULL) == 0)
                                break;
-                       }
                        uio_yield();
                }
 
@@ -2143,7 +2184,7 @@ SYSCTL_INT(_vfs, OID_AUTO, flushwithdeps
     0, "Number of buffers flushed with dependecies that require rollbacks");
 
 static int
-flushbufqueues(int queue, int flushdeps)
+flushbufqueues(struct vnode *lvp, int queue, int flushdeps)
 {
        struct buf sentinel;
        struct vnode *vp;
@@ -2153,20 +2194,37 @@ flushbufqueues(int queue, int flushdeps)
        int flushed;
        int target;
 
-       target = numdirtybuffers - lodirtybuffers;
-       if (flushdeps && target > 2)
-               target /= 2;
+       if (lvp == NULL) {
+               target = numdirtybuffers - lodirtybuffers;
+               if (flushdeps && target > 2)
+                       target /= 2;
+       } else
+               target = flushbufqtarget;
        flushed = 0;
        bp = NULL;
+       sentinel.b_qindex = QUEUE_SENTINEL;
        mtx_lock(&bqlock);
-       TAILQ_INSERT_TAIL(&bufqueues[queue], &sentinel, b_freelist);
+       TAILQ_INSERT_HEAD(&bufqueues[queue], &sentinel, b_freelist);
        while (flushed != target) {
-               bp = TAILQ_FIRST(&bufqueues[queue]);
-               if (bp == &sentinel)
+               bp = TAILQ_NEXT(&sentinel, b_freelist);
+               if (bp != NULL) {
+                       TAILQ_REMOVE(&bufqueues[queue], &sentinel, b_freelist);
+                       TAILQ_INSERT_AFTER(&bufqueues[queue], bp, &sentinel,
+                           b_freelist);
+               } else
                        break;
-               TAILQ_REMOVE(&bufqueues[queue], bp, b_freelist);
-               TAILQ_INSERT_TAIL(&bufqueues[queue], bp, b_freelist);
-
+               /*
+                * Skip sentinels inserted by other invocations of the
+                * flushbufqueues(), taking care to not reorder them.
+                */
+               if (bp->b_qindex == QUEUE_SENTINEL)
+                       continue;
+               /*
+                * Only flush the buffers that belong to the
+                * vnode locked by the curthread.
+                */
+               if (lvp != NULL && bp->b_vp != lvp)
+                       continue;
                if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
                        continue;
                if (bp->b_pin_count > 0) {
@@ -2214,16 +2272,27 @@ flushbufqueues(int queue, int flushdeps)
                        BUF_UNLOCK(bp);
                        continue;
                }
-               if (vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT) == 0) {
+               if (vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_CANRECURSE) == 0) 
{
                        mtx_unlock(&bqlock);
                        CTR3(KTR_BUF, "flushbufqueue(%p) vp %p flags %X",
                            bp, bp->b_vp, bp->b_flags);
-                       vfs_bio_awrite(bp);
+                       if (curproc == bufdaemonproc)
+                               vfs_bio_awrite(bp);
+                       else {
+                               bremfree(bp);
+                               bwrite(bp);
+                       }
                        vn_finished_write(mp);
                        VOP_UNLOCK(vp, 0);
                        flushwithdeps += hasdeps;
                        flushed++;
-                       waitrunningbufspace();
+
+                       /*
+                        * Sleeping on runningbufspace while holding
+                        * vnode lock leads to deadlock.
+                        */
+                       if (curproc == bufdaemonproc)
+                               waitrunningbufspace();
                        numdirtywakeup((lodirtybuffers + hidirtybuffers) / 2);
                        mtx_lock(&bqlock);
                        continue;
@@ -2605,7 +2674,7 @@ loop:
                maxsize = vmio ? size + (offset & PAGE_MASK) : size;
                maxsize = imax(maxsize, bsize);
 
-               bp = getnewbuf(slpflag, slptimeo, size, maxsize);
+               bp = getnewbuf(vp, slpflag, slptimeo, size, maxsize, flags);
                if (bp == NULL) {
                        if (slpflag || slptimeo)
                                return NULL;
@@ -2680,14 +2749,17 @@ loop:
  * set to B_INVAL.
  */
 struct buf *
-geteblk(int size)
+geteblk(int size, int flags)
 {
        struct buf *bp;
        int maxsize;
 
        maxsize = (size + BKVAMASK) & ~BKVAMASK;
-       while ((bp = getnewbuf(0, 0, size, maxsize)) == 0)
-               continue;
+       while ((bp = getnewbuf(NULL, 0, 0, size, maxsize, flags)) == NULL) {
+               if ((flags & GB_NOWAIT_BD) &&
+                   (curthread->td_pflags & TDP_BUFNEED) != 0)
+                       return (NULL);
+       }
        allocbuf(bp, size);
        bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */
        BUF_ASSERT_HELD(bp);

Modified: head/sys/sys/buf.h
==============================================================================
--- head/sys/sys/buf.h  Mon Mar 16 15:09:47 2009        (r189877)
+++ head/sys/sys/buf.h  Mon Mar 16 15:39:46 2009        (r189878)
@@ -443,6 +443,7 @@ buf_countdeps(struct buf *bp, int i)
  */
 #define        GB_LOCK_NOWAIT  0x0001          /* Fail if we block on a buf 
lock. */
 #define        GB_NOCREAT      0x0002          /* Don't create a buf if not 
found. */
+#define        GB_NOWAIT_BD    0x0004          /* Do not wait for bufdaemon */
 
 #ifdef _KERNEL
 extern int     nbuf;                   /* The number of buffer headers */
@@ -487,7 +488,7 @@ struct buf *     getpbuf(int *);
 struct buf *incore(struct bufobj *, daddr_t);
 struct buf *gbincore(struct bufobj *, daddr_t);
 struct buf *getblk(struct vnode *, daddr_t, int, int, int, int);
-struct buf *geteblk(int);
+struct buf *geteblk(int, int);
 int    bufwait(struct buf *);
 int    bufwrite(struct buf *);
 void   bufdone(struct buf *);

Modified: head/sys/sys/proc.h
==============================================================================
--- head/sys/sys/proc.h Mon Mar 16 15:09:47 2009        (r189877)
+++ head/sys/sys/proc.h Mon Mar 16 15:39:46 2009        (r189878)
@@ -345,7 +345,7 @@ do {                                                        
                \
 #define        TDP_OLDMASK     0x00000001 /* Need to restore mask after 
suspend. */
 #define        TDP_INKTR       0x00000002 /* Thread is currently in KTR code. 
*/
 #define        TDP_INKTRACE    0x00000004 /* Thread is currently in KTRACE 
code. */
-#define        TDP_UNUSED8     0x00000008 /* available */
+#define        TDP_BUFNEED     0x00000008 /* Do not recurse into the buf flush 
*/
 #define        TDP_COWINPROGRESS 0x00000010 /* Snapshot copy-on-write in 
progress. */
 #define        TDP_ALTSTACK    0x00000020 /* Have alternate signal stack. */
 #define        TDP_DEADLKTREAT 0x00000040 /* Lock aquisition - deadlock 
treatment. */

Modified: head/sys/ufs/ffs/ffs_vfsops.c
==============================================================================
--- head/sys/ufs/ffs/ffs_vfsops.c       Mon Mar 16 15:09:47 2009        
(r189877)
+++ head/sys/ufs/ffs/ffs_vfsops.c       Mon Mar 16 15:39:46 2009        
(r189878)
@@ -1845,7 +1845,9 @@ ffs_bufwrite(struct buf *bp)
                    ("bufwrite: needs chained iodone (%p)", bp->b_iodone));
 
                /* get a new block */
-               newbp = geteblk(bp->b_bufsize);
+               newbp = geteblk(bp->b_bufsize, GB_NOWAIT_BD);
+               if (newbp == NULL)
+                       goto normal_write;
 
                /*
                 * set it to be identical to the old block.  We have to
@@ -1885,6 +1887,7 @@ ffs_bufwrite(struct buf *bp)
        }
 
        /* Let the normal bufwrite do the rest for us */
+normal_write:
        return (bufwrite(bp));
 }
 
_______________________________________________
svn-src-all@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/svn-src-all
To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"

Reply via email to