Author: markj
Date: Wed May 20 18:29:23 2020
New Revision: 361287
URL: https://svnweb.freebsd.org/changeset/base/361287

Log:
  Don't block on the range lock in zfs_getpages().
  
  After r358443 the vnode object lock no longer synchronizes concurrent
  zfs_getpages() and zfs_write() (which must update vnode pages to
  maintain coherence).  This created a potential deadlock between ZFS
  range locks and VM page busy locks: a fault on a mapped file will cause
  the fault page to be busied, after which zfs_getpages() locks a range
  around the file offset in order to map adjacent, resident pages;
  zfs_write() locks the range first, and then must busy vnode pages when
  synchronizing.
  
  Solve this by adding a non-blocking mode for ZFS range locks, and using
  it in zfs_getpages().  If zfs_getpages() fails to acquire the range
  lock, only the fault page will be populated.
  
  Reported by:  bdrewery
  Reviewed by:  avg
  Tested by:    pho
  Sponsored by: The FreeBSD Foundation
  Differential Revision:        https://reviews.freebsd.org/D24839

Modified:
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_rlock.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_rlock.h
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_rlock.h Wed May 20 17:48:18 2020        (r361286)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_rlock.h Wed May 20 18:29:23 2020        (r361287)
@@ -78,6 +78,8 @@ void rangelock_fini(rangelock_t *);
 
 locked_range_t *rangelock_enter(rangelock_t *,
     uint64_t, uint64_t, rangelock_type_t);
+locked_range_t *rangelock_tryenter(rangelock_t *,
+    uint64_t, uint64_t, rangelock_type_t);
 void rangelock_exit(locked_range_t *);
 void rangelock_reduce(locked_range_t *, uint64_t, uint64_t);
 

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c     Wed May 20 17:48:18 2020        (r361286)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_rlock.c     Wed May 20 18:29:23 2020        (r361287)
@@ -136,10 +136,11 @@ rangelock_fini(rangelock_t *rl)
 }
 
 /*
- * Check if a write lock can be grabbed, or wait and recheck until available.
+ * Check if a write lock can be grabbed.  If not, fail immediately or sleep and
+ * recheck until available, depending on the value of the "nonblock" parameter.
  */
-static void
-rangelock_enter_writer(rangelock_t *rl, locked_range_t *new)
+static boolean_t
+rangelock_enter_writer(rangelock_t *rl, locked_range_t *new, boolean_t nonblock)
 {
        avl_tree_t *tree = &rl->rl_tree;
        locked_range_t *lr;
@@ -169,7 +170,7 @@ rangelock_enter_writer(rangelock_t *rl, locked_range_t
                 */
                if (avl_numnodes(tree) == 0) {
                        avl_add(tree, new);
-                       return;
+                       return (B_TRUE);
                }
 
                /*
@@ -190,8 +191,10 @@ rangelock_enter_writer(rangelock_t *rl, locked_range_t
                        goto wait;
 
                avl_insert(tree, new, where);
-               return;
+               return (B_TRUE);
 wait:
+               if (nonblock)
+                       return (B_FALSE);
                if (!lr->lr_write_wanted) {
                        cv_init(&lr->lr_write_cv, NULL, CV_DEFAULT, NULL);
                        lr->lr_write_wanted = B_TRUE;
@@ -373,10 +376,11 @@ rangelock_add_reader(avl_tree_t *tree, locked_range_t 
 }
 
 /*
- * Check if a reader lock can be grabbed, or wait and recheck until available.
+ * Check if a reader lock can be grabbed.  If not, fail immediately or sleep and
+ * recheck until available, depending on the value of the "nonblock" parameter.
  */
-static void
-rangelock_enter_reader(rangelock_t *rl, locked_range_t *new)
+static boolean_t
+rangelock_enter_reader(rangelock_t *rl, locked_range_t *new, boolean_t nonblock)
 {
        avl_tree_t *tree = &rl->rl_tree;
        locked_range_t *prev, *next;
@@ -397,6 +401,8 @@ retry:
         */
        if (prev && (off < prev->lr_offset + prev->lr_length)) {
                if ((prev->lr_type == RL_WRITER) || (prev->lr_write_wanted)) {
+                       if (nonblock)
+                               return (B_FALSE);
                        if (!prev->lr_read_wanted) {
                                cv_init(&prev->lr_read_cv,
                                    NULL, CV_DEFAULT, NULL);
@@ -421,6 +427,8 @@ retry:
                if (off + len <= next->lr_offset)
                        goto got_lock;
                if ((next->lr_type == RL_WRITER) || (next->lr_write_wanted)) {
+                       if (nonblock)
+                               return (B_FALSE);
                        if (!next->lr_read_wanted) {
                                cv_init(&next->lr_read_cv,
                                    NULL, CV_DEFAULT, NULL);
@@ -439,6 +447,7 @@ got_lock:
         * locks and bumping ref counts (r_count).
         */
        rangelock_add_reader(tree, new, prev, where);
+       return (B_TRUE);
 }
 
 /*
@@ -448,13 +457,13 @@ got_lock:
  * the range lock structure for later unlocking (or reduce range if the
  * entire file is locked as RL_WRITER).
  */
-locked_range_t *
-rangelock_enter(rangelock_t *rl, uint64_t off, uint64_t len,
-    rangelock_type_t type)
+static locked_range_t *
+_rangelock_enter(rangelock_t *rl, uint64_t off, uint64_t len,
+    rangelock_type_t type, boolean_t nonblock)
 {
        ASSERT(type == RL_READER || type == RL_WRITER || type == RL_APPEND);
 
-       locked_range_t *new = kmem_alloc(sizeof (locked_range_t), KM_SLEEP);
+       locked_range_t *new = kmem_alloc(sizeof (*new), KM_SLEEP);
        new->lr_rangelock = rl;
        new->lr_offset = off;
        if (len + off < off)    /* overflow */
@@ -471,14 +480,32 @@ rangelock_enter(rangelock_t *rl, uint64_t off, uint64_
                /*
                 * First check for the usual case of no locks
                 */
-               if (avl_numnodes(&rl->rl_tree) == 0)
+               if (avl_numnodes(&rl->rl_tree) == 0) {
                        avl_add(&rl->rl_tree, new);
-               else
-                       rangelock_enter_reader(rl, new);
-       } else
-               rangelock_enter_writer(rl, new); /* RL_WRITER or RL_APPEND */
+               } else if (!rangelock_enter_reader(rl, new, nonblock)) {
+                       kmem_free(new, sizeof (*new));
+                       new = NULL;
+               }
+       } else if (!rangelock_enter_writer(rl, new, nonblock)) {
+               kmem_free(new, sizeof (*new));
+               new = NULL;
+       }
        mutex_exit(&rl->rl_lock);
        return (new);
+}
+
+locked_range_t *
+rangelock_enter(rangelock_t *rl, uint64_t off, uint64_t len,
+    rangelock_type_t type)
+{
+       return (_rangelock_enter(rl, off, len, type, B_FALSE));
+}
+
+locked_range_t *
+rangelock_tryenter(rangelock_t *rl, uint64_t off, uint64_t len,
+    rangelock_type_t type)
+{
+       return (_rangelock_enter(rl, off, len, type, B_TRUE));
 }
 
 /*

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c     Wed May 20 17:48:18 2020        (r361286)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c     Wed May 20 18:29:23 2020        (r361287)
@@ -4498,13 +4498,27 @@ zfs_getpages(struct vnode *vp, vm_page_t *ma, int coun
        end = IDX_TO_OFF(ma[count - 1]->pindex + 1);
 
        /*
-        * Lock a range covering all required and optional pages.
-        * Note that we need to handle the case of the block size growing.
+        * Try to lock a range covering all required and optional pages, to
+        * handle the case of the block size growing.  It is not safe to block
+        * on the range lock since the owner may be waiting for the fault page
+        * to be unbusied.
         */
        for (;;) {
                blksz = zp->z_blksz;
-               lr = rangelock_enter(&zp->z_rangelock, rounddown(start, blksz),
+               lr = rangelock_tryenter(&zp->z_rangelock,
+                   rounddown(start, blksz),
                    roundup(end, blksz) - rounddown(start, blksz), RL_READER);
+               if (lr == NULL) {
+                       if (rahead != NULL) {
+                               *rahead = 0;
+                               rahead = NULL;
+                       }
+                       if (rbehind != NULL) {
+                               *rbehind = 0;
+                               rbehind = NULL;
+                       }
+                       break;
+               }
                if (blksz == zp->z_blksz)
                        break;
                rangelock_exit(lr);
@@ -4515,7 +4529,8 @@ zfs_getpages(struct vnode *vp, vm_page_t *ma, int coun
        obj_size = object->un_pager.vnp.vnp_size;
        zfs_vmobject_wunlock(object);
        if (IDX_TO_OFF(ma[count - 1]->pindex) >= obj_size) {
-               rangelock_exit(lr);
+               if (lr != NULL)
+                       rangelock_exit(lr);
                ZFS_EXIT(zfsvfs);
                return (zfs_vm_pagerret_bad);
        }
@@ -4543,7 +4558,8 @@ zfs_getpages(struct vnode *vp, vm_page_t *ma, int coun
        error = dmu_read_pages(os, zp->z_id, ma, count, &pgsin_b, &pgsin_a,
            MIN(end, obj_size) - (end - PAGE_SIZE));
 
-       rangelock_exit(lr);
+       if (lr != NULL)
+               rangelock_exit(lr);
        ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
        ZFS_EXIT(zfsvfs);
 
_______________________________________________
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscribe@freebsd.org"

Reply via email to