Author: avg
Date: Thu Nov 21 13:46:16 2019
New Revision: 354949
URL: https://svnweb.freebsd.org/changeset/base/354949

Log:
  10405 Implement ZFS sorted scans
  
  illumos/illumos-gate@a3874b8b1fe5103fc1f961609557c0587435fec0
  
https://github.com/illumos/illumos-gate/commit/a3874b8b1fe5103fc1f961609557c0587435fec0
  
  https://www.illumos.org/issues/10405
    The original implementation is: 
https://github.com/zfsonlinux/zfs/commit/d4a72f23863382bdf6d0ae33196f5b5decbc48fd
  
  Author: Toomas Soome <tso...@me.com>

Modified:
  vendor-sys/illumos/dist/uts/common/fs/zfs/arc.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/dbuf.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/ddt.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_objset.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_traverse.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_scan.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/metaslab.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/range_tree.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/spa.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/spa_misc.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/arc.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dsl_pool.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dsl_scan.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/range_tree.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/spa_impl.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/vdev.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/vdev_impl.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zio.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/vdev.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_disk.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_file.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_indirect.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_mirror.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_missing.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_queue.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_raidz.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_root.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/zio.c
  vendor-sys/illumos/dist/uts/common/sys/fs/zfs.h
  vendor-sys/illumos/dist/uts/common/sys/taskq.h

Changes in other areas also in this revision:
Modified:
  vendor/illumos/dist/cmd/zdb/zdb.c
  vendor/illumos/dist/cmd/zpool/zpool_main.c
  vendor/illumos/dist/cmd/ztest/ztest.c
  vendor/illumos/dist/lib/libzfs/common/libzfs_status.c

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/arc.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/arc.c     Thu Nov 21 13:35:43 2019        (r354948)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/arc.c     Thu Nov 21 13:46:16 2019        (r354949)
@@ -348,7 +348,8 @@ int                 arc_no_grow_shift = 5;
  * minimum lifespan of a prefetch block in clock ticks
  * (initialized in arc_init())
  */
-static int             arc_min_prefetch_lifespan;
+static int             zfs_arc_min_prefetch_ms = 1;
+static int             zfs_arc_min_prescient_prefetch_ms = 6;
 
 /*
  * If this percent of memory is free, don't throttle.
@@ -690,8 +691,9 @@ typedef struct arc_stats {
        kstat_named_t arcstat_meta_limit;
        kstat_named_t arcstat_meta_max;
        kstat_named_t arcstat_meta_min;
-       kstat_named_t arcstat_sync_wait_for_async;
+       kstat_named_t arcstat_async_upgrade_sync;
        kstat_named_t arcstat_demand_hit_predictive_prefetch;
+       kstat_named_t arcstat_demand_hit_prescient_prefetch;
 } arc_stats_t;
 
 static arc_stats_t arc_stats = {
@@ -774,8 +776,9 @@ static arc_stats_t arc_stats = {
        { "arc_meta_limit",             KSTAT_DATA_UINT64 },
        { "arc_meta_max",               KSTAT_DATA_UINT64 },
        { "arc_meta_min",               KSTAT_DATA_UINT64 },
-       { "sync_wait_for_async",        KSTAT_DATA_UINT64 },
+       { "async_upgrade_sync",         KSTAT_DATA_UINT64 },
        { "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 },
+       { "demand_hit_prescient_prefetch", KSTAT_DATA_UINT64 },
 };
 
 #define        ARCSTAT(stat)   (arc_stats.stat.value.ui64)
@@ -872,22 +875,23 @@ typedef struct arc_callback arc_callback_t;
 
 struct arc_callback {
        void                    *acb_private;
-       arc_done_func_t         *acb_done;
+       arc_read_done_func_t    *acb_done;
        arc_buf_t               *acb_buf;
        boolean_t               acb_compressed;
        zio_t                   *acb_zio_dummy;
+       zio_t                   *acb_zio_head;
        arc_callback_t          *acb_next;
 };
 
 typedef struct arc_write_callback arc_write_callback_t;
 
 struct arc_write_callback {
-       void            *awcb_private;
-       arc_done_func_t *awcb_ready;
-       arc_done_func_t *awcb_children_ready;
-       arc_done_func_t *awcb_physdone;
-       arc_done_func_t *awcb_done;
-       arc_buf_t       *awcb_buf;
+       void                    *awcb_private;
+       arc_write_done_func_t   *awcb_ready;
+       arc_write_done_func_t   *awcb_children_ready;
+       arc_write_done_func_t   *awcb_physdone;
+       arc_write_done_func_t   *awcb_done;
+       arc_buf_t               *awcb_buf;
 };
 
 /*
@@ -1013,6 +1017,8 @@ struct arc_buf_hdr {
#define        HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS)
 #define        HDR_IO_ERROR(hdr)       ((hdr)->b_flags & ARC_FLAG_IO_ERROR)
 #define        HDR_PREFETCH(hdr)       ((hdr)->b_flags & ARC_FLAG_PREFETCH)
+#define        HDR_PRESCIENT_PREFETCH(hdr)     \
+       ((hdr)->b_flags & ARC_FLAG_PRESCIENT_PREFETCH)
 #define        HDR_COMPRESSION_ENABLED(hdr)    \
        ((hdr)->b_flags & ARC_FLAG_COMPRESSED_ARC)
 
@@ -3243,6 +3249,8 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
 {
        arc_state_t *evicted_state, *state;
        int64_t bytes_evicted = 0;
+       int min_lifetime = HDR_PRESCIENT_PREFETCH(hdr) ?
+           zfs_arc_min_prescient_prefetch_ms : zfs_arc_min_prefetch_ms;
 
        ASSERT(MUTEX_HELD(hash_lock));
        ASSERT(HDR_HAS_L1HDR(hdr));
@@ -3295,8 +3303,7 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
        /* prefetch buffers have a minimum lifespan */
        if (HDR_IO_IN_PROGRESS(hdr) ||
            ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) &&
-           ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access <
-           arc_min_prefetch_lifespan)) {
+           ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access < min_lifetime * hz)) {
                ARCSTAT_BUMP(arcstat_evict_skip);
                return (bytes_evicted);
        }
@@ -4607,13 +4614,15 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
                 * - move the buffer to the head of the list if this is
                 *   another prefetch (to make it less likely to be evicted).
                 */
-               if (HDR_PREFETCH(hdr)) {
+               if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) {
                        if (zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
                                /* link protected by hash lock */
                                ASSERT(multilist_link_active(
                                    &hdr->b_l1hdr.b_arc_node));
                        } else {
-                               arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH);
+                               arc_hdr_clear_flags(hdr,
+                                   ARC_FLAG_PREFETCH |
+                                   ARC_FLAG_PRESCIENT_PREFETCH);
                                ARCSTAT_BUMP(arcstat_mru_hits);
                        }
                        hdr->b_l1hdr.b_arc_access = now;
@@ -4644,10 +4653,13 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
                 * MFU state.
                 */
 
-               if (HDR_PREFETCH(hdr)) {
+               if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) {
                        new_state = arc_mru;
-                       if (zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) > 0)
-                               arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH);
+                       if (zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) {
+                               arc_hdr_clear_flags(hdr,
+                                   ARC_FLAG_PREFETCH |
+                                   ARC_FLAG_PRESCIENT_PREFETCH);
+                       }
                        DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
                } else {
                        new_state = arc_mfu;
@@ -4668,11 +4680,6 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
                 * If it was a prefetch, we will explicitly move it to
                 * the head of the list now.
                 */
-               if ((HDR_PREFETCH(hdr)) != 0) {
-                       ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
-                       /* link protected by hash_lock */
-                       ASSERT(multilist_link_active(&hdr->b_l1hdr.b_arc_node));
-               }
                ARCSTAT_BUMP(arcstat_mfu_hits);
                hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
        } else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) {
@@ -4683,12 +4690,11 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
                 * MFU state.
                 */
 
-               if (HDR_PREFETCH(hdr)) {
+               if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) {
                        /*
                         * This is a prefetch access...
                         * move this block back to the MRU state.
                         */
-                       ASSERT0(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt));
                        new_state = arc_mru;
                }
 
@@ -4710,21 +4716,26 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
        }
 }
 
-/* a generic arc_done_func_t which you can use */
+/* a generic arc_read_done_func_t which you can use */
 /* ARGSUSED */
 void
-arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
+arc_bcopy_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
+    arc_buf_t *buf, void *arg)
 {
-       if (zio == NULL || zio->io_error == 0)
-               bcopy(buf->b_data, arg, arc_buf_size(buf));
+       if (buf == NULL)
+               return;
+
+       bcopy(buf->b_data, arg, arc_buf_size(buf));
        arc_buf_destroy(buf, arg);
 }
 
-/* a generic arc_done_func_t */
+/* a generic arc_read_done_func_t */
 void
-arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
+arc_getbuf_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
+    arc_buf_t *buf, void *arg)
 {
        arc_buf_t **bufp = arg;
+
        if (buf == NULL) {
                ASSERT(zio == NULL || zio->io_error != 0);
                *bufp = NULL;
@@ -4759,7 +4770,6 @@ arc_read_done(zio_t *zio)
        arc_callback_t  *callback_list;
        arc_callback_t  *acb;
        boolean_t       freeable = B_FALSE;
-       boolean_t       no_zio_error = (zio->io_error == 0);
 
        /*
         * The hdr was inserted into hash-table and removed from lists
@@ -4785,7 +4795,7 @@ arc_read_done(zio_t *zio)
                ASSERT3P(hash_lock, !=, NULL);
        }
 
-       if (no_zio_error) {
+       if (zio->io_error == 0) {
                /* byteswap if necessary */
                if (BP_SHOULD_BYTESWAP(zio->io_bp)) {
                        if (BP_GET_LEVEL(zio->io_bp) > 0) {
@@ -4806,7 +4816,8 @@ arc_read_done(zio_t *zio)
        callback_list = hdr->b_l1hdr.b_acb;
        ASSERT3P(callback_list, !=, NULL);
 
-       if (hash_lock && no_zio_error && hdr->b_l1hdr.b_state == arc_anon) {
+       if (hash_lock && zio->io_error == 0 &&
+           hdr->b_l1hdr.b_state == arc_anon) {
                /*
                 * Only call arc_access on anonymous buffers.  This is because
                 * if we've issued an I/O for an evicted buffer, we've already
@@ -4827,30 +4838,29 @@ arc_read_done(zio_t *zio)
                if (!acb->acb_done)
                        continue;
 
-               /* This is a demand read since prefetches don't use callbacks */
                callback_cnt++;
 
-               if (no_zio_error) {
-                       int error = arc_buf_alloc_impl(hdr, acb->acb_private,
-                           acb->acb_compressed, zio->io_error == 0,
-                           &acb->acb_buf);
-                       if (error != 0) {
-                               /*
-                                * Decompression failed.  Set io_error
-                                * so that when we call acb_done (below),
-                                * we will indicate that the read failed.
-                                * Note that in the unusual case where one
-                                * callback is compressed and another
-                                * uncompressed, we will mark all of them
-                                * as failed, even though the uncompressed
-                                * one can't actually fail.  In this case,
-                                * the hdr will not be anonymous, because
-                                * if there are multiple callbacks, it's
-                                * because multiple threads found the same
-                                * arc buf in the hash table.
-                                */
-                               zio->io_error = error;
-                       }
+               if (zio->io_error != 0)
+                       continue;
+
+               int error = arc_buf_alloc_impl(hdr, acb->acb_private,
+                   acb->acb_compressed, B_TRUE, &acb->acb_buf);
+               if (error != 0) {
+                       /*
+                        * Decompression failed.  Set io_error
+                        * so that when we call acb_done (below),
+                        * we will indicate that the read failed.
+                        * Note that in the unusual case where one
+                        * callback is compressed and another
+                        * uncompressed, we will mark all of them
+                        * as failed, even though the uncompressed
+                        * one can't actually fail.  In this case,
+                        * the hdr will not be anonymous, because
+                        * if there are multiple callbacks, it's
+                        * because multiple threads found the same
+                        * arc buf in the hash table.
+                        */
+                       zio->io_error = error;
                }
        }
        /*
@@ -4873,7 +4883,7 @@ arc_read_done(zio_t *zio)
        ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt) ||
            callback_list != NULL);
 
-       if (no_zio_error) {
+       if (zio->io_error == 0) {
                arc_hdr_verify(hdr, zio->io_bp);
        } else {
                arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR);
@@ -4916,7 +4926,8 @@ arc_read_done(zio_t *zio)
                                arc_buf_destroy(acb->acb_buf, acb->acb_private);
                                acb->acb_buf = NULL;
                        }
-                       acb->acb_done(zio, acb->acb_buf, acb->acb_private);
+                       acb->acb_done(zio, &zio->io_bookmark, zio->io_bp,
+                           acb->acb_buf, acb->acb_private);
                }
 
                if (acb->acb_zio_dummy != NULL) {
@@ -4951,7 +4962,7 @@ arc_read_done(zio_t *zio)
  * for readers of this block.
  */
 int
-arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
+arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_read_done_func_t *done,
     void *private, zio_priority_t priority, int zio_flags,
     arc_flags_t *arc_flags, const zbookmark_phys_t *zb)
 {
@@ -4960,6 +4971,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, a
        zio_t *rzio;
        uint64_t guid = spa_load_guid(spa);
        boolean_t compressed_read = (zio_flags & ZIO_FLAG_RAW) != 0;
+       int rc = 0;
 
        ASSERT(!BP_IS_EMBEDDED(bp) ||
            BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);
@@ -4978,32 +4990,20 @@ top:
                *arc_flags |= ARC_FLAG_CACHED;
 
                if (HDR_IO_IN_PROGRESS(hdr)) {
+                       zio_t *head_zio = hdr->b_l1hdr.b_acb->acb_zio_head;
 
+                       ASSERT3P(head_zio, !=, NULL);
                        if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) &&
                            priority == ZIO_PRIORITY_SYNC_READ) {
                                /*
-                                * This sync read must wait for an
-                                * in-progress async read (e.g. a predictive
-                                * prefetch).  Async reads are queued
-                                * separately at the vdev_queue layer, so
-                                * this is a form of priority inversion.
-                                * Ideally, we would "inherit" the demand
-                                * i/o's priority by moving the i/o from
-                                * the async queue to the synchronous queue,
-                                * but there is currently no mechanism to do
-                                * so.  Track this so that we can evaluate
-                                * the magnitude of this potential performance
-                                * problem.
-                                *
-                                * Note that if the prefetch i/o is already
-                                * active (has been issued to the device),
-                                * the prefetch improved performance, because
-                                * we issued it sooner than we would have
-                                * without the prefetch.
+                                * This is a sync read that needs to wait for
+                                * an in-flight async read. Request that the
+                                * zio have its priority upgraded.
                                 */
-                               DTRACE_PROBE1(arc__sync__wait__for__async,
+                               zio_change_priority(head_zio, priority);
+                               DTRACE_PROBE1(arc__async__upgrade__sync,
                                    arc_buf_hdr_t *, hdr);
-                               ARCSTAT_BUMP(arcstat_sync_wait_for_async);
+                               ARCSTAT_BUMP(arcstat_async_upgrade_sync);
                        }
                        if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) {
                                arc_hdr_clear_flags(hdr,
@@ -5030,6 +5030,7 @@ top:
                                            spa, NULL, NULL, NULL, zio_flags);
 
                                ASSERT3P(acb->acb_done, !=, NULL);
+                               acb->acb_zio_head = head_zio;
                                acb->acb_next = hdr->b_l1hdr.b_acb;
                                hdr->b_l1hdr.b_acb = acb;
                                mutex_exit(hash_lock);
@@ -5057,17 +5058,33 @@ top:
                                arc_hdr_clear_flags(hdr,
                                    ARC_FLAG_PREDICTIVE_PREFETCH);
                        }
+
+                       if (hdr->b_flags & ARC_FLAG_PRESCIENT_PREFETCH) {
+                               ARCSTAT_BUMP(
+                                   arcstat_demand_hit_prescient_prefetch);
+                               arc_hdr_clear_flags(hdr,
+                                   ARC_FLAG_PRESCIENT_PREFETCH);
+                       }
+
                        ASSERT(!BP_IS_EMBEDDED(bp) || !BP_IS_HOLE(bp));
 
                        /* Get a buf with the desired data in it. */
-                       VERIFY0(arc_buf_alloc_impl(hdr, private,
-                           compressed_read, B_TRUE, &buf));
+                       rc = arc_buf_alloc_impl(hdr, private,
+                           compressed_read, B_TRUE, &buf);
+                       if (rc != 0) {
+                               arc_buf_destroy(buf, private);
+                               buf = NULL;
+                       }
+                       ASSERT((zio_flags & ZIO_FLAG_SPECULATIVE) ||
+                           rc == 0 || rc != ENOENT);
                } else if (*arc_flags & ARC_FLAG_PREFETCH &&
                    zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
                        arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
                }
                DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
                arc_access(hdr, hash_lock);
+               if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH)
+                       arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH);
                if (*arc_flags & ARC_FLAG_L2CACHE)
                        arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
                mutex_exit(hash_lock);
@@ -5077,7 +5094,7 @@ top:
                    data, metadata, hits);
 
                if (done)
-                       done(NULL, buf, private);
+                       done(NULL, zb, bp, buf, private);
        } else {
                uint64_t lsize = BP_GET_LSIZE(bp);
                uint64_t psize = BP_GET_PSIZE(bp);
@@ -5151,6 +5168,9 @@ top:
 
                if (*arc_flags & ARC_FLAG_PREFETCH)
                        arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
+               if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH)
+                       arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH);
+
                if (*arc_flags & ARC_FLAG_L2CACHE)
                        arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
                if (BP_GET_LEVEL(bp) > 0)
@@ -5180,14 +5200,17 @@ top:
                                vd = NULL;
                }
 
-               if (priority == ZIO_PRIORITY_ASYNC_READ)
+               /*
+                * We count both async reads and scrub IOs as asynchronous so
+                * that both can be upgraded in the event of a cache hit while
+                * the read IO is still in-flight.
+                */
+               if (priority == ZIO_PRIORITY_ASYNC_READ ||
+                   priority == ZIO_PRIORITY_SCRUB)
                        arc_hdr_set_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ);
                else
                        arc_hdr_clear_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ);
 
-               if (hash_lock != NULL)
-                       mutex_exit(hash_lock);
-
                /*
                 * At this point, we have a level 1 cache miss.  Try again in
                 * L2ARC if possible.
@@ -5257,6 +5280,11 @@ top:
                                    ZIO_FLAG_CANFAIL |
                                    ZIO_FLAG_DONT_PROPAGATE |
                                    ZIO_FLAG_DONT_RETRY, B_FALSE);
+                               acb->acb_zio_head = rzio;
+
+                               if (hash_lock != NULL)
+                                       mutex_exit(hash_lock);
+
                                DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
                                    zio_t *, rzio);
                                ARCSTAT_INCR(arcstat_l2_read_bytes, size);
@@ -5271,6 +5299,8 @@ top:
                                        return (0);
 
                                /* l2arc read error; goto zio_read() */
+                               if (hash_lock != NULL)
+                                       mutex_enter(hash_lock);
                        } else {
                                DTRACE_PROBE1(l2arc__miss,
                                    arc_buf_hdr_t *, hdr);
@@ -5291,7 +5321,11 @@ top:
 
                rzio = zio_read(pio, spa, bp, hdr->b_l1hdr.b_pabd, size,
                    arc_read_done, hdr, priority, zio_flags, zb);
+               acb->acb_zio_head = rzio;
 
+               if (hash_lock != NULL)
+                       mutex_exit(hash_lock);
+
                if (*arc_flags & ARC_FLAG_WAIT)
                        return (zio_wait(rzio));
 
@@ -5778,9 +5812,9 @@ arc_write_done(zio_t *zio)
 
 zio_t *
 arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
-    boolean_t l2arc, const zio_prop_t *zp, arc_done_func_t *ready,
-    arc_done_func_t *children_ready, arc_done_func_t *physdone,
-    arc_done_func_t *done, void *private, zio_priority_t priority,
+    boolean_t l2arc, const zio_prop_t *zp, arc_write_done_func_t *ready,
+    arc_write_done_func_t *children_ready, arc_write_done_func_t *physdone,
+    arc_write_done_func_t *done, void *private, zio_priority_t priority,
     int zio_flags, const zbookmark_phys_t *zb)
 {
        arc_buf_hdr_t *hdr = buf->b_hdr;
@@ -6191,9 +6225,6 @@ arc_init(void)
 #endif
        mutex_init(&arc_adjust_lock, NULL, MUTEX_DEFAULT, NULL);
        cv_init(&arc_adjust_waiters_cv, NULL, CV_DEFAULT, NULL);
-
-       /* Convert seconds to clock ticks */
-       arc_min_prefetch_lifespan = 1 * hz;
 
        /* set min cache to 1/32 of all memory, or 64MB, whichever is more */
        arc_c_min = MAX(allmem / 32, 64 << 20);
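
A note on the arc.c hunks above: previously a synchronous read that hit a buffer with an in-flight asynchronous read could only record the priority inversion. The new code remembers the head zio of the in-progress read in acb_zio_head and upgrades its priority on the spot. The following is a minimal sketch of that path and is not part of the commit; the function name is invented here, and the locking and surrounding arc_read() flow are omitted.

        /*
         * Sketch only: upgrade an in-flight async/scrub read when a demand
         * (sync) read arrives for the same header.
         */
        static void
        arc_upgrade_inflight_read(arc_buf_hdr_t *hdr, zio_priority_t priority)
        {
                zio_t *head_zio = hdr->b_l1hdr.b_acb->acb_zio_head;

                ASSERT3P(head_zio, !=, NULL);
                if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) &&
                    priority == ZIO_PRIORITY_SYNC_READ) {
                        /* raise the queued zio to sync priority */
                        zio_change_priority(head_zio, priority);
                        ARCSTAT_BUMP(arcstat_async_upgrade_sync);
                }
        }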

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/dbuf.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/dbuf.c    Thu Nov 21 13:35:43 2019        (r354948)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/dbuf.c    Thu Nov 21 13:46:16 2019        (r354949)
@@ -942,7 +942,8 @@ dbuf_whichblock(dnode_t *dn, int64_t level, uint64_t o
 }
 
 static void
-dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
+dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
+    arc_buf_t *buf, void *vdb)
 {
        dmu_buf_impl_t *db = vdb;
 
@@ -961,15 +962,19 @@ dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
                ASSERT3P(db->db_buf, ==, NULL);
                db->db_state = DB_UNCACHED;
        } else if (db->db_level == 0 && db->db_freed_in_flight) {
-               /* freed in flight */
+               /* we were freed in flight; disregard any error */
                ASSERT(zio == NULL || zio->io_error == 0);
+               if (buf == NULL) {
+                       buf = arc_alloc_buf(db->db_objset->os_spa,
+                           db, DBUF_GET_BUFC_TYPE(db), db->db.db_size);
+               }
                arc_release(buf, db);
                bzero(buf->b_data, db->db.db_size);
                arc_buf_freeze(buf);
                db->db_freed_in_flight = FALSE;
                dbuf_set_data(db, buf);
                db->db_state = DB_CACHED;
-       } else {
+       } else if (buf != NULL) {
                /* success */
                ASSERT(zio == NULL || zio->io_error == 0);
                dbuf_set_data(db, buf);
@@ -2395,7 +2400,8 @@ dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, bl
  * prefetch if the next block down is our target.
  */
 static void
-dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private)
+dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb,
+    const blkptr_t *iobp, arc_buf_t *abuf, void *private)
 {
        dbuf_prefetch_arg_t *dpa = private;
 
@@ -2442,11 +2448,11 @@ dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abu
        }
 
        dpa->dpa_curlevel--;
-
        uint64_t nextblkid = dpa->dpa_zb.zb_blkid >>
            (dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level));
        blkptr_t *bp = ((blkptr_t *)abuf->b_data) +
            P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs);
+
        if (BP_IS_HOLE(bp)) {
                kmem_free(dpa, sizeof (*dpa));
        } else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) {
@@ -3852,7 +3858,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, d
                 * ready callback so that we can properly handle an indirect
                 * block that only contains holes.
                 */
-               arc_done_func_t *children_ready_cb = NULL;
+               arc_write_done_func_t *children_ready_cb = NULL;
                if (db->db_level != 0)
                        children_ready_cb = dbuf_write_children_ready;
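
The dbuf.c changes above track an ARC interface change: read callbacks are now of type arc_read_done_func_t, receive the bookmark and block pointer, and may be handed buf == NULL when the read failed. A hypothetical callback under the new signature is sketched below; the function name is illustrative and not part of the commit.

        static void
        example_read_done(zio_t *zio, const zbookmark_phys_t *zb,
            const blkptr_t *bp, arc_buf_t *buf, void *arg)
        {
                if (buf == NULL) {
                        /* read failed; zio, if non-NULL, carries the error */
                        return;
                }
                /* consume buf->b_data here */
                arc_buf_destroy(buf, arg);
        }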
 

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/ddt.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/ddt.c     Thu Nov 21 13:35:43 2019        (r354948)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/ddt.c     Thu Nov 21 13:46:16 2019        (r354949)
@@ -1106,14 +1106,26 @@ ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg)
 void
 ddt_sync(spa_t *spa, uint64_t txg)
 {
+       dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
        dmu_tx_t *tx;
-       zio_t *rio = zio_root(spa, NULL, NULL,
-           ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SELF_HEAL);
+       zio_t *rio;
 
        ASSERT(spa_syncing_txg(spa) == txg);
 
        tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
 
+       rio = zio_root(spa, NULL, NULL,
+           ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SELF_HEAL);
+
+       /*
+        * This function may cause an immediate scan of ddt blocks (see
+        * the comment above dsl_scan_ddt() for details). We set the
+        * scan's root zio here so that we can wait for any scan IOs in
+        * addition to the regular ddt IOs.
+        */
+       ASSERT3P(scn->scn_zio_root, ==, NULL);
+       scn->scn_zio_root = rio;
+
        for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
                ddt_t *ddt = spa->spa_ddt[c];
                if (ddt == NULL)
@@ -1123,6 +1135,7 @@ ddt_sync(spa_t *spa, uint64_t txg)
        }
 
        (void) zio_wait(rio);
+       scn->scn_zio_root = NULL;
 
        dmu_tx_commit(tx);
 }
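
The comment added to ddt_sync() above is worth restating in isolation, since the same root-zio pattern appears elsewhere in the scan code. The condensed sketch below elides the per-checksum loop and error handling; it is a restatement of the hunk, not additional code from the commit.

        rio = zio_root(spa, NULL, NULL,
            ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SELF_HEAL);
        scn->scn_zio_root = rio;        /* scan I/Os issued now become children of rio */
        /* ... ddt_sync_table() for each checksum function ... */
        (void) zio_wait(rio);           /* waits for ddt I/Os and any scan children */
        scn->scn_zio_root = NULL;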

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_objset.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_objset.c      Thu Nov 21 13:35:43 2019        (r354948)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_objset.c      Thu Nov 21 13:46:16 2019        (r354949)
@@ -398,6 +398,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, bl
 
        ASSERT(ds == NULL || MUTEX_HELD(&ds->ds_opening_lock));
 
+#if 0
        /*
         * The $ORIGIN dataset (if it exists) doesn't have an associated
         * objset, so there's no reason to open it. The $ORIGIN dataset
@@ -408,6 +409,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, bl
                ASSERT3P(ds->ds_dir, !=,
                    spa_get_dsl(spa)->dp_origin_snap->ds_dir);
        }
+#endif
 
        os = kmem_zalloc(sizeof (objset_t), KM_SLEEP);
        os->os_dsl_dataset = ds;

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_traverse.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_traverse.c    Thu Nov 21 13:35:43 2019        (r354948)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_traverse.c    Thu Nov 21 13:46:16 2019        (r354949)
@@ -492,7 +492,8 @@ traverse_prefetcher(spa_t *spa, zilog_t *zilog, const 
     const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
 {
        prefetch_data_t *pfd = arg;
-       arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
+       arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH |
+           ARC_FLAG_PRESCIENT_PREFETCH;
 
        ASSERT(pfd->pd_bytes_fetched >= 0);
        if (bp == NULL)
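
The traverse prefetcher now tags its reads as prescient prefetches, i.e. blocks known to be needed shortly, which earns them the longer minimum ARC lifespan introduced in arc.c. The actual arc_read() call sits below the context shown in this hunk; the sketch below shows how such a prefetch is plausibly issued and should be read as illustrative rather than quoted from the commit.

        arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH |
            ARC_FLAG_PRESCIENT_PREFETCH;

        (void) arc_read(NULL, spa, bp, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
            ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &aflags, zb);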

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_scan.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_scan.c        Thu Nov 21 13:35:43 2019        (r354948)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_scan.c        Thu Nov 21 13:46:16 2019        (r354949)
@@ -51,34 +51,157 @@
 #include <sys/sa_impl.h>
 #include <sys/zfeature.h>
 #include <sys/abd.h>
+#include <sys/range_tree.h>
 #ifdef _KERNEL
 #include <sys/zfs_vfsops.h>
 #endif
 
+/*
+ * Grand theory statement on scan queue sorting
+ *
+ * Scanning is implemented by recursively traversing all indirection levels
+ * in an object and reading all blocks referenced from said objects. This
+ * results in us approximately traversing the object from lowest logical
+ * offset to the highest. For best performance, we would want the logical
+ * blocks to be physically contiguous. However, this is frequently not the
+ * case with pools given the allocation patterns of copy-on-write filesystems.
+ * So instead, we put the I/Os into a reordering queue and issue them in a
+ * way that will most benefit physical disks (LBA-order).
+ *
+ * Queue management:
+ *
+ * Ideally, we would want to scan all metadata and queue up all block I/O
+ * prior to starting to issue it, because that allows us to do an optimal
+ * sorting job. This can however consume large amounts of memory. Therefore
+ * we continuously monitor the size of the queues and constrain them to 5%
+ * (zfs_scan_mem_lim_fact) of physmem. If the queues grow larger than this
+ * limit, we clear out a few of the largest extents at the head of the queues
+ * to make room for more scanning. Hopefully, these extents will be fairly
+ * large and contiguous, allowing us to approach sequential I/O throughput
+ * even without a fully sorted tree.
+ *
+ * Metadata scanning takes place in dsl_scan_visit(), which is called from
+ * dsl_scan_sync() every spa_sync(). If we have either fully scanned all
+ * metadata on the pool, or we need to make room in memory because our
+ * queues are too large, dsl_scan_visit() is postponed and
+ * scan_io_queues_run() is called from dsl_scan_sync() instead. This implies
+ * that metadata scanning and queued I/O issuing are mutually exclusive. This
+ * allows us to provide maximum sequential I/O throughput for the majority of
+ * I/O's issued since sequential I/O performance is significantly negatively
+ * impacted if it is interleaved with random I/O.
+ *
+ * Implementation Notes
+ *
+ * One side effect of the queued scanning algorithm is that the scanning code
+ * needs to be notified whenever a block is freed. This is needed to allow
+ * the scanning code to remove these I/Os from the issuing queue. Additionally,
+ * we do not attempt to queue gang blocks to be issued sequentially since this
+ * is very hard to do and would have an extremely limited performance benefit.
+ * Instead, we simply issue gang I/Os as soon as we find them using the legacy
+ * algorithm.
+ *
+ * Backwards compatibility
+ *
+ * This new algorithm is backwards compatible with the legacy on-disk data
+ * structures (and therefore does not require a new feature flag).
+ * Periodically during scanning (see zfs_scan_checkpoint_intval), the scan
+ * will stop scanning metadata (in logical order) and wait for all outstanding
+ * sorted I/O to complete. Once this is done, we write out a checkpoint
+ * bookmark, indicating that we have scanned everything logically before it.
+ * If the pool is imported on a machine without the new sorting algorithm,
+ * the scan simply resumes from the last checkpoint using the legacy algorithm.
+ */
+
 typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *,
     const zbookmark_phys_t *);
 
 static scan_cb_t dsl_scan_scrub_cb;
-static void dsl_scan_cancel_sync(void *, dmu_tx_t *);
-static void dsl_scan_sync_state(dsl_scan_t *, dmu_tx_t *);
-static boolean_t dsl_scan_restarting(dsl_scan_t *, dmu_tx_t *);
 
+static int scan_ds_queue_compare(const void *a, const void *b);
+static int scan_prefetch_queue_compare(const void *a, const void *b);
+static void scan_ds_queue_clear(dsl_scan_t *scn);
+static boolean_t scan_ds_queue_contains(dsl_scan_t *scn, uint64_t dsobj,
+    uint64_t *txg);
+static void scan_ds_queue_insert(dsl_scan_t *scn, uint64_t dsobj, uint64_t txg);
+static void scan_ds_queue_remove(dsl_scan_t *scn, uint64_t dsobj);
+static void scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx);
+
+extern int zfs_vdev_async_write_active_min_dirty_percent;
+
+/*
+ * By default zfs will check to ensure it is not over the hard memory
+ * limit before each txg. If finer-grained control of this is needed
+ * this value can be set to 1 to enable checking before scanning each
+ * block.
+ */
+int zfs_scan_strict_mem_lim = B_FALSE;
+
+/*
+ * Maximum number of parallelly executing I/Os per top-level vdev.
+ * Tune with care. Very high settings (hundreds) are known to trigger
+ * some firmware bugs and resets on certain SSDs.
+ */
 int zfs_top_maxinflight = 32;          /* maximum I/Os per top-level */
-int zfs_resilver_delay = 2;            /* number of ticks to delay resilver */
-int zfs_scrub_delay = 4;               /* number of ticks to delay scrub */
-int zfs_scan_idle = 50;                        /* idle window in clock ticks */
+unsigned int zfs_resilver_delay = 2;   /* number of ticks to delay resilver */
+unsigned int zfs_scrub_delay = 4;      /* number of ticks to delay scrub */
+unsigned int zfs_scan_idle = 50;       /* idle window in clock ticks */
 
-int zfs_scan_min_time_ms = 1000; /* min millisecs to scrub per txg */
-int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */
-int zfs_obsolete_min_time_ms = 500; /* min millisecs to obsolete per txg */
-int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver per txg */
+/*
+ * Maximum number of parallelly executed bytes per leaf vdev. We attempt
+ * to strike a balance here between keeping the vdev queues full of I/Os
+ * at all times and not overflowing the queues to cause long latency,
+ * which would cause long txg sync times. No matter what, we will not
+ * overload the drives with I/O, since that is protected by
+ * zfs_vdev_scrub_max_active.
+ */
+unsigned long zfs_scan_vdev_limit = 4 << 20;
+
+int zfs_scan_issue_strategy = 0;
+int zfs_scan_legacy = B_FALSE; /* don't queue & sort zios, go direct */
+uint64_t zfs_scan_max_ext_gap = 2 << 20;       /* in bytes */
+
+unsigned int zfs_scan_checkpoint_intval = 7200;        /* seconds */
+#define        ZFS_SCAN_CHECKPOINT_INTVAL      SEC_TO_TICK(zfs_scan_checkpoint_intval)
+
+/*
+ * fill_weight is non-tunable at runtime, so we copy it at module init from
+ * zfs_scan_fill_weight. Runtime adjustments to zfs_scan_fill_weight would
+ * break queue sorting.
+ */
+uint64_t zfs_scan_fill_weight = 3;
+static uint64_t fill_weight;
+
+/* See dsl_scan_should_clear() for details on the memory limit tunables */
+uint64_t zfs_scan_mem_lim_min = 16 << 20;      /* bytes */
+uint64_t zfs_scan_mem_lim_soft_max = 128 << 20;        /* bytes */
+int zfs_scan_mem_lim_fact = 20;                /* fraction of physmem */
+int zfs_scan_mem_lim_soft_fact = 20;   /* fraction of mem lim above */
+
+unsigned int zfs_scrub_min_time_ms = 1000; /* min millisecs to scrub per txg */
+unsigned int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */
+/* min millisecs to obsolete per txg */
+unsigned int zfs_obsolete_min_time_ms = 500;
+/* min millisecs to resilver per txg */
+unsigned int zfs_resilver_min_time_ms = 3000;
 boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */
 boolean_t zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetch */
 enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE;
-int dsl_scan_delay_completion = B_FALSE; /* set to delay scan completion */
 /* max number of blocks to free in a single TXG */
 uint64_t zfs_async_block_max_blocks = UINT64_MAX;
 
+/*
+ * We wait a few txgs after importing a pool to begin scanning so that
+ * the import / mounting code isn't held up by scrub / resilver IO.
+ * Unfortunately, it is a bit difficult to determine exactly how long
+ * this will take since userspace will trigger fs mounts asynchronously
+ * and the kernel will create zvol minors asynchronously. As a result,
+ * the value provided here is a bit arbitrary, but represents a
+ * reasonable estimate of how many txgs it will take to finish fully
+ * importing a pool
+ */
+#define        SCAN_IMPORT_WAIT_TXGS           5
+
+
 #define        DSL_SCAN_IS_SCRUB_RESILVER(scn) \
        ((scn)->scn_phys.scn_func == POOL_SCAN_SCRUB || \
        (scn)->scn_phys.scn_func == POOL_SCAN_RESILVER)
@@ -97,6 +220,163 @@ static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = {
        dsl_scan_scrub_cb,      /* POOL_SCAN_RESILVER */
 };
 
+/* In core node for the scn->scn_queue. Represents a dataset to be scanned */
+typedef struct {
+       uint64_t        sds_dsobj;
+       uint64_t        sds_txg;
+       avl_node_t      sds_node;
+} scan_ds_t;
+
+/*
+ * This controls what conditions are placed on dsl_scan_sync_state():
+ * SYNC_OPTIONAL) write out scn_phys iff scn_bytes_pending == 0
+ * SYNC_MANDATORY) write out scn_phys always. scn_bytes_pending must be 0.
+ * SYNC_CACHED) if scn_bytes_pending == 0, write out scn_phys. Otherwise
+ *     write out the scn_phys_cached version.
+ * See dsl_scan_sync_state for details.
+ */
+typedef enum {
+       SYNC_OPTIONAL,
+       SYNC_MANDATORY,
+       SYNC_CACHED
+} state_sync_type_t;
+
+/*
+ * This struct represents the minimum information needed to reconstruct a
+ * zio for sequential scanning. This is useful because many of these will
+ * accumulate in the sequential IO queues before being issued, so saving
+ * memory matters here.
+ */
+typedef struct scan_io {
+       /* fields from blkptr_t */
+       uint64_t                sio_offset;
+       uint64_t                sio_blk_prop;
+       uint64_t                sio_phys_birth;
+       uint64_t                sio_birth;
+       zio_cksum_t             sio_cksum;
+       uint32_t                sio_asize;
+
+       /* fields from zio_t */
+       int                     sio_flags;
+       zbookmark_phys_t        sio_zb;
+
+       /* members for queue sorting */
+       union {
+               avl_node_t      sio_addr_node; /* link into issuing queue */
+               list_node_t     sio_list_node; /* link for issuing to disk */
+       } sio_nodes;
+} scan_io_t;
+
+struct dsl_scan_io_queue {
+       dsl_scan_t      *q_scn; /* associated dsl_scan_t */
+       vdev_t          *q_vd; /* top-level vdev that this queue represents */
+
+       /* trees used for sorting I/Os and extents of I/Os */
+       range_tree_t    *q_exts_by_addr;
+       avl_tree_t      q_exts_by_size;
+       avl_tree_t      q_sios_by_addr;
+
+       /* members for zio rate limiting */
+       uint64_t        q_maxinflight_bytes;
+       uint64_t        q_inflight_bytes;
+       kcondvar_t      q_zio_cv; /* used under vd->vdev_scan_io_queue_lock */
+
+       /* per txg statistics */
+       uint64_t        q_total_seg_size_this_txg;
+       uint64_t        q_segs_this_txg;
+       uint64_t        q_total_zio_size_this_txg;
+       uint64_t        q_zios_this_txg;
+};
+
+/* private data for dsl_scan_prefetch_cb() */
+typedef struct scan_prefetch_ctx {
+       zfs_refcount_t spc_refcnt;      /* refcount for memory management */
+       dsl_scan_t *spc_scn;            /* dsl_scan_t for the pool */
+       boolean_t spc_root;             /* is this prefetch for an objset? */
+       uint8_t spc_indblkshift;        /* dn_indblkshift of current dnode */
+       uint16_t spc_datablkszsec;      /* dn_idatablkszsec of current dnode */
+} scan_prefetch_ctx_t;
+
+/* private data for dsl_scan_prefetch() */
+typedef struct scan_prefetch_issue_ctx {
+       avl_node_t spic_avl_node;       /* link into scn->scn_prefetch_queue */
+       scan_prefetch_ctx_t *spic_spc;  /* spc for the callback */
+       blkptr_t spic_bp;               /* bp to prefetch */
+       zbookmark_phys_t spic_zb;       /* bookmark to prefetch */
+} scan_prefetch_issue_ctx_t;
+
+static void scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags,
+    const zbookmark_phys_t *zb, dsl_scan_io_queue_t *queue);
+static void scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue,
+    scan_io_t *sio);
+
+static dsl_scan_io_queue_t *scan_io_queue_create(vdev_t *vd);
+static void scan_io_queues_destroy(dsl_scan_t *scn);
+
+static kmem_cache_t *sio_cache;
+
+void
+scan_init(void)
+{
+       /*
+        * This is used in ext_size_compare() to weight segments
+        * based on how sparse they are. This cannot be changed
+        * mid-scan and the tree comparison functions don't currently
+        * have a mechanism for passing additional context to the
+        * compare functions. Thus we store this value globally and
+        * we only allow it to be set at module initialization time.
+        */
+       fill_weight = zfs_scan_fill_weight;
+
+       sio_cache = kmem_cache_create("sio_cache",
+           sizeof (scan_io_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+}
+
+void
+scan_fini(void)
+{
+       kmem_cache_destroy(sio_cache);
+}
+
+static inline boolean_t
+dsl_scan_is_running(const dsl_scan_t *scn)
+{
+       return (scn->scn_phys.scn_state == DSS_SCANNING);
+}
+
+boolean_t
+dsl_scan_resilvering(dsl_pool_t *dp)
+{
+       return (dsl_scan_is_running(dp->dp_scan) &&
+           dp->dp_scan->scn_phys.scn_func == POOL_SCAN_RESILVER);
+}
+
+static inline void
+sio2bp(const scan_io_t *sio, blkptr_t *bp, uint64_t vdev_id)
+{
+       bzero(bp, sizeof (*bp));
+       DVA_SET_ASIZE(&bp->blk_dva[0], sio->sio_asize);
+       DVA_SET_VDEV(&bp->blk_dva[0], vdev_id);
+       DVA_SET_OFFSET(&bp->blk_dva[0], sio->sio_offset);
+       bp->blk_prop = sio->sio_blk_prop;
+       bp->blk_phys_birth = sio->sio_phys_birth;
+       bp->blk_birth = sio->sio_birth;
+       bp->blk_fill = 1;       /* we always only work with data pointers */
+       bp->blk_cksum = sio->sio_cksum;
+}
+
+static inline void
+bp2sio(const blkptr_t *bp, scan_io_t *sio, int dva_i)
+{
+       /* we discard the vdev id, since we can deduce it from the queue */
+       sio->sio_offset = DVA_GET_OFFSET(&bp->blk_dva[dva_i]);
+       sio->sio_asize = DVA_GET_ASIZE(&bp->blk_dva[dva_i]);
+       sio->sio_blk_prop = bp->blk_prop;
+       sio->sio_phys_birth = bp->blk_phys_birth;
+       sio->sio_birth = bp->blk_birth;
+       sio->sio_cksum = bp->blk_cksum;
+}
+
 int
 dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
 {
@@ -117,6 +397,13 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
        scn->scn_async_destroying = spa_feature_is_active(dp->dp_spa,
            SPA_FEATURE_ASYNC_DESTROY);
 
+       bcopy(&scn->scn_phys, &scn->scn_phys_cached, sizeof (scn->scn_phys));
+       avl_create(&scn->scn_queue, scan_ds_queue_compare, sizeof (scan_ds_t),
+           offsetof(scan_ds_t, sds_node));
+       avl_create(&scn->scn_prefetch_queue, scan_prefetch_queue_compare,
+           sizeof (scan_prefetch_issue_ctx_t),
+           offsetof(scan_prefetch_issue_ctx_t, spic_avl_node));
+
        err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
            "scrub_func", sizeof (uint64_t), 1, &f);
        if (err == 0) {
@@ -127,7 +414,7 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
                scn->scn_restart_txg = txg;
                zfs_dbgmsg("old-style scrub was in progress; "
                    "restarting new-style scrub in txg %llu",
-                   scn->scn_restart_txg);
+                   (longlong_t)scn->scn_restart_txg);
 
                /*
                 * Load the queue obj from the old location so that it
@@ -145,7 +432,14 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
                else if (err)
                        return (err);
 
-               if (scn->scn_phys.scn_state == DSS_SCANNING &&
+               /*
+                * We might be restarting after a reboot, so jump the issued
+                * counter to how far we've scanned. We know we're consistent
+                * up to here.
+                */
+               scn->scn_issued_before_pass = scn->scn_phys.scn_examined;
+
+               if (dsl_scan_is_running(scn) &&
                    spa_prev_software_version(dp->dp_spa) < SPA_VERSION_SCAN) {
                        /*
                         * A new-type scrub was in progress on an old
@@ -157,10 +451,26 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
                        scn->scn_restart_txg = txg;
                        zfs_dbgmsg("new-style scrub was modified "
                            "by old software; restarting in txg %llu",
-                           scn->scn_restart_txg);
+                           (longlong_t)scn->scn_restart_txg);
                }

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
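
The truncated dsl_scan.c diff above declares the sorted-queue machinery (scan_io_t, dsl_scan_io_queue_t, sio2bp()/bp2sio(), scan_exec_io()) but the code that drains the queues falls outside the visible portion. As a rough illustration of the issuing side only, the sketch below (function name and simplified flow are not taken from the commit) walks q_sios_by_addr in LBA order and issues I/Os until the per-vdev in-flight byte limit is reached.

        static void
        issue_sorted_sios(dsl_scan_io_queue_t *queue)
        {
                scan_io_t *sio;

                while ((sio = avl_first(&queue->q_sios_by_addr)) != NULL &&
                    queue->q_inflight_bytes < queue->q_maxinflight_bytes) {
                        blkptr_t bp;

                        avl_remove(&queue->q_sios_by_addr, sio);
                        sio2bp(sio, &bp, queue->q_vd->vdev_id);
                        queue->q_inflight_bytes += sio->sio_asize;
                        scan_exec_io(queue->q_scn->scn_dp, &bp, sio->sio_flags,
                            &sio->sio_zb, queue);
                        kmem_cache_free(sio_cache, sio);
                }
        }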