Author: mav
Date: Tue Jul 31 00:02:42 2018
New Revision: 336949
URL: https://svnweb.freebsd.org/changeset/base/336949

Log:
  MFV r336948: 9112 Improve allocation performance on high-end systems
  
  On high-end systems running async sequential write workloads, especially
  NUMA systems with flash or NVMe storage, one significant performance
  bottleneck is selecting a metaslab to do allocations from. This process
  can be parallelized, providing significant performance increases for
  these workloads.
  
  illumos/illumos-gate@f78cdc34af236a6199dd9e21376f4a46348c0d56
  
  Reviewed by: Matthew Ahrens <mahr...@delphix.com>
  Reviewed by: George Wilson <george.wil...@delphix.com>
  Reviewed by: Serapheim Dimitropoulos <serapheim.dimi...@delphix.com>
  Reviewed by: Alexander Motin <m...@freebsd.org>
  Approved by: Gordon Ross <g...@nexenta.com>
  Author: Paul Dagnelie <p...@delphix.com>

Modified:
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_removal.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
Directory Properties:
  head/sys/cddl/contrib/opensolaris/   (props changed)

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c      Mon Jul 
30 23:53:25 2018        (r336948)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c      Tue Jul 
31 00:02:42 2018        (r336949)
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  * Copyright (c) 2014 Integros [integros.com]
  */
@@ -275,6 +275,8 @@ static uint64_t metaslab_weight(metaslab_t *);
 static void metaslab_set_fragmentation(metaslab_t *);
 static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t);
 static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t);
+static void metaslab_passivate(metaslab_t *msp, uint64_t weight);
+static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp);
 
 kmem_cache_t *metaslab_alloc_trace_cache;
 
@@ -294,7 +296,12 @@ metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
        mc->mc_rotor = NULL;
        mc->mc_ops = ops;
        mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL);
-       refcount_create_tracked(&mc->mc_alloc_slots);
+       mc->mc_alloc_slots = kmem_zalloc(spa->spa_alloc_count *
+           sizeof (refcount_t), KM_SLEEP);
+       mc->mc_alloc_max_slots = kmem_zalloc(spa->spa_alloc_count *
+           sizeof (uint64_t), KM_SLEEP);
+       for (int i = 0; i < spa->spa_alloc_count; i++)
+               refcount_create_tracked(&mc->mc_alloc_slots[i]);
 
        return (mc);
 }
@@ -308,7 +315,12 @@ metaslab_class_destroy(metaslab_class_t *mc)
        ASSERT(mc->mc_space == 0);
        ASSERT(mc->mc_dspace == 0);
 
-       refcount_destroy(&mc->mc_alloc_slots);
+       for (int i = 0; i < mc->mc_spa->spa_alloc_count; i++)
+               refcount_destroy(&mc->mc_alloc_slots[i]);
+       kmem_free(mc->mc_alloc_slots, mc->mc_spa->spa_alloc_count *
+           sizeof (refcount_t));
+       kmem_free(mc->mc_alloc_max_slots, mc->mc_spa->spa_alloc_count *
+           sizeof (uint64_t));
        mutex_destroy(&mc->mc_lock);
        kmem_free(mc, sizeof (metaslab_class_t));
 }
@@ -532,6 +544,30 @@ metaslab_compare(const void *x1, const void *x2)
        const metaslab_t *m1 = x1;
        const metaslab_t *m2 = x2;
 
+       int sort1 = 0;
+       int sort2 = 0;
+       if (m1->ms_allocator != -1 && m1->ms_primary)
+               sort1 = 1;
+       else if (m1->ms_allocator != -1 && !m1->ms_primary)
+               sort1 = 2;
+       if (m2->ms_allocator != -1 && m2->ms_primary)
+               sort2 = 1;
+       else if (m2->ms_allocator != -1 && !m2->ms_primary)
+               sort2 = 2;
+
+       /*
+        * Sort inactive metaslabs first, then primaries, then secondaries. When
+        * selecting a metaslab to allocate from, an allocator first tries its
+        * primary, then secondary active metaslab. If it doesn't have active
+        * metaslabs, or can't allocate from them, it searches for an inactive
+        * metaslab to activate. If it can't find a suitable one, it will steal
+        * a primary or secondary metaslab from another allocator.
+        */
+       if (sort1 < sort2)
+               return (-1);
+       if (sort1 > sort2)
+               return (1);
+
        if (m1->ms_weight < m2->ms_weight)
                return (1);
        if (m1->ms_weight > m2->ms_weight)
@@ -683,12 +719,16 @@ metaslab_group_alloc_update(metaslab_group_t *mg)
 }
 
 metaslab_group_t *
-metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
+metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators)
 {
        metaslab_group_t *mg;
 
        mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
        mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
+       mg->mg_primaries = kmem_zalloc(allocators * sizeof (metaslab_t *),
+           KM_SLEEP);
+       mg->mg_secondaries = kmem_zalloc(allocators * sizeof (metaslab_t *),
+           KM_SLEEP);
        avl_create(&mg->mg_metaslab_tree, metaslab_compare,
            sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
        mg->mg_vd = vd;
@@ -696,8 +736,17 @@ metaslab_group_create(metaslab_class_t *mc, vdev_t *vd
        mg->mg_activation_count = 0;
        mg->mg_initialized = B_FALSE;
        mg->mg_no_free_space = B_TRUE;
-       refcount_create_tracked(&mg->mg_alloc_queue_depth);
+       mg->mg_allocators = allocators;
 
+       mg->mg_alloc_queue_depth = kmem_zalloc(allocators * sizeof (refcount_t),
+           KM_SLEEP);
+       mg->mg_cur_max_alloc_queue_depth = kmem_zalloc(allocators *
+           sizeof (uint64_t), KM_SLEEP);
+       for (int i = 0; i < allocators; i++) {
+               refcount_create_tracked(&mg->mg_alloc_queue_depth[i]);
+               mg->mg_cur_max_alloc_queue_depth[i] = 0;
+       }
+
        mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct,
            minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT);
 
@@ -718,8 +767,20 @@ metaslab_group_destroy(metaslab_group_t *mg)
 
        taskq_destroy(mg->mg_taskq);
        avl_destroy(&mg->mg_metaslab_tree);
+       kmem_free(mg->mg_primaries, mg->mg_allocators * sizeof (metaslab_t *));
+       kmem_free(mg->mg_secondaries, mg->mg_allocators *
+           sizeof (metaslab_t *));
        mutex_destroy(&mg->mg_lock);
-       refcount_destroy(&mg->mg_alloc_queue_depth);
+
+       for (int i = 0; i < mg->mg_allocators; i++) {
+               refcount_destroy(&mg->mg_alloc_queue_depth[i]);
+               mg->mg_cur_max_alloc_queue_depth[i] = 0;
+       }
+       kmem_free(mg->mg_alloc_queue_depth, mg->mg_allocators *
+           sizeof (refcount_t));
+       kmem_free(mg->mg_cur_max_alloc_queue_depth, mg->mg_allocators *
+           sizeof (uint64_t));
+
        kmem_free(mg, sizeof (metaslab_group_t));
 }
 
@@ -799,6 +860,22 @@ metaslab_group_passivate(metaslab_group_t *mg)
        taskq_wait(mg->mg_taskq);
        spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER);
        metaslab_group_alloc_update(mg);
+       for (int i = 0; i < mg->mg_allocators; i++) {
+               metaslab_t *msp = mg->mg_primaries[i];
+               if (msp != NULL) {
+                       mutex_enter(&msp->ms_lock);
+                       metaslab_passivate(msp,
+                           metaslab_weight_from_range_tree(msp));
+                       mutex_exit(&msp->ms_lock);
+               }
+               msp = mg->mg_secondaries[i];
+               if (msp != NULL) {
+                       mutex_enter(&msp->ms_lock);
+                       metaslab_passivate(msp,
+                           metaslab_weight_from_range_tree(msp));
+                       mutex_exit(&msp->ms_lock);
+               }
+       }
 
        mgprev = mg->mg_prev;
        mgnext = mg->mg_next;
@@ -940,6 +1017,17 @@ metaslab_group_remove(metaslab_group_t *mg, metaslab_t
 }
 
 static void
+metaslab_group_sort_impl(metaslab_group_t *mg, metaslab_t *msp, uint64_t 
weight)
+{
+       ASSERT(MUTEX_HELD(&mg->mg_lock));
+       ASSERT(msp->ms_group == mg);
+       avl_remove(&mg->mg_metaslab_tree, msp);
+       msp->ms_weight = weight;
+       avl_add(&mg->mg_metaslab_tree, msp);
+
+}
+
+static void
 metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
 {
        /*
@@ -950,10 +1038,7 @@ metaslab_group_sort(metaslab_group_t *mg, metaslab_t *
        ASSERT(MUTEX_HELD(&msp->ms_lock));
 
        mutex_enter(&mg->mg_lock);
-       ASSERT(msp->ms_group == mg);
-       avl_remove(&mg->mg_metaslab_tree, msp);
-       msp->ms_weight = weight;
-       avl_add(&mg->mg_metaslab_tree, msp);
+       metaslab_group_sort_impl(mg, msp, weight);
        mutex_exit(&mg->mg_lock);
 }
 
@@ -1001,7 +1086,7 @@ metaslab_group_fragmentation(metaslab_group_t *mg)
  */
 static boolean_t
 metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
-    uint64_t psize)
+    uint64_t psize, int allocator)
 {
        spa_t *spa = mg->mg_vd->vdev_spa;
        metaslab_class_t *mc = mg->mg_class;
@@ -1030,7 +1115,7 @@ metaslab_group_allocatable(metaslab_group_t *mg, metas
        if (mg->mg_allocatable) {
                metaslab_group_t *mgp;
                int64_t qdepth;
-               uint64_t qmax = mg->mg_max_alloc_queue_depth;
+               uint64_t qmax = mg->mg_cur_max_alloc_queue_depth[allocator];
 
                if (!mc->mc_alloc_throttle_enabled)
                        return (B_TRUE);
@@ -1042,7 +1127,7 @@ metaslab_group_allocatable(metaslab_group_t *mg, metas
                if (mg->mg_no_free_space)
                        return (B_FALSE);
 
-               qdepth = refcount_count(&mg->mg_alloc_queue_depth);
+               qdepth = refcount_count(&mg->mg_alloc_queue_depth[allocator]);
 
                /*
                 * If this metaslab group is below its qmax or it's
@@ -1061,9 +1146,10 @@ metaslab_group_allocatable(metaslab_group_t *mg, metas
                 * groups at the same time when we make this check.
                 */
                for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) {
-                       qmax = mgp->mg_max_alloc_queue_depth;
+                       qmax = mgp->mg_cur_max_alloc_queue_depth[allocator];
 
-                       qdepth = refcount_count(&mgp->mg_alloc_queue_depth);
+                       qdepth = refcount_count(
+                           &mgp->mg_alloc_queue_depth[allocator]);
 
                        /*
                         * If there is another metaslab group that
@@ -1471,6 +1557,8 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint6
        ms->ms_id = id;
        ms->ms_start = id << vd->vdev_ms_shift;
        ms->ms_size = 1ULL << vd->vdev_ms_shift;
+       ms->ms_allocator = -1;
+       ms->ms_new = B_TRUE;
 
        /*
         * We only open space map objects that already exist. All others
@@ -1567,6 +1655,7 @@ metaslab_fini(metaslab_t *msp)
        cv_destroy(&msp->ms_load_cv);
        mutex_destroy(&msp->ms_lock);
        mutex_destroy(&msp->ms_sync_lock);
+       ASSERT3U(msp->ms_allocator, ==, -1);
 
        kmem_free(msp, sizeof (metaslab_t));
 }
@@ -1963,19 +2052,59 @@ metaslab_weight(metaslab_t *msp)
 }
 
 static int
-metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
+metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp,
+    int allocator, uint64_t activation_weight)
 {
+       /*
+        * If we're activating for the claim code, we don't want to actually
+        * set the metaslab up for a specific allocator.
+        */
+       if (activation_weight == METASLAB_WEIGHT_CLAIM)
+               return (0);
+       metaslab_t **arr = (activation_weight == METASLAB_WEIGHT_PRIMARY ?
+           mg->mg_primaries : mg->mg_secondaries);
+
        ASSERT(MUTEX_HELD(&msp->ms_lock));
+       mutex_enter(&mg->mg_lock);
+       if (arr[allocator] != NULL) {
+               mutex_exit(&mg->mg_lock);
+               return (EEXIST);
+       }
 
+       arr[allocator] = msp;
+       ASSERT3S(msp->ms_allocator, ==, -1);
+       msp->ms_allocator = allocator;
+       msp->ms_primary = (activation_weight == METASLAB_WEIGHT_PRIMARY);
+       mutex_exit(&mg->mg_lock);
+
+       return (0);
+}
+
+static int
+metaslab_activate(metaslab_t *msp, int allocator, uint64_t activation_weight)
+{
+       ASSERT(MUTEX_HELD(&msp->ms_lock));
+
        if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
+               int error = 0;
                metaslab_load_wait(msp);
                if (!msp->ms_loaded) {
-                       int error = metaslab_load(msp);
-                       if (error) {
+                       if ((error = metaslab_load(msp)) != 0) {
                                metaslab_group_sort(msp->ms_group, msp, 0);
                                return (error);
                        }
                }
+               if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) {
+                       /*
+                        * The metaslab was activated for another allocator
+                        * while we were waiting, we should reselect.
+                        */
+                       return (EBUSY);
+               }
+               if ((error = metaslab_activate_allocator(msp->ms_group, msp,
+                   allocator, activation_weight)) != 0) {
+                       return (error);
+               }
 
                msp->ms_activation_weight = msp->ms_weight;
                metaslab_group_sort(msp->ms_group, msp,
@@ -1988,6 +2117,34 @@ metaslab_activate(metaslab_t *msp, uint64_t activation
 }
 
 static void
+metaslab_passivate_allocator(metaslab_group_t *mg, metaslab_t *msp,
+    uint64_t weight)
+{
+       ASSERT(MUTEX_HELD(&msp->ms_lock));
+       if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) {
+               metaslab_group_sort(mg, msp, weight);
+               return;
+       }
+
+       mutex_enter(&mg->mg_lock);
+       ASSERT3P(msp->ms_group, ==, mg);
+       if (msp->ms_primary) {
+               ASSERT3U(0, <=, msp->ms_allocator);
+               ASSERT3U(msp->ms_allocator, <, mg->mg_allocators);
+               ASSERT3P(mg->mg_primaries[msp->ms_allocator], ==, msp);
+               ASSERT(msp->ms_weight & METASLAB_WEIGHT_PRIMARY);
+               mg->mg_primaries[msp->ms_allocator] = NULL;
+       } else {
+               ASSERT(msp->ms_weight & METASLAB_WEIGHT_SECONDARY);
+               ASSERT3P(mg->mg_secondaries[msp->ms_allocator], ==, msp);
+               mg->mg_secondaries[msp->ms_allocator] = NULL;
+       }
+       msp->ms_allocator = -1;
+       metaslab_group_sort_impl(mg, msp, weight);
+       mutex_exit(&mg->mg_lock);
+}
+
+static void
 metaslab_passivate(metaslab_t *msp, uint64_t weight)
 {
        uint64_t size = weight & ~METASLAB_WEIGHT_TYPE;
@@ -2002,7 +2159,7 @@ metaslab_passivate(metaslab_t *msp, uint64_t weight)
        ASSERT0(weight & METASLAB_ACTIVE_MASK);
 
        msp->ms_activation_weight = 0;
-       metaslab_group_sort(msp->ms_group, msp, weight);
+       metaslab_passivate_allocator(msp->ms_group, msp, weight);
        ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
 }
 
@@ -2556,11 +2713,18 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
                vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
        }
 
+       if (msp->ms_new) {
+               msp->ms_new = B_FALSE;
+               mutex_enter(&mg->mg_lock);
+               mg->mg_ms_ready++;
+               mutex_exit(&mg->mg_lock);
+       }
        /*
         * Calculate the new weights before unloading any metaslabs.
         * This will give us the most accurate weighting.
         */
-       metaslab_group_sort(mg, msp, metaslab_weight(msp));
+       metaslab_group_sort(mg, msp, metaslab_weight(msp) |
+           (msp->ms_weight & METASLAB_ACTIVE_MASK));
 
        /*
         * If the metaslab is loaded and we've not tried to load or allocate
@@ -2572,6 +2736,10 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
                        VERIFY0(range_tree_space(
                            msp->ms_allocating[(txg + t) & TXG_MASK]));
                }
+               if (msp->ms_allocator != -1) {
+                       metaslab_passivate(msp, msp->ms_weight &
+                           ~METASLAB_ACTIVE_MASK);
+               }
 
                if (!metaslab_debug_unload)
                        metaslab_unload(msp);
@@ -2665,7 +2833,8 @@ metaslab_alloc_trace_fini(void)
  */
 static void
 metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg,
-    metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset)
+    metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset,
+    int allocator)
 {
        if (!metaslab_trace_enabled)
                return;
@@ -2698,6 +2867,7 @@ metaslab_trace_add(zio_alloc_list_t *zal, metaslab_gro
        mat->mat_dva_id = dva_id;
        mat->mat_offset = offset;
        mat->mat_weight = 0;
+       mat->mat_allocator = allocator;
 
        if (msp != NULL)
                mat->mat_weight = msp->ms_weight;
@@ -2738,35 +2908,56 @@ metaslab_trace_fini(zio_alloc_list_t *zal)
  */
 
 static void
-metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags)
+metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags,
+    int allocator)
 {
        if (!(flags & METASLAB_ASYNC_ALLOC) ||
-           flags & METASLAB_DONT_THROTTLE)
+           (flags & METASLAB_DONT_THROTTLE))
                return;
 
        metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
        if (!mg->mg_class->mc_alloc_throttle_enabled)
                return;
 
-       (void) refcount_add(&mg->mg_alloc_queue_depth, tag);
+       (void) refcount_add(&mg->mg_alloc_queue_depth[allocator], tag);
 }
 
+static void
+metaslab_group_increment_qdepth(metaslab_group_t *mg, int allocator)
+{
+       uint64_t max = mg->mg_max_alloc_queue_depth;
+       uint64_t cur = mg->mg_cur_max_alloc_queue_depth[allocator];
+       while (cur < max) {
+               if (atomic_cas_64(&mg->mg_cur_max_alloc_queue_depth[allocator],
+                   cur, cur + 1) == cur) {
+                       atomic_inc_64(
+                           &mg->mg_class->mc_alloc_max_slots[allocator]);
+                       return;
+               }
+               cur = mg->mg_cur_max_alloc_queue_depth[allocator];
+       }
+}
+
 void
-metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags)
+metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags,
+    int allocator, boolean_t io_complete)
 {
        if (!(flags & METASLAB_ASYNC_ALLOC) ||
-           flags & METASLAB_DONT_THROTTLE)
+           (flags & METASLAB_DONT_THROTTLE))
                return;
 
        metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
        if (!mg->mg_class->mc_alloc_throttle_enabled)
                return;
 
-       (void) refcount_remove(&mg->mg_alloc_queue_depth, tag);
+       (void) refcount_remove(&mg->mg_alloc_queue_depth[allocator], tag);
+       if (io_complete)
+               metaslab_group_increment_qdepth(mg, allocator);
 }
 
 void
-metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag)
+metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag,
+    int allocator)
 {
 #ifdef ZFS_DEBUG
        const dva_t *dva = bp->blk_dva;
@@ -2775,7 +2966,8 @@ metaslab_group_alloc_verify(spa_t *spa, const blkptr_t
        for (int d = 0; d < ndvas; d++) {
                uint64_t vdev = DVA_GET_VDEV(&dva[d]);
                metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
-               VERIFY(refcount_not_held(&mg->mg_alloc_queue_depth, tag));
+               VERIFY(refcount_not_held(&mg->mg_alloc_queue_depth[allocator],
+                   tag));
        }
 #endif
 }
@@ -2817,91 +3009,146 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, u
        return (start);
 }
 
+/*
+ * Find the metaslab with the highest weight that is less than what we've
+ * already tried.  In the common case, this means that we will examine each
+ * metaslab at most once. Note that concurrent callers could reorder metaslabs
+ * by activation/passivation once we have dropped the mg_lock. If a metaslab is
+ * activated by another thread, and we fail to allocate from the metaslab we
+ * have selected, we may not try the newly-activated metaslab, and instead
+ * activate another metaslab.  This is not optimal, but generally does not 
cause
+ * any problems (a possible exception being if every metaslab is completely 
full
+ * except for the the newly-activated metaslab which we fail to examine).
+ */
+static metaslab_t *
+find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight,
+    dva_t *dva, int d, uint64_t min_distance, uint64_t asize, int allocator,
+    zio_alloc_list_t *zal, metaslab_t *search, boolean_t *was_active)
+{
+       avl_index_t idx;
+       avl_tree_t *t = &mg->mg_metaslab_tree;
+       metaslab_t *msp = avl_find(t, search, &idx);
+       if (msp == NULL)
+               msp = avl_nearest(t, idx, AVL_AFTER);
+
+       for (; msp != NULL; msp = AVL_NEXT(t, msp)) {
+               int i;
+               if (!metaslab_should_allocate(msp, asize)) {
+                       metaslab_trace_add(zal, mg, msp, asize, d,
+                           TRACE_TOO_SMALL, allocator);
+                       continue;
+               }
+
+               /*
+                * If the selected metaslab is condensing, skip it.
+                */
+               if (msp->ms_condensing)
+                       continue;
+
+               *was_active = msp->ms_allocator != -1;
+               /*
+                * If we're activating as primary, this is our first allocation
+                * from this disk, so we don't need to check how close we are.
+                * If the metaslab under consideration was already active,
+                * we're getting desperate enough to steal another allocator's
+                * metaslab, so we still don't care about distances.
+                */
+               if (activation_weight == METASLAB_WEIGHT_PRIMARY || *was_active)
+                       break;
+
+               uint64_t target_distance = min_distance
+                   + (space_map_allocated(msp->ms_sm) != 0 ? 0 :
+                   min_distance >> 1);
+
+               for (i = 0; i < d; i++) {
+                       if (metaslab_distance(msp, &dva[i]) < target_distance)
+                               break;
+               }
+               if (i == d)
+                       break;
+       }
+
+       if (msp != NULL) {
+               search->ms_weight = msp->ms_weight;
+               search->ms_start = msp->ms_start + 1;
+               search->ms_allocator = msp->ms_allocator;
+               search->ms_primary = msp->ms_primary;
+       }
+       return (msp);
+}
+
+/* ARGSUSED */
 static uint64_t
 metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
-    uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d)
+    uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d,
+    int allocator)
 {
        metaslab_t *msp = NULL;
        uint64_t offset = -1ULL;
        uint64_t activation_weight;
-       uint64_t target_distance;
-       int i;
+       boolean_t tertiary = B_FALSE;
 
        activation_weight = METASLAB_WEIGHT_PRIMARY;
-       for (i = 0; i < d; i++) {
-               if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
+       for (int i = 0; i < d; i++) {
+               if (activation_weight == METASLAB_WEIGHT_PRIMARY &&
+                   DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
                        activation_weight = METASLAB_WEIGHT_SECONDARY;
+               } else if (activation_weight == METASLAB_WEIGHT_SECONDARY &&
+                   DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
+                       tertiary = B_TRUE;
                        break;
                }
        }
 
+       /*
+        * If we don't have enough metaslabs active to fill the entire array, we
+        * just use the 0th slot.
+        */
+       if (mg->mg_ms_ready < mg->mg_allocators * 2) {
+               tertiary = B_FALSE;
+               allocator = 0;
+       }
+
+       ASSERT3U(mg->mg_vd->vdev_ms_count, >=, 2);
+
        metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP);
        search->ms_weight = UINT64_MAX;
        search->ms_start = 0;
+       /*
+        * At the end of the metaslab tree are the already-active metaslabs,
+        * first the primaries, then the secondaries. When we resume searching
+        * through the tree, we need to consider ms_allocator and ms_primary so
+        * we start in the location right after where we left off, and don't
+        * accidentally loop forever considering the same metaslabs.
+        */
+       search->ms_allocator = -1;
+       search->ms_primary = B_TRUE;
        for (;;) {
-               boolean_t was_active;
-               avl_tree_t *t = &mg->mg_metaslab_tree;
-               avl_index_t idx;
+               boolean_t was_active = B_FALSE;
 
                mutex_enter(&mg->mg_lock);
 
-               /*
-                * Find the metaslab with the highest weight that is less
-                * than what we've already tried.  In the common case, this
-                * means that we will examine each metaslab at most once.
-                * Note that concurrent callers could reorder metaslabs
-                * by activation/passivation once we have dropped the mg_lock.
-                * If a metaslab is activated by another thread, and we fail
-                * to allocate from the metaslab we have selected, we may
-                * not try the newly-activated metaslab, and instead activate
-                * another metaslab.  This is not optimal, but generally
-                * does not cause any problems (a possible exception being
-                * if every metaslab is completely full except for the
-                * the newly-activated metaslab which we fail to examine).
-                */
-               msp = avl_find(t, search, &idx);
-               if (msp == NULL)
-                       msp = avl_nearest(t, idx, AVL_AFTER);
-               for (; msp != NULL; msp = AVL_NEXT(t, msp)) {
-
-                       if (!metaslab_should_allocate(msp, asize)) {
-                               metaslab_trace_add(zal, mg, msp, asize, d,
-                                   TRACE_TOO_SMALL);
-                               continue;
-                       }
-
-                       /*
-                        * If the selected metaslab is condensing, skip it.
-                        */
-                       if (msp->ms_condensing)
-                               continue;
-
-                       was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
-                       if (activation_weight == METASLAB_WEIGHT_PRIMARY)
-                               break;
-
-                       target_distance = min_distance +
-                           (space_map_allocated(msp->ms_sm) != 0 ? 0 :
-                           min_distance >> 1);
-
-                       for (i = 0; i < d; i++) {
-                               if (metaslab_distance(msp, &dva[i]) <
-                                   target_distance)
-                                       break;
-                       }
-                       if (i == d)
-                               break;
+               if (activation_weight == METASLAB_WEIGHT_PRIMARY &&
+                   mg->mg_primaries[allocator] != NULL) {
+                       msp = mg->mg_primaries[allocator];
+                       was_active = B_TRUE;
+               } else if (activation_weight == METASLAB_WEIGHT_SECONDARY &&
+                   mg->mg_secondaries[allocator] != NULL && !tertiary) {
+                       msp = mg->mg_secondaries[allocator];
+                       was_active = B_TRUE;
+               } else {
+                       msp = find_valid_metaslab(mg, activation_weight, dva, d,
+                           min_distance, asize, allocator, zal, search,
+                           &was_active);
                }
+
                mutex_exit(&mg->mg_lock);
                if (msp == NULL) {
                        kmem_free(search, sizeof (*search));
                        return (-1ULL);
                }
-               search->ms_weight = msp->ms_weight;
-               search->ms_start = msp->ms_start + 1;
 
                mutex_enter(&msp->ms_lock);
-
                /*
                 * Ensure that the metaslab we have selected is still
                 * capable of handling our request. It's possible that
@@ -2915,18 +3162,32 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_
                        continue;
                }
 
-               if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) &&
-                   activation_weight == METASLAB_WEIGHT_PRIMARY) {
-                       metaslab_passivate(msp,
-                           msp->ms_weight & ~METASLAB_ACTIVE_MASK);
+               /*
+                * If the metaslab is freshly activated for an allocator that
+                * isn't the one we're allocating from, or if it's a primary and
+                * we're seeking a secondary (or vice versa), we go back and
+                * select a new metaslab.
+                */
+               if (!was_active && (msp->ms_weight & METASLAB_ACTIVE_MASK) &&
+                   (msp->ms_allocator != -1) &&
+                   (msp->ms_allocator != allocator || ((activation_weight ==
+                   METASLAB_WEIGHT_PRIMARY) != msp->ms_primary))) {
                        mutex_exit(&msp->ms_lock);
                        continue;
                }
 
-               if (metaslab_activate(msp, activation_weight) != 0) {
+               if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) {
+                       metaslab_passivate(msp, msp->ms_weight &
+                           ~METASLAB_WEIGHT_CLAIM);
                        mutex_exit(&msp->ms_lock);
                        continue;
                }
+
+               if (metaslab_activate(msp, allocator, activation_weight) != 0) {
+                       mutex_exit(&msp->ms_lock);
+                       continue;
+               }
+
                msp->ms_selected_txg = txg;
 
                /*
@@ -2939,7 +3200,7 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_
                if (!metaslab_should_allocate(msp, asize)) {
                        /* Passivate this metaslab and select a new one. */
                        metaslab_trace_add(zal, mg, msp, asize, d,
-                           TRACE_TOO_SMALL);
+                           TRACE_TOO_SMALL, allocator);
                        goto next;
                }
 
@@ -2950,13 +3211,15 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_
                 */
                if (msp->ms_condensing) {
                        metaslab_trace_add(zal, mg, msp, asize, d,
-                           TRACE_CONDENSING);
+                           TRACE_CONDENSING, allocator);
+                       metaslab_passivate(msp, msp->ms_weight &
+                           ~METASLAB_ACTIVE_MASK);
                        mutex_exit(&msp->ms_lock);
                        continue;
                }
 
                offset = metaslab_block_alloc(msp, asize, txg);
-               metaslab_trace_add(zal, mg, msp, asize, d, offset);
+               metaslab_trace_add(zal, mg, msp, asize, d, offset, allocator);
 
                if (offset != -1ULL) {
                        /* Proactively passivate the metaslab, if needed */
@@ -3012,19 +3275,20 @@ next:
 
 static uint64_t
 metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal,
-    uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d)
+    uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d,
+    int allocator)
 {
        uint64_t offset;
        ASSERT(mg->mg_initialized);
 
        offset = metaslab_group_alloc_normal(mg, zal, asize, txg,
-           min_distance, dva, d);
+           min_distance, dva, d, allocator);
 
        mutex_enter(&mg->mg_lock);
        if (offset == -1ULL) {
                mg->mg_failed_allocations++;
                metaslab_trace_add(zal, mg, NULL, asize, d,
-                   TRACE_GROUP_FAILURE);
+                   TRACE_GROUP_FAILURE, allocator);
                if (asize == SPA_GANGBLOCKSIZE) {
                        /*
                         * This metaslab group was unable to allocate
@@ -3059,7 +3323,7 @@ int ditto_same_vdev_distance_shift = 3;
 int
 metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
     dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags,
-    zio_alloc_list_t *zal)
+    zio_alloc_list_t *zal, int allocator)
 {
        metaslab_group_t *mg, *rotor;
        vdev_t *vd;
@@ -3071,7 +3335,8 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, u
         * For testing, make some blocks above a certain size be gang blocks.
         */
        if (psize >= metaslab_force_ganging && (ddi_get_lbolt() & 3) == 0) {
-               metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG);
+               metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG,
+                   allocator);
                return (SET_ERROR(ENOSPC));
        }
 
@@ -3157,12 +3422,12 @@ top:
                 */
                if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) {
                        allocatable = metaslab_group_allocatable(mg, rotor,
-                           psize);
+                           psize, allocator);
                }
 
                if (!allocatable) {
                        metaslab_trace_add(zal, mg, NULL, psize, d,
-                           TRACE_NOT_ALLOCATABLE);
+                           TRACE_NOT_ALLOCATABLE, allocator);
                        goto next;
                }
 
@@ -3177,7 +3442,7 @@ top:
                    vd->vdev_state < VDEV_STATE_HEALTHY) &&
                    d == 0 && !try_hard && vd->vdev_children == 0) {
                        metaslab_trace_add(zal, mg, NULL, psize, d,
-                           TRACE_VDEV_ERROR);
+                           TRACE_VDEV_ERROR, allocator);
                        goto next;
                }
 
@@ -3201,7 +3466,7 @@ top:
                ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
 
                uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg,
-                   distance, dva, d);
+                   distance, dva, d, allocator);
 
                if (offset != -1ULL) {
                        /*
@@ -3264,7 +3529,7 @@ next:
 
        bzero(&dva[d], sizeof (dva_t));
 
-       metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC);
+       metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC, allocator);
        return (SET_ERROR(ENOSPC));
 }
 
@@ -3565,18 +3830,20 @@ metaslab_free_dva(spa_t *spa, const dva_t *dva, boolea
  * the reservation.
  */
 boolean_t
-metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio,
-    int flags)
+metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator,
+    zio_t *zio, int flags)
 {
        uint64_t available_slots = 0;
        boolean_t slot_reserved = B_FALSE;
+       uint64_t max = mc->mc_alloc_max_slots[allocator];
 
        ASSERT(mc->mc_alloc_throttle_enabled);
        mutex_enter(&mc->mc_lock);
 
-       uint64_t reserved_slots = refcount_count(&mc->mc_alloc_slots);
-       if (reserved_slots < mc->mc_alloc_max_slots)
-               available_slots = mc->mc_alloc_max_slots - reserved_slots;
+       uint64_t reserved_slots =
+           refcount_count(&mc->mc_alloc_slots[allocator]);
+       if (reserved_slots < max)
+               available_slots = max - reserved_slots;
 
        if (slots <= available_slots || GANG_ALLOCATION(flags)) {
                /*
@@ -3584,7 +3851,9 @@ metaslab_class_throttle_reserve(metaslab_class_t *mc, 
                 * them individually when an I/O completes.
                 */
                for (int d = 0; d < slots; d++) {
-                       reserved_slots = refcount_add(&mc->mc_alloc_slots, zio);
+                       reserved_slots =
+                           refcount_add(&mc->mc_alloc_slots[allocator],
+                           zio);
                }
                zio->io_flags |= ZIO_FLAG_IO_ALLOCATING;
                slot_reserved = B_TRUE;
@@ -3595,12 +3864,14 @@ metaslab_class_throttle_reserve(metaslab_class_t *mc, 
 }
 
 void
-metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, zio_t *zio)
+metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots,
+    int allocator, zio_t *zio)
 {
        ASSERT(mc->mc_alloc_throttle_enabled);
        mutex_enter(&mc->mc_lock);
        for (int d = 0; d < slots; d++) {
-               (void) refcount_remove(&mc->mc_alloc_slots, zio);
+               (void) refcount_remove(&mc->mc_alloc_slots[allocator],
+                   zio);
        }
        mutex_exit(&mc->mc_lock);
 }
@@ -3622,7 +3893,13 @@ metaslab_claim_concrete(vdev_t *vd, uint64_t offset, u
        mutex_enter(&msp->ms_lock);
 
        if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded)
-               error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
+               error = metaslab_activate(msp, 0, METASLAB_WEIGHT_CLAIM);
+       /*
+        * No need to fail in that case; someone else has activated the
+        * metaslab, but that doesn't preclude us from using it.
+        */
+       if (error == EBUSY)
+               error = 0;
 
        if (error == 0 &&
            !range_tree_contains(msp->ms_allocatable, offset, size))
@@ -3727,7 +4004,7 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint6
 int
 metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
     int ndvas, uint64_t txg, blkptr_t *hintbp, int flags,
-    zio_alloc_list_t *zal, zio_t *zio)
+    zio_alloc_list_t *zal, zio_t *zio, int allocator)
 {
        dva_t *dva = bp->blk_dva;
        dva_t *hintdva = hintbp->blk_dva;
@@ -3750,12 +4027,13 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint6
 
        for (int d = 0; d < ndvas; d++) {
                error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
-                   txg, flags, zal);
+                   txg, flags, zal, allocator);
                if (error != 0) {
                        for (d--; d >= 0; d--) {
                                metaslab_unalloc_dva(spa, &dva[d], txg);
                                metaslab_group_alloc_decrement(spa,
-                                   DVA_GET_VDEV(&dva[d]), zio, flags);
+                                   DVA_GET_VDEV(&dva[d]), zio, flags,
+                                   allocator, B_FALSE);
                                bzero(&dva[d], sizeof (dva_t));
                        }
                        spa_config_exit(spa, SCL_ALLOC, FTAG);
@@ -3766,7 +4044,7 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint6
                         * based on the newly allocated dva.
                         */
                        metaslab_group_alloc_increment(spa,
-                           DVA_GET_VDEV(&dva[d]), zio, flags);
+                           DVA_GET_VDEV(&dva[d]), zio, flags, allocator);
                }
 
        }

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c   Mon Jul 30 
23:53:25 2018        (r336948)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c   Tue Jul 31 
00:02:42 2018        (r336949)
@@ -7776,9 +7776,11 @@ spa_sync(spa_t *spa, uint64_t txg)
        spa->spa_syncing_txg = txg;
        spa->spa_sync_pass = 0;
 
-       mutex_enter(&spa->spa_alloc_lock);
-       VERIFY0(avl_numnodes(&spa->spa_alloc_tree));
-       mutex_exit(&spa->spa_alloc_lock);
+       for (int i = 0; i < spa->spa_alloc_count; i++) {
+               mutex_enter(&spa->spa_alloc_locks[i]);
+               VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i]));
+               mutex_exit(&spa->spa_alloc_locks[i]);
+       }
 
        /*
         * If there are any pending vdev state changes, convert them
@@ -7844,7 +7846,7 @@ spa_sync(spa_t *spa, uint64_t txg)
         * The max queue depth will not change in the middle of syncing
         * out this txg.
         */
-       uint64_t queue_depth_total = 0;
+       uint64_t slots_per_allocator = 0;
        for (int c = 0; c < rvd->vdev_children; c++) {
                vdev_t *tvd = rvd->vdev_child[c];
                metaslab_group_t *mg = tvd->vdev_mg;
@@ -7858,18 +7860,23 @@ spa_sync(spa_t *spa, uint64_t txg)
                 * allocations look at mg_max_alloc_queue_depth, and async
                 * allocations all happen from spa_sync().
                 */
-               ASSERT0(refcount_count(&mg->mg_alloc_queue_depth));
+               for (int i = 0; i < spa->spa_alloc_count; i++)
+                       ASSERT0(refcount_count(&(mg->mg_alloc_queue_depth[i])));
                mg->mg_max_alloc_queue_depth = max_queue_depth;
-               queue_depth_total += mg->mg_max_alloc_queue_depth;
+
+               for (int i = 0; i < spa->spa_alloc_count; i++) {
+                       mg->mg_cur_max_alloc_queue_depth[i] =
+                           zfs_vdev_def_queue_depth;
+               }
+               slots_per_allocator += zfs_vdev_def_queue_depth;
        }
        metaslab_class_t *mc = spa_normal_class(spa);
-       ASSERT0(refcount_count(&mc->mc_alloc_slots));
-       mc->mc_alloc_max_slots = queue_depth_total;
+       for (int i = 0; i < spa->spa_alloc_count; i++) {
+               ASSERT0(refcount_count(&mc->mc_alloc_slots[i]));
+               mc->mc_alloc_max_slots[i] = slots_per_allocator;
+       }
        mc->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
 
-       ASSERT3U(mc->mc_alloc_max_slots, <=,
-           max_queue_depth * rvd->vdev_children);
-
        for (int c = 0; c < rvd->vdev_children; c++) {
                vdev_t *vd = rvd->vdev_child[c];
                vdev_indirect_state_sync_verify(vd);
@@ -8052,9 +8059,11 @@ spa_sync(spa_t *spa, uint64_t txg)
 
        dsl_pool_sync_done(dp, txg);
 
-       mutex_enter(&spa->spa_alloc_lock);
-       VERIFY0(avl_numnodes(&spa->spa_alloc_tree));
-       mutex_exit(&spa->spa_alloc_lock);
+       for (int i = 0; i < spa->spa_alloc_count; i++) {

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
_______________________________________________
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"

Reply via email to