Author: avg
Date: Wed Nov 20 10:41:10 2013
New Revision: 258371
URL: http://svnweb.freebsd.org/changeset/base/258371

Log:
  4101 metaslab_debug should allow for fine-grained control
  
  4102 space_maps should store more information about themselves
  4103 space map object blocksize should be increased
  4104 ::spa_space no longer works
  4105 removing a mirrored log device results in a leaked object
  4106 asynchronously load metaslab
  
  Revision illumos/illumos-gate@0713e232b7712cd27d99e1e935ebb8d5de61c57d

Added:
  vendor-sys/illumos/dist/uts/common/fs/zfs/range_tree.c/
  vendor-sys/illumos/dist/uts/common/fs/zfs/range_tree.c/range_tree.c   
(contents, props changed)
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/space_reftree.h/
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/space_reftree.h/space_reftree.h 
  (contents, props changed)
Modified:
  vendor-sys/illumos/dist/common/zfs/zfeature_common.c
  vendor-sys/illumos/dist/common/zfs/zfeature_common.h
  vendor-sys/illumos/dist/uts/common/Makefile.files
  vendor-sys/illumos/dist/uts/common/fs/zfs/dnode.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/metaslab.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/spa.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/spa_misc.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/space_map.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/metaslab.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/metaslab_impl.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/space_map.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/vdev_impl.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zfeature.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/vdev.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_label.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/zfeature.c

Changes in other areas also in this revision:
Modified:
  vendor/illumos/dist/cmd/zdb/zdb.c
  vendor/illumos/dist/cmd/ztest/ztest.c
  vendor/illumos/dist/man/man5/zpool-features.5

Modified: vendor-sys/illumos/dist/common/zfs/zfeature_common.c
==============================================================================
--- vendor-sys/illumos/dist/common/zfs/zfeature_common.c        Wed Nov 20 
10:38:37 2013        (r258370)
+++ vendor-sys/illumos/dist/common/zfs/zfeature_common.c        Wed Nov 20 
10:41:10 2013        (r258371)
@@ -20,7 +20,7 @@
  */
 
 /*
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  */
@@ -164,4 +164,7 @@ zpool_feature_init(void)
        zfeature_register(SPA_FEATURE_MULTI_VDEV_CRASH_DUMP,
            "com.joyent:multi_vdev_crash_dump", "multi_vdev_crash_dump",
            "Crash dumps to multiple vdev pools.", B_FALSE, B_FALSE, NULL);
+       zfeature_register(SPA_FEATURE_SPACEMAP_HISTOGRAM,
+           "com.delphix:spacemap_histogram", "spacemap_histogram",
+           "Spacemaps maintain space histograms.", B_TRUE, B_FALSE, NULL);
 }

Modified: vendor-sys/illumos/dist/common/zfs/zfeature_common.h
==============================================================================
--- vendor-sys/illumos/dist/common/zfs/zfeature_common.h        Wed Nov 20 
10:38:37 2013        (r258370)
+++ vendor-sys/illumos/dist/common/zfs/zfeature_common.h        Wed Nov 20 
10:41:10 2013        (r258371)
@@ -20,7 +20,7 @@
  */
 
 /*
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  */
@@ -56,6 +56,7 @@ enum spa_feature {
        SPA_FEATURE_EMPTY_BPOBJ,
        SPA_FEATURE_LZ4_COMPRESS,
        SPA_FEATURE_MULTI_VDEV_CRASH_DUMP,
+       SPA_FEATURE_SPACEMAP_HISTOGRAM,
        SPA_FEATURES
 } spa_feature_t;
 

Modified: vendor-sys/illumos/dist/uts/common/Makefile.files
==============================================================================
--- vendor-sys/illumos/dist/uts/common/Makefile.files   Wed Nov 20 10:38:37 
2013        (r258370)
+++ vendor-sys/illumos/dist/uts/common/Makefile.files   Wed Nov 20 10:41:10 
2013        (r258371)
@@ -22,7 +22,7 @@
 #
 # Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
 # Copyright (c) 2012 Nexenta Systems, Inc. All rights reserved.
-# Copyright (c) 2012 by Delphix. All rights reserved.
+# Copyright (c) 2013 by Delphix. All rights reserved.
 # Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
 #
 
@@ -1359,6 +1359,7 @@ ZFS_COMMON_OBJS +=                \
        lz4.o                   \
        lzjb.o                  \
        metaslab.o              \
+       range_tree.o            \
        refcount.o              \
        rrwlock.o               \
        sa.o                    \
@@ -1369,6 +1370,7 @@ ZFS_COMMON_OBJS +=                \
        spa_history.o           \
        spa_misc.o              \
        space_map.o             \
+       space_reftree.o         \
        txg.o                   \
        uberblock.o             \
        unique.o                \

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/dnode.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/dnode.c   Wed Nov 20 10:38:37 
2013        (r258370)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/dnode.c   Wed Nov 20 10:41:10 
2013        (r258371)
@@ -1334,7 +1334,7 @@ dnode_set_blksz(dnode_t *dn, uint64_t si
        rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 
        /* Check for any allocated blocks beyond the first */
-       if (dn->dn_phys->dn_maxblkid != 0)
+       if (dn->dn_maxblkid != 0)
                goto fail;
 
        mutex_enter(&dn->dn_dbufs_mtx);

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/metaslab.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/metaslab.c        Wed Nov 20 
10:38:37 2013        (r258370)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/metaslab.c        Wed Nov 20 
10:41:10 2013        (r258371)
@@ -31,6 +31,7 @@
 #include <sys/metaslab_impl.h>
 #include <sys/vdev_impl.h>
 #include <sys/zio.h>
+#include <sys/spa_impl.h>
 
 /*
  * Allow allocations to switch to gang blocks quickly. We do this to
@@ -44,6 +45,11 @@
        (!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \
        METASLAB_GANG_AVOID)))
 
+#define        METASLAB_WEIGHT_PRIMARY         (1ULL << 63)
+#define        METASLAB_WEIGHT_SECONDARY       (1ULL << 62)
+#define        METASLAB_ACTIVE_MASK            \
+       (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)
+
 uint64_t metaslab_aliquot = 512ULL << 10;
 uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1;    /* force gang blocks */
 
@@ -79,9 +85,14 @@ int zfs_mg_alloc_failures = 0;
 int zfs_mg_noalloc_threshold = 0;
 
 /*
- * Metaslab debugging: when set, keeps all space maps in core to verify frees.
+ * When set will load all metaslabs when pool is first opened.
  */
-static int metaslab_debug = 0;
+int metaslab_debug_load = 0;
+
+/*
+ * When set will prevent metaslabs from being unloaded.
+ */
+int metaslab_debug_unload = 0;
 
 /*
  * Minimum size which forces the dynamic allocator to change
@@ -106,14 +117,16 @@ int metaslab_df_free_pct = 4;
 uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS;
 
 /*
- * Max number of space_maps to prefetch.
+ * Percentage of all cpus that can be used by the metaslab taskq.
  */
-int metaslab_prefetch_limit = SPA_DVAS_PER_BP;
+int metaslab_load_pct = 50;
 
 /*
- * Percentage bonus multiplier for metaslabs that are in the bonus area.
+ * Determines how many txgs a metaslab may remain loaded without having any
+ * allocations from it. As long as a metaslab continues to be used we will
+ * keep it loaded.
  */
-int metaslab_smo_bonus_pct = 150;
+int metaslab_unload_delay = TXG_SIZE * 2;
 
 /*
  * Should we be willing to write data to degraded vdevs?
@@ -121,12 +134,28 @@ int metaslab_smo_bonus_pct = 150;
 boolean_t zfs_write_to_degraded = B_FALSE;
 
 /*
+ * Max number of metaslabs per group to preload.
+ */
+int metaslab_preload_limit = SPA_DVAS_PER_BP;
+
+/*
+ * Enable/disable preloading of metaslab.
+ */
+boolean_t metaslab_preload_enabled = B_TRUE;
+
+/*
+ * Enable/disable additional weight factor for each metaslab.
+ */
+boolean_t metaslab_weight_factor_enable = B_FALSE;
+
+
+/*
  * ==========================================================================
  * Metaslab classes
  * ==========================================================================
  */
 metaslab_class_t *
-metaslab_class_create(spa_t *spa, space_map_ops_t *ops)
+metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
 {
        metaslab_class_t *mc;
 
@@ -230,9 +259,9 @@ metaslab_compare(const void *x1, const v
        /*
         * If the weights are identical, use the offset to force uniqueness.
         */
-       if (m1->ms_map->sm_start < m2->ms_map->sm_start)
+       if (m1->ms_start < m2->ms_start)
                return (-1);
-       if (m1->ms_map->sm_start > m2->ms_map->sm_start)
+       if (m1->ms_start > m2->ms_start)
                return (1);
 
        ASSERT3P(m1, ==, m2);
@@ -300,6 +329,9 @@ metaslab_group_create(metaslab_class_t *
        mg->mg_class = mc;
        mg->mg_activation_count = 0;
 
+       mg->mg_taskq = taskq_create("metaslab_group_tasksq", metaslab_load_pct,
+           minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT);
+
        return (mg);
 }
 
@@ -368,6 +400,8 @@ metaslab_group_passivate(metaslab_group_
                return;
        }
 
+       taskq_wait(mg->mg_taskq);
+
        mgprev = mg->mg_prev;
        mgnext = mg->mg_next;
 
@@ -447,130 +481,200 @@ metaslab_group_allocatable(metaslab_grou
 
 /*
  * ==========================================================================
- * Common allocator routines
+ * Range tree callbacks
  * ==========================================================================
  */
+
+/*
+ * Comparison function for the private size-ordered tree. Tree is sorted
+ * by size, larger sizes at the end of the tree.
+ */
 static int
-metaslab_segsize_compare(const void *x1, const void *x2)
+metaslab_rangesize_compare(const void *x1, const void *x2)
 {
-       const space_seg_t *s1 = x1;
-       const space_seg_t *s2 = x2;
-       uint64_t ss_size1 = s1->ss_end - s1->ss_start;
-       uint64_t ss_size2 = s2->ss_end - s2->ss_start;
+       const range_seg_t *r1 = x1;
+       const range_seg_t *r2 = x2;
+       uint64_t rs_size1 = r1->rs_end - r1->rs_start;
+       uint64_t rs_size2 = r2->rs_end - r2->rs_start;
 
-       if (ss_size1 < ss_size2)
+       if (rs_size1 < rs_size2)
                return (-1);
-       if (ss_size1 > ss_size2)
+       if (rs_size1 > rs_size2)
                return (1);
 
-       if (s1->ss_start < s2->ss_start)
+       if (r1->rs_start < r2->rs_start)
                return (-1);
-       if (s1->ss_start > s2->ss_start)
+
+       if (r1->rs_start > r2->rs_start)
                return (1);
 
        return (0);
 }
 
 /*
- * This is a helper function that can be used by the allocator to find
- * a suitable block to allocate. This will search the specified AVL
- * tree looking for a block that matches the specified criteria.
+ * Create any block allocator specific components. The current allocators
+ * rely on using both a size-ordered range_tree_t and an array of uint64_t's.
  */
-static uint64_t
-metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
-    uint64_t align)
+static void
+metaslab_rt_create(range_tree_t *rt, void *arg)
 {
-       space_seg_t *ss, ssearch;
-       avl_index_t where;
-
-       ssearch.ss_start = *cursor;
-       ssearch.ss_end = *cursor + size;
-
-       ss = avl_find(t, &ssearch, &where);
-       if (ss == NULL)
-               ss = avl_nearest(t, where, AVL_AFTER);
+       metaslab_t *msp = arg;
 
-       while (ss != NULL) {
-               uint64_t offset = P2ROUNDUP(ss->ss_start, align);
-
-               if (offset + size <= ss->ss_end) {
-                       *cursor = offset + size;
-                       return (offset);
-               }
-               ss = AVL_NEXT(t, ss);
-       }
+       ASSERT3P(rt->rt_arg, ==, msp);
+       ASSERT(msp->ms_tree == NULL);
 
-       /*
-        * If we know we've searched the whole map (*cursor == 0), give up.
-        * Otherwise, reset the cursor to the beginning and try again.
-        */
-       if (*cursor == 0)
-               return (-1ULL);
-
-       *cursor = 0;
-       return (metaslab_block_picker(t, cursor, size, align));
+       avl_create(&msp->ms_size_tree, metaslab_rangesize_compare,
+           sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
 }
 
+/*
+ * Destroy the block allocator specific components.
+ */
 static void
-metaslab_pp_load(space_map_t *sm)
+metaslab_rt_destroy(range_tree_t *rt, void *arg)
 {
-       space_seg_t *ss;
+       metaslab_t *msp = arg;
 
-       ASSERT(sm->sm_ppd == NULL);
-       sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP);
+       ASSERT3P(rt->rt_arg, ==, msp);
+       ASSERT3P(msp->ms_tree, ==, rt);
+       ASSERT0(avl_numnodes(&msp->ms_size_tree));
 
-       sm->sm_pp_root = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
-       avl_create(sm->sm_pp_root, metaslab_segsize_compare,
-           sizeof (space_seg_t), offsetof(struct space_seg, ss_pp_node));
-
-       for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss))
-               avl_add(sm->sm_pp_root, ss);
+       avl_destroy(&msp->ms_size_tree);
 }
 
 static void
-metaslab_pp_unload(space_map_t *sm)
+metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg)
 {
-       void *cookie = NULL;
-
-       kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t));
-       sm->sm_ppd = NULL;
+       metaslab_t *msp = arg;
 
-       while (avl_destroy_nodes(sm->sm_pp_root, &cookie) != NULL) {
-               /* tear down the tree */
-       }
-
-       avl_destroy(sm->sm_pp_root);
-       kmem_free(sm->sm_pp_root, sizeof (avl_tree_t));
-       sm->sm_pp_root = NULL;
+       ASSERT3P(rt->rt_arg, ==, msp);
+       ASSERT3P(msp->ms_tree, ==, rt);
+       VERIFY(!msp->ms_condensing);
+       avl_add(&msp->ms_size_tree, rs);
 }
 
-/* ARGSUSED */
 static void
-metaslab_pp_claim(space_map_t *sm, uint64_t start, uint64_t size)
+metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
 {
-       /* No need to update cursor */
+       metaslab_t *msp = arg;
+
+       ASSERT3P(rt->rt_arg, ==, msp);
+       ASSERT3P(msp->ms_tree, ==, rt);
+       VERIFY(!msp->ms_condensing);
+       avl_remove(&msp->ms_size_tree, rs);
 }
 
-/* ARGSUSED */
 static void
-metaslab_pp_free(space_map_t *sm, uint64_t start, uint64_t size)
+metaslab_rt_vacate(range_tree_t *rt, void *arg)
 {
-       /* No need to update cursor */
+       metaslab_t *msp = arg;
+
+       ASSERT3P(rt->rt_arg, ==, msp);
+       ASSERT3P(msp->ms_tree, ==, rt);
+
+       /*
+        * Normally one would walk the tree freeing nodes along the way.
+        * Since the nodes are shared with the range trees we can avoid
+        * walking all nodes and just reinitialize the avl tree. The nodes
+        * will be freed by the range tree, so we don't want to free them here.
+        */
+       avl_create(&msp->ms_size_tree, metaslab_rangesize_compare,
+           sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
 }
 
+static range_tree_ops_t metaslab_rt_ops = {
+       metaslab_rt_create,
+       metaslab_rt_destroy,
+       metaslab_rt_add,
+       metaslab_rt_remove,
+       metaslab_rt_vacate
+};
+
+/*
+ * ==========================================================================
+ * Metaslab block operations
+ * ==========================================================================
+ */
+
 /*
  * Return the maximum contiguous segment within the metaslab.
  */
 uint64_t
-metaslab_pp_maxsize(space_map_t *sm)
+metaslab_block_maxsize(metaslab_t *msp)
 {
-       avl_tree_t *t = sm->sm_pp_root;
-       space_seg_t *ss;
+       avl_tree_t *t = &msp->ms_size_tree;
+       range_seg_t *rs;
 
-       if (t == NULL || (ss = avl_last(t)) == NULL)
+       if (t == NULL || (rs = avl_last(t)) == NULL)
                return (0ULL);
 
-       return (ss->ss_end - ss->ss_start);
+       return (rs->rs_end - rs->rs_start);
+}
+
+uint64_t
+metaslab_block_alloc(metaslab_t *msp, uint64_t size)
+{
+       uint64_t start;
+       range_tree_t *rt = msp->ms_tree;
+
+       VERIFY(!msp->ms_condensing);
+
+       start = msp->ms_ops->msop_alloc(msp, size);
+       if (start != -1ULL) {
+               vdev_t *vd = msp->ms_group->mg_vd;
+
+               VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift));
+               VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
+               VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size);
+               range_tree_remove(rt, start, size);
+       }
+       return (start);
+}
+
+/*
+ * ==========================================================================
+ * Common allocator routines
+ * ==========================================================================
+ */
+
+/*
+ * This is a helper function that can be used by the allocator to find
+ * a suitable block to allocate. This will search the specified AVL
+ * tree looking for a block that matches the specified criteria.
+ */
+static uint64_t
+metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
+    uint64_t align)
+{
+       range_seg_t *rs, rsearch;
+       avl_index_t where;
+
+       rsearch.rs_start = *cursor;
+       rsearch.rs_end = *cursor + size;
+
+       rs = avl_find(t, &rsearch, &where);
+       if (rs == NULL)
+               rs = avl_nearest(t, where, AVL_AFTER);
+
+       while (rs != NULL) {
+               uint64_t offset = P2ROUNDUP(rs->rs_start, align);
+
+               if (offset + size <= rs->rs_end) {
+                       *cursor = offset + size;
+                       return (offset);
+               }
+               rs = AVL_NEXT(t, rs);
+       }
+
+       /*
+        * If we know we've searched the whole map (*cursor == 0), give up.
+        * Otherwise, reset the cursor to the beginning and try again.
+        */
+       if (*cursor == 0)
+               return (-1ULL);
+
+       *cursor = 0;
+       return (metaslab_block_picker(t, cursor, size, align));
 }
 
 /*
@@ -579,29 +683,31 @@ metaslab_pp_maxsize(space_map_t *sm)
  * ==========================================================================
  */
 static uint64_t
-metaslab_ff_alloc(space_map_t *sm, uint64_t size)
+metaslab_ff_alloc(metaslab_t *msp, uint64_t size)
 {
-       avl_tree_t *t = &sm->sm_root;
+       /*
+        * Find the largest power of 2 block size that evenly divides the
+        * requested size. This is used to try to allocate blocks with similar
+        * alignment from the same area of the metaslab (i.e. same cursor
+        * bucket) but it does not guarantee that other allocations sizes
+        * may exist in the same region.
+        */
        uint64_t align = size & -size;
-       uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
+       uint64_t *cursor = &msp->ms_lbas[highbit(align) - 1];
+       avl_tree_t *t = &msp->ms_tree->rt_root;
 
        return (metaslab_block_picker(t, cursor, size, align));
 }
 
 /* ARGSUSED */
-boolean_t
-metaslab_ff_fragmented(space_map_t *sm)
+static boolean_t
+metaslab_ff_fragmented(metaslab_t *msp)
 {
        return (B_TRUE);
 }
 
-static space_map_ops_t metaslab_ff_ops = {
-       metaslab_pp_load,
-       metaslab_pp_unload,
+static metaslab_ops_t metaslab_ff_ops = {
        metaslab_ff_alloc,
-       metaslab_pp_claim,
-       metaslab_pp_free,
-       metaslab_pp_maxsize,
        metaslab_ff_fragmented
 };
 
@@ -614,16 +720,24 @@ static space_map_ops_t metaslab_ff_ops =
  * ==========================================================================
  */
 static uint64_t
-metaslab_df_alloc(space_map_t *sm, uint64_t size)
+metaslab_df_alloc(metaslab_t *msp, uint64_t size)
 {
-       avl_tree_t *t = &sm->sm_root;
+       /*
+        * Find the largest power of 2 block size that evenly divides the
+        * requested size. This is used to try to allocate blocks with similar
+        * alignment from the same area of the metaslab (i.e. same cursor
+        * bucket) but it does not guarantee that other allocations sizes
+        * may exist in the same region.
+        */
        uint64_t align = size & -size;
-       uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
-       uint64_t max_size = metaslab_pp_maxsize(sm);
-       int free_pct = sm->sm_space * 100 / sm->sm_size;
+       uint64_t *cursor = &msp->ms_lbas[highbit(align) - 1];
+       range_tree_t *rt = msp->ms_tree;
+       avl_tree_t *t = &rt->rt_root;
+       uint64_t max_size = metaslab_block_maxsize(msp);
+       int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
 
-       ASSERT(MUTEX_HELD(sm->sm_lock));
-       ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));
+       ASSERT(MUTEX_HELD(&msp->ms_lock));
+       ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));
 
        if (max_size < size)
                return (-1ULL);
@@ -634,7 +748,7 @@ metaslab_df_alloc(space_map_t *sm, uint6
         */
        if (max_size < metaslab_df_alloc_threshold ||
            free_pct < metaslab_df_free_pct) {
-               t = sm->sm_pp_root;
+               t = &msp->ms_size_tree;
                *cursor = 0;
        }
 
@@ -642,10 +756,11 @@ metaslab_df_alloc(space_map_t *sm, uint6
 }
 
 static boolean_t
-metaslab_df_fragmented(space_map_t *sm)
+metaslab_df_fragmented(metaslab_t *msp)
 {
-       uint64_t max_size = metaslab_pp_maxsize(sm);
-       int free_pct = sm->sm_space * 100 / sm->sm_size;
+       range_tree_t *rt = msp->ms_tree;
+       uint64_t max_size = metaslab_block_maxsize(msp);
+       int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
 
        if (max_size >= metaslab_df_alloc_threshold &&
            free_pct >= metaslab_df_free_pct)
@@ -654,182 +769,228 @@ metaslab_df_fragmented(space_map_t *sm)
        return (B_TRUE);
 }
 
-static space_map_ops_t metaslab_df_ops = {
-       metaslab_pp_load,
-       metaslab_pp_unload,
+static metaslab_ops_t metaslab_df_ops = {
        metaslab_df_alloc,
-       metaslab_pp_claim,
-       metaslab_pp_free,
-       metaslab_pp_maxsize,
        metaslab_df_fragmented
 };
 
 /*
  * ==========================================================================
- * Other experimental allocators
+ * Cursor fit block allocator -
+ * Select the largest region in the metaslab, set the cursor to the beginning
+ * of the range and the cursor_end to the end of the range. As allocations
+ * are made advance the cursor. Continue allocating from the cursor until
+ * the range is exhausted and then find a new range.
  * ==========================================================================
  */
 static uint64_t
-metaslab_cdf_alloc(space_map_t *sm, uint64_t size)
+metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
 {
-       avl_tree_t *t = &sm->sm_root;
-       uint64_t *cursor = (uint64_t *)sm->sm_ppd;
-       uint64_t *extent_end = (uint64_t *)sm->sm_ppd + 1;
-       uint64_t max_size = metaslab_pp_maxsize(sm);
-       uint64_t rsize = size;
+       range_tree_t *rt = msp->ms_tree;
+       avl_tree_t *t = &msp->ms_size_tree;
+       uint64_t *cursor = &msp->ms_lbas[0];
+       uint64_t *cursor_end = &msp->ms_lbas[1];
        uint64_t offset = 0;
 
-       ASSERT(MUTEX_HELD(sm->sm_lock));
-       ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));
-
-       if (max_size < size)
-               return (-1ULL);
+       ASSERT(MUTEX_HELD(&msp->ms_lock));
+       ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root));
 
-       ASSERT3U(*extent_end, >=, *cursor);
+       ASSERT3U(*cursor_end, >=, *cursor);
 
-       /*
-        * If we're running low on space switch to using the size
-        * sorted AVL tree (best-fit).
-        */
-       if ((*cursor + size) > *extent_end) {
+       if ((*cursor + size) > *cursor_end) {
+               range_seg_t *rs;
 
-               t = sm->sm_pp_root;
-               *cursor = *extent_end = 0;
+               rs = avl_last(&msp->ms_size_tree);
+               if (rs == NULL || (rs->rs_end - rs->rs_start) < size)
+                       return (-1ULL);
 
-               if (max_size > 2 * SPA_MAXBLOCKSIZE)
-                       rsize = MIN(metaslab_min_alloc_size, max_size);
-               offset = metaslab_block_picker(t, extent_end, rsize, 1ULL);
-               if (offset != -1)
-                       *cursor = offset + size;
-       } else {
-               offset = metaslab_block_picker(t, cursor, rsize, 1ULL);
+               *cursor = rs->rs_start;
+               *cursor_end = rs->rs_end;
        }
-       ASSERT3U(*cursor, <=, *extent_end);
+
+       offset = *cursor;
+       *cursor += size;
+
        return (offset);
 }
 
 static boolean_t
-metaslab_cdf_fragmented(space_map_t *sm)
+metaslab_cf_fragmented(metaslab_t *msp)
 {
-       uint64_t max_size = metaslab_pp_maxsize(sm);
-
-       if (max_size > (metaslab_min_alloc_size * 10))
-               return (B_FALSE);
-       return (B_TRUE);
+       return (metaslab_block_maxsize(msp) < metaslab_min_alloc_size);
 }
 
-static space_map_ops_t metaslab_cdf_ops = {
-       metaslab_pp_load,
-       metaslab_pp_unload,
-       metaslab_cdf_alloc,
-       metaslab_pp_claim,
-       metaslab_pp_free,
-       metaslab_pp_maxsize,
-       metaslab_cdf_fragmented
+static metaslab_ops_t metaslab_cf_ops = {
+       metaslab_cf_alloc,
+       metaslab_cf_fragmented
 };
 
+/*
+ * ==========================================================================
+ * New dynamic fit allocator -
+ * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift
+ * contiguous blocks. If no region is found then just use the largest segment
+ * that remains.
+ * ==========================================================================
+ */
+
+/*
+ * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift)
+ * to request from the allocator.
+ */
 uint64_t metaslab_ndf_clump_shift = 4;
 
 static uint64_t
-metaslab_ndf_alloc(space_map_t *sm, uint64_t size)
+metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
 {
-       avl_tree_t *t = &sm->sm_root;
+       avl_tree_t *t = &msp->ms_tree->rt_root;
        avl_index_t where;
-       space_seg_t *ss, ssearch;
+       range_seg_t *rs, rsearch;
        uint64_t hbit = highbit(size);
-       uint64_t *cursor = (uint64_t *)sm->sm_ppd + hbit - 1;
-       uint64_t max_size = metaslab_pp_maxsize(sm);
+       uint64_t *cursor = &msp->ms_lbas[hbit - 1];
+       uint64_t max_size = metaslab_block_maxsize(msp);
 
-       ASSERT(MUTEX_HELD(sm->sm_lock));
-       ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));
+       ASSERT(MUTEX_HELD(&msp->ms_lock));
+       ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));
 
        if (max_size < size)
                return (-1ULL);
 
-       ssearch.ss_start = *cursor;
-       ssearch.ss_end = *cursor + size;
+       rsearch.rs_start = *cursor;
+       rsearch.rs_end = *cursor + size;
 
-       ss = avl_find(t, &ssearch, &where);
-       if (ss == NULL || (ss->ss_start + size > ss->ss_end)) {
-               t = sm->sm_pp_root;
+       rs = avl_find(t, &rsearch, &where);
+       if (rs == NULL || (rs->rs_end - rs->rs_start) < size) {
+               t = &msp->ms_size_tree;
 
-               ssearch.ss_start = 0;
-               ssearch.ss_end = MIN(max_size,
+               rsearch.rs_start = 0;
+               rsearch.rs_end = MIN(max_size,
                    1ULL << (hbit + metaslab_ndf_clump_shift));
-               ss = avl_find(t, &ssearch, &where);
-               if (ss == NULL)
-                       ss = avl_nearest(t, where, AVL_AFTER);
-               ASSERT(ss != NULL);
+               rs = avl_find(t, &rsearch, &where);
+               if (rs == NULL)
+                       rs = avl_nearest(t, where, AVL_AFTER);
+               ASSERT(rs != NULL);
        }
 
-       if (ss != NULL) {
-               if (ss->ss_start + size <= ss->ss_end) {
-                       *cursor = ss->ss_start + size;
-                       return (ss->ss_start);
-               }
+       if ((rs->rs_end - rs->rs_start) >= size) {
+               *cursor = rs->rs_start + size;
+               return (rs->rs_start);
        }
        return (-1ULL);
 }
 
 static boolean_t
-metaslab_ndf_fragmented(space_map_t *sm)
+metaslab_ndf_fragmented(metaslab_t *msp)
 {
-       uint64_t max_size = metaslab_pp_maxsize(sm);
-
-       if (max_size > (metaslab_min_alloc_size << metaslab_ndf_clump_shift))
-               return (B_FALSE);
-       return (B_TRUE);
+       return (metaslab_block_maxsize(msp) <=
+           (metaslab_min_alloc_size << metaslab_ndf_clump_shift));
 }
 
-
-static space_map_ops_t metaslab_ndf_ops = {
-       metaslab_pp_load,
-       metaslab_pp_unload,
+static metaslab_ops_t metaslab_ndf_ops = {
        metaslab_ndf_alloc,
-       metaslab_pp_claim,
-       metaslab_pp_free,
-       metaslab_pp_maxsize,
        metaslab_ndf_fragmented
 };
 
-space_map_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
+metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
 
 /*
  * ==========================================================================
  * Metaslabs
  * ==========================================================================
  */
+
+/*
+ * Wait for any in-progress metaslab loads to complete.
+ */
+void
+metaslab_load_wait(metaslab_t *msp)
+{
+       ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+       while (msp->ms_loading) {
+               ASSERT(!msp->ms_loaded);
+               cv_wait(&msp->ms_load_cv, &msp->ms_lock);
+       }
+}
+
+int
+metaslab_load(metaslab_t *msp)
+{
+       int error = 0;
+
+       ASSERT(MUTEX_HELD(&msp->ms_lock));
+       ASSERT(!msp->ms_loaded);
+       ASSERT(!msp->ms_loading);
+
+       msp->ms_loading = B_TRUE;
+
+       /*
+        * If the space map has not been allocated yet, then treat
+        * all the space in the metaslab as free and add it to the
+        * ms_tree.
+        */
+       if (msp->ms_sm != NULL)
+               error = space_map_load(msp->ms_sm, msp->ms_tree, SM_FREE);
+       else
+               range_tree_add(msp->ms_tree, msp->ms_start, msp->ms_size);
+
+       msp->ms_loaded = (error == 0);
+       msp->ms_loading = B_FALSE;
+
+       if (msp->ms_loaded) {
+               for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+                       range_tree_walk(msp->ms_defertree[t],
+                           range_tree_remove, msp->ms_tree);
+               }
+       }
+       cv_broadcast(&msp->ms_load_cv);
+       return (error);
+}
+
+void
+metaslab_unload(metaslab_t *msp)
+{
+       ASSERT(MUTEX_HELD(&msp->ms_lock));
+       range_tree_vacate(msp->ms_tree, NULL, NULL);
+       msp->ms_loaded = B_FALSE;
+       msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
+}
+
 metaslab_t *
-metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo,
-       uint64_t start, uint64_t size, uint64_t txg)
+metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg)
 {
        vdev_t *vd = mg->mg_vd;
+       objset_t *mos = vd->vdev_spa->spa_meta_objset;
        metaslab_t *msp;
 
        msp = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
        mutex_init(&msp->ms_lock, NULL, MUTEX_DEFAULT, NULL);
+       cv_init(&msp->ms_load_cv, NULL, CV_DEFAULT, NULL);
+       msp->ms_id = id;
+       msp->ms_start = id << vd->vdev_ms_shift;
+       msp->ms_size = 1ULL << vd->vdev_ms_shift;
 
-       msp->ms_smo_syncing = *smo;
+       /*
+        * We only open space map objects that already exist. All others
+        * will be opened when we finally allocate an object for it.
+        */
+       if (object != 0) {
+               VERIFY0(space_map_open(&msp->ms_sm, mos, object, msp->ms_start,
+                   msp->ms_size, vd->vdev_ashift, &msp->ms_lock));
+               ASSERT(msp->ms_sm != NULL);
+       }
 
        /*
-        * We create the main space map here, but we don't create the
-        * allocmaps and freemaps until metaslab_sync_done().  This serves
+        * We create the main range tree here, but we don't create the
+        * alloctree and freetree until metaslab_sync_done().  This serves
         * two purposes: it allows metaslab_sync_done() to detect the
         * addition of new space; and for debugging, it ensures that we'd
         * data fault on any attempt to use this metaslab before it's ready.
         */
-       msp->ms_map = kmem_zalloc(sizeof (space_map_t), KM_SLEEP);
-       space_map_create(msp->ms_map, start, size,
-           vd->vdev_ashift, &msp->ms_lock);
-
+       msp->ms_tree = range_tree_create(&metaslab_rt_ops, msp, &msp->ms_lock);
        metaslab_group_add(mg, msp);
 
-       if (metaslab_debug && smo->smo_object != 0) {
-               mutex_enter(&msp->ms_lock);
-               VERIFY(space_map_load(msp->ms_map, mg->mg_class->mc_ops,
-                   SM_FREE, smo, spa_meta_objset(vd->vdev_spa)) == 0);
-               mutex_exit(&msp->ms_lock);
-       }
+       msp->ms_ops = mg->mg_class->mc_ops;
 
        /*
         * If we're opening an existing pool (txg == 0) or creating
@@ -840,6 +1001,17 @@ metaslab_init(metaslab_group_t *mg, spac
        if (txg <= TXG_INITIAL)
                metaslab_sync_done(msp, 0);
 
+       /*
+        * If metaslab_debug_load is set and we're initializing a metaslab
+        * that has an allocated space_map object then load its space
+        * map so that we can verify frees.
+        */
+       if (metaslab_debug_load && msp->ms_sm != NULL) {
+               mutex_enter(&msp->ms_lock);
+               VERIFY0(metaslab_load(msp));
+               mutex_exit(&msp->ms_lock);
+       }
+
        if (txg != 0) {
                vdev_dirty(vd, 0, NULL, txg);
                vdev_dirty(vd, VDD_METASLAB, msp, txg);
@@ -853,48 +1025,103 @@ metaslab_fini(metaslab_t *msp)
 {
        metaslab_group_t *mg = msp->ms_group;
 
-       vdev_space_update(mg->mg_vd,
-           -msp->ms_smo.smo_alloc, 0, -msp->ms_map->sm_size);
-
        metaslab_group_remove(mg, msp);
 
        mutex_enter(&msp->ms_lock);
 
-       space_map_unload(msp->ms_map);
-       space_map_destroy(msp->ms_map);
-       kmem_free(msp->ms_map, sizeof (*msp->ms_map));
+       VERIFY(msp->ms_group == NULL);
+       vdev_space_update(mg->mg_vd, -space_map_allocated(msp->ms_sm),
+           0, -msp->ms_size);
+       space_map_close(msp->ms_sm);
+
+       metaslab_unload(msp);
+       range_tree_destroy(msp->ms_tree);
 
        for (int t = 0; t < TXG_SIZE; t++) {
-               space_map_destroy(msp->ms_allocmap[t]);
-               space_map_destroy(msp->ms_freemap[t]);
-               kmem_free(msp->ms_allocmap[t], sizeof (*msp->ms_allocmap[t]));
-               kmem_free(msp->ms_freemap[t], sizeof (*msp->ms_freemap[t]));
+               range_tree_destroy(msp->ms_alloctree[t]);
+               range_tree_destroy(msp->ms_freetree[t]);
        }
 
        for (int t = 0; t < TXG_DEFER_SIZE; t++) {
-               space_map_destroy(msp->ms_defermap[t]);
-               kmem_free(msp->ms_defermap[t], sizeof (*msp->ms_defermap[t]));
+               range_tree_destroy(msp->ms_defertree[t]);
        }
 
        ASSERT0(msp->ms_deferspace);
 
        mutex_exit(&msp->ms_lock);
+       cv_destroy(&msp->ms_load_cv);
        mutex_destroy(&msp->ms_lock);
 
        kmem_free(msp, sizeof (metaslab_t));
 }
 
-#define        METASLAB_WEIGHT_PRIMARY         (1ULL << 63)
-#define        METASLAB_WEIGHT_SECONDARY       (1ULL << 62)
-#define        METASLAB_ACTIVE_MASK            \
-       (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)
+/*
+ * Apply a weighting factor based on the histogram information for this
+ * metaslab. The current weighting factor is somewhat arbitrary and requires
+ * additional investigation. The implementation provides a measure of
+ * "weighted" free space and gives a higher weighting for larger contiguous
+ * regions. The weighting factor is determined by counting the number of
+ * sm_shift sectors that exist in each region represented by the histogram.
+ * That value is then multiplied by the power of 2 exponent and the sm_shift
+ * value.
+ *
+ * For example, assume the 2^21 histogram bucket has 4 2MB regions and the
+ * metaslab has an sm_shift value of 9 (512B):
+ *
+ * 1) calculate the number of sm_shift sectors in the region:
+ *     2^21 / 2^9 = 2^12 = 4096 * 4 (number of regions) = 16384
+ * 2) multiply by the power of 2 exponent and the sm_shift value:
+ *     16384 * 21 * 9 = 3096576
+ * This value will be added to the weighting of the metaslab.
+ */
+static uint64_t
+metaslab_weight_factor(metaslab_t *msp)
+{
+       uint64_t factor = 0;
+       uint64_t sectors;
+       int i;
+
+       /*
+        * A null space map means that the entire metaslab is free,
+        * so calculate a weight factor that spans the entire size of
+        * the metaslab.
*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
_______________________________________________
svn-src-all@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/svn-src-all
To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"

Reply via email to