Author: mm
Date: Sat Aug 28 08:59:55 2010
New Revision: 211931
URL: http://svn.freebsd.org/changeset/base/211931

Log:
  Update ZFS metaslab code from OpenSolaris.
  This provides a noticeable write speedup, especially on pools with
  less than 30% of free space.
  
  Detailed information (OpenSolaris onnv changesets and Bug IDs):
  
  11146:7e58f40bcb1c
  6826241       Sync write IOPS drops dramatically during TXG sync
  6869229       zfs should switch to shiny new metaslabs more frequently
  
  11728:59fdb3b856f6
  6918420       zdb -m has issues printing metaslab statistics
  
  12047:7c1fcc8419ca
  6917066       zfs block picking can be improved
  
  Approved by:  delphij (mentor)
  Obtained from:        OpenSolaris (Bug ID 6826241, 6869229, 6918420, 6917066)
  MFC after:    2 weeks

Modified:
  head/cddl/contrib/opensolaris/cmd/zdb/zdb.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c

Modified: head/cddl/contrib/opensolaris/cmd/zdb/zdb.c
==============================================================================
--- head/cddl/contrib/opensolaris/cmd/zdb/zdb.c Sat Aug 28 08:57:15 2010        
(r211930)
+++ head/cddl/contrib/opensolaris/cmd/zdb/zdb.c Sat Aug 28 08:59:55 2010        
(r211931)
@@ -491,35 +491,37 @@ dump_metaslab_stats(metaslab_t *msp)
 static void
 dump_metaslab(metaslab_t *msp)
 {
-       char freebuf[5];
-       space_map_obj_t *smo = &msp->ms_smo;
        vdev_t *vd = msp->ms_group->mg_vd;
        spa_t *spa = vd->vdev_spa;
+       space_map_t *sm = &msp->ms_map;
+       space_map_obj_t *smo = &msp->ms_smo;
+       char freebuf[5];
 
-       nicenum(msp->ms_map.sm_size - smo->smo_alloc, freebuf);
+       nicenum(sm->sm_size - smo->smo_alloc, freebuf);
 
        (void) printf(
            "\tvdev %5llu   offset %12llx   spacemap %6llu   free    %5s\n",
-           (u_longlong_t)vd->vdev_id, (u_longlong_t)msp->ms_map.sm_start,
-           (u_longlong_t)smo->smo_object, freebuf);
+           (u_longlong_t)(sm->sm_start / sm->sm_size),
+           (u_longlong_t)sm->sm_start, (u_longlong_t)smo->smo_object, freebuf);
 
        if (dump_opt['m'] > 1) {
                mutex_enter(&msp->ms_lock);
-               VERIFY(space_map_load(&msp->ms_map, zfs_metaslab_ops,
-                   SM_FREE, &msp->ms_smo, spa->spa_meta_objset) == 0);
+               space_map_load_wait(sm);
+               if (!sm->sm_loaded)
+                       VERIFY(space_map_load(sm, zfs_metaslab_ops,
+                           SM_FREE, smo, spa->spa_meta_objset) == 0);
                dump_metaslab_stats(msp);
-               space_map_unload(&msp->ms_map);
+               space_map_unload(sm);
                mutex_exit(&msp->ms_lock);
        }
 
        if (dump_opt['d'] > 5 || dump_opt['m'] > 2) {
-               ASSERT(msp->ms_map.sm_size == (1ULL << vd->vdev_ms_shift));
+               ASSERT(sm->sm_size == (1ULL << vd->vdev_ms_shift));
 
                mutex_enter(&msp->ms_lock);
-               dump_spacemap(spa->spa_meta_objset, smo, &msp->ms_map);
+               dump_spacemap(spa->spa_meta_objset, smo, sm);
                mutex_exit(&msp->ms_lock);
        }
-
 }
 
 static void

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c      Sat Aug 
28 08:57:15 2010        (r211930)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c      Sat Aug 
28 08:59:55 2010        (r211931)
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -37,7 +36,7 @@ uint64_t metaslab_gang_bang = SPA_MAXBLO
 
 /*
  * Minimum size which forces the dynamic allocator to change
- * it's allocation strategy. Once the space map cannot satisfy
+ * it's allocation strategy.  Once the space map cannot satisfy
  * an allocation of this size then it switches to using more
  * aggressive strategy (i.e search by size rather than offset).
  */
@@ -49,7 +48,23 @@ uint64_t metaslab_df_alloc_threshold = S
  * Once the space_map's free space drops below this level we dynamically
  * switch to using best-fit allocations.
  */
-int metaslab_df_free_pct = 30;
+int metaslab_df_free_pct = 4;
+
+/*
+ * A metaslab is considered "free" if it contains a contiguous
+ * segment which is greater than metaslab_min_alloc_size.
+ */
+uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS;
+
+/*
+ * Max number of space_maps to prefetch.
+ */
+int metaslab_prefetch_limit = SPA_DVAS_PER_BP;
+
+/*
+ * Percentage bonus multiplier for metaslabs that are in the bonus area.
+ */
+int metaslab_smo_bonus_pct = 150;
 
 /*
  * ==========================================================================
@@ -219,6 +234,32 @@ metaslab_group_sort(metaslab_group_t *mg
 }
 
 /*
+ * ==========================================================================
+ * Common allocator routines
+ * ==========================================================================
+ */
+static int
+metaslab_segsize_compare(const void *x1, const void *x2)
+{
+       const space_seg_t *s1 = x1;
+       const space_seg_t *s2 = x2;
+       uint64_t ss_size1 = s1->ss_end - s1->ss_start;
+       uint64_t ss_size2 = s2->ss_end - s2->ss_start;
+
+       if (ss_size1 < ss_size2)
+               return (-1);
+       if (ss_size1 > ss_size2)
+               return (1);
+
+       if (s1->ss_start < s2->ss_start)
+               return (-1);
+       if (s1->ss_start > s2->ss_start)
+               return (1);
+
+       return (0);
+}
+
+/*
  * This is a helper function that can be used by the allocator to find
  * a suitable block to allocate. This will search the specified AVL
  * tree looking for a block that matches the specified criteria.
@@ -258,68 +299,58 @@ metaslab_block_picker(avl_tree_t *t, uin
        return (metaslab_block_picker(t, cursor, size, align));
 }
 
-/*
- * ==========================================================================
- * The first-fit block allocator
- * ==========================================================================
- */
 static void
-metaslab_ff_load(space_map_t *sm)
+metaslab_pp_load(space_map_t *sm)
 {
+       space_seg_t *ss;
+
        ASSERT(sm->sm_ppd == NULL);
        sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP);
-       sm->sm_pp_root = NULL;
+
+       sm->sm_pp_root = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
+       avl_create(sm->sm_pp_root, metaslab_segsize_compare,
+           sizeof (space_seg_t), offsetof(struct space_seg, ss_pp_node));
+
+       for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss))
+               avl_add(sm->sm_pp_root, ss);
 }
 
 static void
-metaslab_ff_unload(space_map_t *sm)
+metaslab_pp_unload(space_map_t *sm)
 {
+       void *cookie = NULL;
+
        kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t));
        sm->sm_ppd = NULL;
-}
 
-static uint64_t
-metaslab_ff_alloc(space_map_t *sm, uint64_t size)
-{
-       avl_tree_t *t = &sm->sm_root;
-       uint64_t align = size & -size;
-       uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
+       while (avl_destroy_nodes(sm->sm_pp_root, &cookie) != NULL) {
+               /* tear down the tree */
+       }
 
-       return (metaslab_block_picker(t, cursor, size, align));
+       avl_destroy(sm->sm_pp_root);
+       kmem_free(sm->sm_pp_root, sizeof (avl_tree_t));
+       sm->sm_pp_root = NULL;
 }
 
 /* ARGSUSED */
 static void
-metaslab_ff_claim(space_map_t *sm, uint64_t start, uint64_t size)
+metaslab_pp_claim(space_map_t *sm, uint64_t start, uint64_t size)
 {
        /* No need to update cursor */
 }
 
 /* ARGSUSED */
 static void
-metaslab_ff_free(space_map_t *sm, uint64_t start, uint64_t size)
+metaslab_pp_free(space_map_t *sm, uint64_t start, uint64_t size)
 {
        /* No need to update cursor */
 }
 
-static space_map_ops_t metaslab_ff_ops = {
-       metaslab_ff_load,
-       metaslab_ff_unload,
-       metaslab_ff_alloc,
-       metaslab_ff_claim,
-       metaslab_ff_free,
-       NULL    /* maxsize */
-};
-
 /*
- * Dynamic block allocator -
- * Uses the first fit allocation scheme until space get low and then
- * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold
- * and metaslab_df_free_pct to determine when to switch the allocation scheme.
+ * Return the maximum contiguous segment within the metaslab.
  */
-
 uint64_t
-metaslab_df_maxsize(space_map_t *sm)
+metaslab_pp_maxsize(space_map_t *sm)
 {
        avl_tree_t *t = sm->sm_pp_root;
        space_seg_t *ss;
@@ -330,67 +361,53 @@ metaslab_df_maxsize(space_map_t *sm)
        return (ss->ss_end - ss->ss_start);
 }
 
-static int
-metaslab_df_seg_compare(const void *x1, const void *x2)
+/*
+ * ==========================================================================
+ * The first-fit block allocator
+ * ==========================================================================
+ */
+static uint64_t
+metaslab_ff_alloc(space_map_t *sm, uint64_t size)
 {
-       const space_seg_t *s1 = x1;
-       const space_seg_t *s2 = x2;
-       uint64_t ss_size1 = s1->ss_end - s1->ss_start;
-       uint64_t ss_size2 = s2->ss_end - s2->ss_start;
-
-       if (ss_size1 < ss_size2)
-               return (-1);
-       if (ss_size1 > ss_size2)
-               return (1);
-
-       if (s1->ss_start < s2->ss_start)
-               return (-1);
-       if (s1->ss_start > s2->ss_start)
-               return (1);
+       avl_tree_t *t = &sm->sm_root;
+       uint64_t align = size & -size;
+       uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
 
-       return (0);
+       return (metaslab_block_picker(t, cursor, size, align));
 }
 
-static void
-metaslab_df_load(space_map_t *sm)
+/* ARGSUSED */
+boolean_t
+metaslab_ff_fragmented(space_map_t *sm)
 {
-       space_seg_t *ss;
-
-       ASSERT(sm->sm_ppd == NULL);
-       sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP);
-
-       sm->sm_pp_root = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
-       avl_create(sm->sm_pp_root, metaslab_df_seg_compare,
-           sizeof (space_seg_t), offsetof(struct space_seg, ss_pp_node));
-
-       for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss))
-               avl_add(sm->sm_pp_root, ss);
+       return (B_TRUE);
 }
 
-static void
-metaslab_df_unload(space_map_t *sm)
-{
-       void *cookie = NULL;
-
-       kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t));
-       sm->sm_ppd = NULL;
-
-       while (avl_destroy_nodes(sm->sm_pp_root, &cookie) != NULL) {
-               /* tear down the tree */
-       }
-
-       avl_destroy(sm->sm_pp_root);
-       kmem_free(sm->sm_pp_root, sizeof (avl_tree_t));
-       sm->sm_pp_root = NULL;
-}
+static space_map_ops_t metaslab_ff_ops = {
+       metaslab_pp_load,
+       metaslab_pp_unload,
+       metaslab_ff_alloc,
+       metaslab_pp_claim,
+       metaslab_pp_free,
+       metaslab_pp_maxsize,
+       metaslab_ff_fragmented
+};
 
+/*
+ * ==========================================================================
+ * Dynamic block allocator -
+ * Uses the first fit allocation scheme until space get low and then
+ * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold
+ * and metaslab_df_free_pct to determine when to switch the allocation scheme.
+ * ==========================================================================
+ */
 static uint64_t
 metaslab_df_alloc(space_map_t *sm, uint64_t size)
 {
        avl_tree_t *t = &sm->sm_root;
        uint64_t align = size & -size;
        uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
-       uint64_t max_size = metaslab_df_maxsize(sm);
+       uint64_t max_size = metaslab_pp_maxsize(sm);
        int free_pct = sm->sm_space * 100 / sm->sm_size;
 
        ASSERT(MUTEX_HELD(sm->sm_lock));
@@ -412,30 +429,158 @@ metaslab_df_alloc(space_map_t *sm, uint6
        return (metaslab_block_picker(t, cursor, size, 1ULL));
 }
 
-/* ARGSUSED */
-static void
-metaslab_df_claim(space_map_t *sm, uint64_t start, uint64_t size)
+static boolean_t
+metaslab_df_fragmented(space_map_t *sm)
 {
-       /* No need to update cursor */
-}
+       uint64_t max_size = metaslab_pp_maxsize(sm);
+       int free_pct = sm->sm_space * 100 / sm->sm_size;
 
-/* ARGSUSED */
-static void
-metaslab_df_free(space_map_t *sm, uint64_t start, uint64_t size)
-{
-       /* No need to update cursor */
+       if (max_size >= metaslab_df_alloc_threshold &&
+           free_pct >= metaslab_df_free_pct)
+               return (B_FALSE);
+
+       return (B_TRUE);
 }
 
 static space_map_ops_t metaslab_df_ops = {
-       metaslab_df_load,
-       metaslab_df_unload,
+       metaslab_pp_load,
+       metaslab_pp_unload,
        metaslab_df_alloc,
-       metaslab_df_claim,
-       metaslab_df_free,
-       metaslab_df_maxsize
+       metaslab_pp_claim,
+       metaslab_pp_free,
+       metaslab_pp_maxsize,
+       metaslab_df_fragmented
+};
+
+/*
+ * ==========================================================================
+ * Other experimental allocators
+ * ==========================================================================
+ */
+static uint64_t
+metaslab_cdf_alloc(space_map_t *sm, uint64_t size)
+{
+       avl_tree_t *t = &sm->sm_root;
+       uint64_t *cursor = (uint64_t *)sm->sm_ppd;
+       uint64_t *extent_end = (uint64_t *)sm->sm_ppd + 1;
+       uint64_t max_size = metaslab_pp_maxsize(sm);
+       uint64_t rsize = size;
+       uint64_t offset = 0;
+
+       ASSERT(MUTEX_HELD(sm->sm_lock));
+       ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));
+
+       if (max_size < size)
+               return (-1ULL);
+
+       ASSERT3U(*extent_end, >=, *cursor);
+
+       /*
+        * If we're running low on space switch to using the size
+        * sorted AVL tree (best-fit).
+        */
+       if ((*cursor + size) > *extent_end) {
+
+               t = sm->sm_pp_root;
+               *cursor = *extent_end = 0;
+
+               if (max_size > 2 * SPA_MAXBLOCKSIZE)
+                       rsize = MIN(metaslab_min_alloc_size, max_size);
+               offset = metaslab_block_picker(t, extent_end, rsize, 1ULL);
+               if (offset != -1)
+                       *cursor = offset + size;
+       } else {
+               offset = metaslab_block_picker(t, cursor, rsize, 1ULL);
+       }
+       ASSERT3U(*cursor, <=, *extent_end);
+       return (offset);
+}
+
+static boolean_t
+metaslab_cdf_fragmented(space_map_t *sm)
+{
+       uint64_t max_size = metaslab_pp_maxsize(sm);
+
+       if (max_size > (metaslab_min_alloc_size * 10))
+               return (B_FALSE);
+       return (B_TRUE);
+}
+
+static space_map_ops_t metaslab_cdf_ops = {
+       metaslab_pp_load,
+       metaslab_pp_unload,
+       metaslab_cdf_alloc,
+       metaslab_pp_claim,
+       metaslab_pp_free,
+       metaslab_pp_maxsize,
+       metaslab_cdf_fragmented
+};
+
+uint64_t metaslab_ndf_clump_shift = 4;
+
+static uint64_t
+metaslab_ndf_alloc(space_map_t *sm, uint64_t size)
+{
+       avl_tree_t *t = &sm->sm_root;
+       avl_index_t where;
+       space_seg_t *ss, ssearch;
+       uint64_t hbit = highbit(size);
+       uint64_t *cursor = (uint64_t *)sm->sm_ppd + hbit - 1;
+       uint64_t max_size = metaslab_pp_maxsize(sm);
+
+       ASSERT(MUTEX_HELD(sm->sm_lock));
+       ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));
+
+       if (max_size < size)
+               return (-1ULL);
+
+       ssearch.ss_start = *cursor;
+       ssearch.ss_end = *cursor + size;
+
+       ss = avl_find(t, &ssearch, &where);
+       if (ss == NULL || (ss->ss_start + size > ss->ss_end)) {
+               t = sm->sm_pp_root;
+
+               ssearch.ss_start = 0;
+               ssearch.ss_end = MIN(max_size,
+                   1ULL << (hbit + metaslab_ndf_clump_shift));
+               ss = avl_find(t, &ssearch, &where);
+               if (ss == NULL)
+                       ss = avl_nearest(t, where, AVL_AFTER);
+               ASSERT(ss != NULL);
+       }
+
+       if (ss != NULL) {
+               if (ss->ss_start + size <= ss->ss_end) {
+                       *cursor = ss->ss_start + size;
+                       return (ss->ss_start);
+               }
+       }
+       return (-1ULL);
+}
+
+static boolean_t
+metaslab_ndf_fragmented(space_map_t *sm)
+{
+       uint64_t max_size = metaslab_pp_maxsize(sm);
+
+       if (max_size > (metaslab_min_alloc_size << metaslab_ndf_clump_shift))
+               return (B_FALSE);
+       return (B_TRUE);
+}
+
+
+static space_map_ops_t metaslab_ndf_ops = {
+       metaslab_pp_load,
+       metaslab_pp_unload,
+       metaslab_ndf_alloc,
+       metaslab_pp_claim,
+       metaslab_pp_free,
+       metaslab_pp_maxsize,
+       metaslab_ndf_fragmented
 };
 
-space_map_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
+space_map_ops_t *zfs_metaslab_ops = &metaslab_ndf_ops;
 
 /*
  * ==========================================================================
@@ -522,7 +667,6 @@ metaslab_fini(metaslab_t *msp)
 #define        METASLAB_WEIGHT_SECONDARY       (1ULL << 62)
 #define        METASLAB_ACTIVE_MASK            \
        (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)
-#define        METASLAB_SMO_BONUS_MULTIPLIER   2
 
 static uint64_t
 metaslab_weight(metaslab_t *msp)
@@ -555,25 +699,60 @@ metaslab_weight(metaslab_t *msp)
        ASSERT(weight >= space && weight <= 2 * space);
 
        /*
-        * For locality, assign higher weight to metaslabs we've used before.
+        * For locality, assign higher weight to metaslabs which have
+        * a lower offset than what we've already activated.
         */
-       if (smo->smo_object != 0)
-               weight *= METASLAB_SMO_BONUS_MULTIPLIER;
+       if (sm->sm_start <= mg->mg_bonus_area)
+               weight *= (metaslab_smo_bonus_pct / 100);
        ASSERT(weight >= space &&
-           weight <= 2 * METASLAB_SMO_BONUS_MULTIPLIER * space);
+           weight <= 2 * (metaslab_smo_bonus_pct / 100) * space);
+
+       if (sm->sm_loaded && !sm->sm_ops->smop_fragmented(sm)) {
+               /*
+                * If this metaslab is one we're actively using, adjust its
+                * weight to make it preferable to any inactive metaslab so
+                * we'll polish it off.
+                */
+               weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
+       }
+       return (weight);
+}
+
+static void
+metaslab_prefetch(metaslab_group_t *mg)
+{
+       spa_t *spa = mg->mg_vd->vdev_spa;
+       metaslab_t *msp;
+       avl_tree_t *t = &mg->mg_metaslab_tree;
+       int m;
+
+       mutex_enter(&mg->mg_lock);
 
        /*
-        * If this metaslab is one we're actively using, adjust its weight to
-        * make it preferable to any inactive metaslab so we'll polish it off.
+        * Prefetch the next potential metaslabs
         */
-       weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
+       for (msp = avl_first(t), m = 0; msp; msp = AVL_NEXT(t, msp), m++) {
+               space_map_t *sm = &msp->ms_map;
+               space_map_obj_t *smo = &msp->ms_smo;
 
-       return (weight);
+               /* If we have reached our prefetch limit then we're done */
+               if (m >= metaslab_prefetch_limit)
+                       break;
+
+               if (!sm->sm_loaded && smo->smo_object != 0) {
+                       mutex_exit(&mg->mg_lock);
+                       dmu_prefetch(spa->spa_meta_objset, smo->smo_object,
+                           0ULL, smo->smo_objsize);
+                       mutex_enter(&mg->mg_lock);
+               }
+       }
+       mutex_exit(&mg->mg_lock);
 }
 
 static int
 metaslab_activate(metaslab_t *msp, uint64_t activation_weight, uint64_t size)
 {
+       metaslab_group_t *mg = msp->ms_group;
        space_map_t *sm = &msp->ms_map;
        space_map_ops_t *sm_ops = msp->ms_group->mg_class->mc_ops;
 
@@ -588,6 +767,15 @@ metaslab_activate(metaslab_t *msp, uint6
                }
 
                /*
+                * Track the bonus area as we activate new metaslabs.
+                */
+               if (sm->sm_start > mg->mg_bonus_area) {
+                       mutex_enter(&mg->mg_lock);
+                       mg->mg_bonus_area = sm->sm_start;
+                       mutex_exit(&mg->mg_lock);
+               }
+
+               /*
                 * If we were able to load the map then make sure
                 * that this map is still able to satisfy our request.
                 */
@@ -773,6 +961,32 @@ metaslab_sync_done(metaslab_t *msp, uint
        mutex_exit(&msp->ms_lock);
 }
 
+void
+metaslab_sync_reassess(metaslab_group_t *mg)
+{
+       vdev_t *vd = mg->mg_vd;
+
+       /*
+        * Re-evaluate all metaslabs which have lower offsets than the
+        * bonus area.
+        */
+       for (int m = 0; m < vd->vdev_ms_count; m++) {
+               metaslab_t *msp = vd->vdev_ms[m];
+
+               if (msp->ms_map.sm_start > mg->mg_bonus_area)
+                       break;
+
+               mutex_enter(&msp->ms_lock);
+               metaslab_group_sort(mg, msp, metaslab_weight(msp));
+               mutex_exit(&msp->ms_lock);
+       }
+
+       /*
+        * Prefetch the next potential metaslabs
+        */
+       metaslab_prefetch(mg);
+}
+
 static uint64_t
 metaslab_distance(metaslab_t *msp, dva_t *dva)
 {
@@ -868,7 +1082,7 @@ metaslab_group_alloc(metaslab_group_t *m
                if ((offset = space_map_alloc(&msp->ms_map, size)) != -1ULL)
                        break;
 
-               metaslab_passivate(msp, size - 1);
+               metaslab_passivate(msp, space_map_maxsize(&msp->ms_map));
 
                mutex_exit(&msp->ms_lock);
        }

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c   Sat Aug 28 
08:57:15 2010        (r211930)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c   Sat Aug 28 
08:59:55 2010        (r211931)
@@ -74,35 +74,38 @@ enum zti_modes {
        zti_mode_fixed,                 /* value is # of threads (min 1) */
        zti_mode_online_percent,        /* value is % of online CPUs */
        zti_mode_tune,                  /* fill from zio_taskq_tune_* */
+       zti_mode_null,                  /* don't create a taskq */
        zti_nmodes
 };
 
-#define        ZTI_THREAD_FIX(n)       { zti_mode_fixed, (n) }
-#define        ZTI_THREAD_PCT(n)       { zti_mode_online_percent, (n) }
-#define        ZTI_THREAD_TUNE         { zti_mode_tune, 0 }
+#define        ZTI_FIX(n)      { zti_mode_fixed, (n) }
+#define        ZTI_PCT(n)      { zti_mode_online_percent, (n) }
+#define        ZTI_TUNE        { zti_mode_tune, 0 }
+#define        ZTI_NULL        { zti_mode_null, 0 }
 
-#define        ZTI_THREAD_ONE          ZTI_THREAD_FIX(1)
+#define        ZTI_ONE         ZTI_FIX(1)
 
 typedef struct zio_taskq_info {
-       const char *zti_name;
-       struct {
-               enum zti_modes zti_mode;
-               uint_t zti_value;
-       } zti_nthreads[ZIO_TASKQ_TYPES];
+       enum zti_modes zti_mode;
+       uint_t zti_value;
 } zio_taskq_info_t;
 
 static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
-                               "issue",                "intr"
+               "issue", "issue_high", "intr", "intr_high"
 };
 
-const zio_taskq_info_t zio_taskqs[ZIO_TYPES] = {
-       /*                      ISSUE                   INTR            */
-       { "spa_zio_null",       { ZTI_THREAD_ONE,       ZTI_THREAD_ONE } },
-       { "spa_zio_read",       { ZTI_THREAD_FIX(8),    ZTI_THREAD_TUNE } },
-       { "spa_zio_write",      { ZTI_THREAD_TUNE,      ZTI_THREAD_FIX(8) } },
-       { "spa_zio_free",       { ZTI_THREAD_ONE,       ZTI_THREAD_ONE } },
-       { "spa_zio_claim",      { ZTI_THREAD_ONE,       ZTI_THREAD_ONE } },
-       { "spa_zio_ioctl",      { ZTI_THREAD_ONE,       ZTI_THREAD_ONE } },
+/*
+ * Define the taskq threads for the following I/O types:
+ *     NULL, READ, WRITE, FREE, CLAIM, and IOCTL
+ */
+const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
+       /* ISSUE        ISSUE_HIGH      INTR            INTR_HIGH */
+       { ZTI_ONE,      ZTI_NULL,       ZTI_ONE,        ZTI_NULL },
+       { ZTI_FIX(8),   ZTI_NULL,       ZTI_TUNE,       ZTI_NULL },
+       { ZTI_TUNE,     ZTI_FIX(5),     ZTI_FIX(8),     ZTI_FIX(5) },
+       { ZTI_ONE,      ZTI_NULL,       ZTI_ONE,        ZTI_NULL },
+       { ZTI_ONE,      ZTI_NULL,       ZTI_ONE,        ZTI_NULL },
+       { ZTI_ONE,      ZTI_NULL,       ZTI_ONE,        ZTI_NULL },
 };
 
 enum zti_modes zio_taskq_tune_mode = zti_mode_online_percent;
@@ -581,14 +584,14 @@ spa_activate(spa_t *spa, int mode)
        spa->spa_log_class = metaslab_class_create(zfs_metaslab_ops);
 
        for (int t = 0; t < ZIO_TYPES; t++) {
-               const zio_taskq_info_t *ztip = &zio_taskqs[t];
                for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
-                       enum zti_modes mode = ztip->zti_nthreads[q].zti_mode;
-                       uint_t value = ztip->zti_nthreads[q].zti_value;
+                       const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
+                       enum zti_modes mode = ztip->zti_mode;
+                       uint_t value = ztip->zti_value;
                        char name[32];
 
                        (void) snprintf(name, sizeof (name),
-                           "%s_%s", ztip->zti_name, zio_taskq_types[q]);
+                           "%s_%s", zio_type_name[t], zio_taskq_types[q]);
 
                        if (mode == zti_mode_tune) {
                                mode = zio_taskq_tune_mode;
@@ -613,6 +616,10 @@ spa_activate(spa_t *spa, int mode)
                                    TASKQ_PREPOPULATE | TASKQ_THREADS_CPU_PCT);
                                break;
 
+                       case zti_mode_null:
+                               spa->spa_zio_taskq[t][q] = NULL;
+                               break;
+
                        case zti_mode_tune:
                        default:
                                panic("unrecognized mode for "
@@ -659,7 +666,8 @@ spa_deactivate(spa_t *spa)
 
        for (int t = 0; t < ZIO_TYPES; t++) {
                for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
-                       taskq_destroy(spa->spa_zio_taskq[t][q]);
+                       if (spa->spa_zio_taskq[t][q] != NULL)
+                               taskq_destroy(spa->spa_zio_taskq[t][q]);
                        spa->spa_zio_taskq[t][q] = NULL;
                }
        }

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c     Sat Aug 
28 08:57:15 2010        (r211930)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c     Sat Aug 
28 08:59:55 2010        (r211931)
@@ -368,10 +368,8 @@ space_map_unload(space_map_t *sm)
 uint64_t
 space_map_maxsize(space_map_t *sm)
 {
-       if (sm->sm_loaded && sm->sm_ops != NULL)
-               return (sm->sm_ops->smop_max(sm));
-       else
-               return (-1ULL);
+       ASSERT(sm->sm_ops != NULL);
+       return (sm->sm_ops->smop_max(sm));
 }
 
 uint64_t

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h  Sat Aug 
28 08:57:15 2010        (r211930)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h  Sat Aug 
28 08:59:55 2010        (r211931)
@@ -46,6 +46,7 @@ extern metaslab_t *metaslab_init(metasla
 extern void metaslab_fini(metaslab_t *msp);
 extern void metaslab_sync(metaslab_t *msp, uint64_t txg);
 extern void metaslab_sync_done(metaslab_t *msp, uint64_t txg);
+extern void metaslab_sync_reassess(metaslab_group_t *mg);
 
 #define        METASLAB_HINTBP_FAVOR   0x0
 #define        METASLAB_HINTBP_AVOID   0x1

Modified: 
head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h     
Sat Aug 28 08:57:15 2010        (r211930)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h     
Sat Aug 28 08:59:55 2010        (r211931)
@@ -46,6 +46,7 @@ struct metaslab_group {
        kmutex_t                mg_lock;
        avl_tree_t              mg_metaslab_tree;
        uint64_t                mg_aliquot;
+       uint64_t                mg_bonus_area;
        int64_t                 mg_bias;
        metaslab_class_t        *mg_class;
        vdev_t                  *mg_vd;

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h  Sat Aug 
28 08:57:15 2010        (r211930)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h  Sat Aug 
28 08:59:55 2010        (r211931)
@@ -87,7 +87,9 @@ typedef enum spa_log_state {
 
 enum zio_taskq_type {
        ZIO_TASKQ_ISSUE = 0,
+       ZIO_TASKQ_ISSUE_HIGH,
        ZIO_TASKQ_INTERRUPT,
+       ZIO_TASKQ_INTERRUPT_HIGH,
        ZIO_TASKQ_TYPES
 };
 

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h Sat Aug 
28 08:57:15 2010        (r211930)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h Sat Aug 
28 08:59:55 2010        (r211931)
@@ -77,6 +77,7 @@ struct space_map_ops {
        void    (*smop_claim)(space_map_t *sm, uint64_t start, uint64_t size);
        void    (*smop_free)(space_map_t *sm, uint64_t start, uint64_t size);
        uint64_t (*smop_max)(space_map_t *sm);
+       boolean_t (*smop_fragmented)(space_map_t *sm);
 };
 
 /*

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h       Sat Aug 
28 08:57:15 2010        (r211930)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h       Sat Aug 
28 08:59:55 2010        (r211931)
@@ -107,14 +107,15 @@ enum zio_compress {
 #define        ZIO_PRIORITY_NOW                (zio_priority_table[0])
 #define        ZIO_PRIORITY_SYNC_READ          (zio_priority_table[1])
 #define        ZIO_PRIORITY_SYNC_WRITE         (zio_priority_table[2])
-#define        ZIO_PRIORITY_ASYNC_READ         (zio_priority_table[3])
-#define        ZIO_PRIORITY_ASYNC_WRITE        (zio_priority_table[4])
-#define        ZIO_PRIORITY_FREE               (zio_priority_table[5])
-#define        ZIO_PRIORITY_CACHE_FILL         (zio_priority_table[6])
-#define        ZIO_PRIORITY_LOG_WRITE          (zio_priority_table[7])
-#define        ZIO_PRIORITY_RESILVER           (zio_priority_table[8])
-#define        ZIO_PRIORITY_SCRUB              (zio_priority_table[9])
-#define        ZIO_PRIORITY_TABLE_SIZE         10
+#define        ZIO_PRIORITY_LOG_WRITE          (zio_priority_table[3])
+#define        ZIO_PRIORITY_CACHE_FILL         (zio_priority_table[4])
+#define        ZIO_PRIORITY_AGG                (zio_priority_table[5])
+#define        ZIO_PRIORITY_FREE               (zio_priority_table[6])
+#define        ZIO_PRIORITY_ASYNC_WRITE        (zio_priority_table[7])
+#define        ZIO_PRIORITY_ASYNC_READ         (zio_priority_table[8])
+#define        ZIO_PRIORITY_RESILVER           (zio_priority_table[9])
+#define        ZIO_PRIORITY_SCRUB              (zio_priority_table[10])
+#define        ZIO_PRIORITY_TABLE_SIZE         11
 
 #define        ZIO_FLAG_MUSTSUCCEED            0x00000
 #define        ZIO_FLAG_CANFAIL                0x00001

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c  Sat Aug 28 08:57:15 2010        (r211930)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c  Sat Aug 28 08:59:55 2010        (r211931)
@@ -1773,9 +1773,13 @@ void
 vdev_sync_done(vdev_t *vd, uint64_t txg)
 {
        metaslab_t *msp;
+       boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg));
 
        while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)))
                metaslab_sync_done(msp, txg);
+
+       if (reassess)
+               metaslab_sync_reassess(vd->vdev_mg);
 }
 
 void

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c    Sat Aug 28 08:57:15 2010        (r211930)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c    Sat Aug 28 08:59:55 2010        (r211931)
@@ -233,7 +233,7 @@ vdev_queue_io_to_issue(vdev_queue_t *vq,
                ASSERT(size <= zfs_vdev_aggregation_limit);
 
                aio = zio_vdev_delegated_io(fio->io_vd, fio->io_offset,
-                   zio_buf_alloc(size), size, fio->io_type, ZIO_PRIORITY_NOW,
+                   zio_buf_alloc(size), size, fio->io_type, ZIO_PRIORITY_AGG,
                    flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
                    vdev_queue_agg_io_done, NULL);
 

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c   Sat Aug 28 08:57:15 2010        (r211930)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c   Sat Aug 28 08:59:55 2010        (r211931)
@@ -49,11 +49,12 @@ uint8_t zio_priority_table[ZIO_PRIORITY_
        0,      /* ZIO_PRIORITY_NOW             */
        0,      /* ZIO_PRIORITY_SYNC_READ       */
        0,      /* ZIO_PRIORITY_SYNC_WRITE      */
-       6,      /* ZIO_PRIORITY_ASYNC_READ      */
-       4,      /* ZIO_PRIORITY_ASYNC_WRITE     */
-       4,      /* ZIO_PRIORITY_FREE            */
-       0,      /* ZIO_PRIORITY_CACHE_FILL      */
        0,      /* ZIO_PRIORITY_LOG_WRITE       */
+       1,      /* ZIO_PRIORITY_CACHE_FILL      */
+       1,      /* ZIO_PRIORITY_AGG             */
+       4,      /* ZIO_PRIORITY_FREE            */
+       4,      /* ZIO_PRIORITY_ASYNC_WRITE     */
+       6,      /* ZIO_PRIORITY_ASYNC_READ      */
        10,     /* ZIO_PRIORITY_RESILVER        */
        20,     /* ZIO_PRIORITY_SCRUB           */
 };
@@ -64,7 +65,9 @@ uint8_t zio_priority_table[ZIO_PRIORITY_
  * ==========================================================================
  */
 char *zio_type_name[ZIO_TYPES] = {
-       "null", "read", "write", "free", "claim", "ioctl" };
+       "zio_null", "zio_read", "zio_write", "zio_free", "zio_claim",
+       "zio_ioctl"
+};
 
 #define        SYNC_PASS_DEFERRED_FREE 1       /* defer frees after this pass */
 #define        SYNC_PASS_DONT_COMPRESS 4       /* don't compress after this pass */
@@ -942,6 +945,7 @@ zio_write_bp_init(zio_t *zio)
 static void
 zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q)
 {
+       spa_t *spa = zio->io_spa;
        zio_type_t t = zio->io_type;
 
        /*
@@ -958,7 +962,15 @@ zio_taskq_dispatch(zio_t *zio, enum zio_
        if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux)
                t = ZIO_TYPE_NULL;
 
-       (void) taskq_dispatch_safe(zio->io_spa->spa_zio_taskq[t][q],
+       /*
+        * If this is a high priority I/O, then use the high priority taskq.
+        */
+       if (zio->io_priority == ZIO_PRIORITY_NOW &&
+           spa->spa_zio_taskq[t][q + 1] != NULL)
+               q++;
+
+       ASSERT3U(q, <, ZIO_TASKQ_TYPES);
+       (void) taskq_dispatch_safe(spa->spa_zio_taskq[t][q],
            (task_func_t *)zio_execute, zio, &zio->io_task);
 }
 
_______________________________________________
svn-src-head@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscribe@freebsd.org"

Reply via email to