Author: mav
Date: Wed May 11 12:54:00 2016
New Revision: 299441
URL: https://svnweb.freebsd.org/changeset/base/299441

Log:
  MFV r299440: 6736 ZFS per-vdev ZAPs
  
  Reviewed by: Matthew Ahrens <mahr...@delphix.com>
  Reviewed by: John Kennedy <john.kenn...@delphix.com>
  Reviewed by: George Wilson <george.wil...@delphix.com>
  Reviewed by: Don Brady <don.br...@intel.com>
  Reviewed by: Dan McDonald <dan...@omniti.com>
  Approved by: Richard Lowe <richl...@richlowe.net>
  Author: Joe Stein <joe.st...@delphix.com>
  
  openzfs/openzfs@215198a6ad15cf4832370e2f19247abeb36b951a

Modified:
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c
  head/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h
Directory Properties:
  head/sys/cddl/contrib/opensolaris/   (props changed)

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c   Wed May 11 12:50:58 2016        (r299440)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c   Wed May 11 12:54:00 2016        (r299441)
@@ -1665,6 +1665,19 @@ spa_check_removed(vdev_t *vd)
        }
 }
 
+static void
+spa_config_valid_zaps(vdev_t *vd, vdev_t *mvd)
+{
+       ASSERT3U(vd->vdev_children, ==, mvd->vdev_children);
+
+       vd->vdev_top_zap = mvd->vdev_top_zap;
+       vd->vdev_leaf_zap = mvd->vdev_leaf_zap;
+
+       for (uint64_t i = 0; i < vd->vdev_children; i++) {
+               spa_config_valid_zaps(vd->vdev_child[i], mvd->vdev_child[i]);
+       }
+}
+
 /*
  * Validate the current config against the MOS config
  */
@@ -1768,16 +1781,25 @@ spa_config_valid(spa_t *spa, nvlist_t *c
                        spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 
                        vdev_reopen(rvd);
-               } else if (mtvd->vdev_islog) {
+               } else {
+                       if (mtvd->vdev_islog) {
+                               /*
+                                * Load the slog device's state from the MOS
+                                * config since it's possible that the label
+                                * does not contain the most up-to-date
+                                * information.
+                                */
+                               vdev_load_log_state(tvd, mtvd);
+                               vdev_reopen(tvd);
+                       }
+
                        /*
-                        * Load the slog device's state from the MOS config
-                        * since it's possible that the label does not
-                        * contain the most up-to-date information.
+                        * Per-vdev ZAP info is stored exclusively in the MOS.
                         */
-                       vdev_load_log_state(tvd, mtvd);
-                       vdev_reopen(tvd);
+                       spa_config_valid_zaps(tvd, mtvd);
                }
        }
+
        vdev_free(mrvd);
        spa_config_exit(spa, SCL_ALL, FTAG);
 
@@ -2210,6 +2232,34 @@ spa_load(spa_t *spa, spa_load_state_t st
 }
 
 /*
+ * Count the number of per-vdev ZAPs associated with all of the vdevs in the
+ * vdev tree rooted in the given vd, and ensure that each ZAP is present in the
+ * spa's per-vdev ZAP list.
+ */
+static uint64_t
+vdev_count_verify_zaps(vdev_t *vd)
+{
+       spa_t *spa = vd->vdev_spa;
+       uint64_t total = 0;
+       if (vd->vdev_top_zap != 0) {
+               total++;
+               ASSERT0(zap_lookup_int(spa->spa_meta_objset,
+                   spa->spa_all_vdev_zaps, vd->vdev_top_zap));
+       }
+       if (vd->vdev_leaf_zap != 0) {
+               total++;
+               ASSERT0(zap_lookup_int(spa->spa_meta_objset,
+                   spa->spa_all_vdev_zaps, vd->vdev_leaf_zap));
+       }
+
+       for (uint64_t i = 0; i < vd->vdev_children; i++) {
+               total += vdev_count_verify_zaps(vd->vdev_child[i]);
+       }
+
+       return (total);
+}
+
+/*
  * Load an existing storage pool, using the pool's builtin spa_config as a
  * source of configuration information.
  */
@@ -2638,6 +2688,39 @@ spa_load_impl(spa_t *spa, uint64_t pool_
                return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
        /*
+        * Load the per-vdev ZAP map. If we have an older pool, this will not
+        * be present; in this case, defer its creation to a later time to
+        * avoid dirtying the MOS this early / out of sync context. See
+        * spa_sync_config_object.
+        */
+
+       /* The sentinel is only available in the MOS config. */
+       nvlist_t *mos_config;
+       if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0)
+               return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+
+       error = spa_dir_prop(spa, DMU_POOL_VDEV_ZAP_MAP,
+           &spa->spa_all_vdev_zaps);
+
+       if (error != ENOENT && error != 0) {
+               return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+       } else if (error == 0 && !nvlist_exists(mos_config,
+           ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)) {
+               /*
+                * An older version of ZFS overwrote the sentinel value, so
+                * we have orphaned per-vdev ZAPs in the MOS. Defer their
+                * destruction to later; see spa_sync_config_object.
+                */
+               spa->spa_avz_action = AVZ_ACTION_DESTROY;
+               /*
+                * We're assuming that no vdevs have had their ZAPs created
+                * before this. Better be sure of it.
+                */
+               ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev));
+       }
+       nvlist_free(mos_config);
+
+       /*
         * If we're assembling the pool from the split-off vdevs of
         * an existing pool, we don't want to attach the spares & cache
         * devices.
@@ -5324,6 +5407,16 @@ spa_vdev_split_mirror(spa_t *spa, char *
                    vml[c]->vdev_top->vdev_asize) == 0);
                VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT,
                    vml[c]->vdev_top->vdev_ashift) == 0);
+
+               /* transfer per-vdev ZAPs */
+               ASSERT3U(vml[c]->vdev_leaf_zap, !=, 0);
+               VERIFY0(nvlist_add_uint64(child[c],
+                   ZPOOL_CONFIG_VDEV_LEAF_ZAP, vml[c]->vdev_leaf_zap));
+
+               ASSERT3U(vml[c]->vdev_top->vdev_top_zap, !=, 0);
+               VERIFY0(nvlist_add_uint64(child[c],
+                   ZPOOL_CONFIG_VDEV_TOP_ZAP,
+                   vml[c]->vdev_parent->vdev_top_zap));
        }
 
        if (error != 0) {
@@ -5365,11 +5458,13 @@ spa_vdev_split_mirror(spa_t *spa, char *
            spa->spa_config_txg) == 0);
        VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID,
            spa_generate_guid(NULL)) == 0);
+       VERIFY0(nvlist_add_boolean(config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS));
        (void) nvlist_lookup_string(props,
            zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
 
        /* add the new pool to the namespace */
        newspa = spa_add(newname, config, altroot);
+       newspa->spa_avz_action = AVZ_ACTION_REBUILD;
        newspa->spa_config_txg = spa->spa_config_txg;
        spa_set_log_state(newspa, SPA_LOG_CLEAR);
 
@@ -5434,9 +5529,11 @@ spa_vdev_split_mirror(spa_t *spa, char *
                        if (error == 0)
                                spa_history_log_internal(spa, "detach", tx,
                                    "vdev=%s", vml[c]->vdev_path);
+
                        vdev_free(vml[c]);
                }
        }
+       spa->spa_avz_action = AVZ_ACTION_REBUILD;
        vdev_config_dirty(spa->spa_root_vdev);
        spa->spa_config_splitting = NULL;
        nvlist_free(nvl);
@@ -6321,16 +6418,118 @@ spa_sync_aux_dev(spa_t *spa, spa_aux_vde
        sav->sav_sync = B_FALSE;
 }
 
+/*
+ * Rebuild spa's all-vdev ZAP from the vdev ZAPs indicated in each vdev_t.
+ * The all-vdev ZAP must be empty.
+ */
+static void
+spa_avz_build(vdev_t *vd, uint64_t avz, dmu_tx_t *tx)
+{
+       spa_t *spa = vd->vdev_spa;
+       if (vd->vdev_top_zap != 0) {
+               VERIFY0(zap_add_int(spa->spa_meta_objset, avz,
+                   vd->vdev_top_zap, tx));
+       }
+       if (vd->vdev_leaf_zap != 0) {
+               VERIFY0(zap_add_int(spa->spa_meta_objset, avz,
+                   vd->vdev_leaf_zap, tx));
+       }
+       for (uint64_t i = 0; i < vd->vdev_children; i++) {
+               spa_avz_build(vd->vdev_child[i], avz, tx);
+       }
+}
+
 static void
 spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
 {
        nvlist_t *config;
 
-       if (list_is_empty(&spa->spa_config_dirty_list))
+       /*
+        * If the pool is being imported from a pre-per-vdev-ZAP version of ZFS,
+        * its config may not be dirty but we still need to build per-vdev ZAPs.
+        * Similarly, if the pool is being assembled (e.g. after a split), we
+        * need to rebuild the AVZ although the config may not be dirty.
+        */
+       if (list_is_empty(&spa->spa_config_dirty_list) &&
+           spa->spa_avz_action == AVZ_ACTION_NONE)
                return;
 
        spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
 
+       ASSERT(spa->spa_avz_action == AVZ_ACTION_NONE ||
+           spa->spa_all_vdev_zaps != 0);
+
+       if (spa->spa_avz_action == AVZ_ACTION_REBUILD) {
+               /* Make and build the new AVZ */
+               uint64_t new_avz = zap_create(spa->spa_meta_objset,
+                   DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx);
+               spa_avz_build(spa->spa_root_vdev, new_avz, tx);
+
+               /* Diff old AVZ with new one */
+               zap_cursor_t zc;
+               zap_attribute_t za;
+
+               for (zap_cursor_init(&zc, spa->spa_meta_objset,
+                   spa->spa_all_vdev_zaps);
+                   zap_cursor_retrieve(&zc, &za) == 0;
+                   zap_cursor_advance(&zc)) {
+                       uint64_t vdzap = za.za_first_integer;
+                       if (zap_lookup_int(spa->spa_meta_objset, new_avz,
+                           vdzap) == ENOENT) {
+                               /*
+                                * ZAP is listed in old AVZ but not in new one;
+                                * destroy it
+                                */
+                               VERIFY0(zap_destroy(spa->spa_meta_objset, vdzap,
+                                   tx));
+                       }
+               }
+
+               zap_cursor_fini(&zc);
+
+               /* Destroy the old AVZ */
+               VERIFY0(zap_destroy(spa->spa_meta_objset,
+                   spa->spa_all_vdev_zaps, tx));
+
+               /* Replace the old AVZ in the dir obj with the new one */
+               VERIFY0(zap_update(spa->spa_meta_objset,
+                   DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP,
+                   sizeof (new_avz), 1, &new_avz, tx));
+
+               spa->spa_all_vdev_zaps = new_avz;
+       } else if (spa->spa_avz_action == AVZ_ACTION_DESTROY) {
+               zap_cursor_t zc;
+               zap_attribute_t za;
+
+               /* Walk through the AVZ and destroy all listed ZAPs */
+               for (zap_cursor_init(&zc, spa->spa_meta_objset,
+                   spa->spa_all_vdev_zaps);
+                   zap_cursor_retrieve(&zc, &za) == 0;
+                   zap_cursor_advance(&zc)) {
+                       uint64_t zap = za.za_first_integer;
+                       VERIFY0(zap_destroy(spa->spa_meta_objset, zap, tx));
+               }
+
+               zap_cursor_fini(&zc);
+
+               /* Destroy and unlink the AVZ itself */
+               VERIFY0(zap_destroy(spa->spa_meta_objset,
+                   spa->spa_all_vdev_zaps, tx));
+               VERIFY0(zap_remove(spa->spa_meta_objset,
+                   DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, tx));
+               spa->spa_all_vdev_zaps = 0;
+       }
+
+       if (spa->spa_all_vdev_zaps == 0) {
+               spa->spa_all_vdev_zaps = zap_create_link(spa->spa_meta_objset,
+                   DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT,
+                   DMU_POOL_VDEV_ZAP_MAP, tx);
+       }
+       spa->spa_avz_action = AVZ_ACTION_NONE;
+
+       /* Create ZAPs for vdevs that don't have them. */
+       vdev_construct_zaps(spa->spa_root_vdev, tx);
+
        config = spa_config_generate(spa, spa->spa_root_vdev,
            dmu_tx_get_txg(tx), B_FALSE);
 
@@ -6738,6 +6937,21 @@ spa_sync(spa_t *spa, uint64_t txg)
 
        } while (dmu_objset_is_dirty(mos, txg));
 
+       if (!list_is_empty(&spa->spa_config_dirty_list)) {
+               /*
+                * Make sure that the number of ZAPs for all the vdevs matches
+                * the number of ZAPs in the per-vdev ZAP list. This only gets
+                * called if the config is dirty; otherwise there may be
+                * outstanding AVZ operations that weren't completed in
+                * spa_sync_config_object.
+                */
+               uint64_t all_vdev_zap_entry_count;
+               ASSERT0(zap_count(spa->spa_meta_objset,
+                   spa->spa_all_vdev_zaps, &all_vdev_zap_entry_count));
+               ASSERT3U(vdev_count_verify_zaps(spa->spa_root_vdev), ==,
+                   all_vdev_zap_entry_count);
+       }
+
        /*
         * Rewrite the vdev configuration (which includes the uberblock)
         * to commit the transaction group.

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c    Wed May 11 12:50:58 2016        (r299440)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c    Wed May 11 12:54:00 2016        (r299441)
@@ -22,7 +22,7 @@
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -123,7 +123,7 @@ spa_config_load(void)
                if (nvpair_type(nvpair) != DATA_TYPE_NVLIST)
                        continue;
 
-               VERIFY(nvpair_value_nvlist(nvpair, &child) == 0);
+               child = fnvpair_value_nvlist(nvpair);
 
                if (spa_lookup(nvpair_name(nvpair)) != NULL)
                        continue;
@@ -181,14 +181,9 @@ spa_config_write(spa_config_dirent_t *dp
        /*
         * Pack the configuration into a buffer.
         */
-       VERIFY(nvlist_size(nvl, &buflen, NV_ENCODE_XDR) == 0);
-
-       buf = kmem_alloc(buflen, KM_SLEEP);
+       buf = fnvlist_pack(nvl, &buflen);
        temp = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
 
-       VERIFY(nvlist_pack(nvl, &buf, &buflen, NV_ENCODE_XDR,
-           KM_SLEEP) == 0);
-
        /*
         * Write the configuration to disk.  We need to do the traditional
         * 'write to temporary file, sync, move over original' to make sure we
@@ -209,7 +204,7 @@ spa_config_write(spa_config_dirent_t *dp
 
        (void) vn_remove(temp, UIO_SYSSPACE, RMFILE);
 
-       kmem_free(buf, buflen);
+       fnvlist_pack_free(buf, buflen);
        kmem_free(temp, MAXPATHLEN);
        return (err);
 }
@@ -276,11 +271,10 @@ spa_config_sync(spa_t *target, boolean_t
                        }
 
                        if (nvl == NULL)
-                               VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME,
-                                   KM_SLEEP) == 0);
+                               nvl = fnvlist_alloc();
 
-                       VERIFY(nvlist_add_nvlist(nvl, spa->spa_name,
-                           spa->spa_config) == 0);
+                       fnvlist_add_nvlist(nvl, spa->spa_name,
+                           spa->spa_config);
                        mutex_exit(&spa->spa_props_lock);
 
                         if (nvlist_lookup_nvlist(nvl, spa->spa_name, &nvroot) == 0)
@@ -345,15 +339,15 @@ spa_all_configs(uint64_t *generation)
        if (*generation == spa_config_generation)
                return (NULL);
 
-       VERIFY(nvlist_alloc(&pools, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+       pools = fnvlist_alloc();
 
        mutex_enter(&spa_namespace_lock);
        while ((spa = spa_next(spa)) != NULL) {
                if (INGLOBALZONE(curthread) ||
                    zone_dataset_visible(spa_name(spa), NULL)) {
                        mutex_enter(&spa->spa_props_lock);
-                       VERIFY(nvlist_add_nvlist(pools, spa_name(spa),
-                           spa->spa_config) == 0);
+                       fnvlist_add_nvlist(pools, spa_name(spa),
+                           spa->spa_config);
                        mutex_exit(&spa->spa_props_lock);
                }
        }
@@ -402,21 +396,17 @@ spa_config_generate(spa_t *spa, vdev_t *
        if (txg == -1ULL)
                txg = spa->spa_config_txg;
 
-       VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, KM_SLEEP) == 0);
-
-       VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
-           spa_version(spa)) == 0);
-       VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
-           spa_name(spa)) == 0);
-       VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
-           spa_state(spa)) == 0);
-       VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG,
-           txg) == 0);
-       VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID,
-           spa_guid(spa)) == 0);
-       VERIFY(spa->spa_comment == NULL || nvlist_add_string(config,
-           ZPOOL_CONFIG_COMMENT, spa->spa_comment) == 0);
+       config = fnvlist_alloc();
 
+       fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, spa_version(spa));
+       fnvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, spa_name(spa));
+       fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, spa_state(spa));
+       fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, txg);
+       fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, spa_guid(spa));
+       if (spa->spa_comment != NULL) {
+               fnvlist_add_string(config, ZPOOL_CONFIG_COMMENT,
+                   spa->spa_comment);
+       }
 
 #ifdef _KERNEL
        hostid = zone_get_hostid(NULL);
@@ -428,23 +418,24 @@ spa_config_generate(spa_t *spa, vdev_t *
        (void) ddi_strtoul(hw_serial, NULL, 10, &hostid);
 #endif /* _KERNEL */
        if (hostid != 0) {
-               VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_HOSTID,
-                   hostid) == 0);
+               fnvlist_add_uint64(config, ZPOOL_CONFIG_HOSTID, hostid);
        }
-       VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_HOSTNAME,
-           utsname.nodename) == 0);
+       fnvlist_add_string(config, ZPOOL_CONFIG_HOSTNAME, utsname.nodename);
 
+       int config_gen_flags = 0;
        if (vd != rvd) {
-               VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TOP_GUID,
-                   vd->vdev_top->vdev_guid) == 0);
-               VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_GUID,
-                   vd->vdev_guid) == 0);
-               if (vd->vdev_isspare)
-                       VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_IS_SPARE,
-                           1ULL) == 0);
-               if (vd->vdev_islog)
-                       VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_IS_LOG,
-                           1ULL) == 0);
+               fnvlist_add_uint64(config, ZPOOL_CONFIG_TOP_GUID,
+                   vd->vdev_top->vdev_guid);
+               fnvlist_add_uint64(config, ZPOOL_CONFIG_GUID,
+                   vd->vdev_guid);
+               if (vd->vdev_isspare) {
+                       fnvlist_add_uint64(config,
+                           ZPOOL_CONFIG_IS_SPARE, 1ULL);
+               }
+               if (vd->vdev_islog) {
+                       fnvlist_add_uint64(config,
+                           ZPOOL_CONFIG_IS_LOG, 1ULL);
+               }
                vd = vd->vdev_top;              /* label contains top config */
        } else {
                /*
@@ -452,8 +443,12 @@ spa_config_generate(spa_t *spa, vdev_t *
                 * in the mos config, and not in the vdev labels
                 */
                if (spa->spa_config_splitting != NULL)
-                       VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_SPLIT,
-                           spa->spa_config_splitting) == 0);
+                       fnvlist_add_nvlist(config, ZPOOL_CONFIG_SPLIT,
+                           spa->spa_config_splitting);
+               fnvlist_add_boolean(config,
+                   ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS);
+
+               config_gen_flags |= VDEV_CONFIG_MOS;
        }
 
        /*
@@ -468,19 +463,19 @@ spa_config_generate(spa_t *spa, vdev_t *
        if (spa->spa_config_splitting != NULL &&
            nvlist_lookup_uint64(spa->spa_config_splitting,
            ZPOOL_CONFIG_SPLIT_GUID, &split_guid) == 0) {
-               VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_SPLIT_GUID,
-                   split_guid) == 0);
+               fnvlist_add_uint64(config, ZPOOL_CONFIG_SPLIT_GUID,
+                   split_guid);
        }
 
-       nvroot = vdev_config_generate(spa, vd, getstats, 0);
-       VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
+       nvroot = vdev_config_generate(spa, vd, getstats, config_gen_flags);
+       fnvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot);
        nvlist_free(nvroot);
 
        /*
         * Store what's necessary for reading the MOS in the label.
         */
-       VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURES_FOR_READ,
-           spa->spa_label_features) == 0);
+       fnvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURES_FOR_READ,
+           spa->spa_label_features);
 
        if (getstats && spa_load_state(spa) == SPA_LOAD_NONE) {
                ddt_histogram_t *ddh;
@@ -489,23 +484,23 @@ spa_config_generate(spa_t *spa, vdev_t *
 
                ddh = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP);
                ddt_get_dedup_histogram(spa, ddh);
-               VERIFY(nvlist_add_uint64_array(config,
+               fnvlist_add_uint64_array(config,
                    ZPOOL_CONFIG_DDT_HISTOGRAM,
-                   (uint64_t *)ddh, sizeof (*ddh) / sizeof (uint64_t)) == 0);
+                   (uint64_t *)ddh, sizeof (*ddh) / sizeof (uint64_t));
                kmem_free(ddh, sizeof (ddt_histogram_t));
 
                ddo = kmem_zalloc(sizeof (ddt_object_t), KM_SLEEP);
                ddt_get_dedup_object_stats(spa, ddo);
-               VERIFY(nvlist_add_uint64_array(config,
+               fnvlist_add_uint64_array(config,
                    ZPOOL_CONFIG_DDT_OBJ_STATS,
-                   (uint64_t *)ddo, sizeof (*ddo) / sizeof (uint64_t)) == 0);
+                   (uint64_t *)ddo, sizeof (*ddo) / sizeof (uint64_t));
                kmem_free(ddo, sizeof (ddt_object_t));
 
                dds = kmem_zalloc(sizeof (ddt_stat_t), KM_SLEEP);
                ddt_get_dedup_stats(spa, dds);
-               VERIFY(nvlist_add_uint64_array(config,
+               fnvlist_add_uint64_array(config,
                    ZPOOL_CONFIG_DDT_STATS,
-                   (uint64_t *)dds, sizeof (*dds) / sizeof (uint64_t)) == 0);
+                   (uint64_t *)dds, sizeof (*dds) / sizeof (uint64_t));
                kmem_free(dds, sizeof (ddt_stat_t));
        }
 

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h       Wed May 11 12:50:58 2016        (r299440)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h       Wed May 11 12:54:00 2016        (r299441)
@@ -323,6 +323,7 @@ typedef struct dmu_buf {
 #define        DMU_POOL_BPTREE_OBJ             "bptree_obj"
 #define        DMU_POOL_EMPTY_BPOBJ            "empty_bpobj"
 #define        DMU_POOL_CHECKSUM_SALT          "org.illumos:checksum_salt"
+#define        DMU_POOL_VDEV_ZAP_MAP           "com.delphix:vdev_zap_map"
 
 /*
  * Allocate an object from this objset.  The range of object numbers

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h  Wed May 11 12:50:58 2016        (r299440)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h  Wed May 11 12:54:00 2016        (r299441)
@@ -117,6 +117,12 @@ typedef struct spa_taskqs {
        taskq_t **stqs_taskq;
 } spa_taskqs_t;
 
+typedef enum spa_all_vdev_zap_action {
+       AVZ_ACTION_NONE = 0,
+       AVZ_ACTION_DESTROY,     /* Destroy all per-vdev ZAPs and the AVZ. */
+       AVZ_ACTION_REBUILD      /* Populate the new AVZ, see spa_avz_build */
+} spa_avz_action_t;
+
 struct spa {
        /*
         * Fields protected by spa_namespace_lock.
@@ -264,6 +270,9 @@ struct spa {
        uint64_t        spa_deadman_calls;      /* number of deadman calls */
        hrtime_t        spa_sync_starttime;     /* starting time fo spa_sync */
        uint64_t        spa_deadman_synctime;   /* deadman expiration timer */
+       uint64_t        spa_all_vdev_zaps;      /* ZAP of per-vd ZAP obj #s */
+       spa_avz_action_t        spa_avz_action; /* destroy/rebuild AVZ? */
+
 #ifdef illumos
        /*
         * spa_iokstat_lock protects spa_iokstat and

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h      Wed May 11 12:50:58 2016        (r299440)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h      Wed May 11 12:54:00 2016        (r299441)
@@ -72,6 +72,10 @@ extern void vdev_dtl_reassess(vdev_t *vd
 extern boolean_t vdev_dtl_required(vdev_t *vd);
 extern boolean_t vdev_resilver_needed(vdev_t *vd,
     uint64_t *minp, uint64_t *maxp);
+extern void vdev_destroy_unlink_zap(vdev_t *vd, uint64_t zapobj,
+    dmu_tx_t *tx);
+extern uint64_t vdev_create_link_zap(vdev_t *vd, dmu_tx_t *tx);
+extern void vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx);
 
 extern void vdev_hold(vdev_t *);
 extern void vdev_rele(vdev_t *);
@@ -135,7 +139,8 @@ extern void vdev_state_clean(vdev_t *vd)
 typedef enum vdev_config_flag {
        VDEV_CONFIG_SPARE = 1 << 0,
        VDEV_CONFIG_L2CACHE = 1 << 1,
-       VDEV_CONFIG_REMOVING = 1 << 2
+       VDEV_CONFIG_REMOVING = 1 << 2,
+       VDEV_CONFIG_MOS = 1 << 3
 } vdev_config_flag_t;
 
 extern void vdev_top_config_generate(spa_t *spa, nvlist_t *config);

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h Wed May 11 12:50:58 2016        (r299440)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h Wed May 11 12:54:00 2016        (r299441)
@@ -191,6 +191,7 @@ struct vdev {
        uint64_t        vdev_islog;     /* is an intent log device      */
        uint64_t        vdev_removing;  /* device is being removed?     */
        boolean_t       vdev_ishole;    /* is a hole in the namespace   */
+       uint64_t        vdev_top_zap;
 
        /*
         * Leaf vdev state.
@@ -234,6 +235,7 @@ struct vdev {
        uint16_t        vdev_rotation_rate; /* rotational rate of the media */
 #define        VDEV_RATE_UNKNOWN       0
 #define        VDEV_RATE_NON_ROTATING  1
+       uint64_t        vdev_leaf_zap;
 
        /*
         * For DTrace to work in userland (libzpool) context, these fields must

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c  Wed May 11 12:50:58 2016        (r299440)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c  Wed May 11 12:54:00 2016        (r299441)
@@ -609,6 +609,10 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvl
                    &vd->vdev_asize);
                (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING,
                    &vd->vdev_removing);
+               (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP,
+                   &vd->vdev_top_zap);
+       } else {
+               ASSERT0(vd->vdev_top_zap);
        }
 
        if (parent && !parent->vdev_parent && alloctype != VDEV_ALLOC_ATTACH) {
@@ -620,9 +624,18 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvl
                    spa_log_class(spa) : spa_normal_class(spa), vd);
        }
 
+       if (vd->vdev_ops->vdev_op_leaf &&
+           (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) {
+               (void) nvlist_lookup_uint64(nv,
+                   ZPOOL_CONFIG_VDEV_LEAF_ZAP, &vd->vdev_leaf_zap);
+       } else {
+               ASSERT0(vd->vdev_leaf_zap);
+       }
+
        /*
         * If we're a leaf vdev, try to load the DTL object and other state.
         */
+
        if (vd->vdev_ops->vdev_op_leaf &&
            (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE ||
            alloctype == VDEV_ALLOC_ROOTPOOL)) {
@@ -783,10 +796,12 @@ vdev_top_transfer(vdev_t *svd, vdev_t *t
        tvd->vdev_ms_array = svd->vdev_ms_array;
        tvd->vdev_ms_shift = svd->vdev_ms_shift;
        tvd->vdev_ms_count = svd->vdev_ms_count;
+       tvd->vdev_top_zap = svd->vdev_top_zap;
 
        svd->vdev_ms_array = 0;
        svd->vdev_ms_shift = 0;
        svd->vdev_ms_count = 0;
+       svd->vdev_top_zap = 0;
 
        if (tvd->vdev_mg)
                ASSERT3P(tvd->vdev_mg, ==, svd->vdev_mg);
@@ -2079,6 +2094,49 @@ vdev_dtl_load(vdev_t *vd)
 }
 
 void
+vdev_destroy_unlink_zap(vdev_t *vd, uint64_t zapobj, dmu_tx_t *tx)
+{
+       spa_t *spa = vd->vdev_spa;
+
+       VERIFY0(zap_destroy(spa->spa_meta_objset, zapobj, tx));
+       VERIFY0(zap_remove_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps,
+           zapobj, tx));
+}
+
+uint64_t
+vdev_create_link_zap(vdev_t *vd, dmu_tx_t *tx)
+{
+       spa_t *spa = vd->vdev_spa;
+       uint64_t zap = zap_create(spa->spa_meta_objset, DMU_OTN_ZAP_METADATA,
+           DMU_OT_NONE, 0, tx);
+
+       ASSERT(zap != 0);
+       VERIFY0(zap_add_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps,
+           zap, tx));
+
+       return (zap);
+}
+
+void
+vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx)
+{
+       if (vd->vdev_ops != &vdev_hole_ops &&
+           vd->vdev_ops != &vdev_missing_ops &&
+           vd->vdev_ops != &vdev_root_ops &&
+           !vd->vdev_top->vdev_removing) {
+               if (vd->vdev_ops->vdev_op_leaf && vd->vdev_leaf_zap == 0) {
+                       vd->vdev_leaf_zap = vdev_create_link_zap(vd, tx);
+               }
+               if (vd == vd->vdev_top && vd->vdev_top_zap == 0) {
+                       vd->vdev_top_zap = vdev_create_link_zap(vd, tx);
+               }
+       }
+       for (uint64_t i = 0; i < vd->vdev_children; i++) {
+               vdev_construct_zaps(vd->vdev_child[i], tx);
+       }
+}
+
+void
 vdev_dtl_sync(vdev_t *vd, uint64_t txg)
 {
        spa_t *spa = vd->vdev_spa;
@@ -2100,6 +2158,18 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg)
                space_map_close(vd->vdev_dtl_sm);
                vd->vdev_dtl_sm = NULL;
                mutex_exit(&vd->vdev_dtl_lock);
+
+               /*
+                * We only destroy the leaf ZAP for detached leaves or for
+                * removed log devices. Removed data devices handle leaf ZAP
+                * cleanup later, once cancellation is no longer possible.
+                */
+               if (vd->vdev_leaf_zap != 0 && (vd->vdev_detached ||
+                   vd->vdev_top->vdev_islog)) {
+                       vdev_destroy_unlink_zap(vd, vd->vdev_leaf_zap, tx);
+                       vd->vdev_leaf_zap = 0;
+               }
+
                dmu_tx_commit(tx);
                return;
        }
@@ -2303,6 +2373,8 @@ vdev_remove(vdev_t *vd, uint64_t txg)
        dmu_tx_t *tx;
 
        tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
+       ASSERT(vd == vd->vdev_top);
+       ASSERT3U(txg, ==, spa_syncing_txg(spa));
 
        if (vd->vdev_ms != NULL) {
                metaslab_group_t *mg = vd->vdev_mg;
@@ -2344,6 +2416,11 @@ vdev_remove(vdev_t *vd, uint64_t txg)
                (void) dmu_object_free(mos, vd->vdev_ms_array, tx);
                vd->vdev_ms_array = 0;
        }
+
+       if (vd->vdev_islog && vd->vdev_top_zap != 0) {
+               vdev_destroy_unlink_zap(vd, vd->vdev_top_zap, tx);
+               vd->vdev_top_zap = 0;
+       }
        dmu_tx_commit(tx);
 }
 

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c    Wed May 11 12:50:58 2016        (r299440)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c    Wed May 11 12:54:00 2016        (r299441)
@@ -297,6 +297,20 @@ vdev_config_generate(spa_t *spa, vdev_t 
        if (vd->vdev_crtxg)
                fnvlist_add_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, vd->vdev_crtxg);
 
+       if (flags & VDEV_CONFIG_MOS) {
+               if (vd->vdev_leaf_zap != 0) {
+                       ASSERT(vd->vdev_ops->vdev_op_leaf);
+                       fnvlist_add_uint64(nv, ZPOOL_CONFIG_VDEV_LEAF_ZAP,
+                           vd->vdev_leaf_zap);
+               }
+
+               if (vd->vdev_top_zap != 0) {
+                       ASSERT(vd == vd->vdev_top);
+                       fnvlist_add_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP,
+                           vd->vdev_top_zap);
+               }
+       }
+
        if (getstats) {
                vdev_stat_t vs;
                pool_scan_stat_t ps;

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c   Wed May 11 12:50:58 2016        (r299440)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c   Wed May 11 12:54:00 2016        (r299441)
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  */
 
@@ -962,8 +962,8 @@ zap_create_link(objset_t *os, dmu_object
        uint64_t new_obj;
 
        VERIFY((new_obj = zap_create(os, ot, DMU_OT_NONE, 0, tx)) > 0);
-       VERIFY(zap_add(os, parent_obj, name, sizeof (uint64_t), 1, &new_obj,
-           tx) == 0);
+       VERIFY0(zap_add(os, parent_obj, name, sizeof (uint64_t), 1, &new_obj,
+           tx));
 
        return (new_obj);
 }

Modified: head/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h   Wed May 11 12:50:58 2016        (r299440)
+++ head/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h   Wed May 11 12:54:00 2016        (r299441)
@@ -565,6 +565,9 @@ typedef struct zpool_rewind_policy {
 #define        ZPOOL_CONFIG_CAN_RDONLY         "can_rdonly"    /* not stored on disk */
 #define        ZPOOL_CONFIG_FEATURES_FOR_READ  "features_for_read"
 #define        ZPOOL_CONFIG_FEATURE_STATS      "feature_stats" /* not stored on disk */
+#define        ZPOOL_CONFIG_VDEV_TOP_ZAP       "com.delphix:vdev_zap_top"
+#define        ZPOOL_CONFIG_VDEV_LEAF_ZAP      "com.delphix:vdev_zap_leaf"
+#define        ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS  "com.delphix:has_per_vdev_zaps"
 /*
  * The persistent vdev state is stored as separate values rather than a single
  * 'vdev_state' entry.  This is because a device can be in multiple states, such
_______________________________________________
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"

Reply via email to