The branch main has been updated by glebius:

URL: 
https://cgit.FreeBSD.org/src/commit/?id=a1572e202a1840c26c950ed728358965f535ec7a

commit a1572e202a1840c26c950ed728358965f535ec7a
Author:     Gleb Smirnoff <gleb...@freebsd.org>
AuthorDate: 2025-08-20 14:52:20 +0000
Commit:     Gleb Smirnoff <gleb...@freebsd.org>
CommitDate: 2025-08-20 14:52:20 +0000

    libsa/zfs: further improve handling of stale labels
    
    Fix two problems with 6dd0803ffd31.  First problem is that situation when
    newer label was read before stale one, was handled differently to reverse
    order case.  Second problem is that vdev_free() would free the fully
    initialized leaf vdev that carried stale label.  In a case when vdev
    carries a stale label, but is still referenced by a different label with
    new a configuration, we don't want to free it, but rather insert it into
    the new configuration.
    
    o Provide a helper function nvlist_find_vdev_guid() that checks presence
      of certain GUID in a label.
    o In top level vdev store the GUID of vdev used to instantiate top vdev.
    o Cover all possible cases in the block in vdev_probe() where we encounter
      a known configuration.  Make the diagnostic print more informative and
      looking same regardless of probe order.  Make this whole block easier to
      read reducing one level of indentation for a price of a single comparison
      at runtime.
    
    Reviewed by:            mav, imp
    Differential Revision:  https://reviews.freebsd.org/D51913
    Fixes:                  6dd0803ffd31c60a84488d06928813353c6303d3
---
 stand/libsa/zfs/zfsimpl.c   | 131 +++++++++++++++++++++++++++++++++++++-------
 sys/cddl/boot/zfs/zfsimpl.h |   3 +-
 2 files changed, 113 insertions(+), 21 deletions(-)

diff --git a/stand/libsa/zfs/zfsimpl.c b/stand/libsa/zfs/zfsimpl.c
index fb5b5fb4606f..bc960a1fe3c0 100644
--- a/stand/libsa/zfs/zfsimpl.c
+++ b/stand/libsa/zfs/zfsimpl.c
@@ -833,6 +833,14 @@ vdev_replacing_read(vdev_t *vdev, const blkptr_t *bp, void 
*buf,
        return (kid->v_read(kid, bp, buf, offset, bytes));
 }
 
+/*
+ * List of vdevs that were fully initialized from their own label, but later a
+ * newer label was found that obsoleted the stale label, freeing its
+ * configuration tree.  We keep those vdevs around, since a new configuration
+ * may include them.
+ */
+static vdev_list_t orphans = STAILQ_HEAD_INITIALIZER(orphans);
+
 static vdev_t *
 vdev_find(vdev_list_t *list, uint64_t guid)
 {
@@ -854,6 +862,11 @@ vdev_create(uint64_t guid, vdev_read_t *_read)
        vdev_t *vdev;
        vdev_indirect_config_t *vic;
 
+       if ((vdev = vdev_find(&orphans, guid))) {
+               STAILQ_REMOVE(&orphans, vdev, vdev, v_childlink);
+               return (vdev);
+       }
+
        vdev = calloc(1, sizeof(vdev_t));
        if (vdev != NULL) {
                STAILQ_INIT(&vdev->v_children);
@@ -1101,8 +1114,8 @@ vdev_insert(vdev_t *top_vdev, vdev_t *vdev)
 }
 
 static int
-vdev_from_nvlist(spa_t *spa, uint64_t top_guid, uint64_t txg,
-    const nvlist_t *nvlist)
+vdev_from_nvlist(spa_t *spa, uint64_t top_guid, uint64_t label_guid,
+    uint64_t txg, const nvlist_t *nvlist)
 {
        vdev_t *top_vdev, *vdev;
        nvlist_t **kids = NULL;
@@ -1116,6 +1129,7 @@ vdev_from_nvlist(spa_t *spa, uint64_t top_guid, uint64_t 
txg,
                        return (rc);
                top_vdev->v_spa = spa;
                top_vdev->v_top = top_vdev;
+               top_vdev->v_label = label_guid;
                top_vdev->v_txg = txg;
                (void )vdev_insert(spa->spa_root_vdev, top_vdev);
        }
@@ -1160,12 +1174,14 @@ done:
 static int
 vdev_init_from_label(spa_t *spa, const nvlist_t *nvlist)
 {
-       uint64_t top_guid, txg;
+       uint64_t top_guid, label_guid, txg;
        nvlist_t *vdevs;
        int rc;
 
        if (nvlist_find(nvlist, ZPOOL_CONFIG_TOP_GUID, DATA_TYPE_UINT64,
            NULL, &top_guid, NULL) ||
+           nvlist_find(nvlist, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64,
+           NULL, &label_guid, NULL) != 0 ||
            nvlist_find(nvlist, ZPOOL_CONFIG_POOL_TXG, DATA_TYPE_UINT64,
            NULL, &txg, NULL) != 0 ||
            nvlist_find(nvlist, ZPOOL_CONFIG_VDEV_TREE, DATA_TYPE_NVLIST,
@@ -1174,7 +1190,7 @@ vdev_init_from_label(spa_t *spa, const nvlist_t *nvlist)
                return (ENOENT);
        }
 
-       rc = vdev_from_nvlist(spa, top_guid, txg, vdevs);
+       rc = vdev_from_nvlist(spa, top_guid, label_guid, txg, vdevs);
        nvlist_destroy(vdevs);
        return (rc);
 }
@@ -1271,7 +1287,10 @@ vdev_free(struct vdev *vdev)
 
        STAILQ_FOREACH_SAFE(kid, &vdev->v_children, v_childlink, safe)
                vdev_free(kid);
-       free(vdev);
+       if (vdev->v_phys_read != NULL)
+               STAILQ_INSERT_HEAD(&orphans, vdev, v_childlink);
+       else
+               free(vdev);
 }
 
 static int
@@ -1323,7 +1342,7 @@ vdev_init_from_nvlist(spa_t *spa, const nvlist_t *nvlist)
                 * XXXGL: how can this happen?
                 */
                if (vdev == NULL)
-                       rc = vdev_from_nvlist(spa, guid, 0, kids[i]);
+                       rc = vdev_from_nvlist(spa, guid, 0, 0, kids[i]);
                else
                        rc = vdev_update_from_nvlist(spa->spa_root_vdev, guid,
                            kids[i]);
@@ -1344,6 +1363,53 @@ vdev_init_from_nvlist(spa_t *spa, const nvlist_t *nvlist)
        return (rc);
 }
 
+static bool
+nvlist_find_child_guid(const nvlist_t *nvlist, uint64_t guid)
+{
+       nvlist_t **kids = NULL;
+       int nkids, i;
+       bool rv = false;
+
+       if (nvlist_find(nvlist, ZPOOL_CONFIG_CHILDREN, DATA_TYPE_NVLIST_ARRAY,
+           &nkids, &kids, NULL) != 0)
+               nkids = 0;
+
+       for (i = 0; i < nkids; i++) {
+               uint64_t kid_guid;
+
+               if (nvlist_find(kids[i], ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64,
+                   NULL, &kid_guid, NULL) != 0)
+                       break;
+               if (kid_guid == guid)
+                       rv = true;
+               else
+                       rv = nvlist_find_child_guid(kids[i], guid);
+               if (rv)
+                       break;
+       }
+
+       for (i = 0; i < nkids; i++)
+               nvlist_destroy(kids[i]);
+       free(kids);
+
+       return (rv);
+}
+
+static bool
+nvlist_find_vdev_guid(const nvlist_t *nvlist, uint64_t guid)
+{
+       nvlist_t *vdevs;
+       bool rv;
+
+       if (nvlist_find(nvlist, ZPOOL_CONFIG_VDEV_TREE, DATA_TYPE_NVLIST, NULL,
+           &vdevs, NULL) != 0)
+               return (false);
+       rv = nvlist_find_child_guid(vdevs, guid);
+       nvlist_destroy(vdevs);
+
+       return (rv);
+}
+
 static spa_t *
 spa_find_by_guid(uint64_t guid)
 {
@@ -2012,7 +2078,7 @@ vdev_probe(vdev_phys_read_t *_read, vdev_phys_write_t 
*_write, void *priv,
 {
        vdev_t vtmp;
        spa_t *spa;
-       vdev_t *vdev;
+       vdev_t *vdev, *top;
        nvlist_t *nvl;
        uint64_t val;
        uint64_t guid, pool_guid, top_guid, txg;
@@ -2109,22 +2175,47 @@ vdev_probe(vdev_phys_read_t *_read, vdev_phys_write_t 
*_write, void *priv,
                        nvlist_destroy(nvl);
                        return (ENOMEM);
                }
-       } else {
-               struct vdev *kid;
-
-               STAILQ_FOREACH(kid, &spa->spa_root_vdev->v_children,
-                   v_childlink)
-                       if (kid->v_guid == top_guid && kid->v_txg < txg) {
-                               printf("ZFS: pool %s vdev %s ignoring stale "
-                                   "label from txg 0x%jx, using 0x%jx@0x%jx\n",
-                                   spa->spa_name, kid->v_name,
-                                   kid->v_txg, guid, txg);
+       }
+
+       /*
+        * Check if configuration is already known.  If configuration is known
+        * and txg numbers don't match, we got 2x2 scenarios here.  First, is
+        * the label being read right now _newer_ than the one read before.
+        * Second, is the vdev that provided the stale label _present_ in the
+        * newer configuration.  If neither is true, we completely ignore the
+        * label.
+        */
+       STAILQ_FOREACH(top, &spa->spa_root_vdev->v_children, v_childlink)
+               if (top->v_guid == top_guid) {
+                       bool newer, present;
+
+                       if (top->v_txg == txg)
+                               break;
+                       newer = (top->v_txg < txg);
+                       present = newer ?
+                           nvlist_find_vdev_guid(nvl, top->v_label) :
+                           (vdev_find(&top->v_children, guid) != NULL);
+                       printf("ZFS: pool %s vdev %s %s stale label from "
+                           "0x%jx@0x%jx, %s 0x%jx@0x%jx\n",
+                           spa->spa_name, top->v_name,
+                           present ? "using" : "ignoring",
+                           newer ? top->v_label : guid,
+                           newer ? top->v_txg : txg,
+                           present ? "referred by" : "using",
+                           newer ? guid : top->v_label,
+                           newer ? txg : top->v_txg);
+                       if (newer) {
                                STAILQ_REMOVE(&spa->spa_root_vdev->v_children,
-                                   kid, vdev, v_childlink);
-                               vdev_free(kid);
+                                   top, vdev, v_childlink);
+                               vdev_free(top);
                                break;
+                       } else if (present) {
+                               break;
+                       } else {
+                               nvlist_destroy(nvl);
+                               return (EIO);
                        }
-       }
+               }
 
        /*
         * Get the vdev tree and create our in-core copy of it.
diff --git a/sys/cddl/boot/zfs/zfsimpl.h b/sys/cddl/boot/zfs/zfsimpl.h
index d3bc090738ff..c9de1fe4c391 100644
--- a/sys/cddl/boot/zfs/zfsimpl.h
+++ b/sys/cddl/boot/zfs/zfsimpl.h
@@ -2024,7 +2024,8 @@ typedef struct vdev {
        vdev_list_t     v_children;     /* children of this vdev */
        const char      *v_name;        /* vdev name */
        uint64_t        v_guid;         /* vdev guid */
-       uint64_t        v_txg;          /* most recent transaction */
+       uint64_t        v_label;        /* label instantiated from (top vdev) */
+       uint64_t        v_txg;          /* most recent transaction (top vdev) */
        uint64_t        v_id;           /* index in parent */
        uint64_t        v_psize;        /* physical device capacity */
        int             v_ashift;       /* offset to block shift */

Reply via email to