On February 26, 2019 12:18:35 AM PST, Baptiste Daroussin <b...@freebsd.org> 
wrote:
>Author: bapt
>Date: Tue Feb 26 08:18:34 2019
>New Revision: 344569
>URL: https://svnweb.freebsd.org/changeset/base/344569
>
>Log:
>  Implement parallel mounting for ZFS filesystem
>  
>  It was first implemented on Illumos and then ported to ZoL.
>  This patch is a port to FreeBSD of the ZoL version.
>  This patch also includes a fix for a race condition that was amended
>  
>With such patch Delphix has seen a huge decrease in latency of the
>mount phase
>  (https://github.com/openzfs/openzfs/commit/a3f0e2b569 for details).
>With that current change Gandi has measured improvments that are on par
>with
>  those reported by Delphix.
>  
>  Zol commits incorporated:
>https://github.com/zfsonlinux/zfs/commit/a10d50f999511d304f910852c7825c70c9c9e303
>https://github.com/zfsonlinux/zfs/commit/e63ac16d25fbe991a356489c86d4077567dfea21
>  
>  Reviewed by: avg, sef
>  Approved by: avg, sef
>  Obtained from:       ZoL
>  MFC after:   1 month
>  Relnotes:    yes
>  Sponsored by:        Gandi.net
>  Differential Revision:       https://reviews.freebsd.org/D19098
>
>Modified:
>  head/cddl/contrib/opensolaris/cmd/zfs/zfs_main.c
>  head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h
>  head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_dataset.c
>  head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_impl.h
>  head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_mount.c
>
>Modified: head/cddl/contrib/opensolaris/cmd/zfs/zfs_main.c
>==============================================================================
>--- head/cddl/contrib/opensolaris/cmd/zfs/zfs_main.c   Tue Feb 26
>06:22:10 2019  (r344568)
>+++ head/cddl/contrib/opensolaris/cmd/zfs/zfs_main.c   Tue Feb 26
>08:18:34 2019  (r344569)
>@@ -5812,8 +5812,13 @@ zfs_do_holds(int argc, char **argv)
> 
> #define       CHECK_SPINNER 30
> #define       SPINNER_TIME 3          /* seconds */
>-#define       MOUNT_TIME 5            /* seconds */
>+#define       MOUNT_TIME 1            /* seconds */
> 
>+typedef struct get_all_state {
>+      boolean_t       ga_verbose;
>+      get_all_cb_t    *ga_cbp;
>+} get_all_state_t;
>+
> static int
> get_one_dataset(zfs_handle_t *zhp, void *data)
> {
>@@ -5821,10 +5826,10 @@ get_one_dataset(zfs_handle_t *zhp, void *data)
>       static int spinval = 0;
>       static int spincheck = 0;
>       static time_t last_spin_time = (time_t)0;
>-      get_all_cb_t *cbp = data;
>+      get_all_state_t *state = data;
>       zfs_type_t type = zfs_get_type(zhp);
> 
>-      if (cbp->cb_verbose) {
>+      if (state->ga_verbose) {
>               if (--spincheck < 0) {
>                       time_t now = time(NULL);
>                       if (last_spin_time + SPINNER_TIME < now) {
>@@ -5850,26 +5855,24 @@ get_one_dataset(zfs_handle_t *zhp, void *data)
>               zfs_close(zhp);
>               return (0);
>       }
>-      libzfs_add_handle(cbp, zhp);
>-      assert(cbp->cb_used <= cbp->cb_alloc);
>+      libzfs_add_handle(state->ga_cbp, zhp);
>+      assert(state->ga_cbp->cb_used <= state->ga_cbp->cb_alloc);
> 
>       return (0);
> }
> 
> static void
>-get_all_datasets(zfs_handle_t ***dslist, size_t *count, boolean_t
>verbose)
>+get_all_datasets(get_all_cb_t *cbp, boolean_t verbose)
> {
>-      get_all_cb_t cb = { 0 };
>-      cb.cb_verbose = verbose;
>-      cb.cb_getone = get_one_dataset;
>+      get_all_state_t state = {
>+              .ga_verbose = verbose,
>+              .ga_cbp = cbp
>+      };
> 
>       if (verbose)
>               set_progress_header(gettext("Reading ZFS config"));
>-      (void) zfs_iter_root(g_zfs, get_one_dataset, &cb);
>+      (void) zfs_iter_root(g_zfs, get_one_dataset, &state);
> 
>-      *dslist = cb.cb_handles;
>-      *count = cb.cb_used;
>-
>       if (verbose)
>               finish_progress(gettext("done."));
> }
>@@ -5879,9 +5882,20 @@ get_all_datasets(zfs_handle_t ***dslist, size_t
>*count
>* similar, we have a common function with an extra parameter to
>determine which
>  * mode we are using.
>  */
>-#define       OP_SHARE        0x1
>-#define       OP_MOUNT        0x2
>+typedef enum { OP_SHARE, OP_MOUNT } share_mount_op_t;
> 
>+typedef struct share_mount_state {
>+      share_mount_op_t        sm_op;
>+      boolean_t       sm_verbose;
>+      int     sm_flags;
>+      char    *sm_options;
>+      char    *sm_proto; /* only valid for OP_SHARE */
>+      pthread_mutex_t sm_lock; /* protects the remaining fields */
>+      uint_t  sm_total; /* number of filesystems to process */
>+      uint_t  sm_done; /* number of filesystems processed */
>+      int     sm_status; /* -1 if any of the share/mount operations failed */
>+} share_mount_state_t;
>+
> /*
>  * Share or mount a dataset.
>  */
>@@ -6122,6 +6136,29 @@ report_mount_progress(int current, int total)
>               update_progress(info);
> }
> 
>+/*
>+ * zfs_foreach_mountpoint() callback that mounts or shares on
>filesystem and
>+ * updates the progress meter
>+ */
>+static int
>+share_mount_one_cb(zfs_handle_t *zhp, void *arg)
>+{
>+      share_mount_state_t *sms = arg;
>+      int ret;
>+
>+      ret = share_mount_one(zhp, sms->sm_op, sms->sm_flags, sms->sm_proto,
>+          B_FALSE, sms->sm_options);
>+
>+      pthread_mutex_lock(&sms->sm_lock);
>+      if (ret != 0)
>+              sms->sm_status = ret;
>+      sms->sm_done++;
>+      if (sms->sm_verbose)
>+              report_mount_progress(sms->sm_done, sms->sm_total);
>+      pthread_mutex_unlock(&sms->sm_lock);
>+      return (ret);
>+}
>+
> static void
> append_options(char *mntopts, char *newopts)
> {
>@@ -6194,8 +6231,6 @@ share_mount(int op, int argc, char **argv)
> 
>       /* check number of arguments */
>       if (do_all) {
>-              zfs_handle_t **dslist = NULL;
>-              size_t i, count = 0;
>               char *protocol = NULL;
> 
>               if (op == OP_SHARE && argc > 0) {
>@@ -6216,35 +6251,48 @@ share_mount(int op, int argc, char **argv)
>               }
> 
>               start_progress_timer();
>-              get_all_datasets(&dslist, &count, verbose);
>+              get_all_cb_t cb = { 0 };
>+              get_all_datasets(&cb, verbose);
> 
>-              if (count == 0)
>+              if (cb.cb_used == 0) {
>+                      if (options != NULL)
>+                              free(options);
>                       return (0);
>+              }
> 
>-              qsort(dslist, count, sizeof (void *), libzfs_dataset_cmp);
> #ifdef illumos
>-              sa_init_selective_arg_t sharearg;
>-              sharearg.zhandle_arr = dslist;
>-              sharearg.zhandle_len = count;
>-              if ((ret = zfs_init_libshare_arg(zfs_get_handle(dslist[0]),
>-                  SA_INIT_SHARE_API_SELECTIVE, &sharearg)) != SA_OK) {
>-                      (void) fprintf(stderr,
>-                          gettext("Could not initialize libshare, %d"), ret);
>-                      return (ret);
>+              if (op == OP_SHARE) {
>+                      sa_init_selective_arg_t sharearg;
>+                      sharearg.zhandle_arr = cb.cb_handles;
>+                      sharearg.zhandle_len = cb.cb_used;
>+                      if ((ret = zfs_init_libshare_arg(g_zfs,
>+                          SA_INIT_SHARE_API_SELECTIVE, &sharearg)) != SA_OK) {
>+                              (void) fprintf(stderr, gettext(
>+                                  "Could not initialize libshare, %d"), ret);
>+                              return (ret);
>+                      }
>               }
> #endif
>+              share_mount_state_t share_mount_state = { 0 };
>+              share_mount_state.sm_op = op;
>+              share_mount_state.sm_verbose = verbose;
>+              share_mount_state.sm_flags = flags;
>+              share_mount_state.sm_options = options;
>+              share_mount_state.sm_proto = protocol;
>+              share_mount_state.sm_total = cb.cb_used;
>+              pthread_mutex_init(&share_mount_state.sm_lock, NULL);
> 
>-              for (i = 0; i < count; i++) {
>-                      if (verbose)
>-                              report_mount_progress(i, count);
>+              /*
>+               * libshare isn't mt-safe, so only do the operation in parallel
>+               * if we're mounting.
>+               */
>+              zfs_foreach_mountpoint(g_zfs, cb.cb_handles, cb.cb_used,
>+                  share_mount_one_cb, &share_mount_state, op == OP_MOUNT);
>+              ret = share_mount_state.sm_status;
> 
>-                      if (share_mount_one(dslist[i], op, flags, protocol,
>-                          B_FALSE, options) != 0)
>-                              ret = 1;
>-                      zfs_close(dslist[i]);
>-              }
>-
>-              free(dslist);
>+              for (int i = 0; i < cb.cb_used; i++)
>+                      zfs_close(cb.cb_handles[i]);
>+              free(cb.cb_handles);
>       } else if (argc == 0) {
>               struct mnttab entry;
> 
>
>Modified: head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h
>==============================================================================
>--- head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h   Tue Feb 26
>06:22:10 2019  (r344568)
>+++ head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h   Tue Feb 26
>08:18:34 2019  (r344569)
>@@ -579,12 +579,12 @@ typedef struct get_all_cb {
>       zfs_handle_t    **cb_handles;
>       size_t          cb_alloc;
>       size_t          cb_used;
>-      boolean_t       cb_verbose;
>-      int             (*cb_getone)(zfs_handle_t *, void *);
> } get_all_cb_t;
> 
>+void zfs_foreach_mountpoint(libzfs_handle_t *, zfs_handle_t **,
>size_t,
>+    zfs_iter_f, void*, boolean_t);
>+
> void libzfs_add_handle(get_all_cb_t *, zfs_handle_t *);
>-int libzfs_dataset_cmp(const void *, const void *);
> 
> /*
>  * Functions to create and destroy datasets.
>
>Modified:
>head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_dataset.c
>==============================================================================
>---
>head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_dataset.c       Tue
>Feb 26 06:22:10 2019   (r344568)
>+++
>head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_dataset.c       Tue
>Feb 26 08:18:34 2019   (r344569)
>@@ -799,6 +799,7 @@ libzfs_mnttab_cache_compare(const void *arg1, const
>vo
> void
> libzfs_mnttab_init(libzfs_handle_t *hdl)
> {
>+      pthread_mutex_init(&hdl->libzfs_mnttab_cache_lock, NULL);
>       assert(avl_numnodes(&hdl->libzfs_mnttab_cache) == 0);
>       avl_create(&hdl->libzfs_mnttab_cache, libzfs_mnttab_cache_compare,
>           sizeof (mnttab_node_t), offsetof(mnttab_node_t, mtn_node));
>@@ -839,6 +840,7 @@ libzfs_mnttab_fini(libzfs_handle_t *hdl)
>               free(mtn);
>       }
>       avl_destroy(&hdl->libzfs_mnttab_cache);
>+      (void) pthread_mutex_destroy(&hdl->libzfs_mnttab_cache_lock);
> }
> 
> void
>@@ -853,6 +855,7 @@ libzfs_mnttab_find(libzfs_handle_t *hdl, const char
>*f
> {
>       mnttab_node_t find;
>       mnttab_node_t *mtn;
>+      int ret = ENOENT;
> 
>       if (!hdl->libzfs_mnttab_enable) {
>               struct mnttab srch = { 0 };
>@@ -868,6 +871,7 @@ libzfs_mnttab_find(libzfs_handle_t *hdl, const char
>*f
>                       return (ENOENT);
>       }
> 
>+      pthread_mutex_lock(&hdl->libzfs_mnttab_cache_lock);
>       if (avl_numnodes(&hdl->libzfs_mnttab_cache) == 0)
>               libzfs_mnttab_update(hdl);
> 
>@@ -875,9 +879,10 @@ libzfs_mnttab_find(libzfs_handle_t *hdl, const
>char *f
>       mtn = avl_find(&hdl->libzfs_mnttab_cache, &find, NULL);
>       if (mtn) {
>               *entry = mtn->mtn_mt;
>-              return (0);
>+              ret = 0;
>       }
>-      return (ENOENT);
>+      pthread_mutex_unlock(&hdl->libzfs_mnttab_cache_lock);
>+      return (ret);
> }
> 
> void
>@@ -886,15 +891,17 @@ libzfs_mnttab_add(libzfs_handle_t *hdl, const
>char *sp
> {
>       mnttab_node_t *mtn;
> 
>-      if (avl_numnodes(&hdl->libzfs_mnttab_cache) == 0)
>-              return;
>-      mtn = zfs_alloc(hdl, sizeof (mnttab_node_t));
>-      mtn->mtn_mt.mnt_special = zfs_strdup(hdl, special);
>-      mtn->mtn_mt.mnt_mountp = zfs_strdup(hdl, mountp);
>-      mtn->mtn_mt.mnt_fstype = zfs_strdup(hdl, MNTTYPE_ZFS);
>-      mtn->mtn_mt.mnt_mntopts = zfs_strdup(hdl, mntopts);
>-      avl_add(&hdl->libzfs_mnttab_cache, mtn);
>-}
>+      pthread_mutex_lock(&hdl->libzfs_mnttab_cache_lock);
>+      if (avl_numnodes(&hdl->libzfs_mnttab_cache) == 0) {
>+              mtn = zfs_alloc(hdl, sizeof (mnttab_node_t));
>+              mtn->mtn_mt.mnt_special = zfs_strdup(hdl, special);
>+              mtn->mtn_mt.mnt_mountp = zfs_strdup(hdl, mountp);
>+              mtn->mtn_mt.mnt_fstype = zfs_strdup(hdl, MNTTYPE_ZFS);
>+              mtn->mtn_mt.mnt_mntopts = zfs_strdup(hdl, mntopts);
>+              avl_add(&hdl->libzfs_mnttab_cache, mtn);
>+      }
>+      pthread_mutex_unlock(&hdl->libzfs_mnttab_cache_lock);
>+}             
> 
> void
> libzfs_mnttab_remove(libzfs_handle_t *hdl, const char *fsname)
>@@ -902,6 +909,7 @@ libzfs_mnttab_remove(libzfs_handle_t *hdl, const
>char 
>       mnttab_node_t find;
>       mnttab_node_t *ret;
> 
>+      pthread_mutex_lock(&hdl->libzfs_mnttab_cache_lock);
>       find.mtn_mt.mnt_special = (char *)fsname;
>       if ((ret = avl_find(&hdl->libzfs_mnttab_cache, (void *)&find, NULL))
>           != NULL) {
>@@ -912,6 +920,7 @@ libzfs_mnttab_remove(libzfs_handle_t *hdl, const
>char 
>               free(ret->mtn_mt.mnt_mntopts);
>               free(ret);
>       }
>+      pthread_mutex_unlock(&hdl->libzfs_mnttab_cache_lock);
> }
> 
> int
>
>Modified: head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_impl.h
>==============================================================================
>--- head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_impl.h      Tue
>Feb 26 06:22:10 2019   (r344568)
>+++ head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_impl.h      Tue
>Feb 26 08:18:34 2019   (r344569)
>@@ -22,7 +22,7 @@
> /*
>* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights
>reserved.
>  * Copyright (c) 2011 Pawel Jakub Dawidek. All rights reserved.
>- * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
>+ * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
>* Copyright (c) 2013 Martin Matuska <m...@freebsd.org>. All rights
>reserved.
>  */
> 
>@@ -73,6 +73,13 @@ struct libzfs_handle {
>       int libzfs_storeerr; /* stuff error messages into buffer */
>       void *libzfs_sharehdl; /* libshare handle */
>       boolean_t libzfs_mnttab_enable;
>+      /*
>+       * We need a lock to handle the case where parallel mount
>+       * threads are populating the mnttab cache simultaneously. The
>+       * lock only protects the integrity of the avl tree, and does
>+       * not protect the contents of the mnttab entries themselves.
>+       */
>+      pthread_mutex_t libzfs_mnttab_cache_lock;
>       avl_tree_t libzfs_mnttab_cache;
>       int libzfs_pool_iter;
>       libzfs_fru_t **libzfs_fru_hash;
>
>Modified:
>head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_mount.c
>==============================================================================
>--- head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_mount.c     Tue
>Feb 26 06:22:10 2019   (r344568)
>+++ head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_mount.c     Tue
>Feb 26 08:18:34 2019   (r344569)
>@@ -26,6 +26,7 @@
>  * Copyright 2016 Igor Kozhukhov <ikozhuk...@gmail.com>
>  * Copyright 2017 Joyent, Inc.
>  * Copyright 2017 RackTop Systems.
>+ * Copyright 2018 OmniOS Community Edition (OmniOSce) Association.
>  */
> 
> /*
>@@ -34,25 +35,25 @@
>  * they are used by mount and unmount and when changing a filesystem's
>  * mountpoint.
>  *
>- *    zfs_is_mounted()
>- *    zfs_mount()
>- *    zfs_unmount()
>- *    zfs_unmountall()
>+ *    zfs_is_mounted()
>+ *    zfs_mount()
>+ *    zfs_unmount()
>+ *    zfs_unmountall()
>  *
>* This file also contains the functions used to manage sharing
>filesystems via
>  * NFS and iSCSI:
>  *
>- *    zfs_is_shared()
>- *    zfs_share()
>- *    zfs_unshare()
>+ *    zfs_is_shared()
>+ *    zfs_share()
>+ *    zfs_unshare()
>  *
>- *    zfs_is_shared_nfs()
>- *    zfs_is_shared_smb()
>- *    zfs_share_proto()
>- *    zfs_shareall();
>- *    zfs_unshare_nfs()
>- *    zfs_unshare_smb()
>- *    zfs_unshareall_nfs()
>+ *    zfs_is_shared_nfs()
>+ *    zfs_is_shared_smb()
>+ *    zfs_share_proto()
>+ *    zfs_shareall();
>+ *    zfs_unshare_nfs()
>+ *    zfs_unshare_smb()
>+ *    zfs_unshareall_nfs()
>  *    zfs_unshareall_smb()
>  *    zfs_unshareall()
>  *    zfs_unshareall_bypath()
>@@ -60,8 +61,8 @@
>  * The following functions are available for pool consumers, and will
>  * mount/unmount and share/unshare all datasets within pool:
>  *
>- *    zpool_enable_datasets()
>- *    zpool_disable_datasets()
>+ *    zpool_enable_datasets()
>+ *    zpool_disable_datasets()
>  */
> 
> #include <dirent.h>
>@@ -83,10 +84,14 @@
> #include <libzfs.h>
> 
> #include "libzfs_impl.h"
>+#include <thread_pool.h>
> 
> #include <libshare.h>
> #define       MAXISALEN       257     /* based on sysinfo(2) man page */
> 
>+static int mount_tp_nthr = 512; /* tpool threads for multi-threaded
>mounting */
>+
>+static void zfs_mount_task(void *);
> static int zfs_share_proto(zfs_handle_t *, zfs_share_proto_t *);
> zfs_share_type_t zfs_is_shared_proto(zfs_handle_t *, char **,
>     zfs_share_proto_t);
>@@ -1134,25 +1139,32 @@ remove_mountpoint(zfs_handle_t *zhp)
>       }
> }
> 
>+/*
>+ * Add the given zfs handle to the cb_handles array, dynamically
>reallocating
>+ * the array if it is out of space
>+ */
> void
> libzfs_add_handle(get_all_cb_t *cbp, zfs_handle_t *zhp)
> {
>       if (cbp->cb_alloc == cbp->cb_used) {
>               size_t newsz;
>-              void *ptr;
>+              zfs_handle_t **newhandles;
> 
>-              newsz = cbp->cb_alloc ? cbp->cb_alloc * 2 : 64;
>-              ptr = zfs_realloc(zhp->zfs_hdl,
>-                  cbp->cb_handles, cbp->cb_alloc * sizeof (void *),
>-                  newsz * sizeof (void *));
>-              cbp->cb_handles = ptr;
>+              newsz = cbp->cb_alloc != 0 ? cbp->cb_alloc * 2 : 64;
>+              newhandles = zfs_realloc(zhp->zfs_hdl,
>+                  cbp->cb_handles, cbp->cb_alloc * sizeof (zfs_handle_t *),
>+                  newsz * sizeof (zfs_handle_t *));
>+              cbp->cb_handles = newhandles;
>               cbp->cb_alloc = newsz;
>       }
>       cbp->cb_handles[cbp->cb_used++] = zhp;
> }
> 
>+/*
>+ * Recursive helper function used during file system enumeration
>+ */
> static int
>-mount_cb(zfs_handle_t *zhp, void *data)
>+zfs_iter_cb(zfs_handle_t *zhp, void *data)
> {
>       get_all_cb_t *cbp = data;
> 
>@@ -1178,104 +1190,362 @@ mount_cb(zfs_handle_t *zhp, void *data)
>       }
> 
>       libzfs_add_handle(cbp, zhp);
>-      if (zfs_iter_filesystems(zhp, mount_cb, cbp) != 0) {
>+      if (zfs_iter_filesystems(zhp, zfs_iter_cb, cbp) != 0) {
>               zfs_close(zhp);
>               return (-1);
>       }
>       return (0);
> }
> 
>-int
>-libzfs_dataset_cmp(const void *a, const void *b)
>+/*
>+ * Sort comparator that compares two mountpoint paths. We sort these
>paths so
>+ * that subdirectories immediately follow their parents. This means
>that we
>+ * effectively treat the '/' character as the lowest value non-nul
>char.
>+ * Since filesystems from non-global zones can have the same
>mountpoint
>+ * as other filesystems, the comparator sorts global zone filesystems
>to
>+ * the top of the list. This means that the global zone will traverse
>the
>+ * filesystem list in the correct order and can stop when it sees the
>+ * first zoned filesystem. In a non-global zone, only the delegated
>+ * filesystems are seen.
>+ *
>+ * An example sorted list using this comparator would look like:
>+ *
>+ * /foo
>+ * /foo/bar
>+ * /foo/bar/baz
>+ * /foo/baz
>+ * /foo.bar
>+ * /foo (NGZ1)
>+ * /foo (NGZ2)
>+ *
>+ * The mount code depend on this ordering to deterministically iterate
>+ * over filesystems in order to spawn parallel mount tasks.
>+ */
>+static int
>+mountpoint_cmp(const void *arga, const void *argb)
> {
>-      zfs_handle_t **za = (zfs_handle_t **)a;
>-      zfs_handle_t **zb = (zfs_handle_t **)b;
>+      zfs_handle_t *const *zap = arga;
>+      zfs_handle_t *za = *zap;
>+      zfs_handle_t *const *zbp = argb;
>+      zfs_handle_t *zb = *zbp;
>       char mounta[MAXPATHLEN];
>       char mountb[MAXPATHLEN];
>+      const char *a = mounta;
>+      const char *b = mountb;
>       boolean_t gota, gotb;
>+      uint64_t zoneda, zonedb;
> 
>-      if ((gota = (zfs_get_type(*za) == ZFS_TYPE_FILESYSTEM)) != 0)
>-              verify(zfs_prop_get(*za, ZFS_PROP_MOUNTPOINT, mounta,
>+      zoneda = zfs_prop_get_int(za, ZFS_PROP_ZONED);
>+      zonedb = zfs_prop_get_int(zb, ZFS_PROP_ZONED);
>+      if (zoneda && !zonedb)
>+              return (1);
>+      if (!zoneda && zonedb)
>+              return (-1);
>+      gota = (zfs_get_type(za) == ZFS_TYPE_FILESYSTEM);
>+      if (gota)
>+              verify(zfs_prop_get(za, ZFS_PROP_MOUNTPOINT, mounta,
>                   sizeof (mounta), NULL, NULL, 0, B_FALSE) == 0);
>-      if ((gotb = (zfs_get_type(*zb) == ZFS_TYPE_FILESYSTEM)) != 0)
>-              verify(zfs_prop_get(*zb, ZFS_PROP_MOUNTPOINT, mountb,
>+      gotb = (zfs_get_type(zb) == ZFS_TYPE_FILESYSTEM);
>+      if (gotb)
>+              verify(zfs_prop_get(zb, ZFS_PROP_MOUNTPOINT, mountb,
>                   sizeof (mountb), NULL, NULL, 0, B_FALSE) == 0);
> 
>-      if (gota && gotb)
>-              return (strcmp(mounta, mountb));
>+      if (gota && gotb) {
>+              while (*a != '\0' && (*a == *b)) {
>+                      a++;
>+                      b++;
>+              }
>+              if (*a == *b)
>+                      return (0);
>+              if (*a == '\0')
>+                      return (-1);
>+              if (*b == '\0')
>+                      return (-1);
>+              if (*a == '/')
>+                      return (-1);
>+              if (*b == '/')
>+                      return (-1);
>+              return (*a < *b ? -1 : *a > *b);
>+      }
> 
>       if (gota)
>               return (-1);
>       if (gotb)
>               return (1);
> 
>-      return (strcmp(zfs_get_name(a), zfs_get_name(b)));
>+      /*
>+       * If neither filesystem has a mountpoint, revert to sorting by
>+       * datset name.
>+       */
>+      return (strcmp(zfs_get_name(za), zfs_get_name(zb)));
> }
> 
> /*
>+ * Reutrn true if path2 is a child of path1
>+ */
>+static boolean_t
>+libzfs_path_contains(const char *path1, const char *path2)
>+{
>+      return (strstr(path2, path1) == path2 && path2[strlen(path1)] ==
>'/');
>+}
>+
>+
>+static int
>+non_descendant_idx(zfs_handle_t **handles, size_t num_handles, int
>idx)
>+{
>+      char parent[ZFS_MAXPROPLEN];
>+      char child[ZFS_MAXPROPLEN];
>+      int i;
>+
>+      verify(zfs_prop_get(handles[idx], ZFS_PROP_MOUNTPOINT, parent,
>+          sizeof (parent), NULL, NULL, 0, B_FALSE) == 0);
>+
>+      for (i = idx + 1; i < num_handles; i++) {
>+              verify(zfs_prop_get(handles[i], ZFS_PROP_MOUNTPOINT, child,
>+                  sizeof (child), NULL, NULL, 0, B_FALSE) == 0);
>+              if (!libzfs_path_contains(parent, child))
>+                      break;
>+      }
>+      return (i);
>+}
>+
>+typedef struct mnt_param {
>+      libzfs_handle_t *mnt_hdl;
>+      tpool_t         *mnt_tp;
>+      zfs_handle_t    **mnt_zhps; /* filesystems to mount */
>+      size_t          mnt_num_handles;
>+      int             mnt_idx;        /* Index of selected entry to mount */
>+      zfs_iter_f      mnt_func;
>+      void            *mnt_data;
>+} mnt_param_t;
>+
>+/*
>+ * Allocate and populate the parameter struct for mount function, and
>+ * schedule mounting of the entry selected by idx.
>+ */
>+static void
>+zfs_dispatch_mount(libzfs_handle_t *hdl, zfs_handle_t **handles,
>+    size_t num_handles, int idx, zfs_iter_f func, void *data, tpool_t
>*tp)
>+{
>+      mnt_param_t *mnt_param = zfs_alloc(hdl, sizeof (mnt_param_t));
>+
>+      mnt_param->mnt_hdl = hdl;
>+      mnt_param->mnt_tp = tp;
>+      mnt_param->mnt_zhps = handles;
>+      mnt_param->mnt_num_handles = num_handles;
>+      mnt_param->mnt_idx = idx;
>+      mnt_param->mnt_func = func;
>+      mnt_param->mnt_data = data;
>+
>+      (void) tpool_dispatch(tp, zfs_mount_task, (void*)mnt_param);
>+}
>+
>+/*
>+ * This is the structure used to keep state of mounting or sharing
>operations
>+ * during a call to zpool_enable_datasets().
>+ */
>+typedef struct mount_state {
>+      /*
>+       * ms_mntstatus is set to -1 if any mount fails. While multiple
>threads
>+       * could update this variable concurrently, no synchronization is
>+       * needed as it's only ever set to -1.
>+       */
>+      int             ms_mntstatus;
>+      int             ms_mntflags;
>+      const char      *ms_mntopts;
>+} mount_state_t;
>+
>+static int
>+zfs_mount_one(zfs_handle_t *zhp, void *arg)
>+{
>+      mount_state_t *ms = arg;
>+      int ret = 0;
>+
>+      if (zfs_mount(zhp, ms->ms_mntopts, ms->ms_mntflags) != 0)
>+              ret = ms->ms_mntstatus = -1;
>+      return (ret);
>+}
>+
>+static int
>+zfs_share_one(zfs_handle_t *zhp, void *arg)
>+{
>+      mount_state_t *ms = arg;
>+      int ret = 0;
>+
>+      if (zfs_share(zhp) != 0)
>+              ret = ms->ms_mntstatus = -1;
>+      return (ret);
>+}
>+
>+/*
>+ * Thread pool function to mount one file system. On completion, it
>finds and
>+ * schedules its children to be mounted. This depends on the sorting
>done in
>+ * zfs_foreach_mountpoint(). Note that the degenerate case (chain of
>entries
>+ * each descending from the previous) will have no parallelism since
>we always
>+ * have to wait for the parent to finish mounting before we can
>schedule
>+ * its children.
>+ */
>+static void
>+zfs_mount_task(void *arg)
>+{
>+      mnt_param_t *mp = arg;
>+      int idx = mp->mnt_idx;
>+      zfs_handle_t **handles = mp->mnt_zhps;
>+      size_t num_handles = mp->mnt_num_handles;
>+      char mountpoint[ZFS_MAXPROPLEN];
>+
>+      verify(zfs_prop_get(handles[idx], ZFS_PROP_MOUNTPOINT, mountpoint,
>+          sizeof (mountpoint), NULL, NULL, 0, B_FALSE) == 0);
>+
>+      if (mp->mnt_func(handles[idx], mp->mnt_data) != 0)
>+              return;
>+
>+      /*
>+       * We dispatch tasks to mount filesystems with mountpoints underneath
>+       * this one. We do this by dispatching the next filesystem with a
>+       * descendant mountpoint of the one we just mounted, then skip all of
>+       * its descendants, dispatch the next descendant mountpoint, and so
>on.
>+       * The non_descendant_idx() function skips over filesystems that are
>+       * descendants of the filesystem we just dispatched.
>+       */
>+      for (int i = idx + 1; i < num_handles;
>+          i = non_descendant_idx(handles, num_handles, i)) {
>+              char child[ZFS_MAXPROPLEN];
>+              verify(zfs_prop_get(handles[i], ZFS_PROP_MOUNTPOINT,
>+                  child, sizeof (child), NULL, NULL, 0, B_FALSE) == 0);
>+
>+              if (!libzfs_path_contains(mountpoint, child))
>+                      break; /* not a descendant, return */
>+              zfs_dispatch_mount(mp->mnt_hdl, handles, num_handles, i,
>+                  mp->mnt_func, mp->mnt_data, mp->mnt_tp);
>+      }
>+      free(mp);
>+}
>+
>+/*
>+ * Issue the func callback for each ZFS handle contained in the
>handles
>+ * array. This function is used to mount all datasets, and so this
>function
>+ * guarantees that filesystems for parent mountpoints are called
>before their
>+ * children. As such, before issuing any callbacks, we first sort the
>array
>+ * of handles by mountpoint.
>+ *
>+ * Callbacks are issued in one of two ways:
>+ *
>+ * 1. Sequentially: If the parallel argument is B_FALSE or the
>ZFS_SERIAL_MOUNT
>+ *    environment variable is set, then we issue callbacks
>sequentially.
>+ *
>+ * 2. In parallel: If the parallel argument is B_TRUE and the
>ZFS_SERIAL_MOUNT
>+ *    environment variable is not set, then we use a tpool to dispatch
>threads
>+ *    to mount filesystems in parallel. This function dispatches tasks
>to mount
>+ *    the filesystems at the top-level mountpoints, and these tasks in
>turn
>+ *    are responsible for recursively mounting filesystems in their
>children
>+ *    mountpoints.
>+ */
>+void
>+zfs_foreach_mountpoint(libzfs_handle_t *hdl, zfs_handle_t **handles,
>+    size_t num_handles, zfs_iter_f func, void *data, boolean_t
>parallel)
>+{
>+      zoneid_t zoneid = getzoneid();
>+
>+      /*
>+       * The ZFS_SERIAL_MOUNT environment variable is an undocumented
>+       * variable that can be used as a convenience to do a/b comparison
>+       * of serial vs. parallel mounting.
>+       */
>+      boolean_t serial_mount = !parallel ||
>+          (getenv("ZFS_SERIAL_MOUNT") != NULL);
>+
>+      /*
>+       * Sort the datasets by mountpoint. See mountpoint_cmp for details
>+       * of how these are sorted.
>+       */
>+      qsort(handles, num_handles, sizeof (zfs_handle_t *), mountpoint_cmp);
>+
>+      if (serial_mount) {
>+              for (int i = 0; i < num_handles; i++) {
>+                      func(handles[i], data);
>+              }
>+              return;
>+      }
>+
>+      /*
>+       * Issue the callback function for each dataset using a parallel
>+       * algorithm that uses a thread pool to manage threads.
>+       */
>+      tpool_t *tp = tpool_create(1, mount_tp_nthr, 0, NULL);
>+
>+      /*
>+       * There may be multiple "top level" mountpoints outside of the
>pool's
>+       * root mountpoint, e.g.: /foo /bar. Dispatch a mount task for each
>of
>+       * these.
>+       */
>+      for (int i = 0; i < num_handles;
>+          i = non_descendant_idx(handles, num_handles, i)) {
>+              /*
>+               * Since the mountpoints have been sorted so that the zoned
>+               * filesystems are at the end, a zoned filesystem seen from
>+               * the global zone means that we're done.
>+               */
>+              if (zoneid == GLOBAL_ZONEID &&
>+                  zfs_prop_get_int(handles[i], ZFS_PROP_ZONED))
>+                      break;
>+              zfs_dispatch_mount(hdl, handles, num_handles, i, func, data,
>+                  tp);
>+      }
>+
>+      tpool_wait(tp); /* wait for all scheduled mounts to complete */
>+      tpool_destroy(tp);
>+}
>+
>+/*
>* Mount and share all datasets within the given pool.  This assumes
>that no
>- * datasets within the pool are currently mounted.  Because users can
>create
>- * complicated nested hierarchies of mountpoints, we first gather all
>the
>- * datasets and mountpoints within the pool, and sort them by
>mountpoint.  Once
>- * we have the list of all filesystems, we iterate over them in order
>and mount
>- * and/or share each one.
>+ * datasets within the pool are currently mounted.
>  */
> #pragma weak zpool_mount_datasets = zpool_enable_datasets
> int
>zpool_enable_datasets(zpool_handle_t *zhp, const char *mntopts, int
>flags)
> {
>       get_all_cb_t cb = { 0 };
>-      libzfs_handle_t *hdl = zhp->zpool_hdl;
>+      mount_state_t ms = { 0 };
>       zfs_handle_t *zfsp;
>-      int i, ret = -1;
>-      int *good;
>+      int ret = 0;
> 
>-      /*
>-       * Gather all non-snap datasets within the pool.
>-       */
>-      if ((zfsp = zfs_open(hdl, zhp->zpool_name, ZFS_TYPE_DATASET)) ==
>NULL)
>+      if ((zfsp = zfs_open(zhp->zpool_hdl, zhp->zpool_name,
>+          ZFS_TYPE_DATASET)) == NULL)
>               goto out;
> 
>-      libzfs_add_handle(&cb, zfsp);
>-      if (zfs_iter_filesystems(zfsp, mount_cb, &cb) != 0)
>-              goto out;
>       /*
>-       * Sort the datasets by mountpoint.
>+       * Gather all non-snapshot datasets within the pool. Start by adding
>+       * the root filesystem for this pool to the list, and then iterate
>+       * over all child filesystems.
>        */
>-      qsort(cb.cb_handles, cb.cb_used, sizeof (void *),
>-          libzfs_dataset_cmp);
>+      libzfs_add_handle(&cb, zfsp);
>+      if (zfs_iter_filesystems(zfsp, zfs_iter_cb, &cb) != 0)
>+              goto out;
> 
>       /*
>-       * And mount all the datasets, keeping track of which ones
>-       * succeeded or failed.
>+       * Mount all filesystems
>        */
>-      if ((good = zfs_alloc(zhp->zpool_hdl,
>-          cb.cb_used * sizeof (int))) == NULL)
>-              goto out;
>+      ms.ms_mntopts = mntopts;
>+      ms.ms_mntflags = flags;
>+      zfs_foreach_mountpoint(zhp->zpool_hdl, cb.cb_handles, cb.cb_used,
>+          zfs_mount_one, &ms, B_TRUE);
>+      if (ms.ms_mntstatus != 0)
>+              ret = ms.ms_mntstatus;
> 
>-      ret = 0;
>-      for (i = 0; i < cb.cb_used; i++) {
>-              if (zfs_mount(cb.cb_handles[i], mntopts, flags) != 0)
>-                      ret = -1;
>-              else
>-                      good[i] = 1;
>-      }
>-
>       /*
>-       * Then share all the ones that need to be shared. This needs
>-       * to be a separate pass in order to avoid excessive reloading
>-       * of the configuration. Good should never be NULL since
>-       * zfs_alloc is supposed to exit if memory isn't available.
>+       * Share all filesystems that need to be shared. This needs to be
>+       * a separate pass because libshare is not mt-safe, and so we need
>+       * to share serially.
>        */
>-      for (i = 0; i < cb.cb_used; i++) {
>-              if (good[i] && zfs_share(cb.cb_handles[i]) != 0)
>-                      ret = -1;
>-      }
>+      ms.ms_mntstatus = 0;
>+      zfs_foreach_mountpoint(zhp->zpool_hdl, cb.cb_handles, cb.cb_used,
>+          zfs_share_one, &ms, B_FALSE);
>+      if (ms.ms_mntstatus != 0)
>+              ret = ms.ms_mntstatus;
> 
>-      free(good);
>-
> out:
>-      for (i = 0; i < cb.cb_used; i++)
>+      for (int i = 0; i < cb.cb_used; i++)
>               zfs_close(cb.cb_handles[i]);
>       free(cb.cb_handles);
> 

This broke my systems, many filesystems fail to mount causing nullfs late 
mounts to fail. No details now until tonight.

Suggest we back this out until it is properly tested.

-- 
Pardon the typos and autocorrect, small keyboard in use.
Cheers,
Cy Schubert <cy.schub...@cschubert.com>
FreeBSD UNIX: <c...@freebsd.org> Web: http://www.FreeBSD.org

        The need of the many outweighs the greed of the few.
_______________________________________________
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"

Reply via email to