Author: delphij
Date: Fri Jul 18 17:55:12 2014
New Revision: 268848
URL: http://svnweb.freebsd.org/changeset/base/268848

Log:
  4958 zdb trips assert on pools with ashift >= 0xe
  Reviewed by: Matthew Ahrens <mahr...@delphix.com>
  Reviewed by: Max Grossman <max.gross...@delphix.com>
  Reviewed by: George Wilson <george.wil...@delphix.com>
  Reviewed by: Christopher Siden <christopher.si...@delphix.com>
  Approved by: Garrett D'Amore <garr...@damore.org>
  
  illumos/illumos-gate@2a104a5236475eb73aa41eaaf3ed9f3ccbe0ca55

Modified:
  vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_pool.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/metaslab.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/spa_misc.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/vdev_impl.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zio.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/zfs_debug.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/zio.c

Changes in other areas also in this revision:
Modified:
  vendor/illumos/dist/cmd/ztest/ztest.c

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_pool.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_pool.c        Fri Jul 18 16:25:35 2014        (r268847)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_pool.c        Fri Jul 18 17:55:12 2014        (r268848)
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
  * Copyright (c) 2013 Steven Hartland. All rights reserved.
  */
 

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/metaslab.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/metaslab.c        Fri Jul 18 16:25:35 2014        (r268847)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/metaslab.c        Fri Jul 18 17:55:12 2014        (r268848)
@@ -62,6 +62,21 @@ uint64_t metaslab_gang_bang = SPA_MAXBLO
 int zfs_condense_pct = 200;
 
 /*
+ * Condensing a metaslab is not guaranteed to actually reduce the amount of
+ * space used on disk. In particular, a space map uses data in increments of
+ * MAX(1 << ashift, SPACE_MAP_INITIAL_BLOCKSIZE), so a metaslab might use the
+ * same number of blocks after condensing. Since the goal of condensing is to
+ * reduce the number of IOPs required to read the space map, we only want to
+ * condense when we can be sure we will reduce the number of blocks used by the
+ * space map. Unfortunately, we cannot precisely compute whether or not this is
+ * the case in metaslab_should_condense since we are holding ms_lock. Instead,
+ * we apply the following heuristic: do not condense a spacemap unless the
+ * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold
+ * blocks.
+ */
+int zfs_metaslab_condense_block_threshold = 4;
+
+/*
  * The zfs_mg_noalloc_threshold defines which metaslab groups should
  * be eligible for allocation. The value is defined as a percentage of
  * a free space. Metaslab groups that have more free space than
@@ -1279,6 +1294,8 @@ metaslab_group_preload(metaslab_group_t 
  * times the size than the free space range tree representation
  * (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1.MB).
  *
+ * 3. The on-disk size of the space map should actually decrease.
+ *
  * Checking the first condition is tricky since we don't want to walk
  * the entire AVL tree calculating the estimated on-disk size. Instead we
  * use the size-ordered range tree in the metaslab and calculate the
@@ -1289,13 +1306,21 @@ metaslab_group_preload(metaslab_group_t 
  * To determine the second criterion we use a best-case estimate and assume
  * each segment can be represented on-disk as a single 64-bit entry. We refer
  * to this best-case estimate as the space map's minimal form.
+ *
+ * Unfortunately, we cannot compute the on-disk size of the space map in this
+ * context because we cannot accurately compute the effects of compression, etc.
+ * Instead, we apply the heuristic described in the block comment for
+ * zfs_metaslab_condense_block_threshold - we only condense if the space used
+ * is greater than a threshold number of blocks.
  */
 static boolean_t
 metaslab_should_condense(metaslab_t *msp)
 {
        space_map_t *sm = msp->ms_sm;
        range_seg_t *rs;
-       uint64_t size, entries, segsz;
+       uint64_t size, entries, segsz, object_size, optimal_size, record_size;
+       dmu_object_info_t doi;
+       uint64_t vdev_blocksize = 1 << msp->ms_group->mg_vd->vdev_ashift;
 
        ASSERT(MUTEX_HELD(&msp->ms_lock));
        ASSERT(msp->ms_loaded);
@@ -1319,9 +1344,15 @@ metaslab_should_condense(metaslab_t *msp
        entries = size / (MIN(size, SM_RUN_MAX));
        segsz = entries * sizeof (uint64_t);
 
-       return (segsz <= space_map_length(msp->ms_sm) &&
-           space_map_length(msp->ms_sm) >= (zfs_condense_pct *
-           sizeof (uint64_t) * avl_numnodes(&msp->ms_tree->rt_root)) / 100);
+       optimal_size = sizeof (uint64_t) * avl_numnodes(&msp->ms_tree->rt_root);
+       object_size = space_map_length(msp->ms_sm);
+
+       dmu_object_info_from_db(sm->sm_dbuf, &doi);
+       record_size = MAX(doi.doi_data_block_size, vdev_blocksize);
+
+       return (segsz <= object_size &&
+           object_size >= (optimal_size * zfs_condense_pct / 100) &&
+           object_size > zfs_metaslab_condense_block_threshold * record_size);
 }
 
 /*

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/spa_misc.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/spa_misc.c        Fri Jul 18 16:25:35 2014        (r268847)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/spa_misc.c        Fri Jul 18 17:55:12 2014        (r268848)
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  */
 

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/sys/vdev_impl.h
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/sys/vdev_impl.h   Fri Jul 18 16:25:35 2014        (r268847)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/sys/vdev_impl.h   Fri Jul 18 17:55:12 2014        (r268848)
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  */
 
 #ifndef _SYS_VDEV_IMPL_H
@@ -229,8 +229,11 @@ struct vdev {
 #define        VDEV_PHYS_SIZE          (112 << 10)
 #define        VDEV_UBERBLOCK_RING     (128 << 10)
 
+/* The largest uberblock we support is 8k. */
+#define        MAX_UBERBLOCK_SHIFT (13)
 #define        VDEV_UBERBLOCK_SHIFT(vd)        \
-       MAX((vd)->vdev_top->vdev_ashift, UBERBLOCK_SHIFT)
+       MIN(MAX((vd)->vdev_top->vdev_ashift, UBERBLOCK_SHIFT), \
+           MAX_UBERBLOCK_SHIFT)
 #define        VDEV_UBERBLOCK_COUNT(vd)        \
        (VDEV_UBERBLOCK_RING >> VDEV_UBERBLOCK_SHIFT(vd))
 #define        VDEV_UBERBLOCK_OFFSET(vd, n)    \

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zio.h
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zio.h Fri Jul 18 16:25:35 2014        (r268847)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zio.h Fri Jul 18 17:55:12 2014        (r268848)
@@ -165,19 +165,20 @@ enum zio_flag {
        ZIO_FLAG_RESILVER       = 1 << 3,
        ZIO_FLAG_SCRUB          = 1 << 4,
        ZIO_FLAG_SCAN_THREAD    = 1 << 5,
+       ZIO_FLAG_PHYSICAL       = 1 << 6,
 
 #define        ZIO_FLAG_AGG_INHERIT    (ZIO_FLAG_CANFAIL - 1)
 
        /*
         * Flags inherited by ddt, gang, and vdev children.
         */
-       ZIO_FLAG_CANFAIL        = 1 << 6,       /* must be first for INHERIT */
-       ZIO_FLAG_SPECULATIVE    = 1 << 7,
-       ZIO_FLAG_CONFIG_WRITER  = 1 << 8,
-       ZIO_FLAG_DONT_RETRY     = 1 << 9,
-       ZIO_FLAG_DONT_CACHE     = 1 << 10,
-       ZIO_FLAG_NODATA         = 1 << 11,
-       ZIO_FLAG_INDUCE_DAMAGE  = 1 << 12,
+       ZIO_FLAG_CANFAIL        = 1 << 7,       /* must be first for INHERIT */
+       ZIO_FLAG_SPECULATIVE    = 1 << 8,
+       ZIO_FLAG_CONFIG_WRITER  = 1 << 9,
+       ZIO_FLAG_DONT_RETRY     = 1 << 10,
+       ZIO_FLAG_DONT_CACHE     = 1 << 11,
+       ZIO_FLAG_NODATA         = 1 << 12,
+       ZIO_FLAG_INDUCE_DAMAGE  = 1 << 13,
 
 #define        ZIO_FLAG_DDT_INHERIT    (ZIO_FLAG_IO_RETRY - 1)
 #define        ZIO_FLAG_GANG_INHERIT   (ZIO_FLAG_IO_RETRY - 1)
@@ -185,27 +186,27 @@ enum zio_flag {
        /*
         * Flags inherited by vdev children.
         */
-       ZIO_FLAG_IO_RETRY       = 1 << 13,      /* must be first for INHERIT */
-       ZIO_FLAG_PROBE          = 1 << 14,
-       ZIO_FLAG_TRYHARD        = 1 << 15,
-       ZIO_FLAG_OPTIONAL       = 1 << 16,
+       ZIO_FLAG_IO_RETRY       = 1 << 14,      /* must be first for INHERIT */
+       ZIO_FLAG_PROBE          = 1 << 15,
+       ZIO_FLAG_TRYHARD        = 1 << 16,
+       ZIO_FLAG_OPTIONAL       = 1 << 17,
 
 #define        ZIO_FLAG_VDEV_INHERIT   (ZIO_FLAG_DONT_QUEUE - 1)
 
        /*
         * Flags not inherited by any children.
         */
-       ZIO_FLAG_DONT_QUEUE     = 1 << 17,      /* must be first for INHERIT */
-       ZIO_FLAG_DONT_PROPAGATE = 1 << 18,
-       ZIO_FLAG_IO_BYPASS      = 1 << 19,
-       ZIO_FLAG_IO_REWRITE     = 1 << 20,
-       ZIO_FLAG_RAW            = 1 << 21,
-       ZIO_FLAG_GANG_CHILD     = 1 << 22,
-       ZIO_FLAG_DDT_CHILD      = 1 << 23,
-       ZIO_FLAG_GODFATHER      = 1 << 24,
-       ZIO_FLAG_NOPWRITE       = 1 << 25,
-       ZIO_FLAG_REEXECUTED     = 1 << 26,
-       ZIO_FLAG_DELEGATED      = 1 << 27,
+       ZIO_FLAG_DONT_QUEUE     = 1 << 18,      /* must be first for INHERIT */
+       ZIO_FLAG_DONT_PROPAGATE = 1 << 19,
+       ZIO_FLAG_IO_BYPASS      = 1 << 20,
+       ZIO_FLAG_IO_REWRITE     = 1 << 21,
+       ZIO_FLAG_RAW            = 1 << 22,
+       ZIO_FLAG_GANG_CHILD     = 1 << 23,
+       ZIO_FLAG_DDT_CHILD      = 1 << 24,
+       ZIO_FLAG_GODFATHER      = 1 << 25,
+       ZIO_FLAG_NOPWRITE       = 1 << 26,
+       ZIO_FLAG_REEXECUTED     = 1 << 27,
+       ZIO_FLAG_DELEGATED      = 1 << 28,
 };
 
 #define        ZIO_FLAG_MUSTSUCCEED            0

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/zfs_debug.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/zfs_debug.c       Fri Jul 18 16:25:35 2014        (r268847)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/zfs_debug.c       Fri Jul 18 17:55:12 2014        (r268848)
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -57,7 +57,10 @@ zfs_dbgmsg_fini(void)
  * echo ::zfs_dbgmsg | mdb -k
  *
  * Monitor these messages by running:
- *     dtrace -q -n 'zfs-dbgmsg{printf("%s\n", stringof(arg0))}'
+ * dtrace -qn 'zfs-dbgmsg{printf("%s\n", stringof(arg0))}'
+ *
+ * When used with libzpool, monitor with:
+ * dtrace -qn 'zfs$pid::zfs_dbgmsg:probe1{printf("%s\n", copyinstr(arg1))}'
  */
 void
 zfs_dbgmsg(const char *fmt, ...)

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/zio.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/zio.c     Fri Jul 18 16:25:35 2014        (r268847)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/zio.c     Fri Jul 18 17:55:12 2014        (r268848)
@@ -815,8 +815,8 @@ zio_read_phys(zio_t *pio, vdev_t *vd, ui
        ASSERT3U(offset + size, <=, vd->vdev_psize);
 
        zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
-           ZIO_TYPE_READ, priority, flags, vd, offset, NULL,
-           ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);
+           ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset,
+           NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);
 
        zio->io_prop.zp_checksum = checksum;
 
@@ -836,8 +836,8 @@ zio_write_phys(zio_t *pio, vdev_t *vd, u
        ASSERT3U(offset + size, <=, vd->vdev_psize);
 
        zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
-           ZIO_TYPE_WRITE, priority, flags, vd, offset, NULL,
-           ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);
+           ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset,
+           NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);
 
        zio->io_prop.zp_checksum = checksum;
 
@@ -2508,7 +2508,9 @@ zio_vdev_io_start(zio_t *zio)
 
        align = 1ULL << vd->vdev_top->vdev_ashift;
 
-       if (P2PHASE(zio->io_size, align) != 0) {
+       if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) &&
+           P2PHASE(zio->io_size, align) != 0) {
+               /* Transform logical writes to be a full physical block size. */
                uint64_t asize = P2ROUNDUP(zio->io_size, align);
                char *abuf = zio_buf_alloc(asize);
                ASSERT(vd == vd->vdev_top);
@@ -2519,8 +2521,22 @@ zio_vdev_io_start(zio_t *zio)
                zio_push_transform(zio, abuf, asize, asize, zio_subblock);
        }
 
-       ASSERT(P2PHASE(zio->io_offset, align) == 0);
-       ASSERT(P2PHASE(zio->io_size, align) == 0);
+       /*
+        * If this is not a physical io, make sure that it is properly aligned
+        * before proceeding.
+        */
+       if (!(zio->io_flags & ZIO_FLAG_PHYSICAL)) {
+               ASSERT0(P2PHASE(zio->io_offset, align));
+               ASSERT0(P2PHASE(zio->io_size, align));
+       } else {
+               /*
+                * For physical writes, we allow 512b aligned writes and assume
+                * the device will perform a read-modify-write as necessary.
+                */
+               ASSERT0(P2PHASE(zio->io_offset, SPA_MINBLOCKSIZE));
+               ASSERT0(P2PHASE(zio->io_size, SPA_MINBLOCKSIZE));
+       }
+
        VERIFY(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa));
 
        /*
_______________________________________________
svn-src-all@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/svn-src-all
To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"

Reply via email to