Author: avg
Date: Mon Feb 17 17:00:46 2014
New Revision: 262089
URL: http://svnweb.freebsd.org/changeset/base/262089

Log:
  MFC r255750: MFV r254750: Add support of Illumos dumps on zvol over RAID-Z.
  
  Note that this only adds the features.  FreeBSD would
  still need more work to support dumping on zvols.
  
  MFC slacker:  delphij

Added:
  stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_raidz.h
     - copied unchanged from r255750, 
head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_raidz.h
Modified:
  stable/9/cddl/contrib/opensolaris/cmd/zfs/zfs.8
  stable/9/cddl/contrib/opensolaris/cmd/zpool/zpool-features.7
  stable/9/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_pool.c
  stable/9/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c
  stable/9/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h
  stable/9/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c
  stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
  stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
  stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_disk.h
  stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
  stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c
  stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c
  stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c
  stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c
Directory Properties:
  stable/9/cddl/contrib/opensolaris/   (props changed)
  stable/9/cddl/contrib/opensolaris/cmd/zfs/   (props changed)
  stable/9/cddl/contrib/opensolaris/lib/libzfs/   (props changed)
  stable/9/sys/   (props changed)
  stable/9/sys/cddl/contrib/opensolaris/   (props changed)

Modified: stable/9/cddl/contrib/opensolaris/cmd/zfs/zfs.8
==============================================================================
--- stable/9/cddl/contrib/opensolaris/cmd/zfs/zfs.8     Mon Feb 17 16:48:11 
2014        (r262088)
+++ stable/9/cddl/contrib/opensolaris/cmd/zfs/zfs.8     Mon Feb 17 17:00:46 
2014        (r262089)
@@ -19,16 +19,16 @@
 .\"
 .\" Copyright (c) 2010, Sun Microsystems, Inc. All Rights Reserved.
 .\" Copyright (c) 2012 by Delphix. All rights reserved.
-.\" Copyright (c) 2012, Joyent, Inc. All rights reserved.
 .\" Copyright (c) 2011, Pawel Jakub Dawidek <p...@freebsd.org>
 .\" Copyright (c) 2012, Bryan Drewery <bdrew...@freebsd.org>
 .\" Copyright (c) 2012, Glen Barber <g...@freebsd.org>
 .\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
 .\" Copyright (c) 2013 Nexenta Systems, Inc. All Rights Reserved.
+.\" Copyright (c) 2013, Joyent, Inc. All rights reserved.
 .\"
 .\" $FreeBSD$
 .\"
-.Dd March 21, 2013
+.Dd September 20, 2013
 .Dt ZFS 8
 .Os
 .Sh NAME
@@ -885,14 +885,21 @@ command or unmounted by the
 command.
 .Pp
 This property is not inherited.
-.It Sy checksum Ns = Ns Cm on | off | fletcher2 | fletcher4 | sha256
+.It Sy checksum Ns = Ns Cm on | off | fletcher2 | fletcher4 | sha256 | noparity
 Controls the checksum used to verify data integrity. The default value is
 .Cm on ,
 which automatically selects an appropriate algorithm (currently,
 .Cm fletcher4 ,
 but this may change in future releases). The value
 .Cm off
-disables integrity checking on user data. Disabling checksums is
+disables integrity checking on user data.
+The value
+.Cm noparity
+not only
+disables integrity but also disables maintaining parity for user data.  This
+setting is used internally by a dump device residing on a RAID-Z pool and should
+not be used by any other dataset.
+Disabling checksums is
 .Em NOT
 a recommended practice.
 .It Sy compression Ns = Ns Cm on | off | lzjb | gzip | gzip- Ns Ar N | zle | Cm lz4

Modified: stable/9/cddl/contrib/opensolaris/cmd/zpool/zpool-features.7
==============================================================================
--- stable/9/cddl/contrib/opensolaris/cmd/zpool/zpool-features.7        Mon Feb 
17 16:48:11 2014        (r262088)
+++ stable/9/cddl/contrib/opensolaris/cmd/zpool/zpool-features.7        Mon Feb 
17 17:00:46 2014        (r262089)
@@ -19,10 +19,11 @@
 .\"
 .\" Copyright (c) 2012 by Delphix. All rights reserved.
 .\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+.\" Copyright (c) 2013, Joyent, Inc. All rights reserved.
 .\"
 .\" $FreeBSD$
 .\"
-.Dd February 8, 2013
+.Dd September 20, 2013
 .Dt ZPOOL-FEATURES 7
 .Os
 .Sh NAME
@@ -229,6 +230,27 @@ feature. At the
 moment, this operation cannot be reversed. Booting off of
 .Sy lz4
 -compressed root pools is supported.
+.It Sy multi_vdev_crash_dump
+.Bl -column "READ\-ONLY COMPATIBLE" "com.joyent:multi_vdev_crash_dump"
+.It GUID Ta com.joyent:multi_vdev_crash_dump
+.It READ\-ONLY COMPATIBLE Ta no
+.It DEPENDENCIES Ta none
+.El
+.Pp
+This feature allows a dump device to be configured with a pool comprised
+of multiple vdevs.
+Those vdevs may be arranged in any mirrored or raidz
+configuration.
+.\" TODO: this is not yet supported on FreeBSD.
+.\" .Pp
+.\" When the
+.\" .Sy multi_vdev_crash_dump
+.\" feature is set to
+.\" .Sy enabled ,
+.\" the administrator can use the
+.\" .Xr dumpon 8
+.\" command to configure a
+.\" dump device on a pool comprised of multiple vdevs.
 .El
 .Sh SEE ALSO
 .Xr zpool 8

Modified: stable/9/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_pool.c
==============================================================================
--- stable/9/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_pool.c   Mon Feb 
17 16:48:11 2014        (r262088)
+++ stable/9/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_pool.c   Mon Feb 
17 17:00:46 2014        (r262089)
@@ -23,6 +23,7 @@
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
  * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  */
 
 #include <sys/types.h>
@@ -4020,9 +4021,7 @@ supported_dump_vdev_type(libzfs_handle_t
        uint_t children, c;
 
        verify(nvlist_lookup_string(config, ZPOOL_CONFIG_TYPE, &type) == 0);
-       if (strcmp(type, VDEV_TYPE_RAIDZ) == 0 ||
-           strcmp(type, VDEV_TYPE_FILE) == 0 ||
-           strcmp(type, VDEV_TYPE_LOG) == 0 ||
+       if (strcmp(type, VDEV_TYPE_FILE) == 0 ||
            strcmp(type, VDEV_TYPE_HOLE) == 0 ||
            strcmp(type, VDEV_TYPE_MISSING) == 0) {
                zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
@@ -4041,8 +4040,12 @@ supported_dump_vdev_type(libzfs_handle_t
 }
 
 /*
- * check if this zvol is allowable for use as a dump device; zero if
- * it is, > 0 if it isn't, < 0 if it isn't a zvol
+ * Check if this zvol is allowable for use as a dump device; zero if
+ * it is, > 0 if it isn't, < 0 if it isn't a zvol.
+ *
+ * Allowable storage configurations include mirrors, all raidz variants, and
+ * pools with log, cache, and spare devices.  Pools which are backed by files or
+ * have missing/hole vdevs are not suitable.
  */
 int
 zvol_check_dump_config(char *arg)
@@ -4104,12 +4107,6 @@ zvol_check_dump_config(char *arg)
 
        verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
            &top, &toplevels) == 0);
-       if (toplevels != 1) {
-               zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
-                   "'%s' has multiple top level vdevs"), poolname);
-               (void) zfs_error(hdl, EZFS_DEVOVERFLOW, errbuf);
-               goto out;
-       }
 
        if (!supported_dump_vdev_type(hdl, top[0], errbuf)) {
                goto out;

Modified: stable/9/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c
==============================================================================
--- stable/9/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c  Mon Feb 
17 16:48:11 2014        (r262088)
+++ stable/9/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c  Mon Feb 
17 17:00:46 2014        (r262089)
@@ -22,6 +22,7 @@
 /*
  * Copyright (c) 2012 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  */
 
 #ifdef _KERNEL
@@ -159,4 +160,7 @@ zpool_feature_init(void)
        zfeature_register(SPA_FEATURE_LZ4_COMPRESS,
            "org.illumos:lz4_compress", "lz4_compress",
            "LZ4 compression algorithm support.", B_FALSE, B_FALSE, NULL);
+       zfeature_register(SPA_FEATURE_MULTI_VDEV_CRASH_DUMP,
+           "com.joyent:multi_vdev_crash_dump", "multi_vdev_crash_dump",
+           "Crash dumps to multiple vdev pools.", B_FALSE, B_FALSE, NULL);
 }

Modified: stable/9/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h
==============================================================================
--- stable/9/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h  Mon Feb 
17 16:48:11 2014        (r262088)
+++ stable/9/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h  Mon Feb 
17 17:00:46 2014        (r262089)
@@ -22,6 +22,7 @@
 /*
  * Copyright (c) 2012 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  */
 
 #ifndef _ZFEATURE_COMMON_H
@@ -53,6 +54,7 @@ static enum spa_feature {
        SPA_FEATURE_ASYNC_DESTROY,
        SPA_FEATURE_EMPTY_BPOBJ,
        SPA_FEATURE_LZ4_COMPRESS,
+       SPA_FEATURE_MULTI_VDEV_CRASH_DUMP,
        SPA_FEATURES
 } spa_feature_t;
 

Modified: stable/9/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c
==============================================================================
--- stable/9/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c Mon Feb 17 
16:48:11 2014        (r262088)
+++ stable/9/sys/cddl/contrib/opensolaris/common/zfs/zfs_prop.c Mon Feb 17 
17:00:46 2014        (r262089)
@@ -22,6 +22,7 @@
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  */
 
 /* Portions Copyright 2010 Robert Milkowski */
@@ -69,6 +70,7 @@ zfs_prop_init(void)
                { "fletcher2",  ZIO_CHECKSUM_FLETCHER_2 },
                { "fletcher4",  ZIO_CHECKSUM_FLETCHER_4 },
                { "sha256",     ZIO_CHECKSUM_SHA256 },
+               { "noparity",   ZIO_CHECKSUM_NOPARITY },
                { NULL }
        };
 

Modified: stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
==============================================================================
--- stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c      Mon Feb 
17 16:48:11 2014        (r262088)
+++ stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c      Mon Feb 
17 17:00:46 2014        (r262089)
@@ -23,6 +23,7 @@
  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2013 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -2795,7 +2796,8 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_
                    dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite);
                mutex_exit(&db->db_mtx);
        } else if (db->db_state == DB_NOFILL) {
-               ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF);
+               ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF ||
+                   zp.zp_checksum == ZIO_CHECKSUM_NOPARITY);
                dr->dr_zio = zio_write(zio, os->os_spa, txg,
                    db->db_blkptr, NULL, db->db.db_size, &zp,
                    dbuf_write_nofill_ready, NULL, dbuf_write_nofill_done, db,

Modified: stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
==============================================================================
--- stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c       Mon Feb 
17 16:48:11 2014        (r262088)
+++ stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c       Mon Feb 
17 17:00:46 2014        (r262089)
@@ -22,8 +22,8 @@
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2013 by Delphix. All rights reserved.
  */
-
 /* Copyright (c) 2013 by Saso Kiselkov. All rights reserved. */
+/* Copyright (c) 2013, Joyent, Inc. All rights reserved. */
 
 #include <sys/dmu.h>
 #include <sys/dmu_impl.h>
@@ -1682,7 +1682,7 @@ dmu_write_policy(objset_t *os, dnode_t *
                 * pipeline.
                 */
                compress = ZIO_COMPRESS_OFF;
-               checksum = ZIO_CHECKSUM_OFF;
+               checksum = ZIO_CHECKSUM_NOPARITY;
        } else {
                compress = zio_compress_select(dn->dn_compress, compress);
 

Modified: 
stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_disk.h
==============================================================================
--- stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_disk.h     
Mon Feb 17 16:48:11 2014        (r262088)
+++ stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_disk.h     
Mon Feb 17 17:00:46 2014        (r262089)
@@ -21,13 +21,12 @@
 /*
  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
+ * Copyright (c) 2013 Joyent, Inc. All rights reserved.
  */
 
 #ifndef _SYS_VDEV_DISK_H
 #define        _SYS_VDEV_DISK_H
 
-#pragma ident  "%Z%%M% %I%     %E% SMI"
-
 #include <sys/vdev.h>
 #ifdef _KERNEL
 #include <sys/buf.h>
@@ -40,14 +39,23 @@
 extern "C" {
 #endif
 
+#ifdef _KERNEL
 typedef struct vdev_disk {
        ddi_devid_t     vd_devid;
        char            *vd_minor;
        ldi_handle_t    vd_lh;
 } vdev_disk_t;
+#endif
 
+extern int vdev_disk_physio(vdev_t *,
+    caddr_t, size_t, uint64_t, int, boolean_t);
+
+/*
+ * Since vdev_disk.c is not compiled into libzpool, this function should only be
+ * defined in the zfs kernel module.
+ */
 #ifdef _KERNEL
-extern int vdev_disk_physio(ldi_handle_t, caddr_t, size_t, uint64_t, int);
+extern int vdev_disk_ldi_physio(ldi_handle_t, caddr_t, size_t, uint64_t, int);
 #endif
 #ifdef __cplusplus
 }

Copied: 
stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_raidz.h (from 
r255750, head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_raidz.h)
==============================================================================
--- /dev/null   00:00:00 1970   (empty, because file is newly added)
+++ stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_raidz.h    
Mon Feb 17 17:00:46 2014        (r262089, copy of r255750, 
head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_raidz.h)
@@ -0,0 +1,50 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ */
+
+#ifndef _SYS_VDEV_RAIDZ_H
+#define        _SYS_VDEV_RAIDZ_H
+
+#include <sys/vdev.h>
+#ifdef illumos
+#include <sys/semaphore.h>
+#ifdef _KERNEL
+#include <sys/ddi.h>
+#include <sys/sunldi.h>
+#include <sys/sunddi.h>
+#endif
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef _KERNEL
+extern int vdev_raidz_physio(vdev_t *,
+    caddr_t, size_t, uint64_t, uint64_t, boolean_t, boolean_t);
+#endif
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_VDEV_RAIDZ_H */

Modified: stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
==============================================================================
--- stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h   Mon Feb 
17 16:48:11 2014        (r262088)
+++ stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h   Mon Feb 
17 17:00:46 2014        (r262089)
@@ -25,6 +25,7 @@
  * Copyright (c) 2012 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  */
 
 #ifndef _ZIO_H
@@ -81,6 +82,7 @@ enum zio_checksum {
        ZIO_CHECKSUM_FLETCHER_4,
        ZIO_CHECKSUM_SHA256,
        ZIO_CHECKSUM_ZILOG2,
+       ZIO_CHECKSUM_NOPARITY,
        ZIO_CHECKSUM_FUNCTIONS
 };
 

Modified: stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c
==============================================================================
--- stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c Mon Feb 
17 16:48:11 2014        (r262088)
+++ stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c Mon Feb 
17 17:00:46 2014        (r262089)
@@ -21,6 +21,7 @@
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright 2013 Joyent, Inc.  All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -391,8 +392,29 @@ vdev_disk_close(vdev_t *vd)
 }
 
 int
-vdev_disk_physio(ldi_handle_t vd_lh, caddr_t data, size_t size,
-    uint64_t offset, int flags)
+vdev_disk_physio(vdev_t *vd, caddr_t data,
+    size_t size, uint64_t offset, int flags, boolean_t isdump)
+{
+       vdev_disk_t *dvd = vd->vdev_tsd;
+
+       ASSERT(vd->vdev_ops == &vdev_disk_ops);
+
+       /*
+        * If in the context of an active crash dump, use the ldi_dump(9F)
+        * call instead of ldi_strategy(9F) as usual.
+        */
+       if (isdump) {
+               ASSERT3P(dvd, !=, NULL);
+               return (ldi_dump(dvd->vd_lh, data, lbtodb(offset),
+                   lbtodb(size)));
+       }
+
+       return (vdev_disk_ldi_physio(dvd->vd_lh, data, size, offset, flags));
+}
+
+int
+vdev_disk_ldi_physio(ldi_handle_t vd_lh, caddr_t data,
+    size_t size, uint64_t offset, int flags)
 {
        buf_t *bp;
        int error = 0;
@@ -640,7 +662,7 @@ vdev_disk_read_rootlabel(char *devpath, 
 
                /* read vdev label */
                offset = vdev_label_offset(size, l, 0);
-               if (vdev_disk_physio(vd_lh, (caddr_t)label,
+               if (vdev_disk_ldi_physio(vd_lh, (caddr_t)label,
                    VDEV_SKIP_SIZE + VDEV_PHYS_SIZE, offset, B_READ) != 0)
                        continue;
 

Modified: stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c
==============================================================================
--- stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c        
Mon Feb 17 16:48:11 2014        (r262088)
+++ stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c        
Mon Feb 17 17:00:46 2014        (r262089)
@@ -22,15 +22,22 @@
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/vdev_impl.h>
+#ifdef illumos
+#include <sys/vdev_disk.h>
+#endif
+#include <sys/vdev_file.h>
+#include <sys/vdev_raidz.h>
 #include <sys/zio.h>
 #include <sys/zio_checksum.h>
 #include <sys/fs/zfs.h>
 #include <sys/fm/fs/zfs.h>
+#include <sys/bio.h>
 
 /*
  * Virtual device vector for RAID-Z.
@@ -154,6 +161,8 @@ typedef struct raidz_map {
        VDEV_RAIDZ_64MUL_2((x), mask); \
 }
 
+#define        VDEV_LABEL_OFFSET(x)    (x + VDEV_LABEL_START_SIZE)
+
 /*
  * Force reconstruction to use the general purpose method.
  */
@@ -437,14 +446,14 @@ static const zio_vsd_ops_t vdev_raidz_vs
  * the number of children in the target vdev.
  */
 static raidz_map_t *
-vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
-    uint64_t nparity)
+vdev_raidz_map_alloc(caddr_t data, uint64_t size, uint64_t offset, boolean_t 
dofree,
+    uint64_t unit_shift, uint64_t dcols, uint64_t nparity)
 {
        raidz_map_t *rm;
        /* The starting RAIDZ (parent) vdev sector of the block. */
-       uint64_t b = zio->io_offset >> unit_shift;
+       uint64_t b = offset >> unit_shift;
        /* The zio's size in units of the vdev's minimum sector size. */
-       uint64_t s = zio->io_size >> unit_shift;
+       uint64_t s = size >> unit_shift;
        /* The first column for this stripe. */
        uint64_t f = b % dcols;
        /* The starting byte offset on each child vdev. */
@@ -532,13 +541,13 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_
        ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
        ASSERT3U(rm->rm_nskip, <=, nparity);
 
-       if (zio->io_type != ZIO_TYPE_FREE) {
+       if (!dofree) {
                for (c = 0; c < rm->rm_firstdatacol; c++) {
                        rm->rm_col[c].rc_data =
                            zio_buf_alloc(rm->rm_col[c].rc_size);
                }
 
-               rm->rm_col[c].rc_data = zio->io_data;
+               rm->rm_col[c].rc_data = data;
 
                for (c = c + 1; c < acols; c++) {
                        rm->rm_col[c].rc_data =
@@ -570,7 +579,7 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_
        ASSERT(rm->rm_cols >= 2);
        ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
 
-       if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
+       if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
                devidx = rm->rm_col[0].rc_devidx;
                o = rm->rm_col[0].rc_offset;
                rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
@@ -582,8 +591,6 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_
                        rm->rm_skipstart = 1;
        }
 
-       zio->io_vsd = rm;
-       zio->io_vsd_ops = &vdev_raidz_vsd_ops;
        return (rm);
 }
 
@@ -993,12 +1000,9 @@ vdev_raidz_reconstruct_pq(raidz_map_t *r
  *           ~~                               ~~
  *           __                               __
  *           |  1   1   1   1   1   1   1   1  |
- *           | 128  64  32  16  8   4   2   1  |
  *           |  19 205 116  29  64  16  4   1  |
  *           |  1   0   0   0   0   0   0   0  |
- *           |  0   1   0   0   0   0   0   0  |
- *  (V|I)' = |  0   0   1   0   0   0   0   0  |
- *           |  0   0   0   1   0   0   0   0  |
+ *  (V|I)' = |  0   0   0   1   0   0   0   0  |
  *           |  0   0   0   0   1   0   0   0  |
  *           |  0   0   0   0   0   1   0   0  |
  *           |  0   0   0   0   0   0   1   0  |
@@ -1532,6 +1536,154 @@ vdev_raidz_close(vdev_t *vd)
                vdev_close(vd->vdev_child[c]);
 }
 
+#ifdef illumos
+/*
+ * Handle a read or write I/O to a RAID-Z dump device.
+ *
+ * The dump device is in a unique situation compared to other ZFS datasets:
+ * writing to this device should be as simple and fast as possible.  In
+ * addition, durability matters much less since the dump will be extracted
+ * once the machine reboots.  For that reason, this function eschews parity for
+ * performance and simplicity.  The dump device uses the checksum setting
+ * ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this
+ * dataset.
+ *
+ * Blocks of size 128 KB have been preallocated for this volume.  I/Os less than
+ * 128 KB will not fill an entire block; in addition, they may not be properly
+ * aligned.  In that case, this function uses the preallocated 128 KB block and
+ * omits reading or writing any "empty" portions of that block, as opposed to
+ * allocating a fresh appropriately-sized block.
+ *
+ * Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs:
+ *
+ *     vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB)
+ *
+ * If this were a standard RAID-Z dataset, a block of at least 40 KB would be
+ * allocated which spans all five child vdevs.  8 KB of data would be written to
+ * each of four vdevs, with the fifth containing the parity bits.
+ *
+ *       parity    data     data     data     data
+ *     |   PP   |   XX   |   XX   |   XX   |   XX   |
+ *         ^        ^        ^        ^        ^
+ *         |        |        |        |        |
+ *   8 KB parity    ------8 KB data blocks------
+ *
+ * However, when writing to the dump device, the behavior is different:
+ *
+ *     vdev_raidz_physio(data, size: 32 KB, offset: 64 KB)
+ *
+ * Unlike the normal RAID-Z case in which the block is allocated based on the
+ * I/O size, reads and writes here always use a 128 KB logical I/O size.  If the
+ * I/O size is less than 128 KB, only the actual portions of data are written.
+ * In this example the data is written to the third data vdev since that vdev
+ * contains the offset [64 KB, 96 KB).
+ *
+ *       parity    data     data     data     data
+ *     |        |        |        |   XX   |        |
+ *                                    ^
+ *                                    |
+ *                             32 KB data block
+ *
+ * As a result, an individual I/O may not span all child vdevs; moreover, a
+ * small I/O may only operate on a single child vdev.
+ *
+ * Note that since there are no parity bits calculated or written, this format
+ * remains the same no matter how many parity bits are used in a normal RAID-Z
+ * stripe.  On a RAID-Z3 configuration with seven child vdevs, the example above
+ * would look like:
+ *
+ *       parity   parity   parity    data     data     data     data
+ *     |        |        |        |        |        |   XX   |        |
+ *                                                      ^
+ *                                                      |
+ *                                               32 KB data block
+ */
+int
+vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
+    uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump)
+{
+       vdev_t *tvd = vd->vdev_top;
+       vdev_t *cvd;
+       raidz_map_t *rm;
+       raidz_col_t *rc;
+       int c, err = 0;
+
+       uint64_t start, end, colstart, colend;
+       uint64_t coloffset, colsize, colskip;
+
+       int flags = doread ? BIO_READ : BIO_WRITE;
+
+#ifdef _KERNEL
+
+       /*
+        * Don't read or write past the end of the block
+        */
+       VERIFY3U(offset + size, <=, origoffset + SPA_MAXBLOCKSIZE);
+
+       start = offset;
+       end = start + size;
+
+       /*
+        * Allocate a RAID-Z map for this block.  Note that this block starts
+        * from the "original" offset, that is, the offset of the extent which
+        * contains the requisite offset of the data being read or written.
+        *
+        * Even if this I/O operation doesn't span the full block size, let's
+        * treat the on-disk format as if the only blocks are the complete 128
+        * KB size.
+        */
+       rm = vdev_raidz_map_alloc(data - (offset - origoffset),
+           SPA_MAXBLOCKSIZE, origoffset, B_FALSE, tvd->vdev_ashift, 
vd->vdev_children,
+           vd->vdev_nparity);
+
+       coloffset = origoffset;
+
+       for (c = rm->rm_firstdatacol; c < rm->rm_cols;
+           c++, coloffset += rc->rc_size) {
+               rc = &rm->rm_col[c];
+               cvd = vd->vdev_child[rc->rc_devidx];
+
+               /*
+                * Find the start and end of this column in the RAID-Z map,
+                * keeping in mind that the stated size and offset of the
+                * operation may not fill the entire column for this vdev.
+                *
+                * If any portion of the data spans this column, issue the
+                * appropriate operation to the vdev.
+                */
+               if (coloffset + rc->rc_size <= start)
+                       continue;
+               if (coloffset >= end)
+                       continue;
+
+               colstart = MAX(coloffset, start);
+               colend = MIN(end, coloffset + rc->rc_size);
+               colsize = colend - colstart;
+               colskip = colstart - coloffset;
+
+               VERIFY3U(colsize, <=, rc->rc_size);
+               VERIFY3U(colskip, <=, rc->rc_size);
+
+               /*
+                * Note that the child vdev will have a vdev label at the start
+                * of its range of offsets, hence the need for
+                * VDEV_LABEL_OFFSET().  See zio_vdev_child_io() for another
+                * example of why this calculation is needed.
+                */
+               if ((err = vdev_disk_physio(cvd,
+                   ((char *)rc->rc_data) + colskip, colsize,
+                   VDEV_LABEL_OFFSET(rc->rc_offset) + colskip,
+                   flags, isdump)) != 0)
+                       break;
+       }
+
+       vdev_raidz_map_free(rm);
+#endif /* _KERNEL */
+
+       return (err);
+}
+#endif
+
 static uint64_t
 vdev_raidz_asize(vdev_t *vd, uint64_t psize)
 {
@@ -1584,9 +1736,14 @@ vdev_raidz_io_start(zio_t *zio)
        raidz_col_t *rc;
        int c, i;
 
-       rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
+       rm = vdev_raidz_map_alloc(zio->io_data, zio->io_size, zio->io_offset,
+           zio->io_type == ZIO_TYPE_FREE,
+           tvd->vdev_ashift, vd->vdev_children,
            vd->vdev_nparity);
 
+       zio->io_vsd = rm;
+       zio->io_vsd_ops = &vdev_raidz_vsd_ops;
+
        ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
 
        if (zio->io_type == ZIO_TYPE_FREE) {
@@ -1729,6 +1886,13 @@ raidz_parity_verify(zio_t *zio, raidz_ma
        int c, ret = 0;
        raidz_col_t *rc;
 
+       blkptr_t *bp = zio->io_bp;
+       enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
+           (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
+
+       if (checksum == ZIO_CHECKSUM_NOPARITY)
+               return (ret);
+
        for (c = 0; c < rm->rm_firstdatacol; c++) {
                rc = &rm->rm_col[c];
                if (!rc->rc_tried || rc->rc_error != 0)

Modified: stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c
==============================================================================
--- stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c      
Mon Feb 17 16:48:11 2014        (r262088)
+++ stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c      
Mon Feb 17 17:00:46 2014        (r262089)
@@ -21,6 +21,7 @@
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -78,6 +79,7 @@ zio_checksum_info_t zio_checksum_table[Z
        {{fletcher_4_native,    fletcher_4_byteswap},   1, 0, 0, "fletcher4"},
        {{zio_checksum_SHA256,  zio_checksum_SHA256},   1, 0, 1, "sha256"},
        {{fletcher_4_native,    fletcher_4_byteswap},   0, 1, 0, "zilog2"},
+       {{zio_checksum_off,     zio_checksum_off},      0, 0, 0, "noparity"},
 };
 
 enum zio_checksum

Modified: stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c
==============================================================================
--- stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c      Mon Feb 
17 16:48:11 2014        (r262088)
+++ stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c      Mon Feb 
17 17:00:46 2014        (r262089)
@@ -24,6 +24,7 @@
  * Copyright (c) 2006-2010 Pawel Jakub Dawidek <p...@freebsd.org>
  * All rights reserved.
  * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  */
 
 /* Portions Copyright 2010 Robert Milkowski */
@@ -60,6 +61,7 @@
 #include <sys/stat.h>
 #include <sys/zap.h>
 #include <sys/spa.h>
+#include <sys/spa_impl.h>
 #include <sys/zio.h>
 #include <sys/dmu_traverse.h>
 #include <sys/dnode.h>
@@ -77,9 +79,14 @@
 #include <sys/zfs_znode.h>
 #include <sys/zfs_rlock.h>
 #include <sys/vdev_impl.h>
+#include <sys/vdev_raidz.h>
 #include <sys/zvol.h>
 #include <sys/zil_impl.h>
 #include <sys/dbuf.h>
+#include <sys/dmu_tx.h>
+#include <sys/zfeature.h>
+#include <sys/zio_checksum.h>
+
 #include <geom/geom.h>
 
 #include "zfs_namecheck.h"
@@ -1164,27 +1171,28 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_
 
 #ifdef sun
 static int
-zvol_dumpio_vdev(vdev_t *vd, void *addr, uint64_t offset, uint64_t size,
-    boolean_t doread, boolean_t isdump)
+zvol_dumpio_vdev(vdev_t *vd, void *addr, uint64_t offset, uint64_t origoffset,
+    uint64_t size, boolean_t doread, boolean_t isdump)
 {
        vdev_disk_t *dvd;
        int c;
        int numerrors = 0;
 
-       for (c = 0; c < vd->vdev_children; c++) {
-               ASSERT(vd->vdev_ops == &vdev_mirror_ops ||
-                   vd->vdev_ops == &vdev_replacing_ops ||
-                   vd->vdev_ops == &vdev_spare_ops);
-               int err = zvol_dumpio_vdev(vd->vdev_child[c],
-                   addr, offset, size, doread, isdump);
-               if (err != 0) {
-                       numerrors++;
-               } else if (doread) {
-                       break;
+       if (vd->vdev_ops == &vdev_mirror_ops ||
+           vd->vdev_ops == &vdev_replacing_ops ||
+           vd->vdev_ops == &vdev_spare_ops) {
+               for (c = 0; c < vd->vdev_children; c++) {
+                       int err = zvol_dumpio_vdev(vd->vdev_child[c],
+                           addr, offset, origoffset, size, doread, isdump);
+                       if (err != 0) {
+                               numerrors++;
+                       } else if (doread) {
+                               break;
+                       }
                }
        }
 
-       if (!vd->vdev_ops->vdev_op_leaf)
+       if (!vd->vdev_ops->vdev_op_leaf && vd->vdev_ops != &vdev_raidz_ops)
                return (numerrors < vd->vdev_children ? 0 : EIO);
 
        if (doread && !vdev_readable(vd))
@@ -1192,19 +1200,26 @@ zvol_dumpio_vdev(vdev_t *vd, void *addr,
        else if (!doread && !vdev_writeable(vd))
                return (SET_ERROR(EIO));
 
-       dvd = vd->vdev_tsd;
-       ASSERT3P(dvd, !=, NULL);
+       if (vd->vdev_ops == &vdev_raidz_ops) {
+               return (vdev_raidz_physio(vd,
+                   addr, size, offset, origoffset, doread, isdump));
+       }
+
        offset += VDEV_LABEL_START_SIZE;
 
        if (ddi_in_panic() || isdump) {
                ASSERT(!doread);
                if (doread)
                        return (SET_ERROR(EIO));
+               dvd = vd->vdev_tsd;
+               ASSERT3P(dvd, !=, NULL);
                return (ldi_dump(dvd->vd_lh, addr, lbtodb(offset),
                    lbtodb(size)));
        } else {
-               return (vdev_disk_physio(dvd->vd_lh, addr, size, offset,
-                   doread ? B_READ : B_WRITE));
+               dvd = vd->vdev_tsd;
+               ASSERT3P(dvd, !=, NULL);
+               return (vdev_disk_ldi_physio(dvd->vd_lh, addr, size,
+                   offset, doread ? B_READ : B_WRITE));
        }
 }
 
@@ -1239,7 +1254,8 @@ zvol_dumpio(zvol_state_t *zv, void *addr
 
        vd = vdev_lookup_top(spa, DVA_GET_VDEV(&ze->ze_dva));
        offset += DVA_GET_OFFSET(&ze->ze_dva);
-       error = zvol_dumpio_vdev(vd, addr, offset, size, doread, isdump);
+       error = zvol_dumpio_vdev(vd, addr, offset, DVA_GET_OFFSET(&ze->ze_dva),
+           size, doread, isdump);
 
        if (!ddi_in_panic())
                spa_config_exit(spa, SCL_STATE, FTAG);
@@ -1259,6 +1275,7 @@ zvol_strategy(struct bio *bp)
        rl_t *rl;
        int error = 0;
        boolean_t doread = (bp->bio_cmd == BIO_READ);
+       boolean_t is_dumpified;
        boolean_t sync;
 
        if (zv == NULL) {
@@ -1285,7 +1302,13 @@ zvol_strategy(struct bio *bp)
                return (0);
        }
 
-        sync = !doread && zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
+#ifdef illumos
+       is_dumpified = zv->zv_flags & ZVOL_DUMPIFIED;
+#else
+       is_dumpified = B_FALSE;
+#endif
+        sync = !doread && !is_dumpified &&
+           zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
 
        /*
         * There must be no buffer changes when doing a dmu_sync() because
@@ -1296,7 +1319,15 @@ zvol_strategy(struct bio *bp)
 
        while (resid != 0 && off < volsize) {
                size_t size = MIN(resid, zvol_maxphys);
+#ifdef illumos
+               if (is_dumpified) {
+                       size = MIN(size, P2END(off, zv->zv_volblocksize) - off);
+                       error = zvol_dumpio(zv, addr, off, size,
+                           doread, B_FALSE);
+               } else if (doread) {
+#else
                if (doread) {
+#endif
                        error = dmu_read(os, ZVOL_OBJ, off, size, addr,
                            DMU_READ_PREFETCH);
                } else {
@@ -1830,21 +1861,67 @@ zvol_fini(void)
 }
 
 #ifdef sun
+/*ARGSUSED*/
+static int
+zfs_mvdev_dump_feature_check(void *arg, dmu_tx_t *tx)
+{
+       spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+
+       if (spa_feature_is_active(spa,
+           &spa_feature_table[SPA_FEATURE_MULTI_VDEV_CRASH_DUMP]))
+               return (1);
+       return (0);
+}
+
+/*ARGSUSED*/
+static void
+zfs_mvdev_dump_activate_feature_sync(void *arg, dmu_tx_t *tx)
+{
+       spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+
+       spa_feature_incr(spa,
+           &spa_feature_table[SPA_FEATURE_MULTI_VDEV_CRASH_DUMP], tx);
+}
+
 static int
 zvol_dump_init(zvol_state_t *zv, boolean_t resize)
 {
        dmu_tx_t *tx;
-       int error = 0;
+       int error;
        objset_t *os = zv->zv_objset;
+       spa_t *spa = dmu_objset_spa(os);
+       vdev_t *vd = spa->spa_root_vdev;
        nvlist_t *nv = NULL;
-       uint64_t version = spa_version(dmu_objset_spa(zv->zv_objset));
+       uint64_t version = spa_version(spa);
+       enum zio_checksum checksum;
 
        ASSERT(MUTEX_HELD(&spa_namespace_lock));
+       ASSERT(vd->vdev_ops == &vdev_root_ops);
+
        error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, 0,
            DMU_OBJECT_END);
        /* wait for dmu_free_long_range to actually free the blocks */
        txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
 
+       /*
+        * If the pool on which the dump device is being initialized has more
+        * than one child vdev, check that the MULTI_VDEV_CRASH_DUMP feature is
+        * enabled.  If so, bump that feature's counter to indicate that the
+        * feature is active. We also check the vdev type to handle the
+        * following case:
+        *   # zpool create test raidz disk1 disk2 disk3
+        *   Now have spa_root_vdev->vdev_children == 1 (the raidz vdev),
+        *   the raidz vdev itself has 3 children.
+        */
+       if (vd->vdev_children > 1 || vd->vdev_ops == &vdev_raidz_ops) {
+               if (!spa_feature_is_enabled(spa,
+                   &spa_feature_table[SPA_FEATURE_MULTI_VDEV_CRASH_DUMP]))
+                       return (SET_ERROR(ENOTSUP));
+               (void) dsl_sync_task(spa_name(spa),
+                   zfs_mvdev_dump_feature_check,
+                   zfs_mvdev_dump_activate_feature_sync, NULL, 2);
+       }
+
        tx = dmu_tx_create(os);
        dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
        dmu_tx_hold_bonus(tx, ZVOL_OBJ);
@@ -1855,6 +1932,14 @@ zvol_dump_init(zvol_state_t *zv, boolean
        }
 
        /*
+        * If MULTI_VDEV_CRASH_DUMP is active, use the NOPARITY checksum
+        * function.  Otherwise, use the old default -- OFF.
+        */
+       checksum = spa_feature_is_active(spa,
+           &spa_feature_table[SPA_FEATURE_MULTI_VDEV_CRASH_DUMP]) ?
+           ZIO_CHECKSUM_NOPARITY : ZIO_CHECKSUM_OFF;
+
+       /*
         * If we are resizing the dump device then we only need to
         * update the refreservation to match the newly updated
         * zvolsize. Otherwise, we save off the original state of the
@@ -1917,7 +2002,7 @@ zvol_dump_init(zvol_state_t *zv, boolean
                    ZIO_COMPRESS_OFF) == 0);
                VERIFY(nvlist_add_uint64(nv,
                    zfs_prop_to_name(ZFS_PROP_CHECKSUM),
-                   ZIO_CHECKSUM_OFF) == 0);
+                   checksum) == 0);
                if (version >= SPA_VERSION_DEDUP) {
                        VERIFY(nvlist_add_uint64(nv,
                            zfs_prop_to_name(ZFS_PROP_DEDUP),
_______________________________________________
svn-src-stable-9@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/svn-src-stable-9
To unsubscribe, send any mail to "svn-src-stable-9-unsubscr...@freebsd.org"

Reply via email to