I've discovered another issue with both the current code and the previous
versions of the patch.  So another version is at the same URL:
http://people.freebsd.org/~avg/zfs-boot-gang.diff

The problem was with gang blocks on a raidz vdev.  zio_read_gang would pass a
NULL bp to vdev_raidz_read, but that function doesn't expect that and really
needs a valid bp.

Pawel, I also understood the code in zio_read that does size rounding up based
on v_ashift.  That code is for vdev_raidz_read which needs the size to be
multiple of 1 << v_ashift.  And, again, this would have posed a problem for
zio_read_gang, which always passed SPA_GANGBLOCKSIZE when reading a gang block
header.

So, a list of fixes and logical changes:
- correctly read gang header from raidz [*]
- decompress assembled gang block data if compressed
- verify checksum of a gang header
- verify checksum of assembled gang block data
- verify checksum of uber block

[*] - new in this version of patch.

Description of the code changes:
- remove offset parameter from zio_checksum_error
This parameter has only been used for the case of verifying checksum of a vdev
label.  The offset is now passed via DVA offset field in a made up bp pointing
to the label.
zio_checksum_error now gets all checksum parameters from the bp.

- zio_read_gang now uses an artificial bp to read a gang header via zio_read
This solves all the problems with gang blocks on raidz as zio_read already has
all the code to handle raidz correctly.

- zio_read performs size rounding only if v_read == vdev_raidz_read
This is to make the intention of the code more clear.  And also to slightly
optimize non-raidz cases with non-default ashift where an unnecessary
intermediate buffer would otherwise be used.

Some inline comments (marked with %):

Index: sys/cddl/boot/zfs/zfssubr.c
===================================================================
--- sys/cddl/boot/zfs/zfssubr.c (revision 225581)
+++ sys/cddl/boot/zfs/zfssubr.c (working copy)
@@ -181,14 +181,17 @@
 }

 static int
-zio_checksum_error(const blkptr_t *bp, void *data, uint64_t offset)
+zio_checksum_error(const blkptr_t *bp, void *data)
 {
-       unsigned int checksum = BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER :
BP_GET_CHECKSUM(bp);
-       uint64_t size = BP_GET_PSIZE(bp);
+       uint64_t size;
+       unsigned int checksum;
        zio_checksum_info_t *ci;
        zio_cksum_t actual_cksum, expected_cksum, verifier;
        int byteswap;

+       checksum = BP_GET_CHECKSUM(bp);
+       size = BP_GET_PSIZE(bp);
+
%
% checksum type and size are always taken from the bp.
% Note that BP_IS_GANG(bp) doesn't imply that the caller wants to verify
% the gang header (as was assumed before); the caller may want to verify
% the whole assembled data as well.
%
        if (checksum >= ZIO_CHECKSUM_FUNCTIONS)
                return (EINVAL);
        ci = &zio_checksum_table[checksum];
@@ -206,7 +209,8 @@
                if (checksum == ZIO_CHECKSUM_GANG_HEADER)
                        zio_checksum_gang_verifier(&verifier, bp);
                else if (checksum == ZIO_CHECKSUM_LABEL)
-                       zio_checksum_label_verifier(&verifier, offset);
+                       zio_checksum_label_verifier(&verifier,
+                           DVA_GET_OFFSET(BP_IDENTITY(bp)));
%
% label offset is taken from the bp now.
%
                else
                        verifier = bp->blk_cksum;

@@ -224,7 +228,6 @@
                        byteswap_uint64_array(&expected_cksum,
                            sizeof (zio_cksum_t));
        } else {
-               ASSERT(!BP_IS_GANG(bp));
%
% the assert is no longer valid as we pass a gang block bp
% when verifying checksum of assembled data
%
                expected_cksum = bp->blk_cksum;
                ci->ci_func[0](data, size, &actual_cksum);
        }
@@ -1248,7 +1251,7 @@
 raidz_checksum_verify(const blkptr_t *bp, void *data)
 {

-       return (zio_checksum_error(bp, data, 0));
+       return (zio_checksum_error(bp, data));
 }

 /*
Index: sys/boot/zfs/zfsimpl.c
===================================================================
--- sys/boot/zfs/zfsimpl.c      (revision 225581)
+++ sys/boot/zfs/zfsimpl.c      (working copy)
@@ -347,7 +347,7 @@
        rc = vdev->v_phys_read(vdev, vdev->v_read_priv, offset, buf, psize);
        if (rc)
                return (rc);
-       if (bp && zio_checksum_error(bp, buf, offset))
+       if (bp && zio_checksum_error(bp, buf))
%
% the bp can still be NULL here: when raidz code reads blocks of data
% it doesn't pass any bp, because there is really no bp for that data.
% this should be the only case, in all other cases a valid bp must be provided.
%
                return (EIO);

        return (0);
@@ -800,6 +800,7 @@
        BP_SET_PSIZE(&bp, sizeof(vdev_phys_t));
        BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
        BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
+       DVA_SET_OFFSET(BP_IDENTITY(&bp), off);
%
% as described above, the offset is now passed via DVA offset field
%
        ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
        if (vdev_read_phys(&vtmp, &bp, vdev_label, off, 0))
                return (EIO);
@@ -941,7 +942,7 @@
                BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
                ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);

-               if (vdev_read_phys(vdev, NULL, upbuf, off, 
VDEV_UBERBLOCK_SIZE(vdev)))
+               if (vdev_read_phys(vdev, &bp, upbuf, off, 0))
%
% pass the artificial uberblock bp to vdev_read_phys, so that it
% can call zio_checksum_error and verify the checksum
%
                        continue;

                if (up->ub_magic != UBERBLOCK_MAGIC)
@@ -974,34 +975,39 @@
 }

 static int
-zio_read_gang(spa_t *spa, const blkptr_t *bp, const dva_t *dva, void *buf)
+zio_read_gang(spa_t *spa, const blkptr_t *bp, void *buf)
 {
+       blkptr_t gbh_bp;
        zio_gbh_phys_t zio_gb;
-       vdev_t *vdev;
-       int vdevid;
-       off_t offset;
+       char *pbuf;
        int i;

-       vdevid = DVA_GET_VDEV(dva);
-       offset = DVA_GET_OFFSET(dva);
-       STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink)
-               if (vdev->v_id == vdevid)
-                       break;
-       if (!vdev || !vdev->v_read)
+       /* Artificial BP for gang block header. */
+       gbh_bp = *bp;
+       BP_SET_PSIZE(&gbh_bp, SPA_GANGBLOCKSIZE);
+       BP_SET_LSIZE(&gbh_bp, SPA_GANGBLOCKSIZE);
+       BP_SET_CHECKSUM(&gbh_bp, ZIO_CHECKSUM_GANG_HEADER);
+       BP_SET_COMPRESS(&gbh_bp, ZIO_COMPRESS_OFF);
+       for (i = 0; i < SPA_DVAS_PER_BP; i++)
+               DVA_SET_GANG(&gbh_bp.blk_dva[i], 0);
%
% the artificial bp for the gang header.
% it has PSIZE and LSIZE of SPA_GANGBLOCKSIZE.
% checksum is set to ZIO_CHECKSUM_GANG_HEADER, so that zio_checksum_error
% does the right thing.
% compression is set to ZIO_COMPRESS_OFF, so that zio_read does the right thing.
% the gang bit is cleared from the DVAs - we read only the gang header block and
% do not want to create endless recursion here.
% vdevs and offsets are preserved in the DVAs of the BP.
%
+
+       /* Read gang header block using the artificial BP. */
+       if (zio_read(spa, &gbh_bp, &zio_gb))
                return (EIO);

-       if (vdev->v_read(vdev, NULL, &zio_gb, offset, SPA_GANGBLOCKSIZE))
-               return (EIO);
%
% so now smarter zio_read can be used instead of dumber v_read.
% Which, again, would have crashed on NULL bp if v_read == vdev_raidz_read
%

+       pbuf = buf;
%
% keep the original buffer pointer, use pbuf to populate the buffer
%
        for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
                blkptr_t *gbp = &zio_gb.zg_blkptr[i];

                if (BP_IS_HOLE(gbp))
                        continue;
-               if (zio_read(spa, gbp, buf))
+               if (zio_read(spa, gbp, pbuf))
                        return (EIO);
-               buf = (char*)buf + BP_GET_PSIZE(gbp);
+               pbuf += BP_GET_PSIZE(gbp);
        }
-
+
+       if (zio_checksum_error(bp, buf))
+               return (EIO);
%
% the original pointer is used to verify checksum of the assembled data
% of the gang block.
% Note: this is where the gang bp is passed to zio_checksum_error.
%
        return (0);
 }

@@ -1024,46 +1030,41 @@
                if (!dva->dva_word[0] && !dva->dva_word[1])
                        continue;

-               if (DVA_GET_GANG(dva)) {
-                       error = zio_read_gang(spa, bp, dva, buf);
-                       if (error != 0)
-                               continue;
-               } else {
-                       vdevid = DVA_GET_VDEV(dva);
-                       offset = DVA_GET_OFFSET(dva);
-                       STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) {
-                               if (vdev->v_id == vdevid)
-                                       break;
-                       }
-                       if (!vdev || !vdev->v_read)
-                               continue;
+               vdevid = DVA_GET_VDEV(dva);
+               offset = DVA_GET_OFFSET(dva);
+               STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) {
+                       if (vdev->v_id == vdevid)
+                               break;
+               }
+               if (!vdev || !vdev->v_read)
+                       continue;

-                       size = BP_GET_PSIZE(bp);
+               size = BP_GET_PSIZE(bp);
+               if (vdev->v_read == vdev_raidz_read) {
                        align = 1ULL << vdev->v_top->v_ashift;
                        if (P2PHASE(size, align) != 0)
                                size = P2ROUNDUP(size, align);
%
% do size rounding up only if v_read == vdev_raidz_read,
% because only vdev_raidz_read requires that.
% vdev_read_phys requires only 512 byte granularity.
%
-                       if (size != BP_GET_PSIZE(bp) || cpfunc != 
ZIO_COMPRESS_OFF)
-                               pbuf = zfs_alloc(size);
-                       else
-                               pbuf = buf;
+               }
+               if (size != BP_GET_PSIZE(bp) || cpfunc != ZIO_COMPRESS_OFF)
+                       pbuf = zfs_alloc(size);
+               else
+                       pbuf = buf;
%
% So use temporary buffer only if either the block is compressed or
% vdev_raidz_read requires a larger buffer
%

+               if (DVA_GET_GANG(dva))
+                       error = zio_read_gang(spa, bp, pbuf);
+               else
                        error = vdev->v_read(vdev, bp, pbuf, offset, size);
-                       if (error == 0) {
-                               if (cpfunc != ZIO_COMPRESS_OFF) {
-                                       error = zio_decompress_data(cpfunc,
-                                           pbuf, BP_GET_PSIZE(bp), buf,
-                                           BP_GET_LSIZE(bp));
-                               } else if (size != BP_GET_PSIZE(bp)) {
-                                       bcopy(pbuf, buf, BP_GET_PSIZE(bp));
-                               }
-                       }
-                       if (buf != pbuf)
-                               zfs_free(pbuf, size);
-                       if (error != 0)
-                               continue;
+               if (error == 0) {
+                       if (cpfunc != ZIO_COMPRESS_OFF)
+                               error = zio_decompress_data(cpfunc, pbuf,
+                                   BP_GET_PSIZE(bp), buf, BP_GET_LSIZE(bp));
+                       else if (size != BP_GET_PSIZE(bp))
+                               bcopy(pbuf, buf, BP_GET_PSIZE(bp));
                }
%
% Decompression is now done for the gang blocks too.
% Ditto for the use of larger buffers in the raidz case.
%
-               error = 0;
-               break;
+               if (buf != pbuf)
+                       zfs_free(pbuf, size);
+               if (error == 0)
+                       break;
        }
        if (error != 0)
                printf("ZFS: i/o error - all block copies unavailable\n");



-- 
Andriy Gapon
_______________________________________________
freebsd-current@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/freebsd-current
To unsubscribe, send any mail to "freebsd-current-unsubscr...@freebsd.org"

Reply via email to