On Fri, Jun 07, 2019 at 10:10:24PM +0900, Naohiro Aota wrote:
> Currently, dev-replace copies all the device extents on the source device to
> the target device, and it also clones new incoming write I/Os from users to
> the source device into the target device.
> 
> Cloning incoming I/Os can break the sequential write rule in the target
> device. When a write is mapped into the middle of a block group, that I/O is
> directed to the middle of a zone of the target device, which breaks the
> sequential write rule.
> 
> However, the cloning function cannot be simply disabled since incoming I/Os
> targeting already copied device extents must be cloned so that the I/O is
> executed on the target device.
> 
> We cannot use dev_replace->cursor_{left,right} to determine whether a bio
> is going to a not-yet-copied region. Since there is a time gap between
> finishing btrfs_scrub_dev() and rewriting the mapping tree in
> btrfs_dev_replace_finishing(), we can have a newly allocated device extent
> which is neither cloned (by handle_ops_on_dev_replace) nor copied (by the
> dev-replace process).
> 
> So the point is to copy only already existing device extents. This patch
> introduces mark_block_group_to_copy() to mark existing block groups as
> targets of copying. Then, handle_ops_on_dev_replace() and dev-replace can
> check the flag to do their job.
> 
> This patch also handles empty regions between used extents. Since
> dev-replace is smart enough to copy only used extents on the source device,
> we have to fill the gaps to honor the sequential write rule in the target
> device.
> 
> Signed-off-by: Naohiro Aota <naohiro.a...@wdc.com>
> ---
>  fs/btrfs/ctree.h       |   1 +
>  fs/btrfs/dev-replace.c |  96 +++++++++++++++++++++++
>  fs/btrfs/extent-tree.c |  32 +++++++-
>  fs/btrfs/scrub.c       | 169 +++++++++++++++++++++++++++++++++++++++++
>  fs/btrfs/volumes.c     |  27 ++++++-
>  5 files changed, 319 insertions(+), 6 deletions(-)
> 
> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> index dad8ea5c3b99..a0be2b96117a 100644
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -639,6 +639,7 @@ struct btrfs_block_group_cache {
>       unsigned int has_caching_ctl:1;
>       unsigned int removed:1;
>       unsigned int wp_broken:1;
> +     unsigned int to_copy:1;
>  
>       int disk_cache_state;
>  
> diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
> index fbe5ea2a04ed..5011b5ce0e75 100644
> --- a/fs/btrfs/dev-replace.c
> +++ b/fs/btrfs/dev-replace.c
> @@ -263,6 +263,13 @@ static int btrfs_init_dev_replace_tgtdev(struct 
> btrfs_fs_info *fs_info,
>       device->dev_stats_valid = 1;
>       set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
>       device->fs_devices = fs_info->fs_devices;
> +     if (bdev_is_zoned(bdev)) {
> +             ret = btrfs_get_dev_zonetypes(device);
> +             if (ret) {
> +                     mutex_unlock(&fs_info->fs_devices->device_list_mutex);
> +                     goto error;
> +             }
> +     }
>       list_add(&device->dev_list, &fs_info->fs_devices->devices);
>       fs_info->fs_devices->num_devices++;
>       fs_info->fs_devices->open_devices++;
> @@ -396,6 +403,88 @@ static char* btrfs_dev_name(struct btrfs_device *device)
>               return rcu_str_deref(device->name);
>  }
>  
> +static int mark_block_group_to_copy(struct btrfs_fs_info *fs_info,
> +                                 struct btrfs_device *src_dev)
> +{
> +     struct btrfs_path *path;
> +     struct btrfs_key key;
> +     struct btrfs_key found_key;
> +     struct btrfs_root *root = fs_info->dev_root;
> +     struct btrfs_dev_extent *dev_extent = NULL;
> +     struct btrfs_block_group_cache *cache;
> +     struct extent_buffer *l;
> +     int slot;
> +     int ret;
> +     u64 chunk_offset, length;
> +
> +     path = btrfs_alloc_path();
> +     if (!path)
> +             return -ENOMEM;
> +
> +     path->reada = READA_FORWARD;
> +     path->search_commit_root = 1;
> +     path->skip_locking = 1;
> +
> +     key.objectid = src_dev->devid;
> +     key.offset = 0ull;
> +     key.type = BTRFS_DEV_EXTENT_KEY;
> +
> +     while (1) {
> +             ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
> +             if (ret < 0)
> +                     break;
> +             if (ret > 0) {
> +                     if (path->slots[0] >=
> +                         btrfs_header_nritems(path->nodes[0])) {
> +                             ret = btrfs_next_leaf(root, path);
> +                             if (ret < 0)
> +                                     break;
> +                             if (ret > 0) {
> +                                     ret = 0;
> +                                     break;
> +                             }
> +                     } else {
> +                             ret = 0;
> +                     }
> +             }
> +
> +             l = path->nodes[0];
> +             slot = path->slots[0];
> +
> +             btrfs_item_key_to_cpu(l, &found_key, slot);
> +
> +             if (found_key.objectid != src_dev->devid)
> +                     break;
> +
> +             if (found_key.type != BTRFS_DEV_EXTENT_KEY)
> +                     break;
> +
> +             if (found_key.offset < key.offset)
> +                     break;
> +
> +             dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
> +             length = btrfs_dev_extent_length(l, dev_extent);
> +
> +             chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
> +
> +             cache = btrfs_lookup_block_group(fs_info, chunk_offset);
> +             if (!cache)
> +                     goto skip;
> +
> +             cache->to_copy = 1;
> +
> +             btrfs_put_block_group(cache);
> +
> +skip:
> +             key.offset = found_key.offset + length;
> +             btrfs_release_path(path);
> +     }
> +
> +     btrfs_free_path(path);
> +
> +     return ret;
> +}
> +
>  static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
>               const char *tgtdev_name, u64 srcdevid, const char *srcdev_name,
>               int read_src)
> @@ -439,6 +528,13 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info 
> *fs_info,
>       }
>  
>       need_unlock = true;
> +
> +     mutex_lock(&fs_info->chunk_mutex);
> +     ret = mark_block_group_to_copy(fs_info, src_device);
> +     mutex_unlock(&fs_info->chunk_mutex);
> +     if (ret)
> +             return ret;
> +
>       down_write(&dev_replace->rwsem);
>       switch (dev_replace->replace_state) {
>       case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index ff4d55d6ef04..268365dd9a5d 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -29,6 +29,7 @@
>  #include "qgroup.h"
>  #include "ref-verify.h"
>  #include "rcu-string.h"
> +#include "dev-replace.h"
>  
>  #undef SCRAMBLE_DELAYED_REFS
>  
> @@ -2022,7 +2023,31 @@ int btrfs_discard_extent(struct btrfs_fs_info 
> *fs_info, u64 bytenr,
>                       if (btrfs_dev_is_sequential(stripe->dev,
>                                                   stripe->physical) &&
>                           stripe->length == stripe->dev->zone_size) {
> -                             ret = blkdev_reset_zones(stripe->dev->bdev,
> +                             struct btrfs_device *dev = stripe->dev;
> +
> +                             ret = blkdev_reset_zones(dev->bdev,
> +                                                      stripe->physical >>
> +                                                              SECTOR_SHIFT,
> +                                                      stripe->length >>
> +                                                              SECTOR_SHIFT,
> +                                                      GFP_NOFS);
> +                             if (!ret)
> +                                     discarded_bytes += stripe->length;
> +                             else
> +                                     break;
> +                             set_bit(stripe->physical >>
> +                                     dev->zone_size_shift,
> +                                     dev->empty_zones);
> +
> +                             if (!btrfs_dev_replace_is_ongoing(
> +                                         &fs_info->dev_replace) ||
> +                                 stripe->dev != fs_info->dev_replace.srcdev)
> +                                     continue;
> +
> +                             /* send to target as well */
> +                             dev = fs_info->dev_replace.tgtdev;
> +
> +                             ret = blkdev_reset_zones(dev->bdev,

This is unrelated to dev replace, isn't it?  Please make this its own patch, and
its own helper while you are at it.  Thanks,

Josef

Reply via email to