[dm-devel] [PATCH 3/3] dm-zoned: properly handle backing device failure

Dmitry Fomichev Mon, 05 Aug 2019 18:12:24 -0700

dm-zoned is observed to lock up or livelock in case of hardware
failure or some misconfiguration of the backing zoned device.


This patch adds a new function in the target code that checks the
status of the backing device request queue. If it goes to dying state,
the health check code marks dm-zoned DM target read-only and begins
to reject any further incoming I/O. In order to timely detect backing
device failure, this new function is called in the request mapping
path, at the beginning of every reclaim run and before performing any
metadata I/O.

The proper way out of this situation is to do

dmsetup remove <dm-zoned target>

and recreate the target when the problem with the backing device
is resolved.

Fixes: 3b1a94c88b79 ("dm zoned: drive-managed zoned block device target")
Cc: [email protected]
Signed-off-by: Dmitry Fomichev <[email protected]>
---
 drivers/md/dm-zoned-metadata.c | 51 +++++++++++++++++++++++++++-------
 drivers/md/dm-zoned-reclaim.c  | 18 ++++++++++--
 drivers/md/dm-zoned-target.c   | 50 +++++++++++++++++++++++++++++++--
 drivers/md/dm-zoned.h          | 10 +++++++
 4 files changed, 115 insertions(+), 14 deletions(-)

diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c
index 6b7fbbd735ef..2a5bc51fd6d5 100644
--- a/drivers/md/dm-zoned-metadata.c
+++ b/drivers/md/dm-zoned-metadata.c
@@ -403,15 +403,18 @@ static struct dmz_mblock *dmz_get_mblock_slow(struct 
dmz_metadata *zmd,
        sector_t block = zmd->sb[zmd->mblk_primary].block + mblk_no;
        struct bio *bio;
 
+       if (dmz_bdev_is_dying(zmd->dev))
+               return ERR_PTR(-EIO);
+
        /* Get a new block and a BIO to read it */
        mblk = dmz_alloc_mblock(zmd, mblk_no);
        if (!mblk)
-               return NULL;
+               return ERR_PTR(-ENOMEM);
 
        bio = bio_alloc(GFP_NOIO, 1);
        if (!bio) {
                dmz_free_mblock(zmd, mblk);
-               return NULL;
+               return ERR_PTR(-ENOMEM);
        }
 
        spin_lock(&zmd->mblk_lock);
@@ -542,8 +545,8 @@ static struct dmz_mblock *dmz_get_mblock(struct 
dmz_metadata *zmd,
        if (!mblk) {
                /* Cache miss: read the block from disk */
                mblk = dmz_get_mblock_slow(zmd, mblk_no);
-               if (!mblk)
-                       return ERR_PTR(-ENOMEM);
+               if (IS_ERR(mblk))
+                       return mblk;
        }
 
        /* Wait for on-going read I/O and check for error */
@@ -571,16 +574,19 @@ static void dmz_dirty_mblock(struct dmz_metadata *zmd, 
struct dmz_mblock *mblk)
 /*
  * Issue a metadata block write BIO.
  */
-static void dmz_write_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk,
-                            unsigned int set)
+static int dmz_write_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk,
+                           unsigned int set)
 {
        sector_t block = zmd->sb[set].block + mblk->no;
        struct bio *bio;
 
+       if (dmz_bdev_is_dying(zmd->dev))
+               return -EIO;
+
        bio = bio_alloc(GFP_NOIO, 1);
        if (!bio) {
                set_bit(DMZ_META_ERROR, &mblk->state);
-               return;
+               return -ENOMEM;
        }
 
        set_bit(DMZ_META_WRITING, &mblk->state);
@@ -592,6 +598,8 @@ static void dmz_write_mblock(struct dmz_metadata *zmd, 
struct dmz_mblock *mblk,
        bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_META | REQ_PRIO);
        bio_add_page(bio, mblk->page, DMZ_BLOCK_SIZE, 0);
        submit_bio(bio);
+
+       return 0;
 }
 
 /*
@@ -603,6 +611,9 @@ static int dmz_rdwr_block(struct dmz_metadata *zmd, int op, 
sector_t block,
        struct bio *bio;
        int ret;
 
+       if (dmz_bdev_is_dying(zmd->dev))
+               return -EIO;
+
        bio = bio_alloc(GFP_NOIO, 1);
        if (!bio)
                return -ENOMEM;
@@ -660,22 +671,29 @@ static int dmz_write_dirty_mblocks(struct dmz_metadata 
*zmd,
 {
        struct dmz_mblock *mblk;
        struct blk_plug plug;
-       int ret = 0;
+       int ret = 0, nr_mblks_submitted = 0;
 
        /* Issue writes */
        blk_start_plug(&plug);
-       list_for_each_entry(mblk, write_list, link)
-               dmz_write_mblock(zmd, mblk, set);
+       list_for_each_entry(mblk, write_list, link) {
+               ret = dmz_write_mblock(zmd, mblk, set);
+               if (ret)
+                       break;
+               nr_mblks_submitted++;
+       }
        blk_finish_plug(&plug);
 
        /* Wait for completion */
        list_for_each_entry(mblk, write_list, link) {
+               if (!nr_mblks_submitted)
+                       break;
                wait_on_bit_io(&mblk->state, DMZ_META_WRITING,
                               TASK_UNINTERRUPTIBLE);
                if (test_bit(DMZ_META_ERROR, &mblk->state)) {
                        clear_bit(DMZ_META_ERROR, &mblk->state);
                        ret = -EIO;
                }
+               nr_mblks_submitted--;
        }
 
        /* Flush drive cache (this will also sync data) */
@@ -737,6 +755,11 @@ int dmz_flush_metadata(struct dmz_metadata *zmd)
         */
        dmz_lock_flush(zmd);
 
+       if (dmz_bdev_is_dying(zmd->dev)) {
+               ret = -EIO;
+               goto out;
+       }
+
        /* Get dirty blocks */
        spin_lock(&zmd->mblk_lock);
        list_splice_init(&zmd->mblk_dirty_list, &write_list);
@@ -1632,6 +1655,10 @@ struct dm_zone *dmz_get_chunk_mapping(struct 
dmz_metadata *zmd, unsigned int chu
                /* Allocate a random zone */
                dzone = dmz_alloc_zone(zmd, DMZ_ALLOC_RND);
                if (!dzone) {
+                       if (dmz_bdev_is_dying(zmd->dev)) {
+                               dzone = ERR_PTR(-EIO);
+                               goto out;
+                       }
                        dmz_wait_for_free_zones(zmd);
                        goto again;
                }
@@ -1729,6 +1756,10 @@ struct dm_zone *dmz_get_chunk_buffer(struct dmz_metadata 
*zmd,
        /* Allocate a random zone */
        bzone = dmz_alloc_zone(zmd, DMZ_ALLOC_RND);
        if (!bzone) {
+               if (dmz_bdev_is_dying(zmd->dev)) {
+                       bzone = ERR_PTR(-EIO);
+                       goto out;
+               }
                dmz_wait_for_free_zones(zmd);
                goto again;
        }
diff --git a/drivers/md/dm-zoned-reclaim.c b/drivers/md/dm-zoned-reclaim.c
index 26e34493a2db..d240d7ca8a8a 100644
--- a/drivers/md/dm-zoned-reclaim.c
+++ b/drivers/md/dm-zoned-reclaim.c
@@ -38,7 +38,7 @@ enum {
 /*
  * Number of seconds of target BIO inactivity to consider the target idle.
  */
-#define DMZ_IDLE_PERIOD                (10UL * HZ)
+#define DMZ_IDLE_PERIOD                        (10UL * HZ)
 
 /*
  * Percentage of unmapped (free) random zones below which reclaim starts
@@ -135,6 +135,9 @@ static int dmz_reclaim_copy(struct dmz_reclaim *zrc,
                set_bit(DM_KCOPYD_WRITE_SEQ, &flags);
 
        while (block < end_block) {
+               if (dev->flags & DMZ_BDEV_DYING)
+                       return -EIO;
+
                /* Get a valid region from the source zone */
                ret = dmz_first_valid_block(zmd, src_zone, &block);
                if (ret <= 0)
@@ -452,6 +455,9 @@ static void dmz_reclaim_work(struct work_struct *work)
        unsigned int p_unmap_rnd;
        int ret;
 
+       if (dmz_bdev_is_dying(zrc->dev))
+               return;
+
        if (!dmz_should_reclaim(zrc)) {
                mod_delayed_work(zrc->wq, &zrc->work, DMZ_IDLE_PERIOD);
                return;
@@ -481,8 +487,16 @@ static void dmz_reclaim_work(struct work_struct *work)
                      p_unmap_rnd, nr_unmap_rnd, nr_rnd);
 
        ret = dmz_do_reclaim(zrc);
-       if (ret)
+       if (ret) {
                dmz_dev_debug(zrc->dev, "Reclaim error %d\n", ret);
+               if (ret == -EIO)
+                       /*
+                        * LLD might be performing some error handling sequence
+                        * at the underlying device. To not interfere, do not
+                        * attempt to schedule the next reclaim run immediately.
+                        */
+                       return;
+       }
 
        dmz_schedule_reclaim(zrc);
 }
diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c
index c1992034c099..c40c18f7fd64 100644
--- a/drivers/md/dm-zoned-target.c
+++ b/drivers/md/dm-zoned-target.c
@@ -134,6 +134,8 @@ static int dmz_submit_bio(struct dmz_target *dmz, struct 
dm_zone *zone,
 
        refcount_inc(&bioctx->ref);
        generic_make_request(clone);
+       if (clone->bi_status == BLK_STS_IOERR)
+               return -EIO;
 
        if (bio_op(bio) == REQ_OP_WRITE && dmz_is_seq(zone))
                zone->wp_block += nr_blocks;
@@ -278,8 +280,8 @@ static int dmz_handle_buffered_write(struct dmz_target *dmz,
 
        /* Get the buffer zone. One will be allocated if needed */
        bzone = dmz_get_chunk_buffer(zmd, zone);
-       if (!bzone)
-               return -ENOSPC;
+       if (IS_ERR(bzone))
+               return PTR_ERR(bzone);
 
        if (dmz_is_readonly(bzone))
                return -EROFS;
@@ -390,6 +392,11 @@ static void dmz_handle_bio(struct dmz_target *dmz, struct 
dm_chunk_work *cw,
 
        dmz_lock_metadata(zmd);
 
+       if (dmz->dev->flags & DMZ_BDEV_DYING) {
+               ret = -EIO;
+               goto out;
+       }
+
        /*
         * Get the data zone mapping the chunk. There may be no
         * mapping for read and discard. If a mapping is obtained,
@@ -494,6 +501,8 @@ static void dmz_flush_work(struct work_struct *work)
 
        /* Flush dirty metadata blocks */
        ret = dmz_flush_metadata(dmz->metadata);
+       if (ret)
+               dmz_dev_debug(dmz->dev, "Metadata flush failed, rc=%d\n", ret);
 
        /* Process queued flush requests */
        while (1) {
@@ -557,6 +566,37 @@ static int dmz_queue_chunk_work(struct dmz_target *dmz, 
struct bio *bio)
        return ret;
 }
 
+/*
+ * Check the backing device availability. If it's on the way out,
+ * start failing I/O. Reclaim and metadata components also call this
+ * function to cleanly abort operation in the event of such failure.
+ */
+bool dmz_bdev_is_dying(struct dmz_dev *dmz_dev)
+{
+       struct gendisk *disk;
+       bool dying = dmz_dev->flags & DMZ_BDEV_DYING;
+
+       if (!dying) {
+               disk = dmz_dev->bdev->bd_disk;
+               if (blk_queue_dying(bdev_get_queue(dmz_dev->bdev))) {
+                       dmz_dev_warn(dmz_dev, "Backing device queue dying");
+                       dying = true;
+               } else if (disk->fops->check_events) {
+                       if (disk->fops->check_events(disk, 0) &
+                                       DISK_EVENT_MEDIA_CHANGE) {
+                               dmz_dev_warn(dmz_dev, "Backing device offline");
+                               dying = true;
+                       }
+               }
+               if (dying) {
+                       dmz_dev->flags |= DMZ_BDEV_DYING;
+                       set_disk_ro(dmz_dev->bdev->bd_disk, 1);
+               }
+       }
+
+       return dying;
+}
+
 /*
  * Process a new BIO.
  */
@@ -570,6 +610,9 @@ static int dmz_map(struct dm_target *ti, struct bio *bio)
        sector_t chunk_sector;
        int ret;
 
+       if (dmz_bdev_is_dying(dmz->dev))
+               return DM_MAPIO_KILL;
+
        dmz_dev_debug(dev, "BIO op %d sector %llu + %u => chunk %llu, block 
%llu, %u blocks",
                      bio_op(bio), (unsigned long long)sector, nr_sectors,
                      (unsigned long long)dmz_bio_chunk(dmz->dev, bio),
@@ -866,6 +909,9 @@ static int dmz_prepare_ioctl(struct dm_target *ti, struct 
block_device **bdev)
 {
        struct dmz_target *dmz = ti->private;
 
+       if (dmz_bdev_is_dying(dmz->dev))
+               return -ENODEV;
+
        *bdev = dmz->dev->bdev;
 
        return 0;
diff --git a/drivers/md/dm-zoned.h b/drivers/md/dm-zoned.h
index 1a3b06de2c00..d8e70b0ade35 100644
--- a/drivers/md/dm-zoned.h
+++ b/drivers/md/dm-zoned.h
@@ -57,6 +57,8 @@ struct dmz_dev {
 
        unsigned int            nr_zones;
 
+       unsigned int            flags;
+
        sector_t                zone_nr_sectors;
        unsigned int            zone_nr_sectors_shift;
 
@@ -68,6 +70,9 @@ struct dmz_dev {
                                 (dev)->zone_nr_sectors_shift)
 #define dmz_chunk_block(dev, b)        ((b) & ((dev)->zone_nr_blocks - 1))
 
+/* Device flags. */
+#define DMZ_BDEV_DYING         (1 << 0)
+
 /*
  * Zone descriptor.
  */
@@ -246,4 +251,9 @@ void dmz_resume_reclaim(struct dmz_reclaim *zrc);
 void dmz_reclaim_bio_acc(struct dmz_reclaim *zrc);
 void dmz_schedule_reclaim(struct dmz_reclaim *zrc);
 
+/*
+ * Functions defined in dm-zoned-target.c
+ */
+bool dmz_bdev_is_dying(struct dmz_dev *dmz_dev);
+
 #endif /* DM_ZONED_H */
-- 
2.21.0

--
dm-devel mailing list
[email protected]
https://www.redhat.com/mailman/listinfo/dm-devel

[dm-devel] [PATCH 3/3] dm-zoned: properly handle backing device failure

Reply via email to