On Fri, 22 May 2015 11:18:33 -0700 Ming Lin <m...@kernel.org> wrote:

> From: Kent Overstreet <kent.overstr...@gmail.com>
> 
> The way the block layer is currently written, it goes to great lengths
> to avoid having to split bios; upper layer code (such as bio_add_page())
> checks what the underlying device can handle and tries to always create
> bios that don't need to be split.
> 
> But this approach becomes unwieldy and eventually breaks down with
> stacked devices and devices with dynamic limits, and it adds a lot of
> complexity. If the block layer could split bios as needed, we could
> eliminate a lot of complexity elsewhere - particularly in stacked
> drivers. Code that creates bios can then create whatever size bios are
> convenient, and more importantly stacked drivers don't have to deal with
> both their own bio size limitations and the limitations of the
> (potentially multiple) devices underneath them.  In the future this will
> let us delete merge_bvec_fn and a bunch of other code.
> 
> We do this by adding calls to blk_queue_split() to the various
> make_request functions that need it - a few can already handle arbitrary
> size bios. Note that we add the call _after_ any call to
> blk_queue_bounce(); this means that blk_queue_split() and
> blk_recalc_rq_segments() don't need to be concerned with bouncing
> affecting segment merging.
> 
> Some make_request_fn() callbacks were simple enough to audit and verify
> they don't need blk_queue_split() calls. The skipped ones are:
> 
>  * nfhd_make_request (arch/m68k/emu/nfblock.c)
>  * axon_ram_make_request (arch/powerpc/sysdev/axonram.c)
>  * simdisk_make_request (arch/xtensa/platforms/iss/simdisk.c)
>  * brd_make_request (ramdisk - drivers/block/brd.c)
>  * mtip_submit_request (drivers/block/mtip32xx/mtip32xx.c)
>  * loop_make_request
>  * null_queue_bio
>  * bcache's make_request fns
> 
> Some others are almost certainly safe to remove now, but will be left
> for future patches.
> 
> Cc: Jens Axboe <ax...@kernel.dk>
> Cc: Christoph Hellwig <h...@infradead.org>
> Cc: Al Viro <v...@zeniv.linux.org.uk>
> Cc: Ming Lei <ming....@canonical.com>
> Cc: Neil Brown <ne...@suse.de>
> Cc: Alasdair Kergon <a...@redhat.com>
> Cc: Mike Snitzer <snit...@redhat.com>
> Cc: dm-de...@redhat.com
> Cc: Lars Ellenberg <drbd-...@lists.linbit.com>
> Cc: drbd-u...@lists.linbit.com
> Cc: Jiri Kosina <jkos...@suse.cz>
> Cc: Geoff Levand <ge...@infradead.org>
> Cc: Jim Paris <j...@jtan.com>
> Cc: Joshua Morris <josh.h.mor...@us.ibm.com>
> Cc: Philip Kelleher <pjk1...@linux.vnet.ibm.com>
> Cc: Minchan Kim <minc...@kernel.org>
> Cc: Nitin Gupta <ngu...@vflare.org>
> Cc: Oleg Drokin <oleg.dro...@intel.com>
> Cc: Andreas Dilger <andreas.dil...@intel.com>
> Signed-off-by: Kent Overstreet <kent.overstr...@gmail.com>
> [dpark: skip more mq-based drivers, resolve merge conflicts, etc.]
> Signed-off-by: Dongsu Park <dp...@posteo.net>
> Signed-off-by: Ming Lin <m...@kernel.org>


Acked-by: NeilBrown <ne...@suse.de>

For the 'md/md.c' bits.

Thanks,
NeilBrown


> ---
>  block/blk-core.c                            |  19 ++--
>  block/blk-merge.c                           | 159 
> ++++++++++++++++++++++++++--
>  block/blk-mq.c                              |   4 +
>  drivers/block/drbd/drbd_req.c               |   2 +
>  drivers/block/pktcdvd.c                     |   6 +-
>  drivers/block/ps3vram.c                     |   2 +
>  drivers/block/rsxx/dev.c                    |   2 +
>  drivers/block/umem.c                        |   2 +
>  drivers/block/zram/zram_drv.c               |   2 +
>  drivers/md/dm.c                             |   2 +
>  drivers/md/md.c                             |   2 +
>  drivers/s390/block/dcssblk.c                |   2 +
>  drivers/s390/block/xpram.c                  |   2 +
>  drivers/staging/lustre/lustre/llite/lloop.c |   2 +
>  include/linux/blkdev.h                      |   3 +
>  15 files changed, 189 insertions(+), 22 deletions(-)
> 
> diff --git a/block/blk-core.c b/block/blk-core.c
> index 7871603..fbbb337 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -619,6 +619,10 @@ struct request_queue *blk_alloc_queue_node(gfp_t 
> gfp_mask, int node_id)
>       if (q->id < 0)
>               goto fail_q;
>  
> +     q->bio_split = bioset_create(BIO_POOL_SIZE, 0);
> +     if (!q->bio_split)
> +             goto fail_id;
> +
>       q->backing_dev_info.ra_pages =
>                       (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
>       q->backing_dev_info.state = 0;
> @@ -628,7 +632,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t 
> gfp_mask, int node_id)
>  
>       err = bdi_init(&q->backing_dev_info);
>       if (err)
> -             goto fail_id;
> +             goto fail_split;
>  
>       setup_timer(&q->backing_dev_info.laptop_mode_wb_timer,
>                   laptop_mode_timer_fn, (unsigned long) q);
> @@ -670,6 +674,8 @@ struct request_queue *blk_alloc_queue_node(gfp_t 
> gfp_mask, int node_id)
>  
>  fail_bdi:
>       bdi_destroy(&q->backing_dev_info);
> +fail_split:
> +     bioset_free(q->bio_split);
>  fail_id:
>       ida_simple_remove(&blk_queue_ida, q->id);
>  fail_q:
> @@ -1586,6 +1592,8 @@ void blk_queue_bio(struct request_queue *q, struct bio 
> *bio)
>       struct request *req;
>       unsigned int request_count = 0;
>  
> +     blk_queue_split(q, &bio, q->bio_split);
> +
>       /*
>        * low level driver can indicate that it wants pages above a
>        * certain limit bounced to low memory (ie for highmem, or even
> @@ -1809,15 +1817,6 @@ generic_make_request_checks(struct bio *bio)
>               goto end_io;
>       }
>  
> -     if (likely(bio_is_rw(bio) &&
> -                nr_sectors > queue_max_hw_sectors(q))) {
> -             printk(KERN_ERR "bio too big device %s (%u > %u)\n",
> -                    bdevname(bio->bi_bdev, b),
> -                    bio_sectors(bio),
> -                    queue_max_hw_sectors(q));
> -             goto end_io;
> -     }
> -
>       part = bio->bi_bdev->bd_part;
>       if (should_fail_request(part, bio->bi_iter.bi_size) ||
>           should_fail_request(&part_to_disk(part)->part0,
> diff --git a/block/blk-merge.c b/block/blk-merge.c
> index fd3fee8..dc14255 100644
> --- a/block/blk-merge.c
> +++ b/block/blk-merge.c
> @@ -9,12 +9,158 @@
>  
>  #include "blk.h"
>  
> +static struct bio *blk_bio_discard_split(struct request_queue *q,
> +                                      struct bio *bio,
> +                                      struct bio_set *bs)
> +{
> +     unsigned int max_discard_sectors, granularity;
> +     int alignment;
> +     sector_t tmp;
> +     unsigned split_sectors;
> +
> +     /* Zero-sector (unknown) and one-sector granularities are the same.  */
> +     granularity = max(q->limits.discard_granularity >> 9, 1U);
> +
> +     max_discard_sectors = min(q->limits.max_discard_sectors, UINT_MAX >> 9);
> +     max_discard_sectors -= max_discard_sectors % granularity;
> +
> +     if (unlikely(!max_discard_sectors)) {
> +             /* XXX: warn */
> +             return NULL;
> +     }
> +
> +     if (bio_sectors(bio) <= max_discard_sectors)
> +             return NULL;
> +
> +     split_sectors = max_discard_sectors;
> +
> +     /*
> +      * If the next starting sector would be misaligned, stop the discard at
> +      * the previous aligned sector.
> +      */
> +     alignment = (q->limits.discard_alignment >> 9) % granularity;
> +
> +     tmp = bio->bi_iter.bi_sector + split_sectors - alignment;
> +     tmp = sector_div(tmp, granularity);
> +
> +     if (split_sectors > tmp)
> +             split_sectors -= tmp;
> +
> +     return bio_split(bio, split_sectors, GFP_NOIO, bs);
> +}
> +
> +static struct bio *blk_bio_write_same_split(struct request_queue *q,
> +                                         struct bio *bio,
> +                                         struct bio_set *bs)
> +{
> +     if (!q->limits.max_write_same_sectors)
> +             return NULL;
> +
> +     if (bio_sectors(bio) <= q->limits.max_write_same_sectors)
> +             return NULL;
> +
> +     return bio_split(bio, q->limits.max_write_same_sectors, GFP_NOIO, bs);
> +}
> +
> +static struct bio *blk_bio_segment_split(struct request_queue *q,
> +                                      struct bio *bio,
> +                                      struct bio_set *bs)
> +{
> +     struct bio *split;
> +     struct bio_vec bv, bvprv;
> +     struct bvec_iter iter;
> +     unsigned seg_size = 0, nsegs = 0;
> +     int prev = 0;
> +
> +     struct bvec_merge_data bvm = {
> +             .bi_bdev        = bio->bi_bdev,
> +             .bi_sector      = bio->bi_iter.bi_sector,
> +             .bi_size        = 0,
> +             .bi_rw          = bio->bi_rw,
> +     };
> +
> +     bio_for_each_segment(bv, bio, iter) {
> +             if (q->merge_bvec_fn &&
> +                 q->merge_bvec_fn(q, &bvm, &bv) < (int) bv.bv_len)
> +                     goto split;
> +
> +             bvm.bi_size += bv.bv_len;
> +
> +             if (bvm.bi_size >> 9 > queue_max_sectors(q))
> +                     goto split;
> +
> +             /*
> +              * If the queue doesn't support SG gaps and adding this
> +              * offset would create a gap, disallow it.
> +              */
> +             if (q->queue_flags & (1 << QUEUE_FLAG_SG_GAPS) &&
> +                 prev && bvec_gap_to_prev(&bvprv, bv.bv_offset))
> +                     goto split;
> +
> +             if (prev && blk_queue_cluster(q)) {
> +                     if (seg_size + bv.bv_len > queue_max_segment_size(q))
> +                             goto new_segment;
> +                     if (!BIOVEC_PHYS_MERGEABLE(&bvprv, &bv))
> +                             goto new_segment;
> +                     if (!BIOVEC_SEG_BOUNDARY(q, &bvprv, &bv))
> +                             goto new_segment;
> +
> +                     seg_size += bv.bv_len;
> +                     bvprv = bv;
> +                     prev = 1;
> +                     continue;
> +             }
> +new_segment:
> +             if (nsegs == queue_max_segments(q))
> +                     goto split;
> +
> +             nsegs++;
> +             bvprv = bv;
> +             prev = 1;
> +             seg_size = bv.bv_len;
> +     }
> +
> +     return NULL;
> +split:
> +     split = bio_clone_bioset(bio, GFP_NOIO, bs);
> +
> +     split->bi_iter.bi_size -= iter.bi_size;
> +     bio->bi_iter = iter;
> +
> +     if (bio_integrity(bio)) {
> +             bio_integrity_advance(bio, split->bi_iter.bi_size);
> +             bio_integrity_trim(split, 0, bio_sectors(split));
> +     }
> +
> +     return split;
> +}
> +
> +void blk_queue_split(struct request_queue *q, struct bio **bio,
> +                  struct bio_set *bs)
> +{
> +     struct bio *split;
> +
> +     if ((*bio)->bi_rw & REQ_DISCARD)
> +             split = blk_bio_discard_split(q, *bio, bs);
> +     else if ((*bio)->bi_rw & REQ_WRITE_SAME)
> +             split = blk_bio_write_same_split(q, *bio, bs);
> +     else
> +             split = blk_bio_segment_split(q, *bio, q->bio_split);
> +
> +     if (split) {
> +             bio_chain(split, *bio);
> +             generic_make_request(*bio);
> +             *bio = split;
> +     }
> +}
> +EXPORT_SYMBOL(blk_queue_split);
> +
>  static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
>                                            struct bio *bio,
>                                            bool no_sg_merge)
>  {
>       struct bio_vec bv, bvprv = { NULL };
> -     int cluster, high, highprv = 1;
> +     int cluster, prev = 0;
>       unsigned int seg_size, nr_phys_segs;
>       struct bio *fbio, *bbio;
>       struct bvec_iter iter;
> @@ -36,7 +182,6 @@ static unsigned int __blk_recalc_rq_segments(struct 
> request_queue *q,
>       cluster = blk_queue_cluster(q);
>       seg_size = 0;
>       nr_phys_segs = 0;
> -     high = 0;
>       for_each_bio(bio) {
>               bio_for_each_segment(bv, bio, iter) {
>                       /*
> @@ -46,13 +191,7 @@ static unsigned int __blk_recalc_rq_segments(struct 
> request_queue *q,
>                       if (no_sg_merge)
>                               goto new_segment;
>  
> -                     /*
> -                      * the trick here is making sure that a high page is
> -                      * never considered part of another segment, since
> -                      * that might change with the bounce page.
> -                      */
> -                     high = page_to_pfn(bv.bv_page) > queue_bounce_pfn(q);
> -                     if (!high && !highprv && cluster) {
> +                     if (prev && cluster) {
>                               if (seg_size + bv.bv_len
>                                   > queue_max_segment_size(q))
>                                       goto new_segment;
> @@ -72,8 +211,8 @@ new_segment:
>  
>                       nr_phys_segs++;
>                       bvprv = bv;
> +                     prev = 1;
>                       seg_size = bv.bv_len;
> -                     highprv = high;
>               }
>               bbio = bio;
>       }
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index e68b71b..e7fae76 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -1256,6 +1256,8 @@ static void blk_mq_make_request(struct request_queue 
> *q, struct bio *bio)
>               return;
>       }
>  
> +     blk_queue_split(q, &bio, q->bio_split);
> +
>       rq = blk_mq_map_request(q, bio, &data);
>       if (unlikely(!rq))
>               return;
> @@ -1339,6 +1341,8 @@ static void blk_sq_make_request(struct request_queue 
> *q, struct bio *bio)
>               return;
>       }
>  
> +     blk_queue_split(q, &bio, q->bio_split);
> +
>       if (use_plug && !blk_queue_nomerges(q) &&
>           blk_attempt_plug_merge(q, bio, &request_count))
>               return;
> diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
> index 3907202..a6265bc 100644
> --- a/drivers/block/drbd/drbd_req.c
> +++ b/drivers/block/drbd/drbd_req.c
> @@ -1497,6 +1497,8 @@ void drbd_make_request(struct request_queue *q, struct 
> bio *bio)
>       struct drbd_device *device = (struct drbd_device *) q->queuedata;
>       unsigned long start_jif;
>  
> +     blk_queue_split(q, &bio, q->bio_split);
> +
>       start_jif = jiffies;
>  
>       /*
> diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
> index 09e628da..ea10bd9 100644
> --- a/drivers/block/pktcdvd.c
> +++ b/drivers/block/pktcdvd.c
> @@ -2446,6 +2446,10 @@ static void pkt_make_request(struct request_queue *q, 
> struct bio *bio)
>       char b[BDEVNAME_SIZE];
>       struct bio *split;
>  
> +     blk_queue_bounce(q, &bio);
> +
> +     blk_queue_split(q, &bio, q->bio_split);
> +
>       pd = q->queuedata;
>       if (!pd) {
>               pr_err("%s incorrect request queue\n",
> @@ -2476,8 +2480,6 @@ static void pkt_make_request(struct request_queue *q, 
> struct bio *bio)
>               goto end_io;
>       }
>  
> -     blk_queue_bounce(q, &bio);
> -
>       do {
>               sector_t zone = get_zone(bio->bi_iter.bi_sector, pd);
>               sector_t last_zone = get_zone(bio_end_sector(bio) - 1, pd);
> diff --git a/drivers/block/ps3vram.c b/drivers/block/ps3vram.c
> index ef45cfb..e32e799 100644
> --- a/drivers/block/ps3vram.c
> +++ b/drivers/block/ps3vram.c
> @@ -605,6 +605,8 @@ static void ps3vram_make_request(struct request_queue *q, 
> struct bio *bio)
>  
>       dev_dbg(&dev->core, "%s\n", __func__);
>  
> +     blk_queue_split(q, &bio, q->bio_split);
> +
>       spin_lock_irq(&priv->lock);
>       busy = !bio_list_empty(&priv->list);
>       bio_list_add(&priv->list, bio);
> diff --git a/drivers/block/rsxx/dev.c b/drivers/block/rsxx/dev.c
> index ac8c62c..50ef199 100644
> --- a/drivers/block/rsxx/dev.c
> +++ b/drivers/block/rsxx/dev.c
> @@ -148,6 +148,8 @@ static void rsxx_make_request(struct request_queue *q, 
> struct bio *bio)
>       struct rsxx_bio_meta *bio_meta;
>       int st = -EINVAL;
>  
> +     blk_queue_split(q, &bio, q->bio_split);
> +
>       might_sleep();
>  
>       if (!card)
> diff --git a/drivers/block/umem.c b/drivers/block/umem.c
> index 4cf81b5..13d577c 100644
> --- a/drivers/block/umem.c
> +++ b/drivers/block/umem.c
> @@ -531,6 +531,8 @@ static void mm_make_request(struct request_queue *q, 
> struct bio *bio)
>                (unsigned long long)bio->bi_iter.bi_sector,
>                bio->bi_iter.bi_size);
>  
> +     blk_queue_split(q, &bio, q->bio_split);
> +
>       spin_lock_irq(&card->lock);
>       *card->biotail = bio;
>       bio->bi_next = NULL;
> diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
> index 8dcbced..36a004e 100644
> --- a/drivers/block/zram/zram_drv.c
> +++ b/drivers/block/zram/zram_drv.c
> @@ -981,6 +981,8 @@ static void zram_make_request(struct request_queue 
> *queue, struct bio *bio)
>       if (unlikely(!zram_meta_get(zram)))
>               goto error;
>  
> +     blk_queue_split(queue, &bio, queue->bio_split);
> +
>       if (!valid_io_request(zram, bio->bi_iter.bi_sector,
>                                       bio->bi_iter.bi_size)) {
>               atomic64_inc(&zram->stats.invalid_io);
> diff --git a/drivers/md/dm.c b/drivers/md/dm.c
> index a930b72..34f6063 100644
> --- a/drivers/md/dm.c
> +++ b/drivers/md/dm.c
> @@ -1784,6 +1784,8 @@ static void dm_make_request(struct request_queue *q, 
> struct bio *bio)
>  
>       map = dm_get_live_table(md, &srcu_idx);
>  
> +     blk_queue_split(q, &bio, q->bio_split);
> +
>       generic_start_io_acct(rw, bio_sectors(bio), &dm_disk(md)->part0);
>  
>       /* if we're suspended, we have to queue this io for later */
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index 593a024..046b3c9 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -257,6 +257,8 @@ static void md_make_request(struct request_queue *q, 
> struct bio *bio)
>       unsigned int sectors;
>       int cpu;
>  
> +     blk_queue_split(q, &bio, q->bio_split);
> +
>       if (mddev == NULL || mddev->pers == NULL
>           || !mddev->ready) {
>               bio_io_error(bio);
> diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c
> index da21281..267ca3a 100644
> --- a/drivers/s390/block/dcssblk.c
> +++ b/drivers/s390/block/dcssblk.c
> @@ -826,6 +826,8 @@ dcssblk_make_request(struct request_queue *q, struct bio 
> *bio)
>       unsigned long source_addr;
>       unsigned long bytes_done;
>  
> +     blk_queue_split(q, &bio, q->bio_split);
> +
>       bytes_done = 0;
>       dev_info = bio->bi_bdev->bd_disk->private_data;
>       if (dev_info == NULL)
> diff --git a/drivers/s390/block/xpram.c b/drivers/s390/block/xpram.c
> index 7d4e939..1305ed3 100644
> --- a/drivers/s390/block/xpram.c
> +++ b/drivers/s390/block/xpram.c
> @@ -190,6 +190,8 @@ static void xpram_make_request(struct request_queue *q, 
> struct bio *bio)
>       unsigned long page_addr;
>       unsigned long bytes;
>  
> +     blk_queue_split(q, &bio, q->bio_split);
> +
>       if ((bio->bi_iter.bi_sector & 7) != 0 ||
>           (bio->bi_iter.bi_size & 4095) != 0)
>               /* Request is not page-aligned. */
> diff --git a/drivers/staging/lustre/lustre/llite/lloop.c 
> b/drivers/staging/lustre/lustre/llite/lloop.c
> index 413a840..a8645a9 100644
> --- a/drivers/staging/lustre/lustre/llite/lloop.c
> +++ b/drivers/staging/lustre/lustre/llite/lloop.c
> @@ -340,6 +340,8 @@ static void loop_make_request(struct request_queue *q, 
> struct bio *old_bio)
>       int rw = bio_rw(old_bio);
>       int inactive;
>  
> +     blk_queue_split(q, &old_bio, q->bio_split);
> +
>       if (!lo)
>               goto err;
>  
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index 7f9a516..93b81a2 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -488,6 +488,7 @@ struct request_queue {
>  
>       struct blk_mq_tag_set   *tag_set;
>       struct list_head        tag_set_list;
> +     struct bio_set          *bio_split;
>  };
>  
>  #define QUEUE_FLAG_QUEUED    1       /* uses generic tag queueing */
> @@ -812,6 +813,8 @@ extern void blk_rq_unprep_clone(struct request *rq);
>  extern int blk_insert_cloned_request(struct request_queue *q,
>                                    struct request *rq);
>  extern void blk_delay_queue(struct request_queue *, unsigned long);
> +extern void blk_queue_split(struct request_queue *, struct bio **,
> +                         struct bio_set *);
>  extern void blk_recount_segments(struct request_queue *, struct bio *);
>  extern int scsi_verify_blk_ioctl(struct block_device *, unsigned int);
>  extern int scsi_cmd_blk_ioctl(struct block_device *, fmode_t,

Attachment: pgpMmOnX1K3YE.pgp
Description: OpenPGP digital signature

Reply via email to