From: Andrey Zhadchenko <andrey.zhadche...@virtuozzo.com>

We see that virtio-blk requests from Windows Server 2022 are not aligned
to the sector size (512); specifically, the io vectors start at an offset
of 0x8 from sector alignment. If we send such unaligned requests directly
to the block device, it refuses to handle them.
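For illustration, here is a minimal user-space sketch of the sector-alignment
check that such requests fail (the helper name and the local SECTOR_SIZE /
SECTOR_MASK defines are only for the example; in the driver the same test is
done against the kernel's definitions):

#include <stdbool.h>
#include <stdint.h>
#include <sys/uio.h>

#define SECTOR_SIZE 512
#define SECTOR_MASK (SECTOR_SIZE - 1)

/*
 * A request is safe to pass straight to the block device only if every
 * io vector starts on a 512-byte boundary and covers whole sectors.
 * The 0x8-shifted vectors described above fail the very first test.
 */
static bool iov_is_sector_aligned(const struct iovec *iov, int iov_nr)
{
        int i;

        for (i = 0; i < iov_nr; i++) {
                if ((uintptr_t)iov[i].iov_base & SECTOR_MASK)
                        return false;   /* base not on a sector boundary */
                if (iov[i].iov_len & SECTOR_MASK)
                        return false;   /* length covers a partial sector */
        }
        return true;
}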
In qemu there is a "bounce buffer" technique to mitigate such problems: a
perfectly aligned intermediary buffer is allocated and the request is
converted to use it instead of the original unaligned memory. We should do
the same thing in the vhost-blk driver.

v2:
- bounce buffer uses a kvec, so the bio does not pin those pages and we
  should not release them either
- write requests with unaligned length are not supported; add a warning
  and fail such requests
- move io vectors to per-request storage so they can be used after bio
  completion
- make error handling in vhost_blk_bio_make a bit more explicit

https://virtuozzo.atlassian.net/browse/PSBM-157752

Signed-off-by: Andrey Zhadchenko <andrey.zhadche...@virtuozzo.com>
Signed-off-by: Pavel Tikhomirov <ptikhomi...@virtuozzo.com>
---
Note: My original plan was to support all kinds of unaligned requests,
including write requests with unaligned length. To emulate such requests
with aligned-only operations we need to pre-read the last (partial) sector
into memory, combine its data with the write data and do an aligned write
with the combined data (a rough user-space sketch of this read-modify-write
idea is appended at the end of this mail). Den says that such requests
should not be considered correct and that failing them is fine. Just in
case, I have an early (not yet working, not sure why) version of it saved
here:
https://bitbucket.org/virtuozzocore/vzkernel.ptikhomirov/commits/branch/vhost-blk-improvements-4
---
 drivers/vhost/blk.c | 148 ++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 137 insertions(+), 11 deletions(-)

diff --git a/drivers/vhost/blk.c b/drivers/vhost/blk.c
index 0c80967bfeee1..fdb063fafc996 100644
--- a/drivers/vhost/blk.c
+++ b/drivers/vhost/blk.c
@@ -59,6 +59,7 @@ struct vhost_blk_req {
         struct iovec *iov;
         int iov_nr;

+        void *bb;
         atomic_t bio_nr;
         struct bio **bio;

@@ -68,7 +69,7 @@ struct vhost_blk_req {
         sector_t sector;
         int bi_opf;
         u16 head;
-        long len;
+        size_t len;
         int bio_err;

         struct vhost_blk *blk;
@@ -153,7 +154,13 @@ static void vhost_blk_req_done(struct bio *bio)
                 vhost_vq_work_queue(&req->blk_vq->vq, &req->blk_vq->work);
         }

-        bio_release_pages(bio, !req->bi_opf);
+        /*
+         * Bounce buffer adds kvec to bio (instead of user backed memory),
+         * so there is no reference/pin to bio pages in this case.
+         */
+        if (!req->bb)
+                bio_release_pages(bio, !req->bi_opf);
+
         bio_put(bio);
 }

@@ -161,6 +168,12 @@ static void vhost_blk_req_cleanup(struct vhost_blk_req *req)
 {
         if (req->bio != req->inline_bio)
                 kfree(req->bio);
+
+        if (req->iov != req->blk_vq->iov)
+                kvfree(req->iov);
+
+        if (req->bb)
+                kvfree(req->bb);
 }

 static int vhost_blk_bio_make_simple(struct vhost_blk_req *req,
@@ -184,11 +197,71 @@ static int vhost_blk_bio_make_simple(struct vhost_blk_req *req,
         return 0;
 }

+inline static bool vhost_blk_iov_need_bb(struct vhost_blk_req *req)
+{
+        int i;
+
+        if (req->len & SECTOR_MASK)
+                return true;
+
+        for (i = 0; i < req->iov_nr; i++) {
+                if (((size_t)req->iov[i].iov_base) & SECTOR_MASK)
+                        return true;
+                if (req->iov[i].iov_len & SECTOR_MASK)
+                        return true;
+        }
+
+        return false;
+}
+
+static int vhost_blk_save_iov_to_req(struct vhost_blk_req *req)
+{
+        struct iovec *iov;
+
+        BUG_ON(req->iov != req->blk_vq->iov);
+
+        iov = kvmalloc(req->iov_nr * sizeof(struct iovec), GFP_KERNEL);
+        if (!iov)
+                return -ENOMEM;
+
+        if (move_iovec(req->iov, iov, req->len, req->iov_nr, req->iov_nr) < 0) {
+                kvfree(iov);
+                return -EINVAL;
+        }
+
+        req->iov = iov;
+        return 0;
+}
+
+
+static size_t vhost_blk_move_req_to_bb(struct vhost_blk_req *req)
+{
+        struct iov_iter iter;
+
+        iov_iter_init(&iter, req->bi_opf, req->iov, req->iov_nr, req->len);
+        if (copy_from_iter(req->bb, req->len, &iter) != req->len)
+                return -EINVAL;
+
+        return 0;
+}
+
+static size_t vhost_blk_move_bb_to_req(struct vhost_blk_req *req)
+{
+        struct iov_iter iter;
+
+        iov_iter_init(&iter, req->bi_opf, req->iov, req->iov_nr, req->len);
+        if (copy_to_iter(req->bb, req->len, &iter) != req->len)
+                return -EINVAL;
+
+        return 0;
+}
+
 static int vhost_blk_bio_make(struct vhost_blk_req *req,
                               struct block_device *bdev)
 {
         int nr_pages, nr_pages_total = 0, bio_nr = 0, ret, i;
         struct iov_iter iter;
+        struct kvec kv;
         struct bio *bio;
         sector_t sector = req->sector;
         unsigned long pos = 0;
@@ -196,15 +269,61 @@ static int vhost_blk_bio_make(struct vhost_blk_req *req,
         if (unlikely(req->bi_opf == REQ_OP_FLUSH))
                 return vhost_blk_bio_make_simple(req, bdev);

-        iov_iter_init(&iter, req->bi_opf, req->iov, req->iov_nr, req->len);
+        if (req->bi_opf == REQ_OP_WRITE && req->len & SECTOR_MASK) {
+                WARN_ONCE(1, "vhost-blk: write requests with unaligned len"
+                             " are not supported, len = %zu", req->len);
+                return -EINVAL;
+        }
+
+        /*
+         * Requests with unaligned io vectors are not supported by the
+         * underlying block device, let's use an aligned intermediary
+         * "bounce buffer" to make such requests work.
+         */
+        if (vhost_blk_iov_need_bb(req)) {
+                size_t len;
+
+                /*
+                 * Save io vectors for read request, as we would need to fill
+                 * them with read data from bounce buffer after bio completion
+                 * (see vhost_blk_move_bb_to_req), and vhost_blk_vq->iov can be
+                 * reused by another request at that point.
+                 */
+                if (req->bi_opf == REQ_OP_READ) {
+                        ret = vhost_blk_save_iov_to_req(req);
+                        if (ret)
+                                return ret;
+                }
+
+                len = ALIGN(req->len, SECTOR_SIZE);
+                req->bb = kvmalloc(len, GFP_KERNEL);
+                if (!req->bb) {
+                        ret = -ENOMEM;
+                        goto err_req;
+                }
+
+                if (req->bi_opf == REQ_OP_WRITE) {
+                        ret = vhost_blk_move_req_to_bb(req);
+                        if (ret)
+                                goto err_req;
+                }
+
+                kv.iov_base = req->bb;
+                kv.iov_len = len;
+                iov_iter_kvec(&iter, req->bi_opf, &kv, 1, len);
+        } else {
+                iov_iter_init(&iter, req->bi_opf, req->iov, req->iov_nr, req->len);
+        }

         nr_pages_total = iov_iter_npages(&iter, INT_MAX);
         if (nr_pages_total > NR_INLINE * BIO_MAX_VECS) {
                 req->bio = kmalloc(((nr_pages_total + BIO_MAX_VECS - 1) /
                                    BIO_MAX_VECS) * sizeof(struct bio *),
                                    GFP_KERNEL);
-                if (!req->bio)
-                        return -ENOMEM;
+                if (!req->bio) {
+                        ret = -ENOMEM;
+                        goto err_req;
+                }
         } else {
                 req->bio = req->inline_bio;
         }
@@ -215,8 +334,10 @@ static int vhost_blk_bio_make(struct vhost_blk_req *req,
                 BUG_ON(pos & SECTOR_MASK);

                 bio = bio_alloc(bdev, nr_pages, req->bi_opf, GFP_KERNEL);
-                if (!bio)
-                        goto fail;
+                if (!bio) {
+                        ret = -ENOMEM;
+                        goto err_bio;
+                }

                 bio->bi_iter.bi_sector = sector;
                 bio->bi_private = req;
@@ -225,7 +346,7 @@ static int vhost_blk_bio_make(struct vhost_blk_req *req,
                 ret = bio_iov_iter_get_pages(bio, &iter);
                 if (unlikely(ret)) {
                         bio_put(bio);
-                        goto fail;
+                        goto err_bio;
                 }

                 req->bio[bio_nr++] = bio;
@@ -237,15 +358,15 @@ static int vhost_blk_bio_make(struct vhost_blk_req *req,
         atomic_set(&req->bio_nr, bio_nr);
         return 0;

-fail:
+err_bio:
         for (i = 0; i < bio_nr; i++) {
                 bio_release_pages(req->bio[i], false);
                 bio_put(req->bio[i]);
         }
-
+err_req:
         vhost_blk_req_cleanup(req);

-        return -ENOMEM;
+        return ret;
 }

 static inline void vhost_blk_bio_send(struct vhost_blk_req *req)
@@ -329,6 +450,7 @@ static int vhost_blk_req_handle(struct vhost_virtqueue *vq,
         req->len = iov_length(vq->iov, total_iov_nr) - sizeof(status);
         req->iov_nr = move_iovec(vq->iov, req->iov, req->len, total_iov_nr,
                                  ARRAY_SIZE(blk_vq->iov));
+        req->bb = NULL;

         ret = move_iovec(vq->iov, req->status, sizeof(status), total_iov_nr,
                          ARRAY_SIZE(req->status));
@@ -456,6 +578,10 @@ static void vhost_blk_handle_host_kick(struct vhost_work *work)
                 if (!blk)
                         blk = req->blk;

+                if (!req->bio_err && req->bb && req->bi_opf == REQ_OP_READ) {
+                        if (vhost_blk_move_bb_to_req(req))
+                                req->bio_err = EINVAL;
+                }
                 vhost_blk_req_cleanup(req);

                 status = req->bio_err == 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR;
-- 
2.47.0
_______________________________________________
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel
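P.S. A rough, untested user-space sketch of the read-modify-write emulation
mentioned in the note above; pread()/pwrite() stand in for the aligned bio
submissions, write_unaligned_len() is a made-up name for the example, and
O_DIRECT buffer-alignment requirements are ignored for brevity:

#include <string.h>
#include <unistd.h>
#include <sys/types.h>

#define SECTOR_SIZE 512

/*
 * Write 'len' bytes at the sector-aligned offset 'off'; 'len' itself may
 * end in the middle of a sector, but the device only ever sees whole,
 * sector-aligned writes.
 */
static ssize_t write_unaligned_len(int fd, const char *buf, size_t len,
                                   off_t off)
{
        size_t aligned = len & ~(size_t)(SECTOR_SIZE - 1);
        size_t tail = len - aligned;
        char sector[SECTOR_SIZE];

        /* The whole-sector prefix can be written directly. */
        if (aligned && pwrite(fd, buf, aligned, off) != (ssize_t)aligned)
                return -1;
        if (!tail)
                return len;

        /*
         * Pre-read the last (partial) sector, overlay the new bytes on top
         * of the existing data, and rewrite the full sector.
         */
        if (pread(fd, sector, SECTOR_SIZE, off + aligned) != SECTOR_SIZE)
                return -1;
        memcpy(sector, buf + aligned, tail);
        if (pwrite(fd, sector, SECTOR_SIZE, off + aligned) != SECTOR_SIZE)
                return -1;

        return len;
}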