On 2015-01-30 at 09:33, Peter Lieven wrote:
this patch finally introduces multiread support to virtio-blk. While
multiwrite support was there for a long time, read support was missing.
The complete merge logic is moved into virtio-blk.c which has
been the only user of request merging ever since. This is required
to be able to merge chunks of requests and immediately invoke callbacks
for those requests. Secondly, this is required to switch to
direct invocation of coroutines which is planned at a later stage.
The following benchmarks show the performance of running fio with
4 worker threads on a local ram disk. The numbers show the average
of 10 test runs after 1 run as warmup phase.
| 4k | 64k | 4k
MB/s | rd seq | rd rand | rd seq | rd rand | wr seq | wr rand
--------------+--------+---------+--------+---------+--------+--------
master | 1221 | 1187 | 4178 | 4114 | 1745 | 1213
multiread | 1829 | 1189 | 4639 | 4110 | 1894 | 1216
Signed-off-by: Peter Lieven <p...@kamp.de>
---
hw/block/dataplane/virtio-blk.c | 8 +-
hw/block/virtio-blk.c | 288 +++++++++++++++++++++++++++-------------
include/hw/virtio/virtio-blk.h | 19 +--
trace-events | 1 +
4 files changed, 210 insertions(+), 106 deletions(-)
diff --git a/hw/block/dataplane/virtio-blk.c b/hw/block/dataplane/virtio-blk.c
index 39c5d71..93472e9 100644
--- a/hw/block/dataplane/virtio-blk.c
+++ b/hw/block/dataplane/virtio-blk.c
@@ -96,9 +96,7 @@ static void handle_notify(EventNotifier *e)
event_notifier_test_and_clear(&s->host_notifier);
blk_io_plug(s->conf->conf.blk);
for (;;) {
- MultiReqBuffer mrb = {
- .num_writes = 0,
- };
+ MultiReqBuffer mrb = {};
int ret;
/* Disable guest->host notifies to avoid unnecessary vmexits */
@@ -120,7 +118,9 @@ static void handle_notify(EventNotifier *e)
virtio_blk_handle_request(req, &mrb);
}
- virtio_submit_multiwrite(s->conf->conf.blk, &mrb);
+ if (mrb.num_reqs) {
+ virtio_submit_multireq(s->conf->conf.blk, &mrb);
+ }
if (likely(ret == -EAGAIN)) { /* vring emptied */
/* Re-enable guest->host notifies and stop processing the vring.
diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index e04adb8..77890a0 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -34,6 +34,7 @@ VirtIOBlockReq *virtio_blk_alloc_request(VirtIOBlock *s)
req->dev = s;
req->qiov.size = 0;
req->next = NULL;
+ req->mr_next = NULL;
return req;
}
@@ -84,20 +85,29 @@ static int virtio_blk_handle_rw_error(VirtIOBlockReq *req, int error,
static void virtio_blk_rw_complete(void *opaque, int ret)
{
- VirtIOBlockReq *req = opaque;
+ VirtIOBlockReq *next = opaque;
- trace_virtio_blk_rw_complete(req, ret);
+ while (next) {
+ VirtIOBlockReq *req = next;
+ next = req->mr_next;
+ trace_virtio_blk_rw_complete(req, ret);
- if (ret) {
- int p = virtio_ldl_p(VIRTIO_DEVICE(req->dev), &req->out.type);
- bool is_read = !(p & VIRTIO_BLK_T_OUT);
- if (virtio_blk_handle_rw_error(req, -ret, is_read))
- return;
- }
+ if (req->qiov.nalloc != -1) {
+ qemu_iovec_destroy(&req->qiov);
+ }
- virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
- block_acct_done(blk_get_stats(req->dev->blk), &req->acct);
- virtio_blk_free_request(req);
+ if (ret) {
+ int p = virtio_ldl_p(VIRTIO_DEVICE(req->dev), &req->out.type);
+ bool is_read = !(p & VIRTIO_BLK_T_OUT);
+ if (virtio_blk_handle_rw_error(req, -ret, is_read)) {
+ continue;
+ }
+ }
+
+ virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
+ block_acct_done(blk_get_stats(req->dev->blk), &req->acct);
+ virtio_blk_free_request(req);
+ }
}
static void virtio_blk_flush_complete(void *opaque, int ret)
@@ -291,24 +301,125 @@ static void virtio_blk_handle_scsi(VirtIOBlockReq *req)
}
}
-void virtio_submit_multiwrite(BlockBackend *blk, MultiReqBuffer *mrb)
+static inline void
Is the "inline" really worth it? This function is rather long...
(To my surprise, gcc actually does inline the function)
+virtio_submit_multireq2(BlockBackend *blk, MultiReqBuffer *mrb,
Hm, that's not a very descriptive name... Maybe "submit_merged_requests"
or something like that (it's static, so you can drop the virtio_ prefix
if you want to) would be better?
+ int start, int num_reqs, int niov)
{
- int i, ret;
+ QEMUIOVector *qiov = &mrb->reqs[start]->qiov;
+ int64_t sector_num = mrb->reqs[start]->sector_num;
+ int nb_sectors = mrb->reqs[start]->qiov.size / BDRV_SECTOR_SIZE;
+ bool is_write = mrb->is_write;
+
+ if (num_reqs > 1) {
+ int i;
+ struct iovec *_iov = qiov->iov;
+ int _niov = qiov->niov;
Identifiers with leading underscores are considered reserved, I'd rather
avoid using them.
+
+ qemu_iovec_init(qiov, niov);
+
+ for (i = 0; i < _niov; i++) {
+ qemu_iovec_add(qiov, _iov[i].iov_base, _iov[i].iov_len);
+ }
- if (!mrb->num_writes) {
+ for (i = start + 1; i < start + num_reqs; i++) {
+ qemu_iovec_concat(qiov, &mrb->reqs[i]->qiov, 0,
+ mrb->reqs[i]->qiov.size);
+ mrb->reqs[i - 1]->mr_next = mrb->reqs[i];
+ nb_sectors += mrb->reqs[i]->qiov.size / BDRV_SECTOR_SIZE;
+ }
+ assert(nb_sectors == qiov->size / BDRV_SECTOR_SIZE);
+
+ trace_virtio_blk_submit_multireq(mrb, start, num_reqs, sector_num,
+ nb_sectors, is_write);
+ block_acct_merge_done(blk_get_stats(blk),
+ is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ,
+ num_reqs - 1);
+ }
+
+ if (is_write) {
+ blk_aio_writev(blk, sector_num, qiov, nb_sectors,
+ virtio_blk_rw_complete, mrb->reqs[start]);
+ } else {
+ blk_aio_readv(blk, sector_num, qiov, nb_sectors,
+ virtio_blk_rw_complete, mrb->reqs[start]);
+ }
+}
+
+static int virtio_multireq_compare(const void *a, const void *b)
+{
+ const VirtIOBlockReq *req1 = *(VirtIOBlockReq **)a,
+ *req2 = *(VirtIOBlockReq **)b;
+
+ /*
+ * Note that we can't simply subtract sector_num1 from sector_num2
+ * here as that could overflow the return value.
+ */
+ if (req1->sector_num > req2->sector_num) {
+ return 1;
+ } else if (req1->sector_num < req2->sector_num) {
+ return -1;
+ } else {
+ return 0;
+ }
+}
+
+void virtio_submit_multireq(BlockBackend *blk, MultiReqBuffer *mrb)
+{
+ int i = 0, start = 0, num_reqs = 0, niov = 0, nb_sectors = 0,
+ max_xfer_len = 0;
Albeit not necessary, starting a new "int" line won't take any more
space (and would look nicer to me, probably).
+ int64_t sector_num = 0;
+
+ if (mrb->num_reqs == 1) {
+ virtio_submit_multireq2(blk, mrb, 0, 1, -1);
+ mrb->num_reqs = 0;
return;
}
- ret = blk_aio_multiwrite(blk, mrb->blkreq, mrb->num_writes);
- if (ret != 0) {
- for (i = 0; i < mrb->num_writes; i++) {
- if (mrb->blkreq[i].error) {
- virtio_blk_rw_complete(mrb->blkreq[i].opaque, -EIO);
+ max_xfer_len = blk_get_max_transfer_length(mrb->reqs[0]->dev->blk);
+ max_xfer_len = MIN_NON_ZERO(max_xfer_len, INT_MAX);
+
+ qsort(mrb->reqs, mrb->num_reqs, sizeof(*mrb->reqs),
+ &virtio_multireq_compare);
+
+ for (i = 0; i < mrb->num_reqs; i++) {
+ VirtIOBlockReq *req = mrb->reqs[i];
+ if (num_reqs > 0) {
+ bool merge = true;
+
+ /* merge would exceed maximum number of IOVs */
+ if (niov + req->qiov.niov + 1 > IOV_MAX) {
Hm, why the +1?
+ merge = false;
+ }
+
+ /* merge would exceed maximum transfer length of backend device */
+ if (req->qiov.size / BDRV_SECTOR_SIZE + nb_sectors > max_xfer_len)
{
+ merge = false;
+ }
+
+ /* requests are not sequential */
+ if (sector_num + nb_sectors != req->sector_num) {
+ merge = false;
+ }
+
+ if (!merge) {
+ virtio_submit_multireq2(blk, mrb, start, num_reqs, niov);
+ num_reqs = 0;
}
}
+
+ if (num_reqs == 0) {
+ sector_num = req->sector_num;
+ nb_sectors = niov = 0;
+ start = i;
+ }
+
+ nb_sectors += req->qiov.size / BDRV_SECTOR_SIZE;
+ niov += req->qiov.niov;
+ num_reqs++;
}
- mrb->num_writes = 0;
+ virtio_submit_multireq2(blk, mrb, start, num_reqs, niov);
+ mrb->num_reqs = 0;
}
static void virtio_blk_handle_flush(VirtIOBlockReq *req, MultiReqBuffer *mrb)
@@ -319,7 +430,9 @@ static void virtio_blk_handle_flush(VirtIOBlockReq *req,
MultiReqBuffer *mrb)
/*
* Make sure all outstanding writes are posted to the backing device.
*/
- virtio_submit_multiwrite(req->dev->blk, mrb);
+ if (mrb->is_write && mrb->num_reqs > 0) {
+ virtio_submit_multireq(req->dev->blk, mrb);
+ }
blk_aio_flush(req->dev->blk, virtio_blk_flush_complete, req);
}
@@ -329,6 +442,9 @@ static bool virtio_blk_sect_range_ok(VirtIOBlock *dev,
uint64_t nb_sectors = size >> BDRV_SECTOR_BITS;
uint64_t total_sectors;
+ if (nb_sectors > INT_MAX) {
+ return false;
+ }
if (sector & dev->sector_mask) {
return false;
}
@@ -342,60 +458,6 @@ static bool virtio_blk_sect_range_ok(VirtIOBlock *dev,
return true;
}
-static void virtio_blk_handle_write(VirtIOBlockReq *req, MultiReqBuffer *mrb)
-{
- BlockRequest *blkreq;
- uint64_t sector;
-
- sector = virtio_ldq_p(VIRTIO_DEVICE(req->dev), &req->out.sector);
-
- trace_virtio_blk_handle_write(req, sector, req->qiov.size / 512);
-
- if (!virtio_blk_sect_range_ok(req->dev, sector, req->qiov.size)) {
- virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR);
- virtio_blk_free_request(req);
- return;
- }
-
- block_acct_start(blk_get_stats(req->dev->blk), &req->acct, req->qiov.size,
- BLOCK_ACCT_WRITE);
-
- if (mrb->num_writes == VIRTIO_BLK_MAX_MERGE_REQS) {
- virtio_submit_multiwrite(req->dev->blk, mrb);
- }
-
- blkreq = &mrb->blkreq[mrb->num_writes];
- blkreq->sector = sector;
- blkreq->nb_sectors = req->qiov.size / BDRV_SECTOR_SIZE;
- blkreq->qiov = &req->qiov;
- blkreq->cb = virtio_blk_rw_complete;
- blkreq->opaque = req;
- blkreq->error = 0;
-
- mrb->num_writes++;
-}
-
-static void virtio_blk_handle_read(VirtIOBlockReq *req)
-{
- uint64_t sector;
-
- sector = virtio_ldq_p(VIRTIO_DEVICE(req->dev), &req->out.sector);
-
- trace_virtio_blk_handle_read(req, sector, req->qiov.size / 512);
-
- if (!virtio_blk_sect_range_ok(req->dev, sector, req->qiov.size)) {
- virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR);
- virtio_blk_free_request(req);
- return;
- }
-
- block_acct_start(blk_get_stats(req->dev->blk), &req->acct, req->qiov.size,
- BLOCK_ACCT_READ);
- blk_aio_readv(req->dev->blk, sector, &req->qiov,
- req->qiov.size / BDRV_SECTOR_SIZE,
- virtio_blk_rw_complete, req);
-}
-
void virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
{
uint32_t type;
@@ -430,11 +492,54 @@ void virtio_blk_handle_request(VirtIOBlockReq *req,
MultiReqBuffer *mrb)
type = virtio_ldl_p(VIRTIO_DEVICE(req->dev), &req->out.type);
- if (type & VIRTIO_BLK_T_FLUSH) {
+ switch (type & 0xff) {
Somehow I'd prefer type & ~VIRTIO_BLK_T_BARRIER, but this is correct, too.
+ case VIRTIO_BLK_T_IN:
+ case VIRTIO_BLK_T_OUT:
+ {
+ bool is_write = type & VIRTIO_BLK_T_OUT;
+ req->sector_num = virtio_ldq_p(VIRTIO_DEVICE(req->dev),
+ &req->out.sector);
+
+ if (!virtio_blk_sect_range_ok(req->dev, req->sector_num,
+ req->qiov.size)) {
+ virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR);
+ virtio_blk_free_request(req);
+ return;
+ }
+
+ if (is_write) {
+ qemu_iovec_init_external(&req->qiov, iov, out_num);
+ trace_virtio_blk_handle_write(req, req->sector_num,
+ req->qiov.size / BDRV_SECTOR_SIZE);
+ } else {
+ qemu_iovec_init_external(&req->qiov, in_iov, in_num);
+ trace_virtio_blk_handle_read(req, req->sector_num,
+ req->qiov.size / BDRV_SECTOR_SIZE);
+ }
Before this patch, req->qiov (and subsequently req->qiov.size) was
initialized before virtio_blk_sect_range_ok() was called (with
req->qiov.size as an argument). Is it fine to change that?
+
+ block_acct_start(blk_get_stats(req->dev->blk),
+ &req->acct, req->qiov.size,
+ is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ);
+
+ /* merge would exceed maximum number of requests or IO direction
+ * changes */
+ if (mrb->num_reqs > 0 && (mrb->num_reqs == VIRTIO_BLK_MAX_MERGE_REQS ||
+ is_write != mrb->is_write)) {
+ virtio_submit_multireq(req->dev->blk, mrb);
+ }
+
+ mrb->reqs[mrb->num_reqs++] = req;
Somehow I'd like an assert(mrb->num_reqs < VIRTIO_BLK_MAX_MERGE_REQS)
before this. Feel free to ignore my request.
+ mrb->is_write = is_write;
+ break;
+ }
+ case VIRTIO_BLK_T_FLUSH:
I think VIRTIO_BLK_T_FLUSH_OUT is missing (that's the reason it was
"type & VIRTIO_BLK_T_..." before instead of "type == VIRTIO_BLK_T_...").
virtio_blk_handle_flush(req, mrb);
- } else if (type & VIRTIO_BLK_T_SCSI_CMD) {
+ break;
+ case VIRTIO_BLK_T_SCSI_CMD:
And VIRTIO_BLK_T_SCSI_CMD_OUT here.
The overall logic of this patch seems good to me (although I had some
trouble following which QIOVs had which purpose, which were allocated
and which were not, and so on...).
Max
virtio_blk_handle_scsi(req);
- } else if (type & VIRTIO_BLK_T_GET_ID) {
+ break;
+ case VIRTIO_BLK_T_GET_ID:
+ {
VirtIOBlock *s = req->dev;
/*
@@ -448,14 +553,9 @@ void virtio_blk_handle_request(VirtIOBlockReq *req,
MultiReqBuffer *mrb)
iov_from_buf(in_iov, in_num, 0, serial, size);
virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
virtio_blk_free_request(req);
- } else if (type & VIRTIO_BLK_T_OUT) {
- qemu_iovec_init_external(&req->qiov, iov, out_num);
- virtio_blk_handle_write(req, mrb);
- } else if (type == VIRTIO_BLK_T_IN || type == VIRTIO_BLK_T_BARRIER) {
- /* VIRTIO_BLK_T_IN is 0, so we can't just & it. */
- qemu_iovec_init_external(&req->qiov, in_iov, in_num);
- virtio_blk_handle_read(req);
- } else {
+ break;
+ }
+ default:
virtio_blk_req_complete(req, VIRTIO_BLK_S_UNSUPP);
virtio_blk_free_request(req);
}
@@ -465,9 +565,7 @@ static void virtio_blk_handle_output(VirtIODevice *vdev,
VirtQueue *vq)
{
VirtIOBlock *s = VIRTIO_BLK(vdev);
VirtIOBlockReq *req;
- MultiReqBuffer mrb = {
- .num_writes = 0,
- };
+ MultiReqBuffer mrb = {};
/* Some guests kick before setting VIRTIO_CONFIG_S_DRIVER_OK so start
* dataplane here instead of waiting for .set_status().
@@ -481,7 +579,9 @@ static void virtio_blk_handle_output(VirtIODevice *vdev,
VirtQueue *vq)
virtio_blk_handle_request(req, &mrb);
}
- virtio_submit_multiwrite(s->blk, &mrb);
+ if (mrb.num_reqs) {
+ virtio_submit_multireq(s->blk, &mrb);
+ }
/*
* FIXME: Want to check for completions before returning to guest mode,
@@ -494,9 +594,7 @@ static void virtio_blk_dma_restart_bh(void *opaque)
{
VirtIOBlock *s = opaque;
VirtIOBlockReq *req = s->rq;
- MultiReqBuffer mrb = {
- .num_writes = 0,
- };
+ MultiReqBuffer mrb = {};
qemu_bh_delete(s->bh);
s->bh = NULL;
@@ -509,7 +607,9 @@ static void virtio_blk_dma_restart_bh(void *opaque)
req = next;
}
- virtio_submit_multiwrite(s->blk, &mrb);
+ if (mrb.num_reqs) {
+ virtio_submit_multireq(s->blk, &mrb);
+ }
}
static void virtio_blk_dma_restart_cb(void *opaque, int running,
diff --git a/include/hw/virtio/virtio-blk.h b/include/hw/virtio/virtio-blk.h
index 6ef3fa5..2feeb57 100644
--- a/include/hw/virtio/virtio-blk.h
+++ b/include/hw/virtio/virtio-blk.h
@@ -134,29 +134,32 @@ typedef struct VirtIOBlock {
struct VirtIOBlockDataPlane *dataplane;
} VirtIOBlock;
-#define VIRTIO_BLK_MAX_MERGE_REQS 32
-
-typedef struct MultiReqBuffer {
- BlockRequest blkreq[VIRTIO_BLK_MAX_MERGE_REQS];
- unsigned int num_writes;
-} MultiReqBuffer;
-
typedef struct VirtIOBlockReq {
+ int64_t sector_num;
VirtIOBlock *dev;
VirtQueueElement elem;
struct virtio_blk_inhdr *in;
struct virtio_blk_outhdr out;
QEMUIOVector qiov;
struct VirtIOBlockReq *next;
+ struct VirtIOBlockReq *mr_next;
BlockAcctCookie acct;
} VirtIOBlockReq;
+#define VIRTIO_BLK_MAX_MERGE_REQS 32
+
+typedef struct MultiReqBuffer {
+ VirtIOBlockReq *reqs[VIRTIO_BLK_MAX_MERGE_REQS];
+ unsigned int num_reqs;
+ bool is_write;
+} MultiReqBuffer;
+
VirtIOBlockReq *virtio_blk_alloc_request(VirtIOBlock *s);
void virtio_blk_free_request(VirtIOBlockReq *req);
void virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb);
-void virtio_submit_multiwrite(BlockBackend *blk, MultiReqBuffer *mrb);
+void virtio_submit_multireq(BlockBackend *blk, MultiReqBuffer *mrb);
#endif
diff --git a/trace-events b/trace-events
index 4ec81eb..9ef3b68 100644
--- a/trace-events
+++ b/trace-events
@@ -116,6 +116,7 @@ virtio_blk_req_complete(void *req, int status) "req %p status
%d"
virtio_blk_rw_complete(void *req, int ret) "req %p ret %d"
virtio_blk_handle_write(void *req, uint64_t sector, size_t nsectors) "req %p sector
%"PRIu64" nsectors %zu"
virtio_blk_handle_read(void *req, uint64_t sector, size_t nsectors) "req %p sector
%"PRIu64" nsectors %zu"
+virtio_blk_submit_multireq(void *mrb, int start, int num_reqs, uint64_t sector, size_t nsectors,
bool is_write) "mrb %p start %d num_reqs %d sector %"PRIu64" nsectors %zu is_write
%d"
# hw/block/dataplane/virtio-blk.c
virtio_blk_data_plane_start(void *s) "dataplane %p"