On 18.02.2020 at 06:07, Coiby Xu wrote:
> By making use of libvhost, multiple block device drives can be exported
> and each drive can serve multiple clients simultaneously.
> Since vhost-user-server needs a block drive to be created first, delay
> the creation of this object.
>
> Signed-off-by: Coiby Xu <coiby...@gmail.com>
> ---
>  Makefile.target                  |   1 +
>  backends/Makefile.objs           |   2 +
>  backends/vhost-user-blk-server.c | 718 +++++++++++++++++++++++++++++++
>  backends/vhost-user-blk-server.h |  21 +
>  vl.c                             |   4 +
>  5 files changed, 746 insertions(+)
>  create mode 100644 backends/vhost-user-blk-server.c
>  create mode 100644 backends/vhost-user-blk-server.h
>
> diff --git a/Makefile.target b/Makefile.target
> index 6e61f607b1..8c6c01eb3a 100644
> --- a/Makefile.target
> +++ b/Makefile.target
> @@ -159,6 +159,7 @@ obj-y += monitor/
>  obj-y += qapi/
>  obj-y += memory.o
>  obj-y += memory_mapping.o
> +obj-$(CONFIG_LINUX) += ../contrib/libvhost-user/libvhost-user.o
>  obj-y += migration/ram.o
>  LIBS := $(libs_softmmu) $(LIBS)
>
> diff --git a/backends/Makefile.objs b/backends/Makefile.objs
> index 28a847cd57..4e7be731e0 100644
> --- a/backends/Makefile.objs
> +++ b/backends/Makefile.objs
> @@ -14,6 +14,8 @@ common-obj-y += cryptodev-vhost.o
>  common-obj-$(CONFIG_VHOST_CRYPTO) += cryptodev-vhost-user.o
>  endif
>
> +common-obj-$(CONFIG_LINUX) += vhost-user-blk-server.o
> +
>  common-obj-$(call land,$(CONFIG_VHOST_USER),$(CONFIG_VIRTIO)) += vhost-user.o
>
>  common-obj-$(CONFIG_LINUX) += hostmem-memfd.o
> diff --git a/backends/vhost-user-blk-server.c b/backends/vhost-user-blk-server.c
> new file mode 100644
> index 0000000000..1bf7f7b544
> --- /dev/null
> +++ b/backends/vhost-user-blk-server.c
Please move this to block/export/vhost-user-blk.c. This will
automatically have it covered as a block layer component in
MAINTAINERS, and it will provide a place where other exports can be
put, too.

> @@ -0,0 +1,718 @@
> +/*
> + * Sharing QEMU block devices via vhost-user protocal
> + *
> + * Author: Coiby Xu <coiby...@gmail.com>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or
> + * later. See the COPYING file in the top-level directory.
> + */
> +#include "qemu/osdep.h"
> +#include "block/block.h"
> +#include "vhost-user-blk-server.h"
> +#include "qapi/error.h"
> +#include "qom/object_interfaces.h"
> +#include "sysemu/block-backend.h"
> +
> +enum {
> +    VHOST_USER_BLK_MAX_QUEUES = 1,
> +};
> +struct virtio_blk_inhdr {
> +    unsigned char status;
> +};
> +
> +static QTAILQ_HEAD(, VuBlockDev) vu_block_devs =
> +    QTAILQ_HEAD_INITIALIZER(vu_block_devs);
> +
> +
> +typedef struct VuBlockReq {
> +    VuVirtqElement *elem;
> +    int64_t sector_num;
> +    size_t size;
> +    struct virtio_blk_inhdr *in;
> +    struct virtio_blk_outhdr out;
> +    VuClient *client;
> +    struct VuVirtq *vq;
> +} VuBlockReq;
> +
> +
> +static void vu_block_req_complete(VuBlockReq *req)
> +{
> +    VuDev *vu_dev = &req->client->parent;
> +
> +    /* IO size with 1 extra status byte */
> +    vu_queue_push(vu_dev, req->vq, req->elem, req->size + 1);
> +    vu_queue_notify(vu_dev, req->vq);
> +
> +    if (req->elem) {
> +        free(req->elem);
> +    }
> +
> +    g_free(req);
> +}
> +
> +static VuBlockDev *get_vu_block_device_by_client(VuClient *client)
> +{
> +    return container_of(client->server->ptr_in_device, VuBlockDev,
> +                        vu_server);
> +}
> +
> +static int coroutine_fn
> +vu_block_discard_write_zeroes(VuBlockReq *req, struct iovec *iov,
> +                              uint32_t iovcnt, uint32_t type)
> +{
> +    struct virtio_blk_discard_write_zeroes desc;
> +    ssize_t size = iov_to_buf(iov, iovcnt, 0, &desc, sizeof(desc));
> +    if (unlikely(size != sizeof(desc))) {
> +        error_report("Invalid size %ld, expect %ld", size, sizeof(desc));
> +        return -EINVAL;
> +    }
> +
> +    VuBlockDev *vdev_blk = get_vu_block_device_by_client(req->client);
> +    uint64_t range[2] = { le64toh(desc.sector) << 9,
> +                          le32toh(desc.num_sectors) << 9 };
> +    if (type == VIRTIO_BLK_T_DISCARD) {
> +        if (blk_co_pdiscard(vdev_blk->backend, range[0], range[1]) == 0) {
> +            return 0;
> +        }
> +    } else if (type == VIRTIO_BLK_T_WRITE_ZEROES) {
> +        if (blk_co_pwrite_zeroes(vdev_blk->backend,
> +                                 range[0], range[1], 0) == 0) {
> +            return 0;
> +        }
> +    }
> +
> +    return -EINVAL;
> +}
> +
> +
> +static void coroutine_fn vu_block_flush(VuBlockReq *req)
> +{
> +    VuBlockDev *vdev_blk = get_vu_block_device_by_client(req->client);
> +    BlockBackend *backend = vdev_blk->backend;
> +    blk_co_flush(backend);
> +}
> +
> +
> +static int coroutine_fn vu_block_virtio_process_req(VuClient *client,
> +                                                    VuVirtq *vq)
> +{
> +    VuDev *vu_dev = &client->parent;
> +    VuVirtqElement *elem;
> +    uint32_t type;
> +    VuBlockReq *req;
> +
> +    VuBlockDev *vdev_blk = get_vu_block_device_by_client(client);
> +    BlockBackend *backend = vdev_blk->backend;
> +    elem = vu_queue_pop(vu_dev, vq, sizeof(VuVirtqElement) +
> +                        sizeof(VuBlockReq));
> +    if (!elem) {
> +        return -1;
> +    }
> +
> +    struct iovec *in_iov = elem->in_sg;
> +    struct iovec *out_iov = elem->out_sg;
> +    unsigned in_num = elem->in_num;
> +    unsigned out_num = elem->out_num;
> +    /* refer to hw/block/virtio_blk.c */
> +    if (elem->out_num < 1 || elem->in_num < 1) {
> +        error_report("virtio-blk request missing headers");
> +        free(elem);
> +        return -EINVAL;
> +    }
> +
> +    req = g_new0(VuBlockReq, 1);
> +    req->client = client;
> +    req->vq = vq;
> +    req->elem = elem;

You can keep req on the stack; it will be freed before this function
returns.

> +    if (unlikely(iov_to_buf(out_iov, out_num, 0, &req->out,
> +                            sizeof(req->out)) != sizeof(req->out))) {
> +        error_report("virtio-blk request outhdr too short");
> +        goto err;
> +    }
> +
> +    iov_discard_front(&out_iov, &out_num, sizeof(req->out));
> +
> +    if (in_iov[in_num - 1].iov_len < sizeof(struct virtio_blk_inhdr)) {
> +        error_report("virtio-blk request inhdr too short");
> +        goto err;
> +    }
> +
> +    /* We always touch the last byte, so just see how big in_iov is. */
> +    req->in = (void *)in_iov[in_num - 1].iov_base
> +              + in_iov[in_num - 1].iov_len
> +              - sizeof(struct virtio_blk_inhdr);
> +    iov_discard_back(in_iov, &in_num, sizeof(struct virtio_blk_inhdr));
> +
> +
> +    type = le32toh(req->out.type);
> +    switch (type & ~VIRTIO_BLK_T_BARRIER) {
> +    case VIRTIO_BLK_T_IN:
> +    case VIRTIO_BLK_T_OUT: {
> +        ssize_t ret = 0;
> +        bool is_write = type & VIRTIO_BLK_T_OUT;
> +        req->sector_num = le64toh(req->out.sector);
> +
> +        int64_t offset = req->sector_num * vdev_blk->blk_size;
> +        QEMUIOVector *qiov = g_new0(QEMUIOVector, 1);
> +        if (is_write) {
> +            qemu_iovec_init_external(qiov, out_iov, out_num);
> +            ret = blk_co_pwritev(backend, offset, qiov->size,
> +                                 qiov, 0);
> +        } else {
> +            qemu_iovec_init_external(qiov, in_iov, in_num);
> +            ret = blk_co_preadv(backend, offset, qiov->size,
> +                                qiov, 0);
> +        }
> +        aio_wait_kick();

What purpose does this aio_wait_kick() serve? Usually you do this if
you want to signal completion of something to a different thread, but I
don't see that we changed any control state that could be visible to
other threads.

> +        if (ret >= 0) {
> +            req->in->status = VIRTIO_BLK_S_OK;
> +        } else {
> +            req->in->status = VIRTIO_BLK_S_IOERR;
> +        }
> +        vu_block_req_complete(req);
> +        break;
> +    }
> +    case VIRTIO_BLK_T_FLUSH:
> +        vu_block_flush(req);
> +        req->in->status = VIRTIO_BLK_S_OK;
> +        vu_block_req_complete(req);
> +        break;
> +    case VIRTIO_BLK_T_GET_ID: {
> +        size_t size = MIN(iov_size(&elem->in_sg[0], in_num),
> +                          VIRTIO_BLK_ID_BYTES);
> +        snprintf(elem->in_sg[0].iov_base, size, "%s",
> +                 "vhost_user_blk_server");
> +        req->in->status = VIRTIO_BLK_S_OK;
> +        req->size = elem->in_sg[0].iov_len;
> +        vu_block_req_complete(req);
> +        break;
> +    }
> +    case VIRTIO_BLK_T_DISCARD:
> +    case VIRTIO_BLK_T_WRITE_ZEROES: {
> +        int rc;
> +        rc = vu_block_discard_write_zeroes(req, &elem->out_sg[1],
> +                                           out_num, type);
> +        if (rc == 0) {
> +            req->in->status = VIRTIO_BLK_S_OK;
> +        } else {
> +            req->in->status = VIRTIO_BLK_S_IOERR;
> +        }
> +        vu_block_req_complete(req);
> +        break;
> +    }
> +    default:
> +        req->in->status = VIRTIO_BLK_S_UNSUPP;
> +        vu_block_req_complete(req);
> +        break;
> +    }

You have vu_block_req_complete() as the last thing in every branch. You
could have it just once after the switch block.

> +    return 0;
> +
> +err:
> +    free(elem);
> +    g_free(req);
> +    return -EINVAL;
> +}

Okay, so vu_block_virtio_process_req() takes a single request from the
virtqueue and processes it. This makes sense as a building block, but
it doesn't allow parallelism yet. I expect to see the creation of
multiple coroutines below that can run in parallel.
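To make that expectation a bit more concrete, here is a rough, untested
sketch of the kind of kick handler I have in mind: pop every available
element, hand each one to a freshly created coroutine, and return
without waiting for completion. vu_block_co_worker() and
vu_block_co_process_one() are made-up names for illustration; the
latter would be a coroutine_fn doing what vu_block_virtio_process_req()
does today, just for an element that has already been popped:

typedef struct VuBlockCoData {
    VuClient *client;
    VuVirtq *vq;
    VuVirtqElement *elem;
} VuBlockCoData;

/* Runs in coroutine context; processes and completes one request */
static void coroutine_fn vu_block_co_worker(void *opaque)
{
    VuBlockCoData *data = opaque;

    vu_block_co_process_one(data->client, data->vq, data->elem);
    g_free(data);
}

static void vu_block_process_vq(VuDev *vu_dev, int idx)
{
    VuClient *client = container_of(vu_dev, VuClient, parent);
    VuVirtq *vq = vu_get_queue(vu_dev, idx);
    VuVirtqElement *elem;

    /*
     * Start one coroutine per request and return immediately, so that
     * vu_kick_cb() doesn't block and the requests can run in parallel.
     */
    while ((elem = vu_queue_pop(vu_dev, vq, sizeof(VuVirtqElement)))) {
        VuBlockCoData *data = g_new(VuBlockCoData, 1);
        Coroutine *co;

        data->client = client;
        data->vq = vq;
        data->elem = elem;

        co = qemu_coroutine_create(vu_block_co_worker, data);
        qemu_coroutine_enter(co);
    }
}

Completion (vu_queue_push()/vu_queue_notify()) would then happen from
the worker coroutine once the block layer request returns, like
vu_block_req_complete() already does.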
> +
> +static void vu_block_process_vq(VuDev *vu_dev, int idx)
> +{
> +    VuClient *client;
> +    VuVirtq *vq;
> +    int ret;
> +
> +    client = container_of(vu_dev, VuClient, parent);
> +    assert(client);
> +
> +    vq = vu_get_queue(vu_dev, idx);
> +    assert(vq);
> +
> +    while (1) {
> +        ret = vu_block_virtio_process_req(client, vq);

If you call vu_block_virtio_process_req(), then this one needs to be a
coroutine_fn, too.

> +        if (ret) {
> +            break;
> +        }
> +    }
> +}
> +
> +static void vu_block_queue_set_started(VuDev *vu_dev, int idx, bool started)
> +{
> +    VuVirtq *vq;
> +
> +    assert(vu_dev);
> +
> +    vq = vu_get_queue(vu_dev, idx);
> +    vu_set_queue_handler(vu_dev, vq, started ? vu_block_process_vq : NULL);
> +}

If vu_block_process_vq() is used as a vq->handler, won't it be called
outside of coroutine context? How can this work when
vu_block_virtio_process_req() is required to run in coroutine context?

Another problem is that vu_block_process_vq() runs a loop as long as
there are requests that aren't completed yet. During this time,
vu_kick_cb() will block. I don't think this is what was intended.
Instead, we should start the requests (without waiting for their
completion) and return, roughly along the lines of the sketch above.

> diff --git a/vl.c b/vl.c
> index 794f2e5733..0332ab70da 100644
> --- a/vl.c
> +++ b/vl.c
> @@ -2538,6 +2538,10 @@ static bool object_create_initial(const char *type, QemuOpts *opts)
>      }
>  #endif
>
> +    /* Reason: vhost-user-server property "name" */

The property is called "node-name" now.

> +    if (g_str_equal(type, "vhost-user-blk-server")) {
> +        return false;
> +    }
>      /*
>       * Reason: filter-* property "netdev" etc.
>       */

Kevin