On Mon, Jun 07, 2021 at 09:32:53PM +0800, zhenwei pi wrote:
> Add a new qemu block driver which uses libnvmf as userspace NVMe over
> fabric initiator.
>
> Currently QEMU uses 4 NVMF IO-queues by a RR policy, test with a
> linux kernel NVMF target, QEMU gets about 220K IOPS.
>
> Thanks to Famz for several suggestions.
>
> Signed-off-by: zhenwei pi <[email protected]>
> ---
>  block/meson.build |   1 +
>  block/nvmf.c      | 425 ++++++++++++++++++++++++++++++++++++++++++++++
>  configure         |   8 +-
>  meson.build       |   8 +
>  meson_options.txt |   2 +
>  5 files changed, 443 insertions(+), 1 deletion(-)
>  create mode 100644 block/nvmf.c
>
> diff --git a/block/meson.build b/block/meson.build
> index 01861e1545..adf4e753ae 100644
> --- a/block/meson.build
> +++ b/block/meson.build
> @@ -77,6 +77,7 @@ foreach m : [
>    [libnfs, 'nfs', files('nfs.c')],
>    [libssh, 'ssh', files('ssh.c')],
>    [rbd, 'rbd', files('rbd.c')],
> +  [libnvmf, 'nvmf', files('nvmf.c')],
>  ]
>    if m[0].found()
>      module_ss = ss.source_set()
> diff --git a/block/nvmf.c b/block/nvmf.c
> new file mode 100644
> index 0000000000..d09ca41b58
> --- /dev/null
> +++ b/block/nvmf.c
> @@ -0,0 +1,425 @@
> +/*
> + * NVMe over fabric block driver based on libnvmf
> + *
> + * Copyright 2020-2021 zhenwei pi
> + *
> + * Authors:
> + *   zhenwei pi <[email protected]>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> + * See the COPYING file in the top-level directory.
> + */
> +
> +#include "qemu/osdep.h"
> +
> +#include <poll.h>
Why is this system header required?
> +#include "qemu/error-report.h"
> +#include "block/block_int.h"
> +#include "block/nvme.h"
> +#include "qemu/iov.h"
> +#include "qemu/option.h"
> +#include "qemu/module.h"
> +#include "qapi/error.h"
> +#include "qapi/qmp/qdict.h"
> +#include "qapi/qmp/qstring.h"
> +
> +#include "nvmf/nvmf.h"
This should probably be <nvmf/nvmf.h>.
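i.e. something like:

  #include <nvmf/nvmf.h>

assuming the header is installed by libnvmf itself rather than shipped in the
QEMU tree.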
> +
> +#define DEF_IO_QUEUES 4 /* default IO queues 4 by RR policy */
Why did you choose the round-robin approach instead of a single queue?
Is the idea to allow multi-core NVMeoF targets to process I/O queues in
parallel? Or is the goal to avoid TCP connections stalling? Or something
else?
> +#define NVMF_KATO 30000 /* default keepalive time 30s */
> +#define NVMF_URI "uri"
> +#define PROTOCOL_TCP_PREF "nvmf-tcp://"
> +#define PROTOCOL_RDMA_PREF "nvmf-rdma://"
> +
> +typedef struct NvmfQueue {
> + int qid;
> + CoQueue wait_queue;
> + CoMutex wait_lock;
> +} NvmfQueue;
> +
> +typedef struct NvmfHost {
> + nvmf_options_t opts;
> + nvmf_ctrl_t ctrl;
> + AioContext *aio_context;
> + QemuSpin lock;
> +
> + unsigned long requests;
> + int nr_ioqueues;
> + NvmfQueue *ioqueues;
> +} NvmfHost;
> +
> +typedef struct NvmfReq {
> + NvmfHost *host;
> + nvmf_req_t req;
> + Coroutine *co;
> + int qid;
> + int retval;
> + bool done;
> +} NvmfReq;
> +
> +typedef enum NvmfOp {
> + NvmfOpRead,
> + NvmfOpWrite,
> + NvmfOpDiscard,
> + NvmfOpWritezeroes
> +} NvmfOp;
> +
> +static void nvmf_process(void *opaque)
> +{
> + NvmfHost *host = opaque;
> +
> + nvmf_ctrl_process(host->ctrl);
> +}
> +
> +static void nvmf_attach_aio_context(BlockDriverState *bs,
> + AioContext *new_context)
> +{
> + NvmfHost *host = bs->opaque;
> +
> + host->aio_context = new_context;
> +
> + qemu_spin_lock(&host->lock);
> + aio_set_fd_handler(new_context, nvmf_ctrl_fd(host->ctrl),
> + false, nvmf_process, NULL, NULL, host);
> + qemu_spin_unlock(&host->lock);
This lock is strange. aio_set_fd_handler() is thread-safe. Why is it
necessary?
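If the lock really isn't needed, this could become just (a minimal sketch,
keeping the rest of the function unchanged):

  static void nvmf_attach_aio_context(BlockDriverState *bs,
                                      AioContext *new_context)
  {
      NvmfHost *host = bs->opaque;

      host->aio_context = new_context;

      /* aio_set_fd_handler() is thread-safe, no extra locking needed */
      aio_set_fd_handler(new_context, nvmf_ctrl_fd(host->ctrl),
                         false, nvmf_process, NULL, NULL, host);
  }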
> +}
> +
> +static QemuOptsList runtime_opts = {
> + .name = "nvmf",
> + .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
> + .desc = {
> + {
> + .name = NVMF_URI,
> + .type = QEMU_OPT_STRING,
> + .help = "NVMe over fabric URI",
> + },
> + { /* end of list */ }
> + },
> +};
> +
> +static void nvmf_parse_filename(const char *filename, QDict *options,
> + Error **errp)
> +{
> + const char *uri;
> + int len;
> +
> + len = strlen(PROTOCOL_TCP_PREF);
> + if (strlen(filename) > len && !strncmp(filename, PROTOCOL_TCP_PREF, len)) {
> + uri = g_strdup(filename);
> + qdict_put_str(options, NVMF_URI, uri);
> + return;
> + }
> +
> + len = strlen(PROTOCOL_RDMA_PREF);
> + if (strlen(filename) > len && !strncmp(filename, PROTOCOL_RDMA_PREF, len)) {
> + uri = g_strdup(filename);
> + qdict_put_str(options, NVMF_URI, uri);
> + return;
> + }
> +
> + error_setg(errp, "nvmf: invalid filename. Ex, nvmf-tcp/nvmf-rdma");
> +}
> +
> +static int nvmf_file_open(BlockDriverState *bs, QDict *options, int flags,
> + Error **errp)
> +{
> + NvmfHost *host = bs->opaque;
> + QemuOpts *qopts;
> + nvmf_ctrl_t ctrl;
> + nvmf_options_t opts;
> + const char *uri;
> + NvmfQueue *queue;
> + unsigned int nsid;
> + int i;
> +
> + qopts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
> + qemu_opts_absorb_qdict(qopts, options, &error_abort);
> + uri = qemu_opt_get(qopts, NVMF_URI);
> + if (!uri) {
> + error_setg(errp, "nvmf: missing URI");
> + goto free_qopts;
> + }
> +
> + host->nr_ioqueues = DEF_IO_QUEUES;
> + opts = nvmf_default_options(uri, NULL);
> + if (!opts) {
> + error_setg(errp, "nvmf: parse uri failed");
> + goto free_uri;
> + }
> +
> + nvmf_options_set_io_queues(opts, host->nr_ioqueues);
> + nvmf_options_set_kato(opts, NVMF_KATO);
> + ctrl = nvmf_ctrl_create(opts);
> + if (!ctrl) {
> + error_setg(errp, "nvmf: create ctrl failed");
> + goto free_nvmf_opts;
> + }
> +
> + if (nvmf_ns_count(ctrl) < 1) {
> + error_setg(errp, "nvmf: no available namespace");
> + goto release_ctrl;
> + }
> +
> + /* nsid should be specified by command line */
> + nsid = nvmf_ns_id(ctrl);
> + if (!nvmf_ns_lbads(ctrl, nsid)) {
> + error_setg(errp, "nvmf: invalid LBADS");
> + goto release_ctrl;
> + }
> +
> + if (!nvmf_ns_nsze(ctrl, nsid)) {
> + error_setg(errp, "nvmf: invalid size");
> + goto release_ctrl;
> + }
> +
> + host->ioqueues = g_new0(NvmfQueue, host->nr_ioqueues + 1);
> + memset(host->ioqueues, 0x00, sizeof(NvmfQueue) * (host->nr_ioqueues + 1));
g_new0() already zeroes the memory so there is no need to do it again.
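i.e. this line alone is enough:

  host->ioqueues = g_new0(NvmfQueue, host->nr_ioqueues + 1);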
> + for (i = 0; i < host->nr_ioqueues + 1; i++) {
> + queue = host->ioqueues + i;
> + queue->qid = i;
> + qemu_co_mutex_init(&queue->wait_lock);
> + qemu_co_queue_init(&queue->wait_queue);
> + }
> +
> + host->ctrl = ctrl;
> + host->opts = opts;
> + qemu_spin_init(&host->lock);
> + nvmf_attach_aio_context(bs, bdrv_get_aio_context(bs));
> + g_free((void *)uri);
If uri is the return value of qemu_opt_get() then it must not be freed.
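The string is owned by qopts here. Either drop both g_free((void *)uri) calls
(the one under free_uri: has the same problem), or take a private copy so the
frees become correct, e.g. (rough sketch):

  uri = g_strdup(qemu_opt_get(qopts, NVMF_URI));
  if (!uri) {
      error_setg(errp, "nvmf: missing URI");
      goto free_qopts;
  }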
> +
> + return 0;
> +
> +release_ctrl:
> + nvmf_ctrl_release(ctrl);
> +
> +free_nvmf_opts:
> + nvmf_options_free(opts);
> +
> +free_uri:
> + g_free((void *)uri);
> +
> +free_qopts:
> + qemu_opts_del(qopts);
> +
> + return -EINVAL;
> +}
> +
> +static void nvmf_close(BlockDriverState *bs)
> +{
> + NvmfHost *host = bs->opaque;
> +
> + nvmf_ctrl_release(host->ctrl);
> + nvmf_options_free(host->opts);
> + g_free(host->ioqueues);
> +}
> +
> +static inline bool nvmf_is_lba_aligned(nvmf_ctrl_t ctrl, size_t offset,
> + size_t length)
> +{
> + unsigned int nsid = nvmf_ns_id(ctrl);
> + unsigned char lbads = nvmf_ns_lbads(ctrl, nsid);
> + return !(offset & ((1 << lbads) - 1)) && !(length & ((1 << lbads) - 1));
> +}
> +
> +static inline void nvmf_req_co_init(NvmfHost *host, NvmfReq *req)
> +{
> + req->host = host;
> + req->req = req;
> + req->co = qemu_coroutine_self();
> + req->done = false;
> +}
> +
> +static void nvmf_req_cb(unsigned short status, void *opaque)
> +{
> + NvmfReq *req = opaque;
> +
> + switch (status) {
> + case NVME_SUCCESS:
> + req->retval = 0;
> + break;
> +
> + case NVME_INVALID_OPCODE:
> + req->retval = -ENOSYS;
> + break;
> +
> + case NVME_INVALID_FIELD:
> + req->retval = -EINVAL;
> + break;
> +
> + default:
> + req->retval = -EIO;
> + break;
> + }
> +
> + req->done = true;
> + aio_co_wake(req->co);
> +}
> +
> +static coroutine_fn int nvmf_co_io(BlockDriverState *bs, uint64_t offset,
> + uint64_t bytes, QEMUIOVector *qiov,
> + int flags, NvmfOp op)
> +{
> + NvmfHost *host = bs->opaque;
> + NvmfQueue *queue;
> + NvmfReq req;
> + struct iovec iov = {.iov_base = NULL, .iov_len = bytes};
> + int qid;
> +
> + if (!nvmf_is_lba_aligned(host->ctrl, offset, bytes)) {
> + return -EINVAL;
> + }
> + /* RR policy */
> + qid = (host->requests++ % host->nr_ioqueues) + 1;
> + queue = host->ioqueues + qid;
> + nvmf_req_co_init(host, &req);
> + req.qid = qid;
> +
> +retry:
> + switch (op) {
> + case NvmfOpRead:
> + req.req = nvmf_read_async(host->ctrl, qid, qiov->iov, qiov->niov,
> + offset, 0, nvmf_req_cb, &req);
> + break;
> + case NvmfOpWrite:
> + req.req = nvmf_write_async(host->ctrl, qid, qiov->iov, qiov->niov,
> + offset, 0, nvmf_req_cb, &req);
> + break;
> + case NvmfOpDiscard:
> + req.req = nvmf_discard_async(host->ctrl, qid, &iov, 1, offset, 0,
> + nvmf_req_cb, &req);
> + break;
> + case NvmfOpWritezeroes:
> + req.req = nvmf_writezeroes_async(host->ctrl, qid, &iov, 1, offset, 0,
> + nvmf_req_cb, &req);
> + break;
> + default:
> + return -EOPNOTSUPP;
> + }
> +
> + if (!req.req) {
> + /* test queue is full */
> + if (nvmf_queue_nr_inflight(host->ctrl, qid) ==
> + nvmf_queue_depth(host->ctrl, qid)) {
> + qemu_co_mutex_lock(&queue->wait_lock);
This works because all requests currently run in a single AioContext in QEMU.
Emanuele Esposito and Paolo Bonzini have been sending multi-queue patches that
convert block drivers to support multi-threading, where requests can be
submitted from any thread without taking the AioContext lock. Under that model
it will probably be necessary to extend queue->wait_lock so that it also
covers the if expression, otherwise there is a race condition.

I think you can leave the code as it is for now, but there's a chance it will
need to be converted before it can be merged, since the multi-queue block
layer patches are making their way upstream at the same time.
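Something along these lines, as a rough sketch (check and wait under the same
lock so a completion cannot slip in between):

  if (!req.req) {
      qemu_co_mutex_lock(&queue->wait_lock);
      /* queue full? wait for a completion, then retry the submission */
      if (nvmf_queue_nr_inflight(host->ctrl, qid) ==
          nvmf_queue_depth(host->ctrl, qid)) {
          qemu_co_queue_wait(&queue->wait_queue, &queue->wait_lock);
          qemu_co_mutex_unlock(&queue->wait_lock);
          goto retry;
      }
      qemu_co_mutex_unlock(&queue->wait_lock);
      assert(0);
  }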
> + qemu_co_queue_wait(&queue->wait_queue, &queue->wait_lock);
> + qemu_co_mutex_unlock(&queue->wait_lock);
> + goto retry;
> + } else {
> + assert(0);
> + }
> + }
> +
> + while (!req.done) {
> + qemu_coroutine_yield();
> + }
> +
> + nvmf_req_free(req.req);
> +
> + qemu_co_mutex_lock(&queue->wait_lock);
> + if (!qemu_co_queue_empty(&queue->wait_queue)) {
> + qemu_co_queue_restart_all(&queue->wait_queue);
> + }
> + qemu_co_mutex_unlock(&queue->wait_lock);
> +
> + return req.retval;
> +}
> +
> +static coroutine_fn int nvmf_co_preadv(BlockDriverState *bs, uint64_t offset,
> + uint64_t bytes, QEMUIOVector *qiov,
> + int flags)
> +{
> + return nvmf_co_io(bs, offset, bytes, qiov, flags, NvmfOpRead);
> +}
> +
> +static coroutine_fn int nvmf_co_pwritev(BlockDriverState *bs, uint64_t offset,
> + uint64_t bytes, QEMUIOVector *qiov,
> + int flags)
> +{
> + return nvmf_co_io(bs, offset, bytes, qiov, flags, NvmfOpWrite);
> +}
> +
> +static coroutine_fn int nvmf_co_pwrite_zeroes(BlockDriverState *bs,
> + int64_t offset, int bytes,
> + BdrvRequestFlags flags)
> +{
> + return nvmf_co_io(bs, offset, bytes, NULL, 0, NvmfOpWritezeroes);
> +}
> +
> +static coroutine_fn int nvmf_co_pdiscard(BlockDriverState *bs, int64_t offset,
> + int bytes)
> +{
> + return nvmf_co_io(bs, offset, bytes, NULL, 0, NvmfOpDiscard);
> +}
> +
> +static void nvmf_refresh_limits(BlockDriverState *bs, Error **errp)
> +{
> + NvmfHost *host = bs->opaque;
> + unsigned int nsid = nvmf_ns_id(host->ctrl);
> + unsigned char lbads = nvmf_ns_lbads(host->ctrl, nsid);
> +
> + bs->bl.request_alignment = MAX(BDRV_SECTOR_SIZE, (1 << lbads));
> + bs->bl.max_transfer = nvmf_ctrl_mdts(host->ctrl);
> + bs->bl.max_iov = nvmf_max_iov(host->ctrl);
> +}
> +
> +static int64_t nvmf_getlength(BlockDriverState *bs)
> +{
> + NvmfHost *host = bs->opaque;
> + unsigned int nsid = nvmf_ns_id(host->ctrl);
> + unsigned char lbads = nvmf_ns_lbads(host->ctrl, nsid);
> + unsigned long nsze = nvmf_ns_nsze(host->ctrl, nsid);
> +
> + return (1 << lbads) * nsze;
> +}
> +
> +static void nvmf_detach_aio_context(BlockDriverState *bs)
> +{
> + NvmfHost *host = bs->opaque;
> +
> + qemu_spin_lock(&host->lock);
> + aio_set_fd_handler(host->aio_context, nvmf_ctrl_fd(host->ctrl),
> + false, NULL, NULL, NULL, NULL);
> + qemu_spin_unlock(&host->lock);
> +}
> +
> +static BlockDriver bdrv_nvmf_tcp = {
> + .format_name = "nvmf-tcp",
> + .protocol_name = "nvmf-tcp",
> + .instance_size = sizeof(NvmfHost),
> + .bdrv_parse_filename = nvmf_parse_filename,
> + .bdrv_file_open = nvmf_file_open,
> + .bdrv_close = nvmf_close,
> + .bdrv_getlength = nvmf_getlength,
> + .bdrv_refresh_limits = nvmf_refresh_limits,
> + .bdrv_co_pdiscard = nvmf_co_pdiscard,
> + .bdrv_co_preadv = nvmf_co_preadv,
> + .bdrv_co_pwritev = nvmf_co_pwritev,
> + .bdrv_co_pwrite_zeroes = nvmf_co_pwrite_zeroes,
> + .bdrv_detach_aio_context = nvmf_detach_aio_context,
> + .bdrv_attach_aio_context = nvmf_attach_aio_context,
> +};
> +
> +static BlockDriver bdrv_nvmf_rdma = {
> + .format_name = "nvmf-rdma",
> + .protocol_name = "nvmf-rdma",
> + .instance_size = sizeof(NvmfHost),
> + .bdrv_parse_filename = nvmf_parse_filename,
> + .bdrv_file_open = nvmf_file_open,
> + .bdrv_close = nvmf_close,
> + .bdrv_getlength = nvmf_getlength,
> + .bdrv_refresh_limits = nvmf_refresh_limits,
> + .bdrv_co_pdiscard = nvmf_co_pdiscard,
> + .bdrv_co_preadv = nvmf_co_preadv,
> + .bdrv_co_pwritev = nvmf_co_pwritev,
> + .bdrv_co_pwrite_zeroes = nvmf_co_pwrite_zeroes,
> + .bdrv_detach_aio_context = nvmf_detach_aio_context,
> + .bdrv_attach_aio_context = nvmf_attach_aio_context,
> +};
> +
> +static void nvmf_block_init(void)
> +{
> + bdrv_register(&bdrv_nvmf_tcp);
> + bdrv_register(&bdrv_nvmf_rdma);
> +}
> +
> +block_init(nvmf_block_init);
> diff --git a/configure b/configure
> index 8dcb9965b2..608b108062 100755
> --- a/configure
> +++ b/configure
> @@ -393,6 +393,7 @@ vss_win32_sdk="$default_feature"
> win_sdk="no"
> want_tools="$default_feature"
> libiscsi="auto"
> +libnvmf="auto"
> libnfs="auto"
> coroutine=""
> coroutine_pool="$default_feature"
> @@ -1127,6 +1128,10 @@ for opt do
> ;;
> --enable-libiscsi) libiscsi="enabled"
> ;;
> + --disable-libnvmf) libnvmf="disabled"
> + ;;
> + --enable-libnvmf) libnvmf="enabled"
> + ;;
> --disable-libnfs) libnfs="disabled"
> ;;
> --enable-libnfs) libnfs="enabled"
> @@ -1889,6 +1894,7 @@ disabled with --disable-FEATURE, default is enabled if available
> spice-protocol spice-protocol
> rbd rados block device (rbd)
> libiscsi iscsi support
> + libnvmf NVMe over Fabric support
> libnfs nfs support
> smartcard smartcard support (libcacard)
> u2f U2F support (u2f-emu)
> @@ -6448,7 +6454,7 @@ if test "$skip_meson" = no; then
> -Dvhost_user_blk_server=$vhost_user_blk_server -Dmultiprocess=$multiprocess \
> -Dfuse=$fuse -Dfuse_lseek=$fuse_lseek -Dguest_agent_msi=$guest_agent_msi -Dbpf=$bpf\
> $(if test "$default_features" = no; then echo "-Dauto_features=disabled"; fi) \
> - -Dtcg_interpreter=$tcg_interpreter \
> + -Dtcg_interpreter=$tcg_interpreter -Dlibnvmf=$libnvmf\
> $cross_arg \
> "$PWD" "$source_path"
>
> diff --git a/meson.build b/meson.build
> index 626cf932c1..ccdc36266a 100644
> --- a/meson.build
> +++ b/meson.build
> @@ -478,6 +478,12 @@ if not get_option('libiscsi').auto() or have_block
> required: get_option('libiscsi'),
> method: 'pkg-config', kwargs: static_kwargs)
> endif
> +libnvmf = not_found
> +if not get_option('libnvmf').auto() or have_block
> + libnvmf = dependency('libnvmf', version: '>=0.1',
> + required: get_option('libnvmf'),
> + method: 'pkg-config', kwargs: static_kwargs)
> +endif
> zstd = not_found
> if not get_option('zstd').auto() or have_block
> zstd = dependency('libzstd', version: '>=1.4.0',
> @@ -1150,6 +1156,7 @@ config_host_data.set('CONFIG_LIBATTR', have_old_libattr)
> config_host_data.set('CONFIG_LIBCAP_NG', libcap_ng.found())
> config_host_data.set('CONFIG_EBPF', libbpf.found())
> config_host_data.set('CONFIG_LIBISCSI', libiscsi.found())
> +config_host_data.set('CONFIG_LIBNVMF', libnvmf.found())
> config_host_data.set('CONFIG_LIBNFS', libnfs.found())
> config_host_data.set('CONFIG_RBD', rbd.found())
> config_host_data.set('CONFIG_SDL', sdl.found())
> @@ -2736,6 +2743,7 @@ summary_info += {'usb net redir': config_host.has_key('CONFIG_USB_REDIR')}
> summary_info += {'OpenGL support': config_host.has_key('CONFIG_OPENGL')}
> summary_info += {'GBM': config_host.has_key('CONFIG_GBM')}
> summary_info += {'libiscsi support': libiscsi.found()}
> +summary_info += {'libnvmf support': libnvmf.found()}
> summary_info += {'libnfs support': libnfs.found()}
> if targetos == 'windows'
> if config_host.has_key('CONFIG_GUEST_AGENT')
> diff --git a/meson_options.txt b/meson_options.txt
> index 3d304cac96..102705824f 100644
> --- a/meson_options.txt
> +++ b/meson_options.txt
> @@ -68,6 +68,8 @@ option('glusterfs', type : 'feature', value : 'auto',
> description: 'Glusterfs block device driver')
> option('libiscsi', type : 'feature', value : 'auto',
> description: 'libiscsi userspace initiator')
> +option('libnvmf', type : 'feature', value : 'auto',
> + description: 'NVMe over Fabric userspace initiator')
> option('libnfs', type : 'feature', value : 'auto',
> description: 'libnfs block device driver')
> option('mpath', type : 'feature', value : 'auto',
> --
> 2.25.1
>