Add an option "iothread=x" to do emulation in a separate iothread. This improves performance because QEMU's main loop is responsible for a lot of other work, while the iothread is dedicated to NVMe emulation. Moreover, emulating in an iothread creates the potential for polling on SQ/CQ doorbells, which I will bring up in a follow-up patch.
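As context for the minimal enabling lines below, a complete invocation with a backing drive could look like the following; the drive file, drive id and serial number are illustrative placeholders and not part of this patch, only the iothread object and the controller's iothread= property are new:

  qemu-system-x86_64 \
      -object iothread,id=nvme0 \
      -drive file=nvme.img,if=none,format=raw,id=nvm0 \
      -device nvme,serial=deadbeef,drive=nvm0,iothread=nvme0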
Iothread can be enabled by:
 -object iothread,id=nvme0 \
 -device nvme,iothread=nvme0 \

Performance comparisons (KIOPS):

QD         1    4   16   64
QEMU      41  136  242  338
iothread  53  155  245  309

Signed-off-by: Jinhao Fan <fanjinhao...@ict.ac.cn>
---
 hw/nvme/ctrl.c | 67 ++++++++++++++++++++++++++++++++++++++++++++------
 hw/nvme/ns.c   | 21 +++++++++++++---
 hw/nvme/nvme.h |  6 ++++-
 3 files changed, 82 insertions(+), 12 deletions(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index e11328967f..869565d77b 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -4458,7 +4458,13 @@ static int nvme_init_cq_ioeventfd(NvmeCQueue *cq)
         return ret;
     }
 
-    event_notifier_set_handler(&cq->notifier, nvme_cq_notifier);
+    if (cq->cqid) {
+        aio_set_event_notifier(n->ctx, &cq->notifier, true, nvme_cq_notifier,
+                               NULL, NULL);
+    } else {
+        event_notifier_set_handler(&cq->notifier, nvme_cq_notifier);
+    }
+
     memory_region_add_eventfd(&n->iomem, 0x1000 + offset, 4, false, 0,
                               &cq->notifier);
 
@@ -4487,7 +4493,13 @@ static int nvme_init_sq_ioeventfd(NvmeSQueue *sq)
         return ret;
     }
 
-    event_notifier_set_handler(&sq->notifier, nvme_sq_notifier);
+    if (sq->sqid) {
+        aio_set_event_notifier(n->ctx, &sq->notifier, true, nvme_sq_notifier,
+                               NULL, NULL);
+    } else {
+        event_notifier_set_handler(&sq->notifier, nvme_sq_notifier);
+    }
+
     memory_region_add_eventfd(&n->iomem, 0x1000 + offset, 4, false, 0,
                               &sq->notifier);
 
@@ -4503,7 +4515,12 @@ static void nvme_free_sq(NvmeSQueue *sq, NvmeCtrl *n)
     if (sq->ioeventfd_enabled) {
         memory_region_del_eventfd(&n->iomem,
                                   0x1000 + offset, 4, false, 0, &sq->notifier);
-        event_notifier_set_handler(&sq->notifier, NULL);
+        if (sq->sqid) {
+            aio_set_event_notifier(n->ctx, &sq->notifier, true, NULL, NULL,
+                                   NULL);
+        } else {
+            event_notifier_set_handler(&sq->notifier, NULL);
+        }
         event_notifier_cleanup(&sq->notifier);
     }
     g_free(sq->io_req);
@@ -4573,7 +4590,13 @@ static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
         sq->io_req[i].sq = sq;
         QTAILQ_INSERT_TAIL(&(sq->req_list), &sq->io_req[i], entry);
     }
-    sq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_process_sq, sq);
+
+    if (sq->sqid) {
+        sq->timer = aio_timer_new(n->ctx, QEMU_CLOCK_VIRTUAL, SCALE_NS,
+                                  nvme_process_sq, sq);
+    } else {
+        sq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_process_sq, sq);
+    }
 
     if (n->dbbuf_enabled) {
         sq->db_addr = n->dbbuf_dbs + (sqid << 3);
@@ -4896,7 +4919,12 @@ static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n)
     if (cq->ioeventfd_enabled) {
         memory_region_del_eventfd(&n->iomem,
                                   0x1000 + offset, 4, false, 0, &cq->notifier);
-        event_notifier_set_handler(&cq->notifier, NULL);
+        if (cq->cqid) {
+            aio_set_event_notifier(n->ctx, &cq->notifier, true, NULL, NULL,
+                                   NULL);
+        } else {
+            event_notifier_set_handler(&cq->notifier, NULL);
+        }
         event_notifier_cleanup(&cq->notifier);
     }
     if (cq->assert_notifier.initialized) {
@@ -4979,7 +5007,13 @@ static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr,
         }
     }
     n->cq[cqid] = cq;
-    cq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_post_cqes, cq);
+
+    if (cq->cqid) {
+        cq->timer = aio_timer_new(n->ctx, QEMU_CLOCK_VIRTUAL, SCALE_NS,
+                                  nvme_post_cqes, cq);
+    } else {
+        cq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_post_cqes, cq);
+    }
 
     /*
      * Only enable irq eventfd for IO queues since we always emulate admin
@@ -7759,6 +7793,14 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
     if (pci_is_vf(&n->parent_obj) && !sctrl->scs) {
         stl_le_p(&n->bar.csts, NVME_CSTS_FAILED);
     }
+
+    if (n->params.iothread) {
+        n->iothread = n->params.iothread;
+        object_ref(OBJECT(n->iothread));
+        n->ctx = iothread_get_aio_context(n->iothread);
+    } else {
+        n->ctx = qemu_get_aio_context();
+    }
 }
 
 static int nvme_init_subsys(NvmeCtrl *n, Error **errp)
@@ -7831,7 +7873,7 @@ static void nvme_realize(PCIDevice *pci_dev, Error **errp)
         ns = &n->namespace;
         ns->params.nsid = 1;
 
-        if (nvme_ns_setup(ns, errp)) {
+        if (nvme_ns_setup(ns, n->ctx, errp)) {
             return;
         }
 
@@ -7862,6 +7904,15 @@ static void nvme_exit(PCIDevice *pci_dev)
     g_free(n->sq);
     g_free(n->aer_reqs);
 
+    aio_context_acquire(n->ctx);
+    blk_set_aio_context(n->namespace.blkconf.blk, qemu_get_aio_context(), NULL);
+    aio_context_release(n->ctx);
+
+    if (n->iothread) {
+        object_unref(OBJECT(n->iothread));
+        n->iothread = NULL;
+    }
+
     if (n->params.cmb_size_mb) {
         g_free(n->cmb.buf);
     }
@@ -7885,6 +7936,8 @@ static Property nvme_props[] = {
                      HostMemoryBackend *),
     DEFINE_PROP_LINK("subsys", NvmeCtrl, subsys, TYPE_NVME_SUBSYS,
                      NvmeSubsystem *),
+    DEFINE_PROP_LINK("iothread", NvmeCtrl, params.iothread, TYPE_IOTHREAD,
+                     IOThread *),
     DEFINE_PROP_STRING("serial", NvmeCtrl, params.serial),
     DEFINE_PROP_UINT32("cmb_size_mb", NvmeCtrl, params.cmb_size_mb, 0),
     DEFINE_PROP_UINT32("num_queues", NvmeCtrl, params.num_queues, 0),
diff --git a/hw/nvme/ns.c b/hw/nvme/ns.c
index 62a1f97be0..eb9141a67b 100644
--- a/hw/nvme/ns.c
+++ b/hw/nvme/ns.c
@@ -146,9 +146,11 @@ lbaf_found:
     return 0;
 }
 
-static int nvme_ns_init_blk(NvmeNamespace *ns, Error **errp)
+static int nvme_ns_init_blk(NvmeNamespace *ns, AioContext *ctx, Error **errp)
 {
     bool read_only;
+    AioContext *old_context;
+    int ret;
     if (!blkconf_blocksizes(&ns->blkconf, errp)) {
         return -1;
     }
@@ -170,6 +172,17 @@ static int nvme_ns_init_blk(NvmeNamespace *ns, Error **errp)
         return -1;
     }
 
+    old_context = blk_get_aio_context(ns->blkconf.blk);
+    aio_context_acquire(old_context);
+    ret = blk_set_aio_context(ns->blkconf.blk, ctx, errp);
+    aio_context_release(old_context);
+
+    if (ret) {
+        error_setg(errp, "Set AioContext on BlockBackend failed");
+        return ret;
+    }
+
     return 0;
 }
 
@@ -482,13 +495,13 @@ static int nvme_ns_check_constraints(NvmeNamespace *ns, Error **errp)
     return 0;
 }
 
-int nvme_ns_setup(NvmeNamespace *ns, Error **errp)
+int nvme_ns_setup(NvmeNamespace *ns, AioContext *ctx, Error **errp)
 {
     if (nvme_ns_check_constraints(ns, errp)) {
         return -1;
     }
 
-    if (nvme_ns_init_blk(ns, errp)) {
+    if (nvme_ns_init_blk(ns, ctx, errp)) {
         return -1;
     }
 
@@ -563,7 +576,7 @@ static void nvme_ns_realize(DeviceState *dev, Error **errp)
         }
     }
 
-    if (nvme_ns_setup(ns, errp)) {
+    if (nvme_ns_setup(ns, n->ctx, errp)) {
         return;
     }
 
diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
index b0b986b024..224b73e6c4 100644
--- a/hw/nvme/nvme.h
+++ b/hw/nvme/nvme.h
@@ -22,6 +22,7 @@
 #include "hw/pci/pci.h"
 #include "hw/pci/msi.h"
 #include "hw/block/block.h"
+#include "sysemu/iothread.h"
 
 #include "block/nvme.h"
 
@@ -276,7 +277,7 @@ static inline void nvme_aor_dec_active(NvmeNamespace *ns)
 }
 
 void nvme_ns_init_format(NvmeNamespace *ns);
-int nvme_ns_setup(NvmeNamespace *ns, Error **errp);
+int nvme_ns_setup(NvmeNamespace *ns, AioContext *ctx, Error **errp);
 void nvme_ns_drain(NvmeNamespace *ns);
 void nvme_ns_shutdown(NvmeNamespace *ns);
 void nvme_ns_cleanup(NvmeNamespace *ns);
@@ -433,6 +434,7 @@ typedef struct NvmeParams {
     uint16_t sriov_vi_flexible;
     uint8_t sriov_max_vq_per_vf;
    uint8_t sriov_max_vi_per_vf;
+    IOThread *iothread;
 } NvmeParams;
 
 typedef struct NvmeCtrl {
@@ -464,6 +466,8 @@ typedef struct NvmeCtrl {
     uint64_t dbbuf_dbs;
     uint64_t dbbuf_eis;
     bool dbbuf_enabled;
+    IOThread *iothread;
+    AioContext *ctx;
 
     struct {
         MemoryRegion mem;
-- 
2.25.1