at 4:13 AM, Klaus Jensen <i...@irrelevant.dk> wrote:

> On Jun 27 18:48, Jinhao Fan wrote:
>> Add property "ioeventfd" which is enabled by default. When this is
>> enabled, updates on the doorbell registers will cause KVM to signal
>> an event to the QEMU main loop to handle the doorbell updates.
>> Therefore, instead of letting the vcpu thread run both guest VM and
>> IO emulation, we now use the main loop thread to do IO emulation and
>> thus the vcpu thread has more cycles for the guest VM.
>>
>> Since ioeventfd does not tell us the exact value that is written, it is
>> only useful when shadow doorbell buffer is enabled, where we check
>> for the value in the shadow doorbell buffer when we get the doorbell
>> update event.
>>
>> IOPS comparison on Linux 5.19-rc2: (Unit: KIOPS)
>>
>> qd          1    4   16   64
>> qemu       35  121  176  153
>> ioeventfd  41  133  258  313
>>
>> Signed-off-by: Jinhao Fan <fanjinhao...@ict.ac.cn>
>> ---
>>  hw/nvme/ctrl.c | 97 +++++++++++++++++++++++++++++++++++++++++++++++++-
>>  hw/nvme/nvme.h |  5 +++
>>  2 files changed, 101 insertions(+), 1 deletion(-)
>>
>> diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
>> index c952c34f94..787b89f7d3 100644
>> --- a/hw/nvme/ctrl.c
>> +++ b/hw/nvme/ctrl.c
>> @@ -1374,7 +1374,14 @@ static void nvme_enqueue_req_completion(NvmeCQueue *cq, NvmeRequest *req)
>>
>>      QTAILQ_REMOVE(&req->sq->out_req_list, req, entry);
>>      QTAILQ_INSERT_TAIL(&cq->req_list, req, entry);
>> -    timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
>> +
>> +    if (req->sq->ioeventfd_enabled) {
>> +        /* Post CQE directly since we are in main loop thread */
>> +        nvme_post_cqes(cq);
>> +    } else {
>> +        /* Schedule the timer to post CQE later since we are in vcpu thread */
>> +        timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
>> +    }
>>  }
>>
>>  static void nvme_process_aers(void *opaque)
>> @@ -4195,10 +4202,74 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
>>      return NVME_INVALID_OPCODE | NVME_DNR;
>>  }
>>
>> +static void nvme_cq_notifier(EventNotifier *e)
>> +{
>> +    NvmeCQueue *cq = container_of(e, NvmeCQueue, notifier);
>> +    NvmeCtrl *n = cq->ctrl;
>> +
>> +    event_notifier_test_and_clear(&cq->notifier);
>> +
>> +    nvme_update_cq_head(cq);
>> +
>> +    if (cq->tail == cq->head) {
>> +        if (cq->irq_enabled) {
>> +            n->cq_pending--;
>> +        }
>> +
>> +        nvme_irq_deassert(n, cq);
>> +    }
>> +
>> +    nvme_post_cqes(cq);
>> +}
>> +
>> +static int nvme_init_cq_ioeventfd(NvmeCQueue *cq)
>> +{
>> +    NvmeCtrl *n = cq->ctrl;
>> +    uint16_t offset = (cq->cqid << 3) + (1 << 2);
>> +    int ret;
>> +
>> +    if ((ret = event_notifier_init(&cq->notifier, 0))) {
>> +        return ret;
>> +    }
>
> Don't assign in conditionals and rely on the implicit value. It's too
> error prone. Split into
>
>   ret = event_notifier_init(&cq->notifier, 0);
>   if (ret < 0) {
>       return ret;
>   }
>
>> +
>> +    event_notifier_set_handler(&cq->notifier, nvme_cq_notifier);
>> +    memory_region_add_eventfd(&n->iomem,
>> +                              0x1000 + offset, 4, false, 0, &cq->notifier);
>> +
>> +    return 0;
>> +}
>> +
>> +static void nvme_sq_notifier(EventNotifier *e)
>> +{
>> +    NvmeSQueue *sq = container_of(e, NvmeSQueue, notifier);
>> +
>> +    event_notifier_test_and_clear(&sq->notifier);
>> +
>> +    nvme_process_sq(sq);
>> +}
>> +
>> +static int nvme_init_sq_ioeventfd(NvmeSQueue *sq)
>> +{
>> +    NvmeCtrl *n = sq->ctrl;
>> +    uint16_t offset = sq->sqid << 3;
>> +    int ret;
>> +
>> +    if ((ret = event_notifier_init(&sq->notifier, 0))) {
>> +        return ret;
>> +    }
>
> Same as above.
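
Will fix. I plan to do the same split for the sq variant in v2; roughly
(untested sketch, same shape as your cq example above):

    ret = event_notifier_init(&sq->notifier, 0);
    if (ret < 0) {
        return ret;
    }
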
>
>> +
>> +    event_notifier_set_handler(&sq->notifier, nvme_sq_notifier);
>> +    memory_region_add_eventfd(&n->iomem,
>> +                              0x1000 + offset, 4, false, 0, &sq->notifier);
>> +
>> +    return 0;
>> +}
>> +
>>  static void nvme_free_sq(NvmeSQueue *sq, NvmeCtrl *n)
>>  {
>>      n->sq[sq->sqid] = NULL;
>>      timer_free(sq->timer);
>> +    event_notifier_cleanup(&sq->notifier);
>>      g_free(sq->io_req);
>>      if (sq->sqid) {
>>          g_free(sq);
>> @@ -4250,6 +4321,7 @@ static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
>>                           uint16_t sqid, uint16_t cqid, uint16_t size)
>>  {
>>      int i;
>> +    int ret;
>>      NvmeCQueue *cq;
>>
>>      sq->ctrl = n;
>> @@ -4271,6 +4343,11 @@ static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
>>      if (n->dbbuf_enabled) {
>>          sq->db_addr = n->dbbuf_dbs + (sqid << 3);
>>          sq->ei_addr = n->dbbuf_eis + (sqid << 3);
>> +
>> +        if (n->params.ioeventfd && sq->sqid != 0) {
>> +            ret = nvme_init_sq_ioeventfd(sq);
>> +            sq->ioeventfd_enabled = ret == 0;
>> +        }
>
> Not using ret for anything really, so
>
>   if (!nvme_init_sq_ioeventfd(sq)) {
>       sq->ioeventfd_enabled = true;
>   }
>
> should do.
>
>>      }
>>
>>      assert(n->cq[cqid]);
>> @@ -4577,6 +4654,7 @@ static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n)
>>  {
>>      n->cq[cq->cqid] = NULL;
>>      timer_free(cq->timer);
>> +    event_notifier_cleanup(&cq->notifier);
>>      if (msix_enabled(&n->parent_obj)) {
>>          msix_vector_unuse(&n->parent_obj, cq->vector);
>>      }
>> @@ -4635,6 +4713,11 @@ static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr,
>>      if (n->dbbuf_enabled) {
>>          cq->db_addr = n->dbbuf_dbs + (cqid << 3) + (1 << 2);
>>          cq->ei_addr = n->dbbuf_eis + (cqid << 3) + (1 << 2);
>> +
>> +        if (n->params.ioeventfd && cqid != 0) {
>> +            ret = nvme_init_cq_ioeventfd(cq);
>> +            cq->ioeventfd_enabled = ret == 0;
>> +        }
>>      }
>>      n->cq[cqid] = cq;
>>      cq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_post_cqes, cq);
>> @@ -5793,6 +5876,7 @@ static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const NvmeRequest *req)
>>      uint64_t dbs_addr = le64_to_cpu(req->cmd.dptr.prp1);
>>      uint64_t eis_addr = le64_to_cpu(req->cmd.dptr.prp2);
>>      int i;
>> +    int ret;
>>
>>      /* Address should be page aligned */
>>      if (dbs_addr & (n->page_size - 1) || eis_addr & (n->page_size - 1)) {
>> @@ -5818,6 +5902,11 @@ static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const NvmeRequest *req)
>>              sq->ei_addr = eis_addr + (i << 3);
>>              pci_dma_write(&n->parent_obj, sq->db_addr, &sq->tail,
>>                            sizeof(sq->tail));
>> +
>> +            if (n->params.ioeventfd && sq->sqid != 0) {
>> +                ret = nvme_init_sq_ioeventfd(sq);
>> +                sq->ioeventfd_enabled = ret == 0;
>> +            }
>
> Same as above.
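
Agreed, I will drop ret at these call sites as well and check the return
value directly. Something like this for the sq case in nvme_dbbuf_config
(sketch only, not yet tested):

    if (n->params.ioeventfd && sq->sqid != 0) {
        if (!nvme_init_sq_ioeventfd(sq)) {
            sq->ioeventfd_enabled = true;
        }
    }
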
>
>>          }
>>
>>          if (cq) {
>> @@ -5826,6 +5915,11 @@ static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const NvmeRequest *req)
>>              cq->ei_addr = eis_addr + (i << 3) + (1 << 2);
>>              pci_dma_write(&n->parent_obj, cq->db_addr, &cq->head,
>>                            sizeof(cq->head));
>> +
>> +            if (n->params.ioeventfd && cq->cqid != 0) {
>> +                ret = nvme_init_cq_ioeventfd(cq);
>> +                cq->ioeventfd_enabled = ret == 0;
>> +            }
>>          }
>>      }
>>
>> @@ -7040,6 +7134,7 @@ static Property nvme_props[] = {
>>      DEFINE_PROP_UINT8("zoned.zasl", NvmeCtrl, params.zasl, 0),
>>      DEFINE_PROP_BOOL("zoned.auto_transition", NvmeCtrl,
>>                       params.auto_transition_zones, true),
>> +    DEFINE_PROP_BOOL("ioeventfd", NvmeCtrl, params.ioeventfd, true),
>>      DEFINE_PROP_END_OF_LIST(),
>>  };
>>
>> diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
>> index 4452e4b1bf..2a9beea0c8 100644
>> --- a/hw/nvme/nvme.h
>> +++ b/hw/nvme/nvme.h
>> @@ -369,6 +369,8 @@ typedef struct NvmeSQueue {
>>      uint64_t db_addr;
>>      uint64_t ei_addr;
>>      QEMUTimer *timer;
>> +    EventNotifier notifier;
>> +    bool ioeventfd_enabled;
>>      NvmeRequest *io_req;
>>      QTAILQ_HEAD(, NvmeRequest) req_list;
>>      QTAILQ_HEAD(, NvmeRequest) out_req_list;
>> @@ -388,6 +390,8 @@ typedef struct NvmeCQueue {
>>      uint64_t db_addr;
>>      uint64_t ei_addr;
>>      QEMUTimer *timer;
>> +    EventNotifier notifier;
>> +    bool ioeventfd_enabled;
>>      QTAILQ_HEAD(, NvmeSQueue) sq_list;
>>      QTAILQ_HEAD(, NvmeRequest) req_list;
>>  } NvmeCQueue;
>> @@ -410,6 +414,7 @@ typedef struct NvmeParams {
>>      uint8_t zasl;
>>      bool auto_transition_zones;
>>      bool legacy_cmb;
>> +    bool ioeventfd;
>>  } NvmeParams;
>>
>>  typedef struct NvmeCtrl {
>> --
>> 2.25.1
>
> Otherwise, looks good!
Hi Klaus,

Thanks for the comments. I will post v2 soon.
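
For reference, this is roughly how I expect the cq-side helper to end up
after addressing both comments (the sq side would mirror it). It is only a
sketch based on your suggestions and is still untested:

    static int nvme_init_cq_ioeventfd(NvmeCQueue *cq)
    {
        NvmeCtrl *n = cq->ctrl;
        uint16_t offset = (cq->cqid << 3) + (1 << 2);
        int ret;

        /* assignment split out of the conditional, as suggested */
        ret = event_notifier_init(&cq->notifier, 0);
        if (ret < 0) {
            return ret;
        }

        event_notifier_set_handler(&cq->notifier, nvme_cq_notifier);
        memory_region_add_eventfd(&n->iomem,
                                  0x1000 + offset, 4, false, 0, &cq->notifier);

        return 0;
    }

The callers would then just check the return value, e.g.
if (!nvme_init_cq_ioeventfd(cq)) { cq->ioeventfd_enabled = true; }. Since the
property defaults to on, the old timer-only behaviour should still be
reachable with ioeventfd=off on the device command line.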