at 4:13 AM, Klaus Jensen <i...@irrelevant.dk> wrote:

> On Jun 27 18:48, Jinhao Fan wrote:
>> Add property "ioeventfd" which is enabled by default. When this is
>> enabled, updates on the doorbell registers will cause KVM to signal
>> an event to the QEMU main loop to handle the doorbell updates.
>> Therefore, instead of letting the vcpu thread run both guest VM and
>> IO emulation, we now use the main loop thread to do IO emulation and
>> thus the vcpu thread has more cycles for the guest VM.
>> 
>> Since ioeventfd does not tell us the exact value that is written, it is
>> only useful when shadow doorbell buffer is enabled, where we check
>> for the value in the shadow doorbell buffer when we get the doorbell
>> update event.
>> 
>> IOPS comparison on Linux 5.19-rc2: (Unit: KIOPS)
>> 
>> qd           1   4  16  64
>> qemu        35 121 176 153
>> ioeventfd   41 133 258 313
>> 
>> Signed-off-by: Jinhao Fan <fanjinhao...@ict.ac.cn>
>> ---
>> hw/nvme/ctrl.c | 97 +++++++++++++++++++++++++++++++++++++++++++++++++-
>> hw/nvme/nvme.h |  5 +++
>> 2 files changed, 101 insertions(+), 1 deletion(-)
>> 
>> diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
>> index c952c34f94..787b89f7d3 100644
>> --- a/hw/nvme/ctrl.c
>> +++ b/hw/nvme/ctrl.c
>> @@ -1374,7 +1374,14 @@ static void nvme_enqueue_req_completion(NvmeCQueue 
>> *cq, NvmeRequest *req)
>> 
>>     QTAILQ_REMOVE(&req->sq->out_req_list, req, entry);
>>     QTAILQ_INSERT_TAIL(&cq->req_list, req, entry);
>> -    timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
>> +
>> +    if (req->sq->ioeventfd_enabled) {
>> +        /* Post CQE directly since we are in main loop thread */
>> +        nvme_post_cqes(cq);
>> +    } else {
>> +        /* Schedule the timer to post CQE later since we are in vcpu thread 
>> */
>> +        timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
>> +    }
>> }
>> 
>> static void nvme_process_aers(void *opaque)
>> @@ -4195,10 +4202,74 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest 
>> *req)
>>     return NVME_INVALID_OPCODE | NVME_DNR;
>> }
>> 
>> +static void nvme_cq_notifier(EventNotifier *e)
>> +{
>> +    NvmeCQueue *cq = container_of(e, NvmeCQueue, notifier);
>> +    NvmeCtrl *n = cq->ctrl;
>> +
>> +    event_notifier_test_and_clear(&cq->notifier);
>> +
>> +    nvme_update_cq_head(cq);
>> +
>> +    if (cq->tail == cq->head) {
>> +        if (cq->irq_enabled) {
>> +            n->cq_pending--;
>> +        }
>> +
>> +        nvme_irq_deassert(n, cq);
>> +    }
>> +
>> +    nvme_post_cqes(cq);
>> +}
>> +
>> +static int nvme_init_cq_ioeventfd(NvmeCQueue *cq)
>> +{
>> +    NvmeCtrl *n = cq->ctrl;
>> +    uint16_t offset = (cq->cqid << 3) + (1 << 2);
>> +    int ret;
>> +
>> +    if ((ret = event_notifier_init(&cq->notifier, 0))) {
>> +        return ret;
>> +    }
> 
> Dont assign in conditionals and rely on the implicit value. It's too
> error prone. Split into
> 
>  ret = event_notifier_init(&cq->notifier, 0);
>  if (ret < 0) {
>    return ret;
>  }
> 
>> +
>> +    event_notifier_set_handler(&cq->notifier, nvme_cq_notifier);
>> +    memory_region_add_eventfd(&n->iomem,
>> +                              0x1000 + offset, 4, false, 0, &cq->notifier);
>> +    
>> +    return 0;
>> +}
>> +
>> +static void nvme_sq_notifier(EventNotifier *e)
>> +{
>> +    NvmeSQueue *sq = container_of(e, NvmeSQueue, notifier);
>> +
>> +    event_notifier_test_and_clear(&sq->notifier);
>> +
>> +    nvme_process_sq(sq);
>> +}
>> +
>> +static int nvme_init_sq_ioeventfd(NvmeSQueue *sq)
>> +{
>> +    NvmeCtrl *n = sq->ctrl;
>> +    uint16_t offset = sq->sqid << 3;
>> +    int ret;
>> +
>> +    if ((ret = event_notifier_init(&sq->notifier, 0))) {
>> +        return ret;
>> +    }
> 
> Same as above.
> 
>> +
>> +    event_notifier_set_handler(&sq->notifier, nvme_sq_notifier);
>> +    memory_region_add_eventfd(&n->iomem,
>> +                              0x1000 + offset, 4, false, 0, &sq->notifier);
>> +
>> +    return 0;
>> +}
>> +
>> static void nvme_free_sq(NvmeSQueue *sq, NvmeCtrl *n)
>> {
>>     n->sq[sq->sqid] = NULL;
>>     timer_free(sq->timer);
>> +    event_notifier_cleanup(&sq->notifier);
>>     g_free(sq->io_req);
>>     if (sq->sqid) {
>>         g_free(sq);
>> @@ -4250,6 +4321,7 @@ static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, 
>> uint64_t dma_addr,
>>                          uint16_t sqid, uint16_t cqid, uint16_t size)
>> {
>>     int i;
>> +    int ret;
>>     NvmeCQueue *cq;
>> 
>>     sq->ctrl = n;
>> @@ -4271,6 +4343,11 @@ static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, 
>> uint64_t dma_addr,
>>     if (n->dbbuf_enabled) {
>>         sq->db_addr = n->dbbuf_dbs + (sqid << 3);
>>         sq->ei_addr = n->dbbuf_eis + (sqid << 3);
>> +            
>> +        if (n->params.ioeventfd && sq->sqid != 0) {
>> +            ret = nvme_init_sq_ioeventfd(sq);
>> +            sq->ioeventfd_enabled = ret == 0;
>> +        }
> 
> Not using ret for anything really, so
> 
>  if (!nvme_init_sq_ioeventfd(sq)) {
>    sq->ioeventfd_enabled = true;
>  }
> 
> should do.
> 
>>     }
>> 
>>     assert(n->cq[cqid]);
>> @@ -4577,6 +4654,7 @@ static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n)
>> {
>>     n->cq[cq->cqid] = NULL;
>>     timer_free(cq->timer);
>> +    event_notifier_cleanup(&cq->notifier);
>>     if (msix_enabled(&n->parent_obj)) {
>>         msix_vector_unuse(&n->parent_obj, cq->vector);
>>     }
>> @@ -4635,6 +4713,11 @@ static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, 
>> uint64_t dma_addr,
>>     if (n->dbbuf_enabled) {
>>         cq->db_addr = n->dbbuf_dbs + (cqid << 3) + (1 << 2);
>>         cq->ei_addr = n->dbbuf_eis + (cqid << 3) + (1 << 2);
>> +
>> +        if (n->params.ioeventfd && cqid != 0) {
>> +            ret = nvme_init_cq_ioeventfd(cq);
>> +            cq->ioeventfd_enabled = ret == 0;
>> +        }
>>     }
>>     n->cq[cqid] = cq;
>>     cq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_post_cqes, cq);
>> @@ -5793,6 +5876,7 @@ static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const 
>> NvmeRequest *req)
>>     uint64_t dbs_addr = le64_to_cpu(req->cmd.dptr.prp1);
>>     uint64_t eis_addr = le64_to_cpu(req->cmd.dptr.prp2);
>>     int i;
>> +    int ret;
>> 
>>     /* Address should be page aligned */
>>     if (dbs_addr & (n->page_size - 1) || eis_addr & (n->page_size - 1)) {
>> @@ -5818,6 +5902,11 @@ static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const 
>> NvmeRequest *req)
>>             sq->ei_addr = eis_addr + (i << 3);
>>             pci_dma_write(&n->parent_obj, sq->db_addr, &sq->tail,
>>                     sizeof(sq->tail));
>> +            
>> +            if (n->params.ioeventfd && sq->sqid != 0) {
>> +                ret = nvme_init_sq_ioeventfd(sq);
>> +                sq->ioeventfd_enabled = ret == 0;
>> +            }
> 
> Same as above.
> 
>>         }
>> 
>>         if (cq) {
>> @@ -5826,6 +5915,11 @@ static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const 
>> NvmeRequest *req)
>>             cq->ei_addr = eis_addr + (i << 3) + (1 << 2);
>>             pci_dma_write(&n->parent_obj, cq->db_addr, &cq->head,
>>                     sizeof(cq->head));
>> +            
>> +            if (n->params.ioeventfd && cq->cqid != 0) {
>> +                ret = nvme_init_cq_ioeventfd(cq);
>> +                cq->ioeventfd_enabled = ret == 0;
>> +            }
>>         }
>>     }
>> 
>> @@ -7040,6 +7134,7 @@ static Property nvme_props[] = {
>>     DEFINE_PROP_UINT8("zoned.zasl", NvmeCtrl, params.zasl, 0),
>>     DEFINE_PROP_BOOL("zoned.auto_transition", NvmeCtrl,
>>                      params.auto_transition_zones, true),
>> +    DEFINE_PROP_BOOL("ioeventfd", NvmeCtrl, params.ioeventfd, true),
>>     DEFINE_PROP_END_OF_LIST(),
>> };
>> 
>> diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
>> index 4452e4b1bf..2a9beea0c8 100644
>> --- a/hw/nvme/nvme.h
>> +++ b/hw/nvme/nvme.h
>> @@ -369,6 +369,8 @@ typedef struct NvmeSQueue {
>>     uint64_t    db_addr;
>>     uint64_t    ei_addr;
>>     QEMUTimer   *timer;
>> +    EventNotifier notifier;
>> +    bool        ioeventfd_enabled;
>>     NvmeRequest *io_req;
>>     QTAILQ_HEAD(, NvmeRequest) req_list;
>>     QTAILQ_HEAD(, NvmeRequest) out_req_list;
>> @@ -388,6 +390,8 @@ typedef struct NvmeCQueue {
>>     uint64_t    db_addr;
>>     uint64_t    ei_addr;
>>     QEMUTimer   *timer;
>> +    EventNotifier notifier;
>> +    bool        ioeventfd_enabled;
>>     QTAILQ_HEAD(, NvmeSQueue) sq_list;
>>     QTAILQ_HEAD(, NvmeRequest) req_list;
>> } NvmeCQueue;
>> @@ -410,6 +414,7 @@ typedef struct NvmeParams {
>>     uint8_t  zasl;
>>     bool     auto_transition_zones;
>>     bool     legacy_cmb;
>> +    bool     ioeventfd;
>> } NvmeParams;
>> 
>> typedef struct NvmeCtrl {
>> -- 
>> 2.25.1
> 
> Otherwise, looks good!

Hi Klaus,

Thanks for the comments. I will post v2 soon.

Reply via email to