Keith Busch <keith.bu...@intel.com> writes:

> This seems like a silly feature, but the spec allows it, so this adds
> support for queues that are not physically contiguous.
>
> Signed-off-by: Keith Busch <keith.bu...@intel.com>
> ---
>  hw/nvme.c |  143 +++++++++++++++++++++++++++++++++++++++++++++++++-----------
>  hw/nvme.h |    2 +
>  2 files changed, 118 insertions(+), 27 deletions(-)
>
> diff --git a/hw/nvme.c b/hw/nvme.c
> index 3e5cfc0..2945406 100644
> --- a/hw/nvme.c
> +++ b/hw/nvme.c
> @@ -110,6 +110,42 @@ static void nvme_isr_notify(NvmeCtrl *n, NvmeCQueue *cq)
>      }
>  }
>
> +static uint64_t *nvme_setup_discontig(NvmeCtrl *n, uint64_t prp_addr,
> +    uint16_t queue_depth, uint16_t entry_size)
> +{
> +    int i;
> +    uint16_t prps_per_page = n->page_size / sizeof(uint64_t);
> +    uint64_t prp[prps_per_page];
> +    uint16_t total_prps = DIV_ROUND_UP(queue_depth * entry_size,
> +        n->page_size);
> +    uint64_t *prp_list = g_malloc0(total_prps * sizeof(*prp_list));
> +
> +    for (i = 0; i < total_prps; i++) {
> +        if (i % prps_per_page == 0 && i < total_prps - 1) {
> +            if (!prp_addr || prp_addr & (n->page_size - 1)) {
> +                g_free(prp_list);
> +                return NULL;
> +            }
> +            pci_dma_read(&n->dev, prp_addr, (void *)&prp, sizeof(prp));
> +            prp_addr = prp[prps_per_page - 1];

This doesn't appear to be endian safe.
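Something along these lines should keep it host-endian (untested sketch, and
the helper name is made up; le64_to_cpu() is the usual QEMU byte-swap helper,
and the spec defines PRP entries as little-endian):

    /* Hypothetical helper: read one page worth of PRP entries and convert
     * each entry from the spec's little-endian to host byte order, so both
     * the next-page pointer and the copied prp_list[] end up host-endian. */
    static void nvme_read_prp_page(NvmeCtrl *n, uint64_t prp_addr,
        uint64_t *prp, uint16_t prps_per_page)
    {
        int j;

        pci_dma_read(&n->dev, prp_addr, (void *)prp,
            prps_per_page * sizeof(*prp));
        for (j = 0; j < prps_per_page; j++) {
            prp[j] = le64_to_cpu(prp[j]);
        }
    }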
Regards,

Anthony Liguori

> +        }
> +        prp_list[i] = prp[i % prps_per_page];
> +        if (!prp_list[i] || prp_list[i] & (n->page_size - 1)) {
> +            g_free(prp_list);
> +            return NULL;
> +        }
> +    }
> +    return prp_list;
> +}
> +
> +static hwaddr nvme_discontig(uint64_t *dma_addr, uint16_t queue_idx,
> +    uint16_t page_size, uint16_t entry_size)
> +{
> +    uint16_t entries_per_page = page_size / entry_size;
> +    uint16_t prp_index = queue_idx / entries_per_page;
> +    uint16_t index_in_prp = queue_idx % entries_per_page;
> +    return dma_addr[prp_index] + index_in_prp * entry_size;
> +}
> +
>  static uint16_t nvme_map_prp(QEMUSGList *qsg, uint64_t prp1, uint64_t prp2,
>      uint32_t len, NvmeCtrl *n)
>  {
> @@ -212,7 +248,12 @@ static void nvme_post_cqes(void *opaque)
>          req->cqe.status |= cq->phase;
>          req->cqe.sq_id = sq->id;
>          req->cqe.sq_head = sq->head;
> -        addr = cq->dma_addr + cq->tail * n->cqe_size;
> +        if (cq->phys_contig) {
> +            addr = cq->dma_addr + cq->tail * n->cqe_size;
> +        } else {
> +            addr = nvme_discontig(cq->prp_list, cq->tail, n->page_size,
> +                n->cqe_size);
> +        }
>          nvme_inc_cq_tail(cq);
>          pci_dma_write(&n->dev, addr, (void *)&req->cqe, sizeof(req->cqe));
>          QTAILQ_INSERT_TAIL(&sq->req_list, req, entry);
> @@ -528,7 +569,9 @@ static void nvme_free_sq(NvmeSQueue *sq, NvmeCtrl *n)
>      qemu_del_timer(sq->timer);
>      qemu_free_timer(sq->timer);
>      g_free(sq->io_req);
> -    sq->io_req = NULL;
> +    if (sq->prp_list) {
> +        g_free(sq->prp_list);
> +    }
>      if (sq->id)
>          g_free(sq);
>  }
> @@ -566,20 +609,32 @@ static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeCmd *cmd)
>      return NVME_SUCCESS;
>  }
>
> -static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
> -    uint16_t sqid, uint16_t cqid, uint16_t size, enum QueueFlags prio)
> +static uint16_t nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
> +    uint16_t sqid, uint16_t cqid, uint16_t size, enum QueueFlags prio,
> +    int contig)
>  {
>      int i;
>      NvmeCQueue *cq;
>
>      sq->ctrl = n;
> -    sq->dma_addr = dma_addr;
>      sq->id = sqid;
>      sq->size = size;
>      sq->cqid = cqid;
>      sq->head = sq->tail = 0;
> +    sq->phys_contig = contig;
>      sq->io_req = g_malloc(sq->size * sizeof(*sq->io_req));
>
> +    if (sq->phys_contig) {
> +        sq->dma_addr = dma_addr;
> +    } else {
> +        sq->prp_list = nvme_setup_discontig(n, dma_addr, size,
> +            n->sqe_size);
> +        if (!sq->prp_list) {
> +            return NVME_INVALID_FIELD | NVME_DNR;
> +        }
> +    }
> +
>      QTAILQ_INIT(&sq->req_list);
>      QTAILQ_INIT(&sq->out_req_list);
>      for (i = 0; i < sq->size; i++) {
> @@ -607,6 +662,8 @@ static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
>      cq = n->cq[cqid];
>      QTAILQ_INSERT_TAIL(&(cq->sq_list), sq, entry);
>      n->sq[sqid] = sq;
> +
> +    return NVME_SUCCESS;
>  }
>
>  static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeCmd *cmd)
> @@ -625,12 +682,15 @@ static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeCmd *cmd)
>      if (!c->prp1 || c->prp1 & (n->page_size - 1)) {
>          return NVME_INVALID_FIELD | NVME_DNR;
>      }
> -    if (!(NVME_SQ_FLAGS_PC(c->sq_flags))) {
> +    if (!(NVME_SQ_FLAGS_PC(c->sq_flags)) && NVME_CAP_CQR(n->bar.cap)) {
>          return NVME_INVALID_FIELD | NVME_DNR;
>      }
>      sq = g_malloc0(sizeof(*sq));
> -    nvme_init_sq(sq, n, c->prp1, c->sqid, c->cqid, c->qsize + 1,
> -        NVME_SQ_FLAGS_QPRIO(c->sq_flags));
> +    if (nvme_init_sq(sq, n, c->prp1, c->sqid, c->cqid, c->qsize + 1,
> +        NVME_SQ_FLAGS_QPRIO(c->sq_flags), NVME_SQ_FLAGS_PC(c->sq_flags))) {
> +        g_free(sq);
> +        return NVME_INVALID_FIELD | NVME_DNR;
> +    }
>      return NVME_SUCCESS;
>  }
>
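For what it's worth, the queue-entry address math checks out; this is the
same arithmetic as nvme_discontig() with made-up addresses (standalone, not
part of the patch):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t prp_list[] = { 0x100000, 0x200000 }; /* two 4K queue pages */
        uint16_t page_size = 4096, entry_size = 64;   /* 64-byte SQEs */
        uint16_t queue_idx = 70;                      /* entry to locate */
        uint16_t entries_per_page = page_size / entry_size; /* 64 */

        /* entry 70 -> second page, slot 6 -> 0x200000 + 6 * 64 = 0x200180 */
        printf("0x%llx\n", (unsigned long long)(
            prp_list[queue_idx / entries_per_page] +
            (queue_idx % entries_per_page) * entry_size));
        return 0;
    }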
> @@ -640,8 +700,12 @@ static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n)
>      qemu_del_timer(cq->timer);
>      qemu_free_timer(cq->timer);
>      msix_vector_unuse(&n->dev, cq->vector);
> -    if (cq->id)
> -        g_free(cq);
> +    if (cq->prp_list) {
> +        g_free(cq->prp_list);
> +    }
> +    if (cq->id) {
> +        g_free(cq);
> +    }
>
>  }
>
> @@ -662,22 +726,32 @@ static uint16_t nvme_del_cq(NvmeCtrl *n, NvmeCmd *cmd)
>      return NVME_SUCCESS;
>  }
>
> -static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr,
> -    uint16_t cqid, uint16_t vector, uint16_t size)
> +static uint16_t nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr,
> +    uint16_t cqid, uint16_t vector, uint16_t size, int contig)
>  {
>      cq->ctrl = n;
>      cq->id = cqid;
>      cq->size = size;
> -    cq->dma_addr = dma_addr;
>      cq->phase = 1;
>      cq->irq_enabled = 1;
>      cq->vector = vector;
>      cq->head = cq->tail = 0;
> +    cq->phys_contig = contig;
> +    if (cq->phys_contig) {
> +        cq->dma_addr = dma_addr;
> +    } else {
> +        cq->prp_list = nvme_setup_discontig(n, dma_addr, size,
> +            n->cqe_size);
> +        if (!cq->prp_list) {
> +            return NVME_INVALID_FIELD | NVME_DNR;
> +        }
> +    }
>      QTAILQ_INIT(&cq->req_list);
>      QTAILQ_INIT(&cq->sq_list);
>      msix_vector_use(&n->dev, cq->vector);
>      n->cq[cqid] = cq;
>      cq->timer = qemu_new_timer_ns(vm_clock, nvme_post_cqes, cq);
> +    return NVME_SUCCESS;
>  }
>
>  static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeCmd *cmd)
> @@ -696,12 +770,16 @@ static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeCmd *cmd)
>      if (c->irq_vector > n->num_queues) {
>          return NVME_INVALID_IRQ_VECTOR;
>      }
> -    if (!(NVME_CQ_FLAGS_PC(c->cq_flags))) {
> +    if (!(NVME_CQ_FLAGS_PC(c->cq_flags)) && NVME_CAP_CQR(n->bar.cap)) {
>          return NVME_INVALID_FIELD | NVME_DNR;
>      }
>
>      cq = g_malloc0(sizeof(*cq));
> -    nvme_init_cq(cq, n, c->prp1, c->cqid, c->irq_vector, c->qsize + 1);
> +    if (nvme_init_cq(cq, n, c->prp1, c->cqid, c->irq_vector, c->qsize + 1,
> +        NVME_CQ_FLAGS_PC(c->cq_flags))) {
> +        g_free(cq);
> +        return NVME_INVALID_FIELD | NVME_DNR;
> +    }
>      return NVME_SUCCESS;
>  }
>
> @@ -1001,8 +1079,14 @@ static uint16_t nvme_abort_req(NvmeCtrl *n, NvmeCmd *cmd, uint32_t *result)
>      sq = n->sq[sqid];
>      while ((sq->head + index) % sq->size != sq->tail) {
>          NvmeCmd abort_cmd;
> -        hwaddr addr = sq->dma_addr + ((sq->head + index) % sq->size) *
> -            n->sqe_size;
> +        hwaddr addr;
> +        if (sq->phys_contig) {
> +            addr = sq->dma_addr + ((sq->head + index) % sq->size) *
> +                n->sqe_size;
> +        } else {
> +            addr = nvme_discontig(sq->prp_list, (sq->head + index) % sq->size,
> +                n->page_size, n->sqe_size);
> +        }
>          pci_dma_read(&n->dev, addr, (void *)&abort_cmd, sizeof(abort_cmd));
>          if (abort_cmd.cid == cid) {
>              NvmeRequest *req = QTAILQ_FIRST(&sq->req_list);
> @@ -1150,7 +1234,12 @@ static void nvme_sq_process(void *opaque)
>
>      while (!(nvme_sq_empty(sq) || QTAILQ_EMPTY(&sq->req_list)) &&
>              processed++ < sq->arb_burst) {
> -        addr = sq->dma_addr + sq->head * n->sqe_size;
> +        if (sq->phys_contig) {
> +            addr = sq->dma_addr + sq->head * n->sqe_size;
> +        } else {
> +            addr = nvme_discontig(sq->prp_list, sq->head, n->page_size,
> +                n->sqe_size);
> +        }
>          pci_dma_read(&n->dev, addr, (void *)&cmd, sizeof(cmd));
>          nvme_inc_sq_head(sq);
>
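On sizing: the flattened prp_list stays small in practice. With 4K pages and
64-byte submission entries, a 1024-entry SQ needs DIV_ROUND_UP(1024 * 64,
4096) = 16 queue pages, i.e. 16 PRP entries, far below the 512 that fit in
one 4K PRP page; only a queue spanning more than 512 pages makes
nvme_setup_discontig() follow the last-entry pointer into a second PRP page.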
> @@ -1207,7 +1296,6 @@ static int nvme_start_ctrl(NvmeCtrl *n)
>  {
>      uint32_t page_bits = NVME_CC_MPS(n->bar.cc) + 12;
>      uint32_t page_size = 1 << page_bits;
> -
>      if (n->cq[0] || n->sq[0]) {
>          return -1;
>      }
> @@ -1236,20 +1324,21 @@ static int nvme_start_ctrl(NvmeCtrl *n)
>          return -1;
>      }
>
> -    n->page_bits = NVME_CC_MPS(n->bar.cc) + 12;
> +    n->page_bits = page_bits;
>      n->page_size = 1 << n->page_bits;
>      n->max_prp_ents = n->page_size / sizeof(uint64_t);
>      n->cqe_size = 1 << NVME_CC_IOCQES(n->bar.cc);
>      n->sqe_size = 1 << NVME_CC_IOSQES(n->bar.cc);
> -
> -    nvme_init_cq(&n->admin_cq, n, n->bar.acq, 0, 0,
> -        NVME_AQA_ACQS(n->bar.aqa) + 1);
> -    nvme_init_sq(&n->admin_sq, n, n->bar.asq, 0, 0,
> -        NVME_AQA_ASQS(n->bar.aqa) + 1, NVME_Q_PRIO_HIGH);
> -
> +    if (nvme_init_cq(&n->admin_cq, n, n->bar.acq, 0, 0,
> +        NVME_AQA_ACQS(n->bar.aqa) + 1, 1)) {
> +        return -1;
> +    }
> +    if (nvme_init_sq(&n->admin_sq, n, n->bar.asq, 0, 0,
> +        NVME_AQA_ASQS(n->bar.aqa) + 1, NVME_Q_PRIO_HIGH, 1)) {
> +        return -1;
> +    }
>      n->aer_timer = qemu_new_timer_ns(vm_clock, nvme_aer_process_cb, n);
>      QSIMPLEQ_INIT(&n->aer_queue);
> -
>      return 0;
>  }
>
> @@ -1426,7 +1515,7 @@ static int nvme_init(PCIDevice *pci_dev)
>          n->max_sqes < NVME_MIN_SQUEUE_ES || n->max_cqes < NVME_MIN_CQUEUE_ES) {
>          return -1;
>      }
> -    if (n->cqr != 1) {
> +    if (n->cqr > 1) {
>          return -1;
>      }
>      if (n->vwc > 1) {
>          return -1;
>      }
> diff --git a/hw/nvme.h b/hw/nvme.h
> index b7d6615..926f843 100644
> --- a/hw/nvme.h
> +++ b/hw/nvme.h
> @@ -612,6 +612,7 @@ typedef struct NvmeSQueue {
>      uint32_t size;
>      uint64_t dma_addr;
>      uint64_t completed;
> +    uint64_t *prp_list;
>      QEMUTimer *timer;
>      QTAILQ_ENTRY(NvmeSQueue) entry;
>      NvmeRequest *io_req;
> @@ -630,6 +631,7 @@ typedef struct NvmeCQueue {
>      uint32_t vector;
>      uint32_t size;
>      uint64_t dma_addr;
> +    uint64_t *prp_list;
>      QEMUTimer *timer;
>      QTAILQ_HEAD(sq_list, NvmeSQueue) sq_list;
>      QTAILQ_HEAD(cq_req_list, NvmeRequest) req_list;
> --
> 1.7.0.4
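P.S. To actually exercise the new paths, CAP.CQR has to read zero before the
PC=0 create-queue commands are accepted, so the device presumably needs to be
started with its cqr property cleared (guessing the property name from the
n->cqr field; the drive id is made up):

    -device nvme,drive=nvme0,cqr=0

and the guest then issues Create I/O CQ/SQ with the PC flag clear.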