On Mon, 07/10 15:55, Stefan Hajnoczi wrote:
> On Wed, Jul 05, 2017 at 09:36:31PM +0800, Fam Zheng wrote:
> > +static bool nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
> > +{
> > +    BDRVNVMeState *s = bs->opaque;
> > +    uint8_t *resp;
> > +    int r;
> > +    uint64_t iova;
> > +    NvmeCmd cmd = {
> > +        .opcode = NVME_ADM_CMD_IDENTIFY,
> > +        .cdw10 = cpu_to_le32(0x1),
> > +    };
> > +
> > +    resp = qemu_try_blockalign0(bs, 4096);
>
> Is it possible to use struct NvmeIdCtrl to make this code clearer and
> eliminate the hardcoded sizes/offsets?

Yes, will do.
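Something like this (untested sketch, assuming the NvmeIdCtrl and
NvmeIdNs definitions in hw/block/nvme.h can be shared with the block
driver; their nn, vwc, mdts and nsze fields correspond to the hardcoded
offsets 516, 525, 77 and 0 above). While at it I dropped the
le32_to_cpu() on vwc, which is a single byte, and the buffer is now
unmapped on the error path too, which your comment below asks for:

static bool nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
{
    BDRVNVMeState *s = bs->opaque;
    NvmeIdCtrl *idctrl;
    NvmeIdNs *idns;
    uint8_t *resp;
    bool ret = false;
    int r;
    uint64_t iova;
    NvmeCmd cmd = {
        .opcode = NVME_ADM_CMD_IDENTIFY,
        .cdw10 = cpu_to_le32(0x1),
    };

    QEMU_BUILD_BUG_ON(sizeof(NvmeIdCtrl) != 4096);
    resp = qemu_try_blockalign0(bs, sizeof(NvmeIdCtrl));
    if (!resp) {
        error_setg(errp, "Cannot allocate buffer for identify response");
        return false;
    }
    idctrl = (NvmeIdCtrl *)resp;
    idns = (NvmeIdNs *)resp;
    r = nvme_vfio_dma_map(s->vfio, resp, sizeof(NvmeIdCtrl), true, &iova);
    if (r) {
        error_setg(errp, "Cannot map buffer for DMA");
        goto fail;
    }
    cmd.prp1 = cpu_to_le64(iova);

    if (nvme_cmd_sync(bs, s->queues[0], &cmd)) {
        error_setg(errp, "Failed to identify controller");
        goto out;
    }

    /* NN is the number of valid namespaces */
    if (le32_to_cpu(idctrl->nn) < namespace) {
        error_setg(errp, "Invalid namespace");
        goto out;
    }
    /* VWC is a single byte, no endianness conversion needed */
    s->write_cache = idctrl->vwc & 0x1;
    s->max_transfer = (idctrl->mdts ? 1 << idctrl->mdts : 0) * s->page_size;
    /* For now the page list buffer per command is one page, to hold at most
     * s->page_size / sizeof(uint64_t) entries. */
    s->max_transfer = MIN_NON_ZERO(s->max_transfer,
                                   s->page_size / sizeof(uint64_t) * s->page_size);

    /* Reuse the same buffer for the namespace identify data */
    memset(resp, 0, sizeof(NvmeIdNs));

    cmd.cdw10 = 0;
    cmd.nsid = cpu_to_le32(namespace);
    if (nvme_cmd_sync(bs, s->queues[0], &cmd)) {
        error_setg(errp, "Failed to identify namespace");
        goto out;
    }

    s->nsze = le64_to_cpu(idns->nsze);
    ret = true;
out:
    nvme_vfio_dma_unmap(s->vfio, resp);
fail:
    qemu_vfree(resp);
    return ret;
}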
> > +    if (!resp) {
> > +        error_setg(errp, "Cannot allocate buffer for identify response");
> > +        return false;
> > +    }
> > +    r = nvme_vfio_dma_map(s->vfio, resp, 4096, true, &iova);
> > +    if (r) {
> > +        error_setg(errp, "Cannot map buffer for DMA");
> > +        goto fail;
> > +    }
> > +    cmd.prp1 = cpu_to_le64(iova);
> > +
> > +    if (nvme_cmd_sync(bs, s->queues[0], &cmd)) {
> > +        error_setg(errp, "Failed to identify controller");
> > +        goto fail;
> > +    }
> > +
> > +    if (le32_to_cpu(*(uint32_t *)&resp[516]) < namespace) {
> > +        error_setg(errp, "Invalid namespace");
> > +        goto fail;
> > +    }
> > +    s->write_cache = le32_to_cpu(resp[525]) & 0x1;
> > +    s->max_transfer = (resp[77] ? 1 << resp[77] : 0) * s->page_size;
> > +    /* For now the page list buffer per command is one page, to hold at most
> > +     * s->page_size / sizeof(uint64_t) entries. */
> > +    s->max_transfer = MIN_NON_ZERO(s->max_transfer,
> > +                                   s->page_size / sizeof(uint64_t) * s->page_size);
> > +
> > +    memset((char *)resp, 0, 4096);
> > +
> > +    cmd.cdw10 = 0;
> > +    cmd.nsid = namespace;
> > +    if (nvme_cmd_sync(bs, s->queues[0], &cmd)) {
> > +        error_setg(errp, "Failed to identify namespace");
> > +        goto fail;
> > +    }
> > +
> > +    s->nsze = le64_to_cpu(*(uint64_t *)&resp[0]);
> > +
> > +    nvme_vfio_dma_unmap(s->vfio, resp);
> > +    qemu_vfree(resp);
> > +    return true;
> > +fail:
> > +    qemu_vfree(resp);
> > +    return false;
>
> nvme_vfio_dma_unmap() is not called in the error path.

Will fix.

> > +static coroutine_fn int nvme_cmd_map_qiov(BlockDriverState *bs, NvmeCmd *cmd,
> > +                                          NVMeRequest *req, QEMUIOVector *qiov)
> > +{
> > +    BDRVNVMeState *s = bs->opaque;
> > +    uint64_t *pagelist = req->prp_list_page;
> > +    int i, j, r;
> > +    int entries = 0;
> > +
> > +    assert(qiov->size);
> > +    assert(QEMU_IS_ALIGNED(qiov->size, s->page_size));
> > +    assert(qiov->size / s->page_size <= s->page_size / sizeof(uint64_t));
> > +    for (i = 0; i < qiov->niov; ++i) {
> > +        bool retry = true;
> > +        uint64_t iova;
> > +        qemu_co_mutex_lock(&s->dma_map_lock);
> > +try_map:
> > +        r = nvme_vfio_dma_map(s->vfio,
> > +                              qiov->iov[i].iov_base,
> > +                              qiov->iov[i].iov_len,
> > +                              true, &iova);
> > +        if (r == -ENOMEM && retry) {
> > +            retry = false;
> > +            trace_nvme_dma_flush_queue_wait(s);
> > +            if (s->inflight) {
> > +                trace_nvme_dma_map_flush(s);
> > +                qemu_co_queue_wait(&s->dma_flush_queue, &s->dma_map_lock);
> > +            } else {
> > +                r = nvme_vfio_dma_reset_temporary(s->vfio);
> > +                if (r) {
> > +                    return r;
>
> dma_map_lock is held here!

Oops, will fix.
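The minimal fix is to drop the lock before that return, something like
(sketch):

            } else {
                r = nvme_vfio_dma_reset_temporary(s->vfio);
                if (r) {
                    qemu_co_mutex_unlock(&s->dma_map_lock);
                    return r;
                }
            }

Alternatively I can move the qemu_co_mutex_lock/unlock pair out to the
caller, around the whole nvme_cmd_map_qiov() call, so every exit from
this function leaves the lock state unchanged.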
> > +static int nvme_co_prw(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
> > +                       QEMUIOVector *qiov, bool is_write, int flags)
> > +{
> > +    BDRVNVMeState *s = bs->opaque;
> > +    int r;
> > +    uint8_t *buf = NULL;
> > +    QEMUIOVector local_qiov;
> > +
> > +    assert(QEMU_IS_ALIGNED(offset, s->page_size));
> > +    assert(QEMU_IS_ALIGNED(bytes, s->page_size));
> > +    assert(bytes <= s->max_transfer);
>
> Who guarantees max_transfer? I think request alignment is enforced by
> block/io.c but there is no generic max_transfer handling code, so this
> assertion can be triggered by the guest. Please handle it as a genuine
> request error instead of using an assertion.

There has been generic max_transfer handling since commit
04ed95f4843281e292d93018d56d4b14705f9f2c: block/io.c fragments requests
that exceed bs->bl.max_transfer, see the code around max_transfer in
block/io.c:bdrv_aligned_preadv() and bdrv_aligned_pwritev().

> > +static int nvme_reopen_prepare(BDRVReopenState *reopen_state,
> > +                               BlockReopenQueue *queue, Error **errp)
> > +{
> > +    return 0;
> > +}
>
> What is the purpose of this dummy .bdrv_reopen_prepare() implementation?

This is necessary for block jobs to work; other drivers provide dummy
implementations as well.

Fam