> -----Original Message----- > From: Pei, Andy <andy....@intel.com> > Sent: Thursday, April 21, 2022 4:34 PM > To: dev@dpdk.org > Cc: Xia, Chenbo <chenbo....@intel.com>; maxime.coque...@redhat.com; Cao, > Gang <gang....@intel.com>; Liu, Changpeng <changpeng....@intel.com> > Subject: [PATCH v6 06/16] vdpa/ifc: add block device SW live-migration > > Add SW live-migration support to block device. > Add dirty page logging to block device.
Add SW live-migration support including dirty page logging for block device. > > Signed-off-by: Andy Pei <andy....@intel.com> > --- > drivers/vdpa/ifc/base/ifcvf.c | 4 +- > drivers/vdpa/ifc/base/ifcvf.h | 6 ++ > drivers/vdpa/ifc/ifcvf_vdpa.c | 128 +++++++++++++++++++++++++++++++++++-- > ----- > 3 files changed, 115 insertions(+), 23 deletions(-) > > diff --git a/drivers/vdpa/ifc/base/ifcvf.c b/drivers/vdpa/ifc/base/ifcvf.c > index d10c1fd..e417c50 100644 > --- a/drivers/vdpa/ifc/base/ifcvf.c > +++ b/drivers/vdpa/ifc/base/ifcvf.c > @@ -191,7 +191,7 @@ > IFCVF_WRITE_REG32(val >> 32, hi); > } > > -STATIC int > +int > ifcvf_hw_enable(struct ifcvf_hw *hw) > { > struct ifcvf_pci_common_cfg *cfg; > @@ -240,7 +240,7 @@ > return 0; > } > > -STATIC void > +void > ifcvf_hw_disable(struct ifcvf_hw *hw) > { > u32 i; > diff --git a/drivers/vdpa/ifc/base/ifcvf.h b/drivers/vdpa/ifc/base/ifcvf.h > index 769c603..6dd7925 100644 > --- a/drivers/vdpa/ifc/base/ifcvf.h > +++ b/drivers/vdpa/ifc/base/ifcvf.h > @@ -179,4 +179,10 @@ struct ifcvf_hw { > u64 > ifcvf_get_queue_notify_off(struct ifcvf_hw *hw, int qid); > > +int > +ifcvf_hw_enable(struct ifcvf_hw *hw); > + > +void > +ifcvf_hw_disable(struct ifcvf_hw *hw); > + > #endif /* _IFCVF_H_ */ > diff --git a/drivers/vdpa/ifc/ifcvf_vdpa.c b/drivers/vdpa/ifc/ifcvf_vdpa.c > index 8d104b7..a23dc2d 100644 > --- a/drivers/vdpa/ifc/ifcvf_vdpa.c > +++ b/drivers/vdpa/ifc/ifcvf_vdpa.c > @@ -345,6 +345,56 @@ struct rte_vdpa_dev_info { > } > } > > +static void > +vdpa_ifcvf_blk_pause(struct ifcvf_internal *internal) > +{ > + struct ifcvf_hw *hw = &internal->hw; > + struct rte_vhost_vring vq; > + int i, vid; > + uint64_t features = 0; > + uint64_t log_base = 0, log_size = 0; > + uint64_t len; > + > + vid = internal->vid; > + > + if (internal->device_type == IFCVF_BLK) { > + for (i = 0; i < hw->nr_vring; i++) { > + rte_vhost_get_vhost_vring(internal->vid, i, &vq); > + while (vq.avail->idx != vq.used->idx) { > + ifcvf_notify_queue(hw, i); > + 
usleep(10); > + } > + hw->vring[i].last_avail_idx = vq.avail->idx; > + hw->vring[i].last_used_idx = vq.used->idx; > + } > + } > + > + ifcvf_hw_disable(hw); > + > + for (i = 0; i < hw->nr_vring; i++) > + rte_vhost_set_vring_base(vid, i, hw->vring[i].last_avail_idx, > + hw->vring[i].last_used_idx); > + > + if (internal->sw_lm) > + return; > + > + rte_vhost_get_negotiated_features(vid, &features); > + if (RTE_VHOST_NEED_LOG(features)) { > + ifcvf_disable_logging(hw); > + rte_vhost_get_log_base(internal->vid, &log_base, &log_size); > + rte_vfio_container_dma_unmap(internal->vfio_container_fd, > + log_base, IFCVF_LOG_BASE, log_size); > + /* > + * IFCVF marks dirty memory pages for only packet buffer, > + * SW helps to mark the used ring as dirty after device stops. > + */ > + for (i = 0; i < hw->nr_vring; i++) { > + len = IFCVF_USED_RING_LEN(hw->vring[i].size); > + rte_vhost_log_used_vring(vid, i, 0, len); > + } > + } > +} Can we consider combining vdpa_ifcvf_blk_pause and vdpa_ifcvf_stop into one function that checks the device type internally to handle the differences? As far as I can see, most of the logic is the same. 
> + > #define MSIX_IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + \ > sizeof(int) * (IFCVF_MAX_QUEUES * 2 + 1)) > static int > @@ -659,15 +709,22 @@ struct rte_vdpa_dev_info { > } > hw->vring[i].avail = gpa; > > - /* Direct I/O for Tx queue, relay for Rx queue */ > - if (i & 1) { > - gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.used); > - if (gpa == 0) { > - DRV_LOG(ERR, "Fail to get GPA for used ring."); > - return -1; > + if (internal->device_type == IFCVF_NET) { > + /* Direct I/O for Tx queue, relay for Rx queue */ > + if (i & 1) { > + gpa = hva_to_gpa(vid, > (uint64_t)(uintptr_t)vq.used); > + if (gpa == 0) { > + DRV_LOG(ERR, "Fail to get GPA for used > ring."); > + return -1; > + } > + hw->vring[i].used = gpa; > + } else { > + hw->vring[i].used = m_vring_iova + > + (char *)internal->m_vring[i].used - > + (char *)internal->m_vring[i].desc; > } > - hw->vring[i].used = gpa; > - } else { > + } else if (internal->device_type == IFCVF_BLK) { > + /* BLK: relay every queue */ > hw->vring[i].used = m_vring_iova + > (char *)internal->m_vring[i].used - > (char *)internal->m_vring[i].desc; > @@ -686,7 +743,10 @@ struct rte_vdpa_dev_info { > } > hw->nr_vring = nr_vring; > > - return ifcvf_start_hw(&internal->hw); > + if (internal->device_type == IFCVF_NET) > + return ifcvf_start_hw(&internal->hw); > + else if (internal->device_type == IFCVF_BLK) > + return ifcvf_hw_enable(&internal->hw); > > error: > for (i = 0; i < nr_vring; i++) > @@ -710,8 +770,12 @@ struct rte_vdpa_dev_info { > > for (i = 0; i < hw->nr_vring; i++) { > /* synchronize remaining new used entries if any */ > - if ((i & 1) == 0) > + if (internal->device_type == IFCVF_NET) { > + if ((i & 1) == 0) > + update_used_ring(internal, i); > + } else if (internal->device_type == IFCVF_BLK) { > update_used_ring(internal, i); > + } > > rte_vhost_get_vhost_vring(vid, i, &vq); > len = IFCVF_USED_RING_LEN(vq.size); > @@ -773,17 +837,36 @@ struct rte_vdpa_dev_info { > } > } > > - for (qid = 0; qid < q_num; qid += 2) { > - 
ev.events = EPOLLIN | EPOLLPRI; > - /* leave a flag to mark it's for interrupt */ > - ev.data.u64 = 1 | qid << 1 | > - (uint64_t)internal->intr_fd[qid] << 32; > - if (epoll_ctl(epfd, EPOLL_CTL_ADD, internal->intr_fd[qid], &ev) > - < 0) { > - DRV_LOG(ERR, "epoll add error: %s", strerror(errno)); > - return NULL; > + if (internal->device_type == IFCVF_NET) { > + for (qid = 0; qid < q_num; qid += 2) { > + ev.events = EPOLLIN | EPOLLPRI; > + /* leave a flag to mark it's for interrupt */ > + ev.data.u64 = 1 | qid << 1 | > + (uint64_t)internal->intr_fd[qid] << 32; > + if (epoll_ctl(epfd, EPOLL_CTL_ADD, > + internal->intr_fd[qid], &ev) > + < 0) { > + DRV_LOG(ERR, "epoll add error: %s", > + strerror(errno)); > + return NULL; > + } > + update_used_ring(internal, qid); > + } > + } else if (internal->device_type == IFCVF_BLK) { > + for (qid = 0; qid < q_num; qid += 1) { > + ev.events = EPOLLIN | EPOLLPRI; > + /* leave a flag to mark it's for interrupt */ > + ev.data.u64 = 1 | qid << 1 | > + (uint64_t)internal->intr_fd[qid] << 32; > + if (epoll_ctl(epfd, EPOLL_CTL_ADD, > + internal->intr_fd[qid], &ev) > + < 0) { > + DRV_LOG(ERR, "epoll add error: %s", > + strerror(errno)); > + return NULL; > + } > + update_used_ring(internal, qid); It seems we can also reduce the duplicated code in the case above. And for the other checks, if a single combined condition suffices, I would prefer to use just one. Thanks, Chenbo > } > - update_used_ring(internal, qid); > } > > /* start relay with a first kick */ > @@ -871,7 +954,10 @@ struct rte_vdpa_dev_info { > > /* stop the direct IO data path */ > unset_notify_relay(internal); > - vdpa_ifcvf_stop(internal); > + if (internal->device_type == IFCVF_NET) > + vdpa_ifcvf_stop(internal); > + else if (internal->device_type == IFCVF_BLK) > + vdpa_ifcvf_blk_pause(internal); > vdpa_disable_vfio_intr(internal); > > ret = rte_vhost_host_notifier_ctrl(vid, RTE_VHOST_QUEUE_ALL, false); > -- > 1.8.3.1