On 03/31/2018 04:29 AM, Xiao Wang wrote:
The IFCVF vDPA (vhost data path acceleration) driver provides support for the Intel FPGA 100G VF (IFCVF). IFCVF's datapath is virtio ring compatible, it works as a HW vhost backend which can send/receive packets to/from virtio directly by DMA. Different VF devices serve different virtio frontends which are in different VMs, so each VF needs to have its own DMA address translation service. During the driver probe a new container is created, with this container vDPA driver can program DMA remapping table with the VM's memory region information. Key vDPA driver ops implemented: - ifcvf_dev_config: Enable VF data path with virtio information provided by vhost lib, including IOMMU programming to enable VF DMA to VM's memory, VFIO interrupt setup to route HW interrupt to virtio driver, create notify relay thread to translate virtio driver's kick to a MMIO write onto HW, HW queues configuration. - ifcvf_dev_close: Revoke all the setup in ifcvf_dev_config. Live migration feature is supported by IFCVF and this driver enables it. For the dirty page logging, VF helps to log for packet buffer write, driver helps to make the used ring as dirty when device stops. Because vDPA driver needs to set up MSI-X vector to interrupt the guest, only vfio-pci is supported currently. Signed-off-by: Xiao Wang <xiao.w.w...@intel.com> Signed-off-by: Rosen Xu <rosen...@intel.com> --- config/common_base | 7 + config/common_linuxapp | 1 + drivers/net/Makefile | 3 + drivers/net/ifc/Makefile | 36 ++ drivers/net/ifc/base/ifcvf.c | 329 +++++++++++++ drivers/net/ifc/base/ifcvf.h | 160 +++++++ drivers/net/ifc/base/ifcvf_osdep.h | 52 +++ drivers/net/ifc/ifcvf_vdpa.c | 842 ++++++++++++++++++++++++++++++++++ drivers/net/ifc/rte_ifcvf_version.map | 4 + mk/rte.app.mk | 3 + 10 files changed, 1437 insertions(+) create mode 100644 drivers/net/ifc/Makefile create mode 100644 drivers/net/ifc/base/ifcvf.c create mode 100644 drivers/net/ifc/base/ifcvf.h create mode 100644 drivers/net/ifc/base/ifcvf_osdep.h create mode 100644 drivers/net/ifc/ifcvf_vdpa.c create mode 100644 drivers/net/ifc/rte_ifcvf_version.map
Thanks for having handled the changes, please see minor comments below. Feel free to add my: Reviewed-by: Maxime Coquelin <maxime.coque...@redhat.com> Thanks! Maxime
+static uint64_t +qva_to_gpa(int vid, uint64_t qva)
We might want to have this in vhost-lib to avoid duplication, but that can be done later.
+{ + struct rte_vhost_memory *mem = NULL; + struct rte_vhost_mem_region *reg; + uint32_t i; + uint64_t gpa = 0; + + if (rte_vhost_get_mem_table(vid, &mem) < 0) + goto exit; + + for (i = 0; i < mem->nregions; i++) { + reg = &mem->regions[i]; + + if (qva >= reg->host_user_addr && + qva < reg->host_user_addr + reg->size) { + gpa = qva - reg->host_user_addr + reg->guest_phys_addr; + break; + } + } + +exit: + if (gpa == 0) + rte_panic("failed to get gpa\n"); + if (mem) + free(mem); + return gpa; +} + +static int +vdpa_ifcvf_start(struct ifcvf_internal *internal) +{ + struct ifcvf_hw *hw = &internal->hw; + int i, nr_vring; + int vid; + struct rte_vhost_vring vq; + + vid = internal->vid; + nr_vring = rte_vhost_get_vring_num(vid); + rte_vhost_get_negotiated_features(vid, &hw->req_features); + + for (i = 0; i < nr_vring; i++) { + rte_vhost_get_vhost_vring(vid, i, &vq); + hw->vring[i].desc = qva_to_gpa(vid, (uint64_t)vq.desc); + hw->vring[i].avail = qva_to_gpa(vid, (uint64_t)vq.avail); + hw->vring[i].used = qva_to_gpa(vid, (uint64_t)vq.used); + hw->vring[i].size = vq.size; + rte_vhost_get_vring_base(vid, i, &hw->vring[i].last_avail_idx, + &hw->vring[i].last_used_idx); + } + hw->nr_vring = i; + + return ifcvf_start_hw(&internal->hw); +} + +static void +vdpa_ifcvf_stop(struct ifcvf_internal *internal) +{ + struct ifcvf_hw *hw = &internal->hw; + int i, j; + int vid; + uint64_t features, pfn; + uint64_t log_base, log_size; + uint8_t *log_buf; + + vid = internal->vid; + ifcvf_stop_hw(hw); + + for (i = 0; i < hw->nr_vring; i++) + rte_vhost_set_vring_base(vid, i, hw->vring[i].last_avail_idx, + hw->vring[i].last_used_idx); + + rte_vhost_get_negotiated_features(vid, &features); + if (RTE_VHOST_NEED_LOG(features)) { + ifcvf_disable_logging(hw); + rte_vhost_get_log_base(internal->vid, &log_base, &log_size); + /* + * IFCVF marks dirty memory pages for only packet buffer, + * SW helps to mark the used ring as dirty after device stops. + */ + log_buf = (uint8_t *)(uintptr_t)log_base; + for (i = 0; i < hw->nr_vring; i++) { + pfn = hw->vring[i].used / 4096; + for (j = 0; j <= hw->vring[i].size * 8 / 4096; j++) + __sync_fetch_and_or_8(&log_buf[(pfn + j) / 8], + 1 << ((pfn + j) % 8)); + } + } +} + +#define MSIX_IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + \ + sizeof(int) * (IFCVF_MAX_QUEUES * 2 + 1)) +static int +vdpa_enable_vfio_intr(struct ifcvf_internal *internal) +{ + int ret; + uint32_t i, nr_vring; + char irq_set_buf[MSIX_IRQ_SET_BUF_LEN]; + struct vfio_irq_set *irq_set; + int *fd_ptr; + struct rte_vhost_vring vring; + + nr_vring = rte_vhost_get_vring_num(internal->vid); + + irq_set = (struct vfio_irq_set *)irq_set_buf; + irq_set->argsz = sizeof(irq_set_buf); + irq_set->count = nr_vring + 1; + irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | + VFIO_IRQ_SET_ACTION_TRIGGER; + irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX; + irq_set->start = 0; + fd_ptr = (int *)&irq_set->data; + fd_ptr[RTE_INTR_VEC_ZERO_OFFSET] = internal->pdev->intr_handle.fd; + + for (i = 0; i < nr_vring; i++) { + rte_vhost_get_vhost_vring(internal->vid, i, &vring); + fd_ptr[RTE_INTR_VEC_RXTX_OFFSET + i] = vring.callfd; + } + + ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); + if (ret) { + DRV_LOG(ERR, "Error enabling MSI-X interrupts: %s", + strerror(errno)); + return -1; + } + + return 0; +} + +static int +vdpa_disable_vfio_intr(struct ifcvf_internal *internal) +{ + int ret; + char irq_set_buf[MSIX_IRQ_SET_BUF_LEN]; + struct vfio_irq_set *irq_set; + + irq_set = (struct vfio_irq_set *)irq_set_buf; + irq_set->argsz = sizeof(irq_set_buf); + irq_set->count = 0; + irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER; + irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX; + irq_set->start = 0; + + ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); + if (ret) { + DRV_LOG(ERR, "Error disabling MSI-X interrupts: %s", + strerror(errno)); + return -1; + } + + return 0; +} + +static void * +notify_relay(void *arg) +{ + int i, kickfd, epfd, nfds = 0; + uint32_t qid, q_num; + struct epoll_event events[IFCVF_MAX_QUEUES * 2]; + struct epoll_event ev; + uint64_t buf; + int nbytes; + struct rte_vhost_vring vring; + struct ifcvf_internal *internal = (struct ifcvf_internal *)arg; + struct ifcvf_hw *hw = &internal->hw; + + q_num = rte_vhost_get_vring_num(internal->vid); + + epfd = epoll_create(IFCVF_MAX_QUEUES * 2); + if (epfd < 0) { + DRV_LOG(ERR, "failed to create epoll instance."); + return NULL; + } + internal->epfd = epfd; + + for (qid = 0; qid < q_num; qid++) { + ev.events = EPOLLIN | EPOLLPRI; + rte_vhost_get_vhost_vring(internal->vid, qid, &vring); + ev.data.u64 = qid | (uint64_t)vring.kickfd << 32; + if (epoll_ctl(epfd, EPOLL_CTL_ADD, vring.kickfd, &ev) < 0) { + DRV_LOG(ERR, "epoll add error: %s", strerror(errno)); + return NULL; + } + } + + for (;;) { + nfds = epoll_wait(epfd, events, q_num, -1); + if (nfds < 0) { + if (errno == EINTR) + continue; + DRV_LOG(ERR, "epoll_wait return fail\n"); + return NULL; + } + + for (i = 0; i < nfds; i++) { + qid = events[i].data.u32; + kickfd = (uint32_t)(events[i].data.u64 >> 32); + do { + nbytes = read(kickfd, &buf, 8); + if (nbytes < 0) { + if (errno == EINTR || + errno == EWOULDBLOCK || + errno == EAGAIN) + continue; + DRV_LOG(INFO, "Error reading " + "kickfd: %s", + strerror(errno)); + } + break; + } while (1); + + ifcvf_notify_queue(hw, qid); + } + } + + return NULL; +} + +static int +setup_notify_relay(struct ifcvf_internal *internal) +{ + int ret; + + ret = pthread_create(&internal->tid, NULL, notify_relay, + (void *)internal); + if (ret) { + DRV_LOG(ERR, "failed to create notify relay pthread."); + return -1; + } + return 0; +} + +static int +unset_notify_relay(struct ifcvf_internal *internal) +{ + void *status; + + if (internal->tid) { + pthread_cancel(internal->tid); + pthread_join(internal->tid, &status); + } + internal->tid = 0; + + if (internal->epfd >= 0) + close(internal->epfd); + internal->epfd = -1; + + return 0; +} + +static int +update_datapath(struct ifcvf_internal *internal) +{ + int ret; + + rte_spinlock_lock(&internal->lock); + + if (!rte_atomic32_read(&internal->running) && + (rte_atomic32_read(&internal->started) && + rte_atomic32_read(&internal->dev_attached))) { + ret = ifcvf_dma_map(internal); + if (ret) + goto err; + + ret = vdpa_enable_vfio_intr(internal); + if (ret) + goto err; + + ret = setup_notify_relay(internal); + if (ret) + goto err; + + ret = vdpa_ifcvf_start(internal); + if (ret) + goto err; + + rte_atomic32_set(&internal->running, 1); + } else if (rte_atomic32_read(&internal->running) && + (!rte_atomic32_read(&internal->started) || + !rte_atomic32_read(&internal->dev_attached))) { + vdpa_ifcvf_stop(internal); + + ret = unset_notify_relay(internal); + if (ret) + goto err; + + ret = vdpa_disable_vfio_intr(internal); + if (ret) + goto err; + + ret = ifcvf_dma_unmap(internal); + if (ret) + goto err; + + rte_atomic32_set(&internal->running, 0); + } + + rte_spinlock_unlock(&internal->lock); + return 0; +err: + rte_spinlock_unlock(&internal->lock); + return ret; +} + +static int +ifcvf_dev_config(int vid) +{ + int did; + struct internal_list *list; + struct ifcvf_internal *internal; + + did = rte_vhost_get_vdpa_did(vid); + list = find_internal_resource_by_did(did); + if (list == NULL) { + DRV_LOG(ERR, "Invalid device id: %d", did); + return -1; + } + + internal = list->internal; + internal->vid = vid; + rte_atomic32_set(&internal->dev_attached, 1); + update_datapath(internal); + + return 0; +} + +static int +ifcvf_dev_close(int vid) +{ + int did; + struct internal_list *list; + struct ifcvf_internal *internal; + + did = rte_vhost_get_vdpa_did(vid); + list = find_internal_resource_by_did(did); + if (list == NULL) { + DRV_LOG(ERR, "Invalid device id: %d", did); + return -1; + } + + internal = list->internal; + rte_atomic32_set(&internal->dev_attached, 0); + update_datapath(internal); + + return 0; +} + +static int +ifcvf_feature_set(int vid) +{ + uint64_t features; + int did; + struct internal_list *list; + struct ifcvf_internal *internal; + uint64_t log_base, log_size; + + did = rte_vhost_get_vdpa_did(vid); + list = find_internal_resource_by_did(did); + if (list == NULL) { + DRV_LOG(ERR, "Invalid device id: %d", did); + return -1; + } + + internal = list->internal; + rte_vhost_get_negotiated_features(internal->vid, &features); + + if (RTE_VHOST_NEED_LOG(features)) { + rte_vhost_get_log_base(internal->vid, &log_base, &log_size); + log_base = rte_mem_virt2phy((void *)(uintptr_t)log_base); + ifcvf_enable_logging(&internal->hw, log_base, log_size); + } + + return 0; +} + +static int +ifcvf_get_vfio_group_fd(int vid) +{ + int did; + struct internal_list *list; + + did = rte_vhost_get_vdpa_did(vid); + list = find_internal_resource_by_did(did); + if (list == NULL) { + DRV_LOG(ERR, "Invalid device id: %d", did); + return -1; + } + + return list->internal->vfio_group_fd; +} + +static int +ifcvf_get_vfio_device_fd(int vid) +{ + int did; + struct internal_list *list; + + did = rte_vhost_get_vdpa_did(vid); + list = find_internal_resource_by_did(did); + if (list == NULL) { + DRV_LOG(ERR, "Invalid device id: %d", did); + return -1; + } + + return list->internal->vfio_dev_fd; +} + +static int +ifcvf_get_notify_area(int vid, int qid, uint64_t *offset, uint64_t *size) +{ + int did; + struct internal_list *list; + struct ifcvf_internal *internal; + struct vfio_region_info reg = { .argsz = sizeof(reg) }; + int ret; + + did = rte_vhost_get_vdpa_did(vid); + list = find_internal_resource_by_did(did); + if (list == NULL) { + DRV_LOG(ERR, "Invalid device id: %d", did); + return -1; + } + + internal = list->internal; + + reg.index = ifcvf_get_notify_region(&internal->hw); + ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, ®); + if (ret) { + DRV_LOG(ERR, "Get not get device region info: %s", + strerror(errno)); + return -1; + } + + *offset = ifcvf_get_queue_notify_off(&internal->hw, qid) + reg.offset; + *size = 0x1000; + + return 0; +} + +static int +ifcvf_get_queue_num(int did, uint32_t *queue_num) +{ + struct internal_list *list; + + list = find_internal_resource_by_did(did); + if (list == NULL) { + DRV_LOG(ERR, "Invalid device id: %d", did); + return -1; + } + + *queue_num = list->internal->max_queues; + + return 0; +} + +static int +ifcvf_get_vdpa_feature(int did, uint64_t *features) +{ + struct internal_list *list; + + list = find_internal_resource_by_did(did); + if (list == NULL) { + DRV_LOG(ERR, "Invalid device id: %d", did); + return -1; + } + + *features = list->internal->features; + + return 0; +} + +#define VDPA_SUPPORTED_PROTOCOL_FEATURES \ + (1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK) +static int +ifcvf_get_protocol_feature(int did __rte_unused, uint64_t *features) +{ + *features = VDPA_SUPPORTED_PROTOCOL_FEATURES; + return 0; +} + +struct rte_vdpa_dev_ops ifcvf_ops = { + .queue_num_get = ifcvf_get_queue_num, + .feature_get = ifcvf_get_vdpa_feature, + .protocol_feature_get = ifcvf_get_protocol_feature,
I have proposed in vDPA series to rename the ops so that it is consistant with Vhost-user protocol:
e.g. get_protocol_features, get_features... So you might have to rebase if this is change is implemented.
+ .dev_conf = ifcvf_dev_config, + .dev_close = ifcvf_dev_close, + .vring_state_set = NULL, + .feature_set = ifcvf_feature_set, + .migration_done = NULL, + .get_vfio_group_fd = ifcvf_get_vfio_group_fd, + .get_vfio_device_fd = ifcvf_get_vfio_device_fd, + .get_notify_area = ifcvf_get_notify_area, +}; + +static int +ifcvf_pci_probe(struct rte_pci_driver *pci_drv __rte_unused, + struct rte_pci_device *pci_dev) +{ + uint64_t features; + struct ifcvf_internal *internal = NULL; + struct internal_list *list = NULL; + + if (rte_eal_process_type() != RTE_PROC_PRIMARY) + return 0; + + list = rte_zmalloc("ifcvf", sizeof(*list), 0); + if (list == NULL) + goto error; + + internal = rte_zmalloc("ifcvf", sizeof(*internal), 0); + if (internal == NULL) + goto error; + + internal->pdev = pci_dev; + rte_spinlock_init(&internal->lock); + if (ifcvf_vfio_setup(internal) < 0) + return -1; + + internal->max_queues = IFCVF_MAX_QUEUES; + features = ifcvf_get_features(&internal->hw); + internal->features = (features & + ~(1ULL << VIRTIO_F_IOMMU_PLATFORM)) | + (1ULL << VHOST_USER_F_PROTOCOL_FEATURES); + + internal->dev_addr.pci_addr = pci_dev->addr; + internal->dev_addr.type = PCI_ADDR; + list->internal = internal; + + pthread_mutex_lock(&internal_list_lock); + TAILQ_INSERT_TAIL(&internal_list, list, next); + pthread_mutex_unlock(&internal_list_lock); + + if (rte_vdpa_register_device(&internal->dev_addr, + &ifcvf_ops) < 0) + goto error; + + rte_atomic32_set(&internal->started, 1); + update_datapath(internal); + + return 0; + +error: + rte_free(list); + rte_free(internal); + return -1; +} + +static int +ifcvf_pci_remove(struct rte_pci_device *pci_dev) +{ + struct ifcvf_internal *internal; + struct internal_list *list; + + if (rte_eal_process_type() != RTE_PROC_PRIMARY) + return 0; + + list = find_internal_resource_by_dev(pci_dev); + if (list == NULL) { + DRV_LOG(ERR, "Invalid device: %s", pci_dev->name); + return -1; + } + + internal = list->internal; + rte_atomic32_set(&internal->started, 0); + update_datapath(internal); + + rte_pci_unmap_device(internal->pdev); + rte_vfio_destroy_container(internal->vfio_container_fd); + rte_vdpa_unregister_device(internal->did); + + pthread_mutex_lock(&internal_list_lock); + TAILQ_REMOVE(&internal_list, list, next); + pthread_mutex_unlock(&internal_list_lock); + + rte_free(list); + rte_free(internal); + + return 0; +} + +/* + * The set of PCI devices this driver supports. + */ +static const struct rte_pci_id pci_id_ifcvf_map[] = { + { .class_id = RTE_CLASS_ANY_ID, + .vendor_id = IFCVF_VENDOR_ID, + .device_id = IFCVF_DEVICE_ID, + .subsystem_vendor_id = IFCVF_SUBSYS_VENDOR_ID, + .subsystem_device_id = IFCVF_SUBSYS_DEVICE_ID, + }, + + { .vendor_id = 0, /* sentinel */ + }, +}; + +static struct rte_pci_driver rte_ifcvf_vdpa = { + .driver = { + .name = "net_ifcvf", + }, + .id_table = pci_id_ifcvf_map, + .drv_flags = 0, + .probe = ifcvf_pci_probe, + .remove = ifcvf_pci_remove, +}; + +RTE_PMD_REGISTER_PCI(net_ifcvf, rte_ifcvf_vdpa); +RTE_PMD_REGISTER_PCI_TABLE(net_ifcvf, pci_id_ifcvf_map); +RTE_PMD_REGISTER_KMOD_DEP(net_ifcvf, "* vfio-pci"); + +RTE_INIT(ifcvf_vdpa_init_log); +static void +ifcvf_vdpa_init_log(void) +{ + ifcvf_vdpa_logtype = rte_log_register("net.ifcvf_vdpa"); + if (ifcvf_vdpa_logtype >= 0) + rte_log_set_level(ifcvf_vdpa_logtype, RTE_LOG_NOTICE); +} diff --git a/drivers/net/ifc/rte_ifcvf_version.map b/drivers/net/ifc/rte_ifcvf_version.map new file mode 100644 index 000000000..9b9ab1a4c --- /dev/null +++ b/drivers/net/ifc/rte_ifcvf_version.map @@ -0,0 +1,4 @@ +DPDK_18.05 { + + local: *; +}; diff --git a/mk/rte.app.mk b/mk/rte.app.mk index 3eb41d176..46f76146e 100644 --- a/mk/rte.app.mk +++ b/mk/rte.app.mk @@ -171,6 +171,9 @@ _LDLIBS-$(CONFIG_RTE_LIBRTE_VDEV_NETVSC_PMD) += -lrte_pmd_vdev_netvsc _LDLIBS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += -lrte_pmd_virtio ifeq ($(CONFIG_RTE_LIBRTE_VHOST),y) _LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_VHOST) += -lrte_pmd_vhost +ifeq ($(CONFIG_RTE_EAL_VFIO),y) +_LDLIBS-$(CONFIG_RTE_LIBRTE_IFCVF_VDPA) += -lrte_ifcvf_vdpa +endif # $(CONFIG_RTE_EAL_VFIO) endif # $(CONFIG_RTE_LIBRTE_VHOST) _LDLIBS-$(CONFIG_RTE_LIBRTE_VMXNET3_PMD) += -lrte_pmd_vmxnet3_uio