This driver is a reference sample showing how to build a vDPA device
driver on top of the vhost lib. It uses a standard virtio-net PCI
device as the vDPA device, which can then serve as a backend for a
virtio-net PCI device in a nested VM.

The key driver ops implemented are:

* vdpa_virtio_eng_init
Maps the virtio PCI device into userspace with VFIO, reads the device
capabilities and initializes internal data.

* vdpa_virtio_eng_uninit
Releases the mapped device.

* vdpa_virtio_info_query
Reports device capabilities, e.g. queue number and supported features.

* vdpa_virtio_dev_config
With the guest virtio information provided by the vhost lib, this
function configures the device and the IOMMU to set up the vhost
datapath, which includes: Rx/Tx vrings, VFIO interrupts, kick relay.

* vdpa_virtio_dev_close
Tears down what was previously configured by dev_conf.

This driver requires the virtio device to support
VIRTIO_F_IOMMU_PLATFORM, because the buffer addresses written into the
descriptors are IOVAs.

Because the vDPA driver needs to set up MSI-X vectors to interrupt the
guest, only vfio-pci is supported currently.
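
For reference, one possible way to exercise the driver is sketched
below; the exact testpmd/vhost integration depends on the vDPA
framework series this patch builds on, and the BDF is only an example:

  # bind the virtio-net PCI device to vfio-pci
  usertools/dpdk-devbind.py --bind=vfio-pci 0000:06:00.0

  # instantiate the driver with the "bdf" devarg it registers
  ./testpmd --vdev 'net_vdpa_virtio_pci0,bdf=0000:06:00.0' -- -i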

Signed-off-by: Xiao Wang <xiao.w.w...@intel.com>
---
 config/common_base                                 |    6 +
 config/common_linuxapp                             |    1 +
 drivers/net/Makefile                               |    1 +
 drivers/net/vdpa_virtio_pci/Makefile               |   31 +
 .../net/vdpa_virtio_pci/rte_eth_vdpa_virtio_pci.c  | 1527 ++++++++++++++++++++
 .../rte_vdpa_virtio_pci_version.map                |    4 +
 mk/rte.app.mk                                      |    1 +
 7 files changed, 1571 insertions(+)
 create mode 100644 drivers/net/vdpa_virtio_pci/Makefile
 create mode 100644 drivers/net/vdpa_virtio_pci/rte_eth_vdpa_virtio_pci.c
 create mode 100644 drivers/net/vdpa_virtio_pci/rte_vdpa_virtio_pci_version.map

diff --git a/config/common_base b/config/common_base
index ad03cf433..aaa775129 100644
--- a/config/common_base
+++ b/config/common_base
@@ -791,6 +791,12 @@ CONFIG_RTE_LIBRTE_VHOST_DEBUG=n
 #
 CONFIG_RTE_LIBRTE_PMD_VHOST=n
 
+#
+# Compile VDPA VIRTIO PCI driver
+# To compile, CONFIG_RTE_LIBRTE_VHOST should be enabled.
+#
+CONFIG_RTE_LIBRTE_VDPA_VIRTIO_PCI=n
+
 #
 # Compile the test application
 #
diff --git a/config/common_linuxapp b/config/common_linuxapp
index ff98f2355..83446090c 100644
--- a/config/common_linuxapp
+++ b/config/common_linuxapp
@@ -15,6 +15,7 @@ CONFIG_RTE_LIBRTE_PMD_KNI=y
 CONFIG_RTE_LIBRTE_VHOST=y
 CONFIG_RTE_LIBRTE_VHOST_NUMA=y
 CONFIG_RTE_LIBRTE_PMD_VHOST=y
+CONFIG_RTE_LIBRTE_VDPA_VIRTIO_PCI=y
 CONFIG_RTE_LIBRTE_PMD_AF_PACKET=y
 CONFIG_RTE_LIBRTE_PMD_TAP=y
 CONFIG_RTE_LIBRTE_AVP_PMD=y
diff --git a/drivers/net/Makefile b/drivers/net/Makefile
index e1127326b..0a45ef603 100644
--- a/drivers/net/Makefile
+++ b/drivers/net/Makefile
@@ -53,6 +53,7 @@ endif # $(CONFIG_RTE_LIBRTE_SCHED)
 
 ifeq ($(CONFIG_RTE_LIBRTE_VHOST),y)
 DIRS-$(CONFIG_RTE_LIBRTE_PMD_VHOST) += vhost
+DIRS-$(CONFIG_RTE_LIBRTE_VDPA_VIRTIO_PCI) += vdpa_virtio_pci
 endif # $(CONFIG_RTE_LIBRTE_VHOST)
 
 ifeq ($(CONFIG_RTE_LIBRTE_MRVL_PMD),y)
diff --git a/drivers/net/vdpa_virtio_pci/Makefile b/drivers/net/vdpa_virtio_pci/Makefile
new file mode 100644
index 000000000..147d7a7a3
--- /dev/null
+++ b/drivers/net/vdpa_virtio_pci/Makefile
@@ -0,0 +1,31 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2018 Intel Corporation
+
+include $(RTE_SDK)/mk/rte.vars.mk
+
+#
+# library name
+#
+LIB = librte_vdpa_virtio_pci.a
+
+LDLIBS += -lpthread
+LDLIBS += -lrte_eal -lrte_mempool -lrte_pci
+LDLIBS += -lrte_ethdev -lrte_net -lrte_kvargs -lrte_vhost
+LDLIBS += -lrte_bus_vdev -lrte_bus_pci
+
+CFLAGS += -O3
+CFLAGS += $(WERROR_FLAGS)
+CFLAGS += -I$(RTE_SDK)/lib/librte_eal/linuxapp/eal
+CFLAGS += -I$(RTE_SDK)/drivers/bus/pci/linux
+CFLAGS += -DALLOW_EXPERIMENTAL_API
+
+EXPORT_MAP := rte_vdpa_virtio_pci_version.map
+
+LIBABIVER := 1
+
+#
+# all source are stored in SRCS-y
+#
+SRCS-$(CONFIG_RTE_LIBRTE_VDPA_VIRTIO_PCI) += rte_eth_vdpa_virtio_pci.c
+
+include $(RTE_SDK)/mk/rte.lib.mk
diff --git a/drivers/net/vdpa_virtio_pci/rte_eth_vdpa_virtio_pci.c b/drivers/net/vdpa_virtio_pci/rte_eth_vdpa_virtio_pci.c
new file mode 100644
index 000000000..5e63b15e6
--- /dev/null
+++ b/drivers/net/vdpa_virtio_pci/rte_eth_vdpa_virtio_pci.c
@@ -0,0 +1,1527 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#include <unistd.h>
+#include <pthread.h>
+#include <fcntl.h>
+#include <linux/pci_regs.h>
+#include <linux/virtio_net.h>
+#include <linux/virtio_config.h>
+#include <linux/virtio_pci.h>
+#include <sys/ioctl.h>
+#include <sys/epoll.h>
+#include <sys/mman.h>
+
+#include <rte_vfio.h>
+#include <rte_mbuf.h>
+#include <rte_ethdev.h>
+#include <rte_ethdev_vdev.h>
+#include <rte_malloc.h>
+#include <rte_memcpy.h>
+#include <rte_bus_pci.h>
+#include <rte_bus_vdev.h>
+#include <rte_kvargs.h>
+#include <rte_vhost.h>
+#include <rte_vdpa.h>
+#include <rte_io.h>
+#include <rte_cycles.h>
+#include <rte_spinlock.h>
+#include <eal_vfio.h>
+#include <pci_init.h>
+
+#define MAX_QUEUES             1
+#define VIRTIO_F_IOMMU_PLATFORM        33
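+/*
+ * The MSI-X IRQ set buffer holds one eventfd per vring (Rx/Tx for
+ * each queue) plus one leading eventfd for config-space interrupts.
+ */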
+#define MSIX_IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + \
+               sizeof(int) * (MAX_QUEUES * 2 + 1))
+
+#define ETH_VDPA_VIRTIO_PCI_BDF_ARG    "bdf"
+
+static const char *const valid_arguments[] = {
+       ETH_VDPA_VIRTIO_PCI_BDF_ARG,
+       NULL
+};
+
+static struct ether_addr base_eth_addr = {
+       .addr_bytes = {
+               0x56 /* V */,
+               0x44 /* D */,
+               0x50 /* P */,
+               0x41 /* A */,
+               0x00,
+               0x00
+       }
+};
+
+struct virtio_pci_info {
+       struct rte_pci_device pdev;
+       uint64_t    req_features;
+       uint32_t    notify_off_multiplier;
+       struct virtio_pci_common_cfg *common_cfg;
+       uint8_t     *isr;
+       uint16_t    *notify_base;
+       struct virtio_net_device_config *dev_cfg;
+       uint16_t    *notify_addr[MAX_QUEUES * 2];
+       int vfio_container_fd;
+       int vfio_group_fd;
+       int vfio_dev_fd;
+       pthread_t tid;  /* thread for notify relay */
+       int epfd;
+};
+
+struct vdpa_virtio_pci_internal {
+       char *dev_name;
+       uint16_t max_queues;
+       uint16_t max_devices;
+       uint64_t features;
+       struct rte_vdpa_eng_addr eng_addr;
+       int eid;
+       int vid;
+       struct virtio_pci_info vpci;
+       rte_atomic32_t started;
+       rte_atomic32_t dev_attached;
+       rte_atomic32_t running;
+       rte_spinlock_t lock;
+};
+
+struct internal_list {
+       TAILQ_ENTRY(internal_list) next;
+       struct rte_eth_dev *eth_dev;
+};
+
+TAILQ_HEAD(internal_list_head, internal_list);
+static struct internal_list_head internal_list =
+       TAILQ_HEAD_INITIALIZER(internal_list);
+
+static pthread_mutex_t internal_list_lock = PTHREAD_MUTEX_INITIALIZER;
+
+static struct rte_eth_link vdpa_link = {
+               .link_speed = 10000,
+               .link_duplex = ETH_LINK_FULL_DUPLEX,
+               .link_status = ETH_LINK_DOWN
+};
+
+static struct internal_list *
+find_internal_resource_by_eid(int eid)
+{
+       int found = 0;
+       struct internal_list *list;
+       struct vdpa_virtio_pci_internal *internal;
+
+       pthread_mutex_lock(&internal_list_lock);
+
+       TAILQ_FOREACH(list, &internal_list, next) {
+               internal = list->eth_dev->data->dev_private;
+               if (eid == internal->eid) {
+                       found = 1;
+                       break;
+               }
+       }
+
+       pthread_mutex_unlock(&internal_list_lock);
+
+       if (!found)
+               return NULL;
+
+       return list;
+}
+
+static struct internal_list *
+find_internal_resource_by_eng_addr(struct rte_vdpa_eng_addr *addr)
+{
+       int found = 0;
+       struct internal_list *list;
+       struct vdpa_virtio_pci_internal *internal;
+
+       pthread_mutex_lock(&internal_list_lock);
+
+       TAILQ_FOREACH(list, &internal_list, next) {
+               internal = list->eth_dev->data->dev_private;
+               if (addr == &internal->eng_addr) {
+                       found = 1;
+                       break;
+               }
+       }
+
+       pthread_mutex_unlock(&internal_list_lock);
+
+       if (!found)
+               return NULL;
+
+       return list;
+}
+
+static int
+check_pci_dev(struct rte_pci_device *dev)
+{
+       char filename[PATH_MAX];
+       char dev_dir[PATH_MAX];
+       char driver[PATH_MAX];
+       int ret;
+
+       snprintf(dev_dir, sizeof(dev_dir), "%s/" PCI_PRI_FMT,
+                       rte_pci_get_sysfs_path(),
+                       dev->addr.domain, dev->addr.bus,
+                       dev->addr.devid, dev->addr.function);
+       if (access(dev_dir, R_OK) != 0) {
+               RTE_LOG(ERR, PMD, "%s not exist\n", dev_dir);
+               return -1;
+       }
+
+       /* parse resources */
+       snprintf(filename, sizeof(filename), "%s/resource", dev_dir);
+       if (pci_parse_sysfs_resource(filename, dev) < 0) {
+               RTE_LOG(ERR, PMD, "cannot parse resource: %s\n", filename);
+               return -1;
+       }
+
+       /* parse driver */
+       snprintf(filename, sizeof(filename), "%s/driver", dev_dir);
+       ret = pci_get_kernel_driver_by_path(filename, driver);
+       if (ret != 0) {
+               RTE_LOG(ERR, PMD, "Fail to get kernel driver: %s\n", filename);
+               return -1;
+       }
+
+       if (strcmp(driver, "vfio-pci") != 0) {
+               RTE_LOG(ERR, PMD, "kernel driver %s is not vfio-pci\n", driver);
+               return -1;
+       }
+       return 0;
+}
+
+static int
+vdpa_vfio_get_group_fd(int iommu_group_no)
+{
+       char filename[PATH_MAX];
+       int vfio_group_fd;
+
+       snprintf(filename, sizeof(filename), VFIO_GROUP_FMT, iommu_group_no);
+       vfio_group_fd = open(filename, O_RDWR);
+       if (vfio_group_fd < 0) {
+               if (errno != ENOENT) {
+                       RTE_LOG(ERR, PMD, "cannot open %s: %s\n", filename,
+                               strerror(errno));
+                       return -1;
+               }
+               return 0;
+       }
+
+       return vfio_group_fd;
+}
+
+static int
+vfio_setup_device(const char *sysfs_base, const char *dev_addr,
+                 int *vfio_dev_fd, struct vfio_device_info *device_info,
+                 struct virtio_pci_info *vpci)
+{
+       struct vfio_group_status group_status = {
+               .argsz = sizeof(group_status)
+       };
+       int vfio_container_fd = -1;
+       int vfio_group_fd = -1;
+       int iommu_group_no;
+       int ret;
+
+       vfio_container_fd = vfio_get_container_fd();
+
+       /* check if we have VFIO driver enabled */
+       if (vfio_container_fd < 0) {
+               RTE_LOG(ERR, PMD, "failed to open VFIO container\n");
+               return -1;
+       }
+
+       ret = ioctl(vfio_container_fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU);
+       if (ret < 0) {
+               RTE_LOG(ERR, PMD, "VFIO_TYPE1_IOMMU not supported\n");
+               goto err;
+       }
+
+       /* get group number */
+       ret = vfio_get_group_no(sysfs_base, dev_addr, &iommu_group_no);
+       if (ret <= 0) {
+               RTE_LOG(ERR, PMD, "%s: unable to find IOMMU group\n", dev_addr);
+               goto err;
+       }
+
+       /* get the actual group fd */
+       vfio_group_fd = vdpa_vfio_get_group_fd(iommu_group_no);
+       RTE_LOG(INFO, PMD, "\n%s group no %d group fd %d\n",
+                       dev_addr, iommu_group_no, vfio_group_fd);
+       if (vfio_group_fd <= 0)
+               goto err;
+
+       /* check if the group is viable */
+       ret = ioctl(vfio_group_fd, VFIO_GROUP_GET_STATUS, &group_status);
+       if (ret) {
+               RTE_LOG(ERR, PMD, "%s cannot get group status, error: %s\n",
+                               dev_addr, strerror(errno));
+               goto err;
+       } else if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
+               RTE_LOG(ERR, PMD, "%s VFIO group is not viable\n", dev_addr);
+               goto err;
+       }
+
+       /* check if group does not have a container yet */
+       if (!(group_status.flags & VFIO_GROUP_FLAGS_CONTAINER_SET)) {
+               /* add group to a container */
+               ret = ioctl(vfio_group_fd, VFIO_GROUP_SET_CONTAINER,
+                               &vfio_container_fd);
+               if (ret) {
+                       RTE_LOG(ERR, PMD, "cannot add VFIO group to container, "
+                                       "error: %s\n", strerror(errno));
+                       goto err;
+               }
+               RTE_LOG(INFO, PMD, "vfio_group_fd %d set container_fd %d\n",
+                               vfio_group_fd, vfio_container_fd);
+       } else {
+               RTE_LOG(ERR, PMD, "%s has a container already\n", dev_addr);
+               goto err;
+       }
+
+       ret = ioctl(vfio_container_fd, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
+       if (ret) {
+               RTE_LOG(ERR, PMD, "%s set IOMMU type failed, error: %s\n",
+                               dev_addr, strerror(errno));
+               goto err;
+       }
+
+       /* get a file descriptor for the device */
+       *vfio_dev_fd = ioctl(vfio_group_fd, VFIO_GROUP_GET_DEVICE_FD, dev_addr);
+       if (*vfio_dev_fd < 0) {
+               RTE_LOG(ERR, PMD, "%s cannot get vfio_dev_fd, error: %s\n",
+                               dev_addr, strerror(errno));
+               goto err;
+       }
+
+       ret = ioctl(*vfio_dev_fd, VFIO_DEVICE_GET_INFO, device_info);
+       if (ret) {
+               RTE_LOG(ERR, PMD, "%s cannot get device info, error: %s\n",
+                               dev_addr, strerror(errno));
+               close(*vfio_dev_fd);
+               goto err;
+       }
+
+       vpci->vfio_container_fd = vfio_container_fd;
+       vpci->vfio_group_fd = vfio_group_fd;
+       return 0;
+
+err:
+       if (vfio_container_fd >= 0)
+               close(vfio_container_fd);
+       if (vfio_group_fd > 0)
+               close(vfio_group_fd);
+       return -1;
+}
+
+static int
+virtio_pci_vfio_map_resource(struct virtio_pci_info *vpci)
+{
+       struct rte_pci_device *pdev = &vpci->pdev;
+       struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
+       char pci_addr[PATH_MAX] = {0};
+       struct rte_pci_addr *loc = &pdev->addr;
+       int i, ret, nb_maps;
+       int vfio_dev_fd;
+       uint32_t ioport_bar;
+       struct pci_msix_table msix_table;
+
+       /* store PCI address string */
+       snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT,
+                       loc->domain, loc->bus, loc->devid, loc->function);
+
+       ret = vfio_setup_device(rte_pci_get_sysfs_path(), pci_addr,
+                       &vfio_dev_fd, &device_info, vpci);
+       if (ret)
+               return ret;
+
+       ret = pci_vfio_get_msix_bar(vfio_dev_fd, &msix_table);
+       if (ret < 0) {
+               RTE_LOG(ERR, PMD, "%s cannot get MSI-X BAR number\n", pci_addr);
+               goto fail;
+       }
+
+       /* get number of regions (up to BAR5) */
+       nb_maps = RTE_MIN((int)device_info.num_regions,
+                               VFIO_PCI_BAR5_REGION_INDEX + 1);
+
+       /* map BARs */
+       for (i = 0; i < nb_maps; i++) {
+               struct vfio_region_info reg = { .argsz = sizeof(reg) };
+               void *bar_addr;
+
+               reg.index = i;
+               ret = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, &reg);
+               if (ret) {
+                       RTE_LOG(ERR, PMD, "%s cannot get region info, "
+                                       "error: %s\n",
+                                       pci_addr, strerror(errno));
+                       goto fail;
+               }
+
+               ret = pread(vfio_dev_fd, &ioport_bar, sizeof(ioport_bar),
+                           VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
+                           PCI_BASE_ADDRESS_0 + i * 4);
+               if (ret != sizeof(ioport_bar)) {
+                       RTE_LOG(ERR, PMD, "cannot read command (%x) from "
+                               "config space\n", PCI_BASE_ADDRESS_0 + i * 4);
+                       goto fail;
+               }
+
+               /* check for io port region */
+               if (ioport_bar & PCI_BASE_ADDRESS_SPACE_IO)
+                       continue;
+
+               /* skip non-mmapable BARs */
+               if ((reg.flags & VFIO_REGION_INFO_FLAG_MMAP) == 0)
+                       continue;
+
+               if (i == msix_table.bar_index)
+                       continue;
+
+               /* try mapping somewhere close to the end of hugepages */
+               if (pci_map_addr == NULL)
+                       pci_map_addr = pci_find_max_end_va();
+
+               bar_addr = pci_map_addr;
+               pci_map_addr = RTE_PTR_ADD(bar_addr, (size_t)reg.size);
+
+               /* reserve the address using an inaccessible mapping */
+               bar_addr = mmap(bar_addr, reg.size, 0, MAP_PRIVATE |
+                               MAP_ANONYMOUS, -1, 0);
+               if (bar_addr != MAP_FAILED) {
+                       void *map_addr = NULL;
+                       if (reg.size)
+                               map_addr = pci_map_resource(bar_addr,
+                                               vfio_dev_fd,
+                                               reg.offset, reg.size,
+                                               MAP_FIXED);
+
+                       if (map_addr == MAP_FAILED || !map_addr) {
+                               munmap(bar_addr, reg.size);
+                               bar_addr = MAP_FAILED;
+                       }
+               }
+
+               if (bar_addr == MAP_FAILED) {
+                       RTE_LOG(ERR, PMD, "%s mapping BAR%d failed: %s\n",
+                                       pci_addr, i, strerror(errno));
+                       goto fail;
+               }
+               pdev->mem_resource[i].addr = bar_addr;
+       }
+
+       if (pci_rte_vfio_setup_device(pdev, vfio_dev_fd) < 0) {
+               RTE_LOG(ERR, PMD, "%s failed to set up device\n", pci_addr);
+               goto fail;
+       }
+
+       vpci->vfio_dev_fd = vfio_dev_fd;
+       return 0;
+
+fail:
+       close(vfio_dev_fd);
+       return -1;
+}
+
+static void *
+get_cap_addr(struct rte_pci_device *dev, struct virtio_pci_cap *cap)
+{
+       uint8_t bar = cap->bar;
+       uint32_t length = cap->length;
+       uint32_t offset = cap->offset;
+       uint8_t *base;
+
+       if (bar > 5) {
+               RTE_LOG(ERR, PMD, "invalid bar: %u\n", bar);
+               return NULL;
+       }
+
+       if (offset + length < offset) {
+               RTE_LOG(ERR, PMD, "offset(%u) + length(%u) overflows\n",
+                       offset, length);
+               return NULL;
+       }
+
+       if (offset + length > dev->mem_resource[bar].len) {
+               RTE_LOG(ERR, PMD, "invalid cap: overflows bar size: %u > %lu\n",
+                       offset + length, dev->mem_resource[bar].len);
+               return NULL;
+       }
+
+       base = dev->mem_resource[bar].addr;
+       if (base == NULL) {
+               RTE_LOG(ERR, PMD, "bar %u base addr is NULL\n", bar);
+               return NULL;
+       }
+
+       return base + offset;
+}
+
+static int
+virtio_pci_map(struct virtio_pci_info *vpci)
+{
+       uint8_t pos;
+       struct virtio_pci_cap cap;
+       struct rte_pci_device *dev = &vpci->pdev;
+       int ret;
+
+       if (virtio_pci_vfio_map_resource(vpci)) {
+               RTE_LOG(ERR, PMD, "failed to map pci device\n");
+               return -1;
+       }
+
+       ret = rte_pci_read_config(dev, &pos, sizeof(pos), PCI_CAPABILITY_LIST);
+       if (ret < 0) {
+               RTE_LOG(ERR, PMD, "failed to read pci capability list\n");
+               return -1;
+       }
+
+       while (pos) {
+               ret = rte_pci_read_config(dev, &cap, sizeof(cap), pos);
+               if (ret < 0) {
+                       RTE_LOG(ERR, PMD, "failed to read cap at pos: %x", pos);
+                       break;
+               }
+
+               if (cap.cap_vndr != PCI_CAP_ID_VNDR)
+                       goto next;
+
+               RTE_LOG(INFO, PMD, "cfg type: %u, bar: %u, offset: %u, "
+                               "len: %u\n", cap.cfg_type, cap.bar,
+                               cap.offset, cap.length);
+
+               switch (cap.cfg_type) {
+               case VIRTIO_PCI_CAP_COMMON_CFG:
+                       vpci->common_cfg = get_cap_addr(dev, &cap);
+                       break;
+               case VIRTIO_PCI_CAP_NOTIFY_CFG:
+                       rte_pci_read_config(dev, &vpci->notify_off_multiplier,
+                                               4, pos + sizeof(cap));
+                       vpci->notify_base = get_cap_addr(dev, &cap);
+                       break;
+               case VIRTIO_PCI_CAP_ISR_CFG:
+                       vpci->isr = get_cap_addr(dev, &cap);
+                       break;
+               case VIRTIO_PCI_CAP_DEVICE_CFG:
+                       vpci->dev_cfg = get_cap_addr(dev, &cap);
+                       break;
+               }
+next:
+               pos = cap.cap_next;
+       }
+
+       if (vpci->common_cfg == NULL || vpci->notify_base == NULL ||
+                       vpci->isr == NULL || vpci->dev_cfg == NULL) {
+               RTE_LOG(ERR, PMD, "capability incomplete\n");
+               return -1;
+       }
+
+       RTE_LOG(INFO, PMD, "capability mapping:\ncommon cfg: %p\n"
+                       "notify base: %p\nisr cfg: %p\ndevice cfg: %p\n"
+                       "multiplier: %u\n",
+                       vpci->common_cfg, vpci->notify_base,
+                       vpci->isr, vpci->dev_cfg,
+                       vpci->notify_off_multiplier);
+
+       return 0;
+}
+
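+/*
+ * Set up the device IOMMU with IOVA == GPA for each guest memory
+ * region, so the GPAs the guest driver writes into the descriptors
+ * can be used directly by the device for DMA.
+ */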
+static int
+virtio_pci_dma_map(struct vdpa_virtio_pci_internal *internal)
+{
+       uint32_t i;
+       int ret = 0;
+       struct rte_vhost_memory *mem = NULL;
+       int vfio_container_fd;
+
+       ret = rte_vhost_get_mem_table(internal->vid, &mem);
+       if (ret < 0) {
+               RTE_LOG(ERR, PMD, "failed to get VM memory layout\n");
+               goto exit;
+       }
+
+       vfio_container_fd = internal->vpci.vfio_container_fd;
+
+       for (i = 0; i < mem->nregions; i++) {
+               struct vfio_iommu_type1_dma_map dma_map;
+               struct rte_vhost_mem_region *reg;
+               reg = &mem->regions[i];
+
+               RTE_LOG(INFO, PMD, "region %u: HVA 0x%lx, GPA 0x%lx, "
+                       "size 0x%lx\n", i, reg->host_user_addr,
+                       reg->guest_phys_addr, reg->size);
+
+               memset(&dma_map, 0, sizeof(dma_map));
+               dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
+               dma_map.vaddr = reg->host_user_addr;
+               dma_map.size = reg->size;
+               dma_map.iova = reg->guest_phys_addr;
+               dma_map.flags = VFIO_DMA_MAP_FLAG_READ |
+                               VFIO_DMA_MAP_FLAG_WRITE;
+
+               ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
+               if (ret) {
+                       RTE_LOG(ERR, PMD, "cannot set up DMA remapping, "
+                               "error: %s\n", strerror(errno));
+                       goto exit;
+               }
+       }
+
+exit:
+       if (mem)
+               free(mem);
+       return ret;
+}
+
+static int
+virtio_pci_dma_unmap(struct vdpa_virtio_pci_internal *internal)
+{
+       uint32_t i;
+       int ret = 0;
+       struct rte_vhost_memory *mem = NULL;
+       int vfio_container_fd;
+
+       ret = rte_vhost_get_mem_table(internal->vid, &mem);
+       if (ret < 0) {
+               RTE_LOG(ERR, PMD, "failed to get VM memory layout\n");
+               goto exit;
+       }
+
+       vfio_container_fd = internal->vpci.vfio_container_fd;
+
+       for (i = 0; i < mem->nregions; i++) {
+               struct vfio_iommu_type1_dma_unmap dma_unmap;
+               struct rte_vhost_mem_region *reg;
+               reg = &mem->regions[i];
+
+               memset(&dma_unmap, 0, sizeof(dma_unmap));
+               dma_unmap.argsz = sizeof(struct vfio_iommu_type1_dma_unmap);
+               dma_unmap.size = reg->size;
+               dma_unmap.iova = reg->guest_phys_addr;
+               dma_unmap.flags = 0;
+
+               ret = ioctl(vfio_container_fd, VFIO_IOMMU_UNMAP_DMA,
+                               &dma_unmap);
+               if (ret) {
+                       RTE_LOG(ERR, PMD, "cannot unset DMA remapping, "
+                               "error: %s\n", strerror(errno));
+                       goto exit;
+               }
+       }
+
+exit:
+       if (mem)
+               free(mem);
+       return ret;
+}
+
+static uint8_t
+virtio_get_status(struct virtio_pci_info *vpci)
+{
+       return rte_read8(&vpci->common_cfg->device_status);
+}
+
+static void
+virtio_set_status(struct virtio_pci_info *vpci, uint8_t status)
+{
+       rte_write8(status, &vpci->common_cfg->device_status);
+}
+
+static void
+vdpa_virtio_reset(struct virtio_pci_info *vpci)
+{
+       /* 0 means reset */
+       virtio_set_status(vpci, 0);
+
+       /* flush status write */
+       while (virtio_get_status(vpci))
+               rte_delay_ms(1);
+}
+
+static void
+vdpa_virtio_set_status(struct virtio_pci_info *vpci, uint8_t status)
+{
+       if (status != 0)
+               status |= virtio_get_status(vpci);
+
+       virtio_set_status(vpci, status);
+       virtio_get_status(vpci);
+}
+
+static uint64_t
+virtio_get_features(struct virtio_pci_info *vpci)
+{
+       uint32_t features_lo, features_hi;
+       struct virtio_pci_common_cfg *cfg = vpci->common_cfg;
+
+       rte_write32(0, &cfg->device_feature_select);
+       features_lo = rte_read32(&cfg->device_feature);
+
+       rte_write32(1, &cfg->device_feature_select);
+       features_hi = rte_read32(&cfg->device_feature);
+
+       return ((uint64_t)features_hi << 32) | features_lo;
+}
+
+static void
+vdpa_set_features(struct virtio_pci_info *vpci, uint64_t features)
+{
+       struct virtio_pci_common_cfg *cfg = vpci->common_cfg;
+
+       /* enable device DMA with iova */
+       features |= (1ULL << VIRTIO_F_IOMMU_PLATFORM);
+
+       rte_write32(0, &cfg->guest_feature_select);
+       rte_write32(features & ((1ULL << 32) - 1), &cfg->guest_feature);
+
+       rte_write32(1, &cfg->guest_feature_select);
+       rte_write32(features >> 32, &cfg->guest_feature);
+}
+
+static int
+vdpa_virtio_config_features(struct virtio_pci_info *vpci, uint64_t req_features)
+{
+       uint64_t host_features;
+
+       host_features = virtio_get_features(vpci);
+       vpci->req_features = req_features & host_features;
+
+       vdpa_set_features(vpci, vpci->req_features);
+       vdpa_virtio_set_status(vpci, VIRTIO_CONFIG_S_FEATURES_OK);
+
+       if (!(virtio_get_status(vpci) & VIRTIO_CONFIG_S_FEATURES_OK)) {
+               RTE_LOG(ERR, PMD, "failed to set FEATURES_OK status\n");
+               return -1;
+       }
+
+       return 0;
+}
+
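+/*
+ * Translate a vhost queue virtual address (the HVA form in which the
+ * vhost lib reports vring addresses) back to a guest physical address,
+ * which is what the device expects given the IOVA == GPA mapping above.
+ */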
+static uint64_t
+qva_to_gpa(int vid, uint64_t qva)
+{
+       struct rte_vhost_memory *mem = NULL;
+       struct rte_vhost_mem_region *reg;
+       uint32_t i;
+       uint64_t gpa = 0;
+
+       if (rte_vhost_get_mem_table(vid, &mem) < 0)
+               goto exit;
+
+       for (i = 0; i < mem->nregions; i++) {
+               reg = &mem->regions[i];
+
+               if (qva >= reg->host_user_addr &&
+                               qva < reg->host_user_addr + reg->size) {
+                       gpa = qva - reg->host_user_addr + reg->guest_phys_addr;
+                       break;
+               }
+       }
+
+exit:
+       if (gpa == 0)
+               rte_panic("failed to get gpa\n");
+       if (mem)
+               free(mem);
+       return gpa;
+}
+
+static void
+io_write64_twopart(uint64_t val, uint32_t *lo, uint32_t *hi)
+{
+       rte_write32(val & ((1ULL << 32) - 1), lo);
+       rte_write32(val >> 32, hi);
+}
+
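+/*
+ * Program each vring's addresses and size into the device, assign
+ * MSI-X vectors (vector 0 for config space, vector i + 1 for queue i)
+ * and record the per-queue notify addresses for the kick relay.
+ */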
+static int
+vdpa_virtio_dev_enable(struct vdpa_virtio_pci_internal *internal)
+{
+       struct virtio_pci_info *vpci;
+       struct virtio_pci_common_cfg *cfg;
+       uint64_t desc_addr, avail_addr, used_addr;
+       uint32_t i, nr_vring;
+       uint16_t notify_off;
+       struct rte_vhost_vring vq;
+
+       vpci = &internal->vpci;
+       cfg = vpci->common_cfg;
+       nr_vring = rte_vhost_get_vring_num(internal->vid);
+
+       rte_write16(0, &cfg->msix_config);
+       if (rte_read16(&cfg->msix_config) == VIRTIO_MSI_NO_VECTOR) {
+               RTE_LOG(ERR, PMD, "msix vec alloc failed for device config\n");
+               return -1;
+       }
+
+       for (i = 0; i < nr_vring; i++) {
+               rte_vhost_get_vhost_vring(internal->vid, i, &vq);
+               desc_addr = qva_to_gpa(internal->vid, (uint64_t)vq.desc);
+               avail_addr = qva_to_gpa(internal->vid, (uint64_t)vq.avail);
+               used_addr = qva_to_gpa(internal->vid, (uint64_t)vq.used);
+
+               rte_write16(i, &cfg->queue_select);
+               io_write64_twopart(desc_addr, &cfg->queue_desc_lo,
+                               &cfg->queue_desc_hi);
+               io_write64_twopart(avail_addr, &cfg->queue_avail_lo,
+                               &cfg->queue_avail_hi);
+               io_write64_twopart(used_addr, &cfg->queue_used_lo,
+                               &cfg->queue_used_hi);
+               rte_write16(vq.size, &cfg->queue_size);
+
+               rte_write16(i + 1, &cfg->queue_msix_vector);
+               if (rte_read16(&cfg->queue_msix_vector) ==
+                               VIRTIO_MSI_NO_VECTOR) {
+                       RTE_LOG(ERR, PMD, "queue %u, msix vec alloc failed\n",
+                                       i);
+                       return -1;
+               }
+
+               notify_off = rte_read16(&cfg->queue_notify_off);
+               vpci->notify_addr[i] = (void *)((uint8_t *)vpci->notify_base +
+                               notify_off * vpci->notify_off_multiplier);
+               rte_write16(1, &cfg->queue_enable);
+
+               RTE_LOG(INFO, PMD, "queue %u addresses:\n"
+                               "desc_addr: 0x%lx\tavail_addr: 0x%lx\tused_addr: 0x%lx\n"
+                               "queue size: %u\t\tnotify addr: %p\tnotify offset: %u\n",
+                               i, desc_addr, avail_addr, used_addr,
+                               vq.size, vpci->notify_addr[i], notify_off);
+       }
+
+       return 0;
+}
+
+static void
+vdpa_virtio_dev_disable(struct vdpa_virtio_pci_internal *internal)
+{
+       uint32_t i, nr_vring;
+       struct virtio_pci_info *vpci;
+       struct virtio_pci_common_cfg *cfg;
+
+       vpci = &internal->vpci;
+       cfg = vpci->common_cfg;
+       nr_vring = rte_vhost_get_vring_num(internal->vid);
+
+       rte_write16(VIRTIO_MSI_NO_VECTOR, &cfg->msix_config);
+       for (i = 0; i < nr_vring; i++) {
+               rte_write16(i, &cfg->queue_select);
+               rte_write16(0, &cfg->queue_enable);
+               rte_write16(VIRTIO_MSI_NO_VECTOR, &cfg->queue_msix_vector);
+       }
+}
+
+static int
+vdpa_virtio_pci_start(struct vdpa_virtio_pci_internal *internal)
+{
+       struct virtio_pci_info *vpci;
+       uint64_t features;
+
+       vpci = &internal->vpci;
+
+       rte_vhost_get_negotiated_features(internal->vid, &features);
+
+       /* Reset the device although not necessary at startup. */
+       vdpa_virtio_reset(vpci);
+
+       /* Tell the host we've noticed this device. */
+       vdpa_virtio_set_status(vpci, VIRTIO_CONFIG_S_ACKNOWLEDGE);
+
+       /* Tell the host we've known how to drive the device. */
+       vdpa_virtio_set_status(vpci, VIRTIO_CONFIG_S_DRIVER);
+
+       if (vdpa_virtio_config_features(vpci, features) < 0)
+               return -1;
+
+       if (vdpa_virtio_dev_enable(internal) < 0)
+               return -1;
+
+       vdpa_virtio_set_status(vpci, VIRTIO_CONFIG_S_DRIVER_OK);
+       return 0;
+}
+
+static void
+vdpa_virtio_pci_stop(struct vdpa_virtio_pci_internal *internal)
+{
+       struct virtio_pci_info *vpci;
+
+       vpci = &internal->vpci;
+       vdpa_virtio_dev_disable(internal);
+       vdpa_virtio_reset(vpci);
+}
+
+static int
+vdpa_enable_vfio_intr(struct vdpa_virtio_pci_internal *internal)
+{
+       int ret;
+       uint32_t i, nr_vring;
+       char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
+       struct vfio_irq_set *irq_set;
+       int *fd_ptr;
+       struct virtio_pci_info *vpci;
+       struct rte_vhost_vring vring;
+
+       vpci = &internal->vpci;
+       nr_vring = rte_vhost_get_vring_num(internal->vid);
+
+       irq_set = (struct vfio_irq_set *)irq_set_buf;
+       irq_set->argsz = sizeof(irq_set_buf);
+       irq_set->count = nr_vring + 1;
+       irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
+                        VFIO_IRQ_SET_ACTION_TRIGGER;
+       irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
+       irq_set->start = 0;
+       fd_ptr = (int *)&irq_set->data;
+       fd_ptr[RTE_INTR_VEC_ZERO_OFFSET] = vpci->pdev.intr_handle.fd;
+
+       for (i = 0; i < nr_vring; i++) {
+               rte_vhost_get_vhost_vring(internal->vid, i, &vring);
+               fd_ptr[RTE_INTR_VEC_RXTX_OFFSET + i] = vring.callfd;
+       }
+
+       ret = ioctl(vpci->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
+       if (ret) {
+               RTE_LOG(ERR, PMD, "Error enabling MSI-X interrupts: %s\n",
+                               strerror(errno));
+               return -1;
+       }
+
+       return 0;
+}
+
+static int
+vdpa_disable_vfio_intr(struct vdpa_virtio_pci_internal *internal)
+{
+       int ret;
+       char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
+       struct vfio_irq_set *irq_set;
+       struct virtio_pci_info *vpci;
+
+       vpci = &internal->vpci;
+
+       irq_set = (struct vfio_irq_set *)irq_set_buf;
+       irq_set->argsz = sizeof(irq_set_buf);
+       irq_set->count = 0;
+       irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
+       irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
+       irq_set->start = 0;
+
+       ret = ioctl(vpci->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
+       if (ret) {
+               RTE_LOG(ERR, PMD, "Error disabling MSI-X interrupts: %s\n",
+                               strerror(errno));
+               return -1;
+       }
+
+       return 0;
+}
+
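+/*
+ * Kick relay thread: epoll on each vring's kickfd and forward the
+ * guest's doorbell by writing the queue id to the device's notify
+ * address.
+ */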
+static void *
+notify_relay(void *arg)
+{
+       int i, kickfd, epfd, nfds = 0;
+       struct virtio_pci_info *vpci;
+       uint32_t qid, q_num;
+       struct epoll_event events[MAX_QUEUES * 2];
+       struct epoll_event ev;
+       uint64_t buf;
+       int nbytes;
+       struct rte_vhost_vring vring;
+       struct vdpa_virtio_pci_internal *internal = arg;
+
+       vpci = &internal->vpci;
+       q_num = rte_vhost_get_vring_num(internal->vid);
+
+       epfd = epoll_create(MAX_QUEUES * 2);
+       if (epfd < 0) {
+               RTE_LOG(ERR, PMD, "failed to create epoll instance\n");
+               return NULL;
+       }
+       vpci->epfd = epfd;
+
+       for (qid = 0; qid < q_num; qid++) {
+               ev.events = EPOLLIN | EPOLLPRI;
+               rte_vhost_get_vhost_vring(internal->vid, qid, &vring);
+               ev.data.u64 = qid | (uint64_t)vring.kickfd << 32;
+               if (epoll_ctl(epfd, EPOLL_CTL_ADD, vring.kickfd, &ev) < 0) {
+                       RTE_LOG(ERR, PMD, "epoll add error, %s\n",
+                                       strerror(errno));
+                       return NULL;
+               }
+       }
+
+       for (;;) {
+               nfds = epoll_wait(epfd, events, q_num, -1);
+               if (nfds < 0) {
+                       if (errno == EINTR)
+                               continue;
+                       RTE_LOG(ERR, PMD, "epoll_wait return fail\n");
+                       return NULL;
+               }
+
+               for (i = 0; i < nfds; i++) {
+                       qid = events[i].data.u32;
+                       kickfd = (uint32_t)(events[i].data.u64 >> 32);
+                       do {
+                               nbytes = read(kickfd, &buf, 8);
+                               if (nbytes < 0) {
+                                       if (errno == EINTR ||
+                                           errno == EWOULDBLOCK ||
+                                           errno == EAGAIN)
+                                               continue;
+                                       RTE_LOG(INFO, PMD, "Error reading "
+                                               "kickfd: %s\n",
+                                               strerror(errno));
+                               }
+                               break;
+                       } while (1);
+
+                       rte_write16(qid, vpci->notify_addr[qid]);
+               }
+       }
+
+       return NULL;
+}
+
+static int
+setup_notify_relay(struct vdpa_virtio_pci_internal *internal)
+{
+       int ret;
+
+       ret = pthread_create(&internal->vpci.tid, NULL, notify_relay,
+                            (void *)internal);
+       if (ret) {
+               RTE_LOG(ERR, PMD, "failed to create notify relay pthread\n");
+               return -1;
+       }
+       return 0;
+}
+
+static int
+unset_notify_relay(struct vdpa_virtio_pci_internal *internal)
+{
+       struct virtio_pci_info *vpci;
+       void *status;
+
+       vpci = &internal->vpci;
+       if (vpci->tid) {
+               pthread_cancel(vpci->tid);
+               pthread_join(vpci->tid, &status);
+       }
+       vpci->tid = 0;
+
+       if (vpci->epfd >= 0)
+               close(vpci->epfd);
+       vpci->epfd = -1;
+
+       return 0;
+}
+
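+/*
+ * The datapath is brought up only when the port is started AND a
+ * vhost device is attached; it is torn down as soon as either of
+ * the two conditions no longer holds.
+ */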
+static int
+update_datapath(struct rte_eth_dev *eth_dev)
+{
+       struct vdpa_virtio_pci_internal *internal;
+       int ret;
+
+       internal = eth_dev->data->dev_private;
+       rte_spinlock_lock(&internal->lock);
+
+       if (!rte_atomic32_read(&internal->running) &&
+           (rte_atomic32_read(&internal->started) &&
+            rte_atomic32_read(&internal->dev_attached))) {
+               ret = virtio_pci_dma_map(internal);
+               if (ret)
+                       goto err;
+
+               ret = vdpa_enable_vfio_intr(internal);
+               if (ret)
+                       goto err;
+
+               ret = setup_notify_relay(internal);
+               if (ret)
+                       goto err;
+
+               ret = vdpa_virtio_pci_start(internal);
+               if (ret)
+                       goto err;
+
+               rte_atomic32_set(&internal->running, 1);
+       } else if (rte_atomic32_read(&internal->running) &&
+                  (!rte_atomic32_read(&internal->started) ||
+                   !rte_atomic32_read(&internal->dev_attached))) {
+               vdpa_virtio_pci_stop(internal);
+
+               ret = unset_notify_relay(internal);
+               if (ret)
+                       goto err;
+
+               ret = vdpa_disable_vfio_intr(internal);
+               if (ret)
+                       goto err;
+
+               ret = virtio_pci_dma_unmap(internal);
+               if (ret)
+                       goto err;
+
+               rte_atomic32_set(&internal->running, 0);
+       }
+
+       rte_spinlock_unlock(&internal->lock);
+       return 0;
+err:
+       rte_spinlock_unlock(&internal->lock);
+       return ret;
+}
+
+static int
+vdpa_virtio_dev_config(int vid)
+{
+       int eid;
+       struct internal_list *list;
+       struct rte_eth_dev *eth_dev;
+       struct vdpa_virtio_pci_internal *internal;
+
+       eid = rte_vhost_get_vdpa_eid(vid);
+       list = find_internal_resource_by_eid(eid);
+       if (list == NULL) {
+               RTE_LOG(ERR, PMD, "Invalid engine id: %d\n", eid);
+               return -1;
+       }
+
+       eth_dev = list->eth_dev;
+       internal = eth_dev->data->dev_private;
+
+       eth_dev->data->dev_link.link_status = ETH_LINK_UP;
+
+       rte_atomic32_set(&internal->dev_attached, 1);
+       update_datapath(eth_dev);
+
+       return 0;
+}
+
+static int
+vdpa_virtio_dev_close(int vid)
+{
+       int eid;
+       struct internal_list *list;
+       struct rte_eth_dev *eth_dev;
+       struct vdpa_virtio_pci_internal *internal;
+
+       eid = rte_vhost_get_vdpa_eid(vid);
+       list = find_internal_resource_by_eid(eid);
+       if (list == NULL) {
+               RTE_LOG(ERR, PMD, "Invalid engine id: %d\n", eid);
+               return -1;
+       }
+
+       eth_dev = list->eth_dev;
+       internal = eth_dev->data->dev_private;
+
+       eth_dev->data->dev_link.link_status = ETH_LINK_DOWN;
+
+       rte_atomic32_set(&internal->dev_attached, 0);
+       update_datapath(eth_dev);
+
+       return 0;
+}
+
+static void
+vfio_close_fds(struct virtio_pci_info *vpci)
+{
+       if (vpci->vfio_dev_fd >= 0)
+               close(vpci->vfio_dev_fd);
+       if (vpci->vfio_group_fd >= 0)
+               close(vpci->vfio_group_fd);
+       if (vpci->vfio_container_fd >= 0)
+               close(vpci->vfio_container_fd);
+
+       vpci->vfio_dev_fd = -1;
+       vpci->vfio_group_fd = -1;
+       vpci->vfio_container_fd = -1;
+}
+
+static int
+vdpa_virtio_eng_init(int eid, struct rte_vdpa_eng_addr *addr)
+{
+       struct internal_list *list;
+       struct vdpa_virtio_pci_internal *internal;
+       struct virtio_pci_info *vpci;
+       uint64_t features;
+
+       list = find_internal_resource_by_eng_addr(addr);
+       if (list == NULL) {
+               RTE_LOG(ERR, PMD, "Invalid engine addr\n");
+               return -1;
+       }
+
+       internal = list->eth_dev->data->dev_private;
+       vpci = &internal->vpci;
+
+       vpci->vfio_dev_fd = -1;
+       vpci->vfio_group_fd = -1;
+       vpci->vfio_container_fd = -1;
+
+       if (check_pci_dev(&vpci->pdev) < 0)
+               return -1;
+
+       if (virtio_pci_map(vpci) < 0)
+               goto err;
+
+       internal->eid = eid;
+       internal->max_devices = 1;
+       internal->max_queues = MAX_QUEUES;
+       features = virtio_get_features(&internal->vpci);
+       if ((features & (1ULL << VIRTIO_F_IOMMU_PLATFORM)) == 0) {
+               RTE_LOG(ERR, PMD, "VIRTIO_F_IOMMU_PLATFORM feature is required "
+                               "to support DMA with IOVA");
+               goto err;
+       }
+
+       /* We need the nested VM's driver to use GPA */
+       internal->features = (features & ~(1ULL << VIRTIO_F_IOMMU_PLATFORM)) |
+                         (1ULL << RTE_VHOST_USER_F_PROTOCOL_FEATURES);
+       return 0;
+
+err:
+       vfio_close_fds(vpci);
+       return -1;
+}
+
+static int
+vdpa_virtio_eng_uninit(int eid)
+{
+       struct internal_list *list;
+       struct vdpa_virtio_pci_internal *internal;
+
+       list = find_internal_resource_by_eid(eid);
+       if (list == NULL) {
+               RTE_LOG(ERR, PMD, "Invalid engine id %d\n", eid);
+               return -1;
+       }
+
+       internal = list->eth_dev->data->dev_private;
+       vfio_close_fds(&internal->vpci);
+       return 0;
+}
+
+#define VDPA_SUPPORTED_PROTOCOL_FEATURES \
+               (1ULL << RTE_VHOST_USER_PROTOCOL_F_REPLY_ACK)
+static int
+vdpa_virtio_info_query(int eid, struct rte_vdpa_eng_attr *attr)
+{
+       struct internal_list *list;
+       struct vdpa_virtio_pci_internal *internal;
+
+       list = find_internal_resource_by_eid(eid);
+       if (list == NULL) {
+               RTE_LOG(ERR, PMD, "Invalid engine id: %d\n", eid);
+               return -1;
+       }
+
+       internal = list->eth_dev->data->dev_private;
+       attr->dev_num = internal->max_devices;
+       attr->queue_num = internal->max_queues;
+       attr->features = internal->features;
+       attr->protocol_features = VDPA_SUPPORTED_PROTOCOL_FEATURES;
+
+       return 0;
+}
+
+struct rte_vdpa_eng_driver vdpa_virtio_pci_driver = {
+       .name = "vdpa_virtio_pci",
+       .eng_ops = {
+               .eng_init = vdpa_virtio_eng_init,
+               .eng_uninit = vdpa_virtio_eng_uninit,
+               .info_query = vdpa_virtio_info_query,
+       },
+       .dev_ops = {
+               .dev_conf = vdpa_virtio_dev_config,
+               .dev_close = vdpa_virtio_dev_close,
+               .vring_state_set = NULL,
+               .feature_set = NULL,
+               .migration_done = NULL,
+       },
+};
+
+RTE_VDPA_REGISTER_DRIVER(vdpa_virtio_pci, vdpa_virtio_pci_driver);
+
+static int
+eth_dev_start(struct rte_eth_dev *dev)
+{
+       struct vdpa_virtio_pci_internal *internal;
+
+       internal = dev->data->dev_private;
+       rte_atomic32_set(&internal->started, 1);
+       update_datapath(dev);
+
+       return 0;
+}
+
+static void
+eth_dev_stop(struct rte_eth_dev *dev)
+{
+       struct vdpa_virtio_pci_internal *internal;
+
+       internal = dev->data->dev_private;
+       rte_atomic32_set(&internal->started, 0);
+       update_datapath(dev);
+}
+
+static void
+eth_dev_close(struct rte_eth_dev *dev)
+{
+       struct vdpa_virtio_pci_internal *internal;
+       struct internal_list *list;
+
+       internal = dev->data->dev_private;
+       eth_dev_stop(dev);
+
+       list = find_internal_resource_by_eng_addr(&internal->eng_addr);
+       if (list == NULL) {
+               RTE_LOG(ERR, PMD, "Invalid engine addr\n");
+               return;
+       }
+
+       rte_vdpa_unregister_engine(internal->eid);
+
+       pthread_mutex_lock(&internal_list_lock);
+       TAILQ_REMOVE(&internal_list, list, next);
+       pthread_mutex_unlock(&internal_list_lock);
+       rte_free(list);
+
+       rte_free(dev->data->mac_addrs);
+       free(internal->dev_name);
+       rte_free(internal);
+
+       dev->data->dev_private = NULL;
+}
+
+static int
+eth_dev_configure(struct rte_eth_dev *dev __rte_unused)
+{
+       return 0;
+}
+
+static void
+eth_dev_info(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
+{
+       struct vdpa_virtio_pci_internal *internal;
+
+       internal = dev->data->dev_private;
+       if (internal == NULL) {
+               RTE_LOG(ERR, PMD, "Invalid device specified\n");
+               return;
+       }
+
+       dev_info->max_mac_addrs = 1;
+       dev_info->max_rx_pktlen = (uint32_t)-1;
+       dev_info->max_rx_queues = internal->max_queues;
+       dev_info->max_tx_queues = internal->max_queues;
+       dev_info->min_rx_bufsize = 0;
+}
+
+static int
+eth_rx_queue_setup(struct rte_eth_dev *dev __rte_unused,
+                  uint16_t rx_queue_id __rte_unused,
+                  uint16_t nb_rx_desc __rte_unused,
+                  unsigned int socket_id __rte_unused,
+                  const struct rte_eth_rxconf *rx_conf __rte_unused,
+                  struct rte_mempool *mb_pool __rte_unused)
+{
+       return 0;
+}
+
+static int
+eth_tx_queue_setup(struct rte_eth_dev *dev __rte_unused,
+                  uint16_t tx_queue_id __rte_unused,
+                  uint16_t nb_tx_desc __rte_unused,
+                  unsigned int socket_id __rte_unused,
+                  const struct rte_eth_txconf *tx_conf __rte_unused)
+{
+       return 0;
+}
+
+static void
+eth_queue_release(void *q __rte_unused)
+{
+}
+
+static uint16_t
+eth_vdpa_virtio_pci_rx(void *q __rte_unused,
+                      struct rte_mbuf **bufs __rte_unused,
+                      uint16_t nb_bufs __rte_unused)
+{
+       return 0;
+}
+
+static uint16_t
+eth_vdpa_virtio_pci_tx(void *q __rte_unused,
+                      struct rte_mbuf **bufs __rte_unused,
+                      uint16_t nb_bufs __rte_unused)
+{
+       return 0;
+}
+
+static int
+eth_link_update(struct rte_eth_dev *dev __rte_unused,
+               int wait_to_complete __rte_unused)
+{
+       return 0;
+}
+
+static const struct eth_dev_ops ops = {
+       .dev_start = eth_dev_start,
+       .dev_stop = eth_dev_stop,
+       .dev_close = eth_dev_close,
+       .dev_configure = eth_dev_configure,
+       .dev_infos_get = eth_dev_info,
+       .rx_queue_setup = eth_rx_queue_setup,
+       .tx_queue_setup = eth_tx_queue_setup,
+       .rx_queue_release = eth_queue_release,
+       .tx_queue_release = eth_queue_release,
+       .link_update = eth_link_update,
+};
+
+static int
+eth_dev_vdpa_virtio_pci_create(struct rte_vdev_device *dev,
+               struct rte_pci_addr *pci_addr)
+{
+       const char *name = rte_vdev_device_name(dev);
+       struct rte_eth_dev *eth_dev = NULL;
+       struct ether_addr *eth_addr = NULL;
+       struct vdpa_virtio_pci_internal *internal = NULL;
+       struct internal_list *list = NULL;
+       struct rte_eth_dev_data *data = NULL;
+
+       list = rte_zmalloc_socket(name, sizeof(*list), 0,
+                       dev->device.numa_node);
+       if (list == NULL)
+               goto error;
+
+       /* reserve an ethdev entry */
+       eth_dev = rte_eth_vdev_allocate(dev, sizeof(*internal));
+       if (eth_dev == NULL)
+               goto error;
+
+       eth_addr = rte_zmalloc_socket(name, sizeof(*eth_addr), 0,
+                       dev->device.numa_node);
+       if (eth_addr == NULL)
+               goto error;
+
+       *eth_addr = base_eth_addr;
+       eth_addr->addr_bytes[5] = eth_dev->data->port_id;
+
+       internal = eth_dev->data->dev_private;
+       internal->dev_name = strdup(name);
+       if (internal->dev_name == NULL)
+               goto error;
+
+       internal->eng_addr.pci_addr = *pci_addr;
+       internal->vpci.pdev.addr = *pci_addr;
+       rte_spinlock_init(&internal->lock);
+
+       list->eth_dev = eth_dev;
+       pthread_mutex_lock(&internal_list_lock);
+       TAILQ_INSERT_TAIL(&internal_list, list, next);
+       pthread_mutex_unlock(&internal_list_lock);
+
+       data = eth_dev->data;
+       data->nb_rx_queues = MAX_QUEUES;
+       data->nb_tx_queues = MAX_QUEUES;
+       data->dev_link = vdpa_link;
+       data->mac_addrs = eth_addr;
+       data->dev_flags = RTE_ETH_DEV_INTR_LSC;
+       eth_dev->dev_ops = &ops;
+
+       /* assign rx and tx ops, could be used as vDPA fallback */
+       eth_dev->rx_pkt_burst = eth_vdpa_virtio_pci_rx;
+       eth_dev->tx_pkt_burst = eth_vdpa_virtio_pci_tx;
+
+       if (rte_vdpa_register_engine(vdpa_virtio_pci_driver.name,
+                               &internal->eng_addr) < 0)
+               goto error;
+
+       return 0;
+
+error:
+       rte_free(list);
+       rte_free(eth_addr);
+       if (internal && internal->dev_name)
+               free(internal->dev_name);
+       rte_free(internal);
+       if (eth_dev)
+               rte_eth_dev_release_port(eth_dev);
+
+       return -1;
+}
+
+static int
+get_pci_addr(const char *key __rte_unused, const char *value, void *extra_args)
+{
+       if (value == NULL || extra_args == NULL)
+               return -1;
+
+       return parse_pci_addr_format(value, strlen(value), extra_args);
+}
+
+static int
+rte_vdpa_virtio_pci_probe(struct rte_vdev_device *dev)
+{
+       struct rte_kvargs *kvlist = NULL;
+       int ret = 0;
+       struct rte_pci_addr pci_addr;
+
+       RTE_LOG(INFO, PMD, "Initializing vdpa_virtio_pci for %s\n",
+               rte_vdev_device_name(dev));
+
+       kvlist = rte_kvargs_parse(rte_vdev_device_args(dev), valid_arguments);
+       if (kvlist == NULL)
+               return -1;
+
+       if (rte_kvargs_count(kvlist, ETH_VDPA_VIRTIO_PCI_BDF_ARG) == 1) {
+               ret = rte_kvargs_process(kvlist, ETH_VDPA_VIRTIO_PCI_BDF_ARG,
+                               &get_pci_addr, &pci_addr);
+               if (ret < 0)
+                       goto out_free;
+       } else {
+               ret = -1;
+               goto out_free;
+       }
+
+       eth_dev_vdpa_virtio_pci_create(dev, &pci_addr);
+
+out_free:
+       rte_kvargs_free(kvlist);
+       return ret;
+}
+
+static int
+rte_vdpa_virtio_pci_remove(struct rte_vdev_device *dev)
+{
+       const char *name;
+       struct rte_eth_dev *eth_dev = NULL;
+
+       name = rte_vdev_device_name(dev);
+       RTE_LOG(INFO, PMD, "Un-Initializing vdpa_virtio_pci for %s\n", name);
+
+       /* find an ethdev entry */
+       eth_dev = rte_eth_dev_allocated(name);
+       if (eth_dev == NULL)
+               return -ENODEV;
+
+       eth_dev_close(eth_dev);
+       rte_free(eth_dev->data);
+       rte_eth_dev_release_port(eth_dev);
+
+       return 0;
+}
+
+static struct rte_vdev_driver vdpa_virtio_pci_drv = {
+       .probe = rte_vdpa_virtio_pci_probe,
+       .remove = rte_vdpa_virtio_pci_remove,
+};
+
+RTE_PMD_REGISTER_VDEV(net_vdpa_virtio_pci, vdpa_virtio_pci_drv);
+RTE_PMD_REGISTER_ALIAS(net_vdpa_virtio_pci, eth_vdpa_virtio_pci);
+RTE_PMD_REGISTER_PARAM_STRING(net_vdpa_virtio_pci,
+       "bdf=<bdf>");
diff --git a/drivers/net/vdpa_virtio_pci/rte_vdpa_virtio_pci_version.map b/drivers/net/vdpa_virtio_pci/rte_vdpa_virtio_pci_version.map
new file mode 100644
index 000000000..33d237913
--- /dev/null
+++ b/drivers/net/vdpa_virtio_pci/rte_vdpa_virtio_pci_version.map
@@ -0,0 +1,4 @@
+EXPERIMENTAL {
+
+       local: *;
+};
diff --git a/mk/rte.app.mk b/mk/rte.app.mk
index 3eb41d176..44e87f4d9 100644
--- a/mk/rte.app.mk
+++ b/mk/rte.app.mk
@@ -171,6 +171,7 @@ _LDLIBS-$(CONFIG_RTE_LIBRTE_VDEV_NETVSC_PMD) += -lrte_pmd_vdev_netvsc
 _LDLIBS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD)     += -lrte_pmd_virtio
 ifeq ($(CONFIG_RTE_LIBRTE_VHOST),y)
 _LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_VHOST)      += -lrte_pmd_vhost
+_LDLIBS-$(CONFIG_RTE_LIBRTE_VDPA_VIRTIO_PCI)      += -lrte_vdpa_virtio_pci
 endif # $(CONFIG_RTE_LIBRTE_VHOST)
 _LDLIBS-$(CONFIG_RTE_LIBRTE_VMXNET3_PMD)    += -lrte_pmd_vmxnet3_uio
 
-- 
2.15.1
