This patch adds mdev support to the PCI bus driver. An mdev
driver is introduced to probe the mdev devices on the mdev bus
whose device API is "vfio-pci".

PS. There are some hacks in this patch for now.

Signed-off-by: Cunming Liang <cunming.li...@intel.com>
Signed-off-by: Tiwei Bie <tiwei....@intel.com>
---
 drivers/bus/pci/Makefile              |   3 +
 drivers/bus/pci/linux/Makefile        |   4 +
 drivers/bus/pci/linux/pci_vfio.c      |  35 ++-
 drivers/bus/pci/linux/pci_vfio_mdev.c | 305 ++++++++++++++++++++++++++
 drivers/bus/pci/meson.build           |   4 +-
 drivers/bus/pci/pci_common.c          |  17 +-
 drivers/bus/pci/private.h             |   9 +
 drivers/bus/pci/rte_bus_pci.h         |  11 +-
 8 files changed, 370 insertions(+), 18 deletions(-)
 create mode 100644 drivers/bus/pci/linux/pci_vfio_mdev.c

diff --git a/drivers/bus/pci/Makefile b/drivers/bus/pci/Makefile
index de53ce1bf..085ec9066 100644
--- a/drivers/bus/pci/Makefile
+++ b/drivers/bus/pci/Makefile
@@ -27,6 +27,9 @@ CFLAGS += -DALLOW_EXPERIMENTAL_API
 
 LDLIBS += -lrte_eal -lrte_mbuf -lrte_mempool -lrte_ring
 LDLIBS += -lrte_ethdev -lrte_pci -lrte_kvargs
+ifeq ($(CONFIG_RTE_LIBRTE_MDEV_BUS),y)
+LDLIBS += -lrte_bus_mdev
+endif
 
 include $(RTE_SDK)/drivers/bus/pci/$(SYSTEM)/Makefile
 SRCS-$(CONFIG_RTE_LIBRTE_PCI_BUS) := $(addprefix $(SYSTEM)/,$(SRCS))
diff --git a/drivers/bus/pci/linux/Makefile b/drivers/bus/pci/linux/Makefile
index 90404468b..88bbc2390 100644
--- a/drivers/bus/pci/linux/Makefile
+++ b/drivers/bus/pci/linux/Makefile
@@ -4,3 +4,7 @@
 SRCS += pci.c
 SRCS += pci_uio.c
 SRCS += pci_vfio.c
+
+ifeq ($(CONFIG_RTE_LIBRTE_MDEV_BUS),y)
+       SRCS += pci_vfio_mdev.c
+endif
diff --git a/drivers/bus/pci/linux/pci_vfio.c b/drivers/bus/pci/linux/pci_vfio.c
index ebf6ccd3c..c2c4c6a50 100644
--- a/drivers/bus/pci/linux/pci_vfio.c
+++ b/drivers/bus/pci/linux/pci_vfio.c
@@ -13,6 +13,9 @@
 
 #include <rte_log.h>
 #include <rte_pci.h>
+#ifdef RTE_LIBRTE_MDEV_BUS
+#include <rte_bus_mdev.h>
+#endif
 #include <rte_bus_pci.h>
 #include <rte_eal_memconfig.h>
 #include <rte_malloc.h>
@@ -20,6 +23,7 @@
 #include <rte_eal.h>
 #include <rte_bus.h>
 #include <rte_spinlock.h>
+#include <rte_uuid.h>
 
 #include "eal_filesystem.h"
 
@@ -648,6 +652,7 @@ pci_vfio_map_resource_primary(struct rte_pci_device *dev)
 {
        struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
        char pci_addr[PATH_MAX] = {0};
+       const char *sysfs_path;
        int vfio_dev_fd;
        struct rte_pci_addr *loc = &dev->addr;
        int i, ret;
@@ -663,10 +668,20 @@ pci_vfio_map_resource_primary(struct rte_pci_device *dev)
 #endif
 
        /* store PCI address string */
-       snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT,
+       if (dev->use_uuid) {
+#ifdef RTE_LIBRTE_MDEV_BUS
+               sysfs_path = rte_mdev_get_sysfs_path();
+               rte_uuid_unparse(dev->uuid, pci_addr, sizeof(pci_addr));
+#else
+               return -1;
+#endif
+       } else {
+               sysfs_path = rte_pci_get_sysfs_path();
+               snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT,
                        loc->domain, loc->bus, loc->devid, loc->function);
+       }
 
-       ret = rte_vfio_setup_device(rte_pci_get_sysfs_path(), pci_addr,
+       ret = rte_vfio_setup_device(sysfs_path, pci_addr,
                                        &vfio_dev_fd, &device_info);
        if (ret)
                return ret;
@@ -793,6 +808,7 @@ pci_vfio_map_resource_secondary(struct rte_pci_device *dev)
 {
        struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
        char pci_addr[PATH_MAX] = {0};
+       const char *sysfs_path;
        int vfio_dev_fd;
        struct rte_pci_addr *loc = &dev->addr;
        int i, ret;
@@ -808,8 +824,19 @@ pci_vfio_map_resource_secondary(struct rte_pci_device *dev)
 #endif
 
        /* store PCI address string */
-       snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT,
+       if (dev->use_uuid) {
+#ifdef RTE_LIBRTE_MDEV_BUS
+               sysfs_path = rte_mdev_get_sysfs_path();
+               rte_uuid_unparse(dev->uuid, pci_addr, sizeof(pci_addr));
+#else
+               return -1;
+#endif
+       } else {
+               sysfs_path = rte_pci_get_sysfs_path();
+               snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT,
                        loc->domain, loc->bus, loc->devid, loc->function);
+       }
+
 
        /* if we're in a secondary process, just find our tailq entry */
        TAILQ_FOREACH(vfio_res, vfio_res_list, next) {
@@ -825,7 +852,7 @@ pci_vfio_map_resource_secondary(struct rte_pci_device *dev)
                return -1;
        }
 
-       ret = rte_vfio_setup_device(rte_pci_get_sysfs_path(), pci_addr,
+       ret = rte_vfio_setup_device(sysfs_path, pci_addr,
                                        &vfio_dev_fd, &device_info);
        if (ret)
                return ret;
diff --git a/drivers/bus/pci/linux/pci_vfio_mdev.c 
b/drivers/bus/pci/linux/pci_vfio_mdev.c
new file mode 100644
index 000000000..92498c2fe
--- /dev/null
+++ b/drivers/bus/pci/linux/pci_vfio_mdev.c
@@ -0,0 +1,305 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2019 Intel Corporation
+ */
+
+#include <string.h>
+#include <dirent.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <linux/pci_regs.h>
+
+#include <rte_log.h>
+#include <rte_pci.h>
+#include <rte_eal_memconfig.h>
+#include <rte_malloc.h>
+#include <rte_devargs.h>
+#include <rte_memcpy.h>
+#include <rte_vfio.h>
+#include <rte_bus_mdev.h>
+
+#include "eal_private.h"
+#include "eal_filesystem.h"
+
+#include "private.h"
+
+extern struct rte_pci_bus rte_pci_bus;
+
+/*
+ * Read len bytes at the given offset of the VFIO PCI config region of
+ * vfio_dev_fd into buf. "what" names the field in the error message.
+ *
+ * Returns 0 on success, -1 on a failed or short read.
+ */
+static int
+mdev_pci_cfg_read(int vfio_dev_fd, void *buf, size_t len, uint64_t offset,
+                 const char *what)
+{
+       if (pread64(vfio_dev_fd, buf, len,
+                   VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
+                   offset) != (ssize_t)len) {
+               RTE_LOG(ERR, EAL, "Cannot read %s from PCI config space\n",
+                       what);
+               return -1;
+       }
+
+       return 0;
+}
+
+/*
+ * Query the PCI identifiers of a VFIO-managed device.
+ *
+ * Opens a private VFIO container, attaches the IOMMU group of dev_addr
+ * (looked up under sysfs_base) to it, obtains a device fd from the group
+ * and reads the vendor, device, subsystem vendor and subsystem device IDs
+ * plus the class code from the device's PCI config region into pci_id.
+ * Every file descriptor opened here is closed again before returning.
+ *
+ * Returns 0 on success, -1 on any failure (including a failing close()).
+ * On failure pci_id may have been partially written.
+ */
+static int
+get_pci_id(const char *sysfs_base, const char *dev_addr,
+          struct rte_pci_id *pci_id)
+{
+       struct vfio_group_status group_status = {
+               .argsz = sizeof(group_status) };
+       char name[PATH_MAX];
+       /* Unsigned so that the shift below cannot sign-extend. */
+       uint32_t class_rev;
+       int iommu_group_num;
+       int vfio_group_fd;
+       int vfio_dev_fd;
+       int container;
+       int ret = -1;
+
+       container = open("/dev/vfio/vfio", O_RDWR);
+       if (container < 0) {
+               RTE_LOG(WARNING, EAL, "Failed to open VFIO container\n");
+               goto out;
+       }
+
+       if (ioctl(container, VFIO_GET_API_VERSION) != VFIO_API_VERSION) {
+               /* Unknown API version */
+               RTE_LOG(WARNING, EAL, "Unknown VFIO API version\n");
+               goto close_container;
+       }
+
+       if (rte_vfio_get_group_num(sysfs_base, dev_addr,
+                                  &iommu_group_num) <= 0) {
+               RTE_LOG(WARNING, EAL, "%s not managed by VFIO driver\n",
+                       dev_addr);
+               goto close_container;
+       }
+
+       snprintf(name, sizeof(name), "/dev/vfio/%d", iommu_group_num);
+
+       /*
+        * open() reports failure with a negative value only; 0 is a valid
+        * descriptor and must not be treated as "not managed by VFIO".
+        */
+       vfio_group_fd = open(name, O_RDWR);
+       if (vfio_group_fd < 0) {
+               RTE_LOG(ERR, EAL, "Cannot open %s, error %i (%s)\n",
+                       name, errno, strerror(errno));
+               goto close_container;
+       }
+
+       if (ioctl(vfio_group_fd, VFIO_GROUP_GET_STATUS, &group_status)) {
+               RTE_LOG(ERR, EAL, "%s cannot get group status, error %i (%s)\n",
+                       dev_addr, errno, strerror(errno));
+               goto close_group;
+       }
+
+       if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
+               RTE_LOG(ERR, EAL, "%s VFIO group is not viable!\n", dev_addr);
+               goto close_group;
+       }
+
+       /* Attach the group to our container unless it already has one. */
+       if (!(group_status.flags & VFIO_GROUP_FLAGS_CONTAINER_SET)) {
+               if (ioctl(vfio_group_fd, VFIO_GROUP_SET_CONTAINER,
+                           &container)) {
+                       RTE_LOG(ERR, EAL, "%s cannot add VFIO group to container, error %i (%s)\n",
+                               dev_addr, errno, strerror(errno));
+                       goto close_group;
+               }
+       }
+
+       if (ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU)) {
+               RTE_LOG(ERR, EAL, "%s cannot set iommu, error %i (%s)\n",
+                       dev_addr, errno, strerror(errno));
+               goto close_group;
+       }
+
+       vfio_dev_fd = ioctl(vfio_group_fd, VFIO_GROUP_GET_DEVICE_FD, dev_addr);
+       if (vfio_dev_fd < 0) {
+               /* if we cannot get a device fd, this implies a problem with
+                * the VFIO group or the container not having IOMMU configured.
+                */
+               RTE_LOG(ERR, EAL, "Getting a vfio_dev_fd for %s failed errno %d\n",
+                       dev_addr, errno);
+               goto close_group;
+       }
+
+       /* Reads run in order and stop at the first one that fails. */
+       if (mdev_pci_cfg_read(vfio_dev_fd, &pci_id->vendor_id,
+                             sizeof(uint16_t), PCI_VENDOR_ID,
+                             "VendorID") ||
+           mdev_pci_cfg_read(vfio_dev_fd, &pci_id->device_id,
+                             sizeof(uint16_t), PCI_DEVICE_ID,
+                             "DeviceID") ||
+           mdev_pci_cfg_read(vfio_dev_fd, &pci_id->subsystem_vendor_id,
+                             sizeof(uint16_t), PCI_SUBSYSTEM_VENDOR_ID,
+                             "SubVendorID") ||
+           mdev_pci_cfg_read(vfio_dev_fd, &pci_id->subsystem_device_id,
+                             sizeof(uint16_t), PCI_SUBSYSTEM_ID,
+                             "SubDeviceID") ||
+           mdev_pci_cfg_read(vfio_dev_fd, &class_rev,
+                             sizeof(uint32_t), PCI_CLASS_REVISION,
+                             "ClassID"))
+               goto close_device;
+
+       /* Drop the low byte of the class/revision register. */
+       pci_id->class_id = class_rev >> 8;
+       ret = 0;
+
+close_device:
+       if (close(vfio_dev_fd) < 0) {
+               RTE_LOG(INFO, EAL, "Error when closing VFIO device for %s\n",
+                       dev_addr);
+               ret = -1;
+       }
+
+close_group:
+       if (close(vfio_group_fd) < 0) {
+               RTE_LOG(INFO, EAL, "Error when closing VFIO group for %s\n",
+                       dev_addr);
+               ret = -1;
+       }
+
+close_container:
+       if (close(container) < 0) {
+               RTE_LOG(INFO, EAL, "Error when closing VFIO container\n");
+               ret = -1;
+       }
+
+out:
+       return ret;
+}
+
+/*
+ * Probe callback of the mdev vfio-pci driver.
+ *
+ * Builds a struct rte_pci_device for the mdev device (named after and
+ * addressed by its UUID), fills in its PCI IDs via get_pci_id(), inserts
+ * it into the PCI bus device list and asks the PCI bus to plug it.
+ * On success the new PCI device is stored in mdev_dev->private.
+ *
+ * Returns:
+ *   -ENOENT   if the "pci" bus cannot be found
+ *   -ENOTSUP  if the bus has no plug callback
+ *   -ENOMEM   if the device object cannot be allocated
+ *   -1        if the PCI IDs cannot be read
+ *   0         if an identical device is already on the list (the new
+ *             object is discarded and mdev_dev->private is left untouched)
+ *   otherwise the return value of the bus plug callback.
+ */
+static int vfio_pci_probe(struct rte_mdev_driver *mdev_drv __rte_unused,
+                         struct rte_mdev_device *mdev_dev)
+{
+       char name[RTE_UUID_STRLEN];
+       struct rte_pci_device *dev;
+       struct rte_pci_device *dev2;
+       struct rte_bus *bus;
+       int ret;
+
+       bus = rte_bus_find_by_name("pci");
+       if (bus == NULL) {
+               RTE_LOG(ERR, EAL, "Cannot find bus pci\n");
+               return -ENOENT;
+       }
+
+       if (bus->plug == NULL) {
+               RTE_LOG(ERR, EAL, "Function plug not supported by bus (%s)\n",
+                       bus->name);
+               return -ENOTSUP;
+       }
+
+       /* Zero-initialised so that every field not set below starts at 0. */
+       dev = calloc(1, sizeof(*dev));
+       if (dev == NULL)
+               return -ENOMEM;
+
+       dev->device.bus = &rte_pci_bus.bus;
+       rte_uuid_unparse(mdev_dev->addr, name, sizeof(name));
+
+       if (get_pci_id(rte_mdev_get_sysfs_path(), name, &dev->id)) {
+               free(dev);
+               return -1;
+       }
+
+       snprintf(dev->name, sizeof(dev->name), "%s", name);
+       dev->device.name = dev->name;
+       dev->kdrv = RTE_KDRV_VFIO;
+       dev->use_uuid = 1;
+       rte_uuid_copy(dev->uuid, mdev_dev->addr);
+
+       /* TODO: dev->device.devargs, etc */
+
+       /*
+        * XXX: TODO - the PCI address field is filled with 0xff bytes as
+        * a placeholder; the device is identified by its UUID instead.
+        */
+       memset(&dev->addr, -1, sizeof(dev->addr));
+
+       /*
+        * Device is valid, add it to the list (sorted). Devices are ordered
+        * by PCI address first and by name second. If the list is empty or
+        * every entry sorts before the new device, the loop falls through
+        * to the append below.
+        */
+       TAILQ_FOREACH(dev2, &rte_pci_bus.device_list, next) {
+               /* XXX */
+               ret = rte_pci_addr_cmp(&dev->addr, &dev2->addr);
+               if (ret == 0)
+                       ret = strncmp(dev->name, dev2->name,
+                                     sizeof(dev->name));
+               if (ret > 0)
+                       continue;
+               if (ret < 0) {
+                       rte_pci_insert_device(dev2, dev);
+                       goto plug;
+               }
+               /* already registered */
+               free(dev);
+               return 0;
+       }
+
+       rte_pci_add_device(dev);
+
+plug:
+       ret = bus->plug(&dev->device);
+       if (ret != 0) {
+               /* Undo the list insertion before releasing the object. */
+               rte_pci_remove_device(dev);
+               free(dev);
+       } else {
+               mdev_dev->private = dev;
+       }
+       return ret;
+}
+
+/*
+ * Remove callback of the mdev vfio-pci driver.
+ *
+ * Unplugs, through the PCI bus, the PCI device that vfio_pci_probe()
+ * stored in mdev_dev->private, and clears that pointer once the unplug
+ * succeeded.
+ *
+ * Returns 0 when there is nothing to remove or the unplug succeeded,
+ * -ENOENT if the "pci" bus cannot be found, -ENOTSUP if the bus has no
+ * unplug callback, otherwise the return value of the unplug callback.
+ */
+static int vfio_pci_remove(struct rte_mdev_device *mdev_dev)
+{
+       struct rte_pci_device *pci_dev = mdev_dev->private;
+       struct rte_bus *pci_bus;
+       int rc;
+
+       /* No PCI device is attached to this mdev device. */
+       if (pci_dev == NULL)
+               return 0;
+
+       pci_bus = rte_bus_find_by_name("pci");
+       if (pci_bus == NULL) {
+               RTE_LOG(ERR, EAL, "Cannot find bus pci\n");
+               return -ENOENT;
+       }
+
+       if (pci_bus->unplug == NULL) {
+               RTE_LOG(ERR, EAL, "Function unplug not supported by bus (%s)\n",
+                       pci_bus->name);
+               return -ENOTSUP;
+       }
+
+       rc = pci_bus->unplug(&pci_dev->device);
+       if (rc != 0)
+               return rc;
+
+       /* Only forget the device once the bus has let go of it. */
+       mdev_dev->private = NULL;
+       return 0;
+}
+
+/*
+ * Mdev driver descriptor: claims mdev devices whose device API is
+ * RTE_MDEV_DEV_API_VFIO_PCI and bridges them to the PCI bus through the
+ * probe/remove callbacks above.
+ */
+static struct rte_mdev_driver vfio_pci_drv = {
+       .dev_api = RTE_MDEV_DEV_API_VFIO_PCI,
+       .probe = vfio_pci_probe,
+       .remove = vfio_pci_remove
+};
+
+/* Register the descriptor with the mdev bus under the name mdev_vfio_pci. */
+RTE_MDEV_REGISTER_DRIVER(mdev_vfio_pci, vfio_pci_drv);
diff --git a/drivers/bus/pci/meson.build b/drivers/bus/pci/meson.build
index a3140ff97..c3e884657 100644
--- a/drivers/bus/pci/meson.build
+++ b/drivers/bus/pci/meson.build
@@ -11,8 +11,10 @@ sources = files('pci_common.c',
 if host_machine.system() == 'linux'
        sources += files('linux/pci.c',
                        'linux/pci_uio.c',
-                       'linux/pci_vfio.c')
+                       'linux/pci_vfio.c',
+                       'linux/pci_vfio_mdev.c')
        includes += include_directories('linux')
+       deps += ['bus_mdev']
 else
        sources += files('bsd/pci.c')
        includes += include_directories('bsd')
diff --git a/drivers/bus/pci/pci_common.c b/drivers/bus/pci/pci_common.c
index 704b9d71a..6b47333e6 100644
--- a/drivers/bus/pci/pci_common.c
+++ b/drivers/bus/pci/pci_common.c
@@ -124,21 +124,17 @@ rte_pci_probe_one_driver(struct rte_pci_driver *dr,
 {
        int ret;
        bool already_probed;
-       struct rte_pci_addr *loc;
 
        if ((dr == NULL) || (dev == NULL))
                return -EINVAL;
 
-       loc = &dev->addr;
-
        /* The device is not blacklisted; Check if driver supports it */
        if (!rte_pci_match(dr, dev))
                /* Match of device and driver failed */
                return 1;
 
-       RTE_LOG(INFO, EAL, "PCI device "PCI_PRI_FMT" on NUMA socket %i\n",
-                       loc->domain, loc->bus, loc->devid, loc->function,
-                       dev->device.numa_node);
+       RTE_LOG(INFO, EAL, "PCI device %s on NUMA socket %i\n",
+               dev->name, dev->device.numa_node);
 
        /* no initialization when blacklisted, return without error */
        if (dev->device.devargs != NULL &&
@@ -208,7 +204,6 @@ rte_pci_probe_one_driver(struct rte_pci_driver *dr,
 static int
 rte_pci_detach_dev(struct rte_pci_device *dev)
 {
-       struct rte_pci_addr *loc;
        struct rte_pci_driver *dr;
        int ret = 0;
 
@@ -216,11 +211,9 @@ rte_pci_detach_dev(struct rte_pci_device *dev)
                return -EINVAL;
 
        dr = dev->driver;
-       loc = &dev->addr;
 
-       RTE_LOG(DEBUG, EAL, "PCI device "PCI_PRI_FMT" on NUMA socket %i\n",
-                       loc->domain, loc->bus, loc->devid,
-                       loc->function, dev->device.numa_node);
+       RTE_LOG(DEBUG, EAL, "PCI device %s on NUMA socket %i\n",
+               dev->name, dev->device.numa_node);
 
        RTE_LOG(DEBUG, EAL, "  remove driver: %x:%x %s\n", dev->id.vendor_id,
                        dev->id.device_id, dr->driver.name);
@@ -387,7 +380,7 @@ rte_pci_insert_device(struct rte_pci_device *exist_pci_dev,
 }
 
 /* Remove a device from PCI bus */
-static void
+void
 rte_pci_remove_device(struct rte_pci_device *pci_dev)
 {
        TAILQ_REMOVE(&rte_pci_bus.device_list, pci_dev, next);
diff --git a/drivers/bus/pci/private.h b/drivers/bus/pci/private.h
index 13c3324bb..d5815ee44 100644
--- a/drivers/bus/pci/private.h
+++ b/drivers/bus/pci/private.h
@@ -67,6 +67,15 @@ void rte_pci_add_device(struct rte_pci_device *pci_dev);
 void rte_pci_insert_device(struct rte_pci_device *exist_pci_dev,
                struct rte_pci_device *new_pci_dev);
 
+/**
+ * Remove a PCI device from the PCI Bus.
+ *
+ * @param pci_dev
+ *     PCI device to remove
+ * @return void
+ */
+void rte_pci_remove_device(struct rte_pci_device *pci_dev);
+
 /**
  * Update a pci device object by asking the kernel for the latest information.
  *
diff --git a/drivers/bus/pci/rte_bus_pci.h b/drivers/bus/pci/rte_bus_pci.h
index 06e004cd3..465a44935 100644
--- a/drivers/bus/pci/rte_bus_pci.h
+++ b/drivers/bus/pci/rte_bus_pci.h
@@ -51,6 +51,13 @@ TAILQ_HEAD(rte_pci_driver_list, rte_pci_driver);
 
 struct rte_devargs;
 
+/* It's RTE_UUID_STRLEN, which is bigger than PCI_PRI_STR_SIZE. */
+#define RTE_PCI_NAME_LEN               (36 + 1)
+
+// XXX: we can't include rte_uuid.h directly due to the conflicts
+//      introduced by stdbool.h
+typedef unsigned char rte_uuid_t[16];
+
 /**
  * A structure describing a PCI device.
  */
@@ -58,6 +65,8 @@ struct rte_pci_device {
        TAILQ_ENTRY(rte_pci_device) next;   /**< Next probed PCI device. */
        struct rte_device device;           /**< Inherit core device */
        struct rte_pci_addr addr;           /**< PCI location. */
+       rte_uuid_t uuid;                    /**< Mdev location. */
+       uint8_t use_uuid;                   /**< True if uuid field valid. */
        struct rte_pci_id id;               /**< PCI ID. */
        struct rte_mem_resource mem_resource[PCI_MAX_RESOURCE];
                                            /**< PCI Memory Resource */
@@ -65,7 +74,7 @@ struct rte_pci_device {
        struct rte_pci_driver *driver;      /**< PCI driver used in probing */
        uint16_t max_vfs;                   /**< sriov enable if not zero */
        enum rte_kernel_driver kdrv;        /**< Kernel driver passthrough */
-       char name[PCI_PRI_STR_SIZE+1];      /**< PCI location (ASCII) */
+       char name[RTE_PCI_NAME_LEN];        /**< PCI/Mdev location (ASCII) */
        struct rte_intr_handle vfio_req_intr_handle;
                                /**< Handler of VFIO request interrupt */
 };
-- 
2.17.1

Reply via email to