Current device hotplug is just when using UIO. This patch adds
same functionality with VFIO.

It has been validated through tests using IOMMU and also with VFIO
and no-iommu mode.

Signed-off-by: Alejandro Lucero <alejandro.luc...@netronome.com>
---
 lib/librte_eal/common/eal_common_pci.c         |   2 +-
 lib/librte_eal/linuxapp/eal/eal_pci.c          |   5 +-
 lib/librte_eal/linuxapp/eal/eal_pci_init.h     |   1 +
 lib/librte_eal/linuxapp/eal/eal_pci_vfio.c     |  82 ++++++++++-
 lib/librte_eal/linuxapp/eal/eal_vfio.c         | 193 +++++++++++++++++++------
 lib/librte_eal/linuxapp/eal/eal_vfio.h         |  11 +-
 lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c |  23 ++-
 lib/librte_ether/rte_ethdev.c                  |   2 +-
 8 files changed, 263 insertions(+), 56 deletions(-)

diff --git a/lib/librte_eal/common/eal_common_pci.c 
b/lib/librte_eal/common/eal_common_pci.c
index 72547bd..4278126 100644
--- a/lib/librte_eal/common/eal_common_pci.c
+++ b/lib/librte_eal/common/eal_common_pci.c
@@ -199,7 +199,7 @@ static struct rte_devargs *pci_devargs_lookup(struct 
rte_pci_device *dev)
                                dev->id.device_id, dr->driver.name);
 
                if (dr->drv_flags & RTE_PCI_DRV_NEED_MAPPING) {
-                       /* map resources for devices that use igb_uio */
+                       /* map resources for devices that use igb_uio or VFIO */
                        ret = rte_eal_pci_map_device(dev);
                        if (ret != 0)
                                return ret;
diff --git a/lib/librte_eal/linuxapp/eal/eal_pci.c 
b/lib/librte_eal/linuxapp/eal/eal_pci.c
index 61d55b9..7d6f50a 100644
--- a/lib/librte_eal/linuxapp/eal/eal_pci.c
+++ b/lib/librte_eal/linuxapp/eal/eal_pci.c
@@ -121,7 +121,10 @@
        /* try unmapping the NIC resources using VFIO if it exists */
        switch (dev->kdrv) {
        case RTE_KDRV_VFIO:
-               RTE_LOG(ERR, EAL, "Hotplug doesn't support vfio yet\n");
+#ifdef VFIO_PRESENT
+               if (pci_vfio_is_enabled())
+                       pci_vfio_unmap_resource(dev);
+#endif
                break;
        case RTE_KDRV_IGB_UIO:
        case RTE_KDRV_UIO_GENERIC:
diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_init.h 
b/lib/librte_eal/linuxapp/eal/eal_pci_init.h
index 6a960d1..bb1fabe 100644
--- a/lib/librte_eal/linuxapp/eal/eal_pci_init.h
+++ b/lib/librte_eal/linuxapp/eal/eal_pci_init.h
@@ -90,6 +90,7 @@ void pci_vfio_ioport_write(struct rte_pci_ioport *p,
 
 /* map VFIO resource prototype */
 int pci_vfio_map_resource(struct rte_pci_device *dev);
+int pci_vfio_unmap_resource(struct rte_pci_device *dev);
 
 #endif
 
diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c 
b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
index 5f478c5..f02bf53 100644
--- a/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
+++ b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
@@ -38,6 +38,7 @@
 #include <sys/socket.h>
 #include <sys/ioctl.h>
 #include <sys/mman.h>
+#include <stdbool.h>
 
 #include <rte_log.h>
 #include <rte_pci.h>
@@ -172,7 +173,7 @@
 
 /* set PCI bus mastering */
 static int
-pci_vfio_set_bus_master(int dev_fd)
+pci_vfio_set_bus_master(int dev_fd, bool op)
 {
        uint16_t reg;
        int ret;
@@ -185,8 +186,11 @@
                return -1;
        }
 
-       /* set the master bit */
-       reg |= PCI_COMMAND_MASTER;
+       if (op)
+               /* set the master bit */
+               reg |= PCI_COMMAND_MASTER;
+       else
+               reg &= ~(PCI_COMMAND_MASTER);
 
        ret = pwrite64(dev_fd, &reg, sizeof(reg),
                        VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
@@ -517,7 +521,7 @@
                }
 
                /* set bus mastering for the device */
-               if (pci_vfio_set_bus_master(vfio_dev_fd)) {
+               if (pci_vfio_set_bus_master(vfio_dev_fd, 1)) {
                        RTE_LOG(ERR, EAL, "  %s cannot set up bus 
mastering!\n", pci_addr);
                        close(vfio_dev_fd);
                        rte_free(vfio_res);
@@ -535,6 +539,76 @@
 }
 
 int
+pci_vfio_unmap_resource(struct rte_pci_device *dev)
+{
+       char pci_addr[PATH_MAX] = {0};
+       struct rte_pci_addr *loc = &dev->addr;
+       int i, ret;
+       struct mapped_pci_resource *vfio_res = NULL;
+       struct mapped_pci_res_list *vfio_res_list;
+
+       struct pci_map *maps;
+
+       /* store PCI address string */
+       snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT,
+                       loc->domain, loc->bus, loc->devid, loc->function);
+
+
+       if (close(dev->intr_handle.fd) < 0) {
+               RTE_LOG(INFO, EAL, "Error when Closing eventfd file descriptor 
for %s\n",
+                       pci_addr);
+               return -1;
+       }
+
+       if (pci_vfio_set_bus_master(dev->intr_handle.vfio_dev_fd, 0)) {
+               RTE_LOG(ERR, EAL, "  %s cannot unset bus mastering for PCI 
device!\n",
+                               pci_addr);
+               return -1;
+       }
+
+       ret = vfio_release_device(pci_get_sysfs_path(), pci_addr,
+                                 dev->intr_handle.vfio_dev_fd);
+       if (ret < 0) {
+               RTE_LOG(ERR, EAL,
+                       "%s(): cannot release device\n", __func__);
+               return ret;
+       }
+
+       vfio_res_list = RTE_TAILQ_CAST(rte_vfio_tailq.head, 
mapped_pci_res_list);
+       /* Get vfio_res */
+       TAILQ_FOREACH(vfio_res, vfio_res_list, next) {
+               if (memcmp(&vfio_res->pci_addr, &dev->addr, sizeof(dev->addr)))
+                       continue;
+               break;
+       }
+       /* if we haven't found our tailq entry, something's wrong */
+       if (vfio_res == NULL) {
+               RTE_LOG(ERR, EAL, "  %s cannot find TAILQ entry for PCI 
device!\n",
+                               pci_addr);
+               return -1;
+       }
+
+       /* map BARs */
+       maps = vfio_res->maps;
+
+       RTE_LOG(INFO, EAL, "Releasing pci mapped resource for %s\n",
+               pci_addr);
+       for (i = 0; i < (int) vfio_res->nb_maps; i++) {
+
+               /* TODO: what about the maps offset field? */
+               if (maps[i].addr) {
+                       RTE_LOG(INFO, EAL, "Calling pci_unmap_resource for %s 
at %p\n",
+                               pci_addr, maps[i].addr);
+                       pci_unmap_resource(maps[i].addr, maps[i].size);
+               }
+       }
+
+       TAILQ_REMOVE(vfio_res_list, vfio_res, next);
+
+       return 0;
+}
+
+int
 pci_vfio_ioport_map(struct rte_pci_device *dev, int bar,
                    struct rte_pci_ioport *p)
 {
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.c 
b/lib/librte_eal/linuxapp/eal/eal_vfio.c
index 9377a66..42e0f62 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.c
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.c
@@ -68,13 +68,32 @@
 {
        int i;
        int vfio_group_fd;
+       int group_idx = -1;
        char filename[PATH_MAX];
 
        /* check if we already have the group descriptor open */
-       for (i = 0; i < vfio_cfg.vfio_group_idx; i++)
+       for (i = 0; i < VFIO_MAX_GROUPS; i++)
                if (vfio_cfg.vfio_groups[i].group_no == iommu_group_no)
                        return vfio_cfg.vfio_groups[i].fd;
 
+       /* Lets see first if there is room for a new group */
+       if (vfio_cfg.vfio_active_groups == VFIO_MAX_GROUPS) {
+               RTE_LOG(ERR, EAL, "Maximum number of VFIO groups reached!\n");
+               return -1;
+       }
+
+       /* Now lets get an index for the new group */
+       for (i = 0; i < VFIO_MAX_GROUPS; i++)
+               if (vfio_cfg.vfio_groups[i].group_no == -1) {
+                       group_idx = i;
+                       break;
+               }
+
+       /* This should not happen */
+       if (group_idx == -1) {
+               RTE_LOG(ERR, EAL, "No VFIO group free slot found\n");
+               return -1;
+       }
        /* if primary, try to open the group */
        if (internal_config.process_type == RTE_PROC_PRIMARY) {
                /* try regular group format */
@@ -104,14 +123,9 @@
                        /* noiommu group found */
                }
 
-               /* if the fd is valid, create a new group for it */
-               if (vfio_cfg.vfio_group_idx == VFIO_MAX_GROUPS) {
-                       RTE_LOG(ERR, EAL, "Maximum number of VFIO groups 
reached!\n");
-                       close(vfio_group_fd);
-                       return -1;
-               }
-               vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx].group_no = 
iommu_group_no;
-               vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx].fd = 
vfio_group_fd;
+               vfio_cfg.vfio_groups[group_idx].group_no = iommu_group_no;
+               vfio_cfg.vfio_groups[group_idx].fd = vfio_group_fd;
+               vfio_cfg.vfio_active_groups++;
                return vfio_group_fd;
        }
        /* if we're in a secondary process, request group fd from the primary
@@ -158,14 +172,62 @@
        return -1;
 }
 
-static void
-clear_current_group(void)
+int
+clear_group(int vfio_group_fd)
 {
-       vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx].group_no = 0;
-       vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx].fd = -1;
+       int i;
+       int socket_fd, ret;
+
+       if (internal_config.process_type == RTE_PROC_PRIMARY) {
+
+               for (i = 0; i < VFIO_MAX_GROUPS; i++)
+                       if (vfio_cfg.vfio_groups[i].fd == vfio_group_fd) {
+                               vfio_cfg.vfio_groups[i].group_no = -1;
+                               vfio_cfg.vfio_groups[i].fd = -1;
+                               vfio_cfg.vfio_active_groups--;
+                               return 0;
+                       }
+               return -1;
+       }
+
+       /* This is just for SECONDARY processes */
+       socket_fd = vfio_mp_sync_connect_to_primary();
+
+       if (socket_fd < 0) {
+               RTE_LOG(ERR, EAL, "  cannot connect to primary process!\n");
+               return -1;
+       }
+
+       if (vfio_mp_sync_send_request(socket_fd, SOCKET_CLR_GROUP) < 0) {
+               RTE_LOG(ERR, EAL, "  cannot request container fd!\n");
+               close(socket_fd);
+               return -1;
+       }
+
+       if (vfio_mp_sync_send_request(socket_fd, vfio_group_fd) < 0) {
+               RTE_LOG(ERR, EAL, "  cannot send group fd!\n");
+               close(socket_fd);
+               return -1;
+       }
+
+       ret = vfio_mp_sync_receive_request(socket_fd);
+       switch (ret) {
+       case SOCKET_NO_FD:
+               RTE_LOG(ERR, EAL, "  BAD VFIO group fd!\n");
+               close(socket_fd);
+               break;
+       case SOCKET_OK:
+               close(socket_fd);
+               return 0;
+       default:
+               RTE_LOG(ERR, EAL, "  UNKNOWN reply, %d\n", ret);
+               close(socket_fd);
+       }
+       return -1;
 }
 
-int vfio_setup_device(const char *sysfs_base, const char *dev_addr,
+int
+vfio_setup_device(const char *sysfs_base, const char *dev_addr,
                int *vfio_dev_fd, struct vfio_device_info *device_info)
 {
        struct vfio_group_status group_status = {
@@ -192,18 +254,10 @@ int vfio_setup_device(const char *sysfs_base, const char 
*dev_addr,
        if (vfio_group_fd < 0)
                return -1;
 
-       /* store group fd */
-       vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx].group_no = iommu_group_no;
-       vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx].fd = vfio_group_fd;
-
        /* if group_fd == 0, that means the device isn't managed by VFIO */
        if (vfio_group_fd == 0) {
-               RTE_LOG(WARNING, EAL, "  %s not managed by VFIO driver, 
skipping\n",
+               RTE_LOG(WARNING, EAL, " %s not managed by VFIO driver, 
skipping\n",
                                dev_addr);
-               /* we store 0 as group fd to distinguish between existing but
-                * unbound VFIO groups, and groups that don't exist at all.
-                */
-               vfio_cfg.vfio_group_idx++;
                return 1;
        }
 
@@ -218,12 +272,12 @@ int vfio_setup_device(const char *sysfs_base, const char 
*dev_addr,
                RTE_LOG(ERR, EAL, "  %s cannot get group status, "
                                "error %i (%s)\n", dev_addr, errno, 
strerror(errno));
                close(vfio_group_fd);
-               clear_current_group();
+               clear_group(vfio_group_fd);
                return -1;
        } else if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
                RTE_LOG(ERR, EAL, "  %s VFIO group is not viable!\n", dev_addr);
                close(vfio_group_fd);
-               clear_current_group();
+               clear_group(vfio_group_fd);
                return -1;
        }
 
@@ -237,58 +291,62 @@ int vfio_setup_device(const char *sysfs_base, const char 
*dev_addr,
                        RTE_LOG(ERR, EAL, "  %s cannot add VFIO group to 
container, "
                                        "error %i (%s)\n", dev_addr, errno, 
strerror(errno));
                        close(vfio_group_fd);
-                       clear_current_group();
+                       clear_group(vfio_group_fd);
                        return -1;
                }
-               /*
-                * at this point we know that this group has been successfully
-                * initialized, so we increment vfio_group_idx to indicate that 
we can
-                * add new groups.
-                */
-               vfio_cfg.vfio_group_idx++;
        }
 
        /*
         * pick an IOMMU type and set up DMA mappings for container
         *
-        * needs to be done only once, only when at least one group is assigned 
to
-        * a container and only in primary process
+        * needs to be done only once, only when first group is assigned to
+        * a container and only in primary process. Note this can happen several
+        * times with the hotplug functionality.
         */
        if (internal_config.process_type == RTE_PROC_PRIMARY &&
-                       vfio_cfg.vfio_container_has_dma == 0) {
+                       vfio_cfg.vfio_active_groups == 1) {
                /* select an IOMMU type which we will be using */
                const struct vfio_iommu_type *t =
                                vfio_set_iommu_type(vfio_cfg.vfio_container_fd);
                if (!t) {
                        RTE_LOG(ERR, EAL, "  %s failed to select IOMMU type\n", 
dev_addr);
+                       close(vfio_group_fd);
+                       clear_group(vfio_group_fd);
                        return -1;
                }
                ret = t->dma_map_func(vfio_cfg.vfio_container_fd);
                if (ret) {
                        RTE_LOG(ERR, EAL, "  %s DMA remapping failed, "
                                        "error %i (%s)\n", dev_addr, errno, 
strerror(errno));
+                       close(vfio_group_fd);
+                       clear_group(vfio_group_fd);
                        return -1;
                }
-               vfio_cfg.vfio_container_has_dma = 1;
        }
 
        /* get a file descriptor for the device */
        *vfio_dev_fd = ioctl(vfio_group_fd, VFIO_GROUP_GET_DEVICE_FD, dev_addr);
        if (*vfio_dev_fd < 0) {
-               /* if we cannot get a device fd, this simply means that this
-               * particular port is not bound to VFIO
-               */
-               RTE_LOG(WARNING, EAL, "  %s not managed by VFIO driver, 
skipping\n",
+               /* if we cannot get a device fd, this implies a problem with
+                * the VFIO group or the container not having IOMMU configured.
+                */
+
+               RTE_LOG(WARNING, EAL, "Getting a vfio_dev_fd for %s failed\n",
                                dev_addr);
-               return 1;
+               close(vfio_group_fd);
+               clear_group(vfio_group_fd);
+               return -1;
        }
 
        /* test and setup the device */
        ret = ioctl(*vfio_dev_fd, VFIO_DEVICE_GET_INFO, device_info);
        if (ret) {
                RTE_LOG(ERR, EAL, "  %s cannot get device info, "
-                               "error %i (%s)\n", dev_addr, errno, 
strerror(errno));
+                               "error %i (%s)\n", dev_addr, errno,
+                               strerror(errno));
                close(*vfio_dev_fd);
+               close(vfio_group_fd);
+               clear_group(vfio_group_fd);
                return -1;
        }
 
@@ -296,6 +354,54 @@ int vfio_setup_device(const char *sysfs_base, const char 
*dev_addr,
 }
 
 int
+vfio_release_device(const char *sysfs_base, const char *dev_addr,
+                   int vfio_dev_fd)
+{
+       struct vfio_group_status group_status = {
+                       .argsz = sizeof(group_status)
+       };
+       int vfio_group_fd;
+       int iommu_group_no;
+       int ret;
+
+       /* get group number */
+       ret = vfio_get_group_no(sysfs_base, dev_addr, &iommu_group_no);
+       if (ret <= 0) {
+               RTE_LOG(WARNING, EAL, "  %s not managed by VFIO driver\n",
+                       dev_addr);
+               /* This is an error at this point. */
+               return -1;
+       }
+
+       /* get the actual group fd */
+       vfio_group_fd = vfio_get_group_fd(iommu_group_no);
+       if (vfio_group_fd <= 0) {
+               RTE_LOG(INFO, EAL, "vfio_get_group_fd failed for %s\n",
+                                  dev_addr);
+               return -1;
+       }
+
+       /* At this point we got an active group. Closing it will make the
+        * container detachment. If this is the last active group, VFIO kernel
+        * code will unset the container and the IOMMU mappings.
+        */
+
+       if (close(vfio_group_fd) < 0)
+               RTE_LOG(INFO, EAL, "Error when closing vfio_group_fd for %s\n",
+                                  dev_addr);
+
+       if (close(vfio_dev_fd) < 0)
+               RTE_LOG(INFO, EAL, "Error when closing vfio_dev_fd for %s\n",
+                                  dev_addr);
+
+       if (clear_group(vfio_group_fd) < 0)
+               RTE_LOG(INFO, EAL, "Error when clearing group for %s\n",
+                                  dev_addr);
+
+       return 0;
+}
+
+int
 vfio_enable(const char *modname)
 {
        /* initialize group list */
@@ -534,7 +640,8 @@ int vfio_setup_device(const char *sysfs_base, const char 
*dev_addr,
 
                if (ret) {
                        RTE_LOG(ERR, EAL, "  cannot set up DMA remapping, "
-                                       "error %i (%s)\n", errno, 
strerror(errno));
+                                         "error %i (%s)\n", errno,
+                                         strerror(errno));
                        return -1;
                }
        }
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.h 
b/lib/librte_eal/linuxapp/eal/eal_vfio.h
index ac31a4f..3482c50 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.h
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.h
@@ -108,8 +108,7 @@ struct vfio_group {
 struct vfio_config {
        int vfio_enabled;
        int vfio_container_fd;
-       int vfio_container_has_dma;
-       int vfio_group_idx;
+       int vfio_active_groups;
        struct vfio_group vfio_groups[VFIO_MAX_GROUPS];
 };
 
@@ -155,6 +154,10 @@ struct vfio_iommu_type {
 int
 vfio_get_group_fd(int iommu_group_no);
 
+/* remove group fd from internal VFIO group fd array */
+int
+clear_group(int vfio_group_fd);
+
 /**
  * Setup vfio_cfg for the device identified by its address. It discovers
  * the configured I/O MMU groups or sets a new one for the device. If a new
@@ -165,6 +168,9 @@ struct vfio_iommu_type {
 int vfio_setup_device(const char *sysfs_base, const char *dev_addr,
                int *vfio_dev_fd, struct vfio_device_info *device_info);
 
+int vfio_release_device(const char *sysfs_base, const char *dev_addr, int fd);
+
+int vfio_enable(const char *modname);
 int vfio_enable(const char *modname);
 int vfio_is_enabled(const char *modname);
 
@@ -175,6 +181,7 @@ int vfio_setup_device(const char *sysfs_base, const char 
*dev_addr,
 
 #define SOCKET_REQ_CONTAINER 0x100
 #define SOCKET_REQ_GROUP 0x200
+#define SOCKET_CLR_GROUP 0x300
 #define SOCKET_OK 0x0
 #define SOCKET_NO_FD 0x1
 #define SOCKET_ERR 0xFF
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c 
b/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c
index fb4a2f8..7e8095c 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c
@@ -267,7 +267,7 @@
 static __attribute__((noreturn)) void *
 vfio_mp_sync_thread(void __rte_unused * arg)
 {
-       int ret, fd, vfio_group_no;
+       int ret, fd, vfio_data;
 
        /* wait for requests on the socket */
        for (;;) {
@@ -305,13 +305,13 @@ static __attribute__((noreturn)) void *
                        break;
                case SOCKET_REQ_GROUP:
                        /* wait for group number */
-                       vfio_group_no = vfio_mp_sync_receive_request(conn_sock);
-                       if (vfio_group_no < 0) {
+                       vfio_data = vfio_mp_sync_receive_request(conn_sock);
+                       if (vfio_data < 0) {
                                close(conn_sock);
                                continue;
                        }
 
-                       fd = vfio_get_group_fd(vfio_group_no);
+                       fd = vfio_get_group_fd(vfio_data);
 
                        if (fd < 0)
                                vfio_mp_sync_send_request(conn_sock, 
SOCKET_ERR);
@@ -324,6 +324,21 @@ static __attribute__((noreturn)) void *
                                vfio_mp_sync_send_fd(conn_sock, fd);
                        }
                        break;
+               case SOCKET_CLR_GROUP:
+                       /* wait for group fd */
+                       vfio_data = vfio_mp_sync_receive_request(conn_sock);
+                       if (vfio_data < 0) {
+                               close(conn_sock);
+                               continue;
+                       }
+
+                       ret = clear_group(vfio_data);
+
+                       if (ret < 0)
+                               vfio_mp_sync_send_request(conn_sock, 
SOCKET_NO_FD);
+                       else
+                               vfio_mp_sync_send_request(conn_sock, SOCKET_OK);
+                       break;
                default:
                        vfio_mp_sync_send_request(conn_sock, SOCKET_ERR);
                        break;
diff --git a/lib/librte_ether/rte_ethdev.c b/lib/librte_ether/rte_ethdev.c
index eb0a94a..d844f42 100644
--- a/lib/librte_ether/rte_ethdev.c
+++ b/lib/librte_ether/rte_ethdev.c
@@ -455,8 +455,8 @@ struct rte_eth_dev *
        case RTE_KDRV_UIO_GENERIC:
        case RTE_KDRV_NIC_UIO:
        case RTE_KDRV_NONE:
-               break;
        case RTE_KDRV_VFIO:
+               break;
        default:
                return -ENOTSUP;
        }
-- 
1.9.1

Reply via email to