This patch adds APIs to support container create/destroy and device
bind/unbind with a container. It also provides APIs for IOMMU
programming on a specified container.

A driver can use the "rte_vfio_container_create" helper to create a new
container from EAL, and "rte_vfio_container_group_bind" to bind a
device's IOMMU group to the newly created container. During
rte_vfio_setup_device, the container bound with the device will be used
for IOMMU setup.
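
Below is a minimal usage sketch of the flow (the PCI address
"0000:06:00.0" and the vaddr/iova/len values are hypothetical
placeholders, with error handling abbreviated):

    int container_fd, group_num, group_fd;
    uint64_t vaddr = 0, iova = 0, len = 0; /* placeholder mapping */

    /* create a dedicated container, separate from the default one */
    container_fd = rte_vfio_container_create();
    if (container_fd < 0)
        rte_exit(EXIT_FAILURE, "cannot create VFIO container\n");

    /* look up the device's IOMMU group and bind it to the container */
    if (rte_vfio_get_group_num("/sys/bus/pci/devices", "0000:06:00.0",
            &group_num) <= 0)
        rte_exit(EXIT_FAILURE, "cannot get IOMMU group number\n");
    group_fd = rte_vfio_container_group_bind(container_fd, group_num);
    if (group_fd < 0)
        rte_exit(EXIT_FAILURE, "cannot bind group to container\n");

    /* DMA mappings for a custom container are managed explicitly,
     * since newly allocated DPDK memory is not mapped automatically */
    if (rte_vfio_container_dma_map(container_fd, vaddr, iova, len) < 0)
        rte_exit(EXIT_FAILURE, "cannot set up IOMMU mapping\n");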

Signed-off-by: Junjie Chen <junjie.j.c...@intel.com>
Signed-off-by: Xiao Wang <xiao.w.w...@intel.com>
Reviewed-by: Maxime Coquelin <maxime.coque...@redhat.com>
Reviewed-by: Ferruh Yigit <ferruh.yi...@intel.com>
---
 lib/librte_eal/bsdapp/eal/eal.c          |  52 ++++++
 lib/librte_eal/common/include/rte_vfio.h | 128 ++++++++++++++-
 lib/librte_eal/linuxapp/eal/eal_vfio.c   | 269 ++++++++++++++++++++++++++++---
 lib/librte_eal/rte_eal_version.map       |   6 +
 4 files changed, 436 insertions(+), 19 deletions(-)

diff --git a/lib/librte_eal/bsdapp/eal/eal.c b/lib/librte_eal/bsdapp/eal/eal.c
index bfbec0d7f..b5c0386e4 100644
--- a/lib/librte_eal/bsdapp/eal/eal.c
+++ b/lib/librte_eal/bsdapp/eal/eal.c
@@ -769,6 +769,14 @@ int rte_vfio_noiommu_is_enabled(void);
 int rte_vfio_clear_group(int vfio_group_fd);
 int rte_vfio_dma_map(uint64_t vaddr, uint64_t iova, uint64_t len);
 int rte_vfio_dma_unmap(uint64_t vaddr, uint64_t iova, uint64_t len);
+int rte_vfio_container_create(void);
+int rte_vfio_container_destroy(int container_fd);
+int rte_vfio_container_group_bind(int container_fd, int iommu_group_num);
+int rte_vfio_container_group_unbind(int container_fd, int iommu_group_num);
+int rte_vfio_container_dma_map(int container_fd, uint64_t vaddr,
+               uint64_t iova, uint64_t len);
+int rte_vfio_container_dma_unmap(int container_fd, uint64_t vaddr,
+               uint64_t iova, uint64_t len);
 
 int rte_vfio_setup_device(__rte_unused const char *sysfs_base,
                      __rte_unused const char *dev_addr,
@@ -838,3 +846,47 @@ rte_vfio_get_group_fd(__rte_unused int iommu_group_num)
 {
        return -1;
 }
+
+int __rte_experimental
+rte_vfio_container_create(void)
+{
+       return -1;
+}
+
+int __rte_experimental
+rte_vfio_container_destroy(__rte_unused int container_fd)
+{
+       return -1;
+}
+
+int __rte_experimental
+rte_vfio_container_group_bind(__rte_unused int container_fd,
+               __rte_unused int iommu_group_num)
+{
+       return -1;
+}
+
+int __rte_experimental
+rte_vfio_container_group_unbind(__rte_unused int container_fd,
+               __rte_unused int iommu_group_num)
+{
+       return -1;
+}
+
+int __rte_experimental
+rte_vfio_container_dma_map(__rte_unused int container_fd,
+                       __rte_unused uint64_t vaddr,
+                       __rte_unused uint64_t iova,
+                       __rte_unused uint64_t len)
+{
+       return -1;
+}
+
+int __rte_experimental
+rte_vfio_container_dma_unmap(__rte_unused int container_fd,
+                       __rte_unused uint64_t vaddr,
+                       __rte_unused uint64_t iova,
+                       __rte_unused uint64_t len)
+{
+       return -1;
+}
diff --git a/lib/librte_eal/common/include/rte_vfio.h b/lib/librte_eal/common/include/rte_vfio.h
index c4a2e606f..c10c206a3 100644
--- a/lib/librte_eal/common/include/rte_vfio.h
+++ b/lib/librte_eal/common/include/rte_vfio.h
@@ -154,7 +154,10 @@ rte_vfio_clear_group(int vfio_group_fd);
 /**
  * Map memory region for use with VFIO.
  *
- * @note requires at least one device to be attached at the time of mapping.
+ * @note Requires at least one device to be attached at the time of
+ *       mapping. DMA maps done via this API will only apply to the
+ *       default container and will not apply to any of the containers
+ *       created via rte_vfio_container_create().
  *
  * @param vaddr
  *   Starting virtual address of memory to be mapped.
@@ -245,6 +248,129 @@ rte_vfio_get_container_fd(void);
 int __rte_experimental
 rte_vfio_get_group_fd(int iommu_group_num);
 
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
+ *
+ * Create a new container for device binding.
+ *
+ * @note Any newly allocated DPDK memory will not be mapped into these
+ *       containers by default; the user needs to manage DMA mappings
+ *       for any container created by this API.
+ *
+ * @return
+ *   the container fd if successful
+ *   <0 if failed
+ */
+int __rte_experimental
+rte_vfio_container_create(void);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
+ *
+ * Destroy the container, unbinding all VFIO groups within it.
+ *
+ * @param container_fd
+ *   the container fd to destroy
+ *
+ * @return
+ *    0 if successful
+ *   <0 if failed
+ */
+int __rte_experimental
+rte_vfio_container_destroy(int container_fd);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
+ *
+ * Bind an IOMMU group to a container.
+ *
+ * @param container_fd
+ *   the container's fd
+ *
+ * @param iommu_group_num
+ *   the IOMMU group number to bind to the container
+ *
+ * @return
+ *   group fd if successful
+ *   <0 if failed
+ */
+int __rte_experimental
+rte_vfio_container_group_bind(int container_fd, int iommu_group_num);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
+ *
+ * Unbind an IOMMU group from a container.
+ *
+ * @param container_fd
+ *   the container fd
+ *
+ * @param iommu_group_num
+ *   the IOMMU group number to remove from the container
+ *
+ * @return
+ *    0 if successful
+ *   <0 if failed
+ */
+int __rte_experimental
+rte_vfio_container_group_unbind(int container_fd, int iommu_group_num);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
+ *
+ * Perform DMA mapping for devices in a container.
+ *
+ * @param container_fd
+ *   the specified container fd
+ *
+ * @param vaddr
+ *   Starting virtual address of memory to be mapped.
+ *
+ * @param iova
+ *   Starting IOVA address of memory to be mapped.
+ *
+ * @param len
+ *   Length of memory segment being mapped.
+ *
+ * @return
+ *    0 if successful
+ *   <0 if failed
+ */
+int __rte_experimental
+rte_vfio_container_dma_map(int container_fd, uint64_t vaddr,
+               uint64_t iova, uint64_t len);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
+ *
+ * Perform DMA unmapping for devices in a container.
+ *
+ * @param container_fd
+ *   the specified container fd
+ *
+ * @param vaddr
+ *   Starting virtual address of memory to be unmapped.
+ *
+ * @param iova
+ *   Starting IOVA address of memory to be unmapped.
+ *
+ * @param len
+ *   Length of memory segment being unmapped.
+ *
+ * @return
+ *    0 if successful
+ *   <0 if failed
+ */
+int __rte_experimental
+rte_vfio_container_dma_unmap(int container_fd, uint64_t vaddr,
+               uint64_t iova, uint64_t len);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.c b/lib/librte_eal/linuxapp/eal/eal_vfio.c
index 6289f6316..64ea194f0 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.c
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.c
@@ -1532,19 +1532,15 @@ vfio_dma_mem_map(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova,
                        len, do_map);
 }
 
-int __rte_experimental
-rte_vfio_dma_map(uint64_t vaddr, uint64_t iova, uint64_t len)
+static int
+container_dma_map(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova,
+               uint64_t len)
 {
        struct user_mem_map *new_map;
        struct user_mem_maps *user_mem_maps;
        int ret = 0;
 
-       if (len == 0) {
-               rte_errno = EINVAL;
-               return -1;
-       }
-
-       user_mem_maps = &default_vfio_cfg->mem_maps;
+       user_mem_maps = &vfio_cfg->mem_maps;
        rte_spinlock_recursive_lock(&user_mem_maps->lock);
        if (user_mem_maps->n_maps == VFIO_MAX_USER_MEM_MAPS) {
                RTE_LOG(ERR, EAL, "No more space for user mem maps\n");
@@ -1553,7 +1549,7 @@ rte_vfio_dma_map(uint64_t vaddr, uint64_t iova, uint64_t len)
                goto out;
        }
        /* map the entry */
-       if (vfio_dma_mem_map(default_vfio_cfg, vaddr, iova, len, 1)) {
+       if (vfio_dma_mem_map(vfio_cfg, vaddr, iova, len, 1)) {
                /* technically, this will fail if there are currently no devices
                 * plugged in, even if a device were added later, this mapping
                 * might have succeeded. however, since we cannot verify if this
@@ -1577,19 +1573,15 @@ rte_vfio_dma_unmap(uint64_t vaddr, uint64_t iova, uint64_t len)
        return ret;
 }
 
-int __rte_experimental
-rte_vfio_dma_unmap(uint64_t vaddr, uint64_t iova, uint64_t len)
+static int
+container_dma_unmap(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova,
+               uint64_t len)
 {
        struct user_mem_map *map, *new_map = NULL;
        struct user_mem_maps *user_mem_maps;
        int ret = 0;
 
-       if (len == 0) {
-               rte_errno = EINVAL;
-               return -1;
-       }
-
-       user_mem_maps = &default_vfio_cfg->mem_maps;
+       user_mem_maps = &vfio_cfg->mem_maps;
        rte_spinlock_recursive_lock(&user_mem_maps->lock);
 
        /* find our mapping */
@@ -1614,7 +1606,7 @@ rte_vfio_dma_unmap(uint64_t vaddr, uint64_t iova, uint64_t len)
        }
 
        /* unmap the entry */
-       if (vfio_dma_mem_map(default_vfio_cfg, vaddr, iova, len, 0)) {
+       if (vfio_dma_mem_map(vfio_cfg, vaddr, iova, len, 0)) {
                /* there may not be any devices plugged in, so unmapping will
                 * fail with ENODEV/ENOTSUP rte_errno values, but that doesn't
                 * stop us from removing the mapping, as the assumption is we
@@ -1653,6 +1645,28 @@ rte_vfio_dma_unmap(uint64_t vaddr, uint64_t iova, uint64_t len)
        return ret;
 }
 
+int __rte_experimental
+rte_vfio_dma_map(uint64_t vaddr, uint64_t iova, uint64_t len)
+{
+       if (len == 0) {
+               rte_errno = EINVAL;
+               return -1;
+       }
+
+       return container_dma_map(default_vfio_cfg, vaddr, iova, len);
+}
+
+int __rte_experimental
+rte_vfio_dma_unmap(uint64_t vaddr, uint64_t iova, uint64_t len)
+{
+       if (len == 0) {
+               rte_errno = EINVAL;
+               return -1;
+       }
+
+       return container_dma_unmap(default_vfio_cfg, vaddr, iova, len);
+}
+
 int
 rte_vfio_noiommu_is_enabled(void)
 {
@@ -1685,6 +1699,181 @@ rte_vfio_noiommu_is_enabled(void)
        return c == 'Y';
 }
 
+int __rte_experimental
+rte_vfio_container_create(void)
+{
+       int i;
+
+       /* Find an empty slot to store new vfio config */
+       for (i = 1; i < VFIO_MAX_CONTAINERS; i++) {
+               if (vfio_cfgs[i].vfio_container_fd == -1)
+                       break;
+       }
+
+       if (i == VFIO_MAX_CONTAINERS) {
+               RTE_LOG(ERR, EAL, "exceed max vfio container limit\n");
+               return -1;
+       }
+
+       vfio_cfgs[i].vfio_container_fd = rte_vfio_get_container_fd();
+       if (vfio_cfgs[i].vfio_container_fd < 0) {
+               RTE_LOG(NOTICE, EAL, "fail to create a new container\n");
+               return -1;
+       }
+
+       return vfio_cfgs[i].vfio_container_fd;
+}
+
+int __rte_experimental
+rte_vfio_container_destroy(int container_fd)
+{
+       struct vfio_config *vfio_cfg;
+       int i;
+
+       vfio_cfg = get_vfio_cfg_by_container_fd(container_fd);
+       if (vfio_cfg == NULL) {
+               RTE_LOG(ERR, EAL, "Invalid container fd\n");
+               return -1;
+       }
+
+       for (i = 0; i < VFIO_MAX_GROUPS; i++)
+               if (vfio_cfg->vfio_groups[i].group_num != -1)
+                       rte_vfio_container_group_unbind(container_fd,
+                               vfio_cfg->vfio_groups[i].group_num);
+
+       close(container_fd);
+       vfio_cfg->vfio_container_fd = -1;
+       vfio_cfg->vfio_active_groups = 0;
+       vfio_cfg->vfio_iommu_type = NULL;
+
+       return 0;
+}
+
+int __rte_experimental
+rte_vfio_container_group_bind(int container_fd, int iommu_group_num)
+{
+       struct vfio_config *vfio_cfg;
+       struct vfio_group *cur_grp;
+       int vfio_group_fd;
+       int i;
+
+       vfio_cfg = get_vfio_cfg_by_container_fd(container_fd);
+       if (vfio_cfg == NULL) {
+               RTE_LOG(ERR, EAL, "Invalid container fd\n");
+               return -1;
+       }
+
+       /* Check room for new group */
+       if (vfio_cfg->vfio_active_groups == VFIO_MAX_GROUPS) {
+               RTE_LOG(ERR, EAL, "Maximum number of VFIO groups reached!\n");
+               return -1;
+       }
+
+       /* Get an index for the new group */
+       for (i = 0; i < VFIO_MAX_GROUPS; i++)
+               if (vfio_cfg->vfio_groups[i].group_num == -1) {
+                       cur_grp = &vfio_cfg->vfio_groups[i];
+                       break;
+               }
+
+       /* This should not happen */
+       if (i == VFIO_MAX_GROUPS) {
+               RTE_LOG(ERR, EAL, "No VFIO group free slot found\n");
+               return -1;
+       }
+
+       vfio_group_fd = vfio_open_group_fd(iommu_group_num);
+       if (vfio_group_fd < 0) {
+               RTE_LOG(ERR, EAL, "Failed to open group %d\n", iommu_group_num);
+               return -1;
+       }
+       cur_grp->group_num = iommu_group_num;
+       cur_grp->fd = vfio_group_fd;
+       cur_grp->devices = 0;
+       vfio_cfg->vfio_active_groups++;
+
+       return vfio_group_fd;
+}
+
+int __rte_experimental
+rte_vfio_container_group_unbind(int container_fd, int iommu_group_num)
+{
+       struct vfio_config *vfio_cfg;
+       struct vfio_group *cur_grp;
+       int i;
+
+       vfio_cfg = get_vfio_cfg_by_container_fd(container_fd);
+       if (vfio_cfg == NULL) {
+               RTE_LOG(ERR, EAL, "Invalid container fd\n");
+               return -1;
+       }
+
+       for (i = 0; i < VFIO_MAX_GROUPS; i++) {
+               if (vfio_cfg->vfio_groups[i].group_num == iommu_group_num) {
+                       cur_grp = &vfio_cfg->vfio_groups[i];
+                       break;
+               }
+       }
+
+       /* This should not happen */
+       if (i == VFIO_MAX_GROUPS) {
+               RTE_LOG(ERR, EAL, "Specified group number not found\n");
+               return -1;
+       }
+
+       if (cur_grp->fd >= 0 && close(cur_grp->fd) < 0) {
+               RTE_LOG(ERR, EAL, "Error when closing vfio_group_fd for"
+                       " iommu_group_num %d\n", iommu_group_num);
+               return -1;
+       }
+       cur_grp->group_num = -1;
+       cur_grp->fd = -1;
+       cur_grp->devices = 0;
+       vfio_cfg->vfio_active_groups--;
+
+       return 0;
+}
+
+int __rte_experimental
+rte_vfio_container_dma_map(int container_fd, uint64_t vaddr, uint64_t iova,
+               uint64_t len)
+{
+       struct vfio_config *vfio_cfg;
+
+       if (len == 0) {
+               rte_errno = EINVAL;
+               return -1;
+       }
+
+       vfio_cfg = get_vfio_cfg_by_container_fd(container_fd);
+       if (vfio_cfg == NULL) {
+               RTE_LOG(ERR, EAL, "Invalid container fd\n");
+               return -1;
+       }
+
+       return container_dma_map(vfio_cfg, vaddr, iova, len);
+}
+
+int __rte_experimental
+rte_vfio_container_dma_unmap(int container_fd, uint64_t vaddr, uint64_t iova,
+               uint64_t len)
+{
+       struct vfio_config *vfio_cfg;
+
+       if (len == 0) {
+               rte_errno = EINVAL;
+               return -1;
+       }
+
+       vfio_cfg = get_vfio_cfg_by_container_fd(container_fd);
+       if (vfio_cfg == NULL) {
+               RTE_LOG(ERR, EAL, "Invalid container fd\n");
+               return -1;
+       }
+
+       return container_dma_unmap(vfio_cfg, vaddr, iova, len);
+}
+
 #else
 
 int __rte_experimental
@@ -1701,4 +1890,48 @@ rte_vfio_dma_unmap(uint64_t __rte_unused vaddr, uint64_t __rte_unused iova,
        return -1;
 }
 
+int __rte_experimental
+rte_vfio_container_create(void)
+{
+       return -1;
+}
+
+int __rte_experimental
+rte_vfio_container_destroy(__rte_unused int container_fd)
+{
+       return -1;
+}
+
+int __rte_experimental
+rte_vfio_container_group_bind(__rte_unused int container_fd,
+               __rte_unused int iommu_group_num)
+{
+       return -1;
+}
+
+int __rte_experimental
+rte_vfio_container_group_unbind(__rte_unused int container_fd,
+               __rte_unused int iommu_group_num)
+{
+       return -1;
+}
+
+int __rte_experimental
+rte_vfio_container_dma_map(__rte_unused int container_fd,
+               __rte_unused uint64_t vaddr,
+               __rte_unused uint64_t iova,
+               __rte_unused uint64_t len)
+{
+       return -1;
+}
+
+int __rte_experimental
+rte_vfio_container_dma_unmap(__rte_unused int container_fd,
+               __rte_unused uint64_t vaddr,
+               __rte_unused uint64_t iova,
+               __rte_unused uint64_t len)
+{
+       return -1;
+}
+
 #endif
diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
index d02d80b8a..28f51f8d2 100644
--- a/lib/librte_eal/rte_eal_version.map
+++ b/lib/librte_eal/rte_eal_version.map
@@ -293,5 +293,11 @@ EXPERIMENTAL {
        rte_vfio_get_container_fd;
        rte_vfio_get_group_fd;
        rte_vfio_get_group_num;
+       rte_vfio_container_create;
+       rte_vfio_container_destroy;
+       rte_vfio_container_dma_map;
+       rte_vfio_container_dma_unmap;
+       rte_vfio_container_group_bind;
+       rte_vfio_container_group_unbind;
 
 } DPDK_18.02;
-- 
2.15.1
