From: Beilei Xing <beilei.x...@intel.com>

VFIO IOMMUFD is a new component added to work together with IOMMUFD.
IOMMUFD has no impact on the existing VFIO Container/Group
interface, while the latest IOMMU features (e.g. PASID/SSID) may
only be available through the VFIO IOMMUFD/CDEV interface.

This patch exposes setup/release vfio device functions with the VFIO
IOMMUFD/CDEV interface.

Signed-off-by: Beilei Xing <beilei.x...@intel.com>
Signed-off-by: Yahui Cao <yahui....@intel.com>
---
 lib/eal/include/rte_vfio.h       |  55 +++++
 lib/eal/linux/eal_vfio.h         |   3 +
 lib/eal/linux/eal_vfio_iommufd.c | 385 +++++++++++++++++++++++++++++++
 lib/eal/linux/meson.build        |   1 +
 lib/eal/version.map              |   2 +
 5 files changed, 446 insertions(+)
 create mode 100644 lib/eal/linux/eal_vfio_iommufd.c

diff --git a/lib/eal/include/rte_vfio.h b/lib/eal/include/rte_vfio.h
index 22832afd0f..7a9b26b0f7 100644
--- a/lib/eal/include/rte_vfio.h
+++ b/lib/eal/include/rte_vfio.h
@@ -17,6 +17,8 @@ extern "C" {
 #include <stdbool.h>
 #include <stdint.h>
 
+#include <rte_compat.h>
+
 /*
  * determine if VFIO is present on the system
  */
@@ -28,6 +30,9 @@ extern "C" {
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 0, 0)
 #define HAVE_VFIO_DEV_REQ_INTERFACE
 #endif /* kernel version >= 4.0.0 */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 6, 0)
+#define VFIO_IOMMUFD_PRESENT
+#endif /* kernel version >= 6.6.0 */
 #endif /* RTE_EAL_VFIO */
 
 #ifdef VFIO_PRESENT
@@ -42,6 +47,10 @@ extern "C" {
 #define VFIO_NOIOMMU_MODE      \
        "/sys/module/vfio/parameters/enable_unsafe_noiommu_mode"
 
+#ifdef VFIO_IOMMUFD_PRESENT
+#define VFIO_CDEV_CLASS_DIR "/sys/class/vfio-dev"
+#endif
+
 /* NOIOMMU is defined from kernel version 4.5 onwards */
 #ifdef VFIO_NOIOMMU_IOMMU
 #define RTE_VFIO_NOIOMMU VFIO_NOIOMMU_IOMMU
@@ -137,6 +146,33 @@ struct vfio_device_info;
 int rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr,
                int *vfio_dev_fd, struct vfio_device_info *device_info);
 
+/**
+ * Set up the device identified by its address through the VFIO
+ * IOMMUFD/CDEV interface.
+ *
+ * The device's VFIO character device is opened, bound to the default
+ * iommufd and attached to its IOAS. The first successful call in a primary
+ * process also maps the existing DPDK memory for DMA and registers a memory
+ * event callback. Finally the device information is queried with
+ * VFIO_DEVICE_GET_INFO.
+ *
+ * This function is only relevant to linux and will return
+ * an error on BSD. It also returns an error when EAL is built without
+ * VFIO IOMMUFD support.
+ *
+ * @param sysfs_base
+ *   sysfs path prefix; the cdev is looked up under
+ *   "<sysfs_base>/<dev_addr>/vfio-dev".
+ *
+ * @param dev_addr
+ *   device location.
+ *
+ * @param vfio_dev_fd
+ *   Output: VFIO device fd. Only usable when the function returns 0; on
+ *   failure any fd opened by this function has already been closed.
+ *
+ * @param device_info
+ *   Output: device information as filled in by VFIO_DEVICE_GET_INFO.
+ *   NOTE(review): the pointer is passed to the ioctl untouched, so the
+ *   caller presumably has to set device_info->argsz beforehand - confirm.
+ *
+ * @return
+ *   0 on success.
+ *   <0 on failure.
+ */
+__rte_experimental
+int rte_vfio_iommufd_setup_device(const char *sysfs_base, const char *dev_addr,
+                                 int *vfio_dev_fd, struct vfio_device_info 
*device_info);
+
 /**
  * Release a device mapped to a VFIO-managed I/O MMU group.
  *
@@ -158,6 +194,25 @@ int rte_vfio_setup_device(const char *sysfs_base, const 
char *dev_addr,
  */
 int rte_vfio_release_device(const char *sysfs_base, const char *dev_addr, int 
fd);
 
+/**
+ * Release a device that was set up through the VFIO IOMMUFD/CDEV interface.
+ *
+ * The device is detached from its iommufd page table. When the detach
+ * succeeds, the device fd is closed and the VFIO iommufd memory event
+ * callback is unregistered. When the detach fails, the fd is left open.
+ *
+ * This function is only relevant to linux and will return
+ * an error on BSD. It also returns an error when EAL is built without
+ * VFIO IOMMUFD support.
+ *
+ * @param dev_addr
+ *   device location (only used in log messages).
+ *
+ * @param fd
+ *   VFIO device fd, as returned by rte_vfio_iommufd_setup_device().
+ *
+ * @return
+ *   0 on success.
+ *   <0 on failure.
+ */
+__rte_experimental
+int rte_vfio_iommufd_release_device(const char *dev_addr, int fd);
+
 /**
  * Enable a VFIO-related kmod.
  *
diff --git a/lib/eal/linux/eal_vfio.h b/lib/eal/linux/eal_vfio.h
index 23a787ad20..c94409e828 100644
--- a/lib/eal/linux/eal_vfio.h
+++ b/lib/eal/linux/eal_vfio.h
@@ -17,6 +17,9 @@
 #else
 #pragma message("VFIO configured but not supported by this kernel, disabling.")
 #endif /* kernel version >= 3.6.0 */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 6, 0)
+#define VFIO_IOMMUFD_PRESENT
+#endif /* kernel version >= 6.6.0 */
 #endif /* RTE_EAL_VFIO */
 
 #ifdef VFIO_PRESENT
diff --git a/lib/eal/linux/eal_vfio_iommufd.c b/lib/eal/linux/eal_vfio_iommufd.c
new file mode 100644
index 0000000000..02996a588a
--- /dev/null
+++ b/lib/eal/linux/eal_vfio_iommufd.c
@@ -0,0 +1,385 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2023 Intel Corporation
+ */
+
+#include <inttypes.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <dirent.h>
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+#include <sys/sysmacros.h>
+
+#include <rte_errno.h>
+#include <rte_vfio.h>
+
+#include "eal_private.h"
+#include "eal_internal_cfg.h"
+
+#ifdef VFIO_IOMMUFD_PRESENT
+#include <linux/iommufd.h>
+#include "eal_iommufd.h"
+
+#define VFIO_IOMMUFD_MEM_EVENT_CLB_NAME "vfio_iommufd_mem_event_clb"
+
+/* iommufd/IOAS pair handed to the memseg walk callbacks as their opaque arg */
+struct ioas_info {
+       int iommufd;            /* fd passed on to iommufd_dma_mem_map() */
+       uint32_t ioas_id;       /* IOAS id passed on to iommufd_dma_mem_map() */
+};
+
+/*
+ * Bind a VFIO device fd to the default iommufd, then attach it to the
+ * default IOAS.
+ *
+ * Returns 0 when both ioctls succeed, otherwise the return value of the
+ * ioctl that failed (an error is logged for dev_addr).
+ */
+static int
+vfio_iommufd_add_device(const char *dev_addr, int vfio_dev_fd)
+{
+       struct iommufd_config *cfg = default_iommufd_cfg;
+       const int cfg_iommufd = cfg->iommufd;
+       const uint32_t cfg_ioas_id = cfg->ioas_id;
+       struct vfio_device_bind_iommufd bind_req = {};
+       struct vfio_device_attach_iommufd_pt attach_req = {};
+       int rc;
+
+       /* step 1: bind the device to the iommufd */
+       bind_req.argsz = sizeof(bind_req);
+       bind_req.flags = 0;
+       bind_req.iommufd = cfg_iommufd;
+
+       rc = ioctl(vfio_dev_fd, VFIO_DEVICE_BIND_IOMMUFD, &bind_req);
+       if (rc != 0) {
+               RTE_LOG(ERR, EAL, "Device %s cannot bind to iommufd\n", dev_addr);
+               return rc;
+       }
+
+       /* step 2: attach the device to the IOAS */
+       attach_req.argsz = sizeof(attach_req);
+       attach_req.flags = 0;
+       attach_req.pt_id = cfg_ioas_id;
+
+       rc = ioctl(vfio_dev_fd, VFIO_DEVICE_ATTACH_IOMMUFD_PT, &attach_req);
+       if (rc != 0) {
+               RTE_LOG(ERR, EAL, "Device %s cannot attach to ioas\n", dev_addr);
+               return rc;
+       }
+
+       return 0;
+}
+
+/*
+ * rte_memseg_contig_walk() callback: map one contiguous run of memsegs of
+ * length len, starting at ms, into the IOAS described by arg
+ * (a struct ioas_info). Runs belonging to external segment lists are
+ * skipped with a return value of 0.
+ */
+static int
+vfio_iommufd_map_contig(const struct rte_memseg_list *msl,
+                       const struct rte_memseg *ms, size_t len, void *arg)
+{
+       struct ioas_info *ioas = arg;
+
+       if (!msl->external)
+               return iommufd_dma_mem_map(ioas->iommufd, ioas->ioas_id,
+                                          ms->addr_64, ms->iova, len, 1);
+
+       return 0;
+}
+
+/*
+ * rte_memseg_walk() callback: map a single memseg into the IOAS described
+ * by arg (a struct ioas_info).
+ *
+ * Returns 0 without mapping when the segment is
+ *  - external memory that isn't a heap,
+ *  - a segment with an invalid IOVA address, or
+ *  - an internal segment while in IOVA as VA mode (already mapped).
+ */
+static int
+vfio_iommufd_map(const struct rte_memseg_list *msl,
+                const struct rte_memseg *ms, void *arg)
+{
+       struct ioas_info *ioas = arg;
+
+       if ((msl->external && !msl->heap) ||
+           ms->iova == RTE_BAD_IOVA ||
+           (!msl->external && rte_eal_iova_mode() == RTE_IOVA_VA))
+               return 0;
+
+       return iommufd_dma_mem_map(ioas->iommufd, ioas->ioas_id, ms->addr_64,
+                                  ms->iova, ms->len, 1);
+}
+
+/*
+ * Map the DPDK memory segments into the given IOAS.
+ *
+ * Returns 0 on success, otherwise the first non-zero value returned by
+ * one of the memseg walks.
+ */
+static int
+vfio_iommufd_dma_map(int iommufd, uint32_t ioasid)
+{
+       struct ioas_info ioas;
+
+       ioas.iommufd = iommufd;
+       ioas.ioas_id = ioasid;
+
+       if (rte_eal_iova_mode() == RTE_IOVA_VA) {
+               /* in IOVA as VA mode whole contiguous chunks can be mapped
+                * at once instead of going page-by-page.
+                */
+               int rc = rte_memseg_contig_walk(vfio_iommufd_map_contig,
+                                               &ioas);
+               if (rc != 0)
+                       return rc;
+       }
+
+       /* the per-segment walk is still needed: the contig walk above
+        * skipped the external segments.
+        */
+       return rte_memseg_walk(vfio_iommufd_map, &ioas);
+}
+
+/*
+ * Memory event callback: mirror an allocation or free of the range
+ * [addr, addr + len) into the default IOAS.
+ *
+ * RTE_MEM_EVENT_ALLOC passes 1 as the last iommufd_dma_mem_map() argument,
+ * any other event passes 0 (presumably map vs. unmap - confirm against
+ * eal_iommufd.h). Return values of iommufd_dma_mem_map() are not checked.
+ */
+static void
+vfio_iommufd_mem_event_callback(enum rte_mem_event type, const void *addr,
+                               size_t len, void *arg __rte_unused)
+{
+       const int do_map = (type == RTE_MEM_EVENT_ALLOC);
+       struct rte_memseg_list *msl = rte_mem_virt2memseg_list(addr);
+       struct rte_memseg *ms;
+       size_t done;
+
+       /* for IOVA as VA mode, no need to care for IOVA addresses */
+       if (rte_eal_iova_mode() == RTE_IOVA_VA && msl->external == 0) {
+               const uint64_t page_sz = msl->page_sz;
+               uint64_t va = (uint64_t)(uintptr_t)addr;
+
+               /* keep the DMA map/unmap granularity at memseg size */
+               for (done = 0; done < len; done += page_sz, va += page_sz)
+                       iommufd_dma_mem_map(default_iommufd_cfg->iommufd,
+                                           default_iommufd_cfg->ioas_id,
+                                           va, va, page_sz, do_map);
+               return;
+       }
+
+       /* memsegs are contiguous in memory, so step through them in order */
+       ms = rte_mem_virt2memseg(addr, msl);
+       for (done = 0; done < len; done += ms->len, ++ms) {
+               /* some memory segments may have invalid IOVA */
+               if (ms->iova == RTE_BAD_IOVA) {
+                       RTE_LOG(DEBUG, EAL,
+                               "Memory segment at %p has bad IOVA, skipping\n",
+                               ms->addr);
+                       continue;
+               }
+               iommufd_dma_mem_map(default_iommufd_cfg->iommufd,
+                                   default_iommufd_cfg->ioas_id,
+                                   ms->addr_64, ms->iova, ms->len, do_map);
+       }
+}
+
+/*
+ * Open the VFIO character device that belongs to a device.
+ *
+ * The cdev name ("vfio<N>") is taken from the first matching entry in
+ * "<sysfs_base>/<dev_addr>/vfio-dev", its major:minor is read from the
+ * entry's "dev" attribute, and "/dev/vfio/devices/vfio<N>" is opened. The
+ * opened node must be a character device whose device number matches the
+ * one read from sysfs.
+ *
+ * Returns the open fd on success, -1 on any failure (an error is logged).
+ */
+static int
+vfio_iommufd_get_fd(const char *sysfs_base, const char *dev_addr)
+{
+       char vfio_cdev_path[PATH_MAX];
+       char vfio_path[PATH_MAX];
+       char dirname[PATH_MAX];
+       unsigned int major, minor;
+       struct dirent *dent;
+       struct stat st;
+       int vfio_dev_fd = -1;
+       dev_t cdev;
+       DIR *dir;
+       FILE *f;
+
+       snprintf(dirname, sizeof(dirname), "%s/%s/vfio-dev",
+                sysfs_base, dev_addr);
+
+       dir = opendir(dirname);
+       if (dir == NULL) {
+               RTE_LOG(ERR, EAL, "%s(): opendir failed: %s\n",
+                       __func__, strerror(errno));
+               return -1;
+       }
+
+       /* dent stays valid until closedir(), it is used again below */
+       while ((dent = readdir(dir)) != NULL) {
+               if (strncmp(dent->d_name, "vfio", 4) == 0)
+                       break;
+       }
+       if (dent == NULL) {
+               RTE_LOG(ERR, EAL, "%s(): no vfio cdev entry found in %s\n",
+                       __func__, dirname);
+               goto out_closedir;
+       }
+
+       snprintf(vfio_cdev_path, sizeof(vfio_cdev_path),
+                "%s/%s/vfio-dev/%s/dev", sysfs_base, dev_addr, dent->d_name);
+
+       f = fopen(vfio_cdev_path, "r");
+       if (f == NULL) {
+               RTE_LOG(ERR, EAL, "%s(): cannot open sysfs to get major:minor\n",
+                       __func__);
+               goto out_closedir;
+       }
+
+       if (fscanf(f, "%u:%u", &major, &minor) != 2) {
+               RTE_LOG(ERR, EAL, "%s(): cannot parse sysfs to get major:minor\n",
+                       __func__);
+               goto out_fclose;
+       }
+
+       cdev = makedev(major, minor);
+
+       snprintf(vfio_path, sizeof(vfio_path), "/dev/vfio/devices/%s",
+                dent->d_name);
+       vfio_dev_fd = open(vfio_path, O_RDWR);
+       if (vfio_dev_fd == -1) {
+               RTE_LOG(ERR, EAL, "%s(): can't open %s: %s\n",
+                       __func__, vfio_path, strerror(errno));
+               goto out_fclose;
+       }
+
+       /* reject the node unless it really is the cdev sysfs pointed at;
+        * the fd must not be handed back (or leaked) in that case.
+        */
+       if (fstat(vfio_dev_fd, &st) || !S_ISCHR(st.st_mode) ||
+           (cdev != 0 && st.st_rdev != cdev)) {
+               RTE_LOG(ERR, EAL, "%s(): vfio char device is not matched\n",
+                       __func__);
+               close(vfio_dev_fd);
+               vfio_dev_fd = -1;
+       }
+
+out_fclose:
+       fclose(f);
+out_closedir:
+       closedir(dir);
+       return vfio_dev_fd;
+}
+
+/*
+ * Set up a device through the VFIO IOMMUFD/CDEV interface.
+ *
+ * Opens the device's VFIO cdev, binds/attaches it to the default iommufd
+ * and IOAS, performs the one-time DMA mapping of DPDK memory plus memory
+ * event callback registration (primary process only), and finally fills
+ * device_info via VFIO_DEVICE_GET_INFO.
+ *
+ * Returns 0 on success. On failure returns -1, the device fd is closed
+ * exactly once and *vfio_dev_fd is set to a negative value.
+ */
+int
+rte_vfio_iommufd_setup_device(const char *sysfs_base, const char *dev_addr,
+                             int *vfio_dev_fd, struct vfio_device_info *device_info)
+{
+       struct iommufd_config *iommufd_cfg;
+       int iommufd;
+       uint32_t ioas_id;
+       int ret = 0;
+       const struct internal_config *internal_conf =
+               eal_get_internal_configuration();
+
+       iommufd_cfg = default_iommufd_cfg;
+       iommufd = iommufd_cfg->iommufd;
+       ioas_id = iommufd_cfg->ioas_id;
+
+       *vfio_dev_fd = vfio_iommufd_get_fd(sysfs_base, dev_addr);
+       if (*vfio_dev_fd < 0) {
+               RTE_LOG(ERR, EAL, "Failed to get device fd for device %s\n", dev_addr);
+               return -1;
+       }
+
+       if (vfio_iommufd_add_device(dev_addr, *vfio_dev_fd)) {
+               RTE_LOG(ERR, EAL, "Failed to add device %s to iommufd\n", dev_addr);
+               ret = -1;
+               goto err_add_dev;
+       }
+
+       if (!iommufd_cfg->dma_init &&
+           internal_conf->process_type == RTE_PROC_PRIMARY &&
+           iommufd != -1) {
+               /* lock memory hotplug before mapping and release it
+                * after registering callback, to prevent races
+                */
+               rte_mcfg_mem_read_lock();
+               ret = vfio_iommufd_dma_map(iommufd, ioas_id);
+               if (ret) {
+                       RTE_LOG(ERR, EAL,
+                               "%s DMA remapping failed, error "
+                               "%i (%s)\n",
+                               dev_addr, errno, strerror(errno));
+                       rte_mcfg_mem_read_unlock();
+                       ret = -1;
+                       goto err_dma_map;
+               }
+
+               /* register callback for mem events */
+               ret = rte_mem_event_callback_register(
+                       VFIO_IOMMUFD_MEM_EVENT_CLB_NAME,
+                       vfio_iommufd_mem_event_callback, NULL);
+
+               /* unlock memory hotplug */
+               rte_mcfg_mem_read_unlock();
+
+               if (ret && rte_errno != ENOTSUP) {
+                       RTE_LOG(ERR, EAL, "Could not install memory event callback for VFIO\n");
+                       ret = -1;
+                       goto err_dma_map;
+               }
+               if (ret)
+                       RTE_LOG(DEBUG, EAL, "Memory event callbacks not supported\n");
+               else
+                       RTE_LOG(DEBUG, EAL, "Installed memory event callback for VFIO\n");
+
+               iommufd_cfg->dma_init = true;
+       }
+
+       ret = ioctl(*vfio_dev_fd, VFIO_DEVICE_GET_INFO, device_info);
+       if (ret) {
+               RTE_LOG(ERR, EAL, "%s cannot get device info, "
+                       "error %i (%s)\n", dev_addr, errno,
+                       strerror(errno));
+               ret = -1;
+               goto err_dma_map;
+       }
+
+       return 0;
+
+err_dma_map:
+       /* rte_vfio_iommufd_release_device() closes the fd itself once the
+        * detach succeeds. Closing it a second time below could hit an fd
+        * number that has already been reused elsewhere in the process, so
+        * only fall through to close() when the release failed.
+        *
+        * NOTE(review): release also unregisters the memory event callback
+        * while dma_init may already be true - confirm this is intended
+        * when other devices are still set up.
+        */
+       if (rte_vfio_iommufd_release_device(dev_addr, *vfio_dev_fd) == 0) {
+               *vfio_dev_fd = -1;
+               return ret;
+       }
+err_add_dev:
+       close(*vfio_dev_fd);
+       /* do not leave a stale fd number behind for the caller */
+       *vfio_dev_fd = -1;
+       return ret;
+}
+
+/*
+ * Detach a device from its iommufd page table. When the detach succeeds
+ * the device fd is closed and the VFIO iommufd memory event callback is
+ * unregistered; when it fails the fd is left open and an error is logged.
+ * The detach, close and unregister run with the memory hotplug read lock
+ * held.
+ *
+ * Returns 0 on success, otherwise the return value of the detach ioctl.
+ */
+int
+rte_vfio_iommufd_release_device(const char *dev_addr, int vfio_dev_fd)
+{
+       struct vfio_device_detach_iommufd_pt detach_req = {};
+       int rc;
+
+       detach_req.argsz = sizeof(detach_req);
+       detach_req.flags = 0;
+
+       rte_mcfg_mem_read_lock();
+
+       rc = ioctl(vfio_dev_fd, VFIO_DEVICE_DETACH_IOMMUFD_PT, &detach_req);
+       if (rc == 0) {
+               close(vfio_dev_fd);
+               rte_mem_event_callback_unregister(
+                       VFIO_IOMMUFD_MEM_EVENT_CLB_NAME, NULL);
+       } else {
+               RTE_LOG(ERR, EAL, "Device %s cannot detach from iommufd\n", dev_addr);
+       }
+
+       rte_mcfg_mem_read_unlock();
+       return rc;
+}
+
+#else
+/* Stub used when VFIO_IOMMUFD_PRESENT is not defined: always fails with -1. */
+int
+rte_vfio_iommufd_setup_device(__rte_unused const char *sysfs_base,
+                             __rte_unused const char *dev_addr,
+                             __rte_unused int *vfio_dev_fd,
+                             __rte_unused struct vfio_device_info *device_info)
+{
+       return -1;
+}
+
+/* Stub used when VFIO_IOMMUFD_PRESENT is not defined: always fails with -1. */
+int
+rte_vfio_iommufd_release_device(__rte_unused const char *dev_addr,
+                               __rte_unused int vfio_dev_fd)
+{
+       return -1;
+}
+
+#endif /* VFIO_IOMMUFD_PRESENT */
diff --git a/lib/eal/linux/meson.build b/lib/eal/linux/meson.build
index 8081087584..bf246e64c9 100644
--- a/lib/eal/linux/meson.build
+++ b/lib/eal/linux/meson.build
@@ -16,6 +16,7 @@ sources += files(
         'eal_thread.c',
         'eal_timer.c',
         'eal_vfio.c',
+       'eal_vfio_iommufd.c',
         'eal_iommufd.c',
         'eal_vfio_mp_sync.c',
 )
diff --git a/lib/eal/version.map b/lib/eal/version.map
index 30e66a7267..9c1e70feca 100644
--- a/lib/eal/version.map
+++ b/lib/eal/version.map
@@ -396,6 +396,8 @@ EXPERIMENTAL {
 
        rte_iommufd_enable; # WINDOWS_NO_EXPORT
        rte_iommufd_is_enabled; # WINDOWS_NO_EXPORT
+       rte_vfio_iommufd_release_device; # WINDOWS_NO_EXPORT
+       rte_vfio_iommufd_setup_device; # WINDOWS_NO_EXPORT
 };
 
 INTERNAL {
-- 
2.34.1

Reply via email to