Open the iommufd FDs in the libvirt backend without
exposing them to XML users: one FD per domain for
/dev/iommu and one FD per iommufd hostdev for
/dev/vfio/devices/vfioX. Pass these FDs to the
QEMU command line.

Signed-off-by: Nathan Chen <nath...@nvidia.com>
---
 src/qemu/qemu_command.c |  44 +++++++-
 src/qemu/qemu_command.h |   3 +-
 src/qemu/qemu_domain.c  |   8 ++
 src/qemu/qemu_domain.h  |   7 ++
 src/qemu/qemu_hotplug.c |   2 +-
 src/qemu/qemu_process.c | 232 ++++++++++++++++++++++++++++++++++++++++
 6 files changed, 290 insertions(+), 6 deletions(-)

diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c
index 6b3e2ffd0d..359dbb2621 100644
--- a/src/qemu/qemu_command.c
+++ b/src/qemu/qemu_command.c
@@ -4797,7 +4797,8 @@ qemuBuildVideoCommandLine(virCommand *cmd,
 
 virJSONValue *
 qemuBuildPCIHostdevDevProps(const virDomainDef *def,
-                            virDomainHostdevDef *dev)
+                            virDomainHostdevDef *dev,
+                            virDomainObj *vm)
 {
     g_autoptr(virJSONValue) props = NULL;
     virDomainHostdevSubsysPCI *pcisrc = &dev->source.subsys.u.pci;
@@ -4807,6 +4808,13 @@ qemuBuildPCIHostdevDevProps(const virDomainDef *def,
     const char *driver = NULL;
     /* 'ramfb' property must be omitted unless it's to be enabled */
     bool ramfb = pcisrc->ramfb == VIR_TRISTATE_SWITCH_ON;
+    bool useIommufd = false;
+    qemuDomainObjPrivate *priv = vm ? vm->privateData : NULL;
+
+    if (pcisrc->driver.name == VIR_DEVICE_HOSTDEV_PCI_DRIVER_NAME_VFIO &&
+        dev->iommufdId) {
+        useIommufd = true;
+    }
 
     /* caller has to assign proper passthrough driver name */
     switch (pcisrc->driver.name) {
@@ -4850,6 +4858,18 @@ qemuBuildPCIHostdevDevProps(const virDomainDef *def,
                               NULL) < 0)
         return NULL;
 
+    if (useIommufd && priv) {
+        g_autofree char *vfioFdName = g_strdup_printf("vfio-%04x:%02x:%02x.%d",
+                                                      pcisrc->addr.domain, 
pcisrc->addr.bus,
+                                                      pcisrc->addr.slot, 
pcisrc->addr.function);
+
+        int vfiofd = GPOINTER_TO_INT(g_hash_table_lookup(priv->vfioDeviceFds, 
vfioFdName));
+        if (virJSONValueObjectAdd(&props,
+                                  "S:fd", g_strdup_printf("%d", vfiofd),
+                                  NULL) < 0)
+            return NULL;
+    }
+
     if (qemuBuildDeviceAddressProps(props, def, dev->info) < 0)
         return NULL;
 
@@ -5223,11 +5243,13 @@ qemuBuildHostdevSCSICommandLine(virCommand *cmd,
 static int
 qemuBuildHostdevCommandLine(virCommand *cmd,
                             const virDomainDef *def,
-                            virQEMUCaps *qemuCaps)
+                            virQEMUCaps *qemuCaps,
+                            virDomainObj *vm)
 {
     size_t i;
     g_autoptr(virJSONValue) props = NULL;
     int iommufd = 0;
+    qemuDomainObjPrivate *priv = vm->privateData;
 
     for (i = 0; i < def->nhostdevs; i++) {
         virDomainHostdevDef *hostdev = def->hostdevs[i];
@@ -5239,8 +5261,11 @@ qemuBuildHostdevCommandLine(virCommand *cmd,
 
         if (hostdev->iommufdId && iommufd == 0) {
             iommufd = 1;
+            virCommandPassFD(cmd, priv->iommufd, 
VIR_COMMAND_PASS_FD_CLOSE_PARENT);
+
             if (qemuMonitorCreateObjectProps(&props, "iommufd",
                                              hostdev->iommufdId,
+                                             "S:fd", g_strdup_printf("%d", 
priv->iommufd),
                                              NULL) < 0)
                 return -1;
 
@@ -5270,7 +5295,18 @@ qemuBuildHostdevCommandLine(virCommand *cmd,
             if (qemuCommandAddExtDevice(cmd, hostdev->info, def, qemuCaps) < 0)
                 return -1;
 
-            if (!(devprops = qemuBuildPCIHostdevDevProps(def, hostdev)))
+            if (hostdev->iommufdId) {
+                virDomainHostdevSubsysPCI *pcisrc = 
&hostdev->source.subsys.u.pci;
+                g_autofree char *vfioFdName = 
g_strdup_printf("vfio-%04x:%02x:%02x.%d",
+                                                              
pcisrc->addr.domain, pcisrc->addr.bus,
+                                                              
pcisrc->addr.slot, pcisrc->addr.function);
+
+                int vfiofd = 
GPOINTER_TO_INT(g_hash_table_lookup(priv->vfioDeviceFds, vfioFdName));
+
+                virCommandPassFD(cmd, vfiofd, 
VIR_COMMAND_PASS_FD_CLOSE_PARENT);
+            }
+
+            if (!(devprops = qemuBuildPCIHostdevDevProps(def, hostdev, vm)))
                 return -1;
 
             if (qemuBuildDeviceCommandlineFromJSON(cmd, devprops, def, 
qemuCaps) < 0)
@@ -10960,7 +10996,7 @@ qemuBuildCommandLine(virDomainObj *vm,
     if (qemuBuildRedirdevCommandLine(cmd, def, qemuCaps) < 0)
         return NULL;
 
-    if (qemuBuildHostdevCommandLine(cmd, def, qemuCaps) < 0)
+    if (qemuBuildHostdevCommandLine(cmd, def, qemuCaps, vm) < 0)
         return NULL;
 
     if (migrateURI)
diff --git a/src/qemu/qemu_command.h b/src/qemu/qemu_command.h
index ad068f1f16..380aac261f 100644
--- a/src/qemu/qemu_command.h
+++ b/src/qemu/qemu_command.h
@@ -180,7 +180,8 @@ qemuBuildThreadContextProps(virJSONValue **tcProps,
 /* Current, best practice */
 virJSONValue *
 qemuBuildPCIHostdevDevProps(const virDomainDef *def,
-                            virDomainHostdevDef *dev);
+                            virDomainHostdevDef *dev,
+                            virDomainObj *vm);
 
 virJSONValue *
 qemuBuildRNGDevProps(const virDomainDef *def,
diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c
index a2c7c88a7e..2086dbb575 100644
--- a/src/qemu/qemu_domain.c
+++ b/src/qemu/qemu_domain.c
@@ -1954,6 +1954,11 @@ qemuDomainObjPrivateFree(void *data)
 
     virChrdevFree(priv->devs);
 
+    if (priv->iommufd >= 0) {
+        virEventRemoveHandle(priv->iommufd);
+        priv->iommufd = -1;
+    }
+
     if (priv->pidMonitored >= 0) {
         virEventRemoveHandle(priv->pidMonitored);
         priv->pidMonitored = -1;
@@ -1975,6 +1980,7 @@ qemuDomainObjPrivateFree(void *data)
 
     g_clear_pointer(&priv->blockjobs, g_hash_table_unref);
     g_clear_pointer(&priv->fds, g_hash_table_unref);
+    g_clear_pointer(&priv->vfioDeviceFds, g_hash_table_unref);
 
     /* This should never be non-NULL if we get here, but just in case... */
     if (priv->eventThread) {
@@ -2003,7 +2009,9 @@ qemuDomainObjPrivateAlloc(void *opaque)
 
     priv->blockjobs = virHashNew(virObjectUnref);
     priv->fds = virHashNew(g_object_unref);
+    priv->vfioDeviceFds = g_hash_table_new(g_str_hash, g_str_equal);
 
+    priv->iommufd = -1;
     priv->pidMonitored = -1;
 
     /* agent commands block by default, user can choose different behavior */
diff --git a/src/qemu/qemu_domain.h b/src/qemu/qemu_domain.h
index 1afd932764..6460323554 100644
--- a/src/qemu/qemu_domain.h
+++ b/src/qemu/qemu_domain.h
@@ -266,6 +266,10 @@ struct _qemuDomainObjPrivate {
     /* named file descriptor groups associated with the VM */
     GHashTable *fds;
 
+    int iommufd;
+
+    GHashTable *vfioDeviceFds;
+
     char *memoryBackingDir;
 };
 
@@ -1172,3 +1176,6 @@ qemuDomainCheckCPU(virArch arch,
 bool
 qemuDomainMachineSupportsFloppy(const char *machine,
                                 virQEMUCaps *qemuCaps);
+
+int qemuProcessOpenVfioFds(virDomainObj *vm);
+void qemuProcessCloseVfioFds(virDomainObj *vm);
diff --git a/src/qemu/qemu_hotplug.c b/src/qemu/qemu_hotplug.c
index e9568af125..e0e693e251 100644
--- a/src/qemu/qemu_hotplug.c
+++ b/src/qemu/qemu_hotplug.c
@@ -1633,7 +1633,7 @@ qemuDomainAttachHostPCIDevice(virQEMUDriver *driver,
         goto error;
     }
 
-    if (!(devprops = qemuBuildPCIHostdevDevProps(vm->def, hostdev)))
+    if (!(devprops = qemuBuildPCIHostdevDevProps(vm->def, hostdev, vm)))
         goto error;
 
     qemuDomainObjEnterMonitor(vm);
diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c
index a81c02c9d5..1bc779c6aa 100644
--- a/src/qemu/qemu_process.c
+++ b/src/qemu/qemu_process.c
@@ -25,6 +25,7 @@
 #include <unistd.h>
 #include <signal.h>
 #include <sys/stat.h>
+#include <dirent.h>
 #if WITH_SYS_SYSCALL_H
 # include <sys/syscall.h>
 #endif
@@ -8025,6 +8026,9 @@ qemuProcessLaunch(virConnectPtr conn,
     if (qemuExtDevicesStart(driver, vm, incomingMigrationExtDevices) < 0)
         goto cleanup;
 
+    if (qemuProcessOpenVfioFds(vm) < 0)
+        goto cleanup;
+
     if (!(cmd = qemuBuildCommandLine(vm,
                                      incoming ? "defer" : NULL,
                                      vmop,
@@ -10206,3 +10210,231 @@ qemuProcessHandleNbdkitExit(qemuNbdkitProcess *nbdkit,
     qemuProcessEventSubmit(vm, QEMU_PROCESS_EVENT_NBDKIT_EXITED, 0, 0, nbdkit);
     virObjectUnlock(vm);
 }
+
+/**
+ * qemuProcessOpenIommuFd:
+ * @vm: domain object; only its name is used, for the debug messages
+ * @iommuFd: set to the opened descriptor on success, untouched on failure
+ *
+ * Opens /dev/iommu read-write with close-on-exec set.
+ *
+ * Returns: 0 on success, -1 (with an error reported) on failure
+ */
+static int
+qemuProcessOpenIommuFd(virDomainObj *vm, int *iommuFd)
+{
+    int iommu = -1;
+
+    VIR_DEBUG("Opening IOMMU FD for domain %s", vm->def->name);
+
+    iommu = open("/dev/iommu", O_RDWR | O_CLOEXEC);
+    if (iommu >= 0) {
+        *iommuFd = iommu;
+        VIR_DEBUG("Opened IOMMU FD %d for domain %s", iommu, vm->def->name);
+        return 0;
+    }
+    /* ENOENT becomes a "config unsupported" error, the rest use errno */
+    if (errno == ENOENT) {
+        virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
+                       _("IOMMU FD support requires /dev/iommu device"));
+    } else {
+        virReportSystemError(errno, "%s", _("cannot open /dev/iommu"));
+    }
+    return -1;
+}
+
+/**
+ * qemuProcessGetVfioDevicePath:
+ * @hostdev: host device definition; must be a PCI subsystem hostdev
+ * @vfioPath: set to a newly allocated "/dev/vfio/devices/<name>" string
+ *            on success (caller frees), untouched on failure
+ *
+ * Scans /sys/bus/pci/devices/<address>/vfio-dev/ for the first entry
+ * whose name starts with "vfio" and builds the matching device path.
+ *
+ * Returns: 0 on success, -1 (with an error reported) on failure
+ */
+static int
+qemuProcessGetVfioDevicePath(virDomainHostdevDef *hostdev,
+                             char **vfioPath)
+{
+    virPCIDeviceAddress *addr;
+    g_autofree char *sysfsPath = NULL;
+    g_autoptr(DIR) dir = NULL;
+    struct dirent *entry = NULL;
+    int rc;
+
+    if (hostdev->mode != VIR_DOMAIN_HOSTDEV_MODE_SUBSYS ||
+        hostdev->source.subsys.type != VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI) {
+        virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
+                       _("VFIO FD only supported for PCI hostdevs"));
+        return -1;
+    }
+
+    addr = &hostdev->source.subsys.u.pci.addr;
+
+    /* Build sysfs path: /sys/bus/pci/devices/DDDD:BB:DD.F/vfio-dev/ */
+    sysfsPath = g_strdup_printf("/sys/bus/pci/devices/"
+                                "%04x:%02x:%02x.%d/vfio-dev/",
+                                addr->domain, addr->bus,
+                                addr->slot, addr->function);
+
+    /* virDirOpen() reports the failure itself; reporting again here
+     * would overwrite that error using a possibly stale errno. */
+    if (virDirOpen(&dir, sysfsPath) < 0)
+        return -1;
+
+    /* Find the vfio device name in the directory */
+    while ((rc = virDirRead(dir, &entry, sysfsPath)) > 0) {
+        if (STRPREFIX(entry->d_name, "vfio")) {
+            *vfioPath = g_strdup_printf("/dev/vfio/devices/%s", entry->d_name);
+            return 0;
+        }
+    }
+
+    /* rc < 0 is a read error that virDirRead() has already reported;
+     * only a clean end of directory means the device is missing. */
+    if (rc == 0) {
+        virReportError(VIR_ERR_INTERNAL_ERROR,
+                       _("cannot find VFIO device for PCI device %1$04x:%2$02x:%3$02x.%4$d"),
+                       addr->domain, addr->bus, addr->slot, addr->function);
+    }
+
+    return -1;
+}
+
+/**
+ * qemuProcessOpenVfioDeviceFd:
+ * @hostdev: host device definition
+ * @vfioFd: set to the opened descriptor on success, untouched on failure
+ *
+ * Looks up the VFIO device path of @hostdev with
+ * qemuProcessGetVfioDevicePath() and opens it read-write, close-on-exec.
+ *
+ * Returns: 0 on success, -1 (with an error reported) on failure
+ */
+static int
+qemuProcessOpenVfioDeviceFd(virDomainHostdevDef *hostdev,
+                            int *vfioFd)
+{
+    g_autofree char *devNode = NULL;
+    int devFd = -1;
+
+    if (qemuProcessGetVfioDevicePath(hostdev, &devNode) < 0)
+        return -1;
+
+    VIR_DEBUG("Opening VFIO device %s", devNode);
+
+    devFd = open(devNode, O_RDWR | O_CLOEXEC);
+    if (devFd >= 0) {
+        *vfioFd = devFd;
+        VIR_DEBUG("Opened VFIO device FD %d for %s", *vfioFd, devNode);
+        return 0;
+    }
+    if (errno == ENOENT) {
+        virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
+                       _("VFIO device %1$s not found - ensure device is bound to vfio-pci driver"),
+                       devNode);
+    } else {
+        virReportSystemError(errno, _("cannot open VFIO device %1$s"), devNode);
+    }
+    return -1;
+}
+
+/**
+ * qemuProcessOpenVfioFds:
+ * @vm: domain object
+ *
+ * Opens a VFIO device FD for each PCI hostdev using the VFIO driver with
+ * an iommufd ID (kept in priv->vfioDeviceFds) and, if there is any such
+ * hostdev, /dev/iommu (kept in priv->iommufd). Cleans up on failure.
+ *
+ * Returns: 0 on success, -1 on failure
+ */
+int
+qemuProcessOpenVfioFds(virDomainObj *vm)
+{
+    qemuDomainObjPrivate *priv = vm->privateData;
+    bool needsIommuFd = false;
+    size_t i;
+
+    for (i = 0; i < vm->def->nhostdevs; i++) {
+        virDomainHostdevDef *hostdev = vm->def->hostdevs[i];
+        virDomainHostdevSubsysPCI *pcisrc = &hostdev->source.subsys.u.pci;
+        int vfioFd = -1;
+        g_autofree char *fdname = NULL;
+
+        /* Only PCI hostdevs using VFIO with an IOMMU FD need a device FD */
+        if (hostdev->mode != VIR_DOMAIN_HOSTDEV_MODE_SUBSYS ||
+            hostdev->source.subsys.type != VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI) {
+            continue;
+        }
+
+        if (pcisrc->driver.name != VIR_DEVICE_HOSTDEV_PCI_DRIVER_NAME_VFIO ||
+            !hostdev->iommufdId) {
+            continue;
+        }
+
+        needsIommuFd = true;
+
+        if (qemuProcessOpenVfioDeviceFd(hostdev, &vfioFd) < 0)
+            goto error;
+
+        fdname = g_strdup_printf("vfio-%04x:%02x:%02x.%d",
+                                 pcisrc->addr.domain, pcisrc->addr.bus,
+                                 pcisrc->addr.slot, pcisrc->addr.function);
+
+        /* Log before the insert: g_steal_pointer() resets fdname to NULL,
+         * so logging afterwards would pass NULL to "%s". */
+        VIR_DEBUG("Stored VFIO FD for device %s", fdname);
+        g_hash_table_insert(priv->vfioDeviceFds,
+                            g_steal_pointer(&fdname), GINT_TO_POINTER(vfioFd));
+    }
+
+    /* Open IOMMU FD if needed */
+    if (needsIommuFd) {
+        int iommuFd = -1;
+
+        if (qemuProcessOpenIommuFd(vm, &iommuFd) < 0)
+            goto error;
+        priv->iommufd = iommuFd;
+        VIR_DEBUG("Stored IOMMU FD");
+    }
+
+    return 0;
+
+ error:
+    qemuProcessCloseVfioFds(vm);
+    return -1;
+}
+
+/**
+ * qemuProcessCloseVfioFds:
+ * @vm: domain object
+ *
+ * Closes all VFIO device FDs (freeing their keys) and the IOMMU FD.
+ */
+void
+qemuProcessCloseVfioFds(virDomainObj *vm)
+{
+    qemuDomainObjPrivate *priv = vm->privateData;
+    GHashTableIter iter;
+    gpointer key, value;
+
+    if (priv->vfioDeviceFds) {
+        g_hash_table_iter_init(&iter, priv->vfioDeviceFds);
+        while (g_hash_table_iter_next(&iter, &key, &value)) {
+            int fd = GPOINTER_TO_INT(value);
+            VIR_DEBUG("Closing VFIO device FD %d for %s", fd, (char *)key);
+            VIR_FORCE_CLOSE(fd);
+            /* Drop the entry without destroy notifiers, free the key here */
+            g_hash_table_iter_steal(&iter);
+            g_free(key);
+        }
+    }
+
+    if (priv->iommufd >= 0) {
+        VIR_DEBUG("Closing IOMMU FD %d", priv->iommufd);
+        VIR_FORCE_CLOSE(priv->iommufd);
+    }
+}
-- 
2.43.0

Reply via email to