On 11/6/2025 11:19 AM, Ján Tomko wrote:
Open iommufd FDs from libvirt backend without
exposing these FDs to XML users, i.e. one per
domain for /dev/iommu and one per iommufd
hostdev for /dev/vfio/devices/vfioX, and pass
the FD to qemu command line.


The part formatting the object and the part formatting the device
should be split.


Sounds good, I will split it into two commits.

Signed-off-by: Nathan Chen <[email protected]>
---
src/qemu/qemu_command.c |  43 +++++++-
src/qemu/qemu_command.h |   3 +-
src/qemu/qemu_domain.c  |   8 ++
src/qemu/qemu_domain.h  |   7 ++
src/qemu/qemu_hotplug.c |   2 +-
src/qemu/qemu_process.c | 232 ++++++++++++++++++++++++++++++++++++++++
6 files changed, 289 insertions(+), 6 deletions(-)

diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c
index 8fd7527645..740a6970f2 100644
--- a/src/qemu/qemu_command.c
+++ b/src/qemu/qemu_command.c
@@ -4730,7 +4730,8 @@ qemuBuildVideoCommandLine(virCommand *cmd,

virJSONValue *
qemuBuildPCIHostdevDevProps(const virDomainDef *def,
-                            virDomainHostdevDef *dev)
+                            virDomainHostdevDef *dev,
+                            virDomainObj *vm)

Hmm, perhaps exposing the iommufd object in the XML would save us from
having to pass this.

We are passing virDomainObj to this function in order to retrieve the you referring to exposing the


{
    g_autoptr(virJSONValue) props = NULL;
    virDomainHostdevSubsysPCI *pcisrc = &dev->source.subsys.u.pci;
@@ -4741,6 +4742,13 @@ qemuBuildPCIHostdevDevProps(const virDomainDef *def,
    const char *iommufdId = NULL;
    /* 'ramfb' property must be omitted unless it's to be enabled */
    bool ramfb = pcisrc->ramfb == VIR_TRISTATE_SWITCH_ON;
+    bool useIommufd = false;
+    qemuDomainObjPrivate *priv = vm ? vm->privateData : NULL;
+
+    if (pcisrc->driver.name == VIR_DEVICE_HOSTDEV_PCI_DRIVER_NAME_VFIO &&
+        pcisrc->driver.iommufd) {
+        useIommufd = true;
+    }

    /* caller has to assign proper passthrough driver name */
    switch (pcisrc->driver.name) {
@@ -4787,6 +4795,18 @@ qemuBuildPCIHostdevDevProps(const virDomainDef *def,
                              NULL) < 0)
        return NULL;

+    if (useIommufd && priv) {
+        g_autofree char *vfioFdName = g_strdup_printf("vfio-%04x: %02x:%02x.%d", +                                                      pcisrc- >addr.domain, pcisrc->addr.bus, +                                                      pcisrc- >addr.slot, pcisrc->addr.function);
+

There's no need to duplicate the list of hostdevs which use iommufd in a
per-domain hash table.

For storing per-device file descriptors, we have per-device private
data.

+        int vfiofd = GPOINTER_TO_INT(g_hash_table_lookup(priv- >vfioDeviceFds, vfioFdName));
+        if (virJSONValueObjectAdd(&props,
+                                  "S:fd", g_strdup_printf("%d", vfiofd),
+                                  NULL) < 0)
+            return NULL;
+    }
+
    if (qemuBuildDeviceAddressProps(props, def, dev->info) < 0)
        return NULL;

@@ -5197,12 +5217,14 @@ qemuBuildAcpiNodesetProps(virCommand *cmd,
static int
qemuBuildHostdevCommandLine(virCommand *cmd,
                            const virDomainDef *def,
-                            virQEMUCaps *qemuCaps)
+                            virQEMUCaps *qemuCaps,
+                            virDomainObj *vm)
{
    size_t i;
    g_autoptr(virJSONValue) props = NULL;
    int iommufd = 0;
    const char * iommufdId = "iommufd0";
+    qemuDomainObjPrivate *priv = vm->privateData;

    for (i = 0; i < def->nhostdevs; i++) {
        virDomainHostdevDef *hostdev = def->hostdevs[i];
@@ -5233,8 +5255,10 @@ qemuBuildHostdevCommandLine(virCommand *cmd,

            if (subsys->u.pci.driver.iommufd && iommufd == 0) {
                iommufd = 1;
+                virCommandPassFD(cmd, priv->iommufd, VIR_COMMAND_PASS_FD_CLOSE_PARENT);
                if (qemuMonitorCreateObjectProps(&props, "iommufd",
                                                 iommufdId,
+                                                 "S:fd", g_strdup_printf("%d", priv->iommufd),
                                                 NULL) < 0)
                    return -1;

@@ -5245,7 +5269,18 @@ qemuBuildHostdevCommandLine(virCommand *cmd,
            if (qemuCommandAddExtDevice(cmd, hostdev->info, def, qemuCaps) < 0)
                return -1;

-            if (!(devprops = qemuBuildPCIHostdevDevProps(def, hostdev)))
+            if (subsys->u.pci.driver.iommufd) {
+                virDomainHostdevSubsysPCI *pcisrc = &hostdev- >source.subsys.u.pci; +                g_autofree char *vfioFdName = g_strdup_printf("vfio- %04x:%02x:%02x.%d", +                                                              pcisrc- >addr.domain, pcisrc->addr.bus, +                                                              pcisrc- >addr.slot, pcisrc->addr.function);
+
+                int vfiofd = GPOINTER_TO_INT(g_hash_table_lookup(priv->vfioDeviceFds, vfioFdName));
+
+                virCommandPassFD(cmd, vfiofd, VIR_COMMAND_PASS_FD_CLOSE_PARENT);

This would become just:

qemuDomainHostdevPrivate *priv = (qemuDomainHostdevPrivate *)vsock- >privateData;

virCommandPassFD(cmd, priv->vfiofd, VIR_COMMAND_PASS_FD_CLOSE_PARENT);

+            }
+
+            if (!(devprops = qemuBuildPCIHostdevDevProps(def, hostdev, vm)))
                return -1;

            if (qemuBuildDeviceCommandlineFromJSON(cmd, devprops, def, qemuCaps) < 0)
@@ -10893,7 +10928,7 @@ qemuBuildCommandLine(virDomainObj *vm,
    if (qemuBuildRedirdevCommandLine(cmd, def, qemuCaps) < 0)
        return NULL;

-    if (qemuBuildHostdevCommandLine(cmd, def, qemuCaps) < 0)
+    if (qemuBuildHostdevCommandLine(cmd, def, qemuCaps, vm) < 0)
        return NULL;

    if (migrateURI)
diff --git a/src/qemu/qemu_command.h b/src/qemu/qemu_command.h
index ad068f1f16..380aac261f 100644
--- a/src/qemu/qemu_command.h
+++ b/src/qemu/qemu_command.h
@@ -180,7 +180,8 @@ qemuBuildThreadContextProps(virJSONValue **tcProps,
/* Current, best practice */
virJSONValue *
qemuBuildPCIHostdevDevProps(const virDomainDef *def,
-                            virDomainHostdevDef *dev);
+                            virDomainHostdevDef *dev,
+                            virDomainObj *vm);

virJSONValue *
qemuBuildRNGDevProps(const virDomainDef *def,
diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c
index a42721efad..86640aa3e3 100644
--- a/src/qemu/qemu_domain.c
+++ b/src/qemu/qemu_domain.c
@@ -1953,6 +1953,11 @@ qemuDomainObjPrivateFree(void *data)

    virChrdevFree(priv->devs);

+    if (priv->iommufd >= 0) {
+        virEventRemoveHandle(priv->iommufd);

There is no handle to remove (and none is needed). So no need for the
condition either.

+        priv->iommufd = -1;
+    }
+
    if (priv->pidMonitored >= 0) {
        virEventRemoveHandle(priv->pidMonitored);
        priv->pidMonitored = -1;
diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c
index 45fc32a663..cecfed94a7 100644
--- a/src/qemu/qemu_process.c
+++ b/src/qemu/qemu_process.c
@@ -25,6 +25,7 @@
#include <unistd.h>
#include <signal.h>
#include <sys/stat.h>
+#include <dirent.h>

We should not need this in qemu_process.

#if WITH_SYS_SYSCALL_H
# include <sys/syscall.h>
#endif
@@ -8091,6 +8092,9 @@ qemuProcessLaunch(virConnectPtr conn,
    if (qemuExtDevicesStart(driver, vm, incomingMigrationExtDevices) < 0)
        goto cleanup;

+    if (qemuProcessOpenVfioFds(vm) < 0)
+        goto cleanup;
+
    if (!(cmd = qemuBuildCommandLine(vm,
                                     incoming ? "defer" : NULL,
                                     vmop,
@@ -10267,3 +10271,231 @@ qemuProcessHandleNbdkitExit(qemuNbdkitProcess *nbdkit,     qemuProcessEventSubmit(vm, QEMU_PROCESS_EVENT_NBDKIT_EXITED, 0, 0, nbdkit);
    virObjectUnlock(vm);
}
+
+/**
+ * qemuProcessOpenIommuFd:
+ * @vm: domain object
+ * @iommuFd: returned file descriptor
+ *
+ * Opens /dev/iommu file descriptor for the VM.
+ *
+ * Returns: 0 on success, -1 on failure
+ */
+static int
+qemuProcessOpenIommuFd(virDomainObj *vm, int *iommuFd)
+{
+    int fd = -1;
+
+    VIR_DEBUG("Opening IOMMU FD for domain %s", vm->def->name);
+
+    if ((fd = open("/dev/iommu", O_RDWR | O_CLOEXEC)) < 0) {
+        if (errno == ENOENT) {
+            virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
+                           _("IOMMU FD support requires /dev/iommu device"));
+        } else {
+            virReportSystemError(errno, "%s",
+                                 _("cannot open /dev/iommu"));
+        }
+        return -1;
+    }
+
+    *iommuFd = fd;
+    VIR_DEBUG("Opened IOMMU FD %d for domain %s", fd, vm->def->name);
+    return 0;
+}
+
+/**
+ * qemuProcessGetVfioDevicePath:
+ * @hostdev: host device definition
+ * @vfioPath: returned VFIO device path
+ *
+ * Constructs the VFIO device path for a PCI hostdev.
+ *
+ * Returns: 0 on success, -1 on failure
+ */
+static int
+qemuProcessGetVfioDevicePath(virDomainHostdevDef *hostdev,

No need to pass the whole hostdev here. Then this function can live in
virpci.c

+                             char **vfioPath)
+{
+    virPCIDeviceAddress *addr;
+    g_autofree char *sysfsPath = NULL;
+    DIR *dir = NULL;
+    struct dirent *entry = NULL;
+    int ret = -1;
+

Reply via email to