Open the iommufd file descriptors in the libvirt backend without exposing them to XML users: one /dev/iommu FD per domain and one /dev/vfio/devices/vfioX FD per iommufd hostdev. Pass these FDs to QEMU on its command line.
Signed-off-by: Nathan Chen <nath...@nvidia.com> --- src/qemu/qemu_command.c | 44 +++++++- src/qemu/qemu_command.h | 3 +- src/qemu/qemu_domain.c | 8 ++ src/qemu/qemu_domain.h | 7 ++ src/qemu/qemu_hotplug.c | 2 +- src/qemu/qemu_process.c | 232 ++++++++++++++++++++++++++++++++++++++++ 6 files changed, 290 insertions(+), 6 deletions(-) diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c index 6b3e2ffd0d..359dbb2621 100644 --- a/src/qemu/qemu_command.c +++ b/src/qemu/qemu_command.c @@ -4797,7 +4797,8 @@ qemuBuildVideoCommandLine(virCommand *cmd, virJSONValue * qemuBuildPCIHostdevDevProps(const virDomainDef *def, - virDomainHostdevDef *dev) + virDomainHostdevDef *dev, + virDomainObj *vm) { g_autoptr(virJSONValue) props = NULL; virDomainHostdevSubsysPCI *pcisrc = &dev->source.subsys.u.pci; @@ -4807,6 +4808,13 @@ qemuBuildPCIHostdevDevProps(const virDomainDef *def, const char *driver = NULL; /* 'ramfb' property must be omitted unless it's to be enabled */ bool ramfb = pcisrc->ramfb == VIR_TRISTATE_SWITCH_ON; + bool useIommufd = false; + qemuDomainObjPrivate *priv = vm ? 
vm->privateData : NULL; + + if (pcisrc->driver.name == VIR_DEVICE_HOSTDEV_PCI_DRIVER_NAME_VFIO && + dev->iommufdId) { + useIommufd = true; + } /* caller has to assign proper passthrough driver name */ switch (pcisrc->driver.name) { @@ -4850,6 +4858,18 @@ qemuBuildPCIHostdevDevProps(const virDomainDef *def, NULL) < 0) return NULL; + if (useIommufd && priv) { + g_autofree char *vfioFdName = g_strdup_printf("vfio-%04x:%02x:%02x.%d", + pcisrc->addr.domain, pcisrc->addr.bus, + pcisrc->addr.slot, pcisrc->addr.function); + + int vfiofd = GPOINTER_TO_INT(g_hash_table_lookup(priv->vfioDeviceFds, vfioFdName)); + if (virJSONValueObjectAdd(&props, + "S:fd", g_strdup_printf("%d", vfiofd), + NULL) < 0) + return NULL; + } + if (qemuBuildDeviceAddressProps(props, def, dev->info) < 0) return NULL; @@ -5223,11 +5243,13 @@ qemuBuildHostdevSCSICommandLine(virCommand *cmd, static int qemuBuildHostdevCommandLine(virCommand *cmd, const virDomainDef *def, - virQEMUCaps *qemuCaps) + virQEMUCaps *qemuCaps, + virDomainObj *vm) { size_t i; g_autoptr(virJSONValue) props = NULL; int iommufd = 0; + qemuDomainObjPrivate *priv = vm->privateData; for (i = 0; i < def->nhostdevs; i++) { virDomainHostdevDef *hostdev = def->hostdevs[i]; @@ -5239,8 +5261,11 @@ qemuBuildHostdevCommandLine(virCommand *cmd, if (hostdev->iommufdId && iommufd == 0) { iommufd = 1; + virCommandPassFD(cmd, priv->iommufd, VIR_COMMAND_PASS_FD_CLOSE_PARENT); + if (qemuMonitorCreateObjectProps(&props, "iommufd", hostdev->iommufdId, + "S:fd", g_strdup_printf("%d", priv->iommufd), NULL) < 0) return -1; @@ -5270,7 +5295,18 @@ qemuBuildHostdevCommandLine(virCommand *cmd, if (qemuCommandAddExtDevice(cmd, hostdev->info, def, qemuCaps) < 0) return -1; - if (!(devprops = qemuBuildPCIHostdevDevProps(def, hostdev))) + if (hostdev->iommufdId) { + virDomainHostdevSubsysPCI *pcisrc = &hostdev->source.subsys.u.pci; + g_autofree char *vfioFdName = g_strdup_printf("vfio-%04x:%02x:%02x.%d", + pcisrc->addr.domain, pcisrc->addr.bus, + 
pcisrc->addr.slot, pcisrc->addr.function); + + int vfiofd = GPOINTER_TO_INT(g_hash_table_lookup(priv->vfioDeviceFds, vfioFdName)); + + virCommandPassFD(cmd, vfiofd, VIR_COMMAND_PASS_FD_CLOSE_PARENT); + } + + if (!(devprops = qemuBuildPCIHostdevDevProps(def, hostdev, vm))) return -1; if (qemuBuildDeviceCommandlineFromJSON(cmd, devprops, def, qemuCaps) < 0) @@ -10960,7 +10996,7 @@ qemuBuildCommandLine(virDomainObj *vm, if (qemuBuildRedirdevCommandLine(cmd, def, qemuCaps) < 0) return NULL; - if (qemuBuildHostdevCommandLine(cmd, def, qemuCaps) < 0) + if (qemuBuildHostdevCommandLine(cmd, def, qemuCaps, vm) < 0) return NULL; if (migrateURI) diff --git a/src/qemu/qemu_command.h b/src/qemu/qemu_command.h index ad068f1f16..380aac261f 100644 --- a/src/qemu/qemu_command.h +++ b/src/qemu/qemu_command.h @@ -180,7 +180,8 @@ qemuBuildThreadContextProps(virJSONValue **tcProps, /* Current, best practice */ virJSONValue * qemuBuildPCIHostdevDevProps(const virDomainDef *def, - virDomainHostdevDef *dev); + virDomainHostdevDef *dev, + virDomainObj *vm); virJSONValue * qemuBuildRNGDevProps(const virDomainDef *def, diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c index a2c7c88a7e..2086dbb575 100644 --- a/src/qemu/qemu_domain.c +++ b/src/qemu/qemu_domain.c @@ -1954,6 +1954,11 @@ qemuDomainObjPrivateFree(void *data) virChrdevFree(priv->devs); + if (priv->iommufd >= 0) { + virEventRemoveHandle(priv->iommufd); + priv->iommufd = -1; + } + if (priv->pidMonitored >= 0) { virEventRemoveHandle(priv->pidMonitored); priv->pidMonitored = -1; @@ -1975,6 +1980,7 @@ qemuDomainObjPrivateFree(void *data) g_clear_pointer(&priv->blockjobs, g_hash_table_unref); g_clear_pointer(&priv->fds, g_hash_table_unref); + g_clear_pointer(&priv->vfioDeviceFds, g_hash_table_unref); /* This should never be non-NULL if we get here, but just in case... 
*/ if (priv->eventThread) { @@ -2003,7 +2009,9 @@ qemuDomainObjPrivateAlloc(void *opaque) priv->blockjobs = virHashNew(virObjectUnref); priv->fds = virHashNew(g_object_unref); + priv->vfioDeviceFds = g_hash_table_new(g_str_hash, g_str_equal); + priv->iommufd = -1; priv->pidMonitored = -1; /* agent commands block by default, user can choose different behavior */ diff --git a/src/qemu/qemu_domain.h b/src/qemu/qemu_domain.h index 1afd932764..6460323554 100644 --- a/src/qemu/qemu_domain.h +++ b/src/qemu/qemu_domain.h @@ -266,6 +266,10 @@ struct _qemuDomainObjPrivate { /* named file descriptor groups associated with the VM */ GHashTable *fds; + int iommufd; + + GHashTable *vfioDeviceFds; + char *memoryBackingDir; }; @@ -1172,3 +1176,6 @@ qemuDomainCheckCPU(virArch arch, bool qemuDomainMachineSupportsFloppy(const char *machine, virQEMUCaps *qemuCaps); + +int qemuProcessOpenVfioFds(virDomainObj *vm); +void qemuProcessCloseVfioFds(virDomainObj *vm); diff --git a/src/qemu/qemu_hotplug.c b/src/qemu/qemu_hotplug.c index e9568af125..e0e693e251 100644 --- a/src/qemu/qemu_hotplug.c +++ b/src/qemu/qemu_hotplug.c @@ -1633,7 +1633,7 @@ qemuDomainAttachHostPCIDevice(virQEMUDriver *driver, goto error; } - if (!(devprops = qemuBuildPCIHostdevDevProps(vm->def, hostdev))) + if (!(devprops = qemuBuildPCIHostdevDevProps(vm->def, hostdev, vm))) goto error; qemuDomainObjEnterMonitor(vm); diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index a81c02c9d5..1bc779c6aa 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -25,6 +25,7 @@ #include <unistd.h> #include <signal.h> #include <sys/stat.h> +#include <dirent.h> #if WITH_SYS_SYSCALL_H # include <sys/syscall.h> #endif @@ -8025,6 +8026,9 @@ qemuProcessLaunch(virConnectPtr conn, if (qemuExtDevicesStart(driver, vm, incomingMigrationExtDevices) < 0) goto cleanup; + if (qemuProcessOpenVfioFds(vm) < 0) + goto cleanup; + if (!(cmd = qemuBuildCommandLine(vm, incoming ? 
"defer" : NULL, vmop, @@ -10206,3 +10210,231 @@ qemuProcessHandleNbdkitExit(qemuNbdkitProcess *nbdkit, qemuProcessEventSubmit(vm, QEMU_PROCESS_EVENT_NBDKIT_EXITED, 0, 0, nbdkit); virObjectUnlock(vm); } + +/** + * qemuProcessOpenIommuFd: + * @vm: domain object + * @iommuFd: returned file descriptor + * + * Opens /dev/iommu file descriptor for the VM. + * + * Returns: 0 on success, -1 on failure + */ +static int +qemuProcessOpenIommuFd(virDomainObj *vm, int *iommuFd) +{ + int fd = -1; + + VIR_DEBUG("Opening IOMMU FD for domain %s", vm->def->name); + + if ((fd = open("/dev/iommu", O_RDWR | O_CLOEXEC)) < 0) { + if (errno == ENOENT) { + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", + _("IOMMU FD support requires /dev/iommu device")); + } else { + virReportSystemError(errno, "%s", + _("cannot open /dev/iommu")); + } + return -1; + } + + *iommuFd = fd; + VIR_DEBUG("Opened IOMMU FD %d for domain %s", fd, vm->def->name); + return 0; +} + +/** + * qemuProcessGetVfioDevicePath: + * @hostdev: host device definition + * @vfioPath: returned VFIO device path + * + * Constructs the VFIO device path for a PCI hostdev. 
+ * + * Returns: 0 on success, -1 on failure + */ +static int +qemuProcessGetVfioDevicePath(virDomainHostdevDef *hostdev, + char **vfioPath) +{ + virPCIDeviceAddress *addr; + g_autofree char *sysfsPath = NULL; + DIR *dir = NULL; + struct dirent *entry = NULL; + int ret = -1; + + if (hostdev->mode != VIR_DOMAIN_HOSTDEV_MODE_SUBSYS || + hostdev->source.subsys.type != VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI) { + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("VFIO FD only supported for PCI hostdevs")); + return -1; + } + + addr = &hostdev->source.subsys.u.pci.addr; + + /* Build sysfs path: /sys/bus/pci/devices/DDDD:BB:DD.F/vfio-dev/ */ + sysfsPath = g_strdup_printf("/sys/bus/pci/devices/" + "%04x:%02x:%02x.%d/vfio-dev/", + addr->domain, addr->bus, + addr->slot, addr->function); + + if (virDirOpen(&dir, sysfsPath) < 0) { + virReportSystemError(errno, + _("cannot open VFIO sysfs directory %1$s"), + sysfsPath); + return -1; + } + + /* Find the vfio device name in the directory */ + while (virDirRead(dir, &entry, sysfsPath) > 0) { + if (STRPREFIX(entry->d_name, "vfio")) { + *vfioPath = g_strdup_printf("/dev/vfio/devices/%s", entry->d_name); + ret = 0; + break; + } + } + + if (ret < 0) { + virReportError(VIR_ERR_INTERNAL_ERROR, + _("cannot find VFIO device for PCI device %1$04x:%2$02x:%3$02x.%4$d"), + addr->domain, addr->bus, addr->slot, addr->function); + } + + virDirClose(dir); + return ret; +} + +/** + * qemuProcessOpenVfioDeviceFd: + * @hostdev: host device definition + * @vfioFd: returned file descriptor + * + * Opens the VFIO device file descriptor for a hostdev. 
+ * + * Returns: 0 on success, -1 on failure + */ +static int +qemuProcessOpenVfioDeviceFd(virDomainHostdevDef *hostdev, + int *vfioFd) +{ + g_autofree char *vfioPath = NULL; + int fd = -1; + + if (qemuProcessGetVfioDevicePath(hostdev, &vfioPath) < 0) + return -1; + + VIR_DEBUG("Opening VFIO device %s", vfioPath); + + if ((fd = open(vfioPath, O_RDWR | O_CLOEXEC)) < 0) { + if (errno == ENOENT) { + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, + _("VFIO device %1$s not found - ensure device is bound to vfio-pci driver"), + vfioPath); + } else { + virReportSystemError(errno, + _("cannot open VFIO device %1$s"), vfioPath); + } + return -1; + } + + *vfioFd = fd; + VIR_DEBUG("Opened VFIO device FD %d for %s", *vfioFd, vfioPath); + return 0; +} + +/** + * qemuProcessOpenVfioFds: + * @vm: domain object + * + * Opens all necessary VFIO file descriptors for the domain. + * + * Returns: 0 on success, -1 on failure + */ +int +qemuProcessOpenVfioFds(virDomainObj *vm) +{ + qemuDomainObjPrivate *priv = vm->privateData; + bool needsIommuFd = false; + size_t i; + + /* Check if we have any hostdevs that need VFIO FDs */ + for (i = 0; i < vm->def->nhostdevs; i++) { + virDomainHostdevDef *hostdev = vm->def->hostdevs[i]; + int vfioFd = -1; + g_autofree char *fdname = NULL; + + if (hostdev->mode == VIR_DOMAIN_HOSTDEV_MODE_SUBSYS && + hostdev->source.subsys.type == VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI) { + + /* Check if this hostdev uses VFIO with IOMMU FD */ + if (hostdev->source.subsys.u.pci.driver.name == VIR_DEVICE_HOSTDEV_PCI_DRIVER_NAME_VFIO && + hostdev->iommufdId) { + + needsIommuFd = true; + + /* Open VFIO device FD */ + if (qemuProcessOpenVfioDeviceFd(hostdev, &vfioFd) < 0) + goto error; + + /* Store the FD */ + fdname = g_strdup_printf("vfio-%04x:%02x:%02x.%d", + hostdev->source.subsys.u.pci.addr.domain, + hostdev->source.subsys.u.pci.addr.bus, + hostdev->source.subsys.u.pci.addr.slot, + hostdev->source.subsys.u.pci.addr.function); + + g_hash_table_insert(priv->vfioDeviceFds, 
g_steal_pointer(&fdname), GINT_TO_POINTER(vfioFd)); + + VIR_DEBUG("Stored VFIO FD for device %s", fdname); + } + } + } + + /* Open IOMMU FD if needed */ + if (needsIommuFd) { + int iommuFd = -1; + + if (qemuProcessOpenIommuFd(vm, &iommuFd) < 0) + goto error; + + priv->iommufd = iommuFd; + + VIR_DEBUG("Stored IOMMU FD"); + } + + return 0; + + error: + qemuProcessCloseVfioFds(vm); + return -1; +} + +/** + * qemuProcessCloseVfioFds: + * @vm: domain object + * + * Closes all VFIO file descriptors for the domain. + */ +void +qemuProcessCloseVfioFds(virDomainObj *vm) +{ + qemuDomainObjPrivate *priv = vm->privateData; + GHashTableIter iter; + gpointer key, value; + + /* Close all VFIO device FDs */ + if (priv->vfioDeviceFds) { + g_hash_table_iter_init(&iter, priv->vfioDeviceFds); + while (g_hash_table_iter_next(&iter, &key, &value)) { + int fd = GPOINTER_TO_INT(value); + VIR_DEBUG("Closing VFIO device FD %d for %s", fd, (char*)key); + VIR_FORCE_CLOSE(fd); + } + g_hash_table_remove_all(priv->vfioDeviceFds); + } + + /* Close IOMMU FD */ + if (priv->iommufd >= 0) { + VIR_DEBUG("Closing IOMMU FD %d", priv->iommufd); + VIR_FORCE_CLOSE(priv->iommufd); + } +} -- 2.43.0