On a Friday in 2025, Nathan Chen via Devel wrote:
Open VFIO FDs from libvirt backend without exposing these FDs to XML users, i.e. one per iommufd hostdev for /dev/vfio/devices/vfioX, and pass the FD to qemu command line.Signed-off-by: Nathan Chen <[email protected]> --- src/conf/domain_conf.h | 2 + src/libvirt_private.syms | 1 + src/qemu/qemu_command.c | 26 ++++++++ src/qemu/qemu_domain.c | 39 ++++++++++++ src/qemu/qemu_domain.h | 17 +++++ src/qemu/qemu_process.c | 130 +++++++++++++++++++++++++++++++++++++++ src/util/virpci.c | 69 +++++++++++++++++++++ src/util/virpci.h | 2 + 8 files changed, 286 insertions(+) diff --git a/src/conf/domain_conf.h b/src/conf/domain_conf.h index 4fd8342950..da4ce9fc86 100644 --- a/src/conf/domain_conf.h +++ b/src/conf/domain_conf.h @@ -364,6 +364,8 @@ struct _virDomainHostdevDef { */ virDomainNetDef *parentnet; + virObject *privateData; +
The code adding the privateData should be in a separate patch. (done in my gitlab branch)
virDomainHostdevMode mode;
virDomainStartupPolicy startupPolicy;
bool managed;
diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms
index 4e57e4a8f6..ed2b0d381e 100644
--- a/src/libvirt_private.syms
+++ b/src/libvirt_private.syms
@@ -3159,6 +3159,7 @@ virPCIDeviceGetStubDriverName;
virPCIDeviceGetStubDriverType;
virPCIDeviceGetUnbindFromStub;
virPCIDeviceGetUsedBy;
+virPCIDeviceGetVfioPath;
virPCIDeviceGetVPD;
virPCIDeviceHasPCIExpressLink;
virPCIDeviceIsAssignable;
diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c
index 95d1c2ee98..9b08f66175 100644
--- a/src/qemu/qemu_command.c
+++ b/src/qemu/qemu_command.c
@@ -4756,6 +4756,12 @@ qemuBuildPCIHostdevDevProps(const virDomainDef *def,
const char *iommufdId = NULL;
/* 'ramfb' property must be omitted unless it's to be enabled */
bool ramfb = pcisrc->ramfb == VIR_TRISTATE_SWITCH_ON;
+ bool useIommufd = false;
+
+ if (pcisrc->driver.name == VIR_DEVICE_HOSTDEV_PCI_DRIVER_NAME_VFIO &&
+ pcisrc->driver.iommufd == VIR_TRISTATE_BOOL_YES) {
+ useIommufd = true;
No need for this separate bool, just move the two conditions where the bool is used.
+ }
/* caller has to assign proper passthrough driver name */
switch (pcisrc->driver.name) {
@@ -4802,6 +4808,17 @@ qemuBuildPCIHostdevDevProps(const virDomainDef *def,
NULL) < 0)
return NULL;
+ if (useIommufd && dev->privateData) {
privateData should be allocated unconditionally for all hostdevs, once that is done, it does not need to be checked here.
+ qemuDomainHostdevPrivate *hostdevPriv =
QEMU_DOMAIN_HOSTDEV_PRIVATE(dev);
+
+ if (hostdevPriv->vfioDeviceFd >= 0) {
For 'fd's, a != -1 comparison is preferred (any other negative value is a programming error)
+ if (virJSONValueObjectAdd(&props,
+ "S:fd", g_strdup_printf("%d",
hostdevPriv->vfioDeviceFd),
g_strdup_printf returns an allocated string, but virJSONValueObjectAdd does not "take ownership" of it. Try: g_autofree char *fdstr = g_strdup(..); at the start of the 'if' block and use 'fdstr' instead'
+ NULL) < 0)
+ return NULL;
+ }
+ }
+
if (qemuBuildDeviceAddressProps(props, def, dev->info) < 0)
return NULL;
@@ -5260,6 +5277,15 @@ qemuBuildHostdevCommandLine(virCommand *cmd,
if (qemuCommandAddExtDevice(cmd, hostdev->info, def, qemuCaps) < 0)
return -1;
+ if (subsys->u.pci.driver.iommufd == VIR_TRISTATE_BOOL_YES) {
+ qemuDomainHostdevPrivate *hostdevPriv =
QEMU_DOMAIN_HOSTDEV_PRIVATE(hostdev);
+
+ if (hostdevPriv && hostdevPriv->vfioDeviceFd >= 0) {
No need to check for hostdevPriv if we allocate it for all hostdevs.
+ virCommandPassFD(cmd, hostdevPriv->vfioDeviceFd,
+ VIR_COMMAND_PASS_FD_CLOSE_PARENT);
+ }
+ }
+
if (!(devprops = qemuBuildPCIHostdevDevProps(def, hostdev)))
return -1;
diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c
index ac56fc7cb4..7601bdbb2b 100644
--- a/src/qemu/qemu_domain.c
+++ b/src/qemu/qemu_domain.c
@@ -1238,6 +1238,45 @@ qemuDomainNetworkPrivateFormat(const virDomainNetDef
*net,
}
+static virClass *qemuDomainHostdevPrivateClass;
+
+static void
+qemuDomainHostdevPrivateDispose(void *obj)
+{
+ qemuDomainHostdevPrivate *priv = obj;
+
+ VIR_FORCE_CLOSE(priv->vfioDeviceFd);
+}
+
+
+static int
+qemuDomainHostdevPrivateOnceInit(void)
+{
+ if (!VIR_CLASS_NEW(qemuDomainHostdevPrivate, virClassForObject()))
+ return -1;
+
+ return 0;
+}
+
+VIR_ONCE_GLOBAL_INIT(qemuDomainHostdevPrivate);
+
+virObject *
+qemuDomainHostdevPrivateNew(void)
This functions should also be added to
virQEMUDriverPrivateDataCallbacks as .hostdevNew,
then it can be called from the parser:
virDomainHostdevDefNew(virDomainXMLOption *xmlopt)
{
...
if (xmlopt && xmlopt->privateData.hostdevNew &&
!(def->privateData = xmlopt->privateData.hostdevNew())) {
VIR_FREE(def->info);
VIR_FREE(def);
return NULL;
}
...
}
(Yes, the callers would need to be adjusted too)
+{
+ qemuDomainHostdevPrivate *priv;
+
+ if (qemuDomainHostdevPrivateInitialize() < 0)
...
diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index 45fc32a663..bf245ee8af 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -106,6 +106,7 @@ #include "logging/log_manager.h" #include "logging/log_protocol.h" +#include "util/virpci.h"
No need for the util prefix. Also, can be grouped in the previous group.
#define VIR_FROM_THIS VIR_FROM_QEMU
@@ -8091,6 +8092,9 @@ qemuProcessLaunch(virConnectPtr conn,
if (qemuExtDevicesStart(driver, vm, incomingMigrationExtDevices) < 0)
goto cleanup;
+ if (qemuProcessOpenVfioFds(vm) < 0)
+ goto cleanup;
+
if (!(cmd = qemuBuildCommandLine(vm,
incoming ? "defer" : NULL,
vmop,
@@ -10267,3 +10271,129 @@ qemuProcessHandleNbdkitExit(qemuNbdkitProcess *nbdkit,
qemuProcessEventSubmit(vm, QEMU_PROCESS_EVENT_NBDKIT_EXITED, 0, 0, nbdkit);
virObjectUnlock(vm);
}
+
+/**
+ * qemuProcessOpenVfioDeviceFd:
+ * @hostdev: host device definition
+ * @vfioFd: returned file descriptor
+ *
+ * Opens the VFIO device file descriptor for a hostdev.
+ *
+ * Returns: 0 on success, -1 on failure
This can return the fd on success directly, no need to return it via pointer.
+ */
+static int
+qemuProcessOpenVfioDeviceFd(virDomainHostdevDef *hostdev,
+ int *vfioFd)
+{
+ g_autofree char *vfioPath = NULL;
+ int fd = -1;
+
+
+ if (hostdev->mode != VIR_DOMAIN_HOSTDEV_MODE_SUBSYS ||
+ hostdev->source.subsys.type != VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI) {
+ virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
+ _("VFIO FD only supported for PCI hostdevs"));
+ return -1;
+ }
+
+ if (virPCIDeviceGetVfioPath(&hostdev->source.subsys.u.pci.addr, &vfioPath)
< 0)
+ return -1;
+
+ VIR_DEBUG("Opening VFIO device %s", vfioPath);
+
+ if ((fd = open(vfioPath, O_RDWR | O_CLOEXEC)) < 0) {
+ if (errno == ENOENT) {
+ virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
+ _("VFIO device %1$s not found - ensure device is bound
to vfio-pci driver"),
+ vfioPath);
+ } else {
+ virReportSystemError(errno,
+ _("cannot open VFIO device %1$s"), vfioPath);
+ }
+ return -1;
+ }
+
+ *vfioFd = fd;
+ VIR_DEBUG("Opened VFIO device FD %d for %s", *vfioFd, vfioPath);
+ return 0;
+}
+
+/**
+ * qemuProcessOpenVfioFds:
+ * @vm: domain object
+ *
+ * Opens all necessary VFIO file descriptors for the domain.
+ *
+ * Returns: 0 on success, -1 on failure
+ */
+int
+qemuProcessOpenVfioFds(virDomainObj *vm)
+{
+ size_t i;
+
+ /* Check if we have any hostdevs that need VFIO FDs */
+ for (i = 0; i < vm->def->nhostdevs; i++) {
+ virDomainHostdevDef *hostdev = vm->def->hostdevs[i];
+ qemuDomainHostdevPrivate *hostdevPriv = NULL;
+
+ if (hostdev->mode == VIR_DOMAIN_HOSTDEV_MODE_SUBSYS &&
+ hostdev->source.subsys.type == VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI)
{
+
+ if (hostdev->source.subsys.u.pci.driver.name ==
VIR_DEVICE_HOSTDEV_PCI_DRIVER_NAME_VFIO &&
+ hostdev->source.subsys.u.pci.driver.iommufd ==
VIR_TRISTATE_BOOL_YES) {
+
These two conditions above can be joint into one to save one level of indentation
+ if (!hostdev->privateData) {
+ if (!(hostdev->privateData =
qemuDomainHostdevPrivateNew()))
+ goto error;
+ }
+
This should be allocated in virDomainHostdevDefNew.
+ hostdevPriv = QEMU_DOMAIN_HOSTDEV_PRIVATE(hostdev); + + /* Open VFIO device FD */ + if (qemuProcessOpenVfioDeviceFd(hostdev, &hostdevPriv->vfioDeviceFd) < 0) + goto error; +
+ VIR_DEBUG("Stored VFIO FD %d in hostdev %04x:%02x:%02x.%d private
data",
+ hostdevPriv->vfioDeviceFd,
+ hostdev->source.subsys.u.pci.addr.domain,
+ hostdev->source.subsys.u.pci.addr.bus,
+ hostdev->source.subsys.u.pci.addr.slot,
+ hostdev->source.subsys.u.pci.addr.function);
No need for this debug log since the function above just logged the same info
+ } + } + } + + return 0; + + error: + qemuProcessCloseVfioFds(vm);
No need to close the file descriptors here, they will be closed when the private data gets destroyed along with the rest of the live domain definition (i.e. the copy libvirt makes on domain startup)
+ return -1; +} + +/** + * qemuProcessCloseVfioFds: + * @vm: domain object + * + * Closes all VFIO file descriptors for the domain. + */ +void +qemuProcessCloseVfioFds(virDomainObj *vm)
This function is not necessary
+{
+ size_t i;
+
+ /* Close all VFIO device FDs */
+ for (i = 0; i < vm->def->nhostdevs; i++) {
+ virDomainHostdevDef *hostdev = vm->def->hostdevs[i];
+ qemuDomainHostdevPrivate *hostdevPriv;
+
+ if (!hostdev->privateData)
+ continue;
+
+ hostdevPriv = QEMU_DOMAIN_HOSTDEV_PRIVATE(hostdev);
+
+ if (hostdevPriv->vfioDeviceFd >= 0) {
+ VIR_DEBUG("Closing VFIO device FD %d", hostdevPriv->vfioDeviceFd);
+ VIR_FORCE_CLOSE(hostdevPriv->vfioDeviceFd);
+ }
+ }
+}
Jano
signature.asc
Description: PGP signature
