On Thu, 29 Jan 2026 21:24:52 +0000 David Matlack <[email protected]> wrote:
> From: Vipin Sharma <[email protected]> > > Implement the live update file handler callbacks to preserve a vfio-pci > device across a Live Update. Subsequent commits will enable userspace to > then retrieve this file after the Live Update. > > Live Update support is scoped only to cdev files (i.e. not > VFIO_GROUP_GET_DEVICE_FD files). > > State about each device is serialized into a new ABI struct > vfio_pci_core_device_ser. The contents of this struct are preserved > across the Live Update to the next kernel using a combination of > Kexec-Handover (KHO) to preserve the page(s) holding the struct and the > Live Update Orchestrator (LUO) to preserve the physical address of the > struct. > > For now the only contents of struct vfio_pci_core_device_ser the > device's PCI segment number and BDF, so that the device can be uniquely > identified after the Live Update. > > Require that userspace disables interrupts on the device prior to > freeze() so that the device does not send any interrupts until new > interrupt handlers have been set up by the next kernel. > > Reset the device and restore its state in the freeze() callback. This > ensures the device can be received by the next kernel in a consistent > state. Eventually this will be dropped and the device can be preserved > across in a running state, but that requires further work in VFIO and > the core PCI layer. > > Note that LUO holds a reference to this file when it is preserved. So > VFIO is guaranteed that vfio_df_device_last_close() will not be called > on this device no matter what userspace does. > > Signed-off-by: Vipin Sharma <[email protected]> > Co-developed-by: David Matlack <[email protected]> > Signed-off-by: David Matlack <[email protected]> > --- > drivers/vfio/pci/vfio_pci.c | 2 +- > drivers/vfio/pci/vfio_pci_liveupdate.c | 84 +++++++++++++++++++++++++- > drivers/vfio/pci/vfio_pci_priv.h | 2 + > drivers/vfio/vfio.h | 13 ---- > drivers/vfio/vfio_main.c | 10 +-- > include/linux/kho/abi/vfio_pci.h | 15 +++++ > include/linux/vfio.h | 28 +++++++++ > 7 files changed, 129 insertions(+), 25 deletions(-) > > diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c > index 19e88322af2c..0260afb9492d 100644 > --- a/drivers/vfio/pci/vfio_pci.c > +++ b/drivers/vfio/pci/vfio_pci.c > @@ -125,7 +125,7 @@ static int vfio_pci_open_device(struct vfio_device > *core_vdev) > return 0; > } > > -static const struct vfio_device_ops vfio_pci_ops = { > +const struct vfio_device_ops vfio_pci_ops = { > .name = "vfio-pci", > .init = vfio_pci_core_init_dev, > .release = vfio_pci_core_release_dev, > diff --git a/drivers/vfio/pci/vfio_pci_liveupdate.c > b/drivers/vfio/pci/vfio_pci_liveupdate.c > index b84e63c0357b..f01de98f1b75 100644 > --- a/drivers/vfio/pci/vfio_pci_liveupdate.c > +++ b/drivers/vfio/pci/vfio_pci_liveupdate.c > @@ -8,25 +8,104 @@ > > #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt > > +#include <linux/kexec_handover.h> > #include <linux/kho/abi/vfio_pci.h> > #include <linux/liveupdate.h> > #include <linux/errno.h> > +#include <linux/vfio.h> > > #include "vfio_pci_priv.h" > > static bool vfio_pci_liveupdate_can_preserve(struct liveupdate_file_handler > *handler, > struct file *file) > { > - return false; > + struct vfio_device_file *df = to_vfio_device_file(file); > + > + if (!df) > + return false; > + > + /* Live Update support is limited to cdev files. */ > + if (df->group) > + return false; > + > + return df->device->ops == &vfio_pci_ops; > } Why can't we use vfio_device_cdev_opened() here and avoid all the new exposure in public headers? struct vfio_device *device = vfio_device_from_file(file); if (!device) return false; if (!vfio_device_cdev_opened(device)) return false; return device->ops == &vfio_pci_ops; > > static int vfio_pci_liveupdate_preserve(struct liveupdate_file_op_args *args) > { > - return -EOPNOTSUPP; > + struct vfio_device *device = vfio_device_from_file(args->file); > + struct vfio_pci_core_device_ser *ser; > + struct vfio_pci_core_device *vdev; > + struct pci_dev *pdev; > + > + vdev = container_of(device, struct vfio_pci_core_device, vdev); > + pdev = vdev->pdev; > + > + if (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM)) > + return -EINVAL; > + > + if (vfio_pci_is_intel_display(pdev)) > + return -EINVAL; Some comments describing what's missing, if these are TODO or DONTCARE would be useful. > + > + ser = kho_alloc_preserve(sizeof(*ser)); > + if (IS_ERR(ser)) > + return PTR_ERR(ser); > + > + ser->bdf = pci_dev_id(pdev); > + ser->domain = pci_domain_nr(pdev->bus); > + > + args->serialized_data = virt_to_phys(ser); > + return 0; > } > > static void vfio_pci_liveupdate_unpreserve(struct liveupdate_file_op_args > *args) > { > + kho_unpreserve_free(phys_to_virt(args->serialized_data)); > +} > + > +static int vfio_pci_liveupdate_freeze(struct liveupdate_file_op_args *args) > +{ > + struct vfio_device *device = vfio_device_from_file(args->file); > + struct vfio_pci_core_device *vdev; > + struct pci_dev *pdev; > + int ret; > + > + vdev = container_of(device, struct vfio_pci_core_device, vdev); > + pdev = vdev->pdev; > + > + guard(mutex)(&device->dev_set->lock); > + > + /* > + * Userspace must disable interrupts on the device prior to freeze so > + * that the device does not send any interrupts until new interrupt > + * handlers have been established by the next kernel. > + */ > + if (vdev->irq_type != VFIO_PCI_NUM_IRQS) { > + pci_err(pdev, "Freeze failed! Interrupts are still enabled.\n"); > + return -EINVAL; > + } > + > + pci_dev_lock(pdev); device_lock() is a dangerous source of deadlocks, for instance how can we know the freeze isn't occurring with an outstanding driver unbind? > + > + ret = pci_load_saved_state(pdev, vdev->pci_saved_state); > + if (ret) > + goto out; > + > + /* > + * Reset the device and restore it back to its original state before > + * handing it to the next kernel. > + * > + * Eventually both of these should be dropped and the device should be > + * kept running with its current state across the Live Update. > + */ > + if (vdev->reset_works) > + ret = __pci_reset_function_locked(pdev); > + > + pci_restore_state(pdev); > + > +out: > + pci_dev_unlock(pdev); > + return ret; > } > > static int vfio_pci_liveupdate_retrieve(struct liveupdate_file_op_args *args) > @@ -42,6 +121,7 @@ static const struct liveupdate_file_ops > vfio_pci_liveupdate_file_ops = { > .can_preserve = vfio_pci_liveupdate_can_preserve, > .preserve = vfio_pci_liveupdate_preserve, > .unpreserve = vfio_pci_liveupdate_unpreserve, > + .freeze = vfio_pci_liveupdate_freeze, > .retrieve = vfio_pci_liveupdate_retrieve, > .finish = vfio_pci_liveupdate_finish, > .owner = THIS_MODULE, > diff --git a/drivers/vfio/pci/vfio_pci_priv.h > b/drivers/vfio/pci/vfio_pci_priv.h > index 68966ec64e51..d3da79b7b03c 100644 > --- a/drivers/vfio/pci/vfio_pci_priv.h > +++ b/drivers/vfio/pci/vfio_pci_priv.h > @@ -11,6 +11,8 @@ > /* Cap maximum number of ioeventfds per device (arbitrary) */ > #define VFIO_PCI_IOEVENTFD_MAX 1000 > > +extern const struct vfio_device_ops vfio_pci_ops; > + > struct vfio_pci_ioeventfd { > struct list_head next; > struct vfio_pci_core_device *vdev; > diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h > index 50128da18bca..6b89edbbf174 100644 > --- a/drivers/vfio/vfio.h > +++ b/drivers/vfio/vfio.h > @@ -16,17 +16,6 @@ struct iommufd_ctx; > struct iommu_group; > struct vfio_container; > > -struct vfio_device_file { > - struct vfio_device *device; > - struct vfio_group *group; > - > - u8 access_granted; > - u32 devid; /* only valid when iommufd is valid */ > - spinlock_t kvm_ref_lock; /* protect kvm field */ > - struct kvm *kvm; > - struct iommufd_ctx *iommufd; /* protected by struct > vfio_device_set::lock */ > -}; > - > void vfio_device_put_registration(struct vfio_device *device); > bool vfio_device_try_get_registration(struct vfio_device *device); > int vfio_df_open(struct vfio_device_file *df); > @@ -34,8 +23,6 @@ void vfio_df_close(struct vfio_device_file *df); > struct vfio_device_file * > vfio_allocate_device_file(struct vfio_device *device); > > -extern const struct file_operations vfio_device_fops; > - > #ifdef CONFIG_VFIO_NOIOMMU > extern bool vfio_noiommu __read_mostly; > #else > diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c > index f7df90c423b4..276f615f0c28 100644 > --- a/drivers/vfio/vfio_main.c > +++ b/drivers/vfio/vfio_main.c > @@ -1436,15 +1436,7 @@ const struct file_operations vfio_device_fops = { > .show_fdinfo = vfio_device_show_fdinfo, > #endif > }; > - > -static struct vfio_device *vfio_device_from_file(struct file *file) > -{ > - struct vfio_device_file *df = file->private_data; > - > - if (file->f_op != &vfio_device_fops) > - return NULL; > - return df->device; > -} > +EXPORT_SYMBOL_GPL(vfio_device_fops); Seems we just need to export vfio_device_from_file(). Thanks, Alex > > /** > * vfio_file_is_valid - True if the file is valid vfio file > diff --git a/include/linux/kho/abi/vfio_pci.h > b/include/linux/kho/abi/vfio_pci.h > index 37a845eed972..9bf58a2f3820 100644 > --- a/include/linux/kho/abi/vfio_pci.h > +++ b/include/linux/kho/abi/vfio_pci.h > @@ -9,6 +9,9 @@ > #ifndef _LINUX_LIVEUPDATE_ABI_VFIO_PCI_H > #define _LINUX_LIVEUPDATE_ABI_VFIO_PCI_H > > +#include <linux/compiler.h> > +#include <linux/types.h> > + > /** > * DOC: VFIO PCI Live Update ABI > * > @@ -25,4 +28,16 @@ > > #define VFIO_PCI_LUO_FH_COMPATIBLE "vfio-pci-v1" > > +/** > + * struct vfio_pci_core_device_ser - Serialized state of a single VFIO PCI > + * device. > + * > + * @bdf: The device's PCI bus, device, and function number. > + * @domain: The device's PCI domain number (segment). > + */ > +struct vfio_pci_core_device_ser { > + u16 bdf; > + u16 domain; > +} __packed; > + > #endif /* _LINUX_LIVEUPDATE_ABI_VFIO_PCI_H */ > diff --git a/include/linux/vfio.h b/include/linux/vfio.h > index e90859956514..9aa1587fea19 100644 > --- a/include/linux/vfio.h > +++ b/include/linux/vfio.h > @@ -81,6 +81,34 @@ struct vfio_device { > #endif > }; > > +struct vfio_device_file { > + struct vfio_device *device; > + struct vfio_group *group; > + > + u8 access_granted; > + u32 devid; /* only valid when iommufd is valid */ > + spinlock_t kvm_ref_lock; /* protect kvm field */ > + struct kvm *kvm; > + struct iommufd_ctx *iommufd; /* protected by struct > vfio_device_set::lock */ > +}; > + > +extern const struct file_operations vfio_device_fops; > + > +static inline struct vfio_device_file *to_vfio_device_file(struct file *file) > +{ > + if (file->f_op != &vfio_device_fops) > + return NULL; > + > + return file->private_data; > +} > + > +static inline struct vfio_device *vfio_device_from_file(struct file *file) > +{ > + struct vfio_device_file *df = to_vfio_device_file(file); > + > + return df ? df->device : NULL; > +} > + > /** > * struct vfio_device_ops - VFIO bus driver device callbacks > *

