On Thu, Jan 29, 2026 at 09:24:52PM +0000, David Matlack wrote:
> From: Vipin Sharma <[email protected]>
> 
> Implement the live update file handler callbacks to preserve a vfio-pci
> device across a Live Update. Subsequent commits will enable userspace to
> then retrieve this file after the Live Update.
> 
> Live Update support is scoped only to cdev files (i.e. not
> VFIO_GROUP_GET_DEVICE_FD files).
> 
> State about each device is serialized into a new ABI struct
> vfio_pci_core_device_ser. The contents of this struct are preserved
> across the Live Update to the next kernel using a combination of
> Kexec-Handover (KHO) to preserve the page(s) holding the struct and the
> Live Update Orchestrator (LUO) to preserve the physical address of the
> struct.
> 
> For now the only contents of struct vfio_pci_core_device_ser the
> device's PCI segment number and BDF, so that the device can be uniquely
> identified after the Live Update.
> 
> Require that userspace disables interrupts on the device prior to
> freeze() so that the device does not send any interrupts until new
> interrupt handlers have been set up by the next kernel.
> 
> Reset the device and restore its state in the freeze() callback. This
> ensures the device can be received by the next kernel in a consistent
> state. Eventually this will be dropped and the device can be preserved
> across in a running state, but that requires further work in VFIO and
> the core PCI layer.
> 
> Note that LUO holds a reference to this file when it is preserved. So
> VFIO is guaranteed that vfio_df_device_last_close() will not be called
> on this device no matter what userspace does.
> 
> Signed-off-by: Vipin Sharma <[email protected]>
> Co-developed-by: David Matlack <[email protected]>
> Signed-off-by: David Matlack <[email protected]>
> ---
>  drivers/vfio/pci/vfio_pci.c            |  2 +-
>  drivers/vfio/pci/vfio_pci_liveupdate.c | 84 +++++++++++++++++++++++++-
>  drivers/vfio/pci/vfio_pci_priv.h       |  2 +
>  drivers/vfio/vfio.h                    | 13 ----
>  drivers/vfio/vfio_main.c               | 10 +--
>  include/linux/kho/abi/vfio_pci.h       | 15 +++++
>  include/linux/vfio.h                   | 28 +++++++++
>  7 files changed, 129 insertions(+), 25 deletions(-)
> 
> diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
> index 19e88322af2c..0260afb9492d 100644
> --- a/drivers/vfio/pci/vfio_pci.c
> +++ b/drivers/vfio/pci/vfio_pci.c
> @@ -125,7 +125,7 @@ static int vfio_pci_open_device(struct vfio_device 
> *core_vdev)
>       return 0;
>  }
>  
> -static const struct vfio_device_ops vfio_pci_ops = {
> +const struct vfio_device_ops vfio_pci_ops = {
>       .name           = "vfio-pci",
>       .init           = vfio_pci_core_init_dev,
>       .release        = vfio_pci_core_release_dev,
> diff --git a/drivers/vfio/pci/vfio_pci_liveupdate.c 
> b/drivers/vfio/pci/vfio_pci_liveupdate.c
> index b84e63c0357b..f01de98f1b75 100644
> --- a/drivers/vfio/pci/vfio_pci_liveupdate.c
> +++ b/drivers/vfio/pci/vfio_pci_liveupdate.c
> @@ -8,25 +8,104 @@
>  
>  #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
>  
> +#include <linux/kexec_handover.h>
>  #include <linux/kho/abi/vfio_pci.h>
>  #include <linux/liveupdate.h>
>  #include <linux/errno.h>
> +#include <linux/vfio.h>
>  
>  #include "vfio_pci_priv.h"
>  
>  static bool vfio_pci_liveupdate_can_preserve(struct liveupdate_file_handler 
> *handler,
>                                            struct file *file)
>  {
> -     return false;
> +     struct vfio_device_file *df = to_vfio_device_file(file);
> +
> +     if (!df)
> +             return false;
> +
> +     /* Live Update support is limited to cdev files. */
> +     if (df->group)
> +             return false;
> +
> +     return df->device->ops == &vfio_pci_ops;
>  }
>  
>  static int vfio_pci_liveupdate_preserve(struct liveupdate_file_op_args *args)
>  {
> -     return -EOPNOTSUPP;
> +     struct vfio_device *device = vfio_device_from_file(args->file);
> +     struct vfio_pci_core_device_ser *ser;
> +     struct vfio_pci_core_device *vdev;
> +     struct pci_dev *pdev;
> +
> +     vdev = container_of(device, struct vfio_pci_core_device, vdev);
> +     pdev = vdev->pdev;
> +
> +     if (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM))
> +             return -EINVAL;
> +
> +     if (vfio_pci_is_intel_display(pdev))
> +             return -EINVAL;
> +
> +     ser = kho_alloc_preserve(sizeof(*ser));
> +     if (IS_ERR(ser))
> +             return PTR_ERR(ser);
> +
> +     ser->bdf = pci_dev_id(pdev);
> +     ser->domain = pci_domain_nr(pdev->bus);
> +
> +     args->serialized_data = virt_to_phys(ser);
> +     return 0;
>  }
>  
>  static void vfio_pci_liveupdate_unpreserve(struct liveupdate_file_op_args 
> *args)
>  {
> +     kho_unpreserve_free(phys_to_virt(args->serialized_data));
> +}
> +
> +static int vfio_pci_liveupdate_freeze(struct liveupdate_file_op_args *args)
> +{
> +     struct vfio_device *device = vfio_device_from_file(args->file);
> +     struct vfio_pci_core_device *vdev;
> +     struct pci_dev *pdev;
> +     int ret;
> +
> +     vdev = container_of(device, struct vfio_pci_core_device, vdev);
> +     pdev = vdev->pdev;
> +
> +     guard(mutex)(&device->dev_set->lock);
> +
> +     /*
> +      * Userspace must disable interrupts on the device prior to freeze so
> +      * that the device does not send any interrupts until new interrupt
> +      * handlers have been established by the next kernel.
> +      */
> +     if (vdev->irq_type != VFIO_PCI_NUM_IRQS) {
> +             pci_err(pdev, "Freeze failed! Interrupts are still enabled.\n");
> +             return -EINVAL;
> +     }
> +
> +     pci_dev_lock(pdev);
> +
> +     ret = pci_load_saved_state(pdev, vdev->pci_saved_state);
> +     if (ret)
> +             goto out;
> +
> +     /*
> +      * Reset the device and restore it back to its original state before
> +      * handing it to the next kernel.
> +      *
> +      * Eventually both of these should be dropped and the device should be
> +      * kept running with its current state across the Live Update.
> +      */
> +     if (vdev->reset_works)
> +             ret = __pci_reset_function_locked(pdev);

I see the 'Eventually both of these should be dropped' comment,
which acknowledges that a reset is a v1 crutch. However, I wanted to
clarify the fallback strategy here.

If vdev->reset_works is false, we skip the reset but still jump into the
new kernel. For devices that don't support FLR, are we comfortable jumping
with the device potentially still hot? 

> +
> +     pci_restore_state(pdev);
> +
> +out:
> +     pci_dev_unlock(pdev);
> +     return ret;
>  }
>  
>  static int vfio_pci_liveupdate_retrieve(struct liveupdate_file_op_args *args)
> @@ -42,6 +121,7 @@ static const struct liveupdate_file_ops 
> vfio_pci_liveupdate_file_ops = {
>       .can_preserve = vfio_pci_liveupdate_can_preserve,
>       .preserve = vfio_pci_liveupdate_preserve,
>       .unpreserve = vfio_pci_liveupdate_unpreserve,
> +     .freeze = vfio_pci_liveupdate_freeze,
>       .retrieve = vfio_pci_liveupdate_retrieve,
>       .finish = vfio_pci_liveupdate_finish,
>       .owner = THIS_MODULE,
> diff --git a/drivers/vfio/pci/vfio_pci_priv.h 
> b/drivers/vfio/pci/vfio_pci_priv.h
> index 68966ec64e51..d3da79b7b03c 100644
> --- a/drivers/vfio/pci/vfio_pci_priv.h
> +++ b/drivers/vfio/pci/vfio_pci_priv.h
> @@ -11,6 +11,8 @@
>  /* Cap maximum number of ioeventfds per device (arbitrary) */
>  #define VFIO_PCI_IOEVENTFD_MAX               1000
>  
> +extern const struct vfio_device_ops vfio_pci_ops;
> +
>  struct vfio_pci_ioeventfd {
>       struct list_head        next;
>       struct vfio_pci_core_device     *vdev;
> diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h
> index 50128da18bca..6b89edbbf174 100644
> --- a/drivers/vfio/vfio.h
> +++ b/drivers/vfio/vfio.h
> @@ -16,17 +16,6 @@ struct iommufd_ctx;
>  struct iommu_group;
>  struct vfio_container;
>  
> -struct vfio_device_file {
> -     struct vfio_device *device;
> -     struct vfio_group *group;
> -
> -     u8 access_granted;
> -     u32 devid; /* only valid when iommufd is valid */
> -     spinlock_t kvm_ref_lock; /* protect kvm field */
> -     struct kvm *kvm;
> -     struct iommufd_ctx *iommufd; /* protected by struct 
> vfio_device_set::lock */
> -};
> -
>  void vfio_device_put_registration(struct vfio_device *device);
>  bool vfio_device_try_get_registration(struct vfio_device *device);
>  int vfio_df_open(struct vfio_device_file *df);
> @@ -34,8 +23,6 @@ void vfio_df_close(struct vfio_device_file *df);
>  struct vfio_device_file *
>  vfio_allocate_device_file(struct vfio_device *device);
>  
> -extern const struct file_operations vfio_device_fops;
> -
>  #ifdef CONFIG_VFIO_NOIOMMU
>  extern bool vfio_noiommu __read_mostly;
>  #else
> diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
> index f7df90c423b4..276f615f0c28 100644
> --- a/drivers/vfio/vfio_main.c
> +++ b/drivers/vfio/vfio_main.c
> @@ -1436,15 +1436,7 @@ const struct file_operations vfio_device_fops = {
>       .show_fdinfo    = vfio_device_show_fdinfo,
>  #endif
>  };
> -
> -static struct vfio_device *vfio_device_from_file(struct file *file)
> -{
> -     struct vfio_device_file *df = file->private_data;
> -
> -     if (file->f_op != &vfio_device_fops)
> -             return NULL;
> -     return df->device;
> -}
> +EXPORT_SYMBOL_GPL(vfio_device_fops);
>  
>  /**
>   * vfio_file_is_valid - True if the file is valid vfio file
> diff --git a/include/linux/kho/abi/vfio_pci.h 
> b/include/linux/kho/abi/vfio_pci.h
> index 37a845eed972..9bf58a2f3820 100644
> --- a/include/linux/kho/abi/vfio_pci.h
> +++ b/include/linux/kho/abi/vfio_pci.h
> @@ -9,6 +9,9 @@
>  #ifndef _LINUX_LIVEUPDATE_ABI_VFIO_PCI_H
>  #define _LINUX_LIVEUPDATE_ABI_VFIO_PCI_H
>  
> +#include <linux/compiler.h>
> +#include <linux/types.h>
> +
>  /**
>   * DOC: VFIO PCI Live Update ABI
>   *
> @@ -25,4 +28,16 @@
>  
>  #define VFIO_PCI_LUO_FH_COMPATIBLE "vfio-pci-v1"
>  
> +/**
> + * struct vfio_pci_core_device_ser - Serialized state of a single VFIO PCI
> + * device.
> + *
> + * @bdf: The device's PCI bus, device, and function number.
> + * @domain: The device's PCI domain number (segment).
> + */
> +struct vfio_pci_core_device_ser {
> +     u16 bdf;
> +     u16 domain;
> +} __packed;
> +
>  #endif /* _LINUX_LIVEUPDATE_ABI_VFIO_PCI_H */
> diff --git a/include/linux/vfio.h b/include/linux/vfio.h
> index e90859956514..9aa1587fea19 100644
> --- a/include/linux/vfio.h
> +++ b/include/linux/vfio.h
> @@ -81,6 +81,34 @@ struct vfio_device {
>  #endif
>  };
>  
> +struct vfio_device_file {
> +     struct vfio_device *device;
> +     struct vfio_group *group;
> +
> +     u8 access_granted;
> +     u32 devid; /* only valid when iommufd is valid */
> +     spinlock_t kvm_ref_lock; /* protect kvm field */
> +     struct kvm *kvm;
> +     struct iommufd_ctx *iommufd; /* protected by struct 
> vfio_device_set::lock */
> +};
> +
> +extern const struct file_operations vfio_device_fops;
> +

There seem to be two extern declarations for vfio_device_fops in both
vfio_pci_priv.h and include/linux/vfio.h. Could we consolidate these?

> +static inline struct vfio_device_file *to_vfio_device_file(struct file *file)
> +{
> +     if (file->f_op != &vfio_device_fops)
> +             return NULL;
> +
> +     return file->private_data;
> +}
> +
> +static inline struct vfio_device *vfio_device_from_file(struct file *file)
> +{
> +     struct vfio_device_file *df = to_vfio_device_file(file);
> +
> +     return df ? df->device : NULL;
> +}
> +

I'm a little uncomfortable with this part. Why is it necessary to expose
the internal vfio_device_file structure to drivers? If this is only to 
support vfio_device_from_file(), could we not keep the structure private
and just export the helper function instead? 

Exposing internal state into the public API introduces some maintenance
constraints for e.g. if vfio_main.c ever changes how it tracks 
file-to-device mappings or its internal security state (like 
access_granted), it now has to worry about breaking external drivers.

I believe we expose the struct just to power these static inline helper
(mainly vfio_device_from_file) ? Instead, could we treat
`vfio_device_file` as an opaque type in the public header (like struct 
iommu_group) and move the implementation of vfio_device_from_file() into
vfio_main.c as an exported symbol? This gives drivers the vfio_device 
pointer they need without leaking the core's private internals. Maybe
something like the following (untested):

diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
index f7df90c423b4..71e3dda53187 100644
--- a/drivers/vfio/vfio_main.c
+++ b/drivers/vfio/vfio_main.c
@@ -1446,6 +1446,18 @@ static struct vfio_device *vfio_device_from_file(struct 
file *file)
        return df->device;
 }

+struct vfio_device *vfio_device_from_file(struct file *file)
+{
+       struct vfio_device_file *df;
+
+       if (file->f_op != &vfio_device_fops)
+               return NULL;
+
+       df = file->private_data;
+       return df->device;
+}
+EXPORT_SYMBOL_GPL(vfio_device_from_file);
+
 /**
  * vfio_file_is_valid - True if the file is valid vfio file
  * @file: VFIO group file or VFIO device file
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index e90859956514..182f192c5641 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -81,6 +81,15 @@ struct vfio_device {
 #endif
 };

+
+struct vfio_device_file;
+
+extern const struct file_operations vfio_device_fops;
+
+/* Public API for drivers */
+struct vfio_device *vfio_device_from_file(struct file *file);
+ [...]
+

>  /**
>   * struct vfio_device_ops - VFIO bus driver device callbacks
>   *
> -- 
> 2.53.0.rc1.225.gd81095ad13-goog
> 

Thanks,
Praan

Reply via email to