On Sun, 21 Jun 2020 01:51:12 +0530 Kirti Wankhede <kwankh...@nvidia.com> wrote:
> These functions save and restore PCI device specific data - config
> space of PCI device.
> Tested save and restore with MSI and MSIX type.
>
> Signed-off-by: Kirti Wankhede <kwankh...@nvidia.com>
> Reviewed-by: Neo Jia <c...@nvidia.com>
> ---
>  hw/vfio/pci.c                 | 95 +++++++++++++++++++++++++++++++++++++++++++
>  include/hw/vfio/vfio-common.h |  2 +
>  2 files changed, 97 insertions(+)
>
> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> index 27f8872db2b1..5ba340aee1d4 100644
> --- a/hw/vfio/pci.c
> +++ b/hw/vfio/pci.c
> @@ -41,6 +41,7 @@
>  #include "trace.h"
>  #include "qapi/error.h"
>  #include "migration/blocker.h"
> +#include "migration/qemu-file.h"
>
>  #define TYPE_VFIO_PCI "vfio-pci"
>  #define PCI_VFIO(obj)    OBJECT_CHECK(VFIOPCIDevice, obj, TYPE_VFIO_PCI)
> @@ -2407,11 +2408,105 @@ static Object *vfio_pci_get_object(VFIODevice *vbasedev)
>      return OBJECT(vdev);
>  }
>
> +static void vfio_pci_save_config(VFIODevice *vbasedev, QEMUFile *f)
> +{
> +    VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
> +    PCIDevice *pdev = &vdev->pdev;
> +
> +    qemu_put_buffer(f, vdev->emulated_config_bits, vdev->config_size);
> +    qemu_put_buffer(f, vdev->pdev.wmask, vdev->config_size);
> +    pci_device_save(pdev, f);
> +
> +    qemu_put_be32(f, vdev->interrupt);
> +    if (vdev->interrupt == VFIO_INT_MSIX) {
> +        msix_save(pdev, f);

msix_save() checks msix_present(), so shouldn't we include this
unconditionally?  Can't there also be state in the vector table
regardless of whether we're currently running in MSI-X mode?
> +    }
> +}
> +
> +static int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f)
> +{
> +    VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
> +    PCIDevice *pdev = &vdev->pdev;
> +    uint32_t interrupt_type;
> +    uint16_t pci_cmd;
> +    int i, ret;
> +
> +    qemu_get_buffer(f, vdev->emulated_config_bits, vdev->config_size);
> +    qemu_get_buffer(f, vdev->pdev.wmask, vdev->config_size);

This doesn't seem safe; why is it ok to indiscriminately copy these
arrays, which are configured via support or masking of various device
features, from the source to the target?

I think this still fails basic feature support negotiation.  For
instance, Intel IGD assignment modifies emulated_config_bits and wmask
to allow the VM BIOS to allocate fake stolen memory for the GPU and
store this value in config space.  This support can be controlled via a
QEMU build-time option, therefore the feature support on the target can
be different from the source.  If this sort of feature set doesn't
match between source and target, I think we'd want to abort the
migration, but we don't have any provisions for that here (a physical
IGD device is obviously just an example as it doesn't support migration
currently).

> +
> +    ret = pci_device_load(pdev, f);
> +    if (ret) {
> +        return ret;
> +    }
> +
> +    /* retore pci bar configuration */
> +    pci_cmd = pci_default_read_config(pdev, PCI_COMMAND, 2);
> +    vfio_pci_write_config(pdev, PCI_COMMAND,
> +                 pci_cmd & (!(PCI_COMMAND_IO | PCI_COMMAND_MEMORY)), 2);

s/!/~/?
Extra parenthesis too.

> +    for (i = 0; i < PCI_ROM_SLOT; i++) {
> +        uint32_t bar = pci_default_read_config(pdev,
> +                                         PCI_BASE_ADDRESS_0 + i * 4, 4);
> +
> +        vfio_pci_write_config(pdev, PCI_BASE_ADDRESS_0 + i * 4, bar, 4);
> +    }
> +
> +    interrupt_type = qemu_get_be32(f);
> +
> +    if (interrupt_type == VFIO_INT_MSI) {
> +        uint32_t msi_flags, msi_addr_lo, msi_addr_hi = 0, msi_data;
> +        bool msi_64bit;
> +
> +        /* restore msi configuration */
> +        msi_flags = pci_default_read_config(pdev,
> +                                        pdev->msi_cap + PCI_MSI_FLAGS, 2);
> +        msi_64bit = (msi_flags & PCI_MSI_FLAGS_64BIT);
> +
> +        vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_FLAGS,
> +                              msi_flags & (!PCI_MSI_FLAGS_ENABLE), 2);
> +

What if I migrate from a device with MSI support to a device without
MSI support, or to a device with MSI support at a different offset?
Who is responsible for triggering a migration fault?

> +        msi_addr_lo = pci_default_read_config(pdev,
> +                                   pdev->msi_cap + PCI_MSI_ADDRESS_LO, 4);
> +        vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_LO,
> +                              msi_addr_lo, 4);
> +
> +        if (msi_64bit) {
> +            msi_addr_hi = pci_default_read_config(pdev,
> +                                   pdev->msi_cap + PCI_MSI_ADDRESS_HI, 4);
> +            vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_HI,
> +                                  msi_addr_hi, 4);
> +        }
> +
> +        msi_data = pci_default_read_config(pdev,
> +            pdev->msi_cap + (msi_64bit ? PCI_MSI_DATA_64 : PCI_MSI_DATA_32),
> +            2);
> +
> +        vfio_pci_write_config(pdev,
> +            pdev->msi_cap + (msi_64bit ?
PCI_MSI_DATA_64 : PCI_MSI_DATA_32),
> +            msi_data, 2);
> +
> +        vfio_pci_write_config(pdev, pdev->msi_cap + PCI_MSI_FLAGS,
> +                              msi_flags | PCI_MSI_FLAGS_ENABLE, 2);
> +    } else if (interrupt_type == VFIO_INT_MSIX) {
> +        uint16_t offset;
> +
> +        offset = pci_default_read_config(pdev,
> +                                 pdev->msix_cap + PCI_MSIX_FLAGS + 1, 2);
> +        /* load enable bit and maskall bit */
> +        vfio_pci_write_config(pdev, pdev->msix_cap + PCI_MSIX_FLAGS + 1,
> +                              offset, 2);
> +        msix_load(pdev, f);

Isn't this ordering backwards, or at least less efficient?  The config
write will cause us to enable MSI-X; presumably we'd have nothing in
the vector table though.  Then msix_load() will write the vector and
pba tables and trigger a use notifier for each vector.  It seems like
that would trigger a bunch of SET_IRQS ioctls as if the guest wrote
individual unmasked vectors to the vector table, whereas if we setup
the vector table and then enable MSI-X, we do it with one ioctl.

Also, same question as above: I'm not sure who is responsible for
making sure both devices support MSI-X and that the capability exists
at the same place on each.  Repeat for essentially every capability.
Are we leaning on the migration regions to fail these migrations
before we get here?  If so, should we be?

Also, besides BARs, the command register, and MSI & MSI-X, there must
be other places where the guest can write config data through to the
device.  pci_device_{save,load}() only sets QEMU's config space.

A couple more theoretical (probably not too distant) examples related
to that: there's a resizable BAR capability that at some point we'll
probably need to allow the guest to interact with (ie. manipulation of
the capability changes the reported region size for a BAR).  How would
we support that with this save/load scheme?
We'll likely also have SR-IOV PFs assigned where we'll perhaps have
support for emulating the SR-IOV capability to call out to a
privileged userspace helper to enable VFs; how does this get extended
to support that type of emulation?

I'm afraid that making carbon copies of emulated_config_bits, wmask,
and invoking pci_device_save/load() doesn't address my concerns that
saving and restoring config space between source and target really
seems like a much more important task than outlined here.  Thanks,

Alex

> +    }
> +    vfio_pci_write_config(pdev, PCI_COMMAND, pci_cmd, 2);
> +    return 0;
> +}
> +
>  static VFIODeviceOps vfio_pci_ops = {
>      .vfio_compute_needs_reset = vfio_pci_compute_needs_reset,
>      .vfio_hot_reset_multi = vfio_pci_hot_reset_multi,
>      .vfio_eoi = vfio_intx_eoi,
>      .vfio_get_object = vfio_pci_get_object,
> +    .vfio_save_config = vfio_pci_save_config,
> +    .vfio_load_config = vfio_pci_load_config,
>  };
>
>  int vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp)
> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
> index 74261feaeac9..d69a7f3ae31e 100644
> --- a/include/hw/vfio/vfio-common.h
> +++ b/include/hw/vfio/vfio-common.h
> @@ -120,6 +120,8 @@ struct VFIODeviceOps {
>      int (*vfio_hot_reset_multi)(VFIODevice *vdev);
>      void (*vfio_eoi)(VFIODevice *vdev);
>      Object *(*vfio_get_object)(VFIODevice *vdev);
> +    void (*vfio_save_config)(VFIODevice *vdev, QEMUFile *f);
> +    int (*vfio_load_config)(VFIODevice *vdev, QEMUFile *f);
>  };
>
>  typedef struct VFIOGroup {