On 7/2/2025 3:17 AM, Cédric Le Goater wrote:
Hello Steve,

On 7/1/25 18:12, Steven Sistare wrote:
Hi Cédric, what do we need to do to get this patch in, along with the patch
"preserve INTx"? Is it just a matter of review, or are there conflicts to
resolve?

I haven't looked at it yet. I will before the end of the week.

I should send the last VFIO PR for QEMU 10.1 on Friday. I will be on PTO
next week.

Hi Zhenzhong,
With Cédric out next week, we have very little time to finish the iommufd
series. I can post V6 today if you are satisfied with my most recent comments
and if you review patch 29, "vfio/iommufd: register container for cpr".

- Steve

On 6/10/2025 11:39 AM, Steve Sistare wrote:
Save the MSI message area as part of vfio-pci vmstate, and preserve the
interrupt and notifier eventfds.  migrate_incoming loads the MSI data,
then the vfio-pci post_load handler finds the eventfds in CPR state,
rebuilds vector data structures, and attaches the interrupts to the new
KVM instance.

Signed-off-by: Steve Sistare <steven.sist...@oracle.com>
---
  hw/vfio/pci.h              |  2 +
  include/hw/vfio/vfio-cpr.h |  8 ++++
  hw/vfio/cpr.c              | 97 ++++++++++++++++++++++++++++++++++++++++++++++
  hw/vfio/pci.c              | 54 ++++++++++++++++++++++++--
  4 files changed, 158 insertions(+), 3 deletions(-)

diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
index 6e4840d..4d1203c 100644
--- a/hw/vfio/pci.h
+++ b/hw/vfio/pci.h
@@ -217,6 +217,8 @@ void vfio_pci_add_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector,
  void vfio_pci_prepare_kvm_msi_virq_batch(VFIOPCIDevice *vdev);
  void vfio_pci_commit_kvm_msi_virq_batch(VFIOPCIDevice *vdev);
  bool vfio_pci_intx_enable(VFIOPCIDevice *vdev, Error **errp);
+void vfio_pci_msix_set_notifiers(VFIOPCIDevice *vdev);
+void vfio_pci_msi_set_handler(VFIOPCIDevice *vdev, int nr);
  uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len);
  void vfio_pci_write_config(PCIDevice *pdev,
diff --git a/include/hw/vfio/vfio-cpr.h b/include/hw/vfio/vfio-cpr.h
index 8bf85b9..25e74ee 100644
--- a/include/hw/vfio/vfio-cpr.h
+++ b/include/hw/vfio/vfio-cpr.h
@@ -15,6 +15,7 @@
  struct VFIOContainer;
  struct VFIOContainerBase;
  struct VFIOGroup;
+struct VFIOPCIDevice;
  typedef struct VFIOContainerCPR {
      Error *blocker;
@@ -52,6 +53,13 @@ void vfio_cpr_giommu_remap(struct VFIOContainerBase *bcontainer,
  bool vfio_cpr_ram_discard_register_listener(
      struct VFIOContainerBase *bcontainer, MemoryRegionSection *section);
+void vfio_cpr_save_vector_fd(struct VFIOPCIDevice *vdev, const char *name,
+                             int nr, int fd);
+int vfio_cpr_load_vector_fd(struct VFIOPCIDevice *vdev, const char *name,
+                            int nr);
+void vfio_cpr_delete_vector_fd(struct VFIOPCIDevice *vdev, const char *name,
+                               int nr);
+
  extern const VMStateDescription vfio_cpr_pci_vmstate;
  #endif /* HW_VFIO_VFIO_CPR_H */
diff --git a/hw/vfio/cpr.c b/hw/vfio/cpr.c
index fdbb58e..e467373 100644
--- a/hw/vfio/cpr.c
+++ b/hw/vfio/cpr.c
@@ -9,6 +9,8 @@
  #include "hw/vfio/vfio-device.h"
  #include "hw/vfio/vfio-cpr.h"
  #include "hw/vfio/pci.h"
+#include "hw/pci/msix.h"
+#include "hw/pci/msi.h"
  #include "migration/cpr.h"
  #include "qapi/error.h"
  #include "system/runstate.h"
@@ -40,6 +42,69 @@ void vfio_cpr_unregister_container(VFIOContainerBase *bcontainer)
      migration_remove_notifier(&bcontainer->cpr_reboot_notifier);
  }
+#define STRDUP_VECTOR_FD_NAME(vdev, name)   \
+    g_strdup_printf("%s_%s", (vdev)->vbasedev.name, (name))
+
+void vfio_cpr_save_vector_fd(VFIOPCIDevice *vdev, const char *name, int nr,
+                             int fd)
+{
+    g_autofree char *fdname = STRDUP_VECTOR_FD_NAME(vdev, name);
+    cpr_save_fd(fdname, nr, fd);
+}
+
+int vfio_cpr_load_vector_fd(VFIOPCIDevice *vdev, const char *name, int nr)
+{
+    g_autofree char *fdname = STRDUP_VECTOR_FD_NAME(vdev, name);
+    return cpr_find_fd(fdname, nr);
+}
+
+void vfio_cpr_delete_vector_fd(VFIOPCIDevice *vdev, const char *name, int nr)
+{
+    g_autofree char *fdname = STRDUP_VECTOR_FD_NAME(vdev, name);
+    cpr_delete_fd(fdname, nr);
+}
+
+static void vfio_cpr_claim_vectors(VFIOPCIDevice *vdev, int nr_vectors,
+                                   bool msix)
+{
+    int i, fd;
+    bool pending = false;
+    PCIDevice *pdev = &vdev->pdev;
+
+    vdev->nr_vectors = nr_vectors;
+    vdev->msi_vectors = g_new0(VFIOMSIVector, nr_vectors);
+    vdev->interrupt = msix ? VFIO_INT_MSIX : VFIO_INT_MSI;
+
+    vfio_pci_prepare_kvm_msi_virq_batch(vdev);
+
+    for (i = 0; i < nr_vectors; i++) {
+        VFIOMSIVector *vector = &vdev->msi_vectors[i];
+
+        fd = vfio_cpr_load_vector_fd(vdev, "interrupt", i);
+        if (fd >= 0) {
+            vfio_pci_vector_init(vdev, i);
+            vfio_pci_msi_set_handler(vdev, i);
+        }
+
+        if (vfio_cpr_load_vector_fd(vdev, "kvm_interrupt", i) >= 0) {
+            vfio_pci_add_kvm_msi_virq(vdev, vector, i, msix);
+        } else {
+            vdev->msi_vectors[i].virq = -1;
+        }
+
+        if (msix && msix_is_pending(pdev, i) && msix_is_masked(pdev, i)) {
+            set_bit(i, vdev->msix->pending);
+            pending = true;
+        }
+    }
+
+    vfio_pci_commit_kvm_msi_virq_batch(vdev);
+
+    if (msix) {
+        memory_region_set_enabled(&pdev->msix_pba_mmio, pending);
+    }
+}
+
  /*
   * The kernel may change non-emulated config bits.  Exclude them from the
   * changed-bits check in get_pci_config_device.
@@ -58,13 +123,45 @@ static int vfio_cpr_pci_pre_load(void *opaque)
      return 0;
  }
+static int vfio_cpr_pci_post_load(void *opaque, int version_id)
+{
+    VFIOPCIDevice *vdev = opaque;
+    PCIDevice *pdev = &vdev->pdev;
+    int nr_vectors;
+
+    if (msix_enabled(pdev)) {
+        vfio_pci_msix_set_notifiers(vdev);
+        nr_vectors = vdev->msix->entries;
+        vfio_cpr_claim_vectors(vdev, nr_vectors, true);
+
+    } else if (msi_enabled(pdev)) {
+        nr_vectors = msi_nr_vectors_allocated(pdev);
+        vfio_cpr_claim_vectors(vdev, nr_vectors, false);
+
+    } else if (vfio_pci_read_config(pdev, PCI_INTERRUPT_PIN, 1)) {
+        g_assert_not_reached();      /* completed in a subsequent patch */
+    }
+
+    return 0;
+}
+
+static bool pci_msix_present(void *opaque, int version_id)
+{
+    PCIDevice *pdev = opaque;
+
+    return msix_present(pdev);
+}
+
  const VMStateDescription vfio_cpr_pci_vmstate = {
      .name = "vfio-cpr-pci",
      .version_id = 0,
      .minimum_version_id = 0,
      .pre_load = vfio_cpr_pci_pre_load,
+    .post_load = vfio_cpr_pci_post_load,
      .needed = cpr_incoming_needed,
      .fields = (VMStateField[]) {
+        VMSTATE_PCI_DEVICE(pdev, VFIOPCIDevice),
+        VMSTATE_MSIX_TEST(pdev, VFIOPCIDevice, pci_msix_present),
          VMSTATE_END_OF_LIST()
      }
  };
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 4cda6dc..b3dbb84 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -29,6 +29,7 @@
  #include "hw/pci/pci_bridge.h"
  #include "hw/qdev-properties.h"
  #include "hw/qdev-properties-system.h"
+#include "hw/vfio/vfio-cpr.h"
  #include "migration/vmstate.h"
  #include "migration/cpr.h"
  #include "qobject/qdict.h"
@@ -57,13 +58,25 @@ static void vfio_disable_interrupts(VFIOPCIDevice *vdev);
  static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);
  static void vfio_msi_disable_common(VFIOPCIDevice *vdev);
+/* Create new or reuse existing eventfd */
  static bool vfio_notifier_init(VFIOPCIDevice *vdev, EventNotifier *e,
                                 const char *name, int nr, Error **errp)
  {
-    int ret = event_notifier_init(e, 0);
+    int fd = vfio_cpr_load_vector_fd(vdev, name, nr);
+    int ret = 0;
-    if (ret) {
-        error_setg_errno(errp, -ret, "vfio_notifier_init %s failed", name);
+    if (fd >= 0) {
+        event_notifier_init_fd(e, fd);
+    } else {
+        ret = event_notifier_init(e, 0);
+        if (ret) {
+            error_setg_errno(errp, -ret, "vfio_notifier_init %s failed", name);
+        } else {
+            fd = event_notifier_get_fd(e);
+            if (fd >= 0) {
+                vfio_cpr_save_vector_fd(vdev, name, nr, fd);
+            }
+        }
      }
      return !ret;
  }
@@ -71,6 +84,7 @@ static bool vfio_notifier_init(VFIOPCIDevice *vdev, EventNotifier *e,
  static void vfio_notifier_cleanup(VFIOPCIDevice *vdev, EventNotifier *e,
                                    const char *name, int nr)
  {
+    vfio_cpr_delete_vector_fd(vdev, name, nr);
      event_notifier_cleanup(e);
  }
@@ -394,6 +408,14 @@ static void vfio_msi_interrupt(void *opaque)
      notify(&vdev->pdev, nr);
  }
+void vfio_pci_msi_set_handler(VFIOPCIDevice *vdev, int nr)
+{
+    VFIOMSIVector *vector = &vdev->msi_vectors[nr];
+    int fd = event_notifier_get_fd(&vector->interrupt);
+
+    qemu_set_fd_handler(fd, vfio_msi_interrupt, NULL, vector);
+}
+
  /*
   * Get MSI-X enabled, but no vector enabled, by setting vector 0 with an invalid
   * fd to kernel.
@@ -580,6 +602,15 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
      int ret;
      bool resizing = !!(vdev->nr_vectors < nr + 1);
+    /*
+     * Ignore the callback from msix_set_vector_notifiers during resume.
+     * The necessary subset of these actions is called from
+     * vfio_cpr_claim_vectors during post load.
+     */
+    if (cpr_is_incoming()) {
+        return 0;
+    }
+
      trace_vfio_msix_vector_do_use(vdev->vbasedev.name, nr);
      vector = &vdev->msi_vectors[nr];
@@ -686,6 +717,12 @@ static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int nr)
      }
  }
+void vfio_pci_msix_set_notifiers(VFIOPCIDevice *vdev)
+{
+    msix_set_vector_notifiers(&vdev->pdev, vfio_msix_vector_use,
+                              vfio_msix_vector_release, NULL);
+}
+
  void vfio_pci_prepare_kvm_msi_virq_batch(VFIOPCIDevice *vdev)
  {
      assert(!vdev->defer_kvm_irq_routing);
@@ -2962,6 +2999,11 @@ static void vfio_register_err_notifier(VFIOPCIDevice *vdev)
      fd = event_notifier_get_fd(&vdev->err_notifier);
      qemu_set_fd_handler(fd, vfio_err_notifier_handler, NULL, vdev);
+    /* Do not alter irq_signaling during vfio_realize for cpr */
+    if (cpr_is_incoming()) {
+        return;
+    }
+
      if (!vfio_device_irq_set_signaling(&vdev->vbasedev, 
VFIO_PCI_ERR_IRQ_INDEX, 0,
                                         VFIO_IRQ_SET_ACTION_TRIGGER, fd, 
&err)) {
          error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
@@ -3029,6 +3071,12 @@ static void vfio_register_req_notifier(VFIOPCIDevice *vdev)
      fd = event_notifier_get_fd(&vdev->req_notifier);
      qemu_set_fd_handler(fd, vfio_req_notifier_handler, NULL, vdev);
+    /* Do not alter irq_signaling during vfio_realize for cpr */
+    if (cpr_is_incoming()) {
+        vdev->req_enabled = true;
+        return;
+    }
+
      if (!vfio_device_irq_set_signaling(&vdev->vbasedev, 
VFIO_PCI_REQ_IRQ_INDEX, 0,
                                         VFIO_IRQ_SET_ACTION_TRIGGER, fd, 
&err)) {
          error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);




Reply via email to