>-----Original Message-----
>From: Steve Sistare <steven.sist...@oracle.com>
>Subject: [PATCH V2 2/2] vfio/pci: preserve pending interrupts
>
>cpr-transfer may lose a VFIO interrupt because the KVM instance is
>destroyed and recreated. If an interrupt arrives in the middle, it is
>dropped. To fix, stop pending new interrupts during cpr save, and pick
>up the pieces. In more detail:
>
>Stop the VCPUs. Call kvm_irqchip_remove_irqfd_notifier_gsi --> KVM_IRQFD to
>deassign the irqfd gsi that routes interrupts directly to the VCPU and KVM.
>After this call, interrupts fall back to the kernel vfio_msihandler, which
>writes to QEMU's kvm_interrupt eventfd. CPR already preserves that
>eventfd. When the route is re-established in new QEMU, the kernel tests
>the eventfd and injects an interrupt to KVM if necessary.
With this patch, the producer is detached from the KVM consumer. Do we still
need to close the KVM fd on the source QEMU?
Zhenzhong
>
>Deassign INTx in a similar manner. For both MSI and INTx, remove the
>eventfd handler so old QEMU does not consume an event.
>
>If an interrupt was already pended to KVM prior to the completion of
>kvm_irqchip_remove_irqfd_notifier_gsi, it will be recovered by the
>subsequent call to cpu_synchronize_all_states, which pulls KVM interrupt
>state to userland prior to saving it in vmstate.
>
>Signed-off-by: Steve Sistare <steven.sist...@oracle.com>
>---
> hw/vfio/cpr.c | 91 ++++++++++++++++++++++++++++++++++++++
> hw/vfio/pci.c | 2 +
> hw/vfio/pci.h | 1 +
> include/hw/vfio/vfio-cpr.h | 6 +++
> 4 files changed, 100 insertions(+)
>
>diff --git a/hw/vfio/cpr.c b/hw/vfio/cpr.c
>index 2a244fc4b6..bca74ea20a 100644
>--- a/hw/vfio/cpr.c
>+++ b/hw/vfio/cpr.c
>@@ -198,3 +198,94 @@ void vfio_cpr_add_kvm_notifier(void)
> MIG_MODE_CPR_TRANSFER);
> }
> }
>+
>+static int set_irqfd_notifier_gsi(KVMState *s, EventNotifier *n,
>+ EventNotifier *rn, int virq, bool
>enable)
>+{
>+ if (enable) {
>+ return kvm_irqchip_add_irqfd_notifier_gsi(s, n, rn, virq);
>+ } else {
>+ return kvm_irqchip_remove_irqfd_notifier_gsi(s, n, virq);
>+ }
>+}
>+
>+static int vfio_cpr_set_msi_virq(VFIOPCIDevice *vdev, Error **errp, bool
>enable)
>+{
>+ const char *op = (enable ? "enable" : "disable");
>+ PCIDevice *pdev = &vdev->pdev;
>+ int i, nr_vectors, ret = 0;
>+
>+ if (msix_enabled(pdev)) {
>+ nr_vectors = vdev->msix->entries;
>+
>+ } else if (msi_enabled(pdev)) {
>+ nr_vectors = msi_nr_vectors_allocated(pdev);
>+
>+ } else if (vfio_pci_read_config(pdev, PCI_INTERRUPT_PIN, 1)) {
>+ ret = set_irqfd_notifier_gsi(kvm_state, &vdev->intx.interrupt,
>+ &vdev->intx.unmask,
>vdev->intx.route.irq,
>+ enable);
>+ if (ret) {
>+ error_setg_errno(errp, -ret, "failed to %s INTx irq %d",
>+ op, vdev->intx.route.irq);
>+ return ret;
>+ }
>+ vfio_pci_intx_set_handler(vdev, enable);
>+ return ret;
>+
>+ } else {
>+ return 0;
>+ }
>+
>+ for (i = 0; i < nr_vectors; i++) {
>+ VFIOMSIVector *vector = &vdev->msi_vectors[i];
>+ if (vector->use) {
>+ ret = set_irqfd_notifier_gsi(kvm_state,
>&vector->kvm_interrupt,
>+ NULL, vector->virq,
>enable);
>+ if (ret) {
>+ error_setg_errno(errp, -ret,
>+ "failed to %s msi vector %d
>virq %d",
>+ op, i, vector->virq);
>+ return ret;
>+ }
>+ vfio_pci_msi_set_handler(vdev, i, enable);
>+ }
>+ }
>+
>+ return ret;
>+}
>+
>+/*
>+ * When CPR starts, detach IRQs from the VFIO device so future interrupts
>+ * are posted to kvm_interrupt, which is preserved in new QEMU. Interrupts
>+ * that were already posted to the old KVM instance, but not delivered to the
>+ * VCPU, are recovered via KVM_GET_LAPIC and pushed to the new KVM instance
>+ * in new QEMU.
>+ *
>+ * If CPR fails, reattach the IRQs.
>+ */
>+static int vfio_cpr_pci_notifier(NotifierWithReturn *notifier,
>+ MigrationEvent *e, Error **errp)
>+{
>+ VFIOPCIDevice *vdev =
>+ container_of(notifier, VFIOPCIDevice, cpr.transfer_notifier);
>+
>+ if (e->type == MIG_EVENT_PRECOPY_SETUP) {
>+ return vfio_cpr_set_msi_virq(vdev, errp, false);
>+ } else if (e->type == MIG_EVENT_PRECOPY_FAILED) {
>+ return vfio_cpr_set_msi_virq(vdev, errp, true);
>+ }
>+ return 0;
>+}
>+
>+void vfio_cpr_pci_register_device(VFIOPCIDevice *vdev)
>+{
>+ migration_add_notifier_mode(&vdev->cpr.transfer_notifier,
>+ vfio_cpr_pci_notifier,
>+ MIG_MODE_CPR_TRANSFER);
>+}
>+
>+void vfio_cpr_pci_unregister_device(VFIOPCIDevice *vdev)
>+{
>+ migration_remove_notifier(&vdev->cpr.transfer_notifier);
>+}
>diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>index 8b471c054a..22a4125131 100644
>--- a/hw/vfio/pci.c
>+++ b/hw/vfio/pci.c
>@@ -2993,6 +2993,7 @@ void vfio_pci_put_device(VFIOPCIDevice *vdev)
> {
> vfio_display_finalize(vdev);
> vfio_bars_finalize(vdev);
>+ vfio_cpr_pci_unregister_device(vdev);
> g_free(vdev->emulated_config_bits);
> g_free(vdev->rom);
> /*
>@@ -3442,6 +3443,7 @@ static void vfio_pci_realize(PCIDevice *pdev, Error
>**errp)
> vfio_pci_register_err_notifier(vdev);
> vfio_pci_register_req_notifier(vdev);
> vfio_setup_resetfn_quirk(vdev);
>+ vfio_cpr_pci_register_device(vdev);
>
> return;
>
>diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
>index 80c8fcfa07..7989b94eb3 100644
>--- a/hw/vfio/pci.h
>+++ b/hw/vfio/pci.h
>@@ -194,6 +194,7 @@ struct VFIOPCIDevice {
> bool skip_vsc_check;
> VFIODisplay *dpy;
> Notifier irqchip_change_notifier;
>+ VFIOPCICPR cpr;
> };
>
> /* Use uin32_t for vendor & device so PCI_ANY_ID expands and cannot
>match hw */
>diff --git a/include/hw/vfio/vfio-cpr.h b/include/hw/vfio/vfio-cpr.h
>index 80ad20d216..d37daffbc5 100644
>--- a/include/hw/vfio/vfio-cpr.h
>+++ b/include/hw/vfio/vfio-cpr.h
>@@ -38,6 +38,10 @@ typedef struct VFIODeviceCPR {
> uint32_t ioas_id;
> } VFIODeviceCPR;
>
>+typedef struct VFIOPCICPR {
>+ NotifierWithReturn transfer_notifier;
>+} VFIOPCICPR;
>+
> bool vfio_legacy_cpr_register_container(struct VFIOContainer *container,
> Error **errp);
> void vfio_legacy_cpr_unregister_container(struct VFIOContainer
>*container);
>@@ -77,5 +81,7 @@ extern const VMStateDescription vfio_cpr_pci_vmstate;
> extern const VMStateDescription vmstate_cpr_vfio_devices;
>
> void vfio_cpr_add_kvm_notifier(void);
>+void vfio_cpr_pci_register_device(struct VFIOPCIDevice *vdev);
>+void vfio_cpr_pci_unregister_device(struct VFIOPCIDevice *vdev);
>
> #endif /* HW_VFIO_VFIO_CPR_H */
>--
>2.39.3