On Wed, 15 Jun 2022 07:52:16 -0700 Steve Sistare <steven.sist...@oracle.com> wrote:
> Preserve vfio INTX state across cpr restart. Preserve VFIOINTx fields as > follows: > pin : Recover this from the vfio config in kernel space > interrupt : Preserve its eventfd descriptor across exec. > unmask : Ditto > route.irq : This could perhaps be recovered in vfio_pci_post_load by > calling pci_device_route_intx_to_irq(pin), whose implementation reads > config space for a bridge device such as ich9. However, there is no > guarantee that the bridge vmstate is read before vfio vmstate. Rather > than fiddling with MigrationPriority for vmstate handlers, explicitly > save route.irq in vfio vmstate. > pending : save in vfio vmstate. > mmap_timeout, mmap_timer : Re-initialize > bool kvm_accel : Re-initialize > > In vfio_realize, defer calling vfio_intx_enable until the vmstate > is available, in vfio_pci_post_load. Modify vfio_intx_enable and > vfio_intx_kvm_enable to skip vfio initialization, but still perform > kvm initialization. > > Signed-off-by: Steve Sistare <steven.sist...@oracle.com> > --- > hw/vfio/pci.c | 92 > +++++++++++++++++++++++++++++++++++++++++++++++++++++------ > 1 file changed, 83 insertions(+), 9 deletions(-) > > diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c > index 2fd7121..b8aee91 100644 > --- a/hw/vfio/pci.c > +++ b/hw/vfio/pci.c > @@ -173,14 +173,45 @@ static void vfio_intx_eoi(VFIODevice *vbasedev) > vfio_unmask_single_irqindex(vbasedev, VFIO_PCI_INTX_IRQ_INDEX); > } > > +#ifdef CONFIG_KVM > +static bool vfio_no_kvm_intx(VFIOPCIDevice *vdev) > +{ > + return vdev->no_kvm_intx || !kvm_irqfds_enabled() || > + vdev->intx.route.mode != PCI_INTX_ENABLED || > + !kvm_resamplefds_enabled(); > +} > +#endif > + > +static void vfio_intx_reenable_kvm(VFIOPCIDevice *vdev, Error **errp) > +{ > +#ifdef CONFIG_KVM > + if (vfio_no_kvm_intx(vdev)) { > + return; > + } > + > + if (vfio_notifier_init(vdev, &vdev->intx.unmask, "intx-unmask", 0)) { > + error_setg(errp, "vfio_notifier_init intx-unmask failed"); > + return; > + } > + > + if (kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, > + &vdev->intx.interrupt, > + &vdev->intx.unmask, > + vdev->intx.route.irq)) { > + error_setg_errno(errp, errno, "failed to setup resample irqfd"); Does not unwind with vfio_notifier_cleanup(). This also exactly duplicates code in vfio_intx_enable_kvm(), which suggests it needs further refactoring to a common helper. > + return; > + } > + > + vdev->intx.kvm_accel = true; > +#endif > +} > + > static void vfio_intx_enable_kvm(VFIOPCIDevice *vdev, Error **errp) > { > #ifdef CONFIG_KVM > int irq_fd = event_notifier_get_fd(&vdev->intx.interrupt); > > - if (vdev->no_kvm_intx || !kvm_irqfds_enabled() || > - vdev->intx.route.mode != PCI_INTX_ENABLED || > - !kvm_resamplefds_enabled()) { > + if (vfio_no_kvm_intx(vdev)) { > return; > } > > @@ -328,7 +359,13 @@ static int vfio_intx_enable(VFIOPCIDevice *vdev, Error > **errp) > return 0; > } > > - vfio_disable_interrupts(vdev); > + /* > + * Do not alter interrupt state during vfio_realize and cpr-load. The > + * reused flag is cleared thereafter. > + */ > + if (!vdev->vbasedev.reused) { > + vfio_disable_interrupts(vdev); > + } > > vdev->intx.pin = pin - 1; /* Pin A (1) -> irq[0] */ > pci_config_set_interrupt_pin(vdev->pdev.config, pin); > @@ -353,6 +390,11 @@ static int vfio_intx_enable(VFIOPCIDevice *vdev, Error > **errp) > fd = event_notifier_get_fd(&vdev->intx.interrupt); > qemu_set_fd_handler(fd, vfio_intx_interrupt, NULL, vdev); > > + if (vdev->vbasedev.reused) { > + vfio_intx_reenable_kvm(vdev, &err); > + goto finish; > + } > + This only jumps over the vfio_set_irq_signaling() and vfio_intx_enable_kvm(), largely replacing the latter with chunks of code taken from it. Doesn't seem like the right factoring. > if (vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX, 0, > VFIO_IRQ_SET_ACTION_TRIGGER, fd, errp)) { > qemu_set_fd_handler(fd, NULL, NULL, vdev); > @@ -365,6 +407,7 @@ static int vfio_intx_enable(VFIOPCIDevice *vdev, Error > **errp) > warn_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name); > } > > +finish: > vdev->interrupt = VFIO_INT_INTx; > > trace_vfio_intx_enable(vdev->vbasedev.name); > @@ -3195,9 +3238,13 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) > vfio_intx_routing_notifier); > vdev->irqchip_change_notifier.notify = vfio_irqchip_change; > kvm_irqchip_add_change_notifier(&vdev->irqchip_change_notifier); > - ret = vfio_intx_enable(vdev, errp); > - if (ret) { > - goto out_deregister; > + > + /* Wait until cpr-load reads intx routing data to enable */ > + if (!vdev->vbasedev.reused) { > + ret = vfio_intx_enable(vdev, errp); > + if (ret) { > + goto out_deregister; > + } > } > } > > @@ -3474,6 +3521,7 @@ static int vfio_pci_post_load(void *opaque, int > version_id) > VFIOPCIDevice *vdev = opaque; > PCIDevice *pdev = &vdev->pdev; > int nr_vectors; > + int ret = 0; > > if (msix_enabled(pdev)) { > msix_set_vector_notifiers(pdev, vfio_msix_vector_use, > @@ -3486,10 +3534,35 @@ static int vfio_pci_post_load(void *opaque, int > version_id) > vfio_claim_vectors(vdev, nr_vectors, false); > > } else if (vfio_pci_read_config(pdev, PCI_INTERRUPT_PIN, 1)) { > - assert(0); /* completed in a subsequent patch */ > + Error *err = 0; > + ret = vfio_intx_enable(vdev, &err); > + if (ret) { > + error_report_err(err); > + } > } > > - return 0; > + return ret; > +} > + > +static const VMStateDescription vfio_intx_vmstate = { > + .name = "vfio-intx", > + .unmigratable = 1, unmigratable-vmstates-to-interfere-with-migration++ Thanks, Alex > + .version_id = 0, > + .minimum_version_id = 0, > + .fields = (VMStateField[]) { > + VMSTATE_BOOL(pending, VFIOINTx), > + VMSTATE_UINT32(route.mode, VFIOINTx), > + VMSTATE_INT32(route.irq, VFIOINTx), > + VMSTATE_END_OF_LIST() > + } > +}; > + > +#define VMSTATE_VFIO_INTX(_field, _state) { \ > + .name = (stringify(_field)), \ > + .size = sizeof(VFIOINTx), \ > + .vmsd = &vfio_intx_vmstate, \ > + .flags = VMS_STRUCT, \ > + .offset = vmstate_offset_value(_state, _field, VFIOINTx), \ > } > > static bool vfio_pci_needed(void *opaque) > @@ -3509,6 +3582,7 @@ static const VMStateDescription vfio_pci_vmstate = { > .fields = (VMStateField[]) { > VMSTATE_PCI_DEVICE(pdev, VFIOPCIDevice), > VMSTATE_MSIX_TEST(pdev, VFIOPCIDevice, vfio_msix_present), > + VMSTATE_VFIO_INTX(intx, VFIOPCIDevice), > VMSTATE_END_OF_LIST() > } > };