On Thu, 9 Mar 2017 10:32:44 +0800 Jason Wang <jasow...@redhat.com> wrote:
> On 2017年03月09日 00:40, Igor Mammedov wrote: > > On Tue, 7 Mar 2017 14:47:30 +0200 > > Marcel Apfelbaum<mar...@redhat.com> wrote: > > > >> On 03/07/2017 11:09 AM, Jason Wang wrote: > >>> After commit 96a8821d2141 ("virtio: unbreak virtio-pci with IOMMU > >>> after caching ring translations"), IOMMU was required to be created in > >>> advance. This is because we can only get the correct dma_as after pci > >>> IOMMU (e.g intel_iommu) was initialized. This is suboptimal and > >>> inconvenient for user. This patch releases this by: > >>> > >>> - introduce a bus_master_ready method for PCIDeviceClass and trigger > >>> this during pci_init_bus_master > >>> - implement virtio-pci method and 1) reset the dma_as 2) re-register > >>> the memory listener to the new dma_as Instead of trying to fix up later it's possible to refuse adding iommu device if other devices has been added before it with -device/device_add. That would match current CLI semantics where device that others depend on should be listed on CLI before that others are listed. A bit hackish but something along this lines: diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h index 9349acb..9d8ecc6 100644 --- a/include/hw/pci/pci.h +++ b/include/hw/pci/pci.h @@ -460,7 +460,7 @@ void pci_device_deassert_intx(PCIDevice *dev); typedef AddressSpace *(*PCIIOMMUFunc)(PCIBus *, void *, int); AddressSpace *pci_device_iommu_address_space(PCIDevice *dev); -void pci_setup_iommu(PCIBus *bus, PCIIOMMUFunc fn, void *opaque); +void pci_setup_iommu(PCIBus *bus, PCIIOMMUFunc fn, void *opaque, Error **errp); static inline void pci_set_byte(uint8_t *config, uint8_t val) diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c index e0732cc..9d9a76b 100644 --- a/hw/i386/amd_iommu.c +++ b/hw/i386/amd_iommu.c @@ -1160,7 +1160,7 @@ static void amdvi_realize(DeviceState *dev, Error **err) sysbus_init_mmio(SYS_BUS_DEVICE(s), &s->mmio); sysbus_mmio_map(SYS_BUS_DEVICE(s), 0, AMDVI_BASE_ADDR); - pci_setup_iommu(bus, amdvi_host_dma_iommu, s); + pci_setup_iommu(bus, amdvi_host_dma_iommu, s, err); s->devid = object_property_get_int(OBJECT(&s->pci), "addr", err); msi_init(&s->pci.dev, 0, 1, true, false, err); amdvi_init(s); diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index 22d8226..f4f208c 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -2585,7 +2585,7 @@ static void vtd_realize(DeviceState *dev, Error **errp) g_free, g_free); vtd_init(s); sysbus_mmio_map(SYS_BUS_DEVICE(s), 0, Q35_HOST_BRIDGE_IOMMU_ADDR); - pci_setup_iommu(bus, vtd_host_dma_iommu, dev); + pci_setup_iommu(bus, vtd_host_dma_iommu, dev, errp); /* Pseudo address space under root PCI bus. */ pcms->ioapic_as = vtd_host_dma_iommu(bus, s, Q35_PSEUDO_DEVFN_IOAPIC); } diff --git a/hw/pci/pci.c b/hw/pci/pci.c index 273f1e4..2d01589 100644 --- a/hw/pci/pci.c +++ b/hw/pci/pci.c @@ -317,6 +317,21 @@ static void pci_host_bus_register(DeviceState *host) QLIST_INSERT_HEAD(&pci_host_bridges, host_bridge, next); } +static bool pcibus_has_devices(PCIBus *bus) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(bus->devices); ++i) { + if (bus->devices[i]) { + DeviceState *dev = DEVICE(bus->devices[i]); + if (dev->opts) { + return true; + } + } + } + return false; +} + PCIBus *pci_find_primary_bus(void) { PCIBus *primary_bus = NULL; @@ -2534,8 +2549,12 @@ AddressSpace *pci_device_iommu_address_space(PCIDevice *dev) return &address_space_memory; } -void pci_setup_iommu(PCIBus *bus, PCIIOMMUFunc fn, void *opaque) +void pci_setup_iommu(PCIBus *bus, PCIIOMMUFunc fn, void *opaque, Error **errp) { + if (pcibus_has_devices(bus)) { + error_setg(errp, "IOMMU must be created before any other PCI devices"); + return; + } bus->iommu_fn = fn; bus->iommu_opaque = opaque; } > >>> > >> Hi Jason, > >> > >>> Cc: Paolo Bonzini<pbonz...@redhat.com> > >>> Signed-off-by: Jason Wang<jasow...@redhat.com> > >>> --- > >>> Changes from V2: > >>> - delay pci_init_bus_master() after pci device is realized to make > >>> bus_master_ready a more generic method > >>> --- > >>> hw/pci/pci.c | 11 ++++++++--- > >>> hw/virtio/virtio-pci.c | 9 +++++++++ > >>> hw/virtio/virtio.c | 19 +++++++++++++++++++ > >>> include/hw/pci/pci.h | 1 + > >>> include/hw/virtio/virtio.h | 1 + > >>> 5 files changed, 38 insertions(+), 3 deletions(-) > >>> > >>> diff --git a/hw/pci/pci.c b/hw/pci/pci.c > >>> index 273f1e4..22e6ad9 100644 > >>> --- a/hw/pci/pci.c > >>> +++ b/hw/pci/pci.c > >>> @@ -83,6 +83,7 @@ static const VMStateDescription vmstate_pcibus = { > >>> static void pci_init_bus_master(PCIDevice *pci_dev) > >>> { > >>> AddressSpace *dma_as = pci_device_iommu_address_space(pci_dev); > >>> + PCIDeviceClass *pc = PCI_DEVICE_GET_CLASS(pci_dev); > >>> > >>> memory_region_init_alias(&pci_dev->bus_master_enable_region, > >>> OBJECT(pci_dev), "bus master", > >>> @@ -90,6 +91,9 @@ static void pci_init_bus_master(PCIDevice *pci_dev) > >>> memory_region_set_enabled(&pci_dev->bus_master_enable_region, > >>> false); > >>> address_space_init(&pci_dev->bus_master_as, > >>> &pci_dev->bus_master_enable_region, > >>> pci_dev->name); > >>> + if (pc->bus_master_ready) { > >>> + pc->bus_master_ready(pci_dev); > >>> + } > >>> } > >>> > >>> static void pcibus_machine_done(Notifier *notifier, void *data) > >>> @@ -995,9 +999,6 @@ static PCIDevice *do_pci_register_device(PCIDevice > >>> *pci_dev, PCIBus *bus, > >>> pci_dev->devfn = devfn; > >>> pci_dev->requester_id_cache = pci_req_id_cache_get(pci_dev); > >>> > >>> - if (qdev_hotplug) { > >>> - pci_init_bus_master(pci_dev); > >>> - } > >>> pstrcpy(pci_dev->name, sizeof(pci_dev->name), name); > >>> pci_dev->irq_state = 0; > >>> pci_config_alloc(pci_dev); > >>> @@ -1995,6 +1996,10 @@ static void pci_qdev_realize(DeviceState *qdev, > >>> Error **errp) > >>> } > >>> } > >>> > >>> + if (qdev_hotplug) { > >>> + pci_init_bus_master(pci_dev); > >>> + } > >>> + > >> How does it help if we move qdev_hotplug check outside > >> "do_pci_register_device"? > >> I suppose you want the code to run after the "realize" function? > >> If so, what happens if a "realize" function of another device needs the > >> bus_master_as > >> (at hotplug time)? > > Conceptually, > > I'm not sure that inherited device class realize > > should be aware of uninitialized bus_master, > > which belongs to PCI device, nor should it initialize > > it by calling pci_init_bus_master() explicitly, > > it's parent's class job (PCIDevice). > > Yes, I was trying to propose a workaround for 2.9. I'm sure we will > refine the ordering in 2.10. And consider we have asked libvirt to > create IOMMU first, I think I will withdraw the patch. > > > > > more close to current code: > > if xen-pci-passthrough were hotplugged then it looks like > > this hunk could break it: > > hw/xen/xen_pt.c: > > memory_listener_register(&s->memory_listener, &s->dev.bus_master_as); > > would happen with uninitialized bus_master_as > > as realize is called before pci_init_bus_master(); > > Yes, this won't work. This is exactly the same issue as virtio, but this > will also break if it was created before an IOMMU. > > > > > So the same question as Marcel's but other way around, > > why this hunk "has to" be moved. > > > > > > Right. > > Thanks