sPAPR IOMMU manages two copies of a TCE table: 1) a guest view of the table - this is what emulated devices use and this is where H_GET_TCE reads from; 2) a hardware TCE table - only present if there is at least one vfio-pci device on a PHB; it is updated via a memory listener on a PHB address space which forwards map/unmap requests to the vfio-pci IOMMU host driver.
At the moment the presence of vfio-pci devices on a bus affects the way the guest view table is allocated. If there is no vfio-pci device on a PHB and the host kernel supports KVM acceleration of H_PUT_TCE, the table is allocated in KVM. However, if there is a vfio-pci device and we do not yet support KVM acceleration for these, the table has to be allocated by userspace. When a vfio-pci device is hotplugged and there were no vfio-pci devices on the PHB before, the guest view table could have been allocated by KVM, which means that H_PUT_TCE is handled by the host kernel and, since we do not support vfio-pci in KVM, the hardware table will not be updated. This reallocates the guest view table in QEMU if the first vfio-pci device has just been plugged; spapr_tce_realloc_userspace() handles this. This replays all the mappings to make sure that the tables are in sync. This will not have a visible effect, though, because for a new device the guest kernel will allocate and map new addresses, and therefore existing mappings from emulated devices will not be used by vfio-pci devices. This adds calls to spapr_phb_dma_capabilities_update() in PCI hotplug hooks.
Signed-off-by: Alexey Kardashevskiy <a...@ozlabs.ru> --- hw/ppc/spapr_iommu.c | 50 +++++++++++++++++++++++++++++++++++++++++++++++--- hw/ppc/spapr_pci.c | 43 +++++++++++++++++++++++++++++++++++++++++++ include/hw/ppc/spapr.h | 2 ++ trace-events | 2 ++ 4 files changed, 94 insertions(+), 3 deletions(-) diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c index 45c00d8..5e6bdb4 100644 --- a/hw/ppc/spapr_iommu.c +++ b/hw/ppc/spapr_iommu.c @@ -78,12 +78,13 @@ static uint64_t *spapr_tce_alloc_table(uint32_t liobn, uint32_t nb_table, uint32_t page_shift, int *fd, - bool vfio_accel) + bool vfio_accel, + bool force_userspace) { uint64_t *table = NULL; uint64_t window_size = (uint64_t)nb_table << page_shift; - if (kvm_enabled() && !(window_size >> 32)) { + if (kvm_enabled() && !force_userspace && !(window_size >> 32)) { table = kvmppc_create_spapr_tce(liobn, window_size, fd, vfio_accel); } @@ -222,7 +223,8 @@ static void spapr_tce_table_do_enable(sPAPRTCETable *tcet, bool vfio_accel) tcet->nb_table, tcet->page_shift, &tcet->fd, - vfio_accel); + vfio_accel, + false); memory_region_set_size(&tcet->iommu, (uint64_t)tcet->nb_table << tcet->page_shift); @@ -495,6 +497,48 @@ int spapr_dma_dt(void *fdt, int node_off, const char *propname, return 0; } +static int spapr_tce_do_replay(sPAPRTCETable *tcet, uint64_t *table) +{ + target_ulong ioba = tcet->bus_offset, pgsz = (1ULL << tcet->page_shift); + long i, ret = 0; + + for (i = 0; i < tcet->nb_table; ++i, ioba += pgsz) { + ret = put_tce_emu(tcet, ioba, table[i]); + if (ret) + break; + } + + return ret; +} + +int spapr_tce_replay(sPAPRTCETable *tcet) +{ + return spapr_tce_do_replay(tcet, tcet->table); +} + +int spapr_tce_realloc_userspace(sPAPRTCETable *tcet, bool replay) +{ + int ret = 0, oldfd; + uint64_t *oldtable; + + oldtable = tcet->table; + oldfd = tcet->fd; + tcet->table = spapr_tce_alloc_table(tcet->liobn, + tcet->nb_table, + tcet->page_shift, + &tcet->fd, + false, + true); /* force_userspace */ + + if (replay) { + ret = 
spapr_tce_do_replay(tcet, oldtable); + } + + spapr_tce_free_table(oldtable, oldfd, tcet->nb_table); + + return ret; +} + int spapr_tcet_dma_dt(void *fdt, int node_off, const char *propname, sPAPRTCETable *tcet) { diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c index ca3772e..1f980fa 100644 --- a/hw/ppc/spapr_pci.c +++ b/hw/ppc/spapr_pci.c @@ -716,6 +716,33 @@ static AddressSpace *spapr_pci_dma_iommu(PCIBus *bus, void *opaque, int devfn) return &phb->iommu_as; } +static int spapr_phb_dma_update(Object *child, void *opaque) +{ + int ret = 0; + sPAPRTCETable *tcet = (sPAPRTCETable *) + object_dynamic_cast(child, TYPE_SPAPR_TCE_TABLE); + + if (!tcet) { + return 0; + } + + if (tcet->fd >= 0) { + /* + * We got first vfio-pci device on accelerated table. + * VFIO acceleration is not possible. + * Reallocate table in userspace and replay mappings. + */ + ret = spapr_tce_realloc_userspace(tcet, true); + trace_spapr_pci_dma_realloc_update(tcet->liobn, ret); + } else { + /* There was no acceleration, so just replay mappings. 
*/ + ret = spapr_tce_replay(tcet); + trace_spapr_pci_dma_update(tcet->liobn, ret); + } + + return 0; +} + static int spapr_phb_dma_capabilities_update(sPAPRPHBState *sphb) { int ret; @@ -776,6 +803,20 @@ int spapr_phb_dma_reset(sPAPRPHBState *sphb) return 0; } +static int spapr_phb_hotplug_dma_sync(sPAPRPHBState *sphb) +{ + int ret = 0; + bool had_vfio = sphb->has_vfio; + + spapr_phb_dma_capabilities_update(sphb); + + if (!had_vfio && sphb->has_vfio) { + object_child_foreach(OBJECT(sphb), spapr_phb_dma_update, NULL); + } + + return ret; +} + /* Macros to operate with address in OF binding to PCI */ #define b_x(x, p, l) (((x) & ((1<<(l))-1)) << (p)) #define b_n(x) b_x((x), 31, 1) /* 0 if relocatable */ @@ -1042,6 +1083,7 @@ static void spapr_phb_add_pci_device(sPAPRDRConnector *drc, if (dev->hotplugged) { fdt = spapr_create_pci_child_dt(phb, pdev, drc_index, drc_name, &fdt_start_offset); + spapr_phb_hotplug_dma_sync(phb); } drck->attach(drc, DEVICE(pdev), @@ -1065,6 +1107,7 @@ static void spapr_phb_remove_pci_device_cb(DeviceState *dev, void *opaque) */ pci_device_reset(PCI_DEVICE(dev)); object_unparent(OBJECT(dev)); + spapr_phb_hotplug_dma_sync((sPAPRPHBState *)opaque); } static void spapr_phb_remove_pci_device(sPAPRDRConnector *drc, diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h index e32e787..4645f16 100644 --- a/include/hw/ppc/spapr.h +++ b/include/hw/ppc/spapr.h @@ -588,6 +588,8 @@ int spapr_dma_dt(void *fdt, int node_off, const char *propname, uint32_t liobn, uint64_t window, uint32_t size); int spapr_tcet_dma_dt(void *fdt, int node_off, const char *propname, sPAPRTCETable *tcet); +int spapr_tce_replay(sPAPRTCETable *tcet); +int spapr_tce_realloc_userspace(sPAPRTCETable *tcet, bool replay); void spapr_pci_switch_vga(bool big_endian); void spapr_hotplug_req_add_event(sPAPRDRConnector *drc); void spapr_hotplug_req_remove_event(sPAPRDRConnector *drc); diff --git a/trace-events b/trace-events index a93af9a..3cd8bf7 100644 --- a/trace-events +++ 
b/trace-events @@ -1300,6 +1300,8 @@ spapr_pci_rtas_ibm_query_interrupt_source_number(unsigned ioa, unsigned intr) "q spapr_pci_msi_write(uint64_t addr, uint64_t data, uint32_t dt_irq) "@%"PRIx64"<=%"PRIx64" IRQ %u" spapr_pci_lsi_set(const char *busname, int pin, uint32_t irq) "%s PIN%d IRQ %u" spapr_pci_msi_retry(unsigned config_addr, unsigned req_num, unsigned max_irqs) "Guest device at %x asked %u, have only %u" +spapr_pci_dma_update(uint64_t liobn, long ret) "liobn=%"PRIx64" tcet=%ld" +spapr_pci_dma_realloc_update(uint64_t liobn, long ret) "liobn=%"PRIx64" tcet=%ld" # hw/pci/pci.c pci_update_mappings_del(void *d, uint32_t bus, uint32_t func, uint32_t slot, int bar, uint64_t addr, uint64_t size) "d=%p %02x:%02x.%x %d,%#"PRIx64"+%#"PRIx64 -- 2.4.0.rc3.8.gfb3e7d5