The patch enables VFIO on POWER. It literally does the following:
1. POWERPC IOMMU support (the kernel counterpart is required) 2. Added #ifdef TARGET_PPC64 for EOI handlers initialisation. 3. Added vfio_get_container_fd() to VFIO in order to initialize 1). 4. Makefile fixed and "is_vfio" flag added into sPAPR PHB - required to distinguish VFIO's DMA context from the emulated one. WIth the pathes posted today a bit earlier, this patch fully supports VFIO what includes MSIX as well, Signed-off-by: Alexey Kardashevskiy <a...@ozlabs.ru> --- hw/ppc/Makefile.objs | 3 ++ hw/spapr.h | 4 +++ hw/spapr_iommu.c | 87 ++++++++++++++++++++++++++++++++++++++++++++++++++ hw/spapr_pci.c | 23 ++++++++++++- hw/spapr_pci.h | 2 ++ hw/vfio_pci.c | 76 +++++++++++++++++++++++++++++++++++++++++-- hw/vfio_pci.h | 2 ++ 7 files changed, 193 insertions(+), 4 deletions(-) diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs index f573a95..c46a049 100644 --- a/hw/ppc/Makefile.objs +++ b/hw/ppc/Makefile.objs @@ -25,4 +25,7 @@ obj-$(CONFIG_FDT) += ../device_tree.o # Xilinx PPC peripherals obj-y += xilinx_ethlite.o +# VFIO PCI device assignment +obj-$(CONFIG_VFIO_PCI) += vfio_pci.o + obj-y := $(addprefix ../,$(obj-y)) diff --git a/hw/spapr.h b/hw/spapr.h index b37f337..9dca704 100644 --- a/hw/spapr.h +++ b/hw/spapr.h @@ -340,4 +340,8 @@ int spapr_dma_dt(void *fdt, int node_off, const char *propname, int spapr_tcet_dma_dt(void *fdt, int node_off, const char *propname, DMAContext *dma); +void spapr_vfio_init_dma(int fd, uint32_t liobn, + uint64_t *dma32_window_start, + uint64_t *dma32_window_size); + #endif /* !defined (__HW_SPAPR_H__) */ diff --git a/hw/spapr_iommu.c b/hw/spapr_iommu.c index 50c288d..0a194e8 100644 --- a/hw/spapr_iommu.c +++ b/hw/spapr_iommu.c @@ -16,6 +16,8 @@ * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, see <http://www.gnu.org/licenses/>. */ +#include <sys/ioctl.h> + #include "hw.h" #include "kvm.h" #include "qdev.h" @@ -23,6 +25,7 @@ #include "dma.h" #include "hw/spapr.h" +#include "hw/linux-vfio.h" #include <libfdt.h> @@ -183,6 +186,86 @@ static int put_tce_emu(target_ulong liobn, target_ulong ioba, target_ulong tce) return 0; } +/* -------- API for POWERPC IOMMU -------- */ + +#define POWERPC_IOMMU 2 + +struct tce_iommu_info { + __u32 argsz; + __u32 dma32_window_start; + __u32 dma32_window_size; +}; + +#define POWERPC_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12) + +struct tce_iommu_dma_map { + __u32 argsz; + __u64 va; + __u64 dmaaddr; +}; + +#define POWERPC_IOMMU_MAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 13) +#define POWERPC_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14) + +typedef struct sPAPRVFIOTable { + int fd; + uint32_t liobn; + QLIST_ENTRY(sPAPRVFIOTable) list; +} sPAPRVFIOTable; + +QLIST_HEAD(vfio_tce_tables, sPAPRVFIOTable) vfio_tce_tables; + +void spapr_vfio_init_dma(int fd, uint32_t liobn, + uint64_t *dma32_window_start, + uint64_t *dma32_window_size) +{ + sPAPRVFIOTable *t; + struct tce_iommu_info info = { .argsz = sizeof(info) }; + + if (ioctl(fd, POWERPC_IOMMU_GET_INFO, &info)) { + fprintf(stderr, "POWERPC_IOMMU_GET_INFO failed %d\n", errno); + return; + } + *dma32_window_start = info.dma32_window_start; + *dma32_window_size = info.dma32_window_size; + + t = g_malloc0(sizeof(*t)); + t->fd = fd; + t->liobn = liobn; + + QLIST_INSERT_HEAD(&vfio_tce_tables, t, list); +} + +static int put_tce_vfio(uint32_t liobn, target_ulong ioba, target_ulong tce) +{ + sPAPRVFIOTable *t; + struct tce_iommu_dma_map map = { + .argsz = sizeof(map), + .va = 0, + .dmaaddr = ioba, + }; + + QLIST_FOREACH(t, &vfio_tce_tables, list) { + if (t->liobn != liobn) { + continue; + } + if (tce) { + map.va = (uintptr_t)qemu_get_ram_ptr(tce & ~SPAPR_TCE_PAGE_MASK); + if (ioctl(t->fd, POWERPC_IOMMU_MAP_DMA, &map)) { + fprintf(stderr, "TCE_MAP_DMA: %d\n", errno); + return H_PARAMETER; + } + } else { + if (ioctl(t->fd, POWERPC_IOMMU_UNMAP_DMA, &map)) { + fprintf(stderr, "TCE_UNMAP_DMA: %d\n", errno); + return H_PARAMETER; + } + } + return H_SUCCESS; + } + return H_CONTINUE; /* positive non-zero value */ +} + static target_ulong h_put_tce(CPUPPCState *env, sPAPREnvironment *spapr, target_ulong opcode, target_ulong *args) { @@ -203,6 +286,10 @@ static target_ulong h_put_tce(CPUPPCState *env, sPAPREnvironment *spapr, if (0 >= ret) { return ret ? H_PARAMETER : H_SUCCESS; } + ret = put_tce_vfio(liobn, ioba, tce); + if (0 >= ret) { + return ret ? H_PARAMETER : H_SUCCESS; + } #ifdef DEBUG_TCE fprintf(stderr, "%s on liobn=" TARGET_FMT_lx " ioba 0x" TARGET_FMT_lx " TCE 0x" TARGET_FMT_lx " ret=%d\n", diff --git a/hw/spapr_pci.c b/hw/spapr_pci.c index 5f89003..3375c3f 100644 --- a/hw/spapr_pci.c +++ b/hw/spapr_pci.c @@ -29,6 +29,7 @@ #include "pci_host.h" #include "hw/spapr.h" #include "hw/spapr_pci.h" +#include "hw/vfio_pci.h" #include "exec-memory.h" #include <libfdt.h> #include "trace.h" @@ -440,6 +441,12 @@ static void pci_spapr_set_irq(void *opaque, int irq_num, int level) level); } +static int pci_spapr_get_irq(void *opaque, int irq_num) +{ + sPAPRPHBState *phb = opaque; + return phb->lsi_table[irq_num].dt_irq; +} + static uint64_t spapr_io_read(void *opaque, target_phys_addr_t addr, unsigned size) { @@ -567,7 +574,8 @@ static int spapr_phb_init(SysBusDevice *s) bus = pci_register_bus(&phb->host_state.busdev.qdev, phb->busname ? phb->busname : phb->dtbusname, - pci_spapr_set_irq, NULL, pci_spapr_map_irq, phb, + pci_spapr_set_irq, pci_spapr_get_irq, + pci_spapr_map_irq, phb, &phb->memspace, &phb->iospace, PCI_DEVFN(0, 0), PCI_NUM_PINS); phb->host_state.bus = bus; @@ -596,6 +604,7 @@ static Property spapr_phb_properties[] = { DEFINE_PROP_HEX64("io_win_addr", sPAPRPHBState, io_win_addr, 0), DEFINE_PROP_HEX64("io_win_size", sPAPRPHBState, io_win_size, 0x10000), DEFINE_PROP_HEX64("msi_win_addr", sPAPRPHBState, msi_win_addr, 0), + DEFINE_PROP_UINT8("vfio", sPAPRPHBState, is_vfio, 0), DEFINE_PROP_END_OF_LIST(), }; @@ -639,6 +648,18 @@ void spapr_create_phb(sPAPREnvironment *spapr, /* Finalize PCI setup, called when all devices are already created */ int spapr_finalize_pci_setup(sPAPRPHBState *phb) { + if (phb->is_vfio) { + int fd = vfio_get_container_fd(phb->host_state.bus); + + if (fd < 0) { + return fd; + } + spapr_vfio_init_dma(fd, phb->dma_liobn, + &phb->dma_window_start, + &phb->dma_window_size); + return 0; + } + phb->dma_window_start = 0; phb->dma_window_size = 0x40000000; phb->dma = spapr_tce_new_dma_context(phb->dma_liobn, diff --git a/hw/spapr_pci.h b/hw/spapr_pci.h index 3aae273..a4f031b 100644 --- a/hw/spapr_pci.h +++ b/hw/spapr_pci.h @@ -57,6 +57,8 @@ typedef struct sPAPRPHBState { int nvec; } msi_table[SPAPR_MSIX_MAX_DEVS]; + uint8_t is_vfio; + QLIST_ENTRY(sPAPRPHBState) list; } sPAPRPHBState; diff --git a/hw/vfio_pci.c b/hw/vfio_pci.c index 1ac287f..cc0b974 100644 --- a/hw/vfio_pci.c +++ b/hw/vfio_pci.c @@ -21,7 +21,6 @@ #include <dirent.h> #include <stdio.h> #include <unistd.h> -#include <sys/io.h> #include <sys/ioctl.h> #include <sys/mman.h> #include <sys/types.h> @@ -44,6 +43,17 @@ #include "vfio_pci.h" #include "linux-vfio.h" +#ifndef TARGET_PPC64 +#include <sys/io.h> +#define VFIO_IOMMU_EXTENSION VFIO_X86_IOMMU +#else +#include "hw/pci_internals.h" +#include "hw/xics.h" +#include "hw/spapr.h" +#define POWERPC_IOMMU 2 +#define VFIO_IOMMU_EXTENSION POWERPC_IOMMU +#endif + //#define DEBUG_VFIO #ifdef DEBUG_VFIO #define DPRINTF(fmt, ...) \ @@ -235,6 +245,7 @@ struct vfio_irq_set_fd { static void vfio_enable_intx_kvm(VFIODevice *vdev) { +#ifndef TARGET_PPC64 #ifdef CONFIG_KVM struct vfio_irq_set_fd irq_set_fd = { .irq_set = { @@ -298,10 +309,12 @@ fail: qemu_set_fd_handler(irqfd.fd, vfio_intx_interrupt, NULL, vdev); vfio_unmask_intx(vdev); #endif +#endif } static void vfio_disable_intx_kvm(VFIODevice *vdev) { +#ifndef TARGET_PPC64 #ifdef CONFIG_KVM struct vfio_irq_set_fd irq_set_fd = { .irq_set = { @@ -350,8 +363,10 @@ static void vfio_disable_intx_kvm(VFIODevice *vdev) DPRINTF("%s(%04x:%02x:%02x.%x) KVM INTx accel disabled\n", __FUNCTION__, vdev->host.seg, vdev->host.bus, vdev->host.dev, vdev->host.func); #endif +#endif } +#ifndef TARGET_PPC64 static void vfio_update_irq(Notifier *notify, void *data) { VFIODevice *vdev = container_of(notify, VFIODevice, intx.update_irq); @@ -381,6 +396,7 @@ static void vfio_update_irq(Notifier *notify, void *data) /* Re-enable the interrupt in cased we missed an EOI */ vfio_eoi(&vdev->intx.eoi, NULL); } +#endif static int vfio_enable_intx(VFIODevice *vdev) { @@ -404,10 +420,14 @@ static int vfio_enable_intx(VFIODevice *vdev) vdev->intx.pin = pin - 1; /* Pin A (1) -> irq[0] */ vdev->intx.irq = pci_get_irq(&vdev->pdev, vdev->intx.pin); vdev->intx.eoi.notify = vfio_eoi; +#ifndef TARGET_PPC64 ioapic_add_gsi_eoi_notifier(&vdev->intx.eoi, vdev->intx.irq); vdev->intx.update_irq.notify = vfio_update_irq; pci_add_irq_update_notifier(&vdev->pdev, &vdev->intx.update_irq); +#else + xics_add_eoi_notifier(&vdev->intx.eoi, vdev->intx.irq); +#endif if (event_notifier_init(&vdev->intx.interrupt, 0)) { error_report("vfio: Error: event_notifier_init failed\n"); @@ -440,8 +460,12 @@ static void vfio_disable_intx(VFIODevice *vdev) vfio_disable_intx_kvm(vdev); vfio_disable_irqindex(vdev, VFIO_PCI_INTX_IRQ_INDEX); +#ifndef TARGET_PPC64 pci_remove_irq_update_notifier(&vdev->intx.update_irq); ioapic_remove_gsi_eoi_notifier(&vdev->intx.eoi, vdev->intx.irq); +#else + xics_remove_eoi_notifier(&vdev->intx.eoi); +#endif fd = event_notifier_get_fd(&vdev->intx.interrupt); qemu_set_fd_handler(fd, NULL, NULL, vdev); @@ -543,7 +567,7 @@ static int vfio_msix_vector_use(PCIDevice *pdev, } fd = event_notifier_get_fd(&vdev->msi_vectors[vector].interrupt); - +#ifndef TARGET_PPC64 vdev->msi_vectors[vector].virq = kvm_irqchip_add_msi_route(kvm_state, msg); if (vdev->msi_vectors[vector].virq < 0 || kvm_irqchip_add_irqfd(kvm_state, fd, @@ -551,7 +575,11 @@ static int vfio_msix_vector_use(PCIDevice *pdev, qemu_set_fd_handler(fd, vfio_msi_interrupt, NULL, &vdev->msi_vectors[vector]); } - +#else + vdev->msi_vectors[vector].virq = -1; + qemu_set_fd_handler(fd, vfio_msi_interrupt, NULL, + &vdev->msi_vectors[vector]); +#endif if (vdev->nr_vectors < vector + 1) { int i; @@ -692,6 +720,7 @@ retry: fd = event_notifier_get_fd(&vdev->msi_vectors[i].interrupt); msg = msi_get_msg(&vdev->pdev, i); +#ifndef TARGET_PPC64 vdev->msi_vectors[i].virq = kvm_irqchip_add_msi_route(kvm_state, msg); if (vdev->msi_vectors[i].virq < 0 || kvm_irqchip_add_irqfd(kvm_state, fd, @@ -699,6 +728,12 @@ retry: qemu_set_fd_handler(fd, vfio_msi_interrupt, NULL, &vdev->msi_vectors[i]); } +#else + vdev->msi_vectors[i].virq = -1; + qemu_set_fd_handler(fd, vfio_msi_interrupt, NULL, + &vdev->msi_vectors[i]); + msg = msg; +#endif } ret = vfio_enable_vectors(vdev, false); @@ -1581,6 +1616,25 @@ static int vfio_connect_container(VFIOGroup *group) memory_listener_register(&container->listener, get_system_memory()); +#define POWERPC_IOMMU 2 + } else if (ioctl(fd, VFIO_CHECK_EXTENSION, POWERPC_IOMMU)) { + ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd); + if (ret) { + error_report("vfio: failed to set group container: %s\n", + strerror(errno)); + g_free(container); + close(fd); + return -1; + } + + ret = ioctl(fd, VFIO_SET_IOMMU, POWERPC_IOMMU); + if (ret) { + error_report("vfio: failed to set iommu for container: %s\n", + strerror(errno)); + g_free(container); + close(fd); + return -1; + } } else { error_report("vfio: No available IOMMU models\n"); g_free(container); @@ -2005,3 +2059,19 @@ static void register_vfio_pci_dev_type(void) } type_init(register_vfio_pci_dev_type) + +int vfio_get_container_fd(struct PCIBus *pbus) +{ + BusChild *kid1st = QTAILQ_FIRST(&pbus->qbus.children); + VFIODevice *vdev1st; + + if (!kid1st) { + printf("No device registered on PCI bus \"%s\", no DMA enabled\n", + pbus->qbus.name); + return -1; + } + vdev1st = container_of(kid1st->child, VFIODevice, pdev.qdev); + + return vdev1st->group->container->fd; +} + diff --git a/hw/vfio_pci.h b/hw/vfio_pci.h index 226607c..0d13341 100644 --- a/hw/vfio_pci.h +++ b/hw/vfio_pci.h @@ -105,4 +105,6 @@ typedef struct VFIOGroup { #define VFIO_FLAG_IOMMU_SHARED_BIT 0 #define VFIO_FLAG_IOMMU_SHARED (1U << VFIO_FLAG_UIOMMU_SHARED_BIT) +int vfio_get_container_fd(struct PCIBus *pbus); + #endif /* __VFIO_H__ */ -- 1.7.10