On 07.08.2013, at 10:21, Alexey Kardashevskiy wrote: > The patch adds a spapr-pci-vfio-host-bridge device type > which is a PCI Host Bridge with VFIO support. The new device > inherits from the spapr-pci-host-bridge device and adds > the following properties: > iommu - IOMMU group ID which represents a Partitionable > Endpoint, QEMU/ppc64 uses a separate PHB for > an IOMMU group so the guest kernel has to have > PCI Domain support enabled. > forceaddr (optional, 0 by default) - forces QEMU to copy > device:function from the host address as > certain guest drivers expect devices to appear in > particular locations; > mf (optional, 0 by default) - forces multifunction bit for > the function #0 of a found device, only makes sense > for multifunction devices and only with the forceaddr > property set. It would not be required if there > was a way to know in advance whether a device is > multifunctional or not. > scan (optional, 1 by default) - if non-zero, the new PHB walks > through all non-bridge devices in the group and tries > adding them to the PHB; if zero, all devices in the group > have to be configured manually via the QEMU command line. > > The patch also adds VFIO IOMMU type support to the existing > sPAPR TCE list in spapr_iommu.c. > > The patch also uses the host kernel support of a new KVM_CAP_SPAPR_TCE_IOMMU > capability and KVM_CREATE_SPAPR_TCE_IOMMU ioctl which let QEMU tell > the host what LIOBN is used for an IOMMU group. This ioctl turns on real mode handling of TCE > requests, which improves actual throughput by a factor of 2.5-5. 
> > Examples: > 1) Scan and add all devices from IOMMU group with ID=1 to QEMU's PHB #6: > -device spapr-pci-vfio-host-bridge,id=DEVICENAME,iommu=1,index=6 > > 2) Configure and Add 3 functions of a multifunctional device to QEMU: > (the NEC PCI USB card is used as an example here): > -device spapr-pci-vfio-host-bridge,id=USB,iommu=4,scan=0,index=7 \ > -device vfio-pci,host=4:0:1.0,addr=1.0,bus=USB,multifunction=true > -device vfio-pci,host=4:0:1.1,addr=1.1,bus=USB > -device vfio-pci,host=4:0:1.2,addr=1.2,bus=USB > > Cc: David Gibson <da...@gibson.dropbear.id.au> > Signed-off-by: Alexey Kardashevskiy <a...@ozlabs.ru> > --- > hw/ppc/spapr_iommu.c | 176 ++++++++++++++++++++++++++++++++----- > hw/ppc/spapr_pci.c | 209 +++++++++++++++++++++++++++++++++++++++++--- > include/hw/pci-host/spapr.h | 12 +++ > include/hw/ppc/spapr.h | 19 ++++ > target-ppc/kvm.c | 33 +++++++ > target-ppc/kvm_ppc.h | 12 +++ > trace-events | 4 + > 7 files changed, 429 insertions(+), 36 deletions(-) > > diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c > index 22b09be..096b6a9 100644 > --- a/hw/ppc/spapr_iommu.c > +++ b/hw/ppc/spapr_iommu.c > @@ -16,12 +16,14 @@ > * You should have received a copy of the GNU Lesser General Public > * License along with this library; if not, see > <http://www.gnu.org/licenses/>. 
> */ > + > #include "hw/hw.h" > #include "sysemu/kvm.h" > #include "hw/qdev.h" > #include "kvm_ppc.h" > #include "sysemu/dma.h" > #include "exec/address-spaces.h" > +#include "trace.h" > > #include "hw/ppc/spapr.h" > > @@ -244,6 +246,74 @@ static target_ulong put_tce_emu(sPAPRTCETable *tcet, > target_ulong ioba, > return H_SUCCESS; > } > > +static IOMMUTLBEntry spapr_vfio_translate_iommu(MemoryRegion *iommu, hwaddr > addr) > +{ > + IOMMUTLBEntry entry; > + /* Must never be called */ > + assert(0); > + return entry; > +} > + > +static MemoryRegionIOMMUOps spapr_vfio_iommu_ops = { > + .translate = spapr_vfio_translate_iommu, > +}; > + > +static int spapr_tce_table_vfio_realize(DeviceState *dev) > +{ > + sPAPRTCETable *tcet = SPAPR_TCE_TABLE(dev); > + > + memory_region_init_iommu(&tcet->iommu, NULL, &spapr_vfio_iommu_ops, > + "iommu-vfio-spapr", (uint64_t)INT64_MAX+1); > + > + QLIST_INSERT_HEAD(&spapr_tce_tables, tcet, list); > + > + return 0; > +} > + > +sPAPRTCETable *spapr_vfio_new_table(DeviceState *owner, uint32_t liobn, > + int group_fd) > +{ > + sPAPRTCETable *tcet; > + int fd; > + > + if (spapr_tce_find_by_liobn(liobn)) { > + fprintf(stderr, "Attempted to create TCE table with duplicate" > + " LIOBN 0x%x\n", liobn); > + return NULL; > + } > + > + fd = kvmppc_create_spapr_tce_iommu(liobn, group_fd); > + > + tcet = SPAPR_TCE_TABLE(object_new(TYPE_SPAPR_TCE_TABLE_VFIO)); > + tcet->liobn = liobn; > + tcet->fd = fd; > + object_property_add_child(OBJECT(owner), "tce-table", OBJECT(tcet), > NULL); > + > + qdev_init_nofail(DEVICE(tcet)); > + > + return tcet; > +} > + > +static target_ulong put_tce_vfio(sPAPRTCETable *tcet, target_ulong ioba, > + target_ulong tce) > +{ > + IOMMUTLBEntry entry; > + > + entry.iova = ioba & ~SPAPR_TCE_PAGE_MASK; > + entry.translated_addr = tce & ~SPAPR_TCE_PAGE_MASK; > + entry.addr_mask = SPAPR_TCE_PAGE_MASK; > + entry.perm = 0; > + if ((tce & SPAPR_TCE_RO) == SPAPR_TCE_RO) { > + entry.perm |= IOMMU_RO; > + } > + if ((tce & SPAPR_TCE_WO) 
== SPAPR_TCE_WO) { > + entry.perm |= IOMMU_WO; > + } > + memory_region_notify_iommu(&tcet->iommu, entry); > + > + return H_SUCCESS; > +} > + > static target_ulong h_put_tce_indirect(PowerPCCPU *cpu, > sPAPREnvironment *spapr, > target_ulong opcode, target_ulong > *args) > @@ -255,18 +325,36 @@ static target_ulong h_put_tce_indirect(PowerPCCPU *cpu, > target_ulong npages = args[3]; > target_ulong ret = 0; > sPAPRTCETable *tcet = spapr_tce_find_by_liobn(liobn); > + sPAPRTCETableClass *info; > > - if (tcet) { > - for (i = 0; i < npages; ++i, ioba += SPAPR_TCE_PAGE_SIZE) { > - target_ulong tce = ldq_phys((tce_list & ~SPAPR_TCE_PAGE_MASK) + > - i * sizeof(target_ulong)); > - ret = put_tce_emu(tcet, ioba, tce); > - if (ret) { > - break; > - } > + if (!tcet) { > + return H_PARAMETER; > + } > + > + info = SPAPR_TCE_TABLE_GET_CLASS(tcet); > + if (!info || !info->put_tce) { > + return H_PARAMETER; > + } > + > + if ((tce_list & SPAPR_TCE_PAGE_MASK) || (npages > 512)) { > + return H_PARAMETER; > + } > + > + if (liobn & 0xFFFFFFFF00000000ULL) { > + hcall_dprintf("spapr_vio_put_tce on out-of-boundsw LIOBN " > + TARGET_FMT_lx "\n", liobn); > + return H_PARAMETER; > + } > + > + for (i = 0; i < npages; ++i, ioba += SPAPR_TCE_PAGE_SIZE) { > + target_ulong tce = ldq_phys((tce_list & ~SPAPR_TCE_PAGE_MASK) + > + i * sizeof(target_ulong)); > + ret = info->put_tce(tcet, ioba, tce); > + if (ret) { > + break; > } > - return ret; > } > + > #ifdef DEBUG_TCE > fprintf(stderr, "%s on liobn=" TARGET_FMT_lx > " ioba 0x" TARGET_FMT_lx " TCE 0x" TARGET_FMT_lx > @@ -274,7 +362,7 @@ static target_ulong h_put_tce_indirect(PowerPCCPU *cpu, > __func__, liobn, ioba, tce_list, ret); > #endif > > - return H_PARAMETER; > + return ret; > } > > static target_ulong h_stuff_tce(PowerPCCPU *cpu, sPAPREnvironment *spapr, > @@ -287,17 +375,30 @@ static target_ulong h_stuff_tce(PowerPCCPU *cpu, > sPAPREnvironment *spapr, > target_ulong npages = args[3]; > target_ulong ret = 0; > sPAPRTCETable *tcet = 
spapr_tce_find_by_liobn(liobn); > + sPAPRTCETableClass *info; > + > + if (!tcet) { > + return H_PARAMETER; > + } > + > + info = SPAPR_TCE_TABLE_GET_CLASS(tcet); > + if (!info || !info->put_tce) { > + return H_PARAMETER; > + } > + > + if (liobn & 0xFFFFFFFF00000000ULL) { > + hcall_dprintf("spapr_vio_put_tce on out-of-boundsw LIOBN " > + TARGET_FMT_lx "\n", liobn); > + return H_PARAMETER; > + } > > ioba &= ~(SPAPR_TCE_PAGE_SIZE - 1); > > - if (tcet) { > - for (i = 0; i < npages; ++i, ioba += SPAPR_TCE_PAGE_SIZE) { > - ret = put_tce_emu(tcet, ioba, tce_value); > - if (ret) { > - break; > - } > + for (i = 0; i < npages; ++i, ioba += SPAPR_TCE_PAGE_SIZE) { > + ret = info->put_tce(tcet, ioba, tce_value); > + if (ret) { > + break; > } > - return ret; > } > #ifdef DEBUG_TCE > fprintf(stderr, "%s on liobn=" TARGET_FMT_lx > @@ -306,7 +407,7 @@ static target_ulong h_stuff_tce(PowerPCCPU *cpu, > sPAPREnvironment *spapr, > __func__, liobn, ioba, tce_value, ret); > #endif > > - return H_PARAMETER; > + return ret; > } > > static target_ulong h_put_tce(PowerPCCPU *cpu, sPAPREnvironment *spapr, > @@ -316,12 +417,21 @@ static target_ulong h_put_tce(PowerPCCPU *cpu, > sPAPREnvironment *spapr, > target_ulong ioba = args[1]; > target_ulong tce = args[2]; > sPAPRTCETable *tcet = spapr_tce_find_by_liobn(liobn); > + target_ulong ret; > + sPAPRTCETableClass *info; > + > + if (!tcet) { > + return H_PARAMETER; > + } > + > + info = SPAPR_TCE_TABLE_GET_CLASS(tcet); > + if (!info || !info->put_tce) { > + return H_PARAMETER; > + } > > ioba &= ~(SPAPR_TCE_PAGE_SIZE - 1); > > - if (tcet) { > - return put_tce_emu(tcet, ioba, tce); > - } > + ret = info->put_tce(tcet, ioba, tce); > #ifdef DEBUG_TCE > fprintf(stderr, "%s on liobn=" TARGET_FMT_lx > " ioba 0x" TARGET_FMT_lx " TCE 0x" TARGET_FMT_lx > @@ -329,7 +439,7 @@ static target_ulong h_put_tce(PowerPCCPU *cpu, > sPAPREnvironment *spapr, > __func__, liobn, ioba, tce, ret); > #endif > > - return H_PARAMETER; > + return ret; > } > > int 
spapr_dma_dt(void *fdt, int node_off, const char *propname, > @@ -376,9 +486,12 @@ int spapr_tcet_dma_dt(void *fdt, int node_off, const > char *propname, > static void spapr_tce_table_class_init(ObjectClass *klass, void *data) > { > DeviceClass *dc = DEVICE_CLASS(klass); > + sPAPRTCETableClass *k = SPAPR_TCE_TABLE_CLASS(klass); > + > dc->vmsd = &vmstate_spapr_tce_table; > dc->init = spapr_tce_table_realize; > dc->reset = spapr_tce_reset; > + k->put_tce = put_tce_emu; > > QLIST_INIT(&spapr_tce_tables); > > @@ -393,12 +506,31 @@ static TypeInfo spapr_tce_table_info = { > .parent = TYPE_DEVICE, > .instance_size = sizeof(sPAPRTCETable), > .class_init = spapr_tce_table_class_init, > + .class_size = sizeof(sPAPRTCETableClass), > .instance_finalize = spapr_tce_table_finalize, > }; > > +static void spapr_tce_table_vfio_class_init(ObjectClass *klass, void *data) > +{ > + DeviceClass *dc = DEVICE_CLASS(klass); > + sPAPRTCETableClass *k = SPAPR_TCE_TABLE_CLASS(klass); > + > + dc->init = spapr_tce_table_vfio_realize; > + k->put_tce = put_tce_vfio; > +} > + > +static TypeInfo spapr_tce_table_vfio_info = { > + .name = TYPE_SPAPR_TCE_TABLE_VFIO, > + .parent = TYPE_SPAPR_TCE_TABLE, > + .instance_size = sizeof(sPAPRTCETable), > + .class_init = spapr_tce_table_vfio_class_init, > + .class_size = sizeof(sPAPRTCETableClass), > +}; > + > static void register_types(void) > { > type_register_static(&spapr_tce_table_info); > + type_register_static(&spapr_tce_table_vfio_info); > } > > type_init(register_types); > diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c > index 869ca43..3f37cac 100644 > --- a/hw/ppc/spapr_pci.c > +++ b/hw/ppc/spapr_pci.c
I think we should move the vfio phb into a separate file and make it a proper subclass, without even the chance of accidentally calling the normal spapr pci functions ;). Andreas, could you please check through this and see if you can spot a way to isolate it out? Alex > @@ -22,6 +22,9 @@ > * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN > * THE SOFTWARE. > */ > +#include <sys/types.h> > +#include <dirent.h> > + > #include "hw/hw.h" > #include "hw/pci/pci.h" > #include "hw/pci/msi.h" > @@ -32,6 +35,7 @@ > #include "exec/address-spaces.h" > #include <libfdt.h> > #include "trace.h" > +#include "hw/misc/vfio.h" > > #include "hw/pci/pci_bus.h" > > @@ -496,7 +500,11 @@ static AddressSpace *spapr_pci_dma_iommu(PCIBus *bus, > void *opaque, int devfn) > return &phb->iommu_as; > } > > -static int spapr_phb_init(SysBusDevice *s) > +/* > + * This is the common initialization part for both emulated and VFIO PHBs > + * which includes everything but DMA and device scan (optional, VFIO only). 
> + */ > +static int _spapr_phb_init(SysBusDevice *s) > { > DeviceState *dev = DEVICE(s); > sPAPRPHBState *sphb = SPAPR_PCI_HOST_BRIDGE(s); > @@ -610,19 +618,6 @@ static int spapr_phb_init(SysBusDevice *s) > PCI_DEVFN(0, 0), PCI_NUM_PINS, TYPE_PCI_BUS); > phb->bus = bus; > > - sphb->dma_window_start = 0; > - sphb->dma_window_size = 0x40000000; > - sphb->tcet = spapr_tce_new_table(dev, sphb->dma_liobn, > - sphb->dma_window_size); > - if (!sphb->tcet) { > - fprintf(stderr, "Unable to create TCE table for %s\n", > sphb->dtbusname); > - return -1; > - } > - address_space_init(&sphb->iommu_as, spapr_tce_get_iommu(sphb->tcet), > - sphb->dtbusname); > - > - pci_setup_iommu(bus, spapr_pci_dma_iommu, sphb); > - > QLIST_INSERT_HEAD(&spapr->phbs, sphb, list); > > /* Initialize the LSI table */ > @@ -641,6 +636,30 @@ static int spapr_phb_init(SysBusDevice *s) > return 0; > } > > +static int spapr_phb_init(SysBusDevice *s) > +{ > + sPAPRPHBState *sphb = SPAPR_PCI_HOST_BRIDGE(s); > + int ret; > + > + ret = _spapr_phb_init(s); > + if (ret) > + return ret; > + > + sphb->dma_window_start = 0; > + sphb->dma_window_size = 0x40000000; > + sphb->tcet = spapr_tce_new_table(DEVICE(sphb), sphb->dma_liobn, > + sphb->dma_window_size); > + if (!sphb->tcet) { > + fprintf(stderr, "Unable to create TCE table for %s\n", > sphb->dtbusname); > + return -1; > + } > + address_space_init(&sphb->iommu_as, spapr_tce_get_iommu(sphb->tcet), > + sphb->dtbusname); > + pci_setup_iommu(sphb->parent_obj.bus, spapr_pci_dma_iommu, sphb); > + > + return 0; > +} > + > static void spapr_phb_reset(DeviceState *qdev) > { > SysBusDevice *s = SYS_BUS_DEVICE(qdev); > @@ -749,6 +768,163 @@ PCIHostState *spapr_create_phb(sPAPREnvironment *spapr, > int index) > return PCI_HOST_BRIDGE(dev); > } > > +/* sPAPR VFIO */ > +static Property spapr_phb_vfio_properties[] = { > + DEFINE_PROP_INT32("iommu", sPAPRPHBVFIOState, iommugroupid, -1), > + DEFINE_PROP_UINT8("scan", sPAPRPHBVFIOState, scan, 1), > + DEFINE_PROP_UINT8("mf", 
sPAPRPHBVFIOState, enable_multifunction, 0), > + DEFINE_PROP_UINT8("forceaddr", sPAPRPHBVFIOState, force_addr, 0), > + DEFINE_PROP_END_OF_LIST(), > +}; > + > +static int spapr_pci_vfio_scan(sPAPRPHBVFIOState *svphb) > +{ > + PCIHostState *phb = PCI_HOST_BRIDGE(svphb); > + char *iommupath; > + DIR *dirp; > + struct dirent *entry; > + > + if (!svphb->scan) { > + trace_spapr_pci("autoscan disabled for ", svphb->phb.dtbusname); > + return 0; > + } > + > + iommupath = g_strdup_printf("/sys/kernel/iommu_groups/%d/devices/", > + svphb->iommugroupid); > + if (!iommupath) { > + return -ENOMEM; > + } > + > + dirp = opendir(iommupath); > + if (!dirp) { > + fprintf(stderr, "failed to scan group=%d\n", svphb->iommugroupid); > + g_free(iommupath); > + return -1; > + } > + > + while ((entry = readdir(dirp)) != NULL) { > + Error *err = NULL; > + char *tmp; > + FILE *deviceclassfile; > + unsigned deviceclass = 0, domainid, busid, devid, fnid; > + char addr[32]; > + DeviceState *dev; > + > + if (sscanf(entry->d_name, "%X:%X:%X.%x", > + &domainid, &busid, &devid, &fnid) != 4) { > + continue; > + } > + > + tmp = g_strdup_printf("%s%s/class", iommupath, entry->d_name); > + trace_spapr_pci("Reading device class from ", tmp); > + > + deviceclassfile = fopen(tmp, "r"); > + if (deviceclassfile) { > + int ret = fscanf(deviceclassfile, "%x", &deviceclass); > + fclose(deviceclassfile); > + if (ret != 1) { > + continue; > + } > + } > + g_free(tmp); > + > + if (!deviceclass) { > + continue; > + } > + if ((deviceclass >> 16) == (PCI_CLASS_BRIDGE_OTHER >> 8)) { > + /* Skip bridges */ > + continue; > + } > + trace_spapr_pci("Creating device from ", entry->d_name); > + > + dev = qdev_create(&phb->bus->qbus, "vfio-pci"); > + if (!dev) { > + fprintf(stderr, "failed to create vfio-pci\n"); > + continue; > + } > + qdev_prop_parse(dev, "host", entry->d_name, &err); > + if (err != NULL) { > + continue; > + } > + if (svphb->force_addr) { > + snprintf(addr, sizeof(addr), "%x.%x", devid, fnid); > + err = 
NULL; > + qdev_prop_parse(dev, "addr", addr, &err); > + if (err != NULL) { > + continue; > + } > + } > + if (svphb->enable_multifunction) { > + qdev_prop_set_bit(dev, "multifunction", 1); > + } > + qdev_init_nofail(dev); > + } > + closedir(dirp); > + g_free(iommupath); > + > + return 0; > +} > + > +static int spapr_phb_vfio_init(SysBusDevice *s) > +{ > + sPAPRPHBVFIOState *svphb = SPAPR_PCI_VFIO_HOST_BRIDGE(s); > + sPAPRPHBState *sphb = &svphb->phb; > + struct vfio_iommu_spapr_tce_info info = { .argsz = sizeof(info) }; > + int ret, group_fd; > + > + if (svphb->iommugroupid == -1) { > + fprintf(stderr, "Wrong IOMMU group ID %d\n", svphb->iommugroupid); > + return -1; > + } > + > + ret = _spapr_phb_init(s); > + if (ret) { > + return ret; > + } > + > + ret = vfio_container_spapr_get_info(&svphb->phb.iommu_as, > + svphb->iommugroupid, > + &info, &group_fd); > + if (ret) > + return ret; > + > + svphb->phb.dma_window_start = info.dma32_window_start; > + svphb->phb.dma_window_size = info.dma32_window_size; > + svphb->phb.tcet = spapr_vfio_new_table(DEVICE(sphb), > svphb->phb.dma_liobn, > + group_fd); > + > + address_space_init(&sphb->iommu_as, spapr_tce_get_iommu(sphb->tcet), > + sphb->dtbusname); > + pci_setup_iommu(sphb->parent_obj.bus, spapr_pci_dma_iommu, sphb); > + > + ret = spapr_pci_vfio_scan(svphb); > + > + return ret; > +} > + > +static void spapr_phb_vfio_reset(DeviceState *qdev) > +{ > + /* Do nothing */ > +} > + > +static void spapr_phb_vfio_class_init(ObjectClass *klass, void *data) > +{ > + SysBusDeviceClass *sdc = SYS_BUS_DEVICE_CLASS(klass); > + DeviceClass *dc = DEVICE_CLASS(klass); > + > + sdc->init = spapr_phb_vfio_init; > + dc->props = spapr_phb_vfio_properties; > + dc->reset = spapr_phb_vfio_reset; > + dc->vmsd = &vmstate_spapr_pci; > +} > + > +static const TypeInfo spapr_phb_vfio_info = { > + .name = TYPE_SPAPR_PCI_VFIO_HOST_BRIDGE, > + .parent = TYPE_SPAPR_PCI_HOST_BRIDGE, > + .instance_size = sizeof(sPAPRPHBVFIOState), > + .class_init = 
spapr_phb_vfio_class_init, > +}; > + > /* Macros to operate with address in OF binding to PCI */ > #define b_x(x, p, l) (((x) & ((1<<(l))-1)) << (p)) > #define b_n(x) b_x((x), 31, 1) /* 0 if relocatable */ > @@ -839,6 +1015,10 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb, > _FDT(fdt_setprop(fdt, bus_off, "interrupt-map", &interrupt_map, > sizeof(interrupt_map))); > > + if (!phb->dma_window_size) { > + fprintf(stderr, "Unexpected error: DMA window is zero, exiting\n"); > + exit(1); > + } > spapr_dma_dt(fdt, bus_off, "ibm,dma-window", > phb->dma_liobn, phb->dma_window_start, > phb->dma_window_size); > @@ -862,6 +1042,7 @@ void spapr_pci_rtas_init(void) > static void spapr_pci_register_types(void) > { > type_register_static(&spapr_phb_info); > + type_register_static(&spapr_phb_vfio_info); > } > > type_init(spapr_pci_register_types) > diff --git a/include/hw/pci-host/spapr.h b/include/hw/pci-host/spapr.h > index 970b4a9..fab18e5 100644 > --- a/include/hw/pci-host/spapr.h > +++ b/include/hw/pci-host/spapr.h > @@ -30,10 +30,14 @@ > #define SPAPR_MSIX_MAX_DEVS 32 > > #define TYPE_SPAPR_PCI_HOST_BRIDGE "spapr-pci-host-bridge" > +#define TYPE_SPAPR_PCI_VFIO_HOST_BRIDGE "spapr-pci-vfio-host-bridge" > > #define SPAPR_PCI_HOST_BRIDGE(obj) \ > OBJECT_CHECK(sPAPRPHBState, (obj), TYPE_SPAPR_PCI_HOST_BRIDGE) > > +#define SPAPR_PCI_VFIO_HOST_BRIDGE(obj) \ > + OBJECT_CHECK(sPAPRPHBVFIOState, (obj), TYPE_SPAPR_PCI_VFIO_HOST_BRIDGE) > + > typedef struct sPAPRPHBState { > PCIHostState parent_obj; > > @@ -64,6 +68,14 @@ typedef struct sPAPRPHBState { > QLIST_ENTRY(sPAPRPHBState) list; > } sPAPRPHBState; > > +typedef struct sPAPRPHBVFIOState { > + sPAPRPHBState phb; > + > + struct VFIOContainer *container; > + int32_t iommugroupid; > + uint8_t scan, enable_multifunction, force_addr; > +} sPAPRPHBVFIOState; > + > #define SPAPR_PCI_BASE_BUID 0x800000020000000ULL > > #define SPAPR_PCI_WINDOW_BASE 0x10000000000ULL > diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h > index 
2dc3d06..a64e58a 100644 > --- a/include/hw/ppc/spapr.h > +++ b/include/hw/ppc/spapr.h > @@ -353,12 +353,29 @@ int spapr_rtas_device_tree_setup(void *fdt, hwaddr > rtas_addr, > > #define RTAS_ERROR_LOG_MAX 2048 > > +typedef struct sPAPRTCETableClass sPAPRTCETableClass; > typedef struct sPAPRTCETable sPAPRTCETable; > > #define TYPE_SPAPR_TCE_TABLE "spapr-tce-table" > #define SPAPR_TCE_TABLE(obj) \ > OBJECT_CHECK(sPAPRTCETable, (obj), TYPE_SPAPR_TCE_TABLE) > > +#define TYPE_SPAPR_TCE_TABLE_VFIO "spapr-tce-table-vfio" > +#define SPAPR_TCE_TABLE_VFIO(obj) \ > + OBJECT_CHECK(sPAPRTCETable, (obj), TYPE_SPAPR_TCE_TABLE_VFIO) > + > +#define SPAPR_TCE_TABLE_CLASS(klass) \ > + OBJECT_CLASS_CHECK(sPAPRTCETableClass, (klass), TYPE_SPAPR_TCE_TABLE) > +#define SPAPR_TCE_TABLE_GET_CLASS(obj) \ > + OBJECT_GET_CLASS(sPAPRTCETableClass, (obj), TYPE_SPAPR_TCE_TABLE) > + > +struct sPAPRTCETableClass { > + DeviceClass parent_class; > + > + target_ulong (*put_tce)(sPAPRTCETable *tcet, target_ulong ioba, > + target_ulong tce); > +}; > + > struct sPAPRTCETable { > DeviceState parent; > uint32_t liobn; > @@ -375,6 +392,8 @@ void spapr_events_init(sPAPREnvironment *spapr); > void spapr_events_fdt_skel(void *fdt, uint32_t epow_irq); > sPAPRTCETable *spapr_tce_new_table(DeviceState *owner, uint32_t liobn, > size_t window_size); > +sPAPRTCETable *spapr_vfio_new_table(DeviceState *owner, uint32_t liobn, > + int group_fd); > MemoryRegion *spapr_tce_get_iommu(sPAPRTCETable *tcet); > void spapr_tce_set_bypass(sPAPRTCETable *tcet, bool bypass); > int spapr_dma_dt(void *fdt, int node_off, const char *propname, > diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c > index 3d0e398..eb59d7d 100644 > --- a/target-ppc/kvm.c > +++ b/target-ppc/kvm.c > @@ -61,6 +61,7 @@ static int cap_ppc_smt; > static int cap_ppc_rma; > static int cap_spapr_tce; > static int cap_spapr_multitce; > +static int cap_spapr_tce_iommu; > static int cap_hior; > static int cap_one_reg; > static int cap_epr; > @@ -98,6 +99,7 @@ int 
kvm_arch_init(KVMState *s) > cap_ppc_rma = kvm_check_extension(s, KVM_CAP_PPC_RMA); > cap_spapr_tce = kvm_check_extension(s, KVM_CAP_SPAPR_TCE); > cap_spapr_multitce = kvm_check_extension(s, KVM_CAP_SPAPR_MULTITCE); > + cap_spapr_tce_iommu = kvm_check_extension(s, KVM_CAP_SPAPR_TCE_IOMMU); > cap_one_reg = kvm_check_extension(s, KVM_CAP_ONE_REG); > cap_hior = kvm_check_extension(s, KVM_CAP_PPC_HIOR); > cap_epr = kvm_check_extension(s, KVM_CAP_PPC_EPR); > @@ -1669,6 +1671,37 @@ int kvmppc_remove_spapr_tce(void *table, int fd, > uint32_t window_size) > return 0; > } > > +int kvmppc_create_spapr_tce_iommu(uint32_t liobn, int group_fd) > +{ > + int fd = 0; > + struct kvm_create_spapr_tce_iommu args = { > + .liobn = liobn, > + .fd = group_fd > + }; > + > + if (!kvm_enabled() || !cap_spapr_tce_iommu) { > + fprintf(stderr, "KVM VFIO: TCE IOMMU capability is not present, DMA > may be slow\n"); > + return -1; > + } > + > + fd = kvm_vm_ioctl(kvm_state, KVM_CREATE_SPAPR_TCE_IOMMU, &args); > + if (fd < 0) { > + fprintf(stderr, "KVM VFIO: Failed to create TCE table for liobn > 0x%x, ret = %d, DMA may be slow\n", > + liobn, fd); > + } > + > + return fd; > +} > + > +int kvmppc_remove_spapr_tce_iommu(int fd) > +{ > + if (fd < 0) { > + return -1; > + } > + > + return close(fd); > +} > + > int kvmppc_reset_htab(int shift_hint) > { > uint32_t shift = shift_hint; > diff --git a/target-ppc/kvm_ppc.h b/target-ppc/kvm_ppc.h > index a2a903f..a223e63 100644 > --- a/target-ppc/kvm_ppc.h > +++ b/target-ppc/kvm_ppc.h > @@ -34,6 +34,8 @@ off_t kvmppc_alloc_rma(const char *name, MemoryRegion > *sysmem); > bool kvmppc_spapr_use_multitce(void); > void *kvmppc_create_spapr_tce(uint32_t liobn, uint32_t window_size, int *pfd); > int kvmppc_remove_spapr_tce(void *table, int pfd, uint32_t window_size); > +int kvmppc_create_spapr_tce_iommu(uint32_t liobn, int group_fd); > +int kvmppc_remove_spapr_tce_iommu(int fd); > int kvmppc_reset_htab(int shift_hint); > uint64_t kvmppc_rma_size(uint64_t 
current_size, unsigned int hash_shift); > #endif /* !CONFIG_USER_ONLY */ > @@ -144,6 +146,16 @@ static inline int kvmppc_remove_spapr_tce(void *table, > int pfd, > return -1; > } > > +static inline int kvmppc_create_spapr_tce_iommu(uint32_t liobn, uint32_t > iommu_id) > +{ > + return -1; > +} > + > +static inline int kvmppc_remove_spapr_tce_iommu(int fd) > +{ > + return -1; > +} > + > static inline int kvmppc_reset_htab(int shift_hint) > { > return -1; > diff --git a/trace-events b/trace-events > index 3856b5c..d1e54ad 100644 > --- a/trace-events > +++ b/trace-events > @@ -1113,6 +1113,7 @@ qxl_render_guest_primary_resized(int32_t width, int32_t > height, int32_t stride, > qxl_render_update_area_done(void *cookie) "%p" > > # hw/ppc/spapr_pci.c > +spapr_pci(const char *msg1, const char *msg2) "%s%s" > spapr_pci_msi(const char *msg, uint32_t n, uint32_t ca) "%s (device#%d, > cfg=%x)" > spapr_pci_msi_setup(const char *name, unsigned vector, uint64_t addr) > "dev\"%s\" vector %u, addr=%"PRIx64 > spapr_pci_rtas_ibm_change_msi(unsigned func, unsigned req) "func %u, > requested %u" > @@ -1133,6 +1134,9 @@ xics_ics_write_xive(int nr, int srcno, int server, > uint8_t priority) "ics_write_ > xics_ics_reject(int nr, int srcno) "reject irq %#x [src %d]" > xics_ics_eoi(int nr) "ics_eoi: irq %#x" > > +# hw/ppc/spapr_iommu.c > +spapr_iommu(const char *op, uint32_t liobn, uint64_t ioba, uint64_t tce, int > ret) "%s %x ioba=%"PRIx64" tce=%"PRIx64" ret=%d" > + > # util/hbitmap.c > hbitmap_iter_skip_words(const void *hb, void *hbi, uint64_t pos, unsigned > long cur) "hb %p hbi %p pos %"PRId64" cur 0x%lx" > hbitmap_reset(void *hb, uint64_t start, uint64_t count, uint64_t sbit, > uint64_t ebit) "hb %p items %"PRIu64",%"PRIu64" bits %"PRIu64"..%"PRIu64 > -- > 1.8.3.2 >