On Mon, 2012-07-16 at 08:21 -0600, Alex Williamson wrote: > On Sat, 2012-07-14 at 12:34 +1000, Alexey Kardashevskiy wrote: > > On 14/07/12 01:07, Alex Williamson wrote: > > > On Fri, 2012-07-13 at 17:26 +1000, Alexey Kardashevskiy wrote: > > >> It literally does the following: > > >> > > >> 1. POWERPC IOMMU support (the kernel counterpart is required) > > >> > > >> 2. The patch assumes that IOAPIC calls are going to be replaced > > >> with something generic. > > >> > > >> 3. vfio_group_iommu_ioctl() has been added to let sPAPR IOMMU > > >> handler to call VFIO IOMMU driver. > > >> > > >> 4. Change sPAPR PHB to scan the PCI bus which is used for > > >> the IOMMU-VFIO group. Now it is enough to add the following to > > >> the QEMU command line to get VFIO up with all the devices from > > >> IOMMU group with id=3: > > >> -device spapr-pci-host-bridge,busname=E1000E,buid=0x3,iommu=3,\ > > >> mem_win_addr=0x230000000000,io_win_addr=0x240000000000,msi_win_addr=0x250000000000 > > >> > > >> Signed-off-by: Alexey Kardashevskiy <a...@ozlabs.ru> > > >> --- > > >> hw/ppc/Makefile.objs | 3 ++ > > >> hw/spapr.h | 4 ++ > > >> hw/spapr_iommu.c | 69 ++++++++++++++++++++++++++++++- > > >> hw/spapr_iommu_vfio.h | 49 ++++++++++++++++++++++ > > >> hw/spapr_pci.c | 108 > > >> ++++++++++++++++++++++++++++++++++++++++++++++--- > > >> hw/spapr_pci.h | 4 ++ > > >> hw/vfio_pci.c | 30 ++++++++++++++ > > >> hw/vfio_pci.h | 2 + > > >> trace-events | 1 + > > >> 9 files changed, 264 insertions(+), 6 deletions(-) > > >> create mode 100644 hw/spapr_iommu_vfio.h > > >> > > >> diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs > > >> index f573a95..c46a049 100644 > > >> --- a/hw/ppc/Makefile.objs > > >> +++ b/hw/ppc/Makefile.objs > > >> @@ -25,4 +25,7 @@ obj-$(CONFIG_FDT) += ../device_tree.o > > >> # Xilinx PPC peripherals > > >> obj-y += xilinx_ethlite.o > > >> > > >> +# VFIO PCI device assignment > > >> +obj-$(CONFIG_VFIO_PCI) += vfio_pci.o > > >> + > > >> obj-y := $(addprefix ../,$(obj-y)) > > >> diff --git a/hw/spapr.h b/hw/spapr.h > > >> index b37f337..26e26f6 100644 > > >> --- a/hw/spapr.h > > >> +++ b/hw/spapr.h > > >> @@ -340,4 +340,8 @@ int spapr_dma_dt(void *fdt, int node_off, const char > > >> *propname, > > >> int spapr_tcet_dma_dt(void *fdt, int node_off, const char *propname, > > >> DMAContext *dma); > > >> > > >> +void spapr_vfio_init_dma(int group_id, uint32_t liobn, > > >> + uint64_t *dma32_window_start, > > >> + uint64_t *dma32_window_size); > > >> + > > >> #endif /* !defined (__HW_SPAPR_H__) */ > > >> diff --git a/hw/spapr_iommu.c b/hw/spapr_iommu.c > > >> index 50c288d..e48ced1 100644 > > >> --- a/hw/spapr_iommu.c > > >> +++ b/hw/spapr_iommu.c > > >> @@ -23,6 +23,8 @@ > > >> #include "dma.h" > > >> > > >> #include "hw/spapr.h" > > >> +#include "hw/spapr_iommu_vfio.h" > > >> +#include "hw/vfio_pci.h" > > >> > > >> #include <libfdt.h> > > >> > > >> @@ -183,6 +185,67 @@ static int put_tce_emu(target_ulong liobn, > > >> target_ulong ioba, target_ulong tce) > > >> return 0; > > >> } > > >> > > >> +typedef struct sPAPRVFIOTable { > > >> + int group_id; > > >> + uint32_t liobn; > > >> + QLIST_ENTRY(sPAPRVFIOTable) list; > > >> +} sPAPRVFIOTable; > > >> + > > >> +QLIST_HEAD(vfio_tce_tables, sPAPRVFIOTable) vfio_tce_tables; > > >> + > > >> +void spapr_vfio_init_dma(int group_id, uint32_t liobn, > > >> + uint64_t *dma32_window_start, > > >> + uint64_t *dma32_window_size) > > >> +{ > > >> + sPAPRVFIOTable *t; > > >> + struct tce_iommu_info info = { .argsz = sizeof(info) }; > > >> + > > >> + if (vfio_group_iommu_ioctl(group_id, SPAPR_TCE_IOMMU_GET_INFO, > > >> &info)) { > > >> + perror("SPAPR_TCE_IOMMU_GET_INFO failed"); > > >> + return; > > >> + } > > >> + *dma32_window_start = info.dma32_window_start; > > >> + *dma32_window_size = info.dma32_window_size; > > >> + > > >> + t = g_malloc0(sizeof(*t)); > > >> + t->group_id = group_id; > > >> + t->liobn = liobn; > > >> + > > >> + QLIST_INSERT_HEAD(&vfio_tce_tables, t, list); > > >> +} > > >> + > > >> +static int put_tce_vfio(uint32_t liobn, target_ulong ioba, target_ulong > > >> tce) > > >> +{ > > >> + sPAPRVFIOTable *t; > > >> + struct tce_iommu_dma_map map = { > > >> + .argsz = sizeof(map), > > >> + .va = 0, > > >> + .dmaaddr = ioba, > > >> + }; > > >> + > > >> + QLIST_FOREACH(t, &vfio_tce_tables, list) { > > >> + if (t->liobn != liobn) { > > >> + continue; > > >> + } > > >> + if (tce) { > > >> + map.va = (uintptr_t)qemu_get_ram_ptr(tce & > > >> ~SPAPR_TCE_PAGE_MASK); > > >> + if (vfio_group_iommu_ioctl(t->group_id, > > >> SPAPR_TCE_IOMMU_MAP_DMA, > > >> + &map)) { > > >> + perror("TCE_MAP_DMA"); > > >> + return H_PARAMETER; > > >> + } > > >> + } else { > > >> + if (vfio_group_iommu_ioctl(t->group_id, > > >> SPAPR_TCE_IOMMU_UNMAP_DMA, > > >> + &map)) { > > >> + perror("TCE_UNMAP_DMA"); > > >> + return H_PARAMETER; > > >> + } > > >> + } > > >> + return H_SUCCESS; > > >> + } > > >> + return H_CONTINUE; /* positive non-zero value */ > > >> +} > > >> + > > >> static target_ulong h_put_tce(CPUPPCState *env, sPAPREnvironment *spapr, > > >> target_ulong opcode, target_ulong *args) > > >> { > > >> @@ -200,7 +263,11 @@ static target_ulong h_put_tce(CPUPPCState *env, > > >> sPAPREnvironment *spapr, > > >> ioba &= ~(SPAPR_TCE_PAGE_SIZE - 1); > > >> > > >> ret = put_tce_emu(liobn, ioba, tce); > > >> - if (0 >= ret) { > > >> + if (ret <= 0) { > > >> + return ret ? H_PARAMETER : H_SUCCESS; > > >> + } > > >> + ret = put_tce_vfio(liobn, ioba, tce); > > >> + if (ret <= 0) { > > >> return ret ? H_PARAMETER : H_SUCCESS; > > >> } > > >> #ifdef DEBUG_TCE > > >> diff --git a/hw/spapr_iommu_vfio.h b/hw/spapr_iommu_vfio.h > > >> new file mode 100644 > > >> index 0000000..711e3e4 > > >> --- /dev/null > > >> +++ b/hw/spapr_iommu_vfio.h > > >> @@ -0,0 +1,49 @@ > > >> +/* > > >> + * Definitions for VFIO IOMMU driver implementing SPAPR TCE. > > >> + * This is the copy of the kernel header. > > >> + * > > >> + * Copyright (c) 2012 Alexey Kardashevskiy <a...@olabs.ru> > > >> + * > > >> + * This library is free software; you can redistribute it and/or > > >> + * modify it under the terms of the GNU Lesser General Public > > >> + * License as published by the Free Software Foundation; either > > >> + * version 2 of the License, or (at your option) any later version. > > >> + * > > >> + * This library is distributed in the hope that it will be useful, > > >> + * but WITHOUT ANY WARRANTY; without even the implied warranty of > > >> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > >> + * Lesser General Public License for more details. > > >> + * > > >> + * You should have received a copy of the GNU Lesser General Public > > >> + * License along with this library; if not, see > > >> <http://www.gnu.org/licenses/>. > > >> + */ > > >> + > > >> +#if !defined(__HW_SPAPR_IOMMU_VFIO_H__) > > >> +#define __HW_SPAPR_IOMMU_VFIO_H__ > > >> + > > >> +#include "hw/linux-vfio.h" > > >> + > > >> +#define SPAPR_TCE_IOMMU 2 > > >> + > > >> +struct tce_iommu_info { > > >> + __u32 argsz; > > >> + __u32 flags; > > >> + __u32 dma32_window_start; > > >> + __u32 dma32_window_size; > > >> + __u64 dma64_window_start; > > >> + __u64 dma64_window_size; > > >> +}; > > >> + > > >> +#define SPAPR_TCE_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12) > > >> + > > >> +struct tce_iommu_dma_map { > > >> + __u32 argsz; > > >> + __u32 flags; > > >> + __u64 va; > > >> + __u64 dmaaddr; > > >> +}; > > >> + > > >> +#define SPAPR_TCE_IOMMU_MAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 13) > > >> +#define SPAPR_TCE_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14) > > >> + > > >> +#endif > > >> diff --git a/hw/spapr_pci.c b/hw/spapr_pci.c > > >> index 014297b..836ec4f 100644 > > >> --- a/hw/spapr_pci.c > > >> +++ b/hw/spapr_pci.c > > >> @@ -22,6 +22,9 @@ > > >> * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER > > >> DEALINGS IN > > >> * THE SOFTWARE. > > >> */ > > >> +#include <sys/types.h> > > >> +#include <dirent.h> > > >> + > > >> #include "hw.h" > > >> #include "pci.h" > > >> #include "msi.h" > > >> @@ -32,7 +35,6 @@ > > >> #include "exec-memory.h" > > >> #include <libfdt.h> > > >> #include "trace.h" > > >> - > > >> #include "hw/pci_internals.h" > > >> > > >> /* Copied from the kernel arch/powerpc/platforms/pseries/msi.c */ > > >> @@ -440,6 +442,12 @@ static void pci_spapr_set_irq(void *opaque, int > > >> irq_num, int level) > > >> level); > > >> } > > >> > > >> +static int pci_spapr_get_irq(void *opaque, int irq_num) > > >> +{ > > >> + sPAPRPHBState *phb = opaque; > > >> + return phb->lsi_table[irq_num].dt_irq; > > >> +} > > >> + > > >> static uint64_t spapr_io_read(void *opaque, target_phys_addr_t addr, > > >> unsigned size) > > >> { > > >> @@ -515,6 +523,79 @@ static DMAContext *spapr_pci_dma_context_fn(PCIBus > > >> *bus, void *opaque, > > >> return phb->dma; > > >> } > > >> > > >> +static int spapr_pci_scan_vfio(sPAPRPHBState *phb) > > >> +{ > > >> + char iommupath[256]; > > >> + DIR *dirp; > > >> + struct dirent *entry; > > >> + > > >> + if (!phb->scan) { > > >> + trace_spapr_pci("Autoscan disabled for ", phb->dtbusname); > > >> + return 0; > > >> + } > > >> + > > >> + snprintf(iommupath, sizeof(iommupath), > > >> + "/sys/kernel/iommu_groups/%d/devices/", phb->iommugroupid); > > >> + dirp = opendir(iommupath); > > >> + > > >> + while ((entry = readdir(dirp)) != NULL) { > > >> + char *tmp; > > >> + FILE *deviceclassfile; > > >> + unsigned deviceclass = 0, domainid, busid, devid, fnid; > > >> + char addr[32]; > > >> + DeviceState *dev; > > >> + > > >> + if (sscanf(entry->d_name, "%X:%X:%X.%x", > > >> + &domainid, &busid, &devid, &fnid) != 4) { > > >> + continue; > > >> + } > > >> + > > >> + tmp = g_strdup_printf("%s%s/class", iommupath, entry->d_name); > > >> + trace_spapr_pci("Reading device class from ", tmp); > > >> + > > >> + deviceclassfile = fopen(tmp, "r"); > > >> + if (deviceclassfile) { > > >> + fscanf(deviceclassfile, "%x", &deviceclass); > > >> + fclose(deviceclassfile); > > >> + } > > >> + g_free(tmp); > > >> + > > >> + if (!deviceclass) { > > >> + continue; > > >> + } > > >> + if ((phb->scan < 2) && > > >> + ((deviceclass >> 16) == (PCI_CLASS_BRIDGE_OTHER >> 8))) { > > >> + /* Skip _any_ bridge */ > > >> + continue; > > >> + } > > >> + if ((deviceclass == 0xc0310) || (deviceclass == 0xc0320)) { > > >> + /* Tweak USB */ > > >> + phb->force_addr = 1; > > >> + phb->enable_multifunction = 1; > > >> + } > > >> + > > >> + trace_spapr_pci("Creating devicei from ", entry->d_name); > > >> + > > >> + dev = qdev_create(&phb->host_state.bus->qbus, "vfio-pci"); > > >> + if (!dev) { > > >> + fprintf(stderr, "failed to create vfio-pci\n"); > > >> + continue; > > >> + } > > >> + qdev_prop_parse(dev, "host", entry->d_name); > > >> + if (phb->force_addr) { > > >> + snprintf(addr, sizeof(addr), "%x.%x", devid, fnid); > > >> + qdev_prop_parse(dev, "addr", addr); > > >> + } > > >> + if (phb->enable_multifunction) { > > >> + qdev_prop_set_bit(dev, "multifunction", 1); > > >> + } > > >> + qdev_init_nofail(dev); > > >> + } > > >> + closedir(dirp); > > >> + > > >> + return 0; > > >> +} > > >> + > > >> static int spapr_phb_init(SysBusDevice *s) > > >> { > > >> sPAPRPHBState *phb = DO_UPCAST(sPAPRPHBState, host_state.busdev, s); > > >> @@ -567,15 +648,13 @@ static int spapr_phb_init(SysBusDevice *s) > > >> > > >> bus = pci_register_bus(&phb->host_state.busdev.qdev, > > >> phb->busname ? phb->busname : phb->dtbusname, > > >> - pci_spapr_set_irq, NULL, pci_spapr_map_irq, > > >> phb, > > >> + pci_spapr_set_irq, pci_spapr_get_irq, > > >> + pci_spapr_map_irq, phb, > > >> &phb->memspace, &phb->iospace, > > >> PCI_DEVFN(0, 0), PCI_NUM_PINS); > > >> phb->host_state.bus = bus; > > >> > > >> phb->dma_liobn = SPAPR_PCI_BASE_LIOBN | (pci_find_domain(bus) << > > >> 16); > > >> - phb->dma_window_start = 0; > > >> - phb->dma_window_size = 0x40000000; > > >> - phb->dma = spapr_tce_new_dma_context(phb->dma_liobn, > > >> phb->dma_window_size); > > >> pci_setup_iommu(bus, spapr_pci_dma_context_fn, phb); > > >> > > >> QLIST_INSERT_HEAD(&spapr->phbs, phb, list); > > >> @@ -588,6 +667,21 @@ static int spapr_phb_init(SysBusDevice *s) > > >> } > > >> } > > >> > > >> + if (phb->iommugroupid >= 0) { > > >> + if (spapr_pci_scan_vfio(phb) < 0) { > > >> + return -1; > > >> + } > > >> + spapr_vfio_init_dma(phb->iommugroupid, phb->dma_liobn, > > >> + &phb->dma_window_start, > > >> + &phb->dma_window_size); > > >> + return 0; > > >> + } > > >> + > > >> + phb->dma_window_start = 0; > > >> + phb->dma_window_size = 0x40000000; > > >> + phb->dma = spapr_tce_new_dma_context(phb->dma_liobn, > > >> + phb->dma_window_size); > > >> + > > >> return 0; > > >> } > > >> > > >> @@ -599,6 +693,10 @@ static Property spapr_phb_properties[] = { > > >> DEFINE_PROP_HEX64("io_win_addr", sPAPRPHBState, io_win_addr, 0), > > >> DEFINE_PROP_HEX64("io_win_size", sPAPRPHBState, io_win_size, > > >> 0x10000), > > >> DEFINE_PROP_HEX64("msi_win_addr", sPAPRPHBState, msi_win_addr, 0), > > >> + DEFINE_PROP_INT32("iommu", sPAPRPHBState, iommugroupid, -1), > > >> + DEFINE_PROP_UINT8("scan", sPAPRPHBState, scan, 1), > > >> + DEFINE_PROP_UINT8("mf", sPAPRPHBState, enable_multifunction, 0), > > >> + DEFINE_PROP_UINT8("forceaddr", sPAPRPHBState, force_addr, 0), > > >> DEFINE_PROP_END_OF_LIST(), > > >> }; > > >> > > >> diff --git a/hw/spapr_pci.h b/hw/spapr_pci.h > > >> index 145071c..f514823 100644 > > >> --- a/hw/spapr_pci.h > > >> +++ b/hw/spapr_pci.h > > >> @@ -57,6 +57,10 @@ typedef struct sPAPRPHBState { > > >> int nvec; > > >> } msi_table[SPAPR_MSIX_MAX_DEVS]; > > >> > > >> + int32_t iommugroupid; > > >> + uint8_t scan; /* 0 don't scan 1 scan only devices 2 scan everything > > >> */ > > >> + uint8_t enable_multifunction, force_addr; > > >> + > > >> QLIST_ENTRY(sPAPRPHBState) list; > > >> } sPAPRPHBState; > > >> > > >> diff --git a/hw/vfio_pci.c b/hw/vfio_pci.c > > >> index 1ac287f..fc84fb4 100644 > > >> --- a/hw/vfio_pci.c > > >> +++ b/hw/vfio_pci.c > > >> @@ -1581,6 +1581,24 @@ static int vfio_connect_container(VFIOGroup > > >> *group) > > >> > > >> memory_listener_register(&container->listener, > > >> get_system_memory()); > > >> > > >> + } else if (ioctl(fd, VFIO_CHECK_EXTENSION, SPAPR_TCE_IOMMU)) { > > >> + ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd); > > >> + if (ret) { > > >> + error_report("vfio: failed to set group container: %s\n", > > >> + strerror(errno)); > > >> + g_free(container); > > >> + close(fd); > > >> + return -1; > > >> + } > > >> + > > >> + ret = ioctl(fd, VFIO_SET_IOMMU, SPAPR_TCE_IOMMU); > > >> + if (ret) { > > >> + error_report("vfio: failed to set iommu for container: > > >> %s\n", > > >> + strerror(errno)); > > >> + g_free(container); > > >> + close(fd); > > >> + return -1; > > >> + } > > > > > > I think we can still do better. The x86 code sets up a MemoryListener > > > here with data for that embedded into the VFIOContainer. You don't > > > have, need, or want a MemoryListener, but that doesn't mean we can't > > > follow the model of registering that this group exists here and setting > > > up map/unmap callbacks. > > > > > > For instance: > > > > > > in vfio_pci.h: > > > struct sPAPRVFIOData { > > > uint64_t dma32_window_start; > > > uint64_t dma64_window_size; > > > .... > > > int (*map)(struct tce_iommu_dma_map *); > > > int (*unmap)(struct tce_iommu_dma_map *); > > > }; > > > > > > appended to the above spapr tce iommu setup above: > > > > > > struct tce_iommu_info info; > > > > > > /* the MemoryListener embedded in container becomes a union to hold > > > * iommu specific data. */ > > > container->u.spapr.data->map = vfio_spapr_tce_map; > > > container->u.spapr.data->unmap = vfio_spapr_tce_unmap; > > > > I could actually reuse x86 callbacks, just wanted to keep POWER ioctls > > together. The problem here is > > getting the DMA window parameters and avoiding MemoryListener. > > The callback is pretty trivial though, filling in the data structure and > calling the ioctl. We have different data structures and different > ioctls, so probably not a lot to leverage. > > > > ioctl(fd, SPAPR_TCE_IOMMU_GET_INFO, &info))did > > > > > > container->u.spapr.data->dma32_window_start = info.dma32_window_start; > > > container->u.spapr.data->dma32_window_size = info.dma32_window_size; > > > > > > spapr_register_vfio_container(&container->u.spapr.data) > > > > I assume it is called within vfio_pci.c as we do not want to access > > VFIOContainer members from > > anywhere but vfio_pci.c. > > Or we are changing the approach? I am a bit confused. > > Yes, the registration function would be called from vfio_pci at the > equivalent place in the spar iommu test as x86 is setting up the memory > listener. That would register the map/unmap function pointers and dma > window information. spapr would then make map/unmap calls using those > function pointers, those would be implemented in vfio_pci where they > could dereference the container and therefore get to the container fd. > > > > Then vfio_disconnect_container() could call > > > spapr_unregister_vfio_container(). Maybe the container contains a > > > function pointer to an uninit function so we don't have to ifdef between > > > x86 and power. Does that make sense? Thanks, > > > > We also need to pass the numbers from the info struct to spapr_pci.c in > > order to tell the guest > > where the DMA window besides. Another callback? This exactly what I avoided > > in the kernel when we > > decided not to extend IOMMU API with POWER stuff, I would like to have the > > same here. > > This in an internal API, there's no penalty for fixing it later. > callbacks and window info are passed in the data structure outlined > above. > > > In general, what is good in pulling to VFIO as much platform specific stuff > > as possible? > > > > I am trying to keep sPAPR IOMMU stuff away and make it easy to add new > > platforms to VFIO. > > > > For example, I would rather think of moving the piece of code which checks > > for SPAPR_TCE_IOMMU out > > of VFIO, make it a QEMUMachine callback (together with add-eoi-notifier) as > > the way IOMMU works is > > definitely the specific machine type feature. > > > > For example, int QEMUMachine::init_iommu(VFIOContainter *container) which > > would not even try > > VFIO_TYPE1_IOMMU on POWER or SPAPR_TCE_IOMMU on x86 as it knows the machine > > and IOMMU types already. > > I don't think tying vfio into the QEMUMachine type has a future. If you > convince Anthony or Michael otherwise, let me know. Your attempt to > keep spapr stuff completely out of vfio is requiring private interfaces > to be exposed. That I think is the wrong direction. > > > And do something like: > > typedef struct VFIOContainer { > > int fd; > > void *platform_iommu_data; > > } VFIOContainer; > > > > Create additional file called vfio_iommu_x86.c with: > > struct VFIO_Type1_IOMMU { > > MemoryListener listener; > > QLIST_HEAD(, VFIOGroup) group_list; > > QLIST_ENTRY(VFIOContainer) next; > > }; > > and put all MemoryListener stuff there. > > > > For POWER we already have spapr_iommu.c. > > > > Wrong direction? :) > > This is actually very similar to what I'm proposing above. Rather than > a platform_iommu_data pointer, I'm suggesting that we make a union in > VFIOContainer. That allows us to actually dereference the container and > get back to the fd in the callback functions. Thanks,
Here's a stab at generalizing the way it works on x86. The commit comment outlines how I think it should be extended. https://github.com/awilliam/qemu-vfio/commit/96e6f8446fc10a8f9d0506df8230218799de31ae Thanks, Alex