Two comments below. On 13/07/12 06:54, Blue Swirl wrote: > On Thu, Jul 12, 2012 at 8:52 AM, Alexey Kardashevskiy <a...@ozlabs.ru> wrote: >> It literally does the following: >> >> 1. POWERPC IOMMU support (the kernel counterpart is required) >> >> 2. The patch assumes that IOAPIC calls are going to be replaced >> with something generic. I have something in my local git but it's >> too early, we need to extend PCIINTxRoute first. >> >> 3. vfio_get_group() made public. I want to open IOMMU group from >> the sPAPR code to have everything I need for VFIO on sPAPR and >> avoid ugly workarounds with finilizing PHB setup on sPAPR. >> >> 4. Change sPAPR PHB to scan the PCI bus which is used for >> the IOMMU-VFIO group. Now it is enough to add the following to >> the QEMU command line to get VFIO up with all the devices from >> IOMMU group with id=3: >> -device spapr-pci-host-bridge,busname=E1000E,buid=0x3,iommu=3,\ >> mem_win_addr=0x230000000000,io_win_addr=0x240000000000,msi_win_addr=0x250000000000 >> >> Signed-off-by: Alexey Kardashevskiy <a...@ozlabs.ru> >> --- >> hw/ppc/Makefile.objs | 3 ++ >> hw/spapr.h | 4 ++ >> hw/spapr_iommu.c | 87 ++++++++++++++++++++++++++++++++++++++ >> hw/spapr_pci.c | 115 >> +++++++++++++++++++++++++++++++++++++++++++++++--- >> hw/spapr_pci.h | 5 +++ >> hw/vfio_pci.c | 28 +++++++++++- >> hw/vfio_pci.h | 2 + >> 7 files changed, 237 insertions(+), 7 deletions(-) >> >> diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs >> index f573a95..c46a049 100644 >> --- a/hw/ppc/Makefile.objs >> +++ b/hw/ppc/Makefile.objs >> @@ -25,4 +25,7 @@ obj-$(CONFIG_FDT) += ../device_tree.o >> # Xilinx PPC peripherals >> obj-y += xilinx_ethlite.o >> >> +# VFIO PCI device assignment >> +obj-$(CONFIG_VFIO_PCI) += vfio_pci.o >> + >> obj-y := $(addprefix ../,$(obj-y)) >> diff --git a/hw/spapr.h b/hw/spapr.h >> index b37f337..9dca704 100644 >> --- a/hw/spapr.h >> +++ b/hw/spapr.h >> @@ -340,4 +340,8 @@ int spapr_dma_dt(void *fdt, int node_off, const char >> *propname, >> int spapr_tcet_dma_dt(void *fdt, int node_off, const char *propname, >> DMAContext *dma); >> >> +void spapr_vfio_init_dma(int fd, uint32_t liobn, >> + uint64_t *dma32_window_start, >> + uint64_t *dma32_window_size); >> + >> #endif /* !defined (__HW_SPAPR_H__) */ >> diff --git a/hw/spapr_iommu.c b/hw/spapr_iommu.c >> index 50c288d..0a194e8 100644 >> --- a/hw/spapr_iommu.c >> +++ b/hw/spapr_iommu.c >> @@ -16,6 +16,8 @@ >> * You should have received a copy of the GNU Lesser General Public >> * License along with this library; if not, see >> <http://www.gnu.org/licenses/>. >> */ >> +#include <sys/ioctl.h> >> + >> #include "hw.h" >> #include "kvm.h" >> #include "qdev.h" >> @@ -23,6 +25,7 @@ >> #include "dma.h" >> >> #include "hw/spapr.h" >> +#include "hw/linux-vfio.h" >> >> #include <libfdt.h> >> >> @@ -183,6 +186,86 @@ static int put_tce_emu(target_ulong liobn, target_ulong >> ioba, target_ulong tce) >> return 0; >> } >> >> +/* -------- API for POWERPC IOMMU -------- */ >> + >> +#define POWERPC_IOMMU 2 >> + >> +struct tce_iommu_info { > > CamelCase. > >> + __u32 argsz; >> + __u32 dma32_window_start; >> + __u32 dma32_window_size; > > Please use uint32_t. > >> +}; >> + >> +#define POWERPC_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12) >> + >> +struct tce_iommu_dma_map { >> + __u32 argsz; > > The structure may or may not be padded here since there's no > QEMU_PACKED attribute. If possible, just rearrange the fields. > >> + __u64 va; >> + __u64 dmaaddr; >> +}; >> + >> +#define POWERPC_IOMMU_MAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 13) >> +#define POWERPC_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14) >> + >> +typedef struct sPAPRVFIOTable { >> + int fd; >> + uint32_t liobn; >> + QLIST_ENTRY(sPAPRVFIOTable) list; >> +} sPAPRVFIOTable; >> + >> +QLIST_HEAD(vfio_tce_tables, sPAPRVFIOTable) vfio_tce_tables; >> + >> +void spapr_vfio_init_dma(int fd, uint32_t liobn, >> + uint64_t *dma32_window_start, >> + uint64_t *dma32_window_size) >> +{ >> + sPAPRVFIOTable *t; >> + struct tce_iommu_info info = { .argsz = sizeof(info) }; >> + >> + if (ioctl(fd, POWERPC_IOMMU_GET_INFO, &info)) { >> + fprintf(stderr, "POWERPC_IOMMU_GET_INFO failed %d\n", errno); >> + return; >> + } >> + *dma32_window_start = info.dma32_window_start; >> + *dma32_window_size = info.dma32_window_size; >> + >> + t = g_malloc0(sizeof(*t)); >> + t->fd = fd; >> + t->liobn = liobn; >> + >> + QLIST_INSERT_HEAD(&vfio_tce_tables, t, list); >> +} >> + >> +static int put_tce_vfio(uint32_t liobn, target_ulong ioba, target_ulong tce) >> +{ >> + sPAPRVFIOTable *t; >> + struct tce_iommu_dma_map map = { >> + .argsz = sizeof(map), >> + .va = 0, >> + .dmaaddr = ioba, >> + }; >> + >> + QLIST_FOREACH(t, &vfio_tce_tables, list) { >> + if (t->liobn != liobn) { >> + continue; >> + } >> + if (tce) { >> + map.va = (uintptr_t)qemu_get_ram_ptr(tce & >> ~SPAPR_TCE_PAGE_MASK); >> + if (ioctl(t->fd, POWERPC_IOMMU_MAP_DMA, &map)) { >> + fprintf(stderr, "TCE_MAP_DMA: %d\n", errno); > > perror()? > >> + return H_PARAMETER; >> + } >> + } else { >> + if (ioctl(t->fd, POWERPC_IOMMU_UNMAP_DMA, &map)) { >> + fprintf(stderr, "TCE_UNMAP_DMA: %d\n", errno); >> + return H_PARAMETER; >> + } >> + } >> + return H_SUCCESS; >> + } >> + return H_CONTINUE; /* positive non-zero value */ >> +} >> + >> static target_ulong h_put_tce(CPUPPCState *env, sPAPREnvironment *spapr, >> target_ulong opcode, target_ulong *args) >> { >> @@ -203,6 +286,10 @@ static target_ulong h_put_tce(CPUPPCState *env, >> sPAPREnvironment *spapr, >> if (0 >= ret) { >> return ret ? H_PARAMETER : H_SUCCESS; >> } >> + ret = put_tce_vfio(liobn, ioba, tce); >> + if (0 >= ret) { > > This order in expressions is not common, please reverse. > >> + return ret ? H_PARAMETER : H_SUCCESS; >> + } >> #ifdef DEBUG_TCE >> fprintf(stderr, "%s on liobn=" TARGET_FMT_lx >> " ioba 0x" TARGET_FMT_lx " TCE 0x" TARGET_FMT_lx " ret=%d\n", >> diff --git a/hw/spapr_pci.c b/hw/spapr_pci.c >> index 014297b..92c48b6 100644 >> --- a/hw/spapr_pci.c >> +++ b/hw/spapr_pci.c >> @@ -22,6 +22,9 @@ >> * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN >> * THE SOFTWARE. >> */ >> +#include <sys/types.h> >> +#include <dirent.h> >> + >> #include "hw.h" >> #include "pci.h" >> #include "msi.h" >> @@ -29,10 +32,10 @@ >> #include "pci_host.h" >> #include "hw/spapr.h" >> #include "hw/spapr_pci.h" >> +#include "hw/vfio_pci.h" >> #include "exec-memory.h" >> #include <libfdt.h> >> #include "trace.h" >> - >> #include "hw/pci_internals.h" >> >> /* Copied from the kernel arch/powerpc/platforms/pseries/msi.c */ >> @@ -440,6 +443,12 @@ static void pci_spapr_set_irq(void *opaque, int >> irq_num, int level) >> level); >> } >> >> +static int pci_spapr_get_irq(void *opaque, int irq_num) >> +{ >> + sPAPRPHBState *phb = opaque; >> + return phb->lsi_table[irq_num].dt_irq; >> +} >> + >> static uint64_t spapr_io_read(void *opaque, target_phys_addr_t addr, >> unsigned size) >> { >> @@ -515,6 +524,82 @@ static DMAContext *spapr_pci_dma_context_fn(PCIBus >> *bus, void *opaque, >> return phb->dma; >> } >> >> +static int spapr_pci_scan_vfio(sPAPRPHBState *phb) >> +{ >> + char iommupath[256]; >> + DIR *dirp; >> + struct dirent *entry; >> + >> + phb->iommugroup = vfio_get_group(phb->iommugroupid); >> + if (!phb->iommugroup) { >> + fprintf(stderr, "Cannot open IOMMU group %d\n", phb->iommugroupid); >> + return -1; >> + } >> + >> + if (!phb->scan) { >> + printf("Autoscan disabled\n"); >> + return 0; >> + } >> + >> + sprintf(iommupath, "/sys/kernel/iommu_groups/%d/devices/", >> phb->iommugroupid); > > Please use snprintf() or g_strdup_printf(). > > >> + dirp = opendir(iommupath); >> + >> + while ((entry = readdir(dirp)) != NULL) { >> + char *tmp = alloca(strlen(iommupath) + strlen(entry->d_name) + 32); >> + FILE *deviceclassfile; >> + unsigned deviceclass = 0, domainid, busid, devid, fnid; >> + char addr[32]; >> + DeviceState *dev; >> + >> + if (4 != sscanf(entry->d_name, "%X:%X:%X.%x", > > Please put the constant last. > >> + &domainid, &busid, &devid, &fnid)) { >> + continue; >> + } >> + >> + sprintf(tmp, "%s%s/class", iommupath, entry->d_name); > > Again, snprintf() or g_strdup_printf() (which avoids the alloca() too). > >> + printf("Reading device class from %s\n", tmp); > > Leftover debugging? > >> + >> + deviceclassfile = fopen(tmp, "r"); >> + if (deviceclassfile) { >> + fscanf(deviceclassfile, "%x", &deviceclass); >> + fclose(deviceclassfile); >> + } >> + if (!deviceclass) { >> + continue; >> + } >> +#define PCI_BASE_CLASS_BRIDGE 0x06 > > This belongs to pci_ids.h.
It should but it is not. It is way smaller than the same one in the kernel. >> + if ((phb->scan < 2) && ((deviceclass >> 16) == >> PCI_BASE_CLASS_BRIDGE)) { >> + continue; >> + } >> + if ((deviceclass == 0xc0310) || (deviceclass == 0xc0320)) { >> + /* Tweak USB */ >> + phb->force_addr = 1; >> + phb->enable_multifunction = 1; >> + } >> + >> + printf("Creating device %X:%X:%X.%x class=0x%X\n", >> + domainid, busid, devid, fnid, deviceclass); > > Lower case hex, please. > >> + >> + dev = qdev_create(&phb->host_state.bus->qbus, "vfio-pci"); >> + if (!dev) { >> + fprintf(stderr, "failed to create vfio-pci\n"); >> + continue; >> + } >> + qdev_prop_parse(dev, "host", entry->d_name); >> + if (phb->force_addr) { >> + sprintf(addr, "%X.%X", devid, fnid); > > snprintf, lower case hex. > >> + qdev_prop_parse(dev, "addr", addr); >> + } >> + if (phb->enable_multifunction) { >> + qdev_prop_set_bit(dev, "multifunction", 1); >> + } >> + qdev_init_nofail(dev); >> + } >> + closedir(dirp); >> + >> + return 0; >> +} >> + >> static int spapr_phb_init(SysBusDevice *s) >> { >> sPAPRPHBState *phb = DO_UPCAST(sPAPRPHBState, host_state.busdev, s); >> @@ -567,15 +652,13 @@ static int spapr_phb_init(SysBusDevice *s) >> >> bus = pci_register_bus(&phb->host_state.busdev.qdev, >> phb->busname ? phb->busname : phb->dtbusname, >> - pci_spapr_set_irq, NULL, pci_spapr_map_irq, phb, >> + pci_spapr_set_irq, pci_spapr_get_irq, >> + pci_spapr_map_irq, phb, >> &phb->memspace, &phb->iospace, >> PCI_DEVFN(0, 0), PCI_NUM_PINS); >> phb->host_state.bus = bus; >> >> phb->dma_liobn = SPAPR_PCI_BASE_LIOBN | (pci_find_domain(bus) << 16); >> - phb->dma_window_start = 0; >> - phb->dma_window_size = 0x40000000; >> - phb->dma = spapr_tce_new_dma_context(phb->dma_liobn, >> phb->dma_window_size); >> pci_setup_iommu(bus, spapr_pci_dma_context_fn, phb); >> >> QLIST_INSERT_HEAD(&spapr->phbs, phb, list); >> @@ -588,6 +671,24 @@ static int spapr_phb_init(SysBusDevice *s) >> } >> } >> >> + if (phb->iommugroupid >= 0) { >> + if (0 > spapr_pci_scan_vfio(phb)) { > > Order. > >> + return -1; >> + } >> + if (!phb->iommugroup || !phb->iommugroup->container) { >> + return -1; >> + } >> + spapr_vfio_init_dma(phb->iommugroup->container->fd, phb->dma_liobn, >> + &phb->dma_window_start, >> + &phb->dma_window_size); >> + return 0; >> + } >> + >> + phb->dma_window_start = 0; >> + phb->dma_window_size = 0x40000000; >> + phb->dma = spapr_tce_new_dma_context(phb->dma_liobn, >> + phb->dma_window_size); >> + >> return 0; >> } >> >> @@ -599,6 +700,10 @@ static Property spapr_phb_properties[] = { >> DEFINE_PROP_HEX64("io_win_addr", sPAPRPHBState, io_win_addr, 0), >> DEFINE_PROP_HEX64("io_win_size", sPAPRPHBState, io_win_size, 0x10000), >> DEFINE_PROP_HEX64("msi_win_addr", sPAPRPHBState, msi_win_addr, 0), >> + DEFINE_PROP_INT32("iommu", sPAPRPHBState, iommugroupid, -1), >> + DEFINE_PROP_UINT8("scan", sPAPRPHBState, scan, 1), /* 0 don't 1 >> +devices 2 +buses */ >> + DEFINE_PROP_UINT8("mf", sPAPRPHBState, enable_multifunction, 0), >> + DEFINE_PROP_UINT8("forceaddr", sPAPRPHBState, force_addr, 0), >> DEFINE_PROP_END_OF_LIST(), >> }; >> >> diff --git a/hw/spapr_pci.h b/hw/spapr_pci.h >> index 145071c..1953a74 100644 >> --- a/hw/spapr_pci.h >> +++ b/hw/spapr_pci.h >> @@ -57,6 +57,11 @@ typedef struct sPAPRPHBState { >> int nvec; >> } msi_table[SPAPR_MSIX_MAX_DEVS]; >> >> + int32_t iommugroupid; >> + struct VFIOGroup *iommugroup; >> + uint8_t scan; /* 0 don't scan 1 scan only devices 2 scan everything */ >> + uint8_t enable_multifunction, force_addr; > > Use bool for both? There is no DEFINE_PROP_BOOL. There is DEFINE_PROP_BIT but it works with bits, not bools. uint8_t is the closest one. >> + >> QLIST_ENTRY(sPAPRPHBState) list; >> } sPAPRPHBState; >> >> diff --git a/hw/vfio_pci.c b/hw/vfio_pci.c >> index 1ac287f..73681fb 100644 >> --- a/hw/vfio_pci.c >> +++ b/hw/vfio_pci.c >> @@ -21,7 +21,6 @@ >> #include <dirent.h> >> #include <stdio.h> >> #include <unistd.h> >> -#include <sys/io.h> >> #include <sys/ioctl.h> >> #include <sys/mman.h> >> #include <sys/types.h> >> @@ -43,6 +42,12 @@ >> #include "range.h" >> #include "vfio_pci.h" >> #include "linux-vfio.h" >> +#ifndef TARGET_PPC64 >> +#include <sys/io.h> >> +#else >> +#include "hw/pci_internals.h" >> +#include "hw/spapr.h" >> +#endif >> >> //#define DEBUG_VFIO >> #ifdef DEBUG_VFIO >> @@ -1581,6 +1586,25 @@ static int vfio_connect_container(VFIOGroup *group) >> >> memory_listener_register(&container->listener, get_system_memory()); >> >> +#define POWERPC_IOMMU 2 >> + } else if (ioctl(fd, VFIO_CHECK_EXTENSION, POWERPC_IOMMU)) { >> + ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd); >> + if (ret) { >> + error_report("vfio: failed to set group container: %s\n", >> + strerror(errno)); >> + g_free(container); >> + close(fd); >> + return -1; >> + } >> + >> + ret = ioctl(fd, VFIO_SET_IOMMU, POWERPC_IOMMU); >> + if (ret) { >> + error_report("vfio: failed to set iommu for container: %s\n", >> + strerror(errno)); >> + g_free(container); >> + close(fd); >> + return -1; >> + } >> } else { >> error_report("vfio: No available IOMMU models\n"); >> g_free(container); >> @@ -1620,7 +1644,7 @@ static void vfio_disconnect_container(VFIOGroup *group) >> } >> } >> >> -static VFIOGroup *vfio_get_group(int groupid) >> +VFIOGroup *vfio_get_group(int groupid) >> { >> VFIOGroup *group; >> char path[32]; >> diff --git a/hw/vfio_pci.h b/hw/vfio_pci.h >> index 226607c..d63dd63 100644 >> --- a/hw/vfio_pci.h >> +++ b/hw/vfio_pci.h >> @@ -105,4 +105,6 @@ typedef struct VFIOGroup { >> #define VFIO_FLAG_IOMMU_SHARED_BIT 0 >> #define VFIO_FLAG_IOMMU_SHARED (1U << VFIO_FLAG_UIOMMU_SHARED_BIT) >> >> +VFIOGroup *vfio_get_group(int groupid); >> + >> #endif /* __VFIO_H__ */ >> -- >> 1.7.10 >> >> -- Alexey