On Wed, 2014-03-12 at 16:52 +1100, Alexey Kardashevskiy wrote: > From: David Gibson <da...@gibson.dropbear.id.au> > > This patch uses the new IOMMU notifiers to allow VFIO pass through devices > to work with guest side IOMMUs, as long as the host-side VFIO iommu has > sufficient capability and granularity to match the guest side. This works > by tracking all map and unmap operations on the guest IOMMU using the > notifiers, and mirroring them into VFIO. > > There are a number of FIXMEs, and the scheme involves rather more notifier > structures than I'd like, but it should make for a reasonable proof of > concept. > > Signed-off-by: David Gibson <da...@gibson.dropbear.id.au> > Signed-off-by: Alexey Kardashevskiy <a...@ozlabs.ru> > > --- > Changes: > v4: > * fixed list objects naming > * vfio_listener_region_add() reworked to call memory_region_ref() from one > place only, it is also easier to review the changes > * fixes boundary check not to fail on sections == 2^64 bytes, > the "vfio: Fix debug output for int128 values" patch is required; > this obsoletes the "[PATCH v3 0/3] vfio: fixes for better support > for 128 bit memory section sizes" patch proposal > --- > hw/misc/vfio.c | 126 > ++++++++++++++++++++++++++++++++++++++++++++++++++++++--- > 1 file changed, 120 insertions(+), 6 deletions(-) > > diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c > index 038010b..4f6f5da 100644 > --- a/hw/misc/vfio.c > +++ b/hw/misc/vfio.c > @@ -159,10 +159,18 @@ typedef struct VFIOContainer { > }; > void (*release)(struct VFIOContainer *); > } iommu_data; > + QLIST_HEAD(, VFIOGuestIOMMU) giommu_list; > QLIST_HEAD(, VFIOGroup) group_list; > QLIST_ENTRY(VFIOContainer) next; > } VFIOContainer; > > +typedef struct VFIOGuestIOMMU { > + VFIOContainer *container; > + MemoryRegion *iommu; > + Notifier n; > + QLIST_ENTRY(VFIOGuestIOMMU) giommu_next; > +} VFIOGuestIOMMU; > + > /* Cache of MSI-X setup plus extra mmap and memory region for split BAR map > */ > typedef struct VFIOMSIXInfo { > 
uint8_t table_bar; > @@ -2241,8 +2249,9 @@ static int vfio_dma_map(VFIOContainer *container, > hwaddr iova, > > static bool vfio_listener_skipped_section(MemoryRegionSection *section) > { > - return !memory_region_is_ram(section->mr) || > - /* > + return (!memory_region_is_ram(section->mr) && > + !memory_region_is_iommu(section->mr)) || > + /*
Whitespace damage. > * Sizing an enabled 64-bit BAR can cause spurious mappings to > * addresses in the upper part of the 64-bit address space. These > * are never accessed by the CPU and beyond the address width of > @@ -2251,6 +2260,61 @@ static bool > vfio_listener_skipped_section(MemoryRegionSection *section) > section->offset_within_address_space & (1ULL << 63); > } > > +static void vfio_iommu_map_notify(Notifier *n, void *data) > +{ > + VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n); > + VFIOContainer *container = giommu->container; > + IOMMUTLBEntry *iotlb = data; > + MemoryRegion *mr; > + hwaddr xlat; > + hwaddr len = iotlb->addr_mask + 1; > + void *vaddr; > + int ret; > + > + DPRINTF("iommu map @ %"HWADDR_PRIx" - %"HWADDR_PRIx"\n", > + iotlb->iova, iotlb->iova + iotlb->addr_mask); > + > + /* > + * The IOMMU TLB entry we have just covers translation through > + * this IOMMU to its immediate target. We need to translate > + * it the rest of the way through to memory. > + */ > + mr = address_space_translate(&address_space_memory, > + iotlb->translated_addr, > + &xlat, &len, iotlb->perm & IOMMU_WO); Write-only? Is this supposed to be read-write to mask just 2 bits? > + if (!memory_region_is_ram(mr)) { > + DPRINTF("iommu map to non memory area %"HWADDR_PRIx"\n", > + xlat); > + return; > + } > + if (len & iotlb->addr_mask) { > + DPRINTF("iommu has granularity incompatible with target AS\n"); Is this possible? Assuming len is initially a power of 2, would the translate function change it? It may be worth adding a comment to explain. > + return; > + } > + > + vaddr = memory_region_get_ram_ptr(mr) + xlat; This lookup isn't free and the unmap path doesn't need it; maybe move the variable and lookup into the first branch below? 
> + > + if (iotlb->perm != IOMMU_NONE) { > + ret = vfio_dma_map(container, iotlb->iova, > + iotlb->addr_mask + 1, vaddr, > + !(iotlb->perm & IOMMU_WO) || mr->readonly); > + if (ret) { > + error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", " > + "0x%"HWADDR_PRIx", %p) = %d (%m)", > + container, iotlb->iova, > + iotlb->addr_mask + 1, vaddr, ret); > + } > + } else { > + ret = vfio_dma_unmap(container, iotlb->iova, iotlb->addr_mask + 1); > + if (ret) { > + error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", " > + "0x%"HWADDR_PRIx") = %d (%m)", > + container, iotlb->iova, > + iotlb->addr_mask + 1, ret); > + } > + } > +} > + > static void vfio_listener_region_add(MemoryListener *listener, > MemoryRegionSection *section) > { > @@ -2261,8 +2325,6 @@ static void vfio_listener_region_add(MemoryListener > *listener, > void *vaddr; > int ret; > > - assert(!memory_region_is_iommu(section->mr)); > - > if (vfio_listener_skipped_section(section)) { > DPRINTF("SKIPPING region_add %"HWADDR_PRIx" - %"PRIx64"\n", > section->offset_within_address_space, > @@ -2286,15 +2348,47 @@ static void vfio_listener_region_add(MemoryListener > *listener, > return; > } > > + memory_region_ref(section->mr); > + > + if (memory_region_is_iommu(section->mr)) { > + VFIOGuestIOMMU *giommu; > + > + DPRINTF("region_add [iommu] %"HWADDR_PRIx" - %"HWADDR_PRIx"\n", > + iova, int128_get64(int128_sub(llend, int128_one()))); > + /* > + * FIXME: We should do some checking to see if the > + * capabilities of the host VFIO IOMMU are adequate to model > + * the guest IOMMU > + * > + * FIXME: This assumes that the guest IOMMU is empty of > + * mappings at this point - we should either enforce this, or > + * loop through existing mappings to map them into VFIO. > + * > + * FIXME: For VFIO iommu types which have KVM acceleration to > + * avoid bouncing all map/unmaps through qemu this way, this > + * would be the right place to wire that up (tell the KVM > + * device emulation the VFIO iommu handles to use). 
> + */ That's a lot of FIXMEs... The second one in particular looks like it needs to expand a bit on why this is likely a valid assumption. The last one is more of a TODO than a FIXME. > + giommu = g_malloc0(sizeof(*giommu)); > + giommu->iommu = section->mr; > + giommu->container = container; > + giommu->n.notify = vfio_iommu_map_notify; > + QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next); > + memory_region_register_iommu_notifier(giommu->iommu, &giommu->n); > + > + return; > + } > + > + /* Here we assume that memory_region_is_ram(section->mr)==true */ > + > end = int128_get64(llend); > vaddr = memory_region_get_ram_ptr(section->mr) + > section->offset_within_region + > (iova - section->offset_within_address_space); > > - DPRINTF("region_add %"HWADDR_PRIx" - %"HWADDR_PRIx" [%p]\n", > + DPRINTF("region_add [ram] %"HWADDR_PRIx" - %"HWADDR_PRIx" [%p]\n", > iova, end - 1, vaddr); > > - memory_region_ref(section->mr); > ret = vfio_dma_map(container, iova, end - iova, vaddr, > section->readonly); > if (ret) { > error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", " > @@ -2338,6 +2432,26 @@ static void vfio_listener_region_del(MemoryListener > *listener, > return; > } > > + if (memory_region_is_iommu(section->mr)) { > + VFIOGuestIOMMU *giommu; > + > + QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) { > + if (giommu->iommu == section->mr) { > + memory_region_unregister_iommu_notifier(&giommu->n); > + QLIST_REMOVE(giommu, giommu_next); > + g_free(giommu); > + break; > + } > + } > + > + /* > + * FIXME: We assume the one big unmap below is adequate to > + * remove any individual page mappings in the IOMMU which > + * might have been copied into VFIO. That may not be true for > + * all IOMMU types > + */ We assume this because the IOVA that gets unmapped is the same regardless of whether a guest IOMMU is present? 
> + } > + > iova = TARGET_PAGE_ALIGN(section->offset_within_address_space); > end = (section->offset_within_address_space + > int128_get64(section->size)) & > TARGET_PAGE_MASK;