On 09/06/2013 04:49 AM, Alex Williamson wrote: > On Fri, 2013-08-30 at 20:15 +1000, Alexey Kardashevskiy wrote: >> From: David Gibson <da...@gibson.dropbear.id.au> >> >> This patch uses the new IOMMU notifiers to allow VFIO pass through devices >> to work with guest side IOMMUs, as long as the host-side VFIO iommu has >> sufficient capability and granularity to match the guest side. This works >> by tracking all map and unmap operations on the guest IOMMU using the >> notifiers, and mirroring them into VFIO. >> >> There are a number of FIXMEs, and the scheme involves rather more notifier >> structures than I'd like, but it should make for a reasonable proof of >> concept. >> >> Signed-off-by: David Gibson <da...@gibson.dropbear.id.au> >> Signed-off-by: Alexey Kardashevskiy <a...@ozlabs.ru> >> >> --- >> Changes: >> v4: >> * fixed list objects naming >> * vfio_listener_region_add() reworked to call memory_region_ref() from one >> place only, it is also easier to review the changes >> * fixes boundary check not to fail on sections == 2^64 bytes, >> the "vfio: Fix debug output for int128 values" patch is required; >> this obsoletes the "[PATCH v3 0/3] vfio: fixes for better support >> for 128 bit memory section sizes" patch proposal >> --- >> hw/misc/vfio.c | 137 >> +++++++++++++++++++++++++++++++++++++++++++++++++++++---- >> 1 file changed, 128 insertions(+), 9 deletions(-) >> >> diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c >> index c16f41b..53791fb 100644 >> --- a/hw/misc/vfio.c >> +++ b/hw/misc/vfio.c >> @@ -150,10 +150,18 @@ typedef struct VFIOContainer { >> }; >> void (*release)(struct VFIOContainer *); >> } iommu_data; >> + QLIST_HEAD(, VFIOGuestIOMMU) giommu_list; >> QLIST_HEAD(, VFIOGroup) group_list; >> QLIST_ENTRY(VFIOContainer) next; >> } VFIOContainer; >> >> +typedef struct VFIOGuestIOMMU { >> + VFIOContainer *container; >> + MemoryRegion *iommu; >> + Notifier n; >> + QLIST_ENTRY(VFIOGuestIOMMU) giommu_next; >> +} VFIOGuestIOMMU; >> + >> /* Cache of MSI-X 
setup plus extra mmap and memory region for split BAR map >> */ >> typedef struct VFIOMSIXInfo { >> uint8_t table_bar; >> @@ -1917,7 +1925,63 @@ static int vfio_dma_map(VFIOContainer *container, >> hwaddr iova, >> >> static bool vfio_listener_skipped_section(MemoryRegionSection *section) >> { >> - return !memory_region_is_ram(section->mr); >> + return !memory_region_is_ram(section->mr) && >> + !memory_region_is_iommu(section->mr); >> +} >> + >> +static void vfio_iommu_map_notify(Notifier *n, void *data) >> +{ >> + VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n); >> + VFIOContainer *container = giommu->container; >> + IOMMUTLBEntry *iotlb = data; >> + MemoryRegion *mr; >> + hwaddr xlat; >> + hwaddr len = iotlb->addr_mask + 1; >> + void *vaddr; >> + int ret; >> + >> + DPRINTF("iommu map @ %"HWADDR_PRIx" - %"HWADDR_PRIx"\n", >> + iotlb->iova, iotlb->iova + iotlb->addr_mask); >> + >> + /* >> + * The IOMMU TLB entry we have just covers translation through >> + * this IOMMU to its immediate target. We need to translate >> + * it the rest of the way through to memory. 
>> + */ >> + mr = address_space_translate(&address_space_memory, >> + iotlb->translated_addr, >> + &xlat, &len, iotlb->perm & IOMMU_WO); >> + if (!memory_region_is_ram(mr)) { >> + DPRINTF("iommu map to non memory area %"HWADDR_PRIx"\n", >> + xlat); >> + return; >> + } >> + if (len & iotlb->addr_mask) { >> + DPRINTF("iommu has granularity incompatible with target AS\n"); >> + return; >> + } >> + >> + vaddr = memory_region_get_ram_ptr(mr) + xlat; >> + >> + if (iotlb->perm != IOMMU_NONE) { >> + ret = vfio_dma_map(container, iotlb->iova, >> + iotlb->addr_mask + 1, vaddr, >> + !(iotlb->perm & IOMMU_WO) || mr->readonly); >> + if (ret) { >> + error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", " >> + "0x%"HWADDR_PRIx", %p) = %d (%m)", >> + container, iotlb->iova, >> + iotlb->addr_mask + 1, vaddr, ret); >> + } >> + } else { >> + ret = vfio_dma_unmap(container, iotlb->iova, iotlb->addr_mask + 1); >> + if (ret) { >> + error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", " >> + "0x%"HWADDR_PRIx") = %d (%m)", >> + container, iotlb->iova, >> + iotlb->addr_mask + 1, ret); >> + } >> + } >> } >> >> static void vfio_listener_region_add(MemoryListener *listener, >> @@ -1926,11 +1990,10 @@ static void vfio_listener_region_add(MemoryListener >> *listener, >> VFIOContainer *container = container_of(listener, VFIOContainer, >> iommu_data.listener); >> hwaddr iova, end; >> + Int128 llend; >> void *vaddr; >> int ret; >> >> - assert(!memory_region_is_iommu(section->mr)); >> - >> if (vfio_listener_skipped_section(section)) { >> DPRINTF("SKIPPING region_add %"HWADDR_PRIx" - %"PRIx64"\n", >> section->offset_within_address_space, >> @@ -1946,21 +2009,57 @@ static void vfio_listener_region_add(MemoryListener >> *listener, >> } >> >> iova = TARGET_PAGE_ALIGN(section->offset_within_address_space); >> + llend = int128_make64(section->offset_within_address_space); >> + llend = int128_add(llend, section->size); >> + llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK)); > > So you're still looking 
for me to pick up patches 1/3 & 2/3 from the > previous int128 series for this?
Oops. Just noticed the question. Yes, please. Your tree is a lot faster :) >> + >> + if (int128_ge(int128_make64(iova), llend)) { >> + return; >> + } >> + >> + memory_region_ref(section->mr); >> + >> + if (memory_region_is_iommu(section->mr)) { >> + VFIOGuestIOMMU *giommu; >> + >> + DPRINTF("region_add [iommu] %"HWADDR_PRIx" - %"HWADDR_PRIx"\n", >> + iova, int128_get64(int128_sub(llend, int128_one()))); >> + /* >> + * FIXME: We should do some checking to see if the >> + * capabilities of the host VFIO IOMMU are adequate to model >> + * the guest IOMMU >> + * >> + * FIXME: This assumes that the guest IOMMU is empty of >> + * mappings at this point - we should either enforce this, or >> + * loop through existing mappings to map them into VFIO. > > I would hope that when you do the below > memory_region_register_iommu_notifier() the callback gets a replay of > the current state of the iommu like we do for a memory region when we > register the listener that gets us here. Is that not the case? >From what I see, it just adds a notifier: void memory_region_register_iommu_notifier(MemoryRegion *mr, Notifier *n) { notifier_list_add(&mr->iommu_notify, n); } >> + * >> + * FIXME: For VFIO iommu types which have KVM acceleration to >> + * avoid bouncing all map/unmaps through qemu this way, this >> + * would be the right place to wire that up (tell the KVM >> + * device emulation the VFIO iommu handles to use). >> + */ >> + giommu = g_malloc0(sizeof(*giommu)); >> + giommu->iommu = section->mr; >> + giommu->container = container; >> + giommu->n.notify = vfio_iommu_map_notify; >> + QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next); >> + memory_region_register_iommu_notifier(giommu->iommu, &giommu->n); >> + >> + return; >> + } >> + >> + /* Here we assume that memory_region_is_ram(section->mr)==true */ >> + >> end = (section->offset_within_address_space + >> int128_get64(section->size)) & >> TARGET_PAGE_MASK; > > Why isn't this now calculated from llend? 
I did not want to change the existing code :) I will fix it. > I thought that was part of > why the int128 stuff was rolled in here. Not sure that I understood the note. All I wanted here was to avoid calling int128_get64() for the IOMMU case. -- Alexey