On Mon, Apr 11, 2016 at 09:35:15AM -0600, Alex Williamson wrote: > On Sat, 9 Apr 2016 21:03:38 +0300 > "Aviv B.D." <bd.a...@gmail.com> wrote: > > > From: "Aviv Ben-David" <bd.a...@gmail.com> > > Date: Tue, 23 Feb 2016 00:24:54 +0200 > > Subject: [PATCH] IOMMU: Add Support to VFIO devices with vIOMMU present > > > > * Fix bug that prevent qemu from starting up with vIOMMU and VFIO > > device are present. > > * Advertize Cache Mode capability in iommu cap register. > > * Register every VFIO device with IOMMU state. > > * On page cache invalidation in vIOMMU, check if the domain belong to > > VFIO device and mirror the guest requests to host. > > > > Changes from previous versions: > > * remove assumption that the cache do not clears > > * fix lock up on high load. > > * refactor vtd_get_did_dev to return success return code, and actual > > domain_id via argument. > > > > Tested only on network cards (also with multiple cards at once). > > > > Signed-off-by: Aviv Ben-David <bd.a...@gmail.com> > > --- > > hw/i386/intel_iommu.c | 113 > > +++++++++++++++++++++++++++++++++++------ > > hw/i386/intel_iommu_internal.h | 3 ++ > > hw/vfio/common.c | 12 +++-- > > include/exec/memory.h | 8 ++- > > include/hw/i386/intel_iommu.h | 4 ++ > > include/hw/vfio/vfio-common.h | 1 + > > 6 files changed, 121 insertions(+), 20 deletions(-) > > > > diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c > > index 347718f..a568181 100644 > > --- a/hw/i386/intel_iommu.c > > +++ b/hw/i386/intel_iommu.c > > @@ -43,6 +43,9 @@ static int vtd_dbgflags = VTD_DBGBIT(GENERAL) | > > VTD_DBGBIT(CSR); > > #define VTD_DPRINTF(what, fmt, ...) 
do {} while (0) > > #endif > > > > +static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num, > > + uint8_t devfn, VTDContextEntry *ce); > > + > > static void vtd_define_quad(IntelIOMMUState *s, hwaddr addr, uint64_t val, > > uint64_t wmask, uint64_t w1cmask) > > { > > @@ -126,6 +129,22 @@ static uint32_t > > vtd_set_clear_mask_long(IntelIOMMUState *s, hwaddr addr, > > return new_val; > > } > > > > +static int vtd_get_did_dev(IntelIOMMUState *s, uint8_t bus_num, > > uint8_t devfn, uint16_t * domain_id) > > +{ > > + VTDContextEntry ce; > > + int ret_fr; > > + > > + assert(domain_id); > > + > > + ret_fr = vtd_dev_to_context_entry(s, bus_num, devfn, &ce); > > + if (ret_fr){ > > + return -1; > > + } > > + > > + *domain_id = VTD_CONTEXT_ENTRY_DID(ce.hi); > > + return 0; > > +} > > + > > static uint64_t vtd_set_clear_mask_quad(IntelIOMMUState *s, hwaddr addr, > > uint64_t clear, uint64_t mask) > > { > > @@ -621,7 +640,7 @@ static bool vtd_slpte_nonzero_rsvd(uint64_t slpte, > > uint32_t level) > > /* Given the @gpa, get relevant @slptep. @slpte_level will be the last > > level > > * of the translation, can be used for deciding the size of large page. > > */ > > -static int vtd_gpa_to_slpte(VTDContextEntry *ce, uint64_t gpa, bool > > is_write, > > +static int vtd_gpa_to_slpte(VTDContextEntry *ce, uint64_t gpa, > > IOMMUAccessPermissions is_write, > > uint64_t *slptep, uint32_t *slpte_level, > > bool *reads, bool *writes) > > { > > @@ -641,7 +660,19 @@ static int vtd_gpa_to_slpte(VTDContextEntry *ce, > > uint64_t gpa, bool is_write, > > } > > > > /* FIXME: what is the Atomics request here? */ > > - access_right_check = is_write ? 
VTD_SL_W : VTD_SL_R; > > + switch(is_write){ > > + case IOMMU_WRITE: > > + access_right_check = VTD_SL_W; > > + break; > > + case IOMMU_READ: > > + access_right_check = VTD_SL_R; > > + break; > > + case IOMMU_ANY: > > + access_right_check = VTD_SL_R | VTD_SL_W; > > + break; > > + default: > > + assert(0); > > + } > > > > while (true) { > > offset = vtd_gpa_level_offset(gpa, level); > > @@ -711,9 +742,9 @@ static int > > vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num, > > } > > > > if (!vtd_context_entry_present(ce)) { > > - VTD_DPRINTF(GENERAL, > > + /*VTD_DPRINTF(GENERAL, > > "error: context-entry #%"PRIu8 "(bus #%"PRIu8 ") " > > - "is not present", devfn, bus_num); > > + "is not present", devfn, bus_num);*/ > > return -VTD_FR_CONTEXT_ENTRY_P; > > } else if ((ce->hi & VTD_CONTEXT_ENTRY_RSVD_HI) || > > (ce->lo & VTD_CONTEXT_ENTRY_RSVD_LO)) { > > @@ -785,7 +816,7 @@ static inline bool vtd_is_interrupt_addr(hwaddr addr) > > * @entry: IOMMUTLBEntry that contain the addr to be translated and result > > */ > > static void vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus, > > - uint8_t devfn, hwaddr addr, bool > > is_write, > > + uint8_t devfn, hwaddr addr, > > IOMMUAccessPermissions is_write, > > IOMMUTLBEntry *entry) > > { > > IntelIOMMUState *s = vtd_as->iommu_state; > > @@ -848,12 +879,14 @@ static void > > vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus, > > is_fpd_set = ce.lo & VTD_CONTEXT_ENTRY_FPD; > > if (ret_fr) { > > ret_fr = -ret_fr; > > - if (is_fpd_set && vtd_is_qualified_fault(ret_fr)) { > > - VTD_DPRINTF(FLOG, "fault processing is disabled for DMA " > > + if (is_write != IOMMU_ANY){ > > + if (is_fpd_set && vtd_is_qualified_fault(ret_fr)) { > > + VTD_DPRINTF(FLOG, "fault processing is disabled for > > DMA " > > "requests through this context-entry " > > "(with FPD Set)"); > > - } else { > > - vtd_report_dmar_fault(s, source_id, addr, ret_fr, > > is_write); > > + } else { > > + vtd_report_dmar_fault(s, source_id, addr, 
ret_fr, > > is_write); > > + } > > } > > return; > > } > > @@ -870,11 +903,13 @@ static void > > vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus, > > &reads, &writes); > > if (ret_fr) { > > ret_fr = -ret_fr; > > - if (is_fpd_set && vtd_is_qualified_fault(ret_fr)) { > > - VTD_DPRINTF(FLOG, "fault processing is disabled for DMA > > requests " > > + if (is_write != IOMMU_ANY){ > > + if (is_fpd_set && vtd_is_qualified_fault(ret_fr)) { > > + VTD_DPRINTF(FLOG, "fault processing is disabled for > > DMA requests " > > "through this context-entry (with FPD Set)"); > > - } else { > > - vtd_report_dmar_fault(s, source_id, addr, ret_fr, is_write); > > + } else { > > + vtd_report_dmar_fault(s, source_id, addr, ret_fr, > > is_write); > > + } > > } > > return; > > } > > @@ -1016,18 +1051,58 @@ static void > > vtd_iotlb_domain_invalidate(IntelIOMMUState *s, uint16_t domain_id) > > &domain_id); > > } > > > > +static void vtd_iotlb_page_invalidate_vfio(IntelIOMMUState *s, > > uint16_t domain_id, > > + hwaddr addr, uint8_t am) > > +{ > > + VFIOGuestIOMMU * giommu; > > + > > + QLIST_FOREACH(giommu, &(s->giommu_list), iommu_next){ > > + VTDAddressSpace *vtd_as = container_of(giommu->iommu, > > VTDAddressSpace, iommu); > > + uint16_t vfio_domain_id; > > + int ret = vtd_get_did_dev(s, pci_bus_num(vtd_as->bus), > > vtd_as->devfn, &vfio_domain_id); > > + int i=0; > > + if (!ret && domain_id == vfio_domain_id){ > > + IOMMUTLBEntry entry; > > + > > + /* do vfio unmap */ > > + VTD_DPRINTF(GENERAL, "Remove addr 0x%"PRIx64 " mask %d", addr, > > am); > > + entry.target_as = NULL; > > + entry.iova = addr & VTD_PAGE_MASK_4K; > > + entry.translated_addr = 0; > > + entry.addr_mask = ~VTD_PAGE_MASK(VTD_PAGE_SHIFT_4K + am); > > + entry.perm = IOMMU_NONE; > > + memory_region_notify_iommu(giommu->iommu, entry); > > + > > + /* do vfio map */ > > + VTD_DPRINTF(GENERAL, "add addr 0x%"PRIx64 " mask %d", addr, > > am); > > + /* call to vtd_iommu_translate */ > > + for (i = 0; i < (1 << am); i++, 
addr+=(1 << > > VTD_PAGE_SHIFT_4K)){ > > + IOMMUTLBEntry entry = > > s->iommu_ops.translate(giommu->iommu, addr, IOMMU_ANY); > > + if (entry.perm != IOMMU_NONE){ > > + memory_region_notify_iommu(giommu->iommu, entry); > > + } > > + } > > + } > > + } > > +} > > + > > static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t > > domain_id, > > hwaddr addr, uint8_t am) > > { > > VTDIOTLBPageInvInfo info; > > > > assert(am <= VTD_MAMV); > > + > > info.domain_id = domain_id; > > info.addr = addr; > > info.mask = ~((1 << am) - 1); > > + > > g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_page, &info); > > + > > + vtd_iotlb_page_invalidate_vfio(s, domain_id, addr, am); > > } > > > > + > > /* Flush IOTLB > > * Returns the IOTLB Actual Invalidation Granularity. > > * @val: the content of the IOTLB_REG > > @@ -1840,7 +1915,7 @@ static void vtd_mem_write(void *opaque, hwaddr addr, > > } > > > > static IOMMUTLBEntry vtd_iommu_translate(MemoryRegion *iommu, hwaddr addr, > > - bool is_write) > > + IOMMUAccessPermissions is_write) > > { > > VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu); > > IntelIOMMUState *s = vtd_as->iommu_state; > > @@ -1895,6 +1970,13 @@ static Property vtd_properties[] = { > > DEFINE_PROP_END_OF_LIST(), > > }; > > > > +void vtd_register_giommu(VFIOGuestIOMMU * giommu) > > +{ > > + VTDAddressSpace *vtd_as = container_of(giommu->iommu, > > VTDAddressSpace, iommu); > > + IntelIOMMUState *s = vtd_as->iommu_state; > > + > > + QLIST_INSERT_HEAD(&s->giommu_list, giommu, iommu_next); > > +} > > > > VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int > > devfn) > > { > > @@ -1949,7 +2031,8 @@ static void vtd_init(IntelIOMMUState *s) > > s->iq_last_desc_type = VTD_INV_DESC_NONE; > > s->next_frcd_reg = 0; > > s->cap = VTD_CAP_FRO | VTD_CAP_NFR | VTD_CAP_ND | VTD_CAP_MGAW | > > - VTD_CAP_SAGAW | VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS; > > + VTD_CAP_SAGAW | VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS| > > + 
VTD_CAP_CM; > > s->ecap = VTD_ECAP_QI | VTD_ECAP_IRO; > > > > vtd_reset_context_cache(s); > > diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h > > index e5f514c..102e9a5 100644 > > --- a/hw/i386/intel_iommu_internal.h > > +++ b/hw/i386/intel_iommu_internal.h > > @@ -190,6 +190,7 @@ > > #define VTD_CAP_MAMV (VTD_MAMV << 48) > > #define VTD_CAP_PSI (1ULL << 39) > > #define VTD_CAP_SLLPS ((1ULL << 34) | (1ULL << 35)) > > +#define VTD_CAP_CM (1ULL << 7) > > > > /* Supported Adjusted Guest Address Widths */ > > #define VTD_CAP_SAGAW_SHIFT 8 > > @@ -338,6 +339,8 @@ typedef struct VTDIOTLBPageInvInfo VTDIOTLBPageInvInfo; > > #define VTD_PAGE_SHIFT_1G 30 > > #define VTD_PAGE_MASK_1G (~((1ULL << VTD_PAGE_SHIFT_1G) - 1)) > > > > +#define VTD_PAGE_MASK(shift) (~((1ULL << (shift)) - 1)) > > + > > struct VTDRootEntry { > > uint64_t val; > > uint64_t rsvd; > > diff --git a/hw/vfio/common.c b/hw/vfio/common.c > > index 607ec70..98c8d67 100644 > > --- a/hw/vfio/common.c > > +++ b/hw/vfio/common.c > > @@ -32,6 +32,9 @@ > > #include "sysemu/kvm.h" > > #include "trace.h" > > > > +#include "hw/sysbus.h" > > +#include "hw/i386/intel_iommu.h" > > + > > struct vfio_group_head vfio_group_list = > > QLIST_HEAD_INITIALIZER(vfio_group_list); > > struct vfio_as_head vfio_address_spaces = > > @@ -312,12 +315,12 @@ static void vfio_iommu_map_notify(Notifier *n, void > > *data) > > out: > > rcu_read_unlock(); > > } > > - > > +#if 0 > > static hwaddr vfio_container_granularity(VFIOContainer *container) > > { > > return (hwaddr)1 << ctz64(container->iova_pgsizes); > > } > > - > > +#endif > > static void vfio_listener_region_add(MemoryListener *listener, > > MemoryRegionSection *section) > > { > > @@ -344,6 +347,7 @@ static void > > vfio_listener_region_add(MemoryListener *listener, > > iova = TARGET_PAGE_ALIGN(section->offset_within_address_space); > > llend = int128_make64(section->offset_within_address_space); > > llend = int128_add(llend, section->size); > > + llend 
= int128_add(llend, int128_exts64(-1)); > > llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK)); > > > > if (int128_ge(int128_make64(iova), llend)) { > > @@ -381,11 +385,13 @@ static void > > vfio_listener_region_add(MemoryListener *listener, > > giommu->n.notify = vfio_iommu_map_notify; > > QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next); > > > > + vtd_register_giommu(giommu); > > memory_region_register_iommu_notifier(giommu->iommu, &giommu->n); > > +#if 0 > > memory_region_iommu_replay(giommu->iommu, &giommu->n, > > vfio_container_granularity(container), > > false); > > - > > +#endif > > Clearly replay has a purpose here and we can't assume the vIOMMU is > VT-d; we can't really move beyond an RFC without resolving those. > However we're still ignoring the issue that the existing vfio code > attempts to put all devices into a single IOMMU domain on the host, > something that is generally successful on x86. So it seems like mostly > wishful thinking that's preventing conflicts between devices. Wouldn't > the right thing to do be to create a separate container and thus IOMMU > domain for each vIOMMU managed device? We could make a simplifying > assertion that to support vIOMMU, there must be a single device per > IOMMU group. The vfio type1 interface is probably going to quickly > show limitations for this usage mode, not only in mapping performance, > but also in locked page accounting. Type1 is really meant for largely > static mappings; it's going to be a bottleneck for dynamic mappings, > and if any containers are in "passthrough" mode, mapping all VM memory, > the vIOMMU domains are going to start hitting locked memory limits. We > can probably move forward with those latter issues, but we absolutely > need multiple containers for correctness. Thanks, > > Alex
I guess we could limit this to a single VFIO device and fail attempts to add more. This might be an easier intermediate step than full multi-domain support. > > return; > > } > > > > diff --git a/include/exec/memory.h b/include/exec/memory.h > > index 2de7898..0e814ab 100644 > > --- a/include/exec/memory.h > > +++ b/include/exec/memory.h > > @@ -146,10 +146,14 @@ struct MemoryRegionOps { > > }; > > > > typedef struct MemoryRegionIOMMUOps MemoryRegionIOMMUOps; > > - > > +typedef enum IOMMUAccessPermissions{ > > + IOMMU_READ = 0, > > + IOMMU_WRITE = 1, > > + IOMMU_ANY = 2 > > +} IOMMUAccessPermissions; > > struct MemoryRegionIOMMUOps { > > /* Return a TLB entry that contains a given address. */ > > - IOMMUTLBEntry (*translate)(MemoryRegion *iommu, hwaddr addr, bool > > is_write); > > + IOMMUTLBEntry (*translate)(MemoryRegion *iommu, hwaddr addr, > > IOMMUAccessPermissions is_write); > > }; > > > > typedef struct CoalescedMemoryRange CoalescedMemoryRange; > > diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h > > index b024ffa..22f3f83 100644 > > --- a/include/hw/i386/intel_iommu.h > > +++ b/include/hw/i386/intel_iommu.h > > @@ -23,6 +23,7 @@ > > #define INTEL_IOMMU_H > > #include "hw/qdev.h" > > #include "sysemu/dma.h" > > +#include "hw/vfio/vfio-common.h" > > > > #define TYPE_INTEL_IOMMU_DEVICE "intel-iommu" > > #define INTEL_IOMMU_DEVICE(obj) \ > > @@ -123,6 +124,8 @@ struct IntelIOMMUState { > > MemoryRegionIOMMUOps iommu_ops; > > GHashTable *vtd_as_by_busptr; /* VTDBus objects indexed by > > PCIBus* reference */ > > VTDBus *vtd_as_by_bus_num[VTD_PCI_BUS_MAX]; /* VTDBus objects > > indexed by bus number */ > > + > > + QLIST_HEAD(, VFIOGuestIOMMU) giommu_list; > > }; > > > > /* Find the VTD Address space associated with the given bus pointer, > > @@ -130,4 +133,5 @@ struct IntelIOMMUState { > > */ > > VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int > > devfn); > > > > +void vtd_register_giommu(VFIOGuestIOMMU * giommu); > > 
#endif > > diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h > > index f037f3c..9225ba3 100644 > > --- a/include/hw/vfio/vfio-common.h > > +++ b/include/hw/vfio/vfio-common.h > > @@ -82,6 +82,7 @@ typedef struct VFIOGuestIOMMU { > > MemoryRegion *iommu; > > Notifier n; > > QLIST_ENTRY(VFIOGuestIOMMU) giommu_next; > > + QLIST_ENTRY(VFIOGuestIOMMU) iommu_next; > > } VFIOGuestIOMMU; > > > > typedef struct VFIODeviceOps VFIODeviceOps;