See my comments below, Thanks, Aviv. On Mon, Apr 11, 2016 at 11:25 PM, Alex Williamson <alex.william...@redhat.com> wrote: > > Some more detailed comments now that I have some faith that the host > IOMMU domain is working correctly... > > On Sat, 9 Apr 2016 21:03:38 +0300 > "Aviv B.D." <bd.a...@gmail.com> wrote: > >> From: "Aviv Ben-David" <bd.a...@gmail.com> >> Date: Tue, 23 Feb 2016 00:24:54 +0200 >> Subject: [PATCH] IOMMU: Add Support to VFIO devices with vIOMMU present >> >> * Fix bug that prevent qemu from starting up with vIOMMU and VFIO >> device are present. >> * Advertize Cache Mode capability in iommu cap register. >> * Register every VFIO device with IOMMU state. >> * On page cache invalidation in vIOMMU, check if the domain belong to >> VFIO device and mirror the guest requests to host. >> >> Changes from previous versions: >> * remove assumption that the cache do not clears >> * fix lock up on high load. >> * refactor vtd_get_did_dev to return success return code, and actual >> domain_id via argument. >> >> Tested only on network cards (also with multiple cards at once). >> >> Signed-off-by: Aviv Ben-David <bd.a...@gmail.com> >> --- >> hw/i386/intel_iommu.c | 113 >> +++++++++++++++++++++++++++++++++++------ >> hw/i386/intel_iommu_internal.h | 3 ++ >> hw/vfio/common.c | 12 +++-- >> include/exec/memory.h | 8 ++- >> include/hw/i386/intel_iommu.h | 4 ++ >> include/hw/vfio/vfio-common.h | 1 + >> 6 files changed, 121 insertions(+), 20 deletions(-) >> >> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c >> index 347718f..a568181 100644 >> --- a/hw/i386/intel_iommu.c >> +++ b/hw/i386/intel_iommu.c >> @@ -43,6 +43,9 @@ static int vtd_dbgflags = VTD_DBGBIT(GENERAL) | >> VTD_DBGBIT(CSR); >> #define VTD_DPRINTF(what, fmt, ...) 
do {} while (0) >> #endif >> >> +static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num, >> + uint8_t devfn, VTDContextEntry *ce); >> + >> static void vtd_define_quad(IntelIOMMUState *s, hwaddr addr, uint64_t val, >> uint64_t wmask, uint64_t w1cmask) >> { >> @@ -126,6 +129,22 @@ static uint32_t >> vtd_set_clear_mask_long(IntelIOMMUState *s, hwaddr addr, >> return new_val; >> } >> >> +static int vtd_get_did_dev(IntelIOMMUState *s, uint8_t bus_num, >> uint8_t devfn, uint16_t * domain_id) >> +{ >> + VTDContextEntry ce; >> + int ret_fr; >> + >> + assert(domain_id); >> + >> + ret_fr = vtd_dev_to_context_entry(s, bus_num, devfn, &ce); >> + if (ret_fr){ >> + return -1; >> + } >> + >> + *domain_id = VTD_CONTEXT_ENTRY_DID(ce.hi); >> + return 0; >> +} >> + >> static uint64_t vtd_set_clear_mask_quad(IntelIOMMUState *s, hwaddr addr, >> uint64_t clear, uint64_t mask) >> { >> @@ -621,7 +640,7 @@ static bool vtd_slpte_nonzero_rsvd(uint64_t slpte, >> uint32_t level) >> /* Given the @gpa, get relevant @slptep. @slpte_level will be the last level >> * of the translation, can be used for deciding the size of large page. >> */ >> -static int vtd_gpa_to_slpte(VTDContextEntry *ce, uint64_t gpa, bool >> is_write, >> +static int vtd_gpa_to_slpte(VTDContextEntry *ce, uint64_t gpa, >> IOMMUAccessPermissions is_write, > > "is_write" is binary, yes/no, IOMMUAccessPermissions clearly has more > states. This should change to "flags" or something and should use > existing IOMMUAccessFlags rather than defining something new. This > should be done in a separate patch that doesn't introduce new > functionality otherwise.
OK, I will do it. > >> uint64_t *slptep, uint32_t *slpte_level, >> bool *reads, bool *writes) >> { >> @@ -641,7 +660,19 @@ static int vtd_gpa_to_slpte(VTDContextEntry *ce, >> uint64_t gpa, bool is_write, >> } >> >> /* FIXME: what is the Atomics request here? */ >> - access_right_check = is_write ? VTD_SL_W : VTD_SL_R; >> + switch(is_write){ >> + case IOMMU_WRITE: >> + access_right_check = VTD_SL_W; >> + break; >> + case IOMMU_READ: >> + access_right_check = VTD_SL_R; >> + break; >> + case IOMMU_ANY: >> + access_right_check = VTD_SL_R | VTD_SL_W; >> + break; >> + default: >> + assert(0); >> + } >> >> while (true) { >> offset = vtd_gpa_level_offset(gpa, level); >> @@ -711,9 +742,9 @@ static int >> vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num, >> } >> >> if (!vtd_context_entry_present(ce)) { >> - VTD_DPRINTF(GENERAL, >> + /*VTD_DPRINTF(GENERAL, >> "error: context-entry #%"PRIu8 "(bus #%"PRIu8 ") " >> - "is not present", devfn, bus_num); >> + "is not present", devfn, bus_num);*/ > > > Leftover debug? Yes :/ I'll clear them... > >> return -VTD_FR_CONTEXT_ENTRY_P; >> } else if ((ce->hi & VTD_CONTEXT_ENTRY_RSVD_HI) || >> (ce->lo & VTD_CONTEXT_ENTRY_RSVD_LO)) { >> @@ -785,7 +816,7 @@ static inline bool vtd_is_interrupt_addr(hwaddr addr) >> * @entry: IOMMUTLBEntry that contain the addr to be translated and result >> */ >> static void vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus, >> - uint8_t devfn, hwaddr addr, bool >> is_write, >> + uint8_t devfn, hwaddr addr, >> IOMMUAccessPermissions is_write, >> IOMMUTLBEntry *entry) >> { >> IntelIOMMUState *s = vtd_as->iommu_state; >> @@ -848,12 +879,14 @@ static void >> vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus, >> is_fpd_set = ce.lo & VTD_CONTEXT_ENTRY_FPD; >> if (ret_fr) { >> ret_fr = -ret_fr; >> - if (is_fpd_set && vtd_is_qualified_fault(ret_fr)) { >> - VTD_DPRINTF(FLOG, "fault processing is disabled for DMA " >> + if (is_write != IOMMU_ANY){ > > Is this debugging as well? 
Seems like this hides the majority of > faults that might occur. No, this is actually the purpose of IOMMU_ANY: to suppress fault reporting during translation. The guest kernel may invalidate a range of consecutive pages, some of which may not be present. > >> + if (is_fpd_set && vtd_is_qualified_fault(ret_fr)) { >> + VTD_DPRINTF(FLOG, "fault processing is disabled for DMA >> " >> "requests through this context-entry " >> "(with FPD Set)"); >> - } else { >> - vtd_report_dmar_fault(s, source_id, addr, ret_fr, is_write); >> + } else { >> + vtd_report_dmar_fault(s, source_id, addr, ret_fr, >> is_write); >> + } >> } >> return; >> } >> @@ -870,11 +903,13 @@ static void >> vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus, >> &reads, &writes); >> if (ret_fr) { >> ret_fr = -ret_fr; >> - if (is_fpd_set && vtd_is_qualified_fault(ret_fr)) { >> - VTD_DPRINTF(FLOG, "fault processing is disabled for DMA >> requests " >> + if (is_write != IOMMU_ANY){ > > Here as well, why only fault non-RW entries? Same as above; maybe the name IOMMU_ANY is misleading and should be something like IOMMU_NO_FAIL...
> >> + if (is_fpd_set && vtd_is_qualified_fault(ret_fr)) { >> + VTD_DPRINTF(FLOG, "fault processing is disabled for >> DMA requests " >> "through this context-entry (with FPD Set)"); >> - } else { >> - vtd_report_dmar_fault(s, source_id, addr, ret_fr, is_write); >> + } else { >> + vtd_report_dmar_fault(s, source_id, addr, ret_fr, is_write); >> + } >> } >> return; >> } >> @@ -1016,18 +1051,58 @@ static void >> vtd_iotlb_domain_invalidate(IntelIOMMUState *s, uint16_t domain_id) >> &domain_id); >> } >> >> +static void vtd_iotlb_page_invalidate_vfio(IntelIOMMUState *s, >> uint16_t domain_id, >> + hwaddr addr, uint8_t am) >> +{ >> + VFIOGuestIOMMU * giommu; >> + >> + QLIST_FOREACH(giommu, &(s->giommu_list), iommu_next){ >> + VTDAddressSpace *vtd_as = container_of(giommu->iommu, >> VTDAddressSpace, iommu); >> + uint16_t vfio_domain_id; >> + int ret = vtd_get_did_dev(s, pci_bus_num(vtd_as->bus), >> vtd_as->devfn, &vfio_domain_id); >> + int i=0; >> + if (!ret && domain_id == vfio_domain_id){ >> + IOMMUTLBEntry entry; >> + >> + /* do vfio unmap */ >> + VTD_DPRINTF(GENERAL, "Remove addr 0x%"PRIx64 " mask %d", addr, >> am); >> + entry.target_as = NULL; >> + entry.iova = addr & VTD_PAGE_MASK_4K; >> + entry.translated_addr = 0; >> + entry.addr_mask = ~VTD_PAGE_MASK(VTD_PAGE_SHIFT_4K + am); >> + entry.perm = IOMMU_NONE; >> + memory_region_notify_iommu(giommu->iommu, entry); >> + >> + /* do vfio map */ >> + VTD_DPRINTF(GENERAL, "add addr 0x%"PRIx64 " mask %d", addr, am); >> + /* call to vtd_iommu_translate */ >> + for (i = 0; i < (1 << am); i++, addr+=(1 << VTD_PAGE_SHIFT_4K)){ >> + IOMMUTLBEntry entry = >> s->iommu_ops.translate(giommu->iommu, addr, IOMMU_ANY); >> + if (entry.perm != IOMMU_NONE){ >> + memory_region_notify_iommu(giommu->iommu, entry); >> + } >> + } >> + } >> + } >> +} >> + >> static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t >> domain_id, >> hwaddr addr, uint8_t am) >> { >> VTDIOTLBPageInvInfo info; >> >> assert(am <= VTD_MAMV); >> + >> 
info.domain_id = domain_id; >> info.addr = addr; >> info.mask = ~((1 << am) - 1); >> + >> g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_page, &info); >> + >> + vtd_iotlb_page_invalidate_vfio(s, domain_id, addr, am); > > Why is this vfio related and why does it need to know about giommus? > That's vfio private data. Notifies need to happen regardless of > whether there's a vfio device attached or not. It seems like this is > just filling a gap that current VT-d code doesn't notify everywhere it > needs to, but it shouldn't know about vfio. Noted, I'll try to change them. > >> } >> >> + >> /* Flush IOTLB >> * Returns the IOTLB Actual Invalidation Granularity. >> * @val: the content of the IOTLB_REG >> @@ -1840,7 +1915,7 @@ static void vtd_mem_write(void *opaque, hwaddr addr, >> } >> >> static IOMMUTLBEntry vtd_iommu_translate(MemoryRegion *iommu, hwaddr addr, >> - bool is_write) >> + IOMMUAccessPermissions is_write) >> { >> VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu); >> IntelIOMMUState *s = vtd_as->iommu_state; >> @@ -1895,6 +1970,13 @@ static Property vtd_properties[] = { >> DEFINE_PROP_END_OF_LIST(), >> }; >> >> +void vtd_register_giommu(VFIOGuestIOMMU * giommu) >> +{ >> + VTDAddressSpace *vtd_as = container_of(giommu->iommu, >> VTDAddressSpace, iommu); >> + IntelIOMMUState *s = vtd_as->iommu_state; >> + >> + QLIST_INSERT_HEAD(&s->giommu_list, giommu, iommu_next); >> +} > > This function shouldn't be needed. > >> >> VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn) >> { >> @@ -1949,7 +2031,8 @@ static void vtd_init(IntelIOMMUState *s) >> s->iq_last_desc_type = VTD_INV_DESC_NONE; >> s->next_frcd_reg = 0; >> s->cap = VTD_CAP_FRO | VTD_CAP_NFR | VTD_CAP_ND | VTD_CAP_MGAW | >> - VTD_CAP_SAGAW | VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS; >> + VTD_CAP_SAGAW | VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS| >> + VTD_CAP_CM; > > This should be a separate patch as well. Noted. 
> >> s->ecap = VTD_ECAP_QI | VTD_ECAP_IRO; >> >> vtd_reset_context_cache(s); >> diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h >> index e5f514c..102e9a5 100644 >> --- a/hw/i386/intel_iommu_internal.h >> +++ b/hw/i386/intel_iommu_internal.h >> @@ -190,6 +190,7 @@ >> #define VTD_CAP_MAMV (VTD_MAMV << 48) >> #define VTD_CAP_PSI (1ULL << 39) >> #define VTD_CAP_SLLPS ((1ULL << 34) | (1ULL << 35)) >> +#define VTD_CAP_CM (1ULL << 7) >> >> /* Supported Adjusted Guest Address Widths */ >> #define VTD_CAP_SAGAW_SHIFT 8 >> @@ -338,6 +339,8 @@ typedef struct VTDIOTLBPageInvInfo VTDIOTLBPageInvInfo; >> #define VTD_PAGE_SHIFT_1G 30 >> #define VTD_PAGE_MASK_1G (~((1ULL << VTD_PAGE_SHIFT_1G) - 1)) >> >> +#define VTD_PAGE_MASK(shift) (~((1ULL << (shift)) - 1)) >> + >> struct VTDRootEntry { >> uint64_t val; >> uint64_t rsvd; >> diff --git a/hw/vfio/common.c b/hw/vfio/common.c >> index 607ec70..98c8d67 100644 >> --- a/hw/vfio/common.c >> +++ b/hw/vfio/common.c >> @@ -32,6 +32,9 @@ >> #include "sysemu/kvm.h" >> #include "trace.h" >> >> +#include "hw/sysbus.h" >> +#include "hw/i386/intel_iommu.h" >> + >> struct vfio_group_head vfio_group_list = >> QLIST_HEAD_INITIALIZER(vfio_group_list); >> struct vfio_as_head vfio_address_spaces = >> @@ -312,12 +315,12 @@ static void vfio_iommu_map_notify(Notifier *n, void >> *data) >> out: >> rcu_read_unlock(); >> } >> - >> +#if 0 >> static hwaddr vfio_container_granularity(VFIOContainer *container) >> { >> return (hwaddr)1 << ctz64(container->iova_pgsizes); >> } >> - >> +#endif >> static void vfio_listener_region_add(MemoryListener *listener, >> MemoryRegionSection *section) >> { >> @@ -344,6 +347,7 @@ static void >> vfio_listener_region_add(MemoryListener *listener, >> iova = TARGET_PAGE_ALIGN(section->offset_within_address_space); >> llend = int128_make64(section->offset_within_address_space); >> llend = int128_add(llend, section->size); >> + llend = int128_add(llend, int128_exts64(-1)); >> llend = int128_and(llend, 
int128_exts64(TARGET_PAGE_MASK)); >> >> if (int128_ge(int128_make64(iova), llend)) { >> @@ -381,11 +385,13 @@ static void >> vfio_listener_region_add(MemoryListener *listener, >> giommu->n.notify = vfio_iommu_map_notify; >> QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next); >> >> + vtd_register_giommu(giommu); >> memory_region_register_iommu_notifier(giommu->iommu, &giommu->n); >> +#if 0 >> memory_region_iommu_replay(giommu->iommu, &giommu->n, >> vfio_container_granularity(container), >> false); >> - >> +#endif > > AFAICT, none of the above vfio changes should be required. The > overflow is already fixed in qemu.git, the giommu registration > shouldn't be necessary, the replay is probably not used, but shouldn't > be a problem either. Not that there aren't vfio issues, but I think > they're internal, like how pages are accounted and map/unmap efficiency. > >> return; >> } >> >> diff --git a/include/exec/memory.h b/include/exec/memory.h >> index 2de7898..0e814ab 100644 >> --- a/include/exec/memory.h >> +++ b/include/exec/memory.h >> @@ -146,10 +146,14 @@ struct MemoryRegionOps { >> }; >> >> typedef struct MemoryRegionIOMMUOps MemoryRegionIOMMUOps; >> - >> +typedef enum IOMMUAccessPermissions{ >> + IOMMU_READ = 0, >> + IOMMU_WRITE = 1, >> + IOMMU_ANY = 2 >> +} IOMMUAccessPermissions; >> struct MemoryRegionIOMMUOps { >> /* Return a TLB entry that contains a given address. 
*/ >> - IOMMUTLBEntry (*translate)(MemoryRegion *iommu, hwaddr addr, bool >> is_write); >> + IOMMUTLBEntry (*translate)(MemoryRegion *iommu, hwaddr addr, >> IOMMUAccessPermissions is_write); >> }; >> >> typedef struct CoalescedMemoryRange CoalescedMemoryRange; >> diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h >> index b024ffa..22f3f83 100644 >> --- a/include/hw/i386/intel_iommu.h >> +++ b/include/hw/i386/intel_iommu.h >> @@ -23,6 +23,7 @@ >> #define INTEL_IOMMU_H >> #include "hw/qdev.h" >> #include "sysemu/dma.h" >> +#include "hw/vfio/vfio-common.h" >> >> #define TYPE_INTEL_IOMMU_DEVICE "intel-iommu" >> #define INTEL_IOMMU_DEVICE(obj) \ >> @@ -123,6 +124,8 @@ struct IntelIOMMUState { >> MemoryRegionIOMMUOps iommu_ops; >> GHashTable *vtd_as_by_busptr; /* VTDBus objects indexed by >> PCIBus* reference */ >> VTDBus *vtd_as_by_bus_num[VTD_PCI_BUS_MAX]; /* VTDBus objects >> indexed by bus number */ >> + >> + QLIST_HEAD(, VFIOGuestIOMMU) giommu_list; >> }; >> >> /* Find the VTD Address space associated with the given bus pointer, >> @@ -130,4 +133,5 @@ struct IntelIOMMUState { >> */ >> VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int >> devfn); >> >> +void vtd_register_giommu(VFIOGuestIOMMU * giommu); >> #endif > > Needing to know anything about vfio is an indication that this > shouldn't be necessary. > >> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h >> index f037f3c..9225ba3 100644 >> --- a/include/hw/vfio/vfio-common.h >> +++ b/include/hw/vfio/vfio-common.h >> @@ -82,6 +82,7 @@ typedef struct VFIOGuestIOMMU { >> MemoryRegion *iommu; >> Notifier n; >> QLIST_ENTRY(VFIOGuestIOMMU) giommu_next; >> + QLIST_ENTRY(VFIOGuestIOMMU) iommu_next; >> } VFIOGuestIOMMU; > > This is clearly a layering violation, vt-d should not be managing a > list on a vfio data structure, especially one that it shouldn't even > have access to. Thanks, As above, I'll try to separate them. > > Alex