> The default replay() doesn't work for VT-d, since VT-d will have a huge
> default memory region which covers the address range 0-(2^64-1). This
> will normally lead to a dead loop when the guest starts.
>
> The solution is simple - we don't walk over all the regions. Instead, we
> jump over regions when we find that the page directories are empty.
> This greatly reduces the time needed to walk the whole region.
>
> To achieve this, we provide a page walk helper which invokes the
> corresponding hook function for each page we are interested in.
> vtd_page_walk_level() is the core logic for the page walking. Its
> interface is designed to suit further use cases, e.g., invalidating a
> range of addresses.
>
> Signed-off-by: Peter Xu <address@hidden>
> ---
>  hw/i386/intel_iommu.c | 212 ++++++++++++++++++++++++++++++++++++++++++++++++--
>  hw/i386/trace-events  |   8 ++
>  include/exec/memory.h |   2 +
>  3 files changed, 217 insertions(+), 5 deletions(-)
>
> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> index 46b8a2f..2fcd7af 100644
> --- a/hw/i386/intel_iommu.c
> +++ b/hw/i386/intel_iommu.c
> @@ -620,6 +620,22 @@ static inline uint32_t vtd_get_agaw_from_context_entry(VTDContextEntry *ce)
>      return 30 + (ce->hi & VTD_CONTEXT_ENTRY_AW) * 9;
>  }
>
> +static inline uint64_t vtd_iova_limit(VTDContextEntry *ce)
> +{
> +    uint32_t ce_agaw = vtd_get_agaw_from_context_entry(ce);
> +    return 1ULL << MIN(ce_agaw, VTD_MGAW);
> +}
> +
> +/* Return true if IOVA passes range check, otherwise false. */
> +static inline bool vtd_iova_range_check(uint64_t iova, VTDContextEntry *ce)
> +{
> +    /*
> +     * Check if @iova is above 2^X-1, where X is the minimum of MGAW
> +     * in CAP_REG and AW in context-entry.
> +     */
> +    return !(iova & ~(vtd_iova_limit(ce) - 1));
> +}
> +
>  static const uint64_t vtd_paging_entry_rsvd_field[] = {
>      [0] = ~0ULL,
>      /* For not large page */
> @@ -656,13 +672,9 @@ static int vtd_iova_to_slpte(VTDContextEntry *ce, uint64_t iova,
>      uint32_t level = vtd_get_level_from_context_entry(ce);
>      uint32_t offset;
>      uint64_t slpte;
> -    uint32_t ce_agaw = vtd_get_agaw_from_context_entry(ce);
>      uint64_t access_right_check = 0;
>
> -    /* Check if @iova is above 2^X-1, where X is the minimum of MGAW
> -     * in CAP_REG and AW in context-entry.
> -     */
> -    if (iova & ~((1ULL << MIN(ce_agaw, VTD_MGAW)) - 1)) {
> +    if (!vtd_iova_range_check(iova, ce)) {
>          error_report("IOVA 0x%"PRIx64 " exceeds limits", iova);
>          return -VTD_FR_ADDR_BEYOND_MGAW;
>      }
> @@ -718,6 +730,166 @@ static int vtd_iova_to_slpte(VTDContextEntry *ce, uint64_t iova,
>      }
>  }
>
> +typedef int (*vtd_page_walk_hook)(IOMMUTLBEntry *entry, void *private);
> +
> +/**
> + * vtd_page_walk_level - walk over specific level for IOVA range
> + *
> + * @addr: base GPA addr to start the walk
> + * @start: IOVA range start address
> + * @end: IOVA range end address (start <= iova < end)
> + * @hook_fn: hook func to be called on each detected page
> + * @private: private data to be passed into hook func
> + * @level: the paging level to walk at
> + * @read: whether parent level has read permission
> + * @write: whether parent level has write permission
> + * @skipped: accumulated skipped ranges
> + * @notify_unmap: whether we should notify invalid entries
> + */
> +static int vtd_page_walk_level(dma_addr_t addr, uint64_t start,
> +                               uint64_t end, vtd_page_walk_hook hook_fn,
> +                               void *private, uint32_t level,
> +                               bool read, bool write, uint64_t *skipped,
> +                               bool notify_unmap)
> +{
> +    bool read_cur, write_cur, entry_valid;
> +    uint32_t offset;
> +    uint64_t slpte;
> +    uint64_t subpage_size, subpage_mask;
> +    IOMMUTLBEntry entry;
> +    uint64_t iova = start;
> +    uint64_t iova_next;
> +    uint64_t skipped_local = 0;
> +    int ret = 0;
> +
> +    trace_vtd_page_walk_level(addr, level, start, end);
> +
> +    subpage_size = 1ULL << vtd_slpt_level_shift(level);
> +    subpage_mask = vtd_slpt_level_page_mask(level);
> +
> +    while (iova < end) {
> +        iova_next = (iova & subpage_mask) + subpage_size;
> +
> +        offset = vtd_iova_level_offset(iova, level);
> +        slpte = vtd_get_slpte(addr, offset);
> +
> +        /*
> +         * When one of the following cases happens, we assume the whole
> +         * range is invalid:
> +         *
> +         * 1. reading the block failed
> +         * 2. the reserved area is non-zero
> +         * 3. neither the read nor the write flag is set
> +         */
> +
> +        if (slpte == (uint64_t)-1) {
> +            trace_vtd_page_walk_skip_read(iova, iova_next);
> +            skipped_local++;
> +            goto next;
> +        }
> +
> +        if (vtd_slpte_nonzero_rsvd(slpte, level)) {
> +            trace_vtd_page_walk_skip_reserve(iova, iova_next);
> +            skipped_local++;
> +            goto next;
> +        }
> +
> +        /* Permissions are stacked with parents' */
> +        read_cur = read && (slpte & VTD_SL_R);
> +        write_cur = write && (slpte & VTD_SL_W);
> +
> +        /*
> +         * As long as we have either read or write permission, this is
> +         * a valid entry. The rule works for both pages and page tables.
> +         */
> +        entry_valid = read_cur | write_cur;
> +
> +        if (vtd_is_last_slpte(slpte, level)) {
> +            entry.target_as = &address_space_memory;
> +            entry.iova = iova & subpage_mask;
> +            /*
> +             * This might be a meaningless addr if (!read_cur &&
> +             * !write_cur), but in that case the field is meaningless
> +             * anyway, so let's share the code to generate the IOTLBs
> +             * no matter whether it's a MAP or an UNMAP
> +             */
> +            entry.translated_addr = vtd_get_slpte_addr(slpte);
> +            entry.addr_mask = ~subpage_mask;
> +            entry.perm = IOMMU_ACCESS_FLAG(read_cur, write_cur);
> +            if (!entry_valid && !notify_unmap) {
> +                trace_vtd_page_walk_skip_perm(iova, iova_next);
> +                skipped_local++;
> +                goto next;
> +            }
> +            trace_vtd_page_walk_one(level, entry.iova, addr,
> +                                    entry.addr_mask, entry.perm);
> +            if (hook_fn) {
> +                ret = hook_fn(&entry, private);
> +                if (ret < 0) {
> +                    error_report("Detected error in page walk hook "
> +                                 "function, stop walk.");
> +                    return ret;
> +                }
> +            }
> +        } else {
> +            if (!entry_valid) {
> +                trace_vtd_page_walk_skip_perm(iova, iova_next);
> +                skipped_local++;
> +                goto next;
> +            }
> +            ret = vtd_page_walk_level(vtd_get_slpte_addr(slpte), iova,
> +                                      MIN(iova_next, end), hook_fn, private,
> +                                      level - 1, read_cur, write_cur,
> +                                      &skipped_local, notify_unmap);
> +            if (ret < 0) {
> +                error_report("Detected page walk error on addr 0x%"PRIx64
> +                             " level %"PRIu32", stop walk.", addr, level - 1);
> +                return ret;
> +            }
> +        }
> +
> +next:
> +        iova = iova_next;
> +    }
> +
> +    if (skipped) {
> +        *skipped += skipped_local;
> +    }
> +
> +    return 0;
> +}
> +
> +/**
> + * vtd_page_walk - walk specific IOVA range, and call the hook
> + *
> + * @ce: context entry to walk upon
> + * @start: IOVA range start address
> + * @end: IOVA range end address (exclusive)
> + * @hook_fn: the hook to be called for each detected area
> + * @private: private data for the hook function
> + */
> +static int vtd_page_walk(VTDContextEntry *ce, uint64_t start, uint64_t end,
> +                         vtd_page_walk_hook hook_fn, void *private)
> +{
> +    dma_addr_t addr = vtd_get_slpt_base_from_context(ce);
> +    uint32_t level = vtd_get_level_from_context_entry(ce);
> +
> +    if (!vtd_iova_range_check(start, ce)) {
> +        error_report("IOVA start 0x%"PRIx64 " end 0x%"PRIx64" exceeds limits",
> +                     start, end);
> +        return -VTD_FR_ADDR_BEYOND_MGAW;
> +    }
> +
> +    if (!vtd_iova_range_check(end, ce)) {
> +        /* Fix end so that it reaches the maximum */
> +        end = vtd_iova_limit(ce);
> +    }
> +
> +    trace_vtd_page_walk(ce->hi, ce->lo, start, end);
> +
> +    return vtd_page_walk_level(addr, start, end, hook_fn, private,
> +                               level, true, true, NULL, false);
> +}
> +
>  /* Map a device to its corresponding domain (context-entry) */
>  static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
>                                      uint8_t devfn, VTDContextEntry *ce)
> @@ -2430,6 +2602,35 @@ VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn)
>      return vtd_dev_as;
>  }
>
> +static int vtd_replay_hook(IOMMUTLBEntry *entry, void *private)
> +{
> +    memory_region_notify_one((IOMMUNotifier *)private, entry);
> +    return 0;
> +}
> +
> +static void vtd_iommu_replay(MemoryRegion *mr, IOMMUNotifier *n)
> +{
> +    VTDAddressSpace *vtd_as = container_of(mr, VTDAddressSpace, iommu);
> +    IntelIOMMUState *s = vtd_as->iommu_state;
> +    uint8_t bus_n = pci_bus_num(vtd_as->bus);
> +    VTDContextEntry ce;
> +
> +    if (vtd_dev_to_context_entry(s, bus_n, vtd_as->devfn, &ce) == 0) {
Hi Peter,

Again I'd like to give my cheers on this patch set. That said, I have a
minor concern about this part.

The replay would be performed in vfio_listener_region_add() when a device
is assigned. Since the guest has just started, there will be no valid
context entry for the assigned device, so there will be no vtd_page_walk,
and thus no change to the SL page table in the host. The SL page table
would still be an IOVA->HPA mapping.

It's true that the gIOVA->HPA mapping can be built by page map/unmap when
the guest issues DMA, so it is fine to have no replay in the beginning.
Hot plug should be all the same. I guess that is the design here?

However, it may be incompatible with future work in which we expose a
vIOMMU to the guest but the guest disables IOVA usage. In this scenario,
the SL page table in the host would be a GPA->HPA instead of a gIOVA->HPA
mapping. It would rely on memory_replay to build the GPA->HPA mapping. If
I'm correct, I think it would be expected to have a replay at the time the
device is assigned.

Regards,
Yi L
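
P.S. To double-check my reading of the new hook API, here is a minimal
sketch exercising only the helpers this patch adds. The counting hook and
its caller are hypothetical illustrations of the interface, not something
from (or proposed for) the patch itself:

    /* Hypothetical hook: count the valid mappings reported by the walk. */
    static int vtd_count_map_hook(IOMMUTLBEntry *entry, void *private)
    {
        uint64_t *nr_maps = private;

        /* vtd_page_walk() as posted passes notify_unmap == false, so we
         * should only ever see MAP entries here; check the perm anyway. */
        if (entry->perm != IOMMU_NONE) {
            (*nr_maps)++;
        }
        return 0;    /* a negative return value would abort the walk */
    }

    /* With a valid context entry `ce` in hand, walk the full range
     * (an out-of-range end is clamped to vtd_iova_limit() internally): */
    uint64_t nr_maps = 0;
    vtd_page_walk(&ce, 0, vtd_iova_limit(&ce), vtd_count_map_hook, &nr_maps);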