>-----Original Message----- >From: Eric Auger <eric.au...@redhat.com> >Sent: Friday, June 24, 2022 12:52 AM >To: Duan, Zhenzhong <zhenzhong.d...@intel.com> >Cc: qemu-devel@nongnu.org; m...@redhat.com; jean-phili...@linaro.org; >pbonz...@redhat.com; Zhang, Yu C <yu.c.zh...@intel.com>; Dong, >Chuanxiao <chuanxiao.d...@intel.com>; Zhang, Tina ><tina.zh...@intel.com> >Subject: Re: [PATCH 1/3] virtio-iommu: Add bypass mode support to >assigned device > >Hi, >On 6/13/22 08:10, Zhenzhong Duan wrote: >> Currently assigned devices can not work in virtio-iommu bypass mode. >> Guest driver fails to probe the device due to DMA failure. And the >> reason is because of lacking GPA -> HPA mappings when VM is created. >> >> Add a root container memory region to hold both bypass memory region >> and iommu memory region, so the switch between them is supported just >> like the implementation in virtual VT-d. >> >> Signed-off-by: Zhenzhong Duan <zhenzhong.d...@intel.com> >> --- >> hw/virtio/trace-events | 1 + >> hw/virtio/virtio-iommu.c | 115 ++++++++++++++++++++++++++++++- >> include/hw/virtio/virtio-iommu.h | 2 + >> 3 files changed, 116 insertions(+), 2 deletions(-) >> >> diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events index >> ab8e095b73fa..20af2e7ebd78 100644 >> --- a/hw/virtio/trace-events >> +++ b/hw/virtio/trace-events >> @@ -124,6 +124,7 @@ virtio_iommu_remap(const char *name, uint64_t >> virt_start, uint64_t virt_end, uin >> virtio_iommu_set_page_size_mask(const char *name, uint64_t old, >uint64_t new) "mr=%s old_mask=0x%"PRIx64" new_mask=0x%"PRIx64 >virtio_iommu_notify_flag_add(const char *name) "add notifier to mr %s" >> virtio_iommu_notify_flag_del(const char *name) "del notifier from mr %s" >> +virtio_iommu_switch_address_space(uint8_t bus, uint8_t slot, uint8_t fn, >bool on) "Device %02x:%02x.%x switching address space (iommu >enabled=%d)" >> >> # virtio-mem.c >> virtio_mem_send_response(uint16_t type) "type=%" PRIu16 diff --git >> 
a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c index >> 2597e166f939..ff718107ee02 100644 >> --- a/hw/virtio/virtio-iommu.c >> +++ b/hw/virtio/virtio-iommu.c >> @@ -69,6 +69,77 @@ static inline uint16_t >virtio_iommu_get_bdf(IOMMUDevice *dev) >> return PCI_BUILD_BDF(pci_bus_num(dev->bus), dev->devfn); } >> >> +static bool virtio_iommu_device_bypassed(IOMMUDevice *sdev) { >> + uint32_t sid; >> + bool bypassed; >> + VirtIOIOMMU *s = sdev->viommu; >> + VirtIOIOMMUEndpoint *ep; >> + >> + sid = virtio_iommu_get_bdf(sdev); >> + >> + qemu_mutex_lock(&s->mutex); >> + /* need to check bypass before system reset */ >> + if (!s->endpoints) { >> + bypassed = s->config.bypass; >> + goto unlock; >> + } >> + >> + ep = g_tree_lookup(s->endpoints, GUINT_TO_POINTER(sid)); >> + if (!ep || !ep->domain) { >> + bypassed = s->config.bypass; >> + } else { >> + bypassed = ep->domain->bypass; >> + } >> + >> +unlock: >> + qemu_mutex_unlock(&s->mutex); >> + return bypassed; >> +} >> + >> +/* Return whether the device is using IOMMU translation. 
*/ static >> +bool virtio_iommu_switch_address_space(IOMMUDevice *sdev) { >> + bool use_remapping; >> + >> + assert(sdev); >> + >> + use_remapping = !virtio_iommu_device_bypassed(sdev); >> + >> + trace_virtio_iommu_switch_address_space(pci_bus_num(sdev->bus), >> + PCI_SLOT(sdev->devfn), >> + PCI_FUNC(sdev->devfn), >> + use_remapping); >> + >> + /* Turn off first then on the other */ >> + if (use_remapping) { >> + memory_region_set_enabled(&sdev->bypass_mr, false); >> + memory_region_set_enabled(MEMORY_REGION(&sdev->iommu_mr), >true); >> + } else { >> + memory_region_set_enabled(MEMORY_REGION(&sdev->iommu_mr), >false); >> + memory_region_set_enabled(&sdev->bypass_mr, true); >> + } >> + >> + return use_remapping; >> +} >> + >> +static void virtio_iommu_switch_address_space_all(VirtIOIOMMU *s) { >> + GHashTableIter iter; >> + IOMMUPciBus *iommu_pci_bus; >> + int i; >> + >> + g_hash_table_iter_init(&iter, s->as_by_busptr); >> + while (g_hash_table_iter_next(&iter, NULL, (void **)&iommu_pci_bus)) >{ >> + for (i = 0; i < PCI_DEVFN_MAX; i++) { >> + if (!iommu_pci_bus->pbdev[i]) { >> + continue; >> + } >> + virtio_iommu_switch_address_space(iommu_pci_bus->pbdev[i]); >> + } >> + } >> +} >> + >> /** >> * The bus number is used for lookup when SID based operations occur. 
>> * In that case we lazily populate the IOMMUPciBus array from the bus >> hash @@ -213,6 +284,7 @@ static gboolean >> virtio_iommu_notify_map_cb(gpointer key, gpointer value, static void >> virtio_iommu_detach_endpoint_from_domain(VirtIOIOMMUEndpoint *ep) >{ >> VirtIOIOMMUDomain *domain = ep->domain; >> + IOMMUDevice *sdev = container_of(ep->iommu_mr, IOMMUDevice, >> + iommu_mr); >> >> if (!ep->domain) { >> return; >> @@ -221,6 +293,7 @@ static void >virtio_iommu_detach_endpoint_from_domain(VirtIOIOMMUEndpoint *ep) >> ep->iommu_mr); >> QLIST_REMOVE(ep, next); >> ep->domain = NULL; >> + virtio_iommu_switch_address_space(sdev); >> } >> >> static VirtIOIOMMUEndpoint *virtio_iommu_get_endpoint(VirtIOIOMMU >*s, >> @@ -323,12 +396,39 @@ static AddressSpace >> *virtio_iommu_find_add_as(PCIBus *bus, void *opaque, >> >> trace_virtio_iommu_init_iommu_mr(name); >> >> + memory_region_init(&sdev->root, OBJECT(s), name, UINT64_MAX); >> + address_space_init(&sdev->as, &sdev->root, >> + TYPE_VIRTIO_IOMMU); >> + >> + /* >> + * Build the IOMMU disabled container with aliases to the >> + * shared MRs. Note that aliasing to a shared memory region >What do you call the 'disabled container' and the shared MRs?
'IOMMU disabled container' means sdev->bypass_mr. 'Shared MRs' means get_system_memory(), aka system_memory. Every endpoint's sdev->bypass_mr is an alias to system_memory, which is why I describe sdev->bypass_mr as 'IOMMU disabled' and system_memory as the shared MR. > >> + * could help the memory API to detect same FlatViews so we >> + * can have devices to share the same FlatView when in bypass >> + * mode. (either by not configuring virtio-iommu driver or with >> + * "iommu=pt"). It will greatly reduce the total number of >> + * FlatViews of the system hence VM runs faster. > >Do you mean that we could have used a shared bypass MR instead of >creating one per device? I think we still need to create an iommu MR per device, which is enabled when the endpoint is attached to a DMA domain. The shared bypass MR is enabled when: 1. the endpoint is not attached to any domain yet and virtio-iommu.boot_bypass=true, or 2. the endpoint is attached to a bypass domain. >> + */ >> + memory_region_init_alias(&sdev->bypass_mr, OBJECT(s), >> + "system", get_system_memory(), 0, >> + >> + memory_region_size(get_system_memory())); >> + >> memory_region_init_iommu(&sdev->iommu_mr, sizeof(sdev- >>iommu_mr), >> TYPE_VIRTIO_IOMMU_MEMORY_REGION, >> OBJECT(s), name, >> UINT64_MAX); >> - address_space_init(&sdev->as, >> - MEMORY_REGION(&sdev->iommu_mr), >TYPE_VIRTIO_IOMMU); >> + >> + /* >> + * Hook both the containers under the root container, we >did you mean "hook both sub-regions to the root container"? 
Yes >> + * switch between iommu & bypass MRs by enable/disable >> + * corresponding sub-containers >> + */ >> + memory_region_add_subregion_overlap(&sdev->root, 0, >> + MEMORY_REGION(&sdev->iommu_mr), >> + 0); >> + memory_region_add_subregion_overlap(&sdev->root, 0, >> + &sdev->bypass_mr, 0); >> + >> + virtio_iommu_switch_address_space(sdev); >> g_free(name); >> } >> return &sdev->as; >> @@ -342,6 +442,7 @@ static int virtio_iommu_attach(VirtIOIOMMU *s, >> uint32_t flags = le32_to_cpu(req->flags); >> VirtIOIOMMUDomain *domain; >> VirtIOIOMMUEndpoint *ep; >> + IOMMUDevice *sdev; >> >> trace_virtio_iommu_attach(domain_id, ep_id); >> >> @@ -375,6 +476,8 @@ static int virtio_iommu_attach(VirtIOIOMMU *s, >> QLIST_INSERT_HEAD(&domain->endpoint_list, ep, next); >> >> ep->domain = domain; >> + sdev = container_of(ep->iommu_mr, IOMMUDevice, iommu_mr); >> + virtio_iommu_switch_address_space(sdev); >> >> /* Replay domain mappings on the associated memory region */ >> g_tree_foreach(domain->mappings, virtio_iommu_notify_map_cb, @@ >> -887,6 +990,7 @@ static void virtio_iommu_set_config(VirtIODevice *vdev, >> return; >> } >> dev_config->bypass = in_config->bypass; >> + virtio_iommu_switch_address_space_all(dev); >> } >> >> trace_virtio_iommu_set_config(in_config->bypass); >> @@ -1026,6 +1130,8 @@ static void virtio_iommu_system_reset(void >*opaque) >> * system reset >> */ >> s->config.bypass = s->boot_bypass; >> + virtio_iommu_switch_address_space_all(s); >> + >> } >> >> static void virtio_iommu_device_realize(DeviceState *dev, Error >> **errp) @@ -1041,6 +1147,11 @@ static void >virtio_iommu_device_realize(DeviceState *dev, Error **errp) >> virtio_iommu_handle_command); >> s->event_vq = virtio_add_queue(vdev, VIOMMU_DEFAULT_QUEUE_SIZE, >> NULL); >> >> + /* >> + * config.bypass is needed to get initial address space early, such as >> + * in vfio realize >> + */ >I don't understand this comment, please can you elaborate? 
In vfio_realize(), virtio_iommu_find_add_as() is called to get the address space from virtio-iommu. virtio_iommu_find_add_as() calls virtio_iommu_switch_address_space(), and virtio_iommu_switch_address_space() needs to check config.bypass to enable the right MRs, either the shared bypass MR or the iommu MR. It could also be OK to use s->boot_bypass directly in virtio_iommu_device_bypassed(), in which case this change could be removed. >> + s->config.bypass = s->boot_bypass; >> s->config.page_size_mask = TARGET_PAGE_MASK; >> s->config.input_range.end = UINT64_MAX; >> s->config.domain_range.end = UINT32_MAX; diff --git >> a/include/hw/virtio/virtio-iommu.h b/include/hw/virtio/virtio-iommu.h >> index 84391f844826..102eeefa731d 100644 >> --- a/include/hw/virtio/virtio-iommu.h >> +++ b/include/hw/virtio/virtio-iommu.h >> @@ -37,6 +37,8 @@ typedef struct IOMMUDevice { >> int devfn; >> IOMMUMemoryRegion iommu_mr; >> AddressSpace as; >> + MemoryRegion root; /* The root container of the device */ >> + MemoryRegion bypass_mr; /* The alias of shared memory MR */ >> } IOMMUDevice; >> >> typedef struct IOMMUPciBus { >Did you test migration? I wonder if we shouldn't call > >virtio_iommu_switch_address_space_all(s) >after restoring the endpoints in iommu_post_load() as it is done in intel- >iommu Thanks for pointing that out, I missed the migration scenario. After your suggested change, migration works without a pass-through device. With a pass-through device I got "VFIO device doesn't support migration". It is not clear to me whether pass-through device migration is fully supported yet. Zhenzhong