Since kernel commit 2b2c651baf1c ("vfio/pci: Invalidate mmaps and block the access in D3hot power state"), any attempt to do an mmap access to a BAR when the device is in D3hot state will generate a fault.
On system_powerdown, if the VFIO device is translated by an IOMMU, the device is moved to D3hot state and then the vIOMMU gets disabled by the guest. As a result of this latter operation, the address space is swapped from translated to untranslated. When re-enabling the aliased regions, the RAM regions are dma-mapped again and this causes DMA_MAP faults when attempting the operation on BARs. To avoid doing the remap on those BARs, we need to know whether the device is in an incompatible state. Implement the vfio_is_dma_map_allowed() callback for PCI devices. If the device is in D3hot state, skip the DMA MAP in vfio_listener_region_add(). To ease the implementation, vfio_section_is_vfio_pci() now returns a VFIOPCIDevice pointer and the function is moved before the first caller. Signed-off-by: Eric Auger <eric.au...@redhat.com> --- hw/vfio/common.c | 57 +++++++++++++++++++++++++++----------------- hw/vfio/pci.c | 22 +++++++++++++++++ hw/vfio/trace-events | 1 + 3 files changed, 58 insertions(+), 22 deletions(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 173fb3a997..96f401f10a 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -555,11 +555,34 @@ static bool vfio_get_section_iova_range(VFIOContainerBase *bcontainer, return true; } +static VFIOPCIDevice *vfio_section_is_vfio_pci(MemoryRegionSection *section, + VFIOContainerBase *bcontainer) +{ + VFIOPCIDevice *pcidev; + VFIODevice *vbasedev; + Object *owner; + + owner = memory_region_owner(section->mr); + + QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { + if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) { + continue; + } + pcidev = container_of(vbasedev, VFIOPCIDevice, vbasedev); + if (OBJECT(pcidev) == owner) { + return pcidev; + } + } + + return NULL; +} + static void vfio_listener_region_add(MemoryListener *listener, MemoryRegionSection *section) { VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, listener); + VFIOPCIDevice *vdev; hwaddr iova, end; 
Int128 llend, llsize; void *vaddr; @@ -630,6 +653,18 @@ static void vfio_listener_region_add(MemoryListener *listener, /* Here we assume that memory_region_is_ram(section->mr)==true */ + /* skip if the region is a BAR and the power state forbids DMA MAP */ + vdev = vfio_section_is_vfio_pci(section, bcontainer); + if (vdev) { + VFIODevice *vbasedev = &vdev->vbasedev; + assert(vbasedev->ops->vfio_is_dma_map_allowed); + if (!vbasedev->ops->vfio_is_dma_map_allowed(vbasedev)) { + trace_vfio_listener_region_add_skip(section->mr->name); + return; + } + } + + /* * For RAM memory regions with a RamDiscardManager, we only want to map the * actually populated parts - and update the mapping whenever we're notified @@ -804,28 +839,6 @@ typedef struct VFIODirtyRangesListener { MemoryListener listener; } VFIODirtyRangesListener; -static bool vfio_section_is_vfio_pci(MemoryRegionSection *section, - VFIOContainerBase *bcontainer) -{ - VFIOPCIDevice *pcidev; - VFIODevice *vbasedev; - Object *owner; - - owner = memory_region_owner(section->mr); - - QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { - if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) { - continue; - } - pcidev = container_of(vbasedev, VFIOPCIDevice, vbasedev); - if (OBJECT(pcidev) == owner) { - return true; - } - } - - return false; -} - static void vfio_dirty_tracking_update_range(VFIODirtyRanges *range, hwaddr iova, hwaddr end, bool update_pci) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index ab17a98ee5..314dddae4a 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -2653,6 +2653,26 @@ static int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f) return ret; } +/* + * BARs cannot be dma-mapped if the device is in D3hot state since + * linux commit 2b2c651baf1c ("vfio/pci: Invalidate mmaps and block + * the access in D3hot power state") + */ +static bool vfio_pci_is_dma_map_allowed(VFIODevice *vbasedev) +{ + VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); + uint16_t pmcsr; + 
uint8_t state; + + pmcsr = vfio_pci_read_config(&vdev->pdev, vdev->pm_cap + PCI_PM_CTRL, 2); + state = pmcsr & PCI_PM_CTRL_STATE_MASK; + if (state == 3) { + return false; + } + return true; +} + + static VFIODeviceOps vfio_pci_ops = { .vfio_compute_needs_reset = vfio_pci_compute_needs_reset, .vfio_hot_reset_multi = vfio_pci_hot_reset_multi, @@ -2660,6 +2680,7 @@ static VFIODeviceOps vfio_pci_ops = { .vfio_get_object = vfio_pci_get_object, .vfio_save_config = vfio_pci_save_config, .vfio_load_config = vfio_pci_load_config, + .vfio_is_dma_map_allowed = vfio_pci_is_dma_map_allowed, }; bool vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp) @@ -3477,3 +3498,4 @@ static void register_vfio_pci_dev_type(void) } type_init(register_vfio_pci_dev_type) + diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index c5385e1a4f..a0d5868c2f 100644 --- a/hw/vfio/trace-events +++ b/hw/vfio/trace-events @@ -121,6 +121,7 @@ vfio_legacy_dma_unmap_overflow_workaround(void) "" vfio_get_dirty_bitmap(uint64_t iova, uint64_t size, uint64_t bitmap_size, uint64_t start, uint64_t dirty_pages) "iova=0x%"PRIx64" size= 0x%"PRIx64" bitmap_size=0x%"PRIx64" start=0x%"PRIx64" dirty_pages=%"PRIu64 vfio_iommu_map_dirty_notify(uint64_t iova_start, uint64_t iova_end) "iommu dirty @ 0x%"PRIx64" - 0x%"PRIx64 vfio_reset_handler(void) "" +vfio_listener_region_add_skip(const char *name) "DMA MAP would fail on region %s due to incompatible power state, skip it" # platform.c vfio_platform_realize(char *name, char *compat) "vfio device %s, compat = %s" -- 2.47.1