On Wed, 7 Jul 2021 10:20:25 -0700 Steve Sistare <steven.sist...@oracle.com> wrote: > diff --git a/hw/vfio/common.c b/hw/vfio/common.c > index 9220e64..40c882f 100644 > --- a/hw/vfio/common.c > +++ b/hw/vfio/common.c > @@ -31,6 +31,7 @@ > #include "exec/memory.h" > #include "exec/ram_addr.h" > #include "hw/hw.h" > +#include "qemu/env.h" > #include "qemu/error-report.h" > #include "qemu/main-loop.h" > #include "qemu/range.h" > @@ -440,6 +441,10 @@ static int vfio_dma_unmap(VFIOContainer *container, > return vfio_dma_unmap_bitmap(container, iova, size, iotlb); > } > > + if (container->reused) { > + return 0; > + } > + > while (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) { > /* > * The type1 backend has an off-by-one bug in the kernel > (71a7d3d78e3c > @@ -463,6 +468,11 @@ static int vfio_dma_unmap(VFIOContainer *container, > return -errno; > } > > + if (unmap.size != size) { > + warn_report("VFIO_UNMAP_DMA(0x%lx, 0x%lx) only unmaps 0x%llx", > + iova, size, unmap.size); > + } > +
I'm a tad nervous that we have paths that can trigger this, the ioctl certainly supports that we can call it across multiple mappings and the size returned is the sum of the previously mapped ranges that were unmapped. See for instance vfio_listener_region_del()'s use of this function. > return 0; > } > > @@ -477,6 +487,10 @@ static int vfio_dma_map(VFIOContainer *container, hwaddr > iova, > .size = size, > }; > > + if (container->reused) { > + return 0; > + } > + > if (!readonly) { > map.flags |= VFIO_DMA_MAP_FLAG_WRITE; > } > @@ -1603,6 +1617,10 @@ static int vfio_init_container(VFIOContainer > *container, int group_fd, > if (iommu_type < 0) { > return iommu_type; > } > + if (container->reused) { > + container->iommu_type = iommu_type; > + return 0; > + } How would this handle the case where SPAPR_TCE_v2 falls back to SPAPR_TCE (v1)? > > ret = ioctl(group_fd, VFIO_GROUP_SET_CONTAINER, &container->fd); > if (ret) { ... > diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c > index 9fc12bc..0f5c542 100644 > --- a/hw/vfio/pci.c > +++ b/hw/vfio/pci.c > @@ -3264,6 +3272,61 @@ static Property vfio_pci_dev_properties[] = { > DEFINE_PROP_END_OF_LIST(), > }; > > +static void vfio_merge_config(VFIOPCIDevice *vdev) > +{ > + PCIDevice *pdev = &vdev->pdev; > + int size = MIN(pci_config_size(pdev), vdev->config_size); > + uint8_t *phys_config = g_malloc(size); > + uint32_t mask; > + int ret, i; > + > + ret = pread(vdev->vbasedev.fd, phys_config, size, vdev->config_offset); > + if (ret < size) { > + ret = ret < 0 ? errno : EFAULT; Leaks phys_config > + error_report("failed to read device config space: %s", > strerror(ret)); > + return; > + } > + > + for (i = 0; i < size; i++) { > + mask = vdev->emulated_config_bits[i]; > + pdev->config[i] = (pdev->config[i] & mask) | (phys_config[i] & > ~mask); > + } > + > + g_free(phys_config); > +} > + > +static int vfio_pci_post_load(void *opaque, int version_id) > +{ > + VFIOPCIDevice *vdev = opaque; > + PCIDevice *pdev = &vdev->pdev; > + bool enabled; > + > + vfio_merge_config(vdev); > + > + pdev->reused = false; > + enabled = pci_get_word(pdev->config + PCI_COMMAND) & PCI_COMMAND_MASTER; > + memory_region_set_enabled(&pdev->bus_master_enable_region, enabled); This seems generic to any PCI device, I'm surprised we need to do it explicitly. Thanks, Alex > + > + return 0; > +} > +