On Thu, 16 Jan 2025 17:53:55 +0800 Wencheng Yang <east.moutain.y...@gmail.com> wrote:
> On confidential VM platforms, for example AMD-SEV, P2P doesn't work.
> The underlying reason is that the IOMMU driver sets the encryption bit
> in the IOMMU page table pte entry, which is reasonable if the pte maps
> an iova to system memory. However, if the pte maps an iova to a
> device's mmio bar space, setting the encryption bit on the pte causes
> the IOMMU to translate the iova to an incorrect bus address rather
> than the mmio bar address.
>
> To fix the issue, the key point is to let the IOMMU driver know whether
> the target physical address is system memory or device mmio.
>
> VFIO allocates a virtual address and maps it to the device's mmio bar;
> the @ram_device member of MemoryRegion indicates that the memory
> region is mmio. This patch passes that information down to the VFIO
> DMA API so the IOMMU driver can do the correct thing.
>
> Signed-off-by: Wencheng Yang <east.moutain.y...@gmail.com>
> ---
>  hw/vfio/common.c                      | 67 +++++++++++++++++----------
>  hw/vfio/container-base.c              |  4 +-
>  hw/vfio/container.c                   |  6 ++-
>  hw/vfio/iommufd.c                     |  4 +-
>  hw/virtio/vhost-vdpa.c                |  6 +--
>  include/exec/memory.h                 |  7 ++-
>  include/hw/vfio/vfio-common.h         |  4 ++
>  include/hw/vfio/vfio-container-base.h |  4 +-
>  linux-headers/linux/vfio.h            |  1 +
>  system/memory.c                       | 11 +++--
>  10 files changed, 74 insertions(+), 40 deletions(-)
>
> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> index f7499a9b74..2660a42f9e 100644
> --- a/hw/vfio/common.c
> +++ b/hw/vfio/common.c
> @@ -247,31 +247,42 @@ static bool vfio_listener_skipped_section(MemoryRegionSection *section)
>  
>  /* Called with rcu_read_lock held. */
>  static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr,
> -                               ram_addr_t *ram_addr, bool *read_only,
> +                               ram_addr_t *ram_addr, uint32_t *flag,
>                                 Error **errp)
>  {
>      bool ret, mr_has_discard_manager;
> +    uint32_t mr_flag = 0;
>  
> -    ret = memory_get_xlat_addr(iotlb, vaddr, ram_addr, read_only,
> +    ret = memory_get_xlat_addr(iotlb, vaddr, ram_addr, &mr_flag,
>                                 &mr_has_discard_manager, errp);
> -    if (ret && mr_has_discard_manager) {
> -        /*
> -         * Malicious VMs might trigger discarding of IOMMU-mapped memory. The
> -         * pages will remain pinned inside vfio until unmapped, resulting in a
> -         * higher memory consumption than expected. If memory would get
> -         * populated again later, there would be an inconsistency between pages
> -         * pinned by vfio and pages seen by QEMU. This is the case until
> -         * unmapped from the IOMMU (e.g., during device reset).
> -         *
> -         * With malicious guests, we really only care about pinning more memory
> -         * than expected. RLIMIT_MEMLOCK set for the user/process can never be
> -         * exceeded and can be used to mitigate this problem.
> -         */
> -        warn_report_once("Using vfio with vIOMMUs and coordinated discarding of"
> -                         " RAM (e.g., virtio-mem) works, however, malicious"
> -                         " guests can trigger pinning of more memory than"
> -                         " intended via an IOMMU. It's possible to mitigate "
> -                         " by setting/adjusting RLIMIT_MEMLOCK.");
> +    if (ret) {
> +        if (flag) {
> +            if (mr_flag & MRF_READONLY)
> +                *flag |= VFIO_MRF_READONLY;
> +
> +            if (mr_flag & MRF_RAMDEV)
> +                *flag |= VFIO_MRF_RAMDEV;
> +        }
> +
> +        if (mr_has_discard_manager) {
> +            /*
> +             * Malicious VMs might trigger discarding of IOMMU-mapped memory. The
> +             * pages will remain pinned inside vfio until unmapped, resulting in a
> +             * higher memory consumption than expected. If memory would get
> +             * populated again later, there would be an inconsistency between pages
> +             * pinned by vfio and pages seen by QEMU. This is the case until
> +             * unmapped from the IOMMU (e.g., during device reset).
> +             *
> +             * With malicious guests, we really only care about pinning more memory
> +             * than expected. RLIMIT_MEMLOCK set for the user/process can never be
> +             * exceeded and can be used to mitigate this problem.
> +             */
> +            warn_report_once("Using vfio with vIOMMUs and coordinated discarding of"
> +                             " RAM (e.g., virtio-mem) works, however, malicious"
> +                             " guests can trigger pinning of more memory than"
> +                             " intended via an IOMMU. It's possible to mitigate "
> +                             " by setting/adjusting RLIMIT_MEMLOCK.");
> +        }
>      }
>      return ret;
>  }
> @@ -298,9 +309,9 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
>      rcu_read_lock();
>  
>      if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) {
> -        bool read_only;
> +        uint32_t flag = 0;
>  
> -        if (!vfio_get_xlat_addr(iotlb, &vaddr, NULL, &read_only, &local_err)) {
> +        if (!vfio_get_xlat_addr(iotlb, &vaddr, NULL, &flag, &local_err)) {
>              error_report_err(local_err);
>              goto out;
>          }
> @@ -313,7 +324,7 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
>           */
>          ret = vfio_container_dma_map(bcontainer, iova,
>                                       iotlb->addr_mask + 1, vaddr,
> -                                     read_only);
> +                                     flag);
>          if (ret) {
>              error_report("vfio_container_dma_map(%p, 0x%"HWADDR_PRIx", "
>                           "0x%"HWADDR_PRIx", %p) = %d (%s)",
> @@ -363,6 +374,7 @@ static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl,
>                              int128_get64(section->size);
>      hwaddr start, next, iova;
>      void *vaddr;
> +    uint32_t flag = 0;
>      int ret;
>  
>      /*
> @@ -377,8 +389,10 @@ static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl,
>                 section->offset_within_address_space;
>          vaddr = memory_region_get_ram_ptr(section->mr) + start;
>  
> +        flag |= section->readonly? VFIO_MRF_READONLY: 0;
> +        flag |= section->ram_device? VFIO_MRF_RAMDEV: 0;
>          ret = vfio_container_dma_map(bcontainer, iova, next - start,
> -                                     vaddr, section->readonly);
> +                                     vaddr, flag);
>          if (ret) {
>              /* Rollback */
>              vfio_ram_discard_notify_discard(rdl, section);
> @@ -563,6 +577,7 @@ static void vfio_listener_region_add(MemoryListener *listener,
>      hwaddr iova, end;
>      Int128 llend, llsize;
>      void *vaddr;
> +    uint32_t flag = 0;
>      int ret;
>      Error *err = NULL;
>  
> @@ -661,8 +676,10 @@ static void vfio_listener_region_add(MemoryListener *listener,
>          }
>      }
>  
> +    flag |= section->readonly? VFIO_MRF_READONLY: 0;
> +    flag |= section->ram_device? VFIO_MRF_RAMDEV: 0;
>      ret = vfio_container_dma_map(bcontainer, iova, int128_get64(llsize),
> -                                 vaddr, section->readonly);
> +                                 vaddr, flag);
>      if (ret) {
>          error_setg(&err, "vfio_container_dma_map(%p, 0x%"HWADDR_PRIx", "
>                     "0x%"HWADDR_PRIx", %p) = %d (%s)",
> diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c
> index 749a3fd29d..7cee2ac562 100644
> --- a/hw/vfio/container-base.c
> +++ b/hw/vfio/container-base.c
> @@ -17,12 +17,12 @@
>  
>  int vfio_container_dma_map(VFIOContainerBase *bcontainer,
>                             hwaddr iova, ram_addr_t size,
> -                           void *vaddr, bool readonly)
> +                           void *vaddr, uint32_t flag)
>  {
>      VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer);
>  
>      g_assert(vioc->dma_map);
> -    return vioc->dma_map(bcontainer, iova, size, vaddr, readonly);
> +    return vioc->dma_map(bcontainer, iova, size, vaddr, flag);
>  }
>  
>  int vfio_container_dma_unmap(VFIOContainerBase *bcontainer,
> diff --git a/hw/vfio/container.c b/hw/vfio/container.c
> index 4ebb526808..90c32cd16d 100644
> --- a/hw/vfio/container.c
> +++ b/hw/vfio/container.c
> @@ -176,7 +176,7 @@ static int vfio_legacy_dma_unmap(const VFIOContainerBase *bcontainer,
>  }
>  
>  static int vfio_legacy_dma_map(const VFIOContainerBase *bcontainer, hwaddr iova,
> -                               ram_addr_t size, void *vaddr, bool readonly)
> +                               ram_addr_t size, void *vaddr, uint32_t flag)
>  {
>      const VFIOContainer *container = container_of(bcontainer, VFIOContainer,
>                                                    bcontainer);
> @@ -188,9 +188,11 @@ static int vfio_legacy_dma_map(const VFIOContainerBase *bcontainer, hwaddr iova,
>          .size = size,
>      };
>  
> -    if (!readonly) {
> +    if (!(flag & VFIO_MRF_READONLY)) {
>          map.flags |= VFIO_DMA_MAP_FLAG_WRITE;
>      }
> +    if (flag & VFIO_MRF_RAMDEV)
> +        map.flags |= VFIO_DMA_MAP_FLAG_MMIO;
>  
>      /*
>       * Try the mapping, if it fails with EBUSY, unmap the region and try
> diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
> index 3490a8f1eb..c773b45b5d 100644
> --- a/hw/vfio/iommufd.c
> +++ b/hw/vfio/iommufd.c
> @@ -28,14 +28,14 @@
>  #include "exec/ram_addr.h"
>  
>  static int iommufd_cdev_map(const VFIOContainerBase *bcontainer, hwaddr iova,
> -                            ram_addr_t size, void *vaddr, bool readonly)
> +                            ram_addr_t size, void *vaddr, uint32_t flag)
>  {
>      const VFIOIOMMUFDContainer *container =
>          container_of(bcontainer, VFIOIOMMUFDContainer, bcontainer);
>  
>      return iommufd_backend_map_dma(container->be,
>                                     container->ioas_id,
> -                                   iova, size, vaddr, readonly);
> +                                   iova, size, vaddr, flag & VFIO_MRF_READONLY);
>  }
>  
>  static int iommufd_cdev_unmap(const VFIOContainerBase *bcontainer,
> diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
> index 3cdaa12ed5..dea733ef8a 100644
> --- a/hw/virtio/vhost-vdpa.c
> +++ b/hw/virtio/vhost-vdpa.c
> @@ -226,15 +226,15 @@ static void vhost_vdpa_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
>      }
>  
>      if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) {
> -        bool read_only;
> +        uint32_t flag;
>  
> -        if (!memory_get_xlat_addr(iotlb, &vaddr, NULL, &read_only, NULL,
> +        if (!memory_get_xlat_addr(iotlb, &vaddr, NULL, &flag, NULL,
>                                    &local_err)) {
>              error_report_err(local_err);
>              return;
>          }
>          ret = vhost_vdpa_dma_map(s, VHOST_VDPA_GUEST_PA_ASID, iova,
> -                                 iotlb->addr_mask + 1, vaddr, read_only);
> +                                 iotlb->addr_mask + 1, vaddr, flag & MRF_READONLY);
>          if (ret) {
>              error_report("vhost_vdpa_dma_map(%p, 0x%" HWADDR_PRIx ", "
>                           "0x%" HWADDR_PRIx ", %p) = %d (%m)",
> diff --git a/include/exec/memory.h b/include/exec/memory.h
> index 9458e2801d..24405af0be 100644
> --- a/include/exec/memory.h
> +++ b/include/exec/memory.h
> @@ -104,10 +104,15 @@ struct MemoryRegionSection {
>      hwaddr offset_within_region;
>      hwaddr offset_within_address_space;
>      bool readonly;
> +    bool ram_device;
>      bool nonvolatile;
>      bool unmergeable;
>  };
>  
> +/* memory region flag */
> +#define MRF_READONLY    0x1
> +#define MRF_RAMDEV      0x2
> +
>  typedef struct IOMMUTLBEntry IOMMUTLBEntry;
>  
>  /* See address_space_translate: bit 0 is read, bit 1 is write. */
> @@ -742,7 +747,7 @@ void ram_discard_manager_unregister_listener(RamDiscardManager *rdm,
>   * Return: true on success, else false setting @errp with error.
>   */
>  bool memory_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr,
> -                          ram_addr_t *ram_addr, bool *read_only,
> +                          ram_addr_t *ram_addr, uint32_t *flag,
>                            bool *mr_has_discard_manager, Error **errp);
>  
>  typedef struct CoalescedMemoryRange CoalescedMemoryRange;
> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
> index 0c60be5b15..48018dc751 100644
> --- a/include/hw/vfio/vfio-common.h
> +++ b/include/hw/vfio/vfio-common.h
> @@ -43,6 +43,10 @@ enum {
>      VFIO_DEVICE_TYPE_AP = 3,
>  };
>  
> +/* vfio memory region flag */
> +#define VFIO_MRF_READONLY    0x1
> +#define VFIO_MRF_RAMDEV      0x2
> +
>  typedef struct VFIOMmap {
>      MemoryRegion mem;
>      void *mmap;
> diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h
> index 4cff9943ab..bb473e7201 100644
> --- a/include/hw/vfio/vfio-container-base.h
> +++ b/include/hw/vfio/vfio-container-base.h
> @@ -73,7 +73,7 @@ typedef struct VFIORamDiscardListener {
>  
>  int vfio_container_dma_map(VFIOContainerBase *bcontainer,
>                             hwaddr iova, ram_addr_t size,
> -                           void *vaddr, bool readonly);
> +                           void *vaddr, uint32_t flag);
>  int vfio_container_dma_unmap(VFIOContainerBase *bcontainer,
>                               hwaddr iova, ram_addr_t size,
>                               IOMMUTLBEntry *iotlb);
> @@ -113,7 +113,7 @@ struct VFIOIOMMUClass {
>      bool (*setup)(VFIOContainerBase *bcontainer, Error **errp);
>      int (*dma_map)(const VFIOContainerBase *bcontainer,
>                     hwaddr iova, ram_addr_t size,
> -                   void *vaddr, bool readonly);
> +                   void *vaddr, uint32_t flag);
>      int (*dma_unmap)(const VFIOContainerBase *bcontainer,
>                       hwaddr iova, ram_addr_t size,
>                       IOMMUTLBEntry *iotlb);
> diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h
> index 1b5e254d6a..4a32e70c33 100644
> --- a/linux-headers/linux/vfio.h
> +++ b/linux-headers/linux/vfio.h
> @@ -1560,6 +1560,7 @@ struct vfio_iommu_type1_dma_map {
>  #define VFIO_DMA_MAP_FLAG_READ (1 << 0)   /* readable from device */
>  #define VFIO_DMA_MAP_FLAG_WRITE (1 << 1)  /* writable from device */
>  #define VFIO_DMA_MAP_FLAG_VADDR (1 << 2)
> +#define VFIO_DMA_MAP_FLAG_MMIO (1 << 3)

Where's the kernel patch that implements the MMIO map flag?  That needs
to come first.

I also don't understand why we're creating multiple read-only and
ramdev flags to distill back into vfio mapping flags.
Thanks,
Alex

>  	__u64	vaddr;				/* Process virtual address */
>  	__u64	iova;				/* IO virtual address */
>  	__u64	size;				/* Size of mapping (bytes) */
> diff --git a/system/memory.c b/system/memory.c
> index b17b5538ff..71c54fc045 100644
> --- a/system/memory.c
> +++ b/system/memory.c
> @@ -223,6 +223,7 @@ struct FlatRange {
>      uint8_t dirty_log_mask;
>      bool romd_mode;
>      bool readonly;
> +    bool ram_device;
>      bool nonvolatile;
>      bool unmergeable;
>  };
> @@ -240,6 +241,7 @@ section_from_flat_range(FlatRange *fr, FlatView *fv)
>          .size = fr->addr.size,
>          .offset_within_address_space = int128_get64(fr->addr.start),
>          .readonly = fr->readonly,
> +        .ram_device = fr->ram_device,
>          .nonvolatile = fr->nonvolatile,
>          .unmergeable = fr->unmergeable,
>      };
> @@ -252,6 +254,7 @@ static bool flatrange_equal(FlatRange *a, FlatRange *b)
>          && a->offset_in_region == b->offset_in_region
>          && a->romd_mode == b->romd_mode
>          && a->readonly == b->readonly
> +        && a->ram_device == b->ram_device
>          && a->nonvolatile == b->nonvolatile
>          && a->unmergeable == b->unmergeable;
>  }
> @@ -657,6 +660,7 @@ static void render_memory_region(FlatView *view,
>      fr.dirty_log_mask = memory_region_get_dirty_log_mask(mr);
>      fr.romd_mode = mr->romd_mode;
>      fr.readonly = readonly;
> +    fr.ram_device = mr->ram_device;
>      fr.nonvolatile = nonvolatile;
>      fr.unmergeable = unmergeable;
>  
> @@ -2184,7 +2188,7 @@ void ram_discard_manager_unregister_listener(RamDiscardManager *rdm,
>  
>  /* Called with rcu_read_lock held. */
>  bool memory_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr,
> -                          ram_addr_t *ram_addr, bool *read_only,
> +                          ram_addr_t *ram_addr, uint32_t *flag,
>                            bool *mr_has_discard_manager, Error **errp)
>  {
>      MemoryRegion *mr;
> @@ -2246,8 +2250,9 @@ bool memory_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr,
>          *ram_addr = memory_region_get_ram_addr(mr) + xlat;
>      }
>  
> -    if (read_only) {
> -        *read_only = !writable || mr->readonly;
> +    if (flag) {
> +        *flag |= (!writable || mr->readonly)? MRF_READONLY: 0;
> +        *flag |= mr->ram_device? MRF_RAMDEV: 0;
>      }
>  
>      return true;
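
As a rough summary of the plumbing this series proposes, the translation from
section properties into DMA map flags amounts to the sketch below. Note that
VFIO_MRF_* and VFIO_DMA_MAP_FLAG_MMIO are defined only by this series (the
latter still needs the kernel-side patch Alex asks about), and SectionProps is
a stand-in for the relevant MemoryRegionSection fields, not a QEMU type.

/*
 * Condensed sketch of the proposed flag plumbing, assembled from the
 * hunks above; not the final upstream code.
 */
#include <stdbool.h>
#include <stdint.h>

#define VFIO_MRF_READONLY        0x1
#define VFIO_MRF_RAMDEV          0x2

#define VFIO_DMA_MAP_FLAG_READ   (1 << 0)
#define VFIO_DMA_MAP_FLAG_WRITE  (1 << 1)
#define VFIO_DMA_MAP_FLAG_MMIO   (1 << 3)   /* proposed, not upstream */

typedef struct {
    bool readonly;
    bool ram_device;    /* region is a device MMIO BAR, not guest RAM */
} SectionProps;

/* QEMU listener side: fold section properties into one flag word. */
static uint32_t section_to_vfio_mrf(const SectionProps *s)
{
    uint32_t flag = 0;

    flag |= s->readonly ? VFIO_MRF_READONLY : 0;
    flag |= s->ram_device ? VFIO_MRF_RAMDEV : 0;
    return flag;
}

/* Container backend side: translate the flag word into ioctl map flags. */
static uint32_t vfio_mrf_to_map_flags(uint32_t flag)
{
    uint32_t map_flags = VFIO_DMA_MAP_FLAG_READ;

    if (!(flag & VFIO_MRF_READONLY)) {
        map_flags |= VFIO_DMA_MAP_FLAG_WRITE;
    }
    if (flag & VFIO_MRF_RAMDEV) {
        /*
         * Hint that the target is device MMIO, so the IOMMU driver must
         * not set the encryption bit in the IOMMU PTE.
         */
        map_flags |= VFIO_DMA_MAP_FLAG_MMIO;
    }
    return map_flags;
}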