Convert the VFIO device fd fops->mmap to create a DMABUF representing the BAR mapping, and make the VMA fault handler look up PFNs from the corresponding DMABUF. This supports future code that mmap()s BAR DMABUFs, as well as the iommufd work to support Type1 P2P.
First, vfio_pci_core_mmap() uses the new vfio_pci_core_mmap_prep_dmabuf() helper to export a DMABUF representing a single BAR range. Then, the vfio_pci_mmap_huge_fault() callback is updated to understand revoked buffers, and uses the new vfio_pci_dma_buf_find_pfn() helper to determine the PFN for a given fault address. Now that the VFIO DMABUFs can be mmap()ed, vfio_pci_dma_buf_move() and vfio_pci_dma_buf_cleanup() need to zap PTEs on revocation and cleanup paths. CONFIG_VFIO_PCI_CORE now unconditionally depends on CONFIG_DMA_SHARED_BUFFER. CONFIG_VFIO_PCI_DMABUF remains, to conditionally include support for VFIO_DEVICE_FEATURE_DMA_BUF, and depends on CONFIG_PCI_P2PDMA. Signed-off-by: Matt Evans <[email protected]> --- drivers/vfio/pci/Kconfig | 3 +- drivers/vfio/pci/Makefile | 3 +- drivers/vfio/pci/vfio_pci_core.c | 73 ++++++++++++++++++++---------- drivers/vfio/pci/vfio_pci_dmabuf.c | 14 ++++++ drivers/vfio/pci/vfio_pci_priv.h | 11 +---- 5 files changed, 67 insertions(+), 37 deletions(-) diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig index 1e82b44bda1a..bf5c64d1fe22 100644 --- a/drivers/vfio/pci/Kconfig +++ b/drivers/vfio/pci/Kconfig @@ -6,6 +6,7 @@ config VFIO_PCI_CORE tristate select VFIO_VIRQFD select IRQ_BYPASS_MANAGER + select DMA_SHARED_BUFFER config VFIO_PCI_INTX def_bool y if !S390 @@ -56,7 +57,7 @@ config VFIO_PCI_ZDEV_KVM To enable s390x KVM vfio-pci extensions, say Y. 
config VFIO_PCI_DMABUF - def_bool y if VFIO_PCI_CORE && PCI_P2PDMA && DMA_SHARED_BUFFER + def_bool y if PCI_P2PDMA source "drivers/vfio/pci/mlx5/Kconfig" diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile index e0a0757dd1d2..bab7a33a2b31 100644 --- a/drivers/vfio/pci/Makefile +++ b/drivers/vfio/pci/Makefile @@ -1,8 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only -vfio-pci-core-y := vfio_pci_core.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o +vfio-pci-core-y := vfio_pci_core.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o vfio_pci_dmabuf.o vfio-pci-core-$(CONFIG_VFIO_PCI_ZDEV_KVM) += vfio_pci_zdev.o -vfio-pci-core-$(CONFIG_VFIO_PCI_DMABUF) += vfio_pci_dmabuf.o obj-$(CONFIG_VFIO_PCI_CORE) += vfio-pci-core.o vfio-pci-y := vfio_pci.o diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index f9ed3374d268..41224efa58d8 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -1648,18 +1648,6 @@ void vfio_pci_memory_unlock_and_restore(struct vfio_pci_core_device *vdev, u16 c up_write(&vdev->memory_lock); } -static unsigned long vma_to_pfn(struct vm_area_struct *vma) -{ - struct vfio_pci_core_device *vdev = vma->vm_private_data; - int index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT); - u64 pgoff; - - pgoff = vma->vm_pgoff & - ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1); - - return (pci_resource_start(vdev->pdev, index) >> PAGE_SHIFT) + pgoff; -} - vm_fault_t vfio_pci_vmf_insert_pfn(struct vfio_pci_core_device *vdev, struct vm_fault *vmf, unsigned long pfn, @@ -1692,23 +1680,45 @@ static vm_fault_t vfio_pci_mmap_huge_fault(struct vm_fault *vmf, unsigned int order) { struct vm_area_struct *vma = vmf->vma; - struct vfio_pci_core_device *vdev = vma->vm_private_data; - unsigned long addr = vmf->address & ~((PAGE_SIZE << order) - 1); - unsigned long pgoff = (addr - vma->vm_start) >> PAGE_SHIFT; - unsigned long pfn = vma_to_pfn(vma) + pgoff; + struct vfio_pci_dma_buf *priv 
= vma->vm_private_data; + struct vfio_pci_core_device *vdev; + unsigned long pfn; vm_fault_t ret = VM_FAULT_FALLBACK; + int pres; + + vdev = READ_ONCE(priv->vdev); - if (is_aligned_for_order(vma, addr, pfn, order)) { - scoped_guard(rwsem_read, &vdev->memory_lock) - ret = vfio_pci_vmf_insert_pfn(vdev, vmf, pfn, order); + /* + * A fault might occur after vfio_pci_dma_buf_cleanup() has + * revoked and destroyed the vdev's DMABUFs, and annulled + * vdev. After creation, vdev is only ever written in + * cleanup. + */ + if (!vdev) + return VM_FAULT_SIGBUS; + + pres = vfio_pci_dma_buf_find_pfn(priv, vma, vmf->address, order, &pfn); + + if (pres == 0) { + scoped_guard(rwsem_read, &vdev->memory_lock) { + /* + * A buffer's revocation/unmap and status + * change occurs whilst holding memory_lock, + * so protects against racing faults. + */ + if (priv->revoked) + ret = VM_FAULT_SIGBUS; + else + ret = vfio_pci_vmf_insert_pfn(vdev, vmf, pfn, order); + } + } else if (pres != -EAGAIN) { + ret = VM_FAULT_SIGBUS; } dev_dbg_ratelimited(&vdev->pdev->dev, - "%s(,order = %d) BAR %ld page offset 0x%lx: 0x%x\n", - __func__, order, - vma->vm_pgoff >> - (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT), - pgoff, (unsigned int)ret); + "%s(order = %d) PFN 0x%lx, VA 0x%lx, pgoff 0x%lx: 0x%x\n", + __func__, order, pfn, vmf->address, vma->vm_pgoff, + (unsigned int)ret); return ret; } @@ -1773,7 +1783,20 @@ int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma if (ret) return ret; - vma->vm_private_data = vdev; + /* + * Create a DMABUF with a single range corresponding to this + * mapping, and wire it into vma->vm_private_data. The VMA's + * vm_file becomes that of the DMABUF, and the DMABUF takes + * ownership of the VFIO device file (put upon DMABUF + * release). This maintains the behaviour of a live VMA + * mapping holding the VFIO device file open. 
+ */ + ret = vfio_pci_core_mmap_prep_dmabuf(vdev, vma, + pci_resource_start(pdev, index), + pgoff, req_len); + if (ret) + return ret; + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot); diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c b/drivers/vfio/pci/vfio_pci_dmabuf.c index 76db340ba592..197f50365ee1 100644 --- a/drivers/vfio/pci/vfio_pci_dmabuf.c +++ b/drivers/vfio/pci/vfio_pci_dmabuf.c @@ -9,6 +9,7 @@ MODULE_IMPORT_NS("DMA_BUF"); +#ifdef CONFIG_VFIO_PCI_DMABUF static int vfio_pci_dma_buf_attach(struct dma_buf *dmabuf, struct dma_buf_attachment *attachment) { @@ -25,6 +26,7 @@ static int vfio_pci_dma_buf_attach(struct dma_buf *dmabuf, return 0; } +#endif /* CONFIG_VFIO_PCI_DMABUF */ static void vfio_pci_dma_buf_done(struct kref *kref) { @@ -89,7 +91,9 @@ static void vfio_pci_dma_buf_release(struct dma_buf *dmabuf) } static const struct dma_buf_ops vfio_pci_dmabuf_ops = { +#ifdef CONFIG_VFIO_PCI_DMABUF .attach = vfio_pci_dma_buf_attach, +#endif .map_dma_buf = vfio_pci_dma_buf_map, .unmap_dma_buf = vfio_pci_dma_buf_unmap, .release = vfio_pci_dma_buf_release, @@ -219,6 +223,7 @@ static int vfio_pci_dmabuf_export(struct vfio_pci_core_device *vdev, return 0; } +#ifdef CONFIG_VFIO_PCI_DMABUF /* * This is a temporary "private interconnect" between VFIO DMABUF and iommufd. * It allows the two co-operating drivers to exchange the physical address of @@ -428,6 +433,7 @@ int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags, kfree(dma_ranges); return ret; } +#endif /* CONFIG_VFIO_PCI_DMABUF */ int vfio_pci_core_mmap_prep_dmabuf(struct vfio_pci_core_device *vdev, struct vm_area_struct *vma, @@ -490,6 +496,10 @@ void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked) struct vfio_pci_dma_buf *tmp; lockdep_assert_held_write(&vdev->memory_lock); + /* + * Holding memory_lock ensures a racing VMA fault observes + * priv->revoked properly. 
+ */ list_for_each_entry_safe(priv, tmp, &vdev->dmabufs, dmabufs_elm) { if (!get_file_active(&priv->dmabuf->file)) @@ -507,6 +517,8 @@ void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked) if (revoked) { kref_put(&priv->kref, vfio_pci_dma_buf_done); wait_for_completion(&priv->comp); + unmap_mapping_range(priv->dmabuf->file->f_mapping, + 0, priv->size, 1); } else { /* * Kref is initialize again, because when revoke @@ -550,6 +562,8 @@ void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev) dma_resv_unlock(priv->dmabuf->resv); kref_put(&priv->kref, vfio_pci_dma_buf_done); wait_for_completion(&priv->comp); + unmap_mapping_range(priv->dmabuf->file->f_mapping, + 0, priv->size, 1); vfio_device_put_registration(&vdev->vdev); fput(priv->dmabuf->file); } diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h index 5fd3a6e00a0e..37ece9b4b5bd 100644 --- a/drivers/vfio/pci/vfio_pci_priv.h +++ b/drivers/vfio/pci/vfio_pci_priv.h @@ -132,13 +132,13 @@ int vfio_pci_dma_buf_find_pfn(struct vfio_pci_dma_buf *vpdmabuf, int vfio_pci_core_mmap_prep_dmabuf(struct vfio_pci_core_device *vdev, struct vm_area_struct *vma, u64 phys_start, u64 pgoff, u64 req_len); +void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev); +void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked); #ifdef CONFIG_VFIO_PCI_DMABUF int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags, struct vfio_device_feature_dma_buf __user *arg, size_t argsz); -void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev); -void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked); #else static inline int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags, @@ -147,13 +147,6 @@ vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags, { return -ENOTTY; } -static inline void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev) -{ -} -static inline void 
vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, - bool revoked) -{ -} #endif #endif -- 2.47.3
