At the moment, writing a new TCE value into the IOMMU table fails with EBUSY if there is a valid entry already. However, the PAPR specification allows the guest to write a new TCE value without clearing the old one first.

This adds a set_and_get() callback to iommu_table_ops which does the same thing as set() but also returns the replaced TCE(s) so the caller can release the pages afterwards; iommu_tce_build() now puts the pages returned by set_and_get().

Since we now depend on the permission bits in TCE entries, iommu_put_tce_user_mode() preserves those bits in the TCE.

This removes the use of the pool locks: those locks protect TCE allocation rather than IOMMU table access, and the new set_and_get() callback provides a lockless way of releasing pages safely.

This disables external IOMMU use (i.e. VFIO) for IOMMUs which do not implement the set_and_get() callback, so the "powernv" platform is the only supported one.

Signed-off-by: Alexey Kardashevskiy <a...@ozlabs.ru>
---
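For reviewers, the replace-and-release flow that iommu_tce_build() ends up with boils down to the sketch below. This is illustrative only and not part of the diff; the function name example_replace_tce() is made up, and error reporting and the real-mode path are omitted:

/*
 * Illustrative sketch: set_and_get() installs the new TCE and hands
 * back whatever was mapped before, so the caller can drop the
 * reference on the old page without taking the pool locks.
 */
static int example_replace_tce(struct iommu_table *tbl, unsigned long entry,
		unsigned long hwaddr, enum dma_data_direction direction)
{
	unsigned long oldtce = 0;
	int ret;

	ret = tbl->it_ops->set_and_get(tbl, entry, 1, hwaddr, &oldtce,
			direction, NULL);

	/* The previous TCE was valid - release the page it referenced */
	if (oldtce & (TCE_PCI_WRITE | TCE_PCI_READ))
		put_page(pfn_to_page(__pa(oldtce) >> PAGE_SHIFT));

	return ret;
}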
Changes:
v4:
* this is a merge + rework of:
	powerpc/powernv: Return non-zero TCE from pnv_tce_build
	powerpc/iommu: Implement put_page() if TCE had non-zero value
	powerpc/iommu: Extend ppc_md.tce_build(_rm) to return old TCE values
---
 arch/powerpc/include/asm/iommu.h     |  6 ++++++
 arch/powerpc/kernel/iommu.c          | 28 +++++++++++++++-------------
 arch/powerpc/platforms/powernv/pci.c | 29 +++++++++++++++++++++++------
 3 files changed, 44 insertions(+), 19 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index c725e4a..4b13e4e 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -49,6 +49,12 @@ struct iommu_table_ops {
 			unsigned long uaddr,
 			enum dma_data_direction direction,
 			struct dma_attrs *attrs);
+	int (*set_and_get)(struct iommu_table *tbl,
+			long index, long npages,
+			unsigned long uaddr,
+			unsigned long *old_tces,
+			enum dma_data_direction direction,
+			struct dma_attrs *attrs);
 	void (*clear)(struct iommu_table *tbl,
 			long index, long npages);
 	unsigned long (*get)(struct iommu_table *tbl, long index);
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 6a86788..ad52e00 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -1007,9 +1007,6 @@ EXPORT_SYMBOL_GPL(iommu_tce_put_param_check);
 unsigned long iommu_clear_tce(struct iommu_table *tbl, unsigned long entry)
 {
 	unsigned long oldtce;
-	struct iommu_pool *pool = get_pool(tbl, entry);
-
-	spin_lock(&(pool->lock));
 
 	oldtce = tbl->it_ops->get(tbl, entry);
 	if (oldtce & (TCE_PCI_WRITE | TCE_PCI_READ))
@@ -1017,8 +1014,6 @@ unsigned long iommu_clear_tce(struct iommu_table *tbl, unsigned long entry)
 	else
 		oldtce = 0;
 
-	spin_unlock(&(pool->lock));
-
 	return oldtce;
 }
 EXPORT_SYMBOL_GPL(iommu_clear_tce);
@@ -1056,16 +1051,12 @@ int iommu_tce_build(struct iommu_table *tbl, unsigned long entry,
 {
 	int ret = -EBUSY;
 	unsigned long oldtce;
-	struct iommu_pool *pool = get_pool(tbl, entry);
 
-	spin_lock(&(pool->lock));
+	ret = tbl->it_ops->set_and_get(tbl, entry, 1, hwaddr, &oldtce,
+			direction, NULL);
 
-	oldtce = tbl->it_ops->get(tbl, entry);
-	/* Add new entry if it is not busy */
-	if (!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)))
-		ret = tbl->it_ops->set(tbl, entry, 1, hwaddr, direction, NULL);
-
-	spin_unlock(&(pool->lock));
+	if (oldtce & (TCE_PCI_WRITE | TCE_PCI_READ))
+		put_page(pfn_to_page(__pa(oldtce) >> PAGE_SHIFT));
 
 	/* if (unlikely(ret))
 		pr_err("iommu_tce: %s failed on hwaddr=%lx ioba=%lx kva=%lx ret=%d\n",
@@ -1092,6 +1083,7 @@ int iommu_put_tce_user_mode(struct iommu_table *tbl, unsigned long entry,
 		return -EFAULT;
 	}
 	hwaddr = (unsigned long) page_address(page) + offset;
+	hwaddr |= tce & (TCE_PCI_READ | TCE_PCI_WRITE);
 
 	ret = iommu_tce_build(tbl, entry, hwaddr, direction);
 	if (ret)
@@ -1110,6 +1102,16 @@ int iommu_take_ownership(struct iommu_table *tbl)
 	unsigned long flags, i, sz = (tbl->it_size + 7) >> 3;
 	int ret = 0, bit0 = 0;
 
+	/*
+	 * VFIO does not control TCE entries allocation and the guest
+	 * can write new TCEs on top of existing ones so iommu_tce_build()
+	 * must be able to release old pages. This functionality
+	 * requires set_and_get() callback defined so if it is not
+	 * implemented, we disallow taking ownership over the table.
+	 */
+	if (!tbl->it_ops->set_and_get)
+		return -EINVAL;
+
 	spin_lock_irqsave(&tbl->large_pool.lock, flags);
 	for (i = 0; i < tbl->nr_pools; i++)
 		spin_lock(&tbl->pools[i].lock);
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index 1179c63..629d443 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -572,12 +572,14 @@ static void pnv_tce_invalidate(struct iommu_table *tbl, __be64 *startp,
 }
 
 static int pnv_tce_build(struct iommu_table *tbl, long index, long npages,
-			 unsigned long uaddr, enum dma_data_direction direction,
+			 unsigned long uaddr, unsigned long *old_tces,
+			 enum dma_data_direction direction,
 			 struct dma_attrs *attrs, bool rm)
 {
 	u64 proto_tce;
 	__be64 *tcep, *tces;
 	u64 rpn;
+	long i;
 
 	proto_tce = TCE_PCI_READ; // Read allowed
@@ -587,9 +589,13 @@ static int pnv_tce_build(struct iommu_table *tbl, long index, long npages,
 	tces = tcep = ((__be64 *)tbl->it_base) + index - tbl->it_offset;
 	rpn = __pa(uaddr) >> tbl->it_page_shift;
 
-	while (npages--)
-		*(tcep++) = cpu_to_be64(proto_tce |
-				(rpn++ << tbl->it_page_shift));
+	for (i = 0; i < npages; i++) {
+		unsigned long oldtce = xchg(tcep, cpu_to_be64(proto_tce |
+				(rpn++ << tbl->it_page_shift)));
+		if (old_tces)
+			old_tces[i] = (unsigned long) __va(oldtce);
+		tcep++;
+	}
 
 	pnv_tce_invalidate(tbl, tces, tcep - 1, rm);
 
@@ -601,8 +607,18 @@ static int pnv_tce_build_vm(struct iommu_table *tbl, long index, long npages,
 		enum dma_data_direction direction,
 		struct dma_attrs *attrs)
 {
-	return pnv_tce_build(tbl, index, npages, uaddr, direction, attrs,
-			false);
+	return pnv_tce_build(tbl, index, npages, uaddr, NULL, direction,
+			attrs, false);
+}
+
+static int pnv_tce_set_and_get_vm(struct iommu_table *tbl, long index,
+		long npages,
+		unsigned long uaddr, unsigned long *old_tces,
+		enum dma_data_direction direction,
+		struct dma_attrs *attrs)
+{
+	return pnv_tce_build(tbl, index, npages, uaddr, old_tces, direction,
+			attrs, false);
 }
 
 static void pnv_tce_free(struct iommu_table *tbl, long index, long npages,
@@ -630,6 +646,7 @@ static unsigned long pnv_tce_get(struct iommu_table *tbl, long index)
 
 struct iommu_table_ops pnv_iommu_ops = {
 	.set = pnv_tce_build_vm,
+	.set_and_get = pnv_tce_set_and_get_vm,
 	.clear = pnv_tce_free_vm,
 	.get = pnv_tce_get,
 };
--
2.0.0