At the moment, writing a new TCE value to the IOMMU table fails with EBUSY
if there is already a valid entry. However, the PAPR specification allows
the guest to write a new TCE value without clearing the old one first.

This adds a set_and_get() callback to iommu_table_ops which does the same
thing as set() and also returns the replaced TCE(s) so that the caller can
release the pages afterwards.

This makes iommu_tce_build() call put_page() on the pages referenced by
the old TCEs returned by set_and_get().

Since we now depend on the permission bits in TCE entries, this preserves
those bits in the TCE in iommu_put_tce_user_mode().

This removes the use of pool locks, as those locks protect TCE allocations
rather than IOMMU table access, and the new set_and_get() callback provides
a lockless way to release pages safely.

This disables external IOMMU use (i.e. VFIO) for IOMMUs which do not
implement the set_and_get() callback. Therefore, "powernv" is the only
platform supported.

Signed-off-by: Alexey Kardashevskiy <a...@ozlabs.ru>
---
Changes:
v4:
* this is a merge and rework of:
        powerpc/powernv: Return non-zero TCE from pnv_tce_build
        powerpc/iommu: Implement put_page() if TCE had non-zero value
        powerpc/iommu: Extend ppc_md.tce_build(_rm) to return old TCE values
---
 arch/powerpc/include/asm/iommu.h     |  6 ++++++
 arch/powerpc/kernel/iommu.c          | 28 +++++++++++++++-------------
 arch/powerpc/platforms/powernv/pci.c | 29 +++++++++++++++++++++++------
 3 files changed, 44 insertions(+), 19 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index c725e4a..4b13e4e 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -49,6 +49,12 @@ struct iommu_table_ops {
                        unsigned long uaddr,
                        enum dma_data_direction direction,
                        struct dma_attrs *attrs);
+       int (*set_and_get)(struct iommu_table *tbl,
+                       long index, long npages,
+                       unsigned long uaddr,
+                       unsigned long *old_tces,
+                       enum dma_data_direction direction,
+                       struct dma_attrs *attrs);
        void (*clear)(struct iommu_table *tbl,
                        long index, long npages);
        unsigned long (*get)(struct iommu_table *tbl, long index);
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 6a86788..ad52e00 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -1007,9 +1007,6 @@ EXPORT_SYMBOL_GPL(iommu_tce_put_param_check);
 unsigned long iommu_clear_tce(struct iommu_table *tbl, unsigned long entry)
 {
        unsigned long oldtce;
-       struct iommu_pool *pool = get_pool(tbl, entry);
-
-       spin_lock(&(pool->lock));
 
        oldtce = tbl->it_ops->get(tbl, entry);
        if (oldtce & (TCE_PCI_WRITE | TCE_PCI_READ))
@@ -1017,8 +1014,6 @@ unsigned long iommu_clear_tce(struct iommu_table *tbl, 
unsigned long entry)
        else
                oldtce = 0;
 
-       spin_unlock(&(pool->lock));
-
        return oldtce;
 }
 EXPORT_SYMBOL_GPL(iommu_clear_tce);
@@ -1056,16 +1051,12 @@ int iommu_tce_build(struct iommu_table *tbl, unsigned 
long entry,
 {
        int ret = -EBUSY;
        unsigned long oldtce;
-       struct iommu_pool *pool = get_pool(tbl, entry);
 
-       spin_lock(&(pool->lock));
+       ret = tbl->it_ops->set_and_get(tbl, entry, 1, hwaddr, &oldtce,
+                       direction, NULL);
 
-       oldtce = tbl->it_ops->get(tbl, entry);
-       /* Add new entry if it is not busy */
-       if (!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)))
-               ret = tbl->it_ops->set(tbl, entry, 1, hwaddr, direction, NULL);
-
-       spin_unlock(&(pool->lock));
+       if (oldtce & (TCE_PCI_WRITE | TCE_PCI_READ))
+               put_page(pfn_to_page(__pa(oldtce) >> PAGE_SHIFT));
 
        /* if (unlikely(ret))
                pr_err("iommu_tce: %s failed on hwaddr=%lx ioba=%lx kva=%lx 
ret=%d\n",
@@ -1092,6 +1083,7 @@ int iommu_put_tce_user_mode(struct iommu_table *tbl, 
unsigned long entry,
                return -EFAULT;
        }
        hwaddr = (unsigned long) page_address(page) + offset;
+       hwaddr |= tce & (TCE_PCI_READ | TCE_PCI_WRITE);
 
        ret = iommu_tce_build(tbl, entry, hwaddr, direction);
        if (ret)
@@ -1110,6 +1102,16 @@ int iommu_take_ownership(struct iommu_table *tbl)
        unsigned long flags, i, sz = (tbl->it_size + 7) >> 3;
        int ret = 0, bit0 = 0;
 
+       /*
+        * VFIO does not control TCE entries allocation and the guest
+        * can write new TCEs on top of existing ones so iommu_tce_build()
+        * must be able to release old pages. This functionality
+        * requires set_and_get() callback defined so if it is not
+        * implemented, we disallow taking ownership over the table.
+        */
+       if (!tbl->it_ops->set_and_get)
+               return -EINVAL;
+
        spin_lock_irqsave(&tbl->large_pool.lock, flags);
        for (i = 0; i < tbl->nr_pools; i++)
                spin_lock(&tbl->pools[i].lock);
diff --git a/arch/powerpc/platforms/powernv/pci.c 
b/arch/powerpc/platforms/powernv/pci.c
index 1179c63..629d443 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -572,12 +572,14 @@ static void pnv_tce_invalidate(struct iommu_table *tbl, 
__be64 *startp,
 }
 
 static int pnv_tce_build(struct iommu_table *tbl, long index, long npages,
-                        unsigned long uaddr, enum dma_data_direction direction,
+                        unsigned long uaddr, unsigned long *old_tces,
+                        enum dma_data_direction direction,
                         struct dma_attrs *attrs, bool rm)
 {
        u64 proto_tce;
        __be64 *tcep, *tces;
        u64 rpn;
+       long i;
 
        proto_tce = TCE_PCI_READ; // Read allowed
 
@@ -587,9 +589,13 @@ static int pnv_tce_build(struct iommu_table *tbl, long 
index, long npages,
        tces = tcep = ((__be64 *)tbl->it_base) + index - tbl->it_offset;
        rpn = __pa(uaddr) >> tbl->it_page_shift;
 
-       while (npages--)
-               *(tcep++) = cpu_to_be64(proto_tce |
-                               (rpn++ << tbl->it_page_shift));
+       for (i = 0; i < npages; i++) {
+               unsigned long oldtce = xchg(tcep, cpu_to_be64(proto_tce |
+                               (rpn++ << tbl->it_page_shift)));
+               if (old_tces)
+                       old_tces[i] = (unsigned long) __va(oldtce);
+               tcep++;
+       }
 
        pnv_tce_invalidate(tbl, tces, tcep - 1, rm);
 
@@ -601,8 +607,18 @@ static int pnv_tce_build_vm(struct iommu_table *tbl, long 
index, long npages,
                            enum dma_data_direction direction,
                            struct dma_attrs *attrs)
 {
-       return pnv_tce_build(tbl, index, npages, uaddr, direction, attrs,
-                       false);
+       return pnv_tce_build(tbl, index, npages, uaddr, NULL, direction,
+                       attrs, false);
+}
+
+static int pnv_tce_set_and_get_vm(struct iommu_table *tbl, long index,
+                                 long npages,
+                                 unsigned long uaddr, unsigned long *old_tces,
+                                 enum dma_data_direction direction,
+                                 struct dma_attrs *attrs)
+{
+       return pnv_tce_build(tbl, index, npages, uaddr, old_tces, direction,
+                       attrs, false);
 }
 
 static void pnv_tce_free(struct iommu_table *tbl, long index, long npages,
@@ -630,6 +646,7 @@ static unsigned long pnv_tce_get(struct iommu_table *tbl, 
long index)
 
 struct iommu_table_ops pnv_iommu_ops = {
        .set = pnv_tce_build_vm,
+       .set_and_get = pnv_tce_set_and_get_vm,
        .clear = pnv_tce_free_vm,
        .get = pnv_tce_get,
 };
-- 
2.0.0

_______________________________________________
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Reply via email to