On Fri, 2012-12-07 at 18:35 +1100, Alexey Kardashevskiy wrote:
> This patch initializes IOMMU groups based on the IOMMU
> configuration discovered during the PCI scan on POWERNV
> (POWER non virtualized) platform. The IOMMU groups are
> to be used later by VFIO driver (PCI pass through).
> 
> It also implements an API for mapping/unmapping pages for
> guest PCI drivers and providing DMA window properties.
> This API is going to be used later by QEMU-VFIO to handle
> h_put_tce hypercalls from the KVM guest.
> 
> Although this driver has been tested only on the POWERNV
> platform, it should work on any platform which supports
> TCE tables.
> 
> To enable VFIO on POWER, enable SPAPR_TCE_IOMMU config
> option and configure VFIO as required.
> 
> Cc: David Gibson <da...@gibson.dropbear.id.au>
> Signed-off-by: Alexey Kardashevskiy <a...@ozlabs.ru>
> ---
>  arch/powerpc/include/asm/iommu.h     |   10 ++
>  arch/powerpc/kernel/iommu.c          |  214 ++++++++++++++++++++++++++++++++++
>  arch/powerpc/platforms/powernv/pci.c |  134 +++++++++++++++++++++
>  drivers/iommu/Kconfig                |    8 ++
>  4 files changed, 366 insertions(+)
> 
> diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
> index cbfe678..be3b11b 100644
> --- a/arch/powerpc/include/asm/iommu.h
> +++ b/arch/powerpc/include/asm/iommu.h
> @@ -76,6 +76,9 @@ struct iommu_table {
>       struct iommu_pool large_pool;
>       struct iommu_pool pools[IOMMU_NR_POOLS];
>       unsigned long *it_map;       /* A simple allocation bitmap for now */
> +#ifdef CONFIG_IOMMU_API
> +     struct iommu_group *it_group;
> +#endif
>  };
>  
>  struct scatterlist;
> @@ -147,5 +150,12 @@ static inline void iommu_restore(void)
>  }
>  #endif
>  
> +extern void iommu_reset_table(struct iommu_table *tbl, bool release);
> +extern long iommu_clear_tces(struct iommu_table *tbl, unsigned long entry,
> +             unsigned long pages);
> +extern long iommu_put_tces(struct iommu_table *tbl, unsigned long entry,
> +             uint64_t tce, enum dma_data_direction direction,
> +             unsigned long pages);
> +
>  #endif /* __KERNEL__ */
>  #endif /* _ASM_IOMMU_H */
> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
> index ff5a6ce..123431a 100644
> --- a/arch/powerpc/kernel/iommu.c
> +++ b/arch/powerpc/kernel/iommu.c
> @@ -44,6 +44,7 @@
>  #include <asm/kdump.h>
>  #include <asm/fadump.h>
>  #include <asm/vio.h>
> +#include <asm/tce.h>
>  
>  #define DBG(...)
>  
> @@ -856,3 +857,216 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t size,
>               free_pages((unsigned long)vaddr, get_order(size));
>       }
>  }
> +
> +#ifdef CONFIG_IOMMU_API
> +/*
> + * SPAPR TCE API
> + */
> +
> +/*
> + * iommu_reset_table is called when it started/stopped being used
> + */
> +void iommu_reset_table(struct iommu_table *tbl, bool release)
> +{
> +     /*
> +      * Page at 0 is marked as used in iommu_init_table,
> +      * so here we clear it when called with release=false...
> +      */
> +     if (!release && (tbl->it_offset == 0))
> +             clear_bit(0, tbl->it_map);

Isn't this clear_bit() redundant with the memset() of it_map below?

> +
> +     iommu_clear_tces(tbl, tbl->it_offset, tbl->it_size);
> +
> +     memset(tbl->it_map, 0, (tbl->it_size + 7) >> 3);
> +
> +     /*
> +      * ... or restore when release=true
> +      */
> +     if (release && (tbl->it_offset == 0))
> +             set_bit(0, tbl->it_map);

"release" to me implies something is freed, maybe this should just be
called "restore".

> +}
> +EXPORT_SYMBOL_GPL(iommu_reset_table);
> +
> +/*
> + * Returns the number of used IOMMU pages (4K) within
> + * the same system page (4K or 64K).
> + * bitmap_weight is not used as it does not support bigendian maps.
> + * "offset" is an IOMMU page number relative to DMA window start.
> + */
> +static int syspage_weight(unsigned long *map, unsigned long offset)
> +{
> +     int ret = 0, nbits = PAGE_SIZE/IOMMU_PAGE_SIZE;
> +
> +     /* Aligns TCE entry number to system page boundary */
> +     offset &= PAGE_MASK >> IOMMU_PAGE_SHIFT;
> +
> +     /* Count used 4K pages */
> +     while (nbits) {
> +             if (test_bit(offset, map))
> +                     ++ret;
> +             --nbits;
> +             ++offset;
> +     }
> +
> +     return ret;
> +}
> +
> +static void tce_flush(struct iommu_table *tbl)
> +{
> +     /* Flush/invalidate TLB caches if necessary */
> +     if (ppc_md.tce_flush)
> +             ppc_md.tce_flush(tbl);
> +
> +     /* Make sure updates are seen by hardware */
> +     mb();
> +}
> +
> +/*
> + * iommu_clear_tces clears tces and returned the number of system pages
> + * which it called put_page() on
> + */
> +static long clear_tces_nolock(struct iommu_table *tbl, unsigned long entry,
> +             unsigned long pages)
> +{
> +     int i, retpages = 0, clr;
> +     unsigned long oldtce, oldweight;
> +     struct page *page;
> +
> +     for (i = 0; i < pages; ++i) {

Any reason not to increment "entry" and avoid the 5 cases of "entry + i"
below?

> +             if (!test_bit(entry + i - tbl->it_offset, tbl->it_map))
> +                     continue;
> +
> +             oldtce = ppc_md.tce_get(tbl, entry + i);
> +             ppc_md.tce_free(tbl, entry + i, 1);
> +
> +             oldweight = syspage_weight(tbl->it_map,
> +                             entry + i - tbl->it_offset);
> +             clr = __test_and_clear_bit(entry + i - tbl->it_offset,
> +                             tbl->it_map);
> +
> +             if (WARN_ON(!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ))))
> +                     continue;
> +
> +             page = pfn_to_page(oldtce >> PAGE_SHIFT);
> +
> +             if (WARN_ON(!page))
> +                     continue;
> +
> +             if (oldtce & TCE_PCI_WRITE)
> +                     SetPageDirty(page);
> +
> +             put_page(page);
> +
> +             /* That was the last IOMMU page within the system page */
> +             if ((oldweight == 1) && clr)
> +                     ++retpages;
> +     }
> +
> +     return retpages;
> +}
> +
> +/*
> + * iommu_clear_tces clears tces and returned the number
> + * of released system pages
> + */
> +long iommu_clear_tces(struct iommu_table *tbl, unsigned long entry,
> +             unsigned long pages)
> +{
> +     int ret;
> +     struct iommu_pool *pool = get_pool(tbl, entry);
> +
> +     spin_lock(&(pool->lock));
> +     ret = clear_tces_nolock(tbl, entry, pages);
> +     tce_flush(tbl);
> +     spin_unlock(&(pool->lock));
> +
> +     return ret;
> +}
> +EXPORT_SYMBOL_GPL(iommu_clear_tces);
> +
> +static int put_tce(struct iommu_table *tbl, unsigned long entry,
> +             uint64_t tce, enum dma_data_direction direction)
> +{
> +     int ret;
> +     struct page *page = NULL;
> +     unsigned long kva, offset, oldweight;
> +
> +     /* Map new TCE */
> +     offset = (tce & IOMMU_PAGE_MASK) - (tce & PAGE_MASK);
> +     ret = get_user_pages_fast(tce & PAGE_MASK, 1,
> +                     direction != DMA_TO_DEVICE, &page);
> +     if (ret != 1) {
> +             pr_err("tce_vfio: get_user_pages_fast failed tce=%llx ioba=%lx ret=%d\n",
> +                             tce, entry << IOMMU_PAGE_SHIFT, ret);
> +             return -EFAULT;
> +     }
> +
> +     kva = (unsigned long) page_address(page);
> +     kva += offset;
> +
> +     /* tce_build receives a virtual address */
> +     ret = ppc_md.tce_build(tbl, entry, 1, kva, direction, NULL);
> +
> +     /* tce_build() only returns non-zero for transient errors */
> +     if (unlikely(ret)) {
> +             pr_err("tce_vfio: tce_put failed on tce=%llx ioba=%lx kva=%lx ret=%d\n",
> +                             tce, entry << IOMMU_PAGE_SHIFT, kva, ret);
> +             put_page(page);
> +             return -EIO;
> +     }
> +
> +     /* Calculate if new system page has been locked */
> +     oldweight = syspage_weight(tbl->it_map, entry - tbl->it_offset);
> +     __set_bit(entry - tbl->it_offset, tbl->it_map);
> +
> +     return (oldweight == 0) ? 1 : 0;

It seems like there's an optimization for syspage_weight since you only
care about two cases, i.e., syspage_weight_one and syspage_weight_zero.
The zero test is easy, just mask and return !! the value.  Testing
weight 1 means you don't have to find more than 2 bits set.  I won't
hold you to that optimization, just FYI.

> +}
> +
> +/*
> + * iommu_put_tces builds tces and returned the number of actually
> + * locked system pages
> + */
> +long iommu_put_tces(struct iommu_table *tbl, unsigned long entry,
> +             uint64_t tce, enum dma_data_direction direction,
> +             unsigned long pages)
> +{
> +     int i, ret = 0, retpages = 0;
> +     struct iommu_pool *pool = get_pool(tbl, entry);
> +
> +     BUILD_BUG_ON(PAGE_SIZE < IOMMU_PAGE_SIZE);
> +     BUG_ON(direction == DMA_NONE);
> +
> +     spin_lock(&(pool->lock));
> +
> +     /* Check if any is in use */
> +     for (i = 0; i < pages; ++i) {
> +             if (test_bit(entry + i - tbl->it_offset, tbl->it_map)) {

Again, seems like there's an optimization here that avoids individually
testing bits since you only care about zero or non-zero for a sequential
run.

> +                     spin_unlock(&(pool->lock));
> +                     return -EBUSY;
> +             }
> +     }
> +
> +     /* Put tces to the table */
> +     for (i = 0; (i < pages) && (ret >= 0); ++i, tce += IOMMU_PAGE_SIZE) {
> +             ret = put_tce(tbl, entry + i, tce, direction);
> +             if (ret == 1)
> +                     ++retpages;
> +     }
> +
> +     /*
> +      * If failed, release locked pages, otherwise return the number
> +      * of locked system pages
> +      */
> +     if (ret < 0)
> +             clear_tces_nolock(tbl, entry, i);
> +     else
> +             ret = retpages;
> +
> +     tce_flush(tbl);
> +     spin_unlock(&(pool->lock));
> +
> +     return ret;
> +}
> +EXPORT_SYMBOL_GPL(iommu_put_tces);
> +
> +#endif /* CONFIG_IOMMU_API */
> diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
> index 05205cf..1b970bf 100644
> --- a/arch/powerpc/platforms/powernv/pci.c
> +++ b/arch/powerpc/platforms/powernv/pci.c
> @@ -20,6 +20,7 @@
>  #include <linux/irq.h>
>  #include <linux/io.h>
>  #include <linux/msi.h>
> +#include <linux/iommu.h>
>  
>  #include <asm/sections.h>
>  #include <asm/io.h>
> @@ -613,3 +614,136 @@ void __init pnv_pci_init(void)
>       ppc_md.teardown_msi_irqs = pnv_teardown_msi_irqs;
>  #endif
>  }
> +
> +#ifdef CONFIG_IOMMU_API
> +/*
> + * IOMMU groups support required by VFIO
> + */
> +static int add_device(struct device *dev)
> +{
> +     struct iommu_table *tbl;
> +     int ret = 0;
> +
> +     if (WARN_ON(dev->iommu_group)) {
> +             pr_warn("tce_vfio: device %s is already in iommu group %d, skipping\n",
> +                             dev_name(dev),
> +                             iommu_group_id(dev->iommu_group));
> +             return -EBUSY;
> +     }
> +
> +     tbl = get_iommu_table_base(dev);
> +     if (!tbl) {
> +             pr_debug("tce_vfio: skipping device %s with no tbl\n",
> +                             dev_name(dev));
> +             return 0;
> +     }
> +
> +     pr_debug("tce_vfio: adding %s to iommu group %d\n",
> +                     dev_name(dev), iommu_group_id(tbl->it_group));
> +
> +     ret = iommu_group_add_device(tbl->it_group, dev);
> +     if (ret < 0)
> +             pr_err("tce_vfio: %s has not been added, ret=%d\n",
> +                             dev_name(dev), ret);
> +
> +     return ret;
> +}
> +
> +static void del_device(struct device *dev)
> +{
> +     iommu_group_remove_device(dev);
> +}
> +
> +static int iommu_bus_notifier(struct notifier_block *nb,
> +                           unsigned long action, void *data)
> +{
> +     struct device *dev = data;
> +
> +     switch (action) {
> +     case BUS_NOTIFY_ADD_DEVICE:
> +             return add_device(dev);
> +     case BUS_NOTIFY_DEL_DEVICE:
> +             del_device(dev);
> +             return 0;
> +     default:
> +             return 0;
> +     }
> +}
> +
> +static struct notifier_block tce_iommu_bus_nb = {
> +     .notifier_call = iommu_bus_notifier,
> +};
> +
> +static void group_release(void *iommu_data)
> +{
> +     struct iommu_table *tbl = iommu_data;
> +     tbl->it_group = NULL;
> +}
> +
> +static int __init tce_iommu_init(void)
> +{
> +     struct pci_dev *pdev = NULL;
> +     struct iommu_table *tbl;
> +     struct iommu_group *grp;
> +
> +     /* Allocate and initialize IOMMU groups */
> +     for_each_pci_dev(pdev) {
> +             tbl = get_iommu_table_base(&pdev->dev);
> +             if (!tbl)
> +                     continue;
> +
> +             /* Skip already initialized */
> +             if (tbl->it_group)
> +                     continue;
> +
> +             grp = iommu_group_alloc();
> +             if (IS_ERR(grp)) {
> +                     pr_info("tce_vfio: cannot create new IOMMU group, ret=%ld\n",
> +                                     PTR_ERR(grp));
> +                     return PTR_ERR(grp);
> +             }
> +             tbl->it_group = grp;
> +             iommu_group_set_iommudata(grp, tbl, group_release);

BTW, groups have a name property that shows up in sysfs that can be set
with iommu_group_set_name().  IIRC, this was a feature David requested
for PEs.  It'd be nice if it was used for PEs...  Thanks,

Alex

> +     }
> +
> +     bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);
> +
> +     /* Add PCI devices to VFIO groups */
> +     for_each_pci_dev(pdev)
> +             add_device(&pdev->dev);
> +
> +     return 0;
> +}
> +
> +static void __exit tce_iommu_cleanup(void)
> +{
> +     struct pci_dev *pdev = NULL;
> +     struct iommu_table *tbl;
> +     struct iommu_group *grp = NULL;
> +
> +     bus_unregister_notifier(&pci_bus_type, &tce_iommu_bus_nb);
> +
> +     /* Delete PCI devices from VFIO groups */
> +     for_each_pci_dev(pdev)
> +             del_device(&pdev->dev);
> +
> +     /* Release VFIO groups */
> +     for_each_pci_dev(pdev) {
> +             tbl = get_iommu_table_base(&pdev->dev);
> +             if (!tbl)
> +                     continue;
> +             grp = tbl->it_group;
> +
> +             /* Skip (already) uninitialized */
> +             if (!grp)
> +                     continue;
> +
> +             /* Do actual release, group_release() is expected to work */
> +             iommu_group_put(grp);
> +             BUG_ON(tbl->it_group);
> +     }
> +}
> +
> +module_init(tce_iommu_init);
> +module_exit(tce_iommu_cleanup);
> +#endif /* CONFIG_IOMMU_API */
> diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
> index 9f69b56..29d11dc 100644
> --- a/drivers/iommu/Kconfig
> +++ b/drivers/iommu/Kconfig
> @@ -187,4 +187,12 @@ config EXYNOS_IOMMU_DEBUG
>  
>         Say N unless you need kernel log message for IOMMU debugging
>  
> +config SPAPR_TCE_IOMMU
> +     bool "sPAPR TCE IOMMU Support"
> +     depends on PPC_POWERNV
> +     select IOMMU_API
> +     help
> +       Enables bits of IOMMU API required by VFIO. The iommu_ops is
> +       still not implemented.
> +
>  endif # IOMMU_SUPPORT



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to