On Fri, 2018-06-29 at 17:34 +1000, Russell Currey wrote:
> DMA pseudo-bypass is a new set of DMA operations that solve some issues for
> devices that want to address more than 32 bits but can't address the 59
> bits required to enable direct DMA.

One thing you may need to add (I didn't see it with a cursory glance
but maybe it's there) is some form of handling of allocations or
mapping requests that span a TCE boundary.

For allocations, since they come in page orders, you only have to check
whether the requested size is bigger than a TCE page.

For mappings, you need to check individual sglist entries.

At this stage, if you hit that, all you can do is fail, with maybe a
rate-limited printk. But that's better than whatever corruption or
misbehaviour will happen if you don't catch them. I don't expect this
to happen much, if at all, with 1G pages, as most "sg" mappings are
probably in units of pages, but I still want to catch it if it does
happen.
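
To make that concrete, here's a rough, untested sketch of the kind of
check I'm thinking of.  The helper name pnv_dma_range_crosses_tce() is
made up, and in map_sg you'd have to dig out the phb the same way
dma_pseudo_bypass_get_address() already does:

static bool pnv_dma_range_crosses_tce(phys_addr_t addr, size_t size,
				      unsigned int max_tce_order)
{
	/* True if [addr, addr + size) doesn't fit inside one TCE page */
	u64 first = addr >> max_tce_order;
	u64 last = (addr + size - 1) >> max_tce_order;

	return first != last;
}

	/* In dma_pseudo_bypass_map_sg(), before mapping each entry: */
	for_each_sg(sgl, sg, nents, i) {
		if (pnv_dma_range_crosses_tce(sg_phys(sg), sg->length,
					      phb->ioda.max_tce_order)) {
			dev_err_ratelimited(dev,
				"sg entry crosses a TCE boundary\n");
			return 0;	/* map_sg reports failure as 0 */
		}

		/* existing per-entry mapping code goes here */
	}

On the alloc_coherent side the equivalent is just rejecting any size
bigger than 1ULL << max_tce_order, since a page-order allocation that
fits in a single TCE page can't cross a boundary.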

> The previous implementation for POWER8/PHB3 worked around this by
> configuring a bypass from the default 32-bit address space into 64-bit
> address space.  This approach does not work for POWER9/PHB4 because
> regions of memory are discontiguous and many devices will be unable to
> address memory beyond the first node.
> 
> Instead, implement a new set of DMA operations that allocate TCEs as DMA
> mappings are requested so that all memory is addressable even when a
> one-to-one mapping between real addresses and DMA addresses isn't
> possible.  These TCEs are the maximum size available on the platform,
> which is 256M on PHB3 and 1G on PHB4.
> 
> Devices can now map any region of memory up to the maximum amount they can
> address according to the DMA mask set, in chunks of the largest available
> TCE size.
> 
> This implementation replaces the need for the existing PHB3 solution and
> should be compatible with future PHB versions.
> 
> It is, however, rather naive.  There is no unmapping, and as a result
> devices can eventually run out of space if they address their entire
> DMA mask worth of TCEs.  An implementation with unmap() will come in
> the future (it requires a much more complex approach), but this is a
> good start given the drastic performance improvement.
> 
> Signed-off-by: Russell Currey <rus...@russell.cc>
> ---
>  arch/powerpc/include/asm/dma-mapping.h    |   1 +
>  arch/powerpc/platforms/powernv/Makefile   |   2 +-
>  arch/powerpc/platforms/powernv/pci-dma.c  | 243 ++++++++++++++++++++++
>  arch/powerpc/platforms/powernv/pci-ioda.c |  82 +++-----
>  arch/powerpc/platforms/powernv/pci.h      |   7 +
>  5 files changed, 281 insertions(+), 54 deletions(-)
>  create mode 100644 arch/powerpc/platforms/powernv/pci-dma.c
> 
> diff --git a/arch/powerpc/include/asm/dma-mapping.h b/arch/powerpc/include/asm/dma-mapping.h
> index 8fa394520af6..354f435160f3 100644
> --- a/arch/powerpc/include/asm/dma-mapping.h
> +++ b/arch/powerpc/include/asm/dma-mapping.h
> @@ -74,6 +74,7 @@ static inline unsigned long device_to_mask(struct device *dev)
>  extern struct dma_map_ops dma_iommu_ops;
>  #endif
>  extern const struct dma_map_ops dma_nommu_ops;
> +extern const struct dma_map_ops dma_pseudo_bypass_ops;
>  
>  static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
>  {
> diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile
> index 703a350a7f4e..2467bdab3c13 100644
> --- a/arch/powerpc/platforms/powernv/Makefile
> +++ b/arch/powerpc/platforms/powernv/Makefile
> @@ -6,7 +6,7 @@ obj-y                 += opal-msglog.o opal-hmi.o opal-power.o opal-irqchip.o
>  obj-y                        += opal-kmsg.o opal-powercap.o opal-psr.o opal-sensor-groups.o
>  
>  obj-$(CONFIG_SMP)    += smp.o subcore.o subcore-asm.o
> -obj-$(CONFIG_PCI)    += pci.o pci-ioda.o npu-dma.o
> +obj-$(CONFIG_PCI)    += pci.o pci-ioda.o npu-dma.o pci-dma.o
>  obj-$(CONFIG_CXL_BASE)       += pci-cxl.o
>  obj-$(CONFIG_EEH)    += eeh-powernv.o
>  obj-$(CONFIG_PPC_SCOM)       += opal-xscom.o
> diff --git a/arch/powerpc/platforms/powernv/pci-dma.c b/arch/powerpc/platforms/powernv/pci-dma.c
> new file mode 100644
> index 000000000000..79382627c7be
> --- /dev/null
> +++ b/arch/powerpc/platforms/powernv/pci-dma.c
> @@ -0,0 +1,243 @@
> +/*
> + * DMA operations supporting pseudo-bypass for PHB3+
> + *
> + * Author: Russell Currey <rus...@russell.cc>
> + *
> + * Copyright 2018 IBM Corporation.
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms of the GNU General Public License as published by the
> + * Free Software Foundation; either version 2 of the License, or (at your
> + * option) any later version.
> + */
> +
> +#include <linux/export.h>
> +#include <linux/memblock.h>
> +#include <linux/device.h>
> +#include <linux/dma-mapping.h>
> +#include <linux/hash.h>
> +
> +#include <asm/pci-bridge.h>
> +#include <asm/ppc-pci.h>
> +#include <asm/pnv-pci.h>
> +#include <asm/tce.h>
> +
> +#include "pci.h"
> +
> +/*
> + * This is a naive implementation that directly operates on TCEs, allocating
> + * on demand.  There is no locking or refcounts since no TCEs are ever removed
> + * and unmap does nothing.
> + */
> +static dma_addr_t dma_pseudo_bypass_get_address(struct device *dev,
> +                                         phys_addr_t addr)
> +{
> +     struct pci_dev *pdev = container_of(dev, struct pci_dev, dev);
> +     struct pci_controller *hose = pci_bus_to_host(pdev->bus);
> +     struct pnv_phb *phb = hose->private_data;
> +     struct pnv_ioda_pe *pe;
> +     u64 i, tce, ret, offset;
> +     __be64 entry;
> +
> +     offset = addr & ((1 << phb->ioda.max_tce_order) - 1);
> +
> +     pe = &phb->ioda.pe_array[pci_get_pdn(pdev)->pe_number];
> +
> +     /* look through the tracking table for a free entry */
> +     for (i = 0; i < pe->tce_count; i++) {
> +             /* skip between 2GB and 4GB */
> +             if ((i << phb->ioda.max_tce_order) >= 0x80000000 &&
> +                 (i << phb->ioda.max_tce_order) < 0x100000000)
> +                     continue;
> +
> +             tce = be64_to_cpu(pe->tces[i]);
> +
> +             /* if the TCE is already valid (read + write) */
> +             if ((tce & 3) == 3) {
> +                     /* check if we're already allocated, if not move on */
> +                     if (tce >> phb->ioda.max_tce_order ==
> +                         addr >> phb->ioda.max_tce_order) {
> +                             /* wait for the lock bit to clear */
> +                             while (be64_to_cpu(pe->tces[i]) & 4)
> +                                     cpu_relax();
> +
> +                             return (i << phb->ioda.max_tce_order) | offset;
> +                     }
> +
> +                     continue;
> +             }
> +
> +             /*
> +              * The TCE isn't being used, so let's try and allocate it.
> +              * Bits 0 and 1 are read/write, and we use bit 2 as a "lock"
> +              * bit.  This is to prevent any race where the value is set in
> +              * the TCE table but the invalidate/mb() hasn't finished yet.
> +              */
> +             entry = cpu_to_be64((addr - offset) | 7);
> +             ret = cmpxchg(&pe->tces[i], tce, entry);
> +             if (ret != tce) {
> +                     /* conflict, start looking again just in case */
> +                     i--;
> +                     continue;
> +             }
> +             pnv_pci_phb3_tce_invalidate(pe, 0, 0, addr - offset, 1);
> +             mb();
> +             /* clear the lock bit now that we know it's active */
> +             ret = cmpxchg(&pe->tces[i], entry, cpu_to_be64((addr - offset) | 3));
> +             if (ret != entry) {
> +                     /* conflict, start looking again just in case */
> +                     i--;
> +                     continue;
> +             }
> +
> +             return (i << phb->ioda.max_tce_order) | offset;
> +     }
> +     /* If we get here, the table must be full, so error out. */
> +     return -1ULL;
> +}
> +
> +/*
> + * For now, don't actually do anything on unmap.
> + */
> +static void dma_pseudo_bypass_unmap_address(struct device *dev, dma_addr_t dma_addr)
> +{
> +}
> +
> +static int dma_pseudo_bypass_dma_supported(struct device *dev, u64 mask)
> +{
> +     /*
> +      * Normally dma_supported() checks if the mask is capable of addressing
> +      * all of memory.  Since we map physical memory in chunks that the
> +      * device can address, the device will be able to address whatever it
> +      * wants - just not all at once.
> +      */
> +     return 1;
> +}
> +
> +static void *dma_pseudo_bypass_alloc_coherent(struct device *dev,
> +                                       size_t size,
> +                                       dma_addr_t *dma_handle,
> +                                       gfp_t flag,
> +                                       unsigned long attrs)
> +{
> +     void *ret;
> +     struct page *page;
> +     int node = dev_to_node(dev);
> +
> +     /* ignore region specifiers */
> +     flag &= ~(__GFP_HIGHMEM);
> +
> +     page = alloc_pages_node(node, flag, get_order(size));
> +     if (page == NULL)
> +             return NULL;
> +     ret = page_address(page);
> +     memset(ret, 0, size);
> +     *dma_handle = dma_pseudo_bypass_get_address(dev, __pa(ret));
> +
> +     return ret;
> +}
> +
> +static void dma_pseudo_bypass_free_coherent(struct device *dev,
> +                                      size_t size,
> +                                      void *vaddr,
> +                                      dma_addr_t dma_handle,
> +                                      unsigned long attrs)
> +{
> +     free_pages((unsigned long)vaddr, get_order(size));
> +}
> +
> +static int dma_pseudo_bypass_mmap_coherent(struct device *dev,
> +                                    struct vm_area_struct *vma,
> +                                    void *cpu_addr,
> +                                    dma_addr_t handle,
> +                                    size_t size,
> +                                    unsigned long attrs)
> +{
> +     unsigned long pfn = page_to_pfn(virt_to_page(cpu_addr));
> +
> +     return remap_pfn_range(vma, vma->vm_start,
> +                            pfn + vma->vm_pgoff,
> +                            vma->vm_end - vma->vm_start,
> +                            vma->vm_page_prot);
> +}
> +
> +static inline dma_addr_t dma_pseudo_bypass_map_page(struct device *dev,
> +                                             struct page *page,
> +                                             unsigned long offset,
> +                                             size_t size,
> +                                             enum dma_data_direction dir,
> +                                             unsigned long attrs)
> +{
> +     BUG_ON(dir == DMA_NONE);
> +
> +     return dma_pseudo_bypass_get_address(dev, page_to_phys(page) + offset);
> +}
> +
> +static inline void dma_pseudo_bypass_unmap_page(struct device *dev,
> +                                      dma_addr_t dma_address,
> +                                      size_t size,
> +                                      enum dma_data_direction direction,
> +                                      unsigned long attrs)
> +{
> +     dma_pseudo_bypass_unmap_address(dev, dma_address);
> +}
> +
> +
> +static int dma_pseudo_bypass_map_sg(struct device *dev, struct scatterlist *sgl,
> +                          int nents, enum dma_data_direction direction,
> +                          unsigned long attrs)
> +{
> +     struct scatterlist *sg;
> +     int i;
> +
> +
> +     for_each_sg(sgl, sg, nents, i) {
> +             sg->dma_address = dma_pseudo_bypass_get_address(dev, sg_phys(sg));
> +             sg->dma_length = sg->length;
> +
> +             __dma_sync_page(sg_page(sg), sg->offset, sg->length, direction);
> +     }
> +
> +     return nents;
> +}
> +
> +static void dma_pseudo_bypass_unmap_sg(struct device *dev, struct scatterlist *sgl,
> +                             int nents, enum dma_data_direction direction,
> +                             unsigned long attrs)
> +{
> +     struct scatterlist *sg;
> +     int i;
> +
> +     for_each_sg(sgl, sg, nents, i) {
> +             dma_pseudo_bypass_unmap_address(dev, sg->dma_address);
> +     }
> +}
> +
> +static u64 dma_pseudo_bypass_get_required_mask(struct device *dev)
> +{
> +     /*
> +      * there's no limitation on our end, the driver should just call
> +      * set_mask() with as many bits as the device can address.
> +      */
> +     return -1ULL;
> +}
> +
> +static int dma_pseudo_bypass_mapping_error(struct device *dev, dma_addr_t dma_addr)
> +{
> +     return dma_addr == -1ULL;
> +}
> +
> +
> +const struct dma_map_ops dma_pseudo_bypass_ops = {
> +     .alloc                          = dma_pseudo_bypass_alloc_coherent,
> +     .free                           = dma_pseudo_bypass_free_coherent,
> +     .mmap                           = dma_pseudo_bypass_mmap_coherent,
> +     .map_sg                         = dma_pseudo_bypass_map_sg,
> +     .unmap_sg                       = dma_pseudo_bypass_unmap_sg,
> +     .dma_supported                  = dma_pseudo_bypass_dma_supported,
> +     .map_page                       = dma_pseudo_bypass_map_page,
> +     .unmap_page                     = dma_pseudo_bypass_unmap_page,
> +     .get_required_mask              = dma_pseudo_bypass_get_required_mask,
> +     .mapping_error                  = dma_pseudo_bypass_mapping_error,
> +};
> +EXPORT_SYMBOL(dma_pseudo_bypass_ops);
> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
> index 17c590087279..d2ca214610fd 100644
> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> @@ -1088,6 +1088,7 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev)
>       pe->pbus = NULL;
>       pe->mve_number = -1;
>       pe->rid = dev->bus->number << 8 | pdn->devfn;
> +     pe->tces = NULL;
>  
>       pe_info(pe, "Associated device to PE\n");
>  
> @@ -1569,6 +1570,7 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
>               pe->mve_number = -1;
>               pe->rid = (pci_iov_virtfn_bus(pdev, vf_index) << 8) |
>                          pci_iov_virtfn_devfn(pdev, vf_index);
> +             pe->tces = NULL;
>  
>               pe_info(pe, "VF %04d:%02d:%02d.%d associated with PE#%x\n",
>                       hose->global_number, pdev->bus->number,
> @@ -1774,43 +1776,22 @@ static bool pnv_pci_ioda_pe_single_vendor(struct pnv_ioda_pe *pe)
>       return true;
>  }
>  
> -/*
> - * Reconfigure TVE#0 to be usable as 64-bit DMA space.
> - *
> - * The first 4GB of virtual memory for a PE is reserved for 32-bit accesses.
> - * Devices can only access more than that if bit 59 of the PCI address is set
> - * by hardware, which indicates TVE#1 should be used instead of TVE#0.
> - * Many PCI devices are not capable of addressing that many bits, and as a
> - * result are limited to the 4GB of virtual memory made available to 32-bit
> - * devices in TVE#0.
> - *
> - * In order to work around this, reconfigure TVE#0 to be suitable for 64-bit
> - * devices by configuring the virtual memory past the first 4GB inaccessible
> - * by 64-bit DMAs.  This should only be used by devices that want more than
> - * 4GB, and only on PEs that have no 32-bit devices.
> - *
> - * Currently this will only work on PHB3 (POWER8).
> - */
> -static int pnv_pci_ioda_dma_64bit_bypass(struct pnv_ioda_pe *pe)
> +static int pnv_pci_pseudo_bypass_setup(struct pnv_ioda_pe *pe)
>  {
> -     u64 window_size, table_size, tce_count, addr;
> +     u64 tce_count, table_size, window_size;
> +     struct pnv_phb *p = pe->phb;
>       struct page *table_pages;
> -     u64 tce_order = 28; /* 256MB TCEs */
>       __be64 *tces;
> -     s64 rc;
> +     int rc = -ENOMEM;
>  
> -     /*
> -      * Window size needs to be a power of two, but needs to account for
> -      * shifting memory by the 4GB offset required to skip 32bit space.
> -      */
> -     window_size = roundup_pow_of_two(memory_hotplug_max() + (1ULL << 32));
> -     tce_count = window_size >> tce_order;
> +     window_size = roundup_pow_of_two(memory_hotplug_max());
> +     tce_count = window_size >> p->ioda.max_tce_order;
>       table_size = tce_count << 3;
>  
>       if (table_size < PAGE_SIZE)
>               table_size = PAGE_SIZE;
>  
> -     table_pages = alloc_pages_node(pe->phb->hose->node, GFP_KERNEL,
> +     table_pages = alloc_pages_node(p->hose->node, GFP_KERNEL,
>                                      get_order(table_size));
>       if (!table_pages)
>               goto err;
> @@ -1821,26 +1802,23 @@ static int pnv_pci_ioda_dma_64bit_bypass(struct pnv_ioda_pe *pe)
>  
>       memset(tces, 0, table_size);
>  
> -     for (addr = 0; addr < memory_hotplug_max(); addr += (1 << tce_order)) {
> -             tces[(addr + (1ULL << 32)) >> tce_order] =
> -                     cpu_to_be64(addr | TCE_PCI_READ | TCE_PCI_WRITE);
> -     }
> +     pe->tces = tces;
> +     pe->tce_count = tce_count;
>  
>       rc = opal_pci_map_pe_dma_window(pe->phb->opal_id,
>                                       pe->pe_number,
> -                                     /* reconfigure window 0 */
>                                       (pe->pe_number << 1) + 0,
>                                       1,
>                                       __pa(tces),
>                                       table_size,
> -                                     1 << tce_order);
> +                                     1 << p->ioda.max_tce_order);
>       if (rc == OPAL_SUCCESS) {
> -             pe_info(pe, "Using 64-bit DMA iommu bypass (through TVE#0)\n");
> +             pe_info(pe, "TCE tables configured for pseudo-bypass\n");
>               return 0;
>       }
>  err:
> -     pe_err(pe, "Error configuring 64-bit DMA bypass\n");
> -     return -EIO;
> +     pe_err(pe, "error configuring pseudo-bypass\n");
> +     return rc;
>  }
>  
>  static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask)
> @@ -1851,7 +1829,6 @@ static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask)
>       struct pnv_ioda_pe *pe;
>       uint64_t top;
>       bool bypass = false;
> -     s64 rc;
>  
>       if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
>               return -ENODEV;
> @@ -1868,21 +1845,15 @@ static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask)
>       } else {
>               /*
>                * If the device can't set the TCE bypass bit but still wants
> -              * to access 4GB or more, on PHB3 we can reconfigure TVE#0 to
> -              * bypass the 32-bit region and be usable for 64-bit DMAs.
> -              * The device needs to be able to address all of this space.
> +              * to access 4GB or more, we need to use a different set of DMA
> +              * operations with an indirect mapping.
>                */
>               if (dma_mask >> 32 &&
> -                 dma_mask > (memory_hotplug_max() + (1ULL << 32)) &&
> -                 pnv_pci_ioda_pe_single_vendor(pe) &&
> -                 phb->model == PNV_PHB_MODEL_PHB3) {
> -                     /* Configure the bypass mode */
> -                     rc = pnv_pci_ioda_dma_64bit_bypass(pe);
> -                     if (rc)
> -                             return rc;
> -                     /* 4GB offset bypasses 32-bit space */
> -                     set_dma_offset(&pdev->dev, (1ULL << 32));
> -                     set_dma_ops(&pdev->dev, &dma_nommu_ops);
> +                 phb->model != PNV_PHB_MODEL_P7IOC &&
> +                 pnv_pci_ioda_pe_single_vendor(pe)) {
> +                     if (!pe->tces)
> +                             pnv_pci_pseudo_bypass_setup(pe);
> +                     set_dma_ops(&pdev->dev, &dma_pseudo_bypass_ops);
>               } else if (dma_mask >> 32 && dma_mask != DMA_BIT_MASK(64)) {
>                       /*
>                        * Fail the request if a DMA mask between 32 and 64 bits
> @@ -2071,7 +2042,7 @@ static inline void pnv_pci_phb3_tce_invalidate_pe(struct pnv_ioda_pe *pe)
>       __raw_writeq_be(val, invalidate);
>  }
>  
> -static void pnv_pci_phb3_tce_invalidate(struct pnv_ioda_pe *pe, bool rm,
> +void pnv_pci_phb3_tce_invalidate(struct pnv_ioda_pe *pe, bool rm,
>                                       unsigned shift, unsigned long index,
>                                       unsigned long npages)
>  {
> @@ -2611,10 +2582,15 @@ static unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift,
>  static void pnv_ioda2_take_ownership(struct iommu_table_group *table_group)
>  {
>       struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
> -                                             table_group);
> +                                           table_group);
> +
>       /* Store @tbl as pnv_pci_ioda2_unset_window() resets it */
>       struct iommu_table *tbl = pe->table_group.tables[0];
>  
> +     if (pe->tces)
> +             free_pages((unsigned long)pe->tces,
> +                        get_order(pe->tce_count << 3));
> +
>       pnv_pci_ioda2_set_bypass(pe, false);
>       pnv_pci_ioda2_unset_window(&pe->table_group, 0);
>       if (pe->pbus)
> diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
> index c9952def5e93..56846ddc76a2 100644
> --- a/arch/powerpc/platforms/powernv/pci.h
> +++ b/arch/powerpc/platforms/powernv/pci.h
> @@ -70,6 +70,10 @@ struct pnv_ioda_pe {
>       bool                    tce_bypass_enabled;
>       uint64_t                tce_bypass_base;
>  
> +     /* TCE tables for DMA pseudo-bypass */
> +     __be64                  *tces;
> +     u64                     tce_count;
> +
>       /* MSIs. MVE index is identical for for 32 and 64 bit MSI
>        * and -1 if not supported. (It's actually identical to the
>        * PE number)
> @@ -211,6 +215,9 @@ extern void pnv_tce_free(struct iommu_table *tbl, long index, long npages);
>  extern int pnv_tce_xchg(struct iommu_table *tbl, long index,
>               unsigned long *hpa, enum dma_data_direction *direction);
>  extern unsigned long pnv_tce_get(struct iommu_table *tbl, long index);
> +extern void pnv_pci_phb3_tce_invalidate(struct pnv_ioda_pe *pe, bool rm,
> +                                     unsigned shift, unsigned long index,
> +                                     unsigned long npages);
>  
>  void pnv_pci_dump_phb_diag_data(struct pci_controller *hose,
>                               unsigned char *log_buff);
