IODA2 systems (POWER8/9) allow DMA windows at 2 fixed locations - 0 and
0x800.0000.0000.0000==1<<59, stored in TVT as TVE0/1. PHB4 on POWER9 has
an additional PHB mode to allow mapping both windows at 0 and selecting
one based on IOBA address - accesses below 4GB go via TVE0 and above
4GB - via TVE1; it is called TVT Select 'GTE4GB' Option in PHB4.

This adds a new value for the "iommu" kernel parameter - "iommu_bypass".
When enabled this programs PHB to enable the TVE1_4GB mode and uses TVE1
to map the entire host RAM using the largest available IOMMU pages
- 1GB for PHB and 256MB for NPU-PHB effectively.

This creates huge DMA windows at 4GB for VFIO; new iommu_table::it_dmaoff
flag tells the VFIO IOMMU driver about the bus offset it should be using.
Since the actual second table still covers the first 4GB of the bus space
iommu_table::it_offset is set to 0. The next patch will explicitly
advertise new offset to the userspace.

When TVE1_4GB enabled, this creates a small (2GB typically) 32 bit window
as there is no need to cover as much of lower DMA space - the 4GB+ window
does it better anyway.

This keeps the existing behavior by default as the TVE1_4GB flag is set
per PHB by device assignment is done on PE basis and managing both modes
dynamically might get nasty.

This relies on skiboot supporting the new OPAL_PHB_SET_OPTION OPAL call.

Suggested-by: Alistair Popple <alist...@popple.id.au>
Signed-off-by: Alexey Kardashevskiy <a...@ozlabs.ru>
---
 arch/powerpc/include/asm/iommu.h           |   1 +
 arch/powerpc/include/asm/opal-api.h        |   9 +-
 arch/powerpc/include/asm/opal.h            |   2 +
 arch/powerpc/platforms/powernv/pci.h       |   1 +
 arch/powerpc/platforms/powernv/opal-call.c |   2 +
 arch/powerpc/platforms/powernv/pci-ioda.c  | 187 ++++++++++++++++++---
 6 files changed, 173 insertions(+), 29 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 479439ef003e..e0ca82d31253 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -97,6 +97,7 @@ struct iommu_table {
        unsigned long  it_level_size;
        unsigned long  it_allocated_size;
        unsigned long  it_offset;    /* Offset into global table */
+       unsigned long  it_dmaoff;
        unsigned long  it_base;      /* mapped address of tce table */
        unsigned long  it_index;     /* which iommu table this is */
        unsigned long  it_type;      /* type: PCI or Virtual Bus */
diff --git a/arch/powerpc/include/asm/opal-api.h 
b/arch/powerpc/include/asm/opal-api.h
index c1f25a760eb1..7873754f5ea6 100644
--- a/arch/powerpc/include/asm/opal-api.h
+++ b/arch/powerpc/include/asm/opal-api.h
@@ -214,7 +214,9 @@
 #define OPAL_SECVAR_GET                                176
 #define OPAL_SECVAR_GET_NEXT                   177
 #define OPAL_SECVAR_ENQUEUE_UPDATE             178
-#define OPAL_LAST                              178
+#define OPAL_PHB_SET_OPTION                    179
+#define OPAL_PHB_GET_OPTION                    180
+#define OPAL_LAST                              180
 
 #define QUIESCE_HOLD                   1 /* Spin all calls at entry */
 #define QUIESCE_REJECT                 2 /* Fail all calls with OPAL_BUSY */
@@ -437,6 +439,11 @@ enum OpalSlotLedState {
        OPAL_SLOT_LED_STATE_ON = 1      /* LED is ON */
 };
 
+enum OpalPhbOption {
+       OPAL_PHB_OPTION_TVE1_4GB = 0x1,
+       OPAL_PHB_OPTION_MMIO_EEH_DISABLE = 0x2,
+};
+
 /*
  * Address cycle types for LPC accesses. These also correspond
  * to the content of the first cell of the "reg" property for
diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index 9986ac34b8e2..89b712288cdd 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -142,6 +142,8 @@ int64_t opal_pci_map_pe_dma_window(uint64_t phb_id, 
uint16_t pe_number, uint16_t
 int64_t opal_pci_map_pe_dma_window_real(uint64_t phb_id, uint16_t pe_number,
                                        uint16_t dma_window_number, uint64_t 
pci_start_addr,
                                        uint64_t pci_mem_size);
+int64_t opal_phb_set_option(uint64_t phb_id, uint64_t opt, uint64_t setting);
+int64_t opal_phb_get_option(uint64_t phb_id, uint64_t opt, uint64_t *setting);
 int64_t opal_pci_reset(uint64_t id, uint8_t reset_scope, uint8_t assert_state);
 
 int64_t opal_pci_get_hub_diag_data(uint64_t hub_id, void *diag_buffer,
diff --git a/arch/powerpc/platforms/powernv/pci.h 
b/arch/powerpc/platforms/powernv/pci.h
index a808dd396522..ce00278185b0 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -100,6 +100,7 @@ struct pnv_phb {
        int                     has_dbgfs;
        struct dentry           *dbgfs;
 #endif
+       struct iommu_table      *bypass_tbl; /* PNV_IOMMU_TCE_BYPASS only */
 
        unsigned int            msi_base;
        unsigned int            msi32_support;
diff --git a/arch/powerpc/platforms/powernv/opal-call.c 
b/arch/powerpc/platforms/powernv/opal-call.c
index 5cd0f52d258f..3130d5a41570 100644
--- a/arch/powerpc/platforms/powernv/opal-call.c
+++ b/arch/powerpc/platforms/powernv/opal-call.c
@@ -293,3 +293,5 @@ OPAL_CALL(opal_mpipl_query_tag,                     
OPAL_MPIPL_QUERY_TAG);
 OPAL_CALL(opal_secvar_get,                     OPAL_SECVAR_GET);
 OPAL_CALL(opal_secvar_get_next,                        OPAL_SECVAR_GET_NEXT);
 OPAL_CALL(opal_secvar_enqueue_update,          OPAL_SECVAR_ENQUEUE_UPDATE);
+OPAL_CALL(opal_phb_set_option,                 OPAL_PHB_SET_OPTION);
+OPAL_CALL(opal_phb_get_option,                 OPAL_PHB_GET_OPTION);
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index f5f1b4e25530..74b4ac8cb71d 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -51,6 +51,10 @@ static const char * const pnv_phb_names[] = { "IODA1", 
"IODA2", "NPU_NVLINK",
                                              "NPU_OCAPI" };
 
 static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable);
+static unsigned long pnv_ioda_parse_tce_sizes(struct pnv_phb *phb);
+static long pnv_pci_ioda2_create_table(int nid, int num, __u64 bus_offset,
+               __u32 page_shift, __u64 window_size, __u32 levels,
+               bool alloc_userspace_copy, struct iommu_table **ptbl);
 
 void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
                            const char *fmt, ...)
@@ -83,7 +87,14 @@ void pe_level_printk(const struct pnv_ioda_pe *pe, const 
char *level,
        va_end(args);
 }
 
-static bool pnv_iommu_bypass_disabled __read_mostly;
+enum pnv_iommu_bypass_mode {
+       PNV_IOMMU_NO_TRANSLATE,
+       PNV_IOMMU_BYPASS_DISABLED,
+       PNV_IOMMU_TCE_BYPASS
+};
+
+static enum pnv_iommu_bypass_mode pnv_iommu_bypass_mode __read_mostly =
+               PNV_IOMMU_NO_TRANSLATE;
 static bool pci_reset_phbs __read_mostly;
 
 static int __init iommu_setup(char *str)
@@ -93,9 +104,13 @@ static int __init iommu_setup(char *str)
 
        while (*str) {
                if (!strncmp(str, "nobypass", 8)) {
-                       pnv_iommu_bypass_disabled = true;
+                       pnv_iommu_bypass_mode = PNV_IOMMU_BYPASS_DISABLED;
                        pr_info("PowerNV: IOMMU bypass window disabled.\n");
                        break;
+               } else if (!strncmp(str, "iommu_bypass", 12)) {
+                       pnv_iommu_bypass_mode = PNV_IOMMU_TCE_BYPASS;
+                       pr_info("PowerNV: IOMMU TCE bypass window selected.\n");
+                       break;
                }
                str += strcspn(str, ",");
                if (*str == ',')
@@ -2351,28 +2366,125 @@ static long pnv_pci_ioda2_set_window(struct 
iommu_table_group *table_group,
        return 0;
 }
 
+static long pnv_pci_ioda2_set_bypass_iommu(struct pnv_ioda_pe *pe,
+               unsigned long offset)
+{
+       struct pnv_phb *phb = pe->phb;
+       long rc;
+       struct memblock_region *r;
+       unsigned long i, pgsizes, tceoff;
+
+
+       pgsizes = pnv_ioda_parse_tce_sizes(phb);
+       /* Filter sizes to have round number of TCEs to cover 0..offset */
+       pgsizes &= offset | (offset - 1);
+       if (!pgsizes)
+               return -1;
+
+       if (!phb->bypass_tbl) {
+               struct iommu_table *tbl = NULL;
+
+               rc = pnv_pci_ioda2_create_table(phb->hose->node,
+                               1 /* window number */,
+                               0 /* bus offset */,
+                               __fls(pgsizes),
+                               roundup_pow_of_two(memory_hotplug_max() +
+                                       offset),
+                               2 /* levels */,
+                               false /* userspace cache */,
+                               &tbl);
+               if (rc)
+                       return -1;
+
+               tceoff = offset >> tbl->it_page_shift;
+
+               for_each_memblock(memory, r) {
+                       phys_addr_t start, end;
+
+                       start = r->base >> tbl->it_page_shift;
+                       end = start + (r->size >> tbl->it_page_shift);
+
+                       for (i = start; i < end; ++i)
+                               tbl->it_ops->set(tbl, i + tceoff, 1,
+                                               i << tbl->it_page_shift,
+                                               DMA_BIDIRECTIONAL, 0);
+               }
+               phb->bypass_tbl = tbl;
+               pe_err(pe, "Created 64-bit bypass TCE table at 0x%lx\n",
+                       phb->bypass_tbl->it_base);
+       } else {
+               iommu_tce_table_get(phb->bypass_tbl);
+       }
+
+       rc = pnv_pci_ioda2_set_window(&pe->table_group, 1, phb->bypass_tbl);
+       if (rc) {
+               iommu_tce_table_put(phb->bypass_tbl);
+               return -1;
+       }
+
+       pe->tce_bypass_enabled = true;
+
+       return 0;
+}
+
 static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable)
 {
+       struct pnv_phb *phb = pe->phb;
        uint16_t window_id = (pe->pe_number << 1 ) + 1;
        int64_t rc;
+       phys_addr_t top;
 
-       pe_info(pe, "%sabling 64-bit DMA bypass\n", enable ? "En" : "Dis");
-       if (enable) {
-               phys_addr_t top = memblock_end_of_DRAM();
+       if (!enable) {
+               pe_info(pe, "Disabling 64-bit bypass\n");
+               /*
+                * Whether it was bypass or iommu bypass, this will write
+                * a zero to TVE and disable the window;
+                * opal_pci_map_pe_dma_window() does the same thing when
+                * tce_table_size==0.
+                */
+               rc = opal_pci_map_pe_dma_window_real(phb->opal_id,
+                               pe->pe_number, window_id, 0, 0);
+               if (rc)
+                       pe_err(pe, "OPAL error %lld configuring bypass 
window\n",
+                               rc);
 
-               top = roundup_pow_of_two(top);
-               rc = opal_pci_map_pe_dma_window_real(pe->phb->opal_id,
-                               pe->pe_number, window_id,
-                               pe->table_group.tce64_start, top);
-       } else {
-               rc = opal_pci_map_pe_dma_window_real(pe->phb->opal_id,
-                               pe->pe_number, window_id,
-                               pe->table_group.tce64_start, 0);
+               pe->tce_bypass_enabled = false;
+               return;
+       }
+
+       if (pnv_iommu_bypass_mode == PNV_IOMMU_TCE_BYPASS) {
+               if (!opal_phb_set_option(phb->opal_id,
+                                       OPAL_PHB_OPTION_TVE1_4GB, 1)) {
+                       pe->table_group.tce64_start = SZ_4G;
+                       if (!pnv_pci_ioda2_set_bypass_iommu(pe,
+                                               pe->table_group.tce64_start)) {
+                               pe->tce_bypass_enabled = true;
+                               pe_info(pe, "Enabled 64-bit IOMMU bypass at 
%llx\n",
+                                               pe->table_group.tce64_start);
+                               return;
+                       }
+                       pe_err(pe, "Enabled TVE1_4GB but failed to configure 
TCE table");
+               } else {
+                       /* IOMMU bypass failed, fallback to direct bypass */
+                       pnv_iommu_bypass_mode = PNV_IOMMU_NO_TRANSLATE;
+               }
+       }
+
+       if (pnv_iommu_bypass_mode == PNV_IOMMU_NO_TRANSLATE) {
+               /*
+                * FIXME: if we enable dynamic switch, here we need to disable
+                * OPAL_PCI_PHB_FLAG_TVE1_4GB
+                */
+               top = roundup_pow_of_two(memblock_end_of_DRAM());
+               if (!opal_pci_map_pe_dma_window_real(phb->opal_id,
+                                       pe->pe_number, window_id,
+                                       pe->table_group.tce64_start, top)) {
+                       pe->tce_bypass_enabled = true;
+                       pe_info(pe, "Enabled 64-bit direct bypass at %llx\n",
+                                       pe->table_group.tce64_start);
+                       return;
+               }
        }
-       if (rc)
-               pe_err(pe, "OPAL error %lld configuring bypass window\n", rc);
-       else
-               pe->tce_bypass_enabled = enable;
 }
 
 static long pnv_pci_ioda2_create_table(int nid, int num, __u64 bus_offset,
@@ -2409,6 +2521,9 @@ static long pnv_pci_ioda2_setup_default_config(struct 
pnv_ioda_pe *pe)
        const unsigned int tceshift = PAGE_SHIFT;
        unsigned long res_start, res_end, tces_order, tcelevel_order, levels;
 
+       if (pnv_iommu_bypass_mode != PNV_IOMMU_BYPASS_DISABLED)
+               pnv_pci_ioda2_set_bypass(pe, true);
+
        /*
         * crashkernel= specifies the kdump kernel's maximum memory at
         * some offset and there is no guaranteed the result is a power
@@ -2432,6 +2547,15 @@ static long pnv_pci_ioda2_setup_default_config(struct 
pnv_ioda_pe *pe)
         */
        /* iommu_table::it_map uses 1 bit per IOMMU page, hence 8 */
        window_size = min((maxblock * 8) << tceshift, max_memory);
+
+       /*
+        * If we get TVE#1_4GB on, there is no point in having a huge default
+        * DMA window.
+        */
+       if (pnv_iommu_bypass_mode == PNV_IOMMU_TCE_BYPASS)
+               window_size = min_t(u64, pe->table_group.tce32_size,
+                               window_size);
+
        /* Each TCE level cannot exceed maxblock so go multilevel if needed */
        tces_order = ilog2(window_size >> tceshift);
        tcelevel_order = ilog2(maxblock >> 3);
@@ -2470,9 +2594,6 @@ static long pnv_pci_ioda2_setup_default_config(struct 
pnv_ioda_pe *pe)
                return rc;
        }
 
-       if (!pnv_iommu_bypass_disabled)
-               pnv_pci_ioda2_set_bypass(pe, true);
-
        /*
         * Set table base for the case of IOMMU DMA use. Usually this is done
         * from dma_dev_setup() which is not called when a device is returned
@@ -2550,15 +2671,27 @@ static long pnv_pci_ioda2_create_table_userspace(
 {
        struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
                        table_group);
-       __u64 bus_offset = num ?
-               pe->table_group.tce64_start : table_group->tce32_start;
-       long ret = pnv_pci_ioda2_create_table(pe->phb->hose->node,
-                       num, bus_offset, page_shift, window_size, levels, true,
-                       ptbl);
+       __u64 bus_offset = num ? table_group->tce64_start :
+                                table_group->tce32_start;
+       long ret;
 
-       if (!ret)
+       if (bus_offset == SZ_4G)
+               ret = pnv_pci_ioda2_create_table(pe->phb->hose->node,
+                               num, 0, page_shift,
+                               roundup_pow_of_two(window_size + bus_offset),
+                               levels, true, ptbl);
+       else
+               ret = pnv_pci_ioda2_create_table(pe->phb->hose->node,
+                               num, bus_offset, page_shift,
+                               window_size,
+                               levels, true, ptbl);
+
+       if (!ret) {
                (*ptbl)->it_allocated_size = pnv_pci_ioda2_get_table_size(
                                page_shift, window_size, levels);
+               (*ptbl)->it_dmaoff = bus_offset >> page_shift;
+       }
+
        return ret;
 }
 
@@ -2624,8 +2757,6 @@ static void pnv_ioda_setup_bus_iommu_group(struct 
pnv_ioda_pe *pe,
                                bus);
 }
 
-static unsigned long pnv_ioda_parse_tce_sizes(struct pnv_phb *phb);
-
 static void pnv_pci_ioda_setup_iommu_api(void)
 {
        struct pci_controller *hose;
-- 
2.17.1

Reply via email to