On Thu, 2022-06-16 at 17:59 +1000, Alexey Kardashevskiy wrote: > The pseries platform uses 32bit default DMA window (always 4K pages) > and > optional 64bit DMA window available via DDW ("Dynamic DMA Windows"), > 64K or 2M pages. For ages the default one was not removed and a huge > window was created in addition. Things changed with SRIOV-enabled > PowerVM which creates a default-and-bigger DMA window in 64bit space > (still using 4K pages) for IOV VFs so certain OSes do not need to use > the DDW API in order to utilize all available TCE budget. > > Linux on the other hand removes the default window and creates a > bigger > one (with more TCEs or/and a bigger page size - 64K/2M) in a bid to > map > the entire RAM, and if the new window size is smaller than that - it > still uses this new bigger window. The result is that the default > window > is removed but the "ibm,dma-window" property is not. > > When kdump is invoked, the existing code tries reusing the existing > 64bit > DMA window which location and parameters are stored in the device > tree but > this fails as the new property does not make it to the kdump device > tree > blob. So the code falls back to the default window which does not > exist > anymore although the device tree says that it does. The result of > that > is that PCI devices become unusable and cannot be used for kdumping. > > This preserves the DMA64 and DIRECT64 properties in the device tree > blob > for the crash kernel. Since the crash kernel setup is done after > device > drivers are loaded and probed, the proper DMA config is stored at > least > for boot time devices. > > Because DDW window is optional and the code configures the default > window > first, the existing code creates an IOMMU table descriptor for > the non-existing default DMA window. It is harmless for kdump as it > does > not touch the actual window (only reads what is mapped and marks > those IO > pages as used) but it is bad for kexec which clears it thinking it is > a smaller default window rather than a bigger DDW window. > > This removes the "ibm,dma-window" property from the device tree after > a bigger window is created and the crash kernel setup picks it up. > > Signed-off-by: Alexey Kardashevskiy <a...@ozlabs.ru>
Hey Alexey, great description of the problem. Would this need a Fixes: tag? > --- > arch/powerpc/kexec/file_load_64.c | 52 +++++++++++++++ > arch/powerpc/platforms/pseries/iommu.c | 88 +++++++++++++++--------- > -- > 2 files changed, 102 insertions(+), 38 deletions(-) > > diff --git a/arch/powerpc/kexec/file_load_64.c > b/arch/powerpc/kexec/file_load_64.c > index b4981b651d9a..b4b486b68b63 100644 > --- a/arch/powerpc/kexec/file_load_64.c > +++ b/arch/powerpc/kexec/file_load_64.c > @@ -1038,6 +1038,48 @@ static int update_cpus_node(void *fdt) > return ret; > } > > +static int copy_dma_property(void *fdt, int node_offset, const > struct device_node *dn, > + const char *propname) > +{ > + const void *prop, *fdtprop; > + int len = 0, fdtlen = 0, ret; > + > + prop = of_get_property(dn, propname, &len); > + fdtprop = fdt_getprop(fdt, node_offset, propname, &fdtlen); > + > + if (fdtprop && !prop) > + ret = fdt_delprop(fdt, node_offset, propname); > + else if (prop) > + ret = fdt_setprop(fdt, node_offset, propname, prop, > len); If fdtprop and prop are both false, ret is returned uninitialised. > + > + return ret; > +} > + > +static int update_pci_nodes(void *fdt, const char *dmapropname) > +{ > + struct device_node *dn; > + int pci_offset, root_offset, ret = 0; > + > + if (!firmware_has_feature(FW_FEATURE_LPAR)) > + return 0; > + > + root_offset = fdt_path_offset(fdt, "/"); > + for_each_node_with_property(dn, dmapropname) { > + pci_offset = fdt_subnode_offset(fdt, root_offset, > of_node_full_name(dn)); > + if (pci_offset < 0) > + continue; > + > + ret = copy_dma_property(fdt, pci_offset, dn, > "ibm,dma-window"); > + if (ret < 0) > + break; > + ret = copy_dma_property(fdt, pci_offset, dn, > dmapropname); > + if (ret < 0) > + break; > + } > + > + return ret; > +} > + > /** > * setup_new_fdt_ppc64 - Update the flattend device-tree of the > kernel > * being loaded. > @@ -1099,6 +1141,16 @@ int setup_new_fdt_ppc64(const struct kimage > *image, void *fdt, > if (ret < 0) > goto out; > > +#define DIRECT64_PROPNAME "linux,direct64-ddr-window-info" > +#define DMA64_PROPNAME "linux,dma64-ddr-window-info" Instead of having these defined in two different places, could they be moved out of iommu.c and into a header? Though we hardcode ibm,dma- window everywhere anyway. > + ret = update_pci_nodes(fdt, DIRECT64_PROPNAME); > + if (ret < 0) > + goto out; > + > + ret = update_pci_nodes(fdt, DMA64_PROPNAME); > + if (ret < 0) > + goto out; > + > /* Update memory reserve map */ > ret = get_reserved_memory_ranges(&rmem); > if (ret) > diff --git a/arch/powerpc/platforms/pseries/iommu.c > b/arch/powerpc/platforms/pseries/iommu.c > index fba64304e859..af3c871668df 100644 > --- a/arch/powerpc/platforms/pseries/iommu.c > +++ b/arch/powerpc/platforms/pseries/iommu.c > @@ -700,6 +700,33 @@ struct iommu_table_ops > iommu_table_lpar_multi_ops = { > .get = tce_get_pSeriesLP > }; > > +/* > + * Find nearest ibm,dma-window (default DMA window) or direct DMA > window or > + * dynamic 64bit DMA window, walking up the device tree. > + */ > +static struct device_node *pci_dma_find(struct device_node *dn, > + const __be32 **dma_window) > +{ > + const __be32 *dw = NULL; > + > + for ( ; dn && PCI_DN(dn); dn = dn->parent) { > + dw = of_get_property(dn, "ibm,dma-window", NULL); > + if (dw) { > + if (dma_window) > + *dma_window = dw; > + return dn; > + } > + dw = of_get_property(dn, DIRECT64_PROPNAME, NULL); > + if (dw) > + return dn; > + dw = of_get_property(dn, DMA64_PROPNAME, NULL); > + if (dw) > + return dn; > + } > + > + return NULL; > +} > + > static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus) > { > struct iommu_table *tbl; > @@ -712,20 +739,10 @@ static void pci_dma_bus_setup_pSeriesLP(struct > pci_bus *bus) > pr_debug("pci_dma_bus_setup_pSeriesLP: setting up bus > %pOF\n", > dn); > > - /* > - * Find nearest ibm,dma-window (default DMA window), walking > up the > - * device tree > - */ > - for (pdn = dn; pdn != NULL; pdn = pdn->parent) { > - dma_window = of_get_property(pdn, "ibm,dma-window", > NULL); > - if (dma_window != NULL) > - break; > - } > + pdn = pci_dma_find(dn, &dma_window); > > - if (dma_window == NULL) { > + if (dma_window == NULL) > pr_debug(" no ibm,dma-window property !\n"); > - return; > - } > > ppci = PCI_DN(pdn); > > @@ -735,11 +752,13 @@ static void pci_dma_bus_setup_pSeriesLP(struct > pci_bus *bus) > if (!ppci->table_group) { > ppci->table_group = iommu_pseries_alloc_group(ppci- > >phb->node); > tbl = ppci->table_group->tables[0]; > - iommu_table_setparms_lpar(ppci->phb, pdn, tbl, > - ppci->table_group, dma_window); > + if (dma_window) { > + iommu_table_setparms_lpar(ppci->phb, pdn, > tbl, > + ppci->table_group, > dma_window); > > - if (!iommu_init_table(tbl, ppci->phb->node, 0, 0)) > - panic("Failed to initialize iommu table"); > + if (!iommu_init_table(tbl, ppci->phb->node, > 0, 0)) > + panic("Failed to initialize iommu > table"); > + } > iommu_register_group(ppci->table_group, > pci_domain_nr(bus), 0); > pr_debug(" created table: %p\n", ppci->table_group); > @@ -1429,16 +1448,22 @@ static bool enable_ddw(struct pci_dev *dev, > struct device_node *pdn) > > pci->table_group->tables[1] = newtbl; > > - /* Keep default DMA window struct if removed */ > - if (default_win_removed) { > - tbl->it_size = 0; > - vfree(tbl->it_map); > - tbl->it_map = NULL; > - } > - > set_iommu_table_base(&dev->dev, newtbl); > } > > + if (default_win_removed) { > + struct property *def_win; Another section of enable_ddw() already has a default_win, could that variable be made top-level and shared? > + struct pci_dn *pci = PCI_DN(pdn); enable_ddw() already has the same variable declared. Otherwise, LGTM. Reviewed-by: Russell Currey <rus...@russell.cc> > + > + iommu_tce_table_put(pci->table_group->tables[0]); > + def_win = of_find_property(pdn, "ibm,dma-window", > NULL); > + if (def_win) { > + of_remove_property(pdn, def_win); > + dev_info(&dev->dev, "Removed default DMA > window for %pOF\n", pdn); > + } > + pci->table_group->tables[0] = NULL; > + } > + > spin_lock(&dma_win_list_lock); > list_add(&window->list, &dma_win_list); > spin_unlock(&dma_win_list_lock); > @@ -1503,13 +1528,7 @@ static void pci_dma_dev_setup_pSeriesLP(struct > pci_dev *dev) > dn = pci_device_to_OF_node(dev); > pr_debug(" node is %pOF\n", dn); > > - for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)- > >table_group; > - pdn = pdn->parent) { > - dma_window = of_get_property(pdn, "ibm,dma-window", > NULL); > - if (dma_window) > - break; > - } > - > + pdn = pci_dma_find(dn, &dma_window); > if (!pdn || !PCI_DN(pdn)) { > printk(KERN_WARNING "pci_dma_dev_setup_pSeriesLP: " > "no DMA window found for pci dev=%s > dn=%pOF\n", > @@ -1540,7 +1559,6 @@ static void pci_dma_dev_setup_pSeriesLP(struct > pci_dev *dev) > static bool iommu_bypass_supported_pSeriesLP(struct pci_dev *pdev, > u64 dma_mask) > { > struct device_node *dn = pci_device_to_OF_node(pdev), *pdn; > - const __be32 *dma_window = NULL; > > /* only attempt to use a new window if 64-bit DMA is > requested */ > if (dma_mask < DMA_BIT_MASK(64)) > @@ -1554,13 +1572,7 @@ static bool > iommu_bypass_supported_pSeriesLP(struct pci_dev *pdev, u64 dma_mask) > * search upwards in the tree until we either hit a dma- > window > * property, OR find a parent with a table already allocated. > */ > - for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)- > >table_group; > - pdn = pdn->parent) { > - dma_window = of_get_property(pdn, "ibm,dma-window", > NULL); > - if (dma_window) > - break; > - } > - > + pdn = pci_dma_find(dn, NULL); > if (pdn && PCI_DN(pdn)) > return enable_ddw(pdev, pdn); >