On Thu, 4 Apr 2019 16:23:24 +1100 Alexey Kardashevskiy <a...@ozlabs.ru> wrote:
> The NVIDIA V100 SXM2 GPUs are connected to the CPU via PCIe links and > (on POWER9) NVLinks. In addition to that, GPUs themselves have direct > peer to peer NVLinks in groups of 2 to 4 GPUs. At the moment the POWERNV > platform puts all interconnected GPUs to the same IOMMU group. > > However the user may want to pass individual GPUs to the userspace so > in order to do so we need to put them into separate IOMMU groups and > cut off the interconnects. > > Thankfully V100 GPUs implement an interface to do by programming link > disabling mask to BAR0 of a GPU. Once a link is disabled in a GPU using > this interface, it cannot be re-enabled until the secondary bus reset is > issued to the GPU. > > This adds an extra step to the secondary bus reset handler (the one used > for such GPUs) to block NVLinks to GPUs which do not belong to the same > group as the GPU being reset. > > This adds a new "isolate_nvlink" kernel parameter to allow GPU isolation; > when enabled, every GPU gets its own IOMMU group. The new parameter is off > by default to preserve the existing behaviour. > > Signed-off-by: Alexey Kardashevskiy <a...@ozlabs.ru> > --- > Changes: > v2: > * this is rework of [PATCH kernel RFC 0/2] vfio, powerpc/powernv: Isolate > GV100GL > but this time it is contained in the powernv platform > --- > arch/powerpc/platforms/powernv/Makefile | 2 +- > arch/powerpc/platforms/powernv/pci.h | 1 + > arch/powerpc/platforms/powernv/eeh-powernv.c | 1 + > arch/powerpc/platforms/powernv/npu-dma.c | 24 +++- > arch/powerpc/platforms/powernv/nvlinkgpu.c | 131 +++++++++++++++++++ > 5 files changed, 156 insertions(+), 3 deletions(-) > create mode 100644 arch/powerpc/platforms/powernv/nvlinkgpu.c > > diff --git a/arch/powerpc/platforms/powernv/Makefile > b/arch/powerpc/platforms/powernv/Makefile > index da2e99efbd04..60a10d3b36eb 100644 > --- a/arch/powerpc/platforms/powernv/Makefile > +++ b/arch/powerpc/platforms/powernv/Makefile > @@ -6,7 +6,7 @@ obj-y += opal-msglog.o opal-hmi.o > opal-power.o opal-irqchip.o > obj-y += opal-kmsg.o opal-powercap.o opal-psr.o > opal-sensor-groups.o > > obj-$(CONFIG_SMP) += smp.o subcore.o subcore-asm.o > -obj-$(CONFIG_PCI) += pci.o pci-ioda.o npu-dma.o pci-ioda-tce.o > +obj-$(CONFIG_PCI) += pci.o pci-ioda.o npu-dma.o pci-ioda-tce.o nvlinkgpu.o > obj-$(CONFIG_CXL_BASE) += pci-cxl.o > obj-$(CONFIG_EEH) += eeh-powernv.o > obj-$(CONFIG_PPC_SCOM) += opal-xscom.o > diff --git a/arch/powerpc/platforms/powernv/pci.h > b/arch/powerpc/platforms/powernv/pci.h > index 8e36da379252..9fd3f391482c 100644 > --- a/arch/powerpc/platforms/powernv/pci.h > +++ b/arch/powerpc/platforms/powernv/pci.h > @@ -250,5 +250,6 @@ extern void pnv_pci_unlink_table_and_group(struct > iommu_table *tbl, > extern void pnv_pci_setup_iommu_table(struct iommu_table *tbl, > void *tce_mem, u64 tce_size, > u64 dma_offset, unsigned int page_shift); > +extern void pnv_try_isolate_nvidia_v100(struct pci_dev *gpdev); > > #endif /* __POWERNV_PCI_H */ > diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c > b/arch/powerpc/platforms/powernv/eeh-powernv.c > index f38078976c5d..464b097d9635 100644 > --- a/arch/powerpc/platforms/powernv/eeh-powernv.c > +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c > @@ -937,6 +937,7 @@ void pnv_pci_reset_secondary_bus(struct pci_dev *dev) > pnv_eeh_bridge_reset(dev, EEH_RESET_HOT); > pnv_eeh_bridge_reset(dev, EEH_RESET_DEACTIVATE); > } > + pnv_try_isolate_nvidia_v100(dev); > } > > static void pnv_eeh_wait_for_pending(struct pci_dn *pdn, const char *type, > diff --git a/arch/powerpc/platforms/powernv/npu-dma.c > b/arch/powerpc/platforms/powernv/npu-dma.c > index dc23d9d2a7d9..017eae8197e7 100644 > --- a/arch/powerpc/platforms/powernv/npu-dma.c > +++ b/arch/powerpc/platforms/powernv/npu-dma.c > @@ -529,6 +529,23 @@ static void pnv_comp_attach_table_group(struct npu_comp > *npucomp, > ++npucomp->pe_num; > } > > +static bool isolate_nvlink; > + > +static int __init parse_isolate_nvlink(char *p) > +{ > + bool val; > + > + if (!p) > + val = true; > + else if (kstrtobool(p, &val)) > + return -EINVAL; > + > + isolate_nvlink = val; > + > + return 0; > +} > +early_param("isolate_nvlink", parse_isolate_nvlink); > + > struct iommu_table_group *pnv_try_setup_npu_table_group(struct pnv_ioda_pe > *pe) > { > struct iommu_table_group *table_group; > @@ -549,7 +566,7 @@ struct iommu_table_group > *pnv_try_setup_npu_table_group(struct pnv_ioda_pe *pe) > > hose = pci_bus_to_host(npdev->bus); > > - if (hose->npu) { > + if (hose->npu && !isolate_nvlink) { > table_group = &hose->npu->npucomp.table_group; > > if (!table_group->group) { > @@ -559,7 +576,10 @@ struct iommu_table_group > *pnv_try_setup_npu_table_group(struct pnv_ioda_pe *pe) > pe->pe_number); > } > } else { > - /* Create a group for 1 GPU and attached NPUs for POWER8 */ > + /* > + * Create a group for 1 GPU and attached NPUs for > + * POWER8 (always) or POWER9 (when isolate_nvlink). > + */ > pe->npucomp = kzalloc(sizeof(*pe->npucomp), GFP_KERNEL); > table_group = &pe->npucomp->table_group; > table_group->ops = &pnv_npu_peers_ops; > diff --git a/arch/powerpc/platforms/powernv/nvlinkgpu.c > b/arch/powerpc/platforms/powernv/nvlinkgpu.c > new file mode 100644 > index 000000000000..dbd8e9d47a05 > --- /dev/null > +++ b/arch/powerpc/platforms/powernv/nvlinkgpu.c > @@ -0,0 +1,131 @@ > +// SPDX-License-Identifier: GPL-2.0+ > +/* > + * A helper to disable NVLinks between GPUs on IBM Withersponn platform. > + * > + * Copyright (C) 2019 IBM Corp. All rights reserved. > + * Author: Alexey Kardashevskiy <a...@ozlabs.ru> > + * > + * This program is free software; you can redistribute it and/or modify > + * it under the terms of the GNU General Public License version 2 as > + * published by the Free Software Foundation. > + */ > + > +#include <linux/module.h> > +#include <linux/device.h> > +#include <linux/of.h> > +#include <linux/iommu.h> > +#include <linux/pci.h> > + > +static int nvlinkgpu_is_ph_in_group(struct device *dev, void *data) > +{ > + return dev->of_node->phandle == *(phandle *) data; > +} > + > +static u32 nvlinkgpu_get_disable_mask(struct device *dev) > +{ > + int npu, peer; > + u32 mask; > + struct device_node *dn; > + struct iommu_group *group; > + > + dn = dev->of_node; > + if (!of_find_property(dn, "ibm,nvlink-peers", NULL)) > + return 0; > + > + group = iommu_group_get(dev); > + if (!group) > + return 0; > + > + /* > + * Collect links to keep which includes links to NPU and links to > + * other GPUs in the same IOMMU group. > + */ > + for (npu = 0, mask = 0; ; ++npu) { > + u32 npuph = 0; > + > + if (of_property_read_u32_index(dn, "ibm,npu", npu, &npuph)) > + break; > + > + for (peer = 0; ; ++peer) { > + u32 peerph = 0; > + > + if (of_property_read_u32_index(dn, "ibm,nvlink-peers", > + peer, &peerph)) > + break; > + > + if (peerph != npuph && > + !iommu_group_for_each_dev(group, &peerph, > + nvlinkgpu_is_ph_in_group)) > + continue; > + > + mask |= 1 << (peer + 16); > + } > + } > + iommu_group_put(group); > + > + /* Disabling mechanism takes links to disable so invert it here */ > + mask = ~mask & 0x3F0000; > + > + return mask; > +} > + > +void pnv_try_isolate_nvidia_v100(struct pci_dev *bridge) > +{ > + u32 mask; > + void __iomem *bar0_0, *bar0_120000, *bar0_a00000; > + struct pci_dev *pdev; > + > + if (!bridge->subordinate) > + return; > + > + pdev = list_first_entry_or_null(&bridge->subordinate->devices, > + struct pci_dev, bus_list); > + if (!pdev) > + return; > + > + if (pdev->vendor != PCI_VENDOR_ID_NVIDIA) > + return; > + > + mask = nvlinkgpu_get_disable_mask(&pdev->dev); > + if (!mask) > + return; > + > + bar0_0 = pci_iomap_range(pdev, 0, 0, 0x10000); > + bar0_120000 = pci_iomap_range(pdev, 0, 0x120000, 0x10000); > + bar0_a00000 = pci_iomap_range(pdev, 0, 0xA00000, 0x10000); > + > + if (bar0_120000 && bar0_0 && bar0_a00000) { > + u32 val; > + u16 cmd = 0, cmdmask = PCI_COMMAND_MEMORY; > + > + pci_restore_state(pdev); > + pci_read_config_word(pdev, PCI_COMMAND, &cmd); > + if ((cmd & cmdmask) != cmdmask) > + pci_write_config_word(pdev, PCI_COMMAND, cmd | cmdmask); > + > + /* > + * The sequence is from "Tesla P100 and V100 SXM2 NVLink > + * Isolation on Multi-Tenant Systems". > + * The register names are not provided there either, > + * hence raw values. > + */ > + iowrite32(0x4, bar0_120000 + 0x4C); > + iowrite32(0x2, bar0_120000 + 0x2204); > + val = ioread32(bar0_0 + 0x200); > + val |= 0x02000000; > + iowrite32(val, bar0_0 + 0x200); > + val = ioread32(bar0_a00000 + 0x148); > + val |= mask; > + iowrite32(val, bar0_a00000 + 0x148); > + > + if ((cmd | cmdmask) != cmd) > + pci_write_config_word(pdev, PCI_COMMAND, cmd); > + } > + It's a little troubling that if something goes wrong with an iomap we'll silently fail to re-enable the expected isolation. Seems worthy of at least a pci_warn/err. Thanks, Alex > + if (bar0_0) > + pci_iounmap(pdev, bar0_0); > + if (bar0_120000) > + pci_iounmap(pdev, bar0_120000); > + if (bar0_a00000) > + pci_iounmap(pdev, bar0_a00000); > +}