On Thu, Jun 04, 2020 at 04:44:11PM -0700, Krishna Reddy wrote: > NVIDIA's Tegra194 soc uses two ARM MMU-500s together to interleave
s/soc/SoC/ > IOVA accesses across them. > Add NVIDIA implementation for dual ARM MMU-500s and add new compatible > string for Tegra194 soc. Same here. > > Signed-off-by: Krishna Reddy <vdu...@nvidia.com> > --- > MAINTAINERS | 2 + > drivers/iommu/Makefile | 2 +- > drivers/iommu/arm-smmu-impl.c | 3 + > drivers/iommu/arm-smmu-nvidia.c | 161 ++++++++++++++++++++++++++++++++ > drivers/iommu/arm-smmu.h | 1 + > 5 files changed, 168 insertions(+), 1 deletion(-) > create mode 100644 drivers/iommu/arm-smmu-nvidia.c > > diff --git a/MAINTAINERS b/MAINTAINERS > index 50659d76976b7..118da0893c964 100644 > --- a/MAINTAINERS > +++ b/MAINTAINERS > @@ -16572,9 +16572,11 @@ F: drivers/i2c/busses/i2c-tegra.c > > TEGRA IOMMU DRIVERS > M: Thierry Reding <thierry.red...@gmail.com> > +R: Krishna Reddy <vdu...@nvidia.com> > L: linux-te...@vger.kernel.org > S: Supported > F: drivers/iommu/tegra* > +F: drivers/iommu/arm-smmu-nvidia.c > > TEGRA KBC DRIVER > M: Laxman Dewangan <ldewan...@nvidia.com> > diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile > index 57cf4ba5e27cb..35542df00da72 100644 > --- a/drivers/iommu/Makefile > +++ b/drivers/iommu/Makefile > @@ -15,7 +15,7 @@ obj-$(CONFIG_AMD_IOMMU) += amd_iommu.o amd_iommu_init.o > amd_iommu_quirks.o > obj-$(CONFIG_AMD_IOMMU_DEBUGFS) += amd_iommu_debugfs.o > obj-$(CONFIG_AMD_IOMMU_V2) += amd_iommu_v2.o > obj-$(CONFIG_ARM_SMMU) += arm_smmu.o > -arm_smmu-objs += arm-smmu.o arm-smmu-impl.o arm-smmu-qcom.o > +arm_smmu-objs += arm-smmu.o arm-smmu-impl.o arm-smmu-qcom.o arm-smmu-nvidia.o > obj-$(CONFIG_ARM_SMMU_V3) += arm-smmu-v3.o > obj-$(CONFIG_DMAR_TABLE) += dmar.o > obj-$(CONFIG_INTEL_IOMMU) += intel-iommu.o intel-pasid.o > diff --git a/drivers/iommu/arm-smmu-impl.c b/drivers/iommu/arm-smmu-impl.c > index c75b9d957b702..52c84c30f83e4 100644 > --- a/drivers/iommu/arm-smmu-impl.c > +++ b/drivers/iommu/arm-smmu-impl.c > @@ -160,6 +160,9 @@ struct arm_smmu_device *arm_smmu_impl_init(struct > arm_smmu_device *smmu) > */ > switch (smmu->model) { > case ARM_MMU500: > + if (of_device_is_compatible(smmu->dev->of_node, > + "nvidia,tegra194-smmu-500")) > + return nvidia_smmu_impl_init(smmu); Should NVIDIA_TEGRA194_SMMU be a separate value for smmu->model, perhaps? That way we avoid this somewhat odd check here. > smmu->impl = &arm_mmu500_impl; > break; > case CAVIUM_SMMUV2: > diff --git a/drivers/iommu/arm-smmu-nvidia.c b/drivers/iommu/arm-smmu-nvidia.c I wonder if it would be better to name this arm-smmu-tegra.c to make it clearer that this is for a Tegra chip. We do have regular expressions in MAINTAINERS that catch anything with "tegra" in it to make this easier. > new file mode 100644 > index 0000000000000..dafc293a45217 > --- /dev/null > +++ b/drivers/iommu/arm-smmu-nvidia.c > @@ -0,0 +1,161 @@ > +// SPDX-License-Identifier: GPL-2.0-only > +// Nvidia ARM SMMU v2 implementation quirks s/Nvidia/NVIDIA/ > +// Copyright (C) 2019 NVIDIA CORPORATION. All rights reserved. I suppose this should now also include 2020. > + > +#define pr_fmt(fmt) "nvidia-smmu: " fmt Same here. Might be worth making this "tegra-smmu: " for consistency. > + > +#include <linux/bitfield.h> > +#include <linux/delay.h> > +#include <linux/of.h> > +#include <linux/platform_device.h> > +#include <linux/slab.h> > + > +#include "arm-smmu.h" > + > +/* Tegra194 has three ARM MMU-500 Instances. > + * Two of them are used together for Interleaved IOVA accesses and > + * used by Non-Isochronous Hw devices for SMMU translations. "non-isochronous", s/Hw/HW/ > + * Third one is used for SMMU translations from Isochronous HW devices. "isochronous" > + * It is possible to use this Implementation to program either "implementation" > + * all three or two of the instances identically as desired through > + * DT node. > + * > + * Programming all the three instances identically comes with redundant tlb s/tlb/TLB/ > + * invalidations as all three never need to be tlb invalidated for a HW > device. Same here. > + * > + * When Linux Kernel supports multiple SMMU devices, The SMMU device used for "kernel" and "..., the SMMU device" > + * Isochornous HW devices should be added as a separate ARM MMU-500 device "isochronous" > + * in DT and be programmed independently for efficient tlb invalidates. "TLB" > + * > + */ > +#define MAX_SMMU_INSTANCES 3 > + > +#define TLB_LOOP_TIMEOUT 1000000 /* 1s! */ USEC_PER_SEC? > +#define TLB_SPIN_COUNT 10 > + > +struct nvidia_smmu { > + struct arm_smmu_device smmu; > + unsigned int num_inst; > + void __iomem *bases[MAX_SMMU_INSTANCES]; > +}; > + > +#define to_nvidia_smmu(s) container_of(s, struct nvidia_smmu, smmu) Making this static inline can make error messages more readable. > + > +#define nsmmu_page(smmu, inst, page) \ > + (((inst) ? to_nvidia_smmu(smmu)->bases[(inst)] : smmu->base) + \ > + ((page) << smmu->pgshift)) Can we simply define to_nvidia_smmu(smmu)->bases[0] = smmu->base in nvidia_smmu_impl_init()? Then this would become just: to_nvidia_smmu(smmu)->bases[inst] + ((page) << (smmu)->pgshift) Also, the nsmmu_ prefix looks somewhat odd here. You already use struct nvidia_smmu as the name of the structure, so why not be consistent and continue to use nvidia_smmu_ as the prefix for function names? Or perhaps even use tegra_smmu_ as the prefix to match the filename change I suggested earlier. > + > +static u32 nsmmu_read_reg(struct arm_smmu_device *smmu, > + int page, int offset) > +{ > + return readl_relaxed(nsmmu_page(smmu, 0, page) + offset); > +} > + > +static void nsmmu_write_reg(struct arm_smmu_device *smmu, > + int page, int offset, u32 val) > +{ > + unsigned int i; > + > + for (i = 0; i < to_nvidia_smmu(smmu)->num_inst; i++) > + writel_relaxed(val, nsmmu_page(smmu, i, page) + offset); > +} > + > +static u64 nsmmu_read_reg64(struct arm_smmu_device *smmu, > + int page, int offset) > +{ > + return readq_relaxed(nsmmu_page(smmu, 0, page) + offset); > +} > + > +static void nsmmu_write_reg64(struct arm_smmu_device *smmu, > + int page, int offset, u64 val) > +{ > + unsigned int i; > + > + for (i = 0; i < to_nvidia_smmu(smmu)->num_inst; i++) > + writeq_relaxed(val, nsmmu_page(smmu, i, page) + offset); > +} > + > +static void nsmmu_tlb_sync(struct arm_smmu_device *smmu, int page, > + int sync, int status) > +{ > + u32 reg; I see this is being used to store the value read from a register. I find it clearer to call this "value" or "val" (or in this case perhaps even "status") because whenever I read "reg" I immediately think this is meant to be a register offset, which can then be confusing when I see it used in I/O accessors because it is in the wrong position. But anyway, that's just my opinion and this is a bit subjective, so feel free to ignore. > + unsigned int i; > + unsigned int spin_cnt, delay; > + > + arm_smmu_writel(smmu, page, sync, 0); > + > + for (delay = 1; delay < TLB_LOOP_TIMEOUT; delay *= 2) { > + for (spin_cnt = TLB_SPIN_COUNT; spin_cnt > 0; spin_cnt--) { > + reg = 0; You may want to declare the variable at this scope since you never need it outside. Also, use a space between variable initialization and the for block below for better readability. > + for (i = 0; i < to_nvidia_smmu(smmu)->num_inst; i++) { > + reg |= readl_relaxed( > + nsmmu_page(smmu, i, page) + status); > + } Maybe add a separate variable for the page address so this can be a bit uncluttered. Also, I'd prefer a blank line after the block for readability. > + if (!(reg & ARM_SMMU_sTLBGSTATUS_GSACTIVE)) > + return; > + cpu_relax(); Blank line above cpu_relax() for readability. > + } > + udelay(delay); Again, a blank line between blocks and subsequent statements can help readability. > + } > + dev_err_ratelimited(smmu->dev, > + "TLB sync timed out -- SMMU may be deadlocked\n"); Same here. Also, is there anything we can do when this happens? > +} > + > +static int nsmmu_reset(struct arm_smmu_device *smmu) > +{ > + u32 reg; > + unsigned int i; > + > + for (i = 0; i < to_nvidia_smmu(smmu)->num_inst; i++) { > + /* clear global FSR */ > + reg = readl_relaxed(nsmmu_page(smmu, i, ARM_SMMU_GR0) + > + ARM_SMMU_GR0_sGFSR); > + writel_relaxed(reg, nsmmu_page(smmu, i, ARM_SMMU_GR0) + > + ARM_SMMU_GR0_sGFSR); > + } > + > + return 0; > +} > + > +static const struct arm_smmu_impl nvidia_smmu_impl = { > + .read_reg = nsmmu_read_reg, > + .write_reg = nsmmu_write_reg, > + .read_reg64 = nsmmu_read_reg64, > + .write_reg64 = nsmmu_write_reg64, > + .reset = nsmmu_reset, > + .tlb_sync = nsmmu_tlb_sync, > +}; > + > +struct arm_smmu_device *nvidia_smmu_impl_init(struct arm_smmu_device *smmu) > +{ > + unsigned int i; > + struct nvidia_smmu *nsmmu; > + struct resource *res; > + struct device *dev = smmu->dev; > + struct platform_device *pdev = to_platform_device(smmu->dev); > + > + nsmmu = devm_kzalloc(smmu->dev, sizeof(*nsmmu), GFP_KERNEL); > + if (!nsmmu) > + return ERR_PTR(-ENOMEM); > + > + nsmmu->smmu = *smmu; > + /* Instance 0 is ioremapped by arm-smmu.c */ > + nsmmu->num_inst = 1; Maybe add this here to simplify the nsmmu_page() macro above: nsmmu->bases[0] = smmu->base; > + > + for (i = 1; i < MAX_SMMU_INSTANCES; i++) { > + res = platform_get_resource(pdev, IORESOURCE_MEM, i); > + if (!res) > + break; > + nsmmu->bases[i] = devm_ioremap_resource(dev, res); > + if (IS_ERR(nsmmu->bases[i])) > + return (struct arm_smmu_device *)nsmmu->bases[i]; ERR_CAST()? > + nsmmu->num_inst++; > + } More blank lines would make this much easier to read, in my opinion. > + > + nsmmu->smmu.impl = &nvidia_smmu_impl; > + devm_kfree(smmu->dev, smmu); Maybe a comment here would be useful for readers to immediately understand why you're doing this here. > + pr_info("NVIDIA ARM SMMU Implementation, Instances=%d\n", > + nsmmu->num_inst); I think I'd just omit this. In general you should only let the user know when things go wrong, but the above is printed when everything goes as expected. Thierry
signature.asc
Description: PGP signature