Re: [PATCH v4 7/8] linux/log2.h: Fix 64bit calculations in roundup/down_pow_two()
add5860e 100644 > --- a/drivers/net/ethernet/sfc/efx.h > +++ b/drivers/net/ethernet/sfc/efx.h > @@ -52,7 +52,7 @@ void efx_schedule_slow_fill(struct efx_rx_queue *rx_queue); > > #define EFX_MAX_DMAQ_SIZE 4096UL > #define EFX_DEFAULT_DMAQ_SIZE 1024UL > -#define EFX_MIN_DMAQ_SIZE 512UL > +#define EFX_MIN_DMAQ_SIZE 512ULL > > #define EFX_MAX_EVQ_SIZE 16384UL > #define EFX_MIN_EVQ_SIZE 512UL > diff --git a/drivers/net/ethernet/sfc/falcon/efx.h > b/drivers/net/ethernet/sfc/falcon/efx.h > index d3b4646545fa..0d16257156d6 100644 > --- a/drivers/net/ethernet/sfc/falcon/efx.h > +++ b/drivers/net/ethernet/sfc/falcon/efx.h > @@ -55,7 +55,7 @@ void ef4_schedule_slow_fill(struct ef4_rx_queue *rx_queue); > > #define EF4_MAX_DMAQ_SIZE 4096UL > #define EF4_DEFAULT_DMAQ_SIZE 1024UL > -#define EF4_MIN_DMAQ_SIZE 512UL > +#define EF4_MIN_DMAQ_SIZE 512ULL > > #define EF4_MAX_EVQ_SIZE 16384UL > #define EF4_MIN_EVQ_SIZE 512UL > diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c > index c7709e49f0e4..f0391e88bc42 100644 > --- a/drivers/pci/msi.c > +++ b/drivers/pci/msi.c > @@ -578,7 +578,7 @@ msi_setup_entry(struct pci_dev *dev, int nvec, struct > irq_affinity *affd) > entry->msi_attrib.maskbit = !!(control & PCI_MSI_FLAGS_MASKBIT); > entry->msi_attrib.default_irq = dev->irq; /* Save IOAPIC IRQ */ > entry->msi_attrib.multi_cap = (control & PCI_MSI_FLAGS_QMASK) >> 1; > - entry->msi_attrib.multiple = ilog2(__roundup_pow_of_two(nvec)); > + entry->msi_attrib.multiple = ilog2(roundup_pow_of_two(nvec)); > > if (control & PCI_MSI_FLAGS_64BIT) > entry->mask_pos = dev->msi_cap + PCI_MSI_MASK_64; > diff --git a/include/linux/log2.h b/include/linux/log2.h > index 83a4a3ca3e8a..53a727303dac 100644 > --- a/include/linux/log2.h > +++ b/include/linux/log2.h > @@ -47,26 +47,6 @@ bool is_power_of_2(unsigned long n) > return (n != 0 && ((n & (n - 1)) == 0)); > } > > -/** > - * __roundup_pow_of_two() - round up to nearest power of two > - * @n: value to round up > - */ > -static inline __attribute__((const)) > -unsigned long __roundup_pow_of_two(unsigned long n) > -{ > - return 1UL << fls_long(n - 1); > -} > - > -/** > - * __rounddown_pow_of_two() - round down to nearest power of two > - * @n: value to round down > - */ > -static inline __attribute__((const)) > -unsigned long __rounddown_pow_of_two(unsigned long n) > -{ > - return 1UL << (fls_long(n) - 1); > -} > - > /** > * const_ilog2 - log base 2 of 32-bit or a 64-bit constant unsigned value > * @n: parameter > @@ -170,14 +150,11 @@ unsigned long __rounddown_pow_of_two(unsigned long n) > * - the result is undefined when n == 0 > * - this can be used to initialise global variables from constant data > */ > -#define roundup_pow_of_two(n)\ > -(\ > - __builtin_constant_p(n) ? ( \ > - (n == 1) ? 1 : \ > - (1UL << (ilog2((n) - 1) + 1)) \ > -) : \ > - __roundup_pow_of_two(n) \ > - ) > +#define roundup_pow_of_two(n) \ > +( \ > + (__builtin_constant_p(n) && ((n) == 1)) ? \ > + 1 : (1ULL << (ilog2((n) - 1) + 1))\ > +) > > /** > * rounddown_pow_of_two - round the given value down to nearest power of two > @@ -187,12 +164,11 @@ unsigned long __rounddown_pow_of_two(unsigned long n) > * - the result is undefined when n == 0 > * - this can be used to initialise global variables from constant data > */ > -#define rounddown_pow_of_two(n) \ > -(\ > - __builtin_constant_p(n) ? ( \ > - (1UL << ilog2(n))) :\ > - __rounddown_pow_of_two(n) \ > - ) > +#define rounddown_pow_of_two(n)\ > +( \ > + (__builtin_constant_p(n) && ((n) == 1)) ? 
\ > + 1 : (1ULL << (ilog2(n))) \ > +) > > static inline __attribute_const__ > int __order_base_2(unsigned long n) > diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c > index 15d70a90b50d..bb9efc6944a4 100644 > --- a/kernel/kexec_core.c > +++ b/kernel/kexec_core.c > @@ -1094,7 +1094,8 @@ static int __init crash_notes_memory_init(void) >* crash_notes is allocated inside one physical page. >*/ > size = sizeof(note_
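The hunks quoted above replace the 1UL shift base with 1ULL so that roundup_pow_of_two()/rounddown_pow_of_two() produce correct results for 64-bit inputs even where unsigned long is only 32 bits wide. A minimal userspace sketch of that arithmetic, using a hypothetical helper name rather than the kernel macro itself:

#include <stdint.h>
#include <stdio.h>

/* Round up to the next power of two using a 64-bit-wide shift base. */
static uint64_t roundup_pow_of_two64(uint64_t n)
{
	if (n == 1)		/* the kernel macro special-cases n == 1 */
		return 1;
	/* highest set bit of (n - 1), then shift a 64-bit one up to it */
	return 1ULL << (64 - __builtin_clzll(n - 1));
}

int main(void)
{
	/* 2^32 + 1 rounds up to 2^33; a 32-bit-wide "1" cannot be shifted that far */
	printf("%#llx\n", (unsigned long long)roundup_pow_of_two64(0x100000001ULL));
	return 0;
}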
Re: AMD IOMMU stops RDMA NFS from working since kernel 5.5 (bisected)
Andre- Thank you for the detailed report! Tom- There is a rich set of trace points available in the RPC/RDMA implementation in 5.4/5.5, fwiw. Please keep me in the loop, let me know if there is anything I can do to help. > On Feb 11, 2020, at 2:25 AM, Joerg Roedel wrote: > > Adding Tom's new email address. > > Tom, can you have a look, please? > https://bugzilla.kernel.org/show_bug.cgi?id=206461 seems to be a similar > issue. > > On Tue, Feb 11, 2020 at 06:06:54AM +0100, Andre Tomt wrote: >> Since upgrading my RDMA lab from kernel 5.4.x to 5.5.x, NFSv4 over RDMA >> stopped working. But only on my AMD Ryzen systems. And so far only NFS, >> curiously other RDMA diagnostic tools (like qperf -cm1 rc_bw) work >> fine. >> >> A git bisect points to be62dbf554c5b50718a54a359372c148cd9975c7 iommu/amd: >> Convert AMD iommu driver to the dma-iommu api >> >> 5.5.3-rc1, 5.6-rc1 are also not working. >> >> I verified it by booting with amd_iommu=off on the kernel cmdline - it makes >> everything work again. >> >> The NFS config is a pretty simple NFSv4.x only, sec=sys setup, running over >> RoCEv1 on Mellanox mlx4 hardware (ConnectX-3 Pro, fw 2.42.5000). Nothing >> fancy besides the RoCEv1 and related bits network bits like PFC and storage >> VLAN. Bare metal, no virtualization. >> >> The impacted systems are: >> ASUS ROG STRIX X399-E GAMING, with a Threadripper 1950x, BIOS 1002 >> ASUS Pro WS X570-ACE, with a Ryzen 7 3700x, BIOS 1201 >> >> pcaps off a mirror port can be provided. They show that on 5.5.x, CM >> succeeds, and then a couple of NFS NULL calls comes through (over RoCE), >> both acked, and then the rest just never goes out from the client until the >> mount times out and CM is torn down. >> >> No messages shows up in the kernel log on either side. I was at least >> expecting some scary IOMMU warnings. >> >> More serious hardware is not available for RDMA testing currently, so I dont >> know if a EPYC system or newer mlx5 cards would have similar issues. Intel >> I've only tested as server so far, that worked fine, as expected given the >> bisect result. 
>> >> >>> git bisect start >>> # bad: [d5226fa6dbae0569ee43ecfc08bdcd6770fc4755] Linux 5.5 >>> git bisect bad d5226fa6dbae0569ee43ecfc08bdcd6770fc4755 >>> # good: [219d54332a09e8d8741c1e1982f5eae56099de85] Linux 5.4 >>> git bisect good 219d54332a09e8d8741c1e1982f5eae56099de85 >>> # good: [8c39f71ee2019e77ee14f88b1321b2348db51820] Merge >>> git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net >>> git bisect good 8c39f71ee2019e77ee14f88b1321b2348db51820 >>> # bad: [76bb8b05960c3d1668e6bee7624ed886cbd135ba] Merge tag 'kbuild-v5.5' >>> of git://git.kernel.org/pub/scm/linux/kernel/git/masahiroy/linux-kbuild >>> git bisect bad 76bb8b05960c3d1668e6bee7624ed886cbd135ba >>> # good: [21b26d2679584c6a60e861aa3e5ca09a6bab0633] Merge tag >>> '5.5-rc-smb3-fixes' of git://git.samba.org/sfrench/cifs-2.6 >>> git bisect good 21b26d2679584c6a60e861aa3e5ca09a6bab0633 >>> # good: [e5b3fc125d768eacd73bb4dc5019f0ce95635af4] Merge branch >>> 'x86-urgent-for-linus' of >>> git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip >>> git bisect good e5b3fc125d768eacd73bb4dc5019f0ce95635af4 >>> # bad: [937d6eefc716a9071f0e3bada19200de1bb9d048] Merge tag 'docs-5.5a' of >>> git://git.lwn.net/linux >>> git bisect bad 937d6eefc716a9071f0e3bada19200de1bb9d048 >>> # bad: [1daa56bcfd8b329447e0c1b1e91c3925d08489b7] Merge tag >>> 'iommu-updates-v5.5' of >>> git://git.kernel.org/pub/scm/linux/kernel/git/joro/iommu >>> git bisect bad 1daa56bcfd8b329447e0c1b1e91c3925d08489b7 >>> # good: [937790699be9c8100e5358625e7dfa8b32bd33f2] mm/page_io.c: annotate >>> refault stalls from swap_readpage >>> git bisect good 937790699be9c8100e5358625e7dfa8b32bd33f2 >>> # good: [a5255bc31673c72e264d837cd13cd3085d72cb58] Merge tag >>> 'dmaengine-5.5-rc1' of git://git.infradead.org/users/vkoul/slave-dma >>> git bisect good a5255bc31673c72e264d837cd13cd3085d72cb58 >>> # good: [34d1b0895dbd10713c73615d8f532e78509e12d9] iommu/arm-smmu: Remove >>> duplicate error message >>> git bisect good 34d1b0895dbd10713c73615d8f532e78509e12d9 >>> # bad: [3c124435e8dd516df4b2fc983f4415386fd6edae] iommu/amd: Support >>> multiple PCI DMA aliases in IRQ Remapping >>> git bisect bad 3c124435e8dd516df4b2fc983f4415386fd6edae >>
Re: AMD IOMMU stops RDMA NFS from working since kernel 5.5 (bisected)
> On Feb 11, 2020, at 10:12 AM, Robin Murphy wrote: > > On 11/02/2020 1:48 pm, Chuck Lever wrote: >> Andre- >> Thank you for the detailed report! >> Tom- >> There is a rich set of trace points available in the RPC/RDMA implementation >> in 5.4/5.5, fwiw. >> Please keep me in the loop, let me know if there is anything I can do to >> help. > > One aspect that may be worth checking is whether there's anywhere that > assumes a successful return value from dma_map_sg() is always the same as the > number of entries passed in - that's the most obvious way the iommu-dma code > differs (legitimately) from the previous amd-iommu implementation. net/sunrpc/xprtrdma/frwr_ops.c: frwr_map() 317 mr->mr_nents = 318 ib_dma_map_sg(ia->ri_id->device, mr->mr_sg, i, mr->mr_dir); 319 if (!mr->mr_nents) 320 goto out_dmamap_err; Should that rather be "if (mr->mr_nents != i)" ? > Robin. > >>> On Feb 11, 2020, at 2:25 AM, Joerg Roedel wrote: >>> >>> Adding Tom's new email address. >>> >>> Tom, can you have a look, please? >>> https://bugzilla.kernel.org/show_bug.cgi?id=206461 seems to be a similar >>> issue. >>> >>> On Tue, Feb 11, 2020 at 06:06:54AM +0100, Andre Tomt wrote: >>>> Since upgrading my RDMA lab from kernel 5.4.x to 5.5.x, NFSv4 over RDMA >>>> stopped working. But only on my AMD Ryzen systems. And so far only NFS, >>>> curiously other RDMA diagnostic tools (like qperf -cm1 rc_bw) work >>>> fine. >>>> >>>> A git bisect points to be62dbf554c5b50718a54a359372c148cd9975c7 iommu/amd: >>>> Convert AMD iommu driver to the dma-iommu api >>>> >>>> 5.5.3-rc1, 5.6-rc1 are also not working. >>>> >>>> I verified it by booting with amd_iommu=off on the kernel cmdline - it >>>> makes >>>> everything work again. >>>> >>>> The NFS config is a pretty simple NFSv4.x only, sec=sys setup, running over >>>> RoCEv1 on Mellanox mlx4 hardware (ConnectX-3 Pro, fw 2.42.5000). Nothing >>>> fancy besides the RoCEv1 and related bits network bits like PFC and storage >>>> VLAN. Bare metal, no virtualization. >>>> >>>> The impacted systems are: >>>> ASUS ROG STRIX X399-E GAMING, with a Threadripper 1950x, BIOS 1002 >>>> ASUS Pro WS X570-ACE, with a Ryzen 7 3700x, BIOS 1201 >>>> >>>> pcaps off a mirror port can be provided. They show that on 5.5.x, CM >>>> succeeds, and then a couple of NFS NULL calls comes through (over RoCE), >>>> both acked, and then the rest just never goes out from the client until the >>>> mount times out and CM is torn down. >>>> >>>> No messages shows up in the kernel log on either side. I was at least >>>> expecting some scary IOMMU warnings. >>>> >>>> More serious hardware is not available for RDMA testing currently, so I >>>> dont >>>> know if a EPYC system or newer mlx5 cards would have similar issues. Intel >>>> I've only tested as server so far, that worked fine, as expected given the >>>> bisect result. 
>>>> >>>> >>>>> git bisect start >>>>> # bad: [d5226fa6dbae0569ee43ecfc08bdcd6770fc4755] Linux 5.5 >>>>> git bisect bad d5226fa6dbae0569ee43ecfc08bdcd6770fc4755 >>>>> # good: [219d54332a09e8d8741c1e1982f5eae56099de85] Linux 5.4 >>>>> git bisect good 219d54332a09e8d8741c1e1982f5eae56099de85 >>>>> # good: [8c39f71ee2019e77ee14f88b1321b2348db51820] Merge >>>>> git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net >>>>> git bisect good 8c39f71ee2019e77ee14f88b1321b2348db51820 >>>>> # bad: [76bb8b05960c3d1668e6bee7624ed886cbd135ba] Merge tag 'kbuild-v5.5' >>>>> of git://git.kernel.org/pub/scm/linux/kernel/git/masahiroy/linux-kbuild >>>>> git bisect bad 76bb8b05960c3d1668e6bee7624ed886cbd135ba >>>>> # good: [21b26d2679584c6a60e861aa3e5ca09a6bab0633] Merge tag >>>>> '5.5-rc-smb3-fixes' of git://git.samba.org/sfrench/cifs-2.6 >>>>> git bisect good 21b26d2679584c6a60e861aa3e5ca09a6bab0633 >>>>> # good: [e5b3fc125d768eacd73bb4dc5019f0ce95635af4] Merge branch >>>>> 'x86-urgent-for-linus' of >>>>> git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip >>>>> git bisect good
Re: AMD IOMMU stops RDMA NFS from working since kernel 5.5 (bisected)
> On Feb 11, 2020, at 10:32 AM, Robin Murphy wrote: > > On 11/02/2020 3:24 pm, Chuck Lever wrote: >>> On Feb 11, 2020, at 10:12 AM, Robin Murphy wrote: >>> >>> On 11/02/2020 1:48 pm, Chuck Lever wrote: >>>> Andre- >>>> Thank you for the detailed report! >>>> Tom- >>>> There is a rich set of trace points available in the RPC/RDMA >>>> implementation in 5.4/5.5, fwiw. >>>> Please keep me in the loop, let me know if there is anything I can do to >>>> help. >>> >>> One aspect that may be worth checking is whether there's anywhere that >>> assumes a successful return value from dma_map_sg() is always the same as >>> the number of entries passed in - that's the most obvious way the iommu-dma >>> code differs (legitimately) from the previous amd-iommu implementation. >> net/sunrpc/xprtrdma/frwr_ops.c: frwr_map() >> 317 mr->mr_nents = >> 318 ib_dma_map_sg(ia->ri_id->device, mr->mr_sg, i, >> mr->mr_dir); >> 319 if (!mr->mr_nents) >> 320 goto out_dmamap_err; >> Should that rather be "if (mr->mr_nents != i)" ? > > No, that much is OK - the point is that dma_map_sg() may pack the DMA > addresses such that sg_dma_len(sg) > sg->length - however, subsequently > passing that mr->nents to dma_unmap_sg() in frwr_mr_recycle() (rather than > the original value of i) looks at a glance like an example of how things may > start to get out-of-whack. Robin, your explanation makes sense to me. I can post a fix for this imbalance later today for Andre to try. -- Chuck Lever ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
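To restate the rule Robin is pointing at: the count returned by ib_dma_map_sg() may be smaller than the nents passed in, because the DMA layer is allowed to coalesce entries, but the matching unmap call must still be given the original nents. A hedged sketch using the ib_dma_* wrappers; the function and variable names are illustrative, not the xprtrdma code:

#include <rdma/ib_verbs.h>

static int example_map_and_unmap(struct ib_device *dev, struct scatterlist *sgl,
				 int sg_nents, enum dma_data_direction dir)
{
	int dma_nents;

	dma_nents = ib_dma_map_sg(dev, sgl, sg_nents, dir);
	if (!dma_nents)
		return -EIO;

	/*
	 * Walk the mapped list with dma_nents, sg_dma_address() and
	 * sg_dma_len(); dma_nents is also the count to hand to ib_map_mr_sg().
	 */

	/* Tear down with the count that was originally passed to the map call. */
	ib_dma_unmap_sg(dev, sgl, sg_nents, dir);	/* not dma_nents */
	return 0;
}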
Re: AMD IOMMU stops RDMA NFS from working since kernel 5.5 (bisected)
> On Feb 11, 2020, at 11:36 AM, Robin Murphy wrote: > > On 11/02/2020 4:03 pm, Chuck Lever wrote: >>> On Feb 11, 2020, at 10:32 AM, Robin Murphy wrote: >>> >>> On 11/02/2020 3:24 pm, Chuck Lever wrote: >>>>> On Feb 11, 2020, at 10:12 AM, Robin Murphy wrote: >>>>> >>>>> On 11/02/2020 1:48 pm, Chuck Lever wrote: >>>>>> Andre- >>>>>> Thank you for the detailed report! >>>>>> Tom- >>>>>> There is a rich set of trace points available in the RPC/RDMA >>>>>> implementation in 5.4/5.5, fwiw. >>>>>> Please keep me in the loop, let me know if there is anything I can do to >>>>>> help. >>>>> >>>>> One aspect that may be worth checking is whether there's anywhere that >>>>> assumes a successful return value from dma_map_sg() is always the same as >>>>> the number of entries passed in - that's the most obvious way the >>>>> iommu-dma code differs (legitimately) from the previous amd-iommu >>>>> implementation. >>>> net/sunrpc/xprtrdma/frwr_ops.c: frwr_map() >>>> 317 mr->mr_nents = >>>> 318 ib_dma_map_sg(ia->ri_id->device, mr->mr_sg, i, >>>> mr->mr_dir); >>>> 319 if (!mr->mr_nents) >>>> 320 goto out_dmamap_err; >>>> Should that rather be "if (mr->mr_nents != i)" ? >>> >>> No, that much is OK - the point is that dma_map_sg() may pack the DMA >>> addresses such that sg_dma_len(sg) > sg->length - however, subsequently >>> passing that mr->nents to dma_unmap_sg() in frwr_mr_recycle() (rather than >>> the original value of i) looks at a glance like an example of how things >>> may start to get out-of-whack. >> Robin, your explanation makes sense to me. I can post a fix for this >> imbalance later today for Andre to try. > > FWIW here's a quick hack which *should* suppress the concatenation behaviour > - if it makes Andre's system any happier then that would indeed point towards > dma_map_sg() handling being the culprit. Even so, 1f541895dae9 ("xprtrdma: Don't defer MR recovery if ro_map fails") looks like it introduced this problem. > Robin. > > ->8- > diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c > index a2e96a5fd9a7..a6b71bad518e 100644 > --- a/drivers/iommu/dma-iommu.c > +++ b/drivers/iommu/dma-iommu.c > @@ -779,7 +779,7 @@ static int __finalise_sg(struct device *dev, struct > scatterlist *sg, int nents, >* - but doesn't fall at a segment boundary >* - and wouldn't make the resulting output segment too long >*/ > - if (cur_len && !s_iova_off && (dma_addr & seg_mask) && > + if (0 && cur_len && !s_iova_off && (dma_addr & seg_mask) && > (max_len - cur_len >= s_length)) { > /* ...then concatenate it with the previous one */ > cur_len += s_length; > @@ -799,6 +799,7 @@ static int __finalise_sg(struct device *dev, struct > scatterlist *sg, int nents, > if (s_length + s_iova_off < s_iova_len) > cur_len = 0; > } > + WARN_ON(count < nents); > return count; > } -- Chuck Lever ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCH v1] xprtrdma: Fix DMA scatter-gather list mapping imbalance
The @nents value that was passed to ib_dma_map_sg() has to be passed
to the matching ib_dma_unmap_sg() call. If ib_dma_map_sg() chooses to
concatenate sg entries, it will return a different nents value than
it was passed.

The bug was exposed by recent changes to the AMD IOMMU driver.

Reported-by: Andre Tomt
Suggested-by: Robin Murphy
Fixes: 1f541895dae9 ("xprtrdma: Don't defer MR recovery if ro_map fails")
Signed-off-by: Chuck Lever
---
 net/sunrpc/xprtrdma/frwr_ops.c |    5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

Hey Andre, please try this out. It just reverts the bit of brokenness
that Robin observed this morning. I've done basic testing here with
Intel IOMMU systems, no change in behavior (ie, all good to go).

diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index 095be887753e..449bb51e4fe8 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -313,10 +313,9 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
 			break;
 	}
 	mr->mr_dir = rpcrdma_data_dir(writing);
+	mr->mr_nents = i;
 
-	mr->mr_nents =
-		ib_dma_map_sg(ia->ri_id->device, mr->mr_sg, i, mr->mr_dir);
-	if (!mr->mr_nents)
+	if (!ib_dma_map_sg(ia->ri_id->device, mr->mr_sg, i, mr->mr_dir))
 		goto out_dmamap_err;
 
 	ibmr = mr->frwr.fr_mr;

___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH v1] xprtrdma: Fix DMA scatter-gather list mapping imbalance
Hi Andre, thanks for trying this out. > On Feb 11, 2020, at 3:50 PM, Andre Tomt wrote: > > On 11.02.2020 20:58, Chuck Lever wrote: >> The @nents value that was passed to ib_dma_map_sg() has to be passed >> to the matching ib_dma_unmap_sg() call. If ib_dma_map_sg() choses to >> concatenate sg entries, it will return a different nents value than >> it was passed. >> The bug was exposed by recent changes to the AMD IOMMU driver. > > This seems to fail differently on my system; mount fails with: > mount.nfs: mount system call failed > > and the kernel log reports: > [ 38.890344] NFS: Registering the id_resolver key type > [ 38.890351] Key type id_resolver registered > [ 38.890352] Key type id_legacy registered > [ 38.901799] NFS: nfs4_discover_server_trunking unhandled error -5. Exiting > with error EIO > [ 38.901817] NFS4: Couldn't follow remote path > > amd_iommu=off still works > > One detail I accidentally left out of the original report is that the server > (intel system) is running Ubuntu 20.04 ("beta") userspace, and AMD clients > are Ubuntu 19.10 userspace. Although I dont believe this to matter at this > point. Next thing to try: # trace-cmd record -e sunrpc -e rpcrdma then issue the mount command. Once it completes, ^C the trace-cmd and send me trace.dat. Try this with both the v5.4 kernel and the v5.5 kernel (and note that trace-cmd overwrites trace.dat, so copy it out between tests). >> Reported-by: Andre Tomt >> Suggested-by: Robin Murphy >> Fixes: 1f541895dae9 ("xprtrdma: Don't defer MR recovery if ro_map fails") >> Signed-off-by: Chuck Lever >> --- >> net/sunrpc/xprtrdma/frwr_ops.c |5 ++--- >> 1 file changed, 2 insertions(+), 3 deletions(-) >> Hey Andre, please try this out. It just reverts the bit of brokenness that >> Robin observed this morning. I've done basic testing here with Intel >> IOMMU systems, no change in behavior (ie, all good to go). >> diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c >> index 095be887753e..449bb51e4fe8 100644 >> --- a/net/sunrpc/xprtrdma/frwr_ops.c >> +++ b/net/sunrpc/xprtrdma/frwr_ops.c >> @@ -313,10 +313,9 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt >> *r_xprt, >> break; >> } >> mr->mr_dir = rpcrdma_data_dir(writing); >> +mr->mr_nents = i; >> - mr->mr_nents = >> -ib_dma_map_sg(ia->ri_id->device, mr->mr_sg, i, mr->mr_dir); >> -if (!mr->mr_nents) >> +if (!ib_dma_map_sg(ia->ri_id->device, mr->mr_sg, i, mr->mr_dir)) >> goto out_dmamap_err; >> ibmr = mr->frwr.fr_mr; > -- Chuck Lever ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCH v2] xprtrdma: Fix DMA scatter-gather list mapping imbalance
The @nents value that was passed to ib_dma_map_sg() has to be passed to the matching ib_dma_unmap_sg() call. If ib_dma_map_sg() choses to concatenate sg entries, it will return a different nents value than it was passed. The bug was exposed by recent changes to the AMD IOMMU driver, which enabled sg entry concatenation. Looking all the way back to 4143f34e01e9 ("xprtrdma: Port to new memory registration API") and reviewing other kernel ULPs, it's not clear that the frwr_map() logic was ever correct for this case. Reported-by: Andre Tomt Suggested-by: Robin Murphy Signed-off-by: Chuck Lever --- include/trace/events/rpcrdma.h |6 -- net/sunrpc/xprtrdma/frwr_ops.c | 13 +++-- 2 files changed, 11 insertions(+), 8 deletions(-) Hi Andre, here's take 2, based on the trace data you sent me. Please let me know if this one fares any better. Changes since v1: - Ensure the correct nents value is passed to ib_map_mr_sg - Record the mr_nents value in the MR trace points diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index c0e4c93324f5..023c5da45999 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -275,6 +275,7 @@ DECLARE_EVENT_CLASS(xprtrdma_mr, TP_STRUCT__entry( __field(const void *, mr) + __field(unsigned int, nents) __field(u32, handle) __field(u32, length) __field(u64, offset) @@ -283,14 +284,15 @@ DECLARE_EVENT_CLASS(xprtrdma_mr, TP_fast_assign( __entry->mr = mr; + __entry->nents = mr->mr_nents; __entry->handle = mr->mr_handle; __entry->length = mr->mr_length; __entry->offset = mr->mr_offset; __entry->dir= mr->mr_dir; ), - TP_printk("mr=%p %u@0x%016llx:0x%08x (%s)", - __entry->mr, __entry->length, + TP_printk("mr=%p %d %u@0x%016llx:0x%08x (%s)", + __entry->mr, __entry->mr_nents, __entry->length, (unsigned long long)__entry->offset, __entry->handle, xprtrdma_show_direction(__entry->dir) ) diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 095be887753e..75617646702b 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -288,8 +288,8 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, { struct rpcrdma_ia *ia = &r_xprt->rx_ia; struct ib_reg_wr *reg_wr; + int i, n, dma_nents; struct ib_mr *ibmr; - int i, n; u8 key; if (nsegs > ia->ri_max_frwr_depth) @@ -313,15 +313,16 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, break; } mr->mr_dir = rpcrdma_data_dir(writing); + mr->mr_nents = i; - mr->mr_nents = - ib_dma_map_sg(ia->ri_id->device, mr->mr_sg, i, mr->mr_dir); - if (!mr->mr_nents) + dma_nents = ib_dma_map_sg(ia->ri_id->device, mr->mr_sg, + mr->mr_nents, mr->mr_dir); + if (!dma_nents) goto out_dmamap_err; ibmr = mr->frwr.fr_mr; - n = ib_map_mr_sg(ibmr, mr->mr_sg, mr->mr_nents, NULL, PAGE_SIZE); - if (unlikely(n != mr->mr_nents)) + n = ib_map_mr_sg(ibmr, mr->mr_sg, dma_nents, NULL, PAGE_SIZE); + if (n != dma_nents) goto out_mapmr_err; ibmr->iova &= 0x; ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH v2] xprtrdma: Fix DMA scatter-gather list mapping imbalance
> On Feb 12, 2020, at 8:43 AM, Chuck Lever wrote: > > The @nents value that was passed to ib_dma_map_sg() has to be passed > to the matching ib_dma_unmap_sg() call. If ib_dma_map_sg() choses to > concatenate sg entries, it will return a different nents value than > it was passed. > > The bug was exposed by recent changes to the AMD IOMMU driver, which > enabled sg entry concatenation. > > Looking all the way back to 4143f34e01e9 ("xprtrdma: Port to new > memory registration API") and reviewing other kernel ULPs, it's not > clear that the frwr_map() logic was ever correct for this case. > > Reported-by: Andre Tomt > Suggested-by: Robin Murphy > Signed-off-by: Chuck Lever > --- > include/trace/events/rpcrdma.h |6 -- > net/sunrpc/xprtrdma/frwr_ops.c | 13 +++-- > 2 files changed, 11 insertions(+), 8 deletions(-) > > Hi Andre, here's take 2, based on the trace data you sent me. > Please let me know if this one fares any better. > > Changes since v1: > - Ensure the correct nents value is passed to ib_map_mr_sg > - Record the mr_nents value in the MR trace points > > diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h > index c0e4c93324f5..023c5da45999 100644 > --- a/include/trace/events/rpcrdma.h > +++ b/include/trace/events/rpcrdma.h > @@ -275,6 +275,7 @@ DECLARE_EVENT_CLASS(xprtrdma_mr, > > TP_STRUCT__entry( > __field(const void *, mr) > + __field(unsigned int, nents) > __field(u32, handle) > __field(u32, length) > __field(u64, offset) > @@ -283,14 +284,15 @@ DECLARE_EVENT_CLASS(xprtrdma_mr, > > TP_fast_assign( > __entry->mr = mr; > + __entry->nents = mr->mr_nents; > __entry->handle = mr->mr_handle; > __entry->length = mr->mr_length; > __entry->offset = mr->mr_offset; > __entry->dir= mr->mr_dir; > ), > > - TP_printk("mr=%p %u@0x%016llx:0x%08x (%s)", > - __entry->mr, __entry->length, > + TP_printk("mr=%p %d %u@0x%016llx:0x%08x (%s)", > + __entry->mr, __entry->mr_nents, __entry->length, This should be: __entry->mr, __entry->nents, __entry->length, Sorry about that. > (unsigned long long)__entry->offset, __entry->handle, > xprtrdma_show_direction(__entry->dir) > ) > diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c > index 095be887753e..75617646702b 100644 > --- a/net/sunrpc/xprtrdma/frwr_ops.c > +++ b/net/sunrpc/xprtrdma/frwr_ops.c > @@ -288,8 +288,8 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt > *r_xprt, > { > struct rpcrdma_ia *ia = &r_xprt->rx_ia; > struct ib_reg_wr *reg_wr; > + int i, n, dma_nents; > struct ib_mr *ibmr; > - int i, n; > u8 key; > > if (nsegs > ia->ri_max_frwr_depth) > @@ -313,15 +313,16 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt > *r_xprt, > break; > } > mr->mr_dir = rpcrdma_data_dir(writing); > + mr->mr_nents = i; > > - mr->mr_nents = > - ib_dma_map_sg(ia->ri_id->device, mr->mr_sg, i, mr->mr_dir); > - if (!mr->mr_nents) > + dma_nents = ib_dma_map_sg(ia->ri_id->device, mr->mr_sg, > + mr->mr_nents, mr->mr_dir); > + if (!dma_nents) > goto out_dmamap_err; > > ibmr = mr->frwr.fr_mr; > - n = ib_map_mr_sg(ibmr, mr->mr_sg, mr->mr_nents, NULL, PAGE_SIZE); > - if (unlikely(n != mr->mr_nents)) > + n = ib_map_mr_sg(ibmr, mr->mr_sg, dma_nents, NULL, PAGE_SIZE); > + if (n != dma_nents) > goto out_mapmr_err; > > ibmr->iova &= 0x; > > -- Chuck Lever ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH v2] xprtrdma: Fix DMA scatter-gather list mapping imbalance
> On Feb 12, 2020, at 11:03 AM, Andre Tomt wrote: > > On 12.02.2020 14:48, Chuck Lever wrote: >>> On Feb 12, 2020, at 8:43 AM, Chuck Lever wrote: >>> >>> The @nents value that was passed to ib_dma_map_sg() has to be passed >>> to the matching ib_dma_unmap_sg() call. If ib_dma_map_sg() choses to >>> concatenate sg entries, it will return a different nents value than >>> it was passed. >>> >>> The bug was exposed by recent changes to the AMD IOMMU driver, which >>> enabled sg entry concatenation. >>> >>> Looking all the way back to 4143f34e01e9 ("xprtrdma: Port to new >>> memory registration API") and reviewing other kernel ULPs, it's not >>> clear that the frwr_map() logic was ever correct for this case. >>> >>> Reported-by: Andre Tomt >>> Suggested-by: Robin Murphy >>> Signed-off-by: Chuck Lever >>> --- >>> include/trace/events/rpcrdma.h |6 -- >>> net/sunrpc/xprtrdma/frwr_ops.c | 13 +++-- >>> 2 files changed, 11 insertions(+), 8 deletions(-) >>> >>> Hi Andre, here's take 2, based on the trace data you sent me. >>> Please let me know if this one fares any better. >>> >>> Changes since v1: >>> - Ensure the correct nents value is passed to ib_map_mr_sg >>> - Record the mr_nents value in the MR trace points > Verified working (with the patch correction) in my environment, with some > quick testing (mount + some random and bulk I/O) > > client, 5.5.3 + patch + amd iommu on = OK > client, 5.5.3 + patch + amd iommu off = OK > client, 5.6-rc1 + patch + amd iommu on = OK > > server, 5.5.3 + patch + intel iommu on = OK Very good! I'll submit the fix through the NFS tree ASAP, and request backport to v5.5 stable. -- Chuck Lever ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [RFT PATCH 0/3] Performance regression noted in v5.11-rc after c062db039f40
> On Jan 26, 2021, at 1:18 AM, Lu Baolu wrote: > > On 2021/1/26 3:31, Chuck Lever wrote: >>> On Jan 25, 2021, at 12:39 PM, Chuck Lever wrote: >>> >>> Hello Lu - >>> >>> Many thanks for your prototype. >>> >>> >>>> On Jan 24, 2021, at 9:38 PM, Lu Baolu wrote: >>>> >>>> This patch series is only for Request-For-Testing purpose. It aims to >>>> fix the performance regression reported here. >>>> >>>> https://lore.kernel.org/linux-iommu/d81314ed-5673-44a6-b597-090e3cb83...@oracle.com/ >>>> >>>> The first two patches are borrowed from here. >>>> >>>> https://lore.kernel.org/linux-iommu/20210107122909.16317-1-yong...@mediatek.com/ >>>> >>>> Please kindly help to verification. >>>> >>>> Best regards, >>>> baolu >>>> >>>> Lu Baolu (1): >>>> iommu/vt-d: Add iotlb_sync_map callback >>>> >>>> Yong Wu (2): >>>> iommu: Move iotlb_sync_map out from __iommu_map >>>> iommu: Add iova and size as parameters in iotlb_sync_map >>>> >>>> drivers/iommu/intel/iommu.c | 86 + >>>> drivers/iommu/iommu.c | 23 +++--- >>>> drivers/iommu/tegra-gart.c | 7 ++- >>>> include/linux/iommu.h | 3 +- >>>> 4 files changed, 83 insertions(+), 36 deletions(-) >>> >>> Here are results with the NFS client at stock v5.11-rc5 and the >>> NFS server at v5.10, showing the regression I reported earlier. >>> >>> Children see throughput for 12 initial writers = 4534582.00 kB/sec >>> Parent sees throughput for 12 initial writers = 4458145.56 kB/sec >>> Min throughput per process = 373101.59 kB/sec >>> Max throughput per process = 382669.50 kB/sec >>> Avg throughput per process = 377881.83 kB/sec >>> Min xfer= 1022720.00 kB >>> CPU Utilization: Wall time2.787CPU time1.922CPU >>> utilization 68.95 % >>> >>> >>> Children see throughput for 12 rewriters= 4542003.12 kB/sec >>> Parent sees throughput for 12 rewriters = 4538024.19 kB/sec >>> Min throughput per process = 374672.00 kB/sec >>> Max throughput per process = 383983.78 kB/sec >>> Avg throughput per process = 378500.26 kB/sec >>> Min xfer= 1022976.00 kB >>> CPU utilization: Wall time2.733CPU time1.947CPU >>> utilization 71.25 % >>> >>> >>> Children see throughput for 12 readers = 4568632.03 kB/sec >>> Parent sees throughput for 12 readers = 4563672.02 kB/sec >>> Min throughput per process = 376727.56 kB/sec >>> Max throughput per process = 383783.91 kB/sec >>> Avg throughput per process = 380719.34 kB/sec >>> Min xfer= 1029376.00 kB >>> CPU utilization: Wall time2.733CPU time1.898CPU >>> utilization 69.46 % >>> >>> >>> Children see throughput for 12 re-readers = 4610702.78 kB/sec >>> Parent sees throughput for 12 re-readers= 4606135.66 kB/sec >>> Min throughput per process = 381532.78 kB/sec >>> Max throughput per process = 387072.53 kB/sec >>> Avg throughput per process = 384225.23 kB/sec >>> Min xfer= 1034496.00 kB >>> CPU utilization: Wall time2.711CPU time1.910CPU >>> utilization 70.45 % >>> >>> Here's the NFS client at v5.11-rc5 with your series applied. >>> The NFS server remains at v5.10: >>> >>> Children see throughput for 12 initial writers = 4434778.81 kB/sec >>> Parent sees throughput for 12 initial writers = 4408190.69 kB/sec >>> Min throughput per process = 367865.28 kB/sec >>> Max throughput per process = 371134.38 kB/sec >>> Avg throughput per process = 369564.90 kB/sec >>> Min xfer
Re: [RFT PATCH 0/3] Performance regression noted in v5.11-rc after c062db039f40
Hi Robin- > On Jan 26, 2021, at 11:05 AM, Robin Murphy wrote: > > Implementing .iotlb_sync_map means that a single top-level > iommu_map()/iommu_map_sg() call should still only invoke a single "TLB flush" > (really, any maintenance required for the IOMMU to start using the new > mapping) at the end, regardless of how many internal __iommu_map() calls are > made to satisfy the overall request. If you're seeing something other than > that behaviour (with this series), that implies we've not got things quite > right yet. The flush is expensive, but it's not the only cost. DMA-mapping a 120KB SGL in a single domain_mapping() call vs. 30 calls is certainly going to be a detectable difference. Naively speaking, if there are more DMA mappings to keep track of because the IOMMU driver isn't coalescing the SGLs the way it did before, that might trigger TLB thrashing on the NIC. > Is there any significant difference between how the NFS read and write paths > make their DMA API calls and/or get their scatterlists in the first place, > that might help shed some light on the curious half-recovery you got? There isn't a difference in the RPC-over-RDMA code. Client-side DMA mapping is handled in net/sunrpc/xprtrdma/frwr_ops.c :: frwr_map() which is used for both I/O directions. On the server, the RDMA core r/w API is used for mapping and then posting RDMA Read and Write work requests. That API appears in drivers/infiniband/core/rw.c , and as far as I understand, the same mapping functions are used for both I/O directions. It's possible that the NIC is doing something different for RDMA Read and RDMA Write, but I don't have much visibility into that. Reads are very different from Writes, which are posted. -- Chuck Lever ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
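Robin's point about .iotlb_sync_map can be pictured as follows: with the callback wired up, the per-segment work is limited to page-table updates, and the flush happens once for the whole mapped range. A condensed sketch, using the extended iotlb_sync_map(domain, iova, size) form proposed in this series; __update_ptes() is a stand-in for the internal __iommu_map() path in drivers/iommu/iommu.c and is only declared here so the fragment is self-contained:

#include <linux/iommu.h>
#include <linux/scatterlist.h>

/* Stand-in for the internal page-table update path (__iommu_map());
 * it performs no TLB maintenance of its own. */
static int __update_ptes(struct iommu_domain *domain, unsigned long iova,
			 phys_addr_t paddr, size_t size, int prot, gfp_t gfp);

static int sketch_map_sg(struct iommu_domain *domain, unsigned long iova,
			 struct scatterlist *sgl, unsigned int nents,
			 int prot, gfp_t gfp)
{
	const struct iommu_ops *ops = domain->ops;
	struct scatterlist *sg;
	size_t mapped = 0;
	unsigned int i;
	int ret;

	for_each_sg(sgl, sg, nents, i) {
		/* page-table update only; no per-segment flush */
		ret = __update_ptes(domain, iova + mapped, sg_phys(sg),
				    sg->length, prot, gfp);
		if (ret)
			return ret;
		mapped += sg->length;
	}

	/* exactly one flush/sync for the whole range */
	if (ops->iotlb_sync_map)
		ops->iotlb_sync_map(domain, iova, mapped);
	return 0;
}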
Re: [RFT PATCH 0/3] Performance regression noted in v5.11-rc after c062db039f40
> On Jan 26, 2021, at 8:53 PM, Lu Baolu wrote: > > Hi Chuck, > > On 1/26/21 11:52 PM, Chuck Lever wrote: >>> On Jan 26, 2021, at 1:18 AM, Lu Baolu wrote: >>> >>> On 2021/1/26 3:31, Chuck Lever wrote: >>>>> On Jan 25, 2021, at 12:39 PM, Chuck Lever wrote: >>>>> >>>>> Hello Lu - >>>>> >>>>> Many thanks for your prototype. >>>>> >>>>> >>>>>> On Jan 24, 2021, at 9:38 PM, Lu Baolu wrote: >>>>>> >>>>>> This patch series is only for Request-For-Testing purpose. It aims to >>>>>> fix the performance regression reported here. >>>>>> >>>>>> https://lore.kernel.org/linux-iommu/d81314ed-5673-44a6-b597-090e3cb83...@oracle.com/ >>>>>> >>>>>> The first two patches are borrowed from here. >>>>>> >>>>>> https://lore.kernel.org/linux-iommu/20210107122909.16317-1-yong...@mediatek.com/ >>>>>> >>>>>> Please kindly help to verification. >>>>>> >>>>>> Best regards, >>>>>> baolu >>>>>> >>>>>> Lu Baolu (1): >>>>>> iommu/vt-d: Add iotlb_sync_map callback >>>>>> >>>>>> Yong Wu (2): >>>>>> iommu: Move iotlb_sync_map out from __iommu_map >>>>>> iommu: Add iova and size as parameters in iotlb_sync_map >>>>>> >>>>>> drivers/iommu/intel/iommu.c | 86 + >>>>>> drivers/iommu/iommu.c | 23 +++--- >>>>>> drivers/iommu/tegra-gart.c | 7 ++- >>>>>> include/linux/iommu.h | 3 +- >>>>>> 4 files changed, 83 insertions(+), 36 deletions(-) >>>>> >>>>> Here are results with the NFS client at stock v5.11-rc5 and the >>>>> NFS server at v5.10, showing the regression I reported earlier. >>>>> >>>>> Children see throughput for 12 initial writers = 4534582.00 kB/sec >>>>> Parent sees throughput for 12 initial writers = 4458145.56 kB/sec >>>>> Min throughput per process = 373101.59 kB/sec >>>>> Max throughput per process = 382669.50 kB/sec >>>>> Avg throughput per process = 377881.83 kB/sec >>>>> Min xfer= 1022720.00 kB >>>>> CPU Utilization: Wall time2.787CPU time1.922CPU >>>>> utilization 68.95 % >>>>> >>>>> >>>>> Children see throughput for 12 rewriters= 4542003.12 kB/sec >>>>> Parent sees throughput for 12 rewriters = 4538024.19 kB/sec >>>>> Min throughput per process = 374672.00 kB/sec >>>>> Max throughput per process = 383983.78 kB/sec >>>>> Avg throughput per process = 378500.26 kB/sec >>>>> Min xfer= 1022976.00 kB >>>>> CPU utilization: Wall time2.733CPU time1.947CPU >>>>> utilization 71.25 % >>>>> >>>>> >>>>> Children see throughput for 12 readers = 4568632.03 kB/sec >>>>> Parent sees throughput for 12 readers = 4563672.02 kB/sec >>>>> Min throughput per process = 376727.56 kB/sec >>>>> Max throughput per process = 383783.91 kB/sec >>>>> Avg throughput per process = 380719.34 kB/sec >>>>> Min xfer= 1029376.00 kB >>>>> CPU utilization: Wall time2.733CPU time1.898CPU >>>>> utilization 69.46 % >>>>> >>>>> >>>>> Children see throughput for 12 re-readers = 4610702.78 kB/sec >>>>> Parent sees throughput for 12 re-readers= 4606135.66 kB/sec >>>>> Min throughput per process = 381532.78 kB/sec >>>>> Max throughput per process = 387072.53 kB/sec >>>>> Avg throughput per process = 384225.23 kB/sec >>>>> Min xfer= 1034496.00 kB >>>>> CPU utilization: Wall time
[PATCH RFC 0/9] Possible set of VT-d optimizations
Hi-

This collection of patches seems to get the best throughput results so
far. The NFS WRITE result is fully restored, and the NFS READ result is
very close to fully restored.

	Children see throughput for 12 initial writers  = 5008474.03 kB/sec
	Parent sees throughput for 12 initial writers   = 4996927.80 kB/sec
	Min throughput per process                      =  416956.88 kB/sec
	Max throughput per process                      =  417910.22 kB/sec
	Avg throughput per process                      =  417372.84 kB/sec
	Min xfer                                        = 1046272.00 kB
	CPU Utilization: Wall time    2.515    CPU time    1.996    CPU utilization  79.37 %

	Children see throughput for 12 rewriters        = 5020584.59 kB/sec
	Parent sees throughput for 12 rewriters         = 5012539.29 kB/sec
	Min throughput per process                      =  417799.00 kB/sec
	Max throughput per process                      =  419082.22 kB/sec
	Avg throughput per process                      =  418382.05 kB/sec
	Min xfer                                        = 1046528.00 kB
	CPU utilization: Wall time    2.507    CPU time    2.024    CPU utilization  80.73 %

	Children see throughput for 12 readers          = 5805484.25 kB/sec
	Parent sees throughput for 12 readers           = 5799535.68 kB/sec
	Min throughput per process                      =  482888.16 kB/sec
	Max throughput per process                      =      48.16 kB/sec
	Avg throughput per process                      =  483790.35 kB/sec
	Min xfer                                        = 1045760.00 kB
	CPU utilization: Wall time    2.167    CPU time    1.964    CPU utilization  90.63 %

	Children see throughput for 12 re-readers       = 5812227.16 kB/sec
	Parent sees throughput for 12 re-readers        = 5803793.06 kB/sec
	Min throughput per process                      =  483242.97 kB/sec
	Max throughput per process                      =  485724.41 kB/sec
	Avg throughput per process                      =  484352.26 kB/sec
	Min xfer                                        = 1043456.00 kB
	CPU utilization: Wall time    2.161    CPU time    1.976    CPU utilization  91.45 %

I've included a simple-minded implementation of a map_sg op for the
Intel IOMMU. This is nothing more than a copy of the loop in
__iommu_map_sg() with the call to __iommu_map() replaced with a call
to intel_iommu_map().

---

Chuck Lever (1):
      iommu/vt-d: Introduce map_sg() for Intel IOMMUs

Isaac J. Manjarres (5):
      iommu/io-pgtable: Introduce map_sg() as a page table op
      iommu/io-pgtable-arm: Hook up map_sg()
      iommu/io-pgtable-arm-v7s: Hook up map_sg()
      iommu: Introduce map_sg() as an IOMMU op for IOMMU drivers
      iommu/arm-smmu: Hook up map_sg()

Lu Baolu (1):
      iommu/vt-d: Add iotlb_sync_map callback

Yong Wu (2):
      iommu: Move iotlb_sync_map out from __iommu_map
      iommu: Add iova and size as parameters in iotlb_sync_map

 drivers/iommu/arm/arm-smmu/arm-smmu.c |   19
 drivers/iommu/intel/iommu.c           |  131 --
 drivers/iommu/io-pgtable-arm-v7s.c    |   90 ++
 drivers/iommu/io-pgtable-arm.c        |   86 +
 drivers/iommu/iommu.c                 |   47 +++--
 drivers/iommu/tegra-gart.c            |    7 +-
 include/linux/iommu.h                 |   16 +++-
 7 files changed, 353 insertions(+), 43 deletions(-)

--
Chuck Lever

___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu
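The "simple-minded" Intel map_sg op mentioned in the cover letter (patch 9/9, whose body is not reproduced in this digest) would look roughly like the following: the segment-walking loop from __iommu_map_sg() with the __iommu_map() call replaced by intel_iommu_map(), sitting alongside intel_iommu_map() in drivers/iommu/intel/iommu.c. Signatures are assumed from the neighbouring patches; treat this as an outline, not the posted patch.

static int intel_iommu_map_sg(struct iommu_domain *domain, unsigned long iova,
			      struct scatterlist *sg, unsigned int nents,
			      int prot, gfp_t gfp, size_t *mapped)
{
	size_t len = 0;
	unsigned int i = 0;
	phys_addr_t start;
	int ret;

	*mapped = 0;

	while (i <= nents) {
		phys_addr_t s_phys = sg_phys(sg);

		/* flush out the current run of physically contiguous segments */
		if (len && s_phys != start + len) {
			ret = intel_iommu_map(domain, iova + *mapped, start,
					      len, prot, gfp);
			if (ret)
				return ret;
			*mapped += len;
			len = 0;
		}

		if (len) {
			len += sg->length;
		} else {
			len = sg->length;
			start = s_phys;
		}

		if (++i < nents)
			sg = sg_next(sg);
	}
	return 0;
}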
[PATCH RFC 1/9] iommu: Move iotlb_sync_map out from __iommu_map
From: Yong Wu In the end of __iommu_map, It alway call iotlb_sync_map. This patch moves iotlb_sync_map out from __iommu_map since it is unnecessary to call this for each sg segment especially iotlb_sync_map is flush tlb all currently. Add a little helper _iommu_map for this. Signed-off-by: Yong Wu Reviewed-by: Robin Murphy Signed-off-by: Chuck Lever --- drivers/iommu/iommu.c | 23 ++- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index ffeebda8d6de..c304a6a30d42 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2426,9 +2426,6 @@ static int __iommu_map(struct iommu_domain *domain, unsigned long iova, size -= pgsize; } - if (ops->iotlb_sync_map) - ops->iotlb_sync_map(domain); - /* unroll mapping in case something went wrong */ if (ret) iommu_unmap(domain, orig_iova, orig_size - size); @@ -2438,18 +2435,31 @@ static int __iommu_map(struct iommu_domain *domain, unsigned long iova, return ret; } +static int _iommu_map(struct iommu_domain *domain, unsigned long iova, + phys_addr_t paddr, size_t size, int prot, gfp_t gfp) +{ + const struct iommu_ops *ops = domain->ops; + int ret; + + ret = __iommu_map(domain, iova, paddr, size, prot, GFP_KERNEL); + if (ret == 0 && ops->iotlb_sync_map) + ops->iotlb_sync_map(domain); + + return ret; +} + int iommu_map(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, size_t size, int prot) { might_sleep(); - return __iommu_map(domain, iova, paddr, size, prot, GFP_KERNEL); + return _iommu_map(domain, iova, paddr, size, prot, GFP_KERNEL); } EXPORT_SYMBOL_GPL(iommu_map); int iommu_map_atomic(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, size_t size, int prot) { - return __iommu_map(domain, iova, paddr, size, prot, GFP_ATOMIC); + return _iommu_map(domain, iova, paddr, size, prot, GFP_ATOMIC); } EXPORT_SYMBOL_GPL(iommu_map_atomic); @@ -2533,6 +2543,7 @@ static size_t __iommu_map_sg(struct iommu_domain *domain, unsigned long iova, struct scatterlist *sg, unsigned int nents, int prot, gfp_t gfp) { + const struct iommu_ops *ops = domain->ops; size_t len = 0, mapped = 0; phys_addr_t start; unsigned int i = 0; @@ -2563,6 +2574,8 @@ static size_t __iommu_map_sg(struct iommu_domain *domain, unsigned long iova, sg = sg_next(sg); } + if (ops->iotlb_sync_map) + ops->iotlb_sync_map(domain); return mapped; out_err: ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCH RFC 2/9] iommu: Add iova and size as parameters in iotlb_sync_map
From: Yong Wu iotlb_sync_map allow IOMMU drivers tlb sync after completing the whole mapping. This patch adds iova and size as the parameters in it. then the IOMMU driver could flush tlb with the whole range once after iova mapping to improve performance. Signed-off-by: Yong Wu Reviewed-by: Robin Murphy Signed-off-by: Chuck Lever --- drivers/iommu/iommu.c |4 ++-- drivers/iommu/tegra-gart.c |7 +-- include/linux/iommu.h |3 ++- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index c304a6a30d42..3d099a31ddca 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2443,7 +2443,7 @@ static int _iommu_map(struct iommu_domain *domain, unsigned long iova, ret = __iommu_map(domain, iova, paddr, size, prot, GFP_KERNEL); if (ret == 0 && ops->iotlb_sync_map) - ops->iotlb_sync_map(domain); + ops->iotlb_sync_map(domain, iova, size); return ret; } @@ -2575,7 +2575,7 @@ static size_t __iommu_map_sg(struct iommu_domain *domain, unsigned long iova, } if (ops->iotlb_sync_map) - ops->iotlb_sync_map(domain); + ops->iotlb_sync_map(domain, iova, mapped); return mapped; out_err: diff --git a/drivers/iommu/tegra-gart.c b/drivers/iommu/tegra-gart.c index fac720273889..05e8e19b8269 100644 --- a/drivers/iommu/tegra-gart.c +++ b/drivers/iommu/tegra-gart.c @@ -261,7 +261,8 @@ static int gart_iommu_of_xlate(struct device *dev, return 0; } -static void gart_iommu_sync_map(struct iommu_domain *domain) +static void gart_iommu_sync_map(struct iommu_domain *domain, unsigned long iova, + size_t size) { FLUSH_GART_REGS(gart_handle); } @@ -269,7 +270,9 @@ static void gart_iommu_sync_map(struct iommu_domain *domain) static void gart_iommu_sync(struct iommu_domain *domain, struct iommu_iotlb_gather *gather) { - gart_iommu_sync_map(domain); + size_t length = gather->end - gather->start; + + gart_iommu_sync_map(domain, gather->start, length); } static const struct iommu_ops gart_iommu_ops = { diff --git a/include/linux/iommu.h b/include/linux/iommu.h index b3f0e2018c62..9ce0aa9e236b 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -246,7 +246,8 @@ struct iommu_ops { size_t (*unmap)(struct iommu_domain *domain, unsigned long iova, size_t size, struct iommu_iotlb_gather *iotlb_gather); void (*flush_iotlb_all)(struct iommu_domain *domain); - void (*iotlb_sync_map)(struct iommu_domain *domain); + void (*iotlb_sync_map)(struct iommu_domain *domain, unsigned long iova, + size_t size); void (*iotlb_sync)(struct iommu_domain *domain, struct iommu_iotlb_gather *iotlb_gather); phys_addr_t (*iova_to_phys)(struct iommu_domain *domain, dma_addr_t iova); ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCH RFC 6/9] iommu/io-pgtable-arm-v7s: Hook up map_sg()
From: Isaac J. Manjarres Implement the map_sg io-pgtable op for the ARMv7s io-pgtable code, so that IOMMU drivers can call it when they need to map a scatter-gather list. Signed-off-by: Isaac J. Manjarres Tested-by: Sai Prakash Ranjan Signed-off-by: Chuck Lever --- drivers/iommu/io-pgtable-arm-v7s.c | 90 1 file changed, 90 insertions(+) diff --git a/drivers/iommu/io-pgtable-arm-v7s.c b/drivers/iommu/io-pgtable-arm-v7s.c index 1d92ac948db7..8665dabb753b 100644 --- a/drivers/iommu/io-pgtable-arm-v7s.c +++ b/drivers/iommu/io-pgtable-arm-v7s.c @@ -545,6 +545,95 @@ static int arm_v7s_map(struct io_pgtable_ops *ops, unsigned long iova, return ret; } +static int arm_v7s_map_by_pgsize(struct io_pgtable_ops *ops, +unsigned long iova, phys_addr_t paddr, +size_t size, int prot, gfp_t gfp, +size_t *mapped) +{ + struct arm_v7s_io_pgtable *data = io_pgtable_ops_to_data(ops); + struct io_pgtable *iop = &data->iop; + struct io_pgtable_cfg *cfg = &iop->cfg; + unsigned int min_pagesz = 1 << __ffs(cfg->pgsize_bitmap); + int ret; + size_t pgsize; + + if (!IS_ALIGNED(iova | paddr | size, min_pagesz)) { + pr_err("unaligned: iova 0x%lx pa %pa size 0x%zx min_pagesz 0x%x\n", + iova, &paddr, size, min_pagesz); + return -EINVAL; + } + + if (WARN_ON((iova + size - 1) >= (1ULL << cfg->ias) || + (paddr + size - 1) >= (1ULL << cfg->oas))) + return -ERANGE; + + while (size) { + pgsize = iommu_pgsize(cfg->pgsize_bitmap, iova | paddr, size); + ret = __arm_v7s_map(data, iova, paddr, pgsize, prot, 1, + data->pgd, gfp); + + if (iop->cfg.quirks & IO_PGTABLE_QUIRK_TLBI_ON_MAP) { + io_pgtable_tlb_flush_walk(&data->iop, iova, size, + ARM_V7S_BLOCK_SIZE(2)); + } else { + wmb(); + } + + if (ret) + return ret; + + iova += pgsize; + paddr += pgsize; + *mapped += pgsize; + size -= pgsize; + } + + return 0; +} + +static int arm_v7s_map_sg(struct io_pgtable_ops *ops, unsigned long iova, + struct scatterlist *sg, unsigned int nents, + int iommu_prot, gfp_t gfp, size_t *mapped) +{ + size_t len = 0; + unsigned int i = 0; + int ret; + phys_addr_t start; + + *mapped = 0; + + /* If no access, then nothing to do */ + if (!(iommu_prot & (IOMMU_READ | IOMMU_WRITE))) + return 0; + + while (i <= nents) { + phys_addr_t s_phys = sg_phys(sg); + + if (len && s_phys != start + len) { + ret = arm_v7s_map_by_pgsize(ops, iova + *mapped, start, + len, iommu_prot, gfp, + mapped); + + if (ret) + return ret; + + len = 0; + } + + if (len) { + len += sg->length; + } else { + len = sg->length; + start = s_phys; + } + + if (++i < nents) + sg = sg_next(sg); + } + + return 0; +} + static void arm_v7s_free_pgtable(struct io_pgtable *iop) { struct arm_v7s_io_pgtable *data = io_pgtable_to_data(iop); @@ -783,6 +872,7 @@ static struct io_pgtable *arm_v7s_alloc_pgtable(struct io_pgtable_cfg *cfg, data->iop.ops = (struct io_pgtable_ops) { .map= arm_v7s_map, + .map_sg = arm_v7s_map_sg, .unmap = arm_v7s_unmap, .iova_to_phys = arm_v7s_iova_to_phys, }; ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
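The while (i <= nents) walk above folds physically contiguous scatterlist entries into a single map call and re-examines the final entry once so the last run is emitted. A small userspace toy model of that decision logic, with made-up segment data, shows the effect:

#include <stdio.h>
#include <stddef.h>

struct seg { unsigned long long phys; size_t len; };

int main(void)
{
	/* 0x1000+0x1000 runs into 0x2000; 0x8000 starts a new run */
	struct seg segs[] = {
		{ 0x1000, 0x1000 }, { 0x2000, 0x1000 }, { 0x8000, 0x2000 },
	};
	size_t nents = sizeof(segs) / sizeof(segs[0]);
	unsigned long long start = 0;
	size_t len = 0, i = 0;

	while (i <= nents) {
		/* re-examine the last entry once so the final run is emitted */
		unsigned long long s_phys = segs[i < nents ? i : nents - 1].phys;

		if (len && s_phys != start + len) {
			printf("map phys %#llx len %#zx\n", start, len);
			len = 0;
		}
		if (len) {
			len += segs[i].len;
		} else if (i < nents) {
			len = segs[i].len;
			start = s_phys;
		}
		i++;
	}
	return 0;	/* prints two map calls for three scatterlist entries */
}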
[PATCH RFC 5/9] iommu/io-pgtable-arm: Hook up map_sg()
From: Isaac J. Manjarres Implement the map_sg io-pgtable op for the ARM LPAE io-pgtable code, so that IOMMU drivers can call it when they need to map a scatter-gather list. Signed-off-by: Isaac J. Manjarres Tested-by: Sai Prakash Ranjan Signed-off-by: Chuck Lever --- drivers/iommu/io-pgtable-arm.c | 86 drivers/iommu/iommu.c | 12 +++--- include/linux/iommu.h |8 3 files changed, 101 insertions(+), 5 deletions(-) diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index 87def58e79b5..0c11529442b8 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -473,6 +473,91 @@ static int arm_lpae_map(struct io_pgtable_ops *ops, unsigned long iova, return ret; } +static int arm_lpae_map_by_pgsize(struct io_pgtable_ops *ops, + unsigned long iova, phys_addr_t paddr, + size_t size, int iommu_prot, gfp_t gfp, + size_t *mapped) +{ + struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops); + struct io_pgtable_cfg *cfg = &data->iop.cfg; + arm_lpae_iopte *ptep = data->pgd; + int ret, lvl = data->start_level; + arm_lpae_iopte prot = arm_lpae_prot_to_pte(data, iommu_prot); + unsigned int min_pagesz = 1 << __ffs(cfg->pgsize_bitmap); + long iaext = (s64)(iova + size - 1) >> cfg->ias; + size_t pgsize; + + if (!IS_ALIGNED(iova | paddr | size, min_pagesz)) { + pr_err("unaligned: iova 0x%lx pa %pa size 0x%zx min_pagesz 0x%x\n", + iova, &paddr, size, min_pagesz); + return -EINVAL; + } + + if (cfg->quirks & IO_PGTABLE_QUIRK_ARM_TTBR1) + iaext = ~iaext; + if (WARN_ON(iaext || (paddr + size - 1) >> cfg->oas)) + return -ERANGE; + + while (size) { + pgsize = iommu_pgsize(cfg->pgsize_bitmap, iova | paddr, size); + ret = __arm_lpae_map(data, iova, paddr, pgsize, prot, lvl, ptep, +gfp); + if (ret) + return ret; + + iova += pgsize; + paddr += pgsize; + *mapped += pgsize; + size -= pgsize; + } + + return 0; +} + +static int arm_lpae_map_sg(struct io_pgtable_ops *ops, unsigned long iova, + struct scatterlist *sg, unsigned int nents, + int iommu_prot, gfp_t gfp, size_t *mapped) +{ + + size_t len = 0; + unsigned int i = 0; + int ret; + phys_addr_t start; + + *mapped = 0; + + /* If no access, then nothing to do */ + if (!(iommu_prot & (IOMMU_READ | IOMMU_WRITE))) + return 0; + + while (i <= nents) { + phys_addr_t s_phys = sg_phys(sg); + + if (len && s_phys != start + len) { + ret = arm_lpae_map_by_pgsize(ops, iova + *mapped, start, +len, iommu_prot, gfp, +mapped); + + if (ret) + return ret; + + len = 0; + } + + if (len) { + len += sg->length; + } else { + len = sg->length; + start = s_phys; + } + + if (++i < nents) + sg = sg_next(sg); + } + + return 0; +} + static void __arm_lpae_free_pgtable(struct arm_lpae_io_pgtable *data, int lvl, arm_lpae_iopte *ptep) { @@ -750,6 +835,7 @@ arm_lpae_alloc_pgtable(struct io_pgtable_cfg *cfg) data->iop.ops = (struct io_pgtable_ops) { .map= arm_lpae_map, + .map_sg = arm_lpae_map_sg, .unmap = arm_lpae_unmap, .iova_to_phys = arm_lpae_iova_to_phys, }; diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 3d099a31ddca..ed879a4d7fac 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2346,8 +2346,8 @@ phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain, dma_addr_t iova) } EXPORT_SYMBOL_GPL(iommu_iova_to_phys); -static size_t iommu_pgsize(struct iommu_domain *domain, - unsigned long addr_merge, size_t size) +size_t iommu_pgsize(unsigned long pgsize_bitmap, unsigned long addr_merge, + size_t size) { unsigned int pgsize_idx; size_t pgsize; @@ -2366,7 +2366,7 @@ static size_t iommu_pgsize(struct iommu_domain 
*domain, pgsize = (1UL << (pgsize_idx + 1)) - 1; /* throw away page sizes not supp
[PATCH RFC 7/9] iommu: Introduce map_sg() as an IOMMU op for IOMMU drivers
From: Isaac J. Manjarres Add support for IOMMU drivers to have their own map_sg() callbacks. This completes the path for having iommu_map_sg() invoke an IOMMU driver's map_sg() callback, which can then invoke the io-pgtable map_sg() callback with the entire scatter-gather list, so that it can be processed entirely in the io-pgtable layer. For IOMMU drivers that do not provide a callback, the default implementation of iterating through the scatter-gather list, while calling iommu_map() will be used. Signed-off-by: Isaac J. Manjarres Tested-by: Sai Prakash Ranjan [ cel: adjusted new iotlb_sync_map call site ] Signed-off-by: Chuck Lever --- drivers/iommu/iommu.c | 12 include/linux/iommu.h |5 + 2 files changed, 17 insertions(+) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index ed879a4d7fac..bd7adbd0339b 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2551,6 +2551,18 @@ static size_t __iommu_map_sg(struct iommu_domain *domain, unsigned long iova, unsigned int i = 0; int ret; + if (ops->map_sg) { + ret = ops->map_sg(domain, iova, sg, nents, prot, gfp, &mapped); + + if (ops->iotlb_sync_map) + ops->iotlb_sync_map(domain, iova, mapped); + + if (ret) + goto out_err; + + return mapped; + } + while (i <= nents) { phys_addr_t s_phys = sg_phys(sg); diff --git a/include/linux/iommu.h b/include/linux/iommu.h index cd5f35022a25..667edc7b034a 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -192,6 +192,8 @@ struct iommu_iotlb_gather { * @attach_dev: attach device to an iommu domain * @detach_dev: detach device from an iommu domain * @map: map a physically contiguous memory region to an iommu domain + * @map_sg: map a scatter-gather list of physically contiguous chunks to + * an iommu domain. * @unmap: unmap a physically contiguous memory region from an iommu domain * @flush_iotlb_all: Synchronously flush all hardware TLBs for this domain * @iotlb_sync_map: Sync mappings created recently using @map to the hardware @@ -243,6 +245,9 @@ struct iommu_ops { void (*detach_dev)(struct iommu_domain *domain, struct device *dev); int (*map)(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, size_t size, int prot, gfp_t gfp); + int (*map_sg)(struct iommu_domain *domain, unsigned long iova, + struct scatterlist *sg, unsigned int nents, int prot, + gfp_t gfp, size_t *mapped); size_t (*unmap)(struct iommu_domain *domain, unsigned long iova, size_t size, struct iommu_iotlb_gather *iotlb_gather); void (*flush_iotlb_all)(struct iommu_domain *domain); ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCH RFC 3/9] iommu/vt-d: Add iotlb_sync_map callback
From: Lu Baolu Some Intel VT-d hardware implementations don't support memory coherency for page table walk (presented by the Page-Walk-coherency bit in the ecap register), so that software must flush the corresponding CPU cache lines explicitly after each page table entry update. The iommu_map_sg() code iterates through the given scatter-gather list and invokes iommu_map() for each element in the scatter-gather list, which calls into the vendor IOMMU driver through iommu_ops callback. As the result, a single sg mapping may lead to multiple cache line flushes, which leads to the degradation of I/O performance after the commit ("iommu/vt-d: Convert intel iommu driver to the iommu ops"). Fix this by adding iotlb_sync_map callback and centralizing the clflush operations after all sg mappings. Fixes: c588072bba6b5 ("iommu/vt-d: Convert intel iommu driver to the iommu ops") Reported-by: Chuck Lever Link: https://lore.kernel.org/linux-iommu/d81314ed-5673-44a6-b597-090e3cb83...@oracle.com/ Signed-off-by: Lu Baolu Cc: Robin Murphy [ cel: removed @first_pte, which is no longer used ] Signed-off-by: Chuck Lever --- drivers/iommu/intel/iommu.c | 90 +-- 1 file changed, 60 insertions(+), 30 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index f665322a0991..013097b6d55f 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -2298,9 +2298,9 @@ static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, unsigned long phys_pfn, unsigned long nr_pages, int prot) { - struct dma_pte *first_pte = NULL, *pte = NULL; unsigned int largepage_lvl = 0; unsigned long lvl_pages = 0; + struct dma_pte *pte = NULL; phys_addr_t pteval; u64 attr; @@ -2322,7 +2322,7 @@ __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, nr_pages); - first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl); + pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl); if (!pte) return -ENOMEM; /* It is large page*/ @@ -2383,34 +2383,14 @@ __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, * recalculate 'pte' and switch back to smaller pages for the * end of the mapping, if the trailing size is not enough to * use another superpage (i.e. nr_pages < lvl_pages). +* +* We leave clflush for the leaf pte changes to iotlb_sync_map() +* callback. 
*/ pte++; if (!nr_pages || first_pte_in_page(pte) || - (largepage_lvl > 1 && nr_pages < lvl_pages)) { - domain_flush_cache(domain, first_pte, - (void *)pte - (void *)first_pte); + (largepage_lvl > 1 && nr_pages < lvl_pages)) pte = NULL; - } - } - - return 0; -} - -static int -domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, - unsigned long phys_pfn, unsigned long nr_pages, int prot) -{ - int iommu_id, ret; - struct intel_iommu *iommu; - - /* Do the real mapping first */ - ret = __domain_mapping(domain, iov_pfn, phys_pfn, nr_pages, prot); - if (ret) - return ret; - - for_each_domain_iommu(iommu_id, domain) { - iommu = g_iommus[iommu_id]; - __mapping_notify_one(iommu, domain, iov_pfn, nr_pages); } return 0; @@ -4943,7 +4923,6 @@ static int intel_iommu_map(struct iommu_domain *domain, struct dmar_domain *dmar_domain = to_dmar_domain(domain); u64 max_addr; int prot = 0; - int ret; if (iommu_prot & IOMMU_READ) prot |= DMA_PTE_READ; @@ -4969,9 +4948,8 @@ static int intel_iommu_map(struct iommu_domain *domain, /* Round up size to next multiple of PAGE_SIZE, if it and the low bits of hpa would take us onto the next page */ size = aligned_nrpages(hpa, size); - ret = domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT, -hpa >> VTD_PAGE_SHIFT, size, prot); - return ret; + return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT, + hpa >> VTD_PAGE_SHIFT, size, prot); } static size_t intel_iommu_unmap(struct iommu_domain *domain, @@ -5478,6 +5456,57 @@ static bool risky_device(struct pci_dev *pdev) return false; } +static void clflush_sync_map(s
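The clflush_sync_map() body is cut off above, so the following is only a
toy model of the point the patch is making, not the function from the
patch: on hardware without page-walk coherency, every updated PTE must be
flushed from the CPU cache, and flushing once per cache line across the
whole updated range is far cheaper than flushing after every per-page
map call. flush_cache_line() and flush_pte_range() are hypothetical
stand-ins.

#include <stdio.h>
#include <stdint.h>

#define CACHELINE	64
#define PTE_SIZE	8

static unsigned long flushes;

static void flush_cache_line(uintptr_t addr)
{
	(void)addr;		/* stand-in for clflush/clwb on real hardware */
	flushes++;
}

/* Flush every cache line overlapping [ptes, ptes + nr * PTE_SIZE) once. */
static void flush_pte_range(uintptr_t ptes, unsigned long nr)
{
	uintptr_t end = ptes + nr * PTE_SIZE;
	uintptr_t line;

	for (line = ptes & ~(uintptr_t)(CACHELINE - 1); line < end;
	     line += CACHELINE)
		flush_cache_line(line);
}

int main(void)
{
	unsigned long nr_ptes = 512;	/* one 4KiB page-table page */

	flushes = 0;
	flush_pte_range(0x1000, nr_ptes);	/* batched: once per line */
	printf("batched:  %lu flushes\n", flushes);

	flushes = 0;
	for (unsigned long i = 0; i < nr_ptes; i++)	/* one flush per map */
		flush_pte_range(0x1000 + i * PTE_SIZE, 1);
	printf("per-map:  %lu flushes\n", flushes);
	return 0;
}

With 8-byte PTEs and 64-byte cache lines this prints 64 flushes for the
batched case against 512 for the per-map case, which is the saving the
iotlb_sync_map callback is after.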
[PATCH RFC 8/9] iommu/arm-smmu: Hook up map_sg()
From: Isaac J. Manjarres Now that everything is in place for iommu_map_sg() to defer mapping a scatter-gather list to the io-pgtable layer, implement the map_sg() callback in the SMMU driver, so that iommu_map_sg() can invoke it with the entire scatter-gather list that will be mapped. Signed-off-by: Isaac J. Manjarres Tested-by: Sai Prakash Ranjan Signed-off-by: Chuck Lever --- drivers/iommu/arm/arm-smmu/arm-smmu.c | 19 +++ 1 file changed, 19 insertions(+) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index d8c6bfde6a61..52acc6858512 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -1208,6 +1208,24 @@ static int arm_smmu_map(struct iommu_domain *domain, unsigned long iova, return ret; } +static int arm_smmu_map_sg(struct iommu_domain *domain, unsigned long iova, + struct scatterlist *sg, unsigned int nents, int prot, + gfp_t gfp, size_t *mapped) +{ + struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops; + struct arm_smmu_device *smmu = to_smmu_domain(domain)->smmu; + int ret; + + if (!ops) + return -ENODEV; + + arm_smmu_rpm_get(smmu); + ret = ops->map_sg(ops, iova, sg, nents, prot, gfp, mapped); + arm_smmu_rpm_put(smmu); + + return ret; +} + static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova, size_t size, struct iommu_iotlb_gather *gather) { @@ -1624,6 +1642,7 @@ static struct iommu_ops arm_smmu_ops = { .domain_free= arm_smmu_domain_free, .attach_dev = arm_smmu_attach_dev, .map= arm_smmu_map, + .map_sg = arm_smmu_map_sg, .unmap = arm_smmu_unmap, .flush_iotlb_all= arm_smmu_flush_iotlb_all, .iotlb_sync = arm_smmu_iotlb_sync, ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
[PATCH RFC 4/9] iommu/io-pgtable: Introduce map_sg() as a page table op
From: Isaac J. Manjarres

While mapping a scatter-gather list, iommu_map_sg() calls into the IOMMU
driver through an indirect call, which can call into the io-pgtable code
through another indirect call. This sequence of going through the IOMMU
core code, the IOMMU driver, and finally the io-pgtable code occurs, in
the worst case, for every element in the scatter-gather list, which is
not optimal.

Introduce a map_sg callback in the io-pgtable ops so that IOMMU drivers
can invoke it with the complete scatter-gather list. The list can then be
processed entirely within the io-pgtable code, reducing the number of
indirect calls and boosting overall iommu_map_sg() performance.

Signed-off-by: Isaac J. Manjarres
Tested-by: Sai Prakash Ranjan
Signed-off-by: Chuck Lever
---
 include/linux/io-pgtable.h | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h
index ea727eb1a1a9..6d0e73172603 100644
--- a/include/linux/io-pgtable.h
+++ b/include/linux/io-pgtable.h
@@ -147,6 +147,9 @@ struct io_pgtable_cfg {
  * struct io_pgtable_ops - Page table manipulation API for IOMMU drivers.
  *
  * @map:          Map a physically contiguous memory region.
+ * @map_sg:       Map a scatter-gather list of physically contiguous memory
+ *                chunks. The mapped pointer argument is used to store how
+ *                many bytes are mapped.
  * @unmap:        Unmap a physically contiguous memory region.
  * @iova_to_phys: Translate iova to physical address.
  *
@@ -156,6 +159,9 @@ struct io_pgtable_cfg {
 struct io_pgtable_ops {
 	int (*map)(struct io_pgtable_ops *ops, unsigned long iova,
 		   phys_addr_t paddr, size_t size, int prot, gfp_t gfp);
+	int (*map_sg)(struct io_pgtable_ops *ops, unsigned long iova,
+		      struct scatterlist *sg, unsigned int nents, int prot,
+		      gfp_t gfp, size_t *mapped);
 	size_t (*unmap)(struct io_pgtable_ops *ops, unsigned long iova,
 			size_t size, struct iommu_iotlb_gather *gather);
 	phys_addr_t (*iova_to_phys)(struct io_pgtable_ops *ops,
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu
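A small stand-alone model of the call-count argument in the description
above, with invented types and names (this is not the io-pgtable code):
a per-element path takes one indirect hop into the page-table code for
every sg entry, while a list-at-a-time op takes a single indirect hop and
then loops using direct calls.

#include <stdio.h>

static unsigned long indirect_calls;

struct pgtbl_ops {
	int (*map)(unsigned long iova, unsigned long paddr, unsigned long len);
	int (*map_sg)(unsigned long iova, unsigned int nents);
};

static int pgtbl_map_one(unsigned long iova, unsigned long paddr,
			 unsigned long len)
{
	(void)iova; (void)paddr; (void)len;	/* page tables would be updated here */
	return 0;
}

static int pgtbl_map(unsigned long iova, unsigned long paddr,
		     unsigned long len)
{
	indirect_calls++;		/* reached via ops->map, per element */
	return pgtbl_map_one(iova, paddr, len);
}

static int pgtbl_map_sg(unsigned long iova, unsigned int nents)
{
	indirect_calls++;		/* reached via ops->map_sg, once */
	for (unsigned int i = 0; i < nents; i++)
		pgtbl_map_one(iova + i * 4096, i * 4096, 4096);	/* direct */
	return 0;
}

static const struct pgtbl_ops ops = { pgtbl_map, pgtbl_map_sg };

int main(void)
{
	unsigned int nents = 30;	/* e.g. a 120KiB payload in 4KiB pages */

	indirect_calls = 0;
	for (unsigned int i = 0; i < nents; i++)
		ops.map(i * 4096, i * 4096, 4096);
	printf("per-element: %lu indirect calls into the page-table code\n",
	       indirect_calls);

	indirect_calls = 0;
	ops.map_sg(0, nents);
	printf("map_sg:      %lu indirect call into the page-table code\n",
	       indirect_calls);
	return 0;
}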
[PATCH RFC 9/9] iommu/vt-d: Introduce map_sg() for Intel IOMMUs
Attempt to reduce indirect call overhead when mapping a substantial
scatter-gather list.

Signed-off-by: Chuck Lever
---
 drivers/iommu/intel/iommu.c | 37 +++++++++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 013097b6d55f..deae39f1477a 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -4952,6 +4952,42 @@ static int intel_iommu_map(struct iommu_domain *domain,
 				hpa >> VTD_PAGE_SHIFT, size, prot);
 }
 
+static int intel_iommu_map_sg(struct iommu_domain *domain, unsigned long iova,
+			      struct scatterlist *sg, unsigned int nents,
+			      int prot, gfp_t gfp, size_t *mapped)
+{
+	unsigned int i = 0;
+	phys_addr_t start;
+	size_t len = 0;
+	int ret;
+
+	while (i <= nents) {
+		phys_addr_t s_phys = sg_phys(sg);
+
+		if (len && s_phys != start + len) {
+			ret = intel_iommu_map(domain, iova + *mapped, start,
+					      len, prot, gfp);
+			if (ret)
+				return ret;
+
+			*mapped += len;
+			len = 0;
+		}
+
+		if (len) {
+			len += sg->length;
+		} else {
+			len = sg->length;
+			start = s_phys;
+		}
+
+		if (++i < nents)
+			sg = sg_next(sg);
+	}
+
+	return 0;
+}
+
 static size_t intel_iommu_unmap(struct iommu_domain *domain,
 				unsigned long iova, size_t size,
 				struct iommu_iotlb_gather *gather)
@@ -5519,6 +,7 @@ const struct iommu_ops intel_iommu_ops = {
 	.aux_detach_dev		= intel_iommu_aux_detach_device,
 	.aux_get_pasid		= intel_iommu_aux_get_pasid,
 	.map			= intel_iommu_map,
+	.map_sg			= intel_iommu_map_sg,
 	.iotlb_sync_map		= intel_iommu_iotlb_sync_map,
 	.unmap			= intel_iommu_unmap,
 	.flush_iotlb_all	= intel_flush_iotlb_all,
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH RFC 0/9] Possible set of VT-d optimizations
> On Jan 28, 2021, at 8:59 AM, Robin Murphy wrote: > > On 2021-01-27 20:00, Chuck Lever wrote: >> Hi- >> This collection of patches seems to get the best throughtput results >> so far. The NFS WRITE result is fully restored, and the NFS READ >> result is very close to fully restored. >> Children see throughput for 12 initial writers = 5008474.03 kB/sec >> Parent sees throughput for 12 initial writers = 4996927.80 kB/sec >> Min throughput per process = 416956.88 kB/sec >> Max throughput per process = 417910.22 kB/sec >> Avg throughput per process = 417372.84 kB/sec >> Min xfer= 1046272.00 kB >> CPU Utilization: Wall time2.515CPU time1.996CPU >> utilization 79.37 % >> Children see throughput for 12 rewriters= 5020584.59 kB/sec >> Parent sees throughput for 12 rewriters = 5012539.29 kB/sec >> Min throughput per process = 417799.00 kB/sec >> Max throughput per process = 419082.22 kB/sec >> Avg throughput per process = 418382.05 kB/sec >> Min xfer= 1046528.00 kB >> CPU utilization: Wall time2.507CPU time2.024CPU >> utilization 80.73 % >> Children see throughput for 12 readers = 5805484.25 kB/sec >> Parent sees throughput for 12 readers = 5799535.68 kB/sec >> Min throughput per process = 482888.16 kB/sec >> Max throughput per process = 48.16 kB/sec >> Avg throughput per process = 483790.35 kB/sec >> Min xfer= 1045760.00 kB >> CPU utilization: Wall time2.167CPU time1.964CPU >> utilization 90.63 % >> Children see throughput for 12 re-readers = 5812227.16 kB/sec >> Parent sees throughput for 12 re-readers= 5803793.06 kB/sec >> Min throughput per process = 483242.97 kB/sec >> Max throughput per process = 485724.41 kB/sec >> Avg throughput per process = 484352.26 kB/sec >> Min xfer= 1043456.00 kB >> CPU utilization: Wall time2.161CPU time1.976CPU >> utilization 91.45 % >> I've included a simple-minded implementation of a map_sg op for >> the Intel IOMMU. This is nothing more than a copy of the loop in >> __iommu_map_sg() with the call to __iommu_map() replaced with a >> call to intel_iommu_map(). > > ...which is the main reason I continue to strongly dislike patches #4-#9 (#3 > definitely seems to makes sense either way, now that #1 and #2 are going to > land). If a common operation is worth optimising anywhere, then it deserves > optimising everywhere, so we end up with a dozen diverging copies of > essentially the same code - particularly when the driver-specific > functionality *is* already in the drivers, so what gets duplicated is solely > the "generic" parts. I don't disagree with that assessment, but I don't immediately see an alternative API arrangement that would be more successful in the short term. If 4/9 - 9/9 are not acceptable, then the responsible thing to do would be to revert: - 58a8bb39490d ("iommu/vt-d: Cleanup after converting to dma-iommu ops") - c588072bba6b ("iommu/vt-d: Convert intel iommu driver to the iommu ops") for v5.11, work out the proper API design, and then try the VT-d conversion again. IMHO. > And if there's justification for pushing iommu_map_sg() entirely into > drivers, then it's verging on self-contradictory not to do the same for > iommu_map() and iommu_unmap(). Some IOMMU drivers - mainly intel-iommu, as it > happens - are already implementing hacks around the "one call per page" > interface being inherently inefficient, so the logical thing to do here is > take a step back and reconsider the fundamental design of the whole map/unmap > interface. 
Implementing hacks on top of hacks to make particular things > faster on particular systems that particular people care about is not going > to do us any favours in the long run. > > As it stands, I can easily see a weird anti-pattern emerging where people > start adding code to fake up scatterlists in random drivers because they see > dma_map_sg() performing paradoxically better than dma_map_page(). > > Robin. > >> --- >> Chuck Lever (1): &g
Re: [PATCH RFC 0/9] Possible set of VT-d optimizations
> On Jan 28, 2021, at 9:52 AM, Chuck Lever wrote: > > > >> On Jan 28, 2021, at 8:59 AM, Robin Murphy wrote: >> >> On 2021-01-27 20:00, Chuck Lever wrote: >>> Hi- >>> This collection of patches seems to get the best throughtput results >>> so far. The NFS WRITE result is fully restored, and the NFS READ >>> result is very close to fully restored. >>> Children see throughput for 12 initial writers = 5008474.03 kB/sec >>> Parent sees throughput for 12 initial writers = 4996927.80 kB/sec >>> Min throughput per process = 416956.88 kB/sec >>> Max throughput per process = 417910.22 kB/sec >>> Avg throughput per process = 417372.84 kB/sec >>> Min xfer= 1046272.00 kB >>> CPU Utilization: Wall time2.515CPU time1.996CPU >>> utilization 79.37 % >>> Children see throughput for 12 rewriters= 5020584.59 kB/sec >>> Parent sees throughput for 12 rewriters = 5012539.29 kB/sec >>> Min throughput per process = 417799.00 kB/sec >>> Max throughput per process = 419082.22 kB/sec >>> Avg throughput per process = 418382.05 kB/sec >>> Min xfer= 1046528.00 kB >>> CPU utilization: Wall time2.507CPU time2.024CPU >>> utilization 80.73 % >>> Children see throughput for 12 readers = 5805484.25 kB/sec >>> Parent sees throughput for 12 readers = 5799535.68 kB/sec >>> Min throughput per process = 482888.16 kB/sec >>> Max throughput per process = 48.16 kB/sec >>> Avg throughput per process = 483790.35 kB/sec >>> Min xfer= 1045760.00 kB >>> CPU utilization: Wall time2.167CPU time1.964CPU >>> utilization 90.63 % >>> Children see throughput for 12 re-readers = 5812227.16 kB/sec >>> Parent sees throughput for 12 re-readers= 5803793.06 kB/sec >>> Min throughput per process = 483242.97 kB/sec >>> Max throughput per process = 485724.41 kB/sec >>> Avg throughput per process = 484352.26 kB/sec >>> Min xfer= 1043456.00 kB >>> CPU utilization: Wall time2.161CPU time1.976CPU >>> utilization 91.45 % >>> I've included a simple-minded implementation of a map_sg op for >>> the Intel IOMMU. This is nothing more than a copy of the loop in >>> __iommu_map_sg() with the call to __iommu_map() replaced with a >>> call to intel_iommu_map(). >> >> ...which is the main reason I continue to strongly dislike patches #4-#9 (#3 >> definitely seems to makes sense either way, now that #1 and #2 are going to >> land). If a common operation is worth optimising anywhere, then it deserves >> optimising everywhere, so we end up with a dozen diverging copies of >> essentially the same code - particularly when the driver-specific >> functionality *is* already in the drivers, so what gets duplicated is solely >> the "generic" parts. > > I don't disagree with that assessment, but I don't immediately see an > alternative API arrangement that would be more successful in the short > term. If 4/9 - 9/9 are not acceptable, then the responsible thing to > do would be to revert: > > - 58a8bb39490d ("iommu/vt-d: Cleanup after converting to dma-iommu ops") > - c588072bba6b ("iommu/vt-d: Convert intel iommu driver to the iommu ops") > > for v5.11, work out the proper API design, and then try the VT-d conversion > again. > > IMHO. Are all y'all waiting for me to post such patches? ;-) >> And if there's justification for pushing iommu_map_sg() entirely into >> drivers, then it's verging on self-contradictory not to do the same for >> iommu_map() and iommu_unmap(). 
Some IOMMU drivers - mainly intel-iommu, as >> it happens - are already implementing hacks around the "one call per page" >> interface being inherently inefficient, so the logical thing to do here is >> take a step back and reconsider the fundamental design of the whole >> map/unmap interface. Implementing hacks on top of hacks to make particular >> things faster on particu
performance regression noted in v5.11-rc after c062db039f40
Children see throughput for 12 re-readers = 5410601.12 kB/sec Parent sees throughput for 12 re-readers= 5403504.40 kB/sec Min throughput per process = 449918.12 kB/sec Max throughput per process = 452489.28 kB/sec Avg throughput per process = 450883.43 kB/sec Min xfer= 1043456.00 kB CPU utilization: Wall time2.321CPU time1.978CPU utilization 85.21 % And here's c588072bba6b ("iommu/vt-d: Convert intel iommu driver to the iommu ops"). Significant throughput loss. Children see throughput for 12 initial writers = 3812036.91 kB/sec Parent sees throughput for 12 initial writers = 3753683.40 kB/sec Min throughput per process = 313672.25 kB/sec Max throughput per process = 321719.44 kB/sec Avg throughput per process = 317669.74 kB/sec Min xfer= 1022464.00 kB CPU Utilization: Wall time3.309CPU time1.986CPU utilization 60.02 % Children see throughput for 12 rewriters= 3786831.94 kB/sec Parent sees throughput for 12 rewriters = 3783205.58 kB/sec Min throughput per process = 313654.44 kB/sec Max throughput per process = 317844.50 kB/sec Avg throughput per process = 315569.33 kB/sec Min xfer= 1035520.00 kB CPU utilization: Wall time3.302CPU time1.945CPU utilization 58.90 % Children see throughput for 12 readers = 4265828.28 kB/sec Parent sees throughput for 12 readers = 4261844.88 kB/sec Min throughput per process = 352305.00 kB/sec Max throughput per process = 357726.22 kB/sec Avg throughput per process = 355485.69 kB/sec Min xfer= 1032960.00 kB CPU utilization: Wall time2.934CPU time1.942CPU utilization 66.20 % Children see throughput for 12 re-readers = 4220651.19 kB/sec Parent sees throughput for 12 re-readers= 4216096.04 kB/sec Min throughput per process = 348677.16 kB/sec Max throughput per process = 353467.44 kB/sec Avg throughput per process = 351720.93 kB/sec Min xfer= 1035264.00 kB CPU utilization: Wall time2.969CPU time1.952CPU utilization 65.74 % The regression appears to be 100% reproducible. -- Chuck Lever ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
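As a rough yardstick from the numbers visible above, the re-reader phase
drops from 5410601.12 kB/sec to 4220651.19 kB/sec, i.e.
(5410601 - 4220651) / 5410601 ≈ 22% for that phase alone, with the write
phases in the second run lower still.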
Re: performance regression noted in v5.11-rc after c062db039f40
> On Jan 12, 2021, at 9:25 PM, Lu Baolu wrote: > > Hi, > > On 1/12/21 10:38 PM, Will Deacon wrote: >> [Expanding cc list to include DMA-IOMMU and intel IOMMU folks] >> On Fri, Jan 08, 2021 at 04:18:36PM -0500, Chuck Lever wrote: >>> Hi- >>> >>> [ Please cc: me on replies, I'm not currently subscribed to >>> iommu@lists ]. >>> >>> I'm running NFS performance tests on InfiniBand using CX-3 Pro cards >>> at 56Gb/s. The test is iozone on an NFSv3/RDMA mount: >>> >>> /home/cel/bin/iozone -M -+u -i0 -i1 -s1g -r256k -t12 -I >>> >>> For those not familiar with the way storage protocols use RDMA, The >>> initiator/client sets up memory regions and the target/server uses >>> RDMA Read and Write to move data out of and into those regions. The >>> initiator/client uses only RDMA memory registration and invalidation >>> operations, and the target/server uses RDMA Read and Write. >>> >>> My NFS client is a two-socket 12-core x86_64 system with its I/O MMU >>> enabled using the kernel command line options "intel_iommu=on >>> iommu=strict". >>> >>> Recently I've noticed a significant (25-30%) loss in NFS throughput. >>> I was able to bisect on my client to the following commits. >>> >>> Here's 65f746e8285f ("iommu: Add quirk for Intel graphic devices in >>> map_sg"). This is about normal for this test. >>> >>> Children see throughput for 12 initial writers = 4732581.09 kB/sec >>> Parent sees throughput for 12 initial writers = 4646810.21 kB/sec >>> Min throughput per process = 387764.34 kB/sec >>> Max throughput per process = 399655.47 kB/sec >>> Avg throughput per process = 394381.76 kB/sec >>> Min xfer= 1017344.00 kB >>> CPU Utilization: Wall time2.671CPU time1.974CPU >>> utilization 73.89 % >>> Children see throughput for 12 rewriters= 4837741.94 kB/sec >>> Parent sees throughput for 12 rewriters = 4833509.35 kB/sec >>> Min throughput per process = 398983.72 kB/sec >>> Max throughput per process = 406199.66 kB/sec >>> Avg throughput per process = 403145.16 kB/sec >>> Min xfer= 1030656.00 kB >>> CPU utilization: Wall time2.584CPU time1.959CPU >>> utilization 75.82 % >>> Children see throughput for 12 readers = 5921370.94 kB/sec >>> Parent sees throughput for 12 readers = 5914106.69 kB/sec >>> Min throughput per process = 491812.38 kB/sec >>> Max throughput per process = 494777.28 kB/sec >>> Avg throughput per process = 493447.58 kB/sec >>> Min xfer= 1042688.00 kB >>> CPU utilization: Wall time2.122CPU time1.968CPU >>> utilization 92.75 % >>> Children see throughput for 12 re-readers = 5947985.69 kB/sec >>> Parent sees throughput for 12 re-readers= 5941348.51 kB/sec >>> Min throughput per process = 492805.81 kB/sec >>> Max throughput per process = 497280.19 kB/sec >>> Avg throughput per process = 495665.47 kB/sec >>> Min xfer= 1039360.00 kB >>> CPU utilization: Wall time2.111CPU time1.968CPU >>> utilization 93.22 % >>> >>> Here's c062db039f40 ("iommu/vt-d: Update domain geometry in >>> iommu_ops.at(de)tach_dev"). It's losing some steam here. >>> >>> Children see throughput for 12 initial writers = 4342419.12 kB/sec >>> Parent sees throughput for 12 initial writers = 4310612.79 kB/sec >>> Min throughput per process = 359299.06 kB/sec >>> Max throughput per process = 363866.16 kB/sec >>> Avg throughput per process = 361868.26 kB/sec >>> Min xfer= 1035520.00 kB >>> CPU Utilization: Wall time2.902CPU time1.951CPU >>> utilization 67.22 % >>> Children see throughput for 12 rewriters
Re: performance regression noted in v5.11-rc after c062db039f40
> On Jan 13, 2021, at 9:07 AM, Chuck Lever wrote: > > > >> On Jan 12, 2021, at 9:25 PM, Lu Baolu wrote: >> >> Hi, >> >> On 1/12/21 10:38 PM, Will Deacon wrote: >>> [Expanding cc list to include DMA-IOMMU and intel IOMMU folks] >>> On Fri, Jan 08, 2021 at 04:18:36PM -0500, Chuck Lever wrote: >>>> Hi- >>>> >>>> [ Please cc: me on replies, I'm not currently subscribed to >>>> iommu@lists ]. >>>> >>>> I'm running NFS performance tests on InfiniBand using CX-3 Pro cards >>>> at 56Gb/s. The test is iozone on an NFSv3/RDMA mount: >>>> >>>> /home/cel/bin/iozone -M -+u -i0 -i1 -s1g -r256k -t12 -I >>>> >>>> For those not familiar with the way storage protocols use RDMA, The >>>> initiator/client sets up memory regions and the target/server uses >>>> RDMA Read and Write to move data out of and into those regions. The >>>> initiator/client uses only RDMA memory registration and invalidation >>>> operations, and the target/server uses RDMA Read and Write. >>>> >>>> My NFS client is a two-socket 12-core x86_64 system with its I/O MMU >>>> enabled using the kernel command line options "intel_iommu=on >>>> iommu=strict". >>>> >>>> Recently I've noticed a significant (25-30%) loss in NFS throughput. >>>> I was able to bisect on my client to the following commits. >>>> >>>> Here's 65f746e8285f ("iommu: Add quirk for Intel graphic devices in >>>> map_sg"). This is about normal for this test. >>>> >>>>Children see throughput for 12 initial writers = 4732581.09 kB/sec >>>>Parent sees throughput for 12 initial writers = 4646810.21 kB/sec >>>>Min throughput per process = 387764.34 kB/sec >>>>Max throughput per process = 399655.47 kB/sec >>>>Avg throughput per process = 394381.76 kB/sec >>>>Min xfer= 1017344.00 kB >>>>CPU Utilization: Wall time2.671CPU time1.974CPU >>>> utilization 73.89 % >>>>Children see throughput for 12 rewriters= 4837741.94 kB/sec >>>>Parent sees throughput for 12 rewriters = 4833509.35 kB/sec >>>>Min throughput per process = 398983.72 kB/sec >>>>Max throughput per process = 406199.66 kB/sec >>>>Avg throughput per process = 403145.16 kB/sec >>>>Min xfer= 1030656.00 kB >>>>CPU utilization: Wall time2.584CPU time1.959CPU >>>> utilization 75.82 % >>>>Children see throughput for 12 readers = 5921370.94 kB/sec >>>>Parent sees throughput for 12 readers = 5914106.69 kB/sec >>>>Min throughput per process = 491812.38 kB/sec >>>>Max throughput per process = 494777.28 kB/sec >>>>Avg throughput per process = 493447.58 kB/sec >>>>Min xfer= 1042688.00 kB >>>>CPU utilization: Wall time2.122CPU time1.968CPU >>>> utilization 92.75 % >>>>Children see throughput for 12 re-readers = 5947985.69 kB/sec >>>>Parent sees throughput for 12 re-readers= 5941348.51 kB/sec >>>>Min throughput per process = 492805.81 kB/sec >>>>Max throughput per process = 497280.19 kB/sec >>>>Avg throughput per process = 495665.47 kB/sec >>>>Min xfer= 1039360.00 kB >>>>CPU utilization: Wall time2.111CPU time1.968CPU >>>> utilization 93.22 % >>>> >>>> Here's c062db039f40 ("iommu/vt-d: Update domain geometry in >>>> iommu_ops.at(de)tach_dev"). It's losing some steam here. >>>> >>>>Children see throughput for 12 initial writers = 4342419.12 kB/sec >>>>Parent sees throughput for 12 initial writers = 4310612.79 kB/sec >>>>Min throughput per process = 359299.06 kB/sec >>>>Max throughput per process = 363866.16 kB/sec >>
Re: performance regression noted in v5.11-rc after c062db039f40
> On Jan 12, 2021, at 9:38 AM, Will Deacon wrote: > > [Expanding cc list to include DMA-IOMMU and intel IOMMU folks] > > On Fri, Jan 08, 2021 at 04:18:36PM -0500, Chuck Lever wrote: >> Hi- >> >> [ Please cc: me on replies, I'm not currently subscribed to >> iommu@lists ]. >> >> I'm running NFS performance tests on InfiniBand using CX-3 Pro cards >> at 56Gb/s. The test is iozone on an NFSv3/RDMA mount: >> >> /home/cel/bin/iozone -M -+u -i0 -i1 -s1g -r256k -t12 -I >> >> For those not familiar with the way storage protocols use RDMA, The >> initiator/client sets up memory regions and the target/server uses >> RDMA Read and Write to move data out of and into those regions. The >> initiator/client uses only RDMA memory registration and invalidation >> operations, and the target/server uses RDMA Read and Write. >> >> My NFS client is a two-socket 12-core x86_64 system with its I/O MMU >> enabled using the kernel command line options "intel_iommu=on >> iommu=strict". >> >> Recently I've noticed a significant (25-30%) loss in NFS throughput. >> I was able to bisect on my client to the following commits. >> >> Here's 65f746e8285f ("iommu: Add quirk for Intel graphic devices in >> map_sg"). This is about normal for this test. >> >> Children see throughput for 12 initial writers = 4732581.09 kB/sec >> Parent sees throughput for 12 initial writers = 4646810.21 kB/sec >> Min throughput per process = 387764.34 kB/sec >> Max throughput per process = 399655.47 kB/sec >> Avg throughput per process = 394381.76 kB/sec >> Min xfer= 1017344.00 kB >> CPU Utilization: Wall time2.671CPU time1.974CPU >> utilization 73.89 % >> Children see throughput for 12 rewriters= 4837741.94 kB/sec >> Parent sees throughput for 12 rewriters = 4833509.35 kB/sec >> Min throughput per process = 398983.72 kB/sec >> Max throughput per process = 406199.66 kB/sec >> Avg throughput per process = 403145.16 kB/sec >> Min xfer= 1030656.00 kB >> CPU utilization: Wall time2.584CPU time1.959CPU >> utilization 75.82 % >> Children see throughput for 12 readers = 5921370.94 kB/sec >> Parent sees throughput for 12 readers = 5914106.69 kB/sec >> Min throughput per process = 491812.38 kB/sec >> Max throughput per process = 494777.28 kB/sec >> Avg throughput per process = 493447.58 kB/sec >> Min xfer= 1042688.00 kB >> CPU utilization: Wall time2.122CPU time1.968CPU >> utilization 92.75 % >> Children see throughput for 12 re-readers = 5947985.69 kB/sec >> Parent sees throughput for 12 re-readers= 5941348.51 kB/sec >> Min throughput per process = 492805.81 kB/sec >> Max throughput per process = 497280.19 kB/sec >> Avg throughput per process = 495665.47 kB/sec >> Min xfer= 1039360.00 kB >> CPU utilization: Wall time2.111CPU time1.968CPU >> utilization 93.22 % >> >> Here's c062db039f40 ("iommu/vt-d: Update domain geometry in >> iommu_ops.at(de)tach_dev"). It's losing some steam here. >> >> Children see throughput for 12 initial writers = 4342419.12 kB/sec >> Parent sees throughput for 12 initial writers = 4310612.79 kB/sec >> Min throughput per process = 359299.06 kB/sec >> Max throughput per process = 363866.16 kB/sec >> Avg throughput per process = 361868.26 kB/sec >> Min xfer= 1035520.00 kB >> CPU Utilization: Wall time2.902CPU time1.951CPU >> utilization 67.22 % >> Children see throughput for 12 rewriters= 4408576.66 kB/sec >> Parent sees throughput for 12 rewriters = 4404280.87 kB/sec >> Min throughput per process = 364553.88 kB/sec >> Max throughput per process = 370029.28 kB/sec >> Avg throughput per process
Re: performance regression noted in v5.11-rc after c062db039f40
> On Jan 18, 2021, at 1:00 PM, Robin Murphy wrote: > > On 2021-01-18 16:18, Chuck Lever wrote: >>> On Jan 12, 2021, at 9:38 AM, Will Deacon wrote: >>> >>> [Expanding cc list to include DMA-IOMMU and intel IOMMU folks] >>> >>> On Fri, Jan 08, 2021 at 04:18:36PM -0500, Chuck Lever wrote: >>>> Hi- >>>> >>>> [ Please cc: me on replies, I'm not currently subscribed to >>>> iommu@lists ]. >>>> >>>> I'm running NFS performance tests on InfiniBand using CX-3 Pro cards >>>> at 56Gb/s. The test is iozone on an NFSv3/RDMA mount: >>>> >>>> /home/cel/bin/iozone -M -+u -i0 -i1 -s1g -r256k -t12 -I >>>> >>>> For those not familiar with the way storage protocols use RDMA, The >>>> initiator/client sets up memory regions and the target/server uses >>>> RDMA Read and Write to move data out of and into those regions. The >>>> initiator/client uses only RDMA memory registration and invalidation >>>> operations, and the target/server uses RDMA Read and Write. >>>> >>>> My NFS client is a two-socket 12-core x86_64 system with its I/O MMU >>>> enabled using the kernel command line options "intel_iommu=on >>>> iommu=strict". >>>> >>>> Recently I've noticed a significant (25-30%) loss in NFS throughput. >>>> I was able to bisect on my client to the following commits. >>>> >>>> Here's 65f746e8285f ("iommu: Add quirk for Intel graphic devices in >>>> map_sg"). This is about normal for this test. >>>> >>>>Children see throughput for 12 initial writers = 4732581.09 kB/sec >>>>Parent sees throughput for 12 initial writers = 4646810.21 kB/sec >>>>Min throughput per process = 387764.34 kB/sec >>>>Max throughput per process = 399655.47 kB/sec >>>>Avg throughput per process = 394381.76 kB/sec >>>>Min xfer= 1017344.00 kB >>>>CPU Utilization: Wall time2.671CPU time1.974CPU >>>> utilization 73.89 % >>>>Children see throughput for 12 rewriters= 4837741.94 kB/sec >>>>Parent sees throughput for 12 rewriters = 4833509.35 kB/sec >>>>Min throughput per process = 398983.72 kB/sec >>>>Max throughput per process = 406199.66 kB/sec >>>>Avg throughput per process = 403145.16 kB/sec >>>>Min xfer= 1030656.00 kB >>>>CPU utilization: Wall time2.584CPU time1.959CPU >>>> utilization 75.82 % >>>>Children see throughput for 12 readers = 5921370.94 kB/sec >>>>Parent sees throughput for 12 readers = 5914106.69 kB/sec >>>>Min throughput per process = 491812.38 kB/sec >>>>Max throughput per process = 494777.28 kB/sec >>>>Avg throughput per process = 493447.58 kB/sec >>>>Min xfer= 1042688.00 kB >>>>CPU utilization: Wall time2.122CPU time1.968CPU >>>> utilization 92.75 % >>>>Children see throughput for 12 re-readers = 5947985.69 kB/sec >>>>Parent sees throughput for 12 re-readers= 5941348.51 kB/sec >>>>Min throughput per process = 492805.81 kB/sec >>>>Max throughput per process = 497280.19 kB/sec >>>>Avg throughput per process = 495665.47 kB/sec >>>>Min xfer= 1039360.00 kB >>>>CPU utilization: Wall time2.111CPU time1.968CPU >>>> utilization 93.22 % >>>> >>>> Here's c062db039f40 ("iommu/vt-d: Update domain geometry in >>>> iommu_ops.at(de)tach_dev"). It's losing some steam here. >>>> >>>>Children see throughput for 12 initial writers = 4342419.12 kB/sec >>>>Parent sees throughput for 12 initial writers = 4310612.79 kB/sec >>>>Min throughput per process = 359299.06 kB/sec >>>>Max throughput per process = 363866.16 kB/sec >>>&
Re: performance regression noted in v5.11-rc after c062db039f40
> On Jan 18, 2021, at 8:22 PM, Lu Baolu wrote: > > Do you mind posting the cap and ecap of the iommu used by your device? > > You can get it via sysfs, for example: > > /sys/bus/pci/devices/:00:14.0/iommu/intel-iommu# ls > address cap domains_supported domains_used ecap version [root@manet intel-iommu]# lspci | grep Mellanox 03:00.0 Network controller: Mellanox Technologies MT27520 Family [ConnectX-3 Pro] [root@manet intel-iommu]# pwd /sys/devices/pci:00/:00:03.0/:03:00.0/iommu/intel-iommu [root@manet intel-iommu]# for i in *; do echo -n $i ": "; cat $i; done address : c7ffc000 cap : d2078c106f0466 domains_supported : 65536 domains_used : 62 ecap : f020de version : 1:0 [root@manet intel-iommu]# >> Fwiw, this system uses the Intel C612 chipset with Intel(R) Xeon(R) >> E5-2603 v3 @ 1.60GHz CPUs. > > Can you please also hack a line of code to check the return value of > iommu_dma_map_sg()? diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index baca49fe83af..e811562ead0e 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -328,6 +328,7 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, dma_nents = ib_dma_map_sg(ep->re_id->device, mr->mr_sg, mr->mr_nents, mr->mr_dir); + trace_printk("ib_dma_map_sg(%d) returns %d\n", mr->mr_nents, dma_nents); if (!dma_nents) goto out_dmamap_err; mr->mr_device = ep->re_id->device; During the 256KB iozone test I used before, this trace log is generated: kworker/u28:3-1269 [000] 336.054743: bprint: frwr_map: ib_dma_map_sg(30) returns 1 kworker/u28:3-1269 [000] 336.054835: bprint: frwr_map: ib_dma_map_sg(30) returns 1 kworker/u28:3-1269 [000] 336.055022: bprint: frwr_map: ib_dma_map_sg(4) returns 1 kworker/u28:3-1269 [000] 336.055118: bprint: frwr_map: ib_dma_map_sg(30) returns 1 kworker/u28:3-1269 [000] 336.055312: bprint: frwr_map: ib_dma_map_sg(30) returns 1 kworker/u28:3-1269 [000] 336.055407: bprint: frwr_map: ib_dma_map_sg(4) returns 1 -- Chuck Lever ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
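For anyone wanting to sanity-check these register dumps, here is a small
sketch that decodes the fields referenced later in the thread. The bit
offsets used (SAGAW at cap[12:8], MGAW at cap[21:16], page-walk coherency
at ecap[0]) are taken from my reading of the VT-d specification, so
verify them against the current revision before relying on them.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t cap  = 0xd2078c106f0466ULL;	/* values from the sysfs dump above */
	uint64_t ecap = 0xf020deULL;

	/* SAGAW: supported adjusted guest address widths, cap bits 12:8 */
	printf("SAGAW : 0x%llx\n", (unsigned long long)((cap >> 8) & 0x1f));

	/* MGAW: maximum guest address width minus one, cap bits 21:16 */
	printf("MGAW  : %llu (=> %llu-bit guest address width)\n",
	       (unsigned long long)((cap >> 16) & 0x3f),
	       (unsigned long long)((cap >> 16) & 0x3f) + 1);

	/* ecap.C: page-walk coherency; 0 means PTE updates must be flushed */
	printf("ecap.C: %llu\n", (unsigned long long)(ecap & 0x1));
	return 0;
}

On the values shown above this prints SAGAW 0x4 (binary 00100, i.e.
48-bit 4-level tables), a 48-bit MGAW, and ecap.C == 0, meaning page-table
walks are not snooped on this IOMMU, which is consistent with why the
clflush batching discussed in patch 3/9 matters on this machine.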
Re: performance regression noted in v5.11-rc after c062db039f40
> On Jan 19, 2021, at 9:11 PM, Lu Baolu wrote: > > On 1/19/21 10:37 PM, Chuck Lever wrote: >>> On Jan 18, 2021, at 8:22 PM, Lu Baolu wrote: >>> >>> Do you mind posting the cap and ecap of the iommu used by your device? >>> >>> You can get it via sysfs, for example: >>> >>> /sys/bus/pci/devices/:00:14.0/iommu/intel-iommu# ls >>> address cap domains_supported domains_used ecap version >> [root@manet intel-iommu]# lspci | grep Mellanox >> 03:00.0 Network controller: Mellanox Technologies MT27520 Family [ConnectX-3 >> Pro] >> [root@manet intel-iommu]# pwd >> /sys/devices/pci:00/:00:03.0/:03:00.0/iommu/intel-iommu >> [root@manet intel-iommu]# for i in *; do echo -n $i ": "; cat $i; done >> address : c7ffc000 >> cap : d2078c106f0466 > > MGAW: 10 (supporting 48-bit address width) > SAGAW: 00100 (supporting 48-bit 4-level page table) > > So the calculation of domain->domain.geometry.aperture_end is right. I found the cause of the performance loss with c062db039f40: it was a testing error on my part. I will begin looking at c588072bba6b ("iommu/vt-d: Convert intel iommu driver to the iommu ops"). -- Chuck Lever ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: performance regression noted in v5.11-rc after c062db039f40
> On Jan 18, 2021, at 1:00 PM, Robin Murphy wrote: > > On 2021-01-18 16:18, Chuck Lever wrote: >>> On Jan 12, 2021, at 9:38 AM, Will Deacon wrote: >>> >>> [Expanding cc list to include DMA-IOMMU and intel IOMMU folks] >>> >>> On Fri, Jan 08, 2021 at 04:18:36PM -0500, Chuck Lever wrote: >>>> Hi- >>>> >>>> [ Please cc: me on replies, I'm not currently subscribed to >>>> iommu@lists ]. >>>> >>>> I'm running NFS performance tests on InfiniBand using CX-3 Pro cards >>>> at 56Gb/s. The test is iozone on an NFSv3/RDMA mount: >>>> >>>> /home/cel/bin/iozone -M -+u -i0 -i1 -s1g -r256k -t12 -I >>>> >>>> For those not familiar with the way storage protocols use RDMA, The >>>> initiator/client sets up memory regions and the target/server uses >>>> RDMA Read and Write to move data out of and into those regions. The >>>> initiator/client uses only RDMA memory registration and invalidation >>>> operations, and the target/server uses RDMA Read and Write. >>>> >>>> My NFS client is a two-socket 12-core x86_64 system with its I/O MMU >>>> enabled using the kernel command line options "intel_iommu=on >>>> iommu=strict". >>>> >>>> Recently I've noticed a significant (25-30%) loss in NFS throughput. >>>> I was able to bisect on my client to the following commits. >>>> >>>> Here's 65f746e8285f ("iommu: Add quirk for Intel graphic devices in >>>> map_sg"). This is about normal for this test. >>>> >>>>Children see throughput for 12 initial writers = 4732581.09 kB/sec >>>>Parent sees throughput for 12 initial writers = 4646810.21 kB/sec >>>>Min throughput per process = 387764.34 kB/sec >>>>Max throughput per process = 399655.47 kB/sec >>>>Avg throughput per process = 394381.76 kB/sec >>>>Min xfer= 1017344.00 kB >>>>CPU Utilization: Wall time2.671CPU time1.974CPU >>>> utilization 73.89 % >>>>Children see throughput for 12 rewriters= 4837741.94 kB/sec >>>>Parent sees throughput for 12 rewriters = 4833509.35 kB/sec >>>>Min throughput per process = 398983.72 kB/sec >>>>Max throughput per process = 406199.66 kB/sec >>>>Avg throughput per process = 403145.16 kB/sec >>>>Min xfer= 1030656.00 kB >>>>CPU utilization: Wall time2.584CPU time1.959CPU >>>> utilization 75.82 % >>>>Children see throughput for 12 readers = 5921370.94 kB/sec >>>>Parent sees throughput for 12 readers = 5914106.69 kB/sec >>>>Min throughput per process = 491812.38 kB/sec >>>>Max throughput per process = 494777.28 kB/sec >>>>Avg throughput per process = 493447.58 kB/sec >>>>Min xfer= 1042688.00 kB >>>>CPU utilization: Wall time2.122CPU time1.968CPU >>>> utilization 92.75 % >>>>Children see throughput for 12 re-readers = 5947985.69 kB/sec >>>>Parent sees throughput for 12 re-readers= 5941348.51 kB/sec >>>>Min throughput per process = 492805.81 kB/sec >>>>Max throughput per process = 497280.19 kB/sec >>>>Avg throughput per process = 495665.47 kB/sec >>>>Min xfer= 1039360.00 kB >>>>CPU utilization: Wall time2.111CPU time1.968CPU >>>> utilization 93.22 % >>>> >>>> Here's c062db039f40 ("iommu/vt-d: Update domain geometry in >>>> iommu_ops.at(de)tach_dev"). It's losing some steam here. >>>> >>>>Children see throughput for 12 initial writers = 4342419.12 kB/sec >>>>Parent sees throughput for 12 initial writers = 4310612.79 kB/sec >>>>Min throughput per process = 359299.06 kB/sec >>>>Max throughput per process = 363866.16 kB/sec >>>&
Re: performance regression noted in v5.11-rc after c062db039f40
> On Jan 21, 2021, at 10:00 PM, Lu Baolu wrote: > > +Isaac > > On 1/22/21 3:09 AM, Chuck Lever wrote: >>> On Jan 18, 2021, at 1:00 PM, Robin Murphy wrote: >>> >>> On 2021-01-18 16:18, Chuck Lever wrote: >>>>> On Jan 12, 2021, at 9:38 AM, Will Deacon wrote: >>>>> >>>>> [Expanding cc list to include DMA-IOMMU and intel IOMMU folks] >>>>> >>>>> On Fri, Jan 08, 2021 at 04:18:36PM -0500, Chuck Lever wrote: >>>>>> Hi- >>>>>> >>>>>> [ Please cc: me on replies, I'm not currently subscribed to >>>>>> iommu@lists ]. >>>>>> >>>>>> I'm running NFS performance tests on InfiniBand using CX-3 Pro cards >>>>>> at 56Gb/s. The test is iozone on an NFSv3/RDMA mount: >>>>>> >>>>>> /home/cel/bin/iozone -M -+u -i0 -i1 -s1g -r256k -t12 -I >>>>>> >>>>>> For those not familiar with the way storage protocols use RDMA, The >>>>>> initiator/client sets up memory regions and the target/server uses >>>>>> RDMA Read and Write to move data out of and into those regions. The >>>>>> initiator/client uses only RDMA memory registration and invalidation >>>>>> operations, and the target/server uses RDMA Read and Write. >>>>>> >>>>>> My NFS client is a two-socket 12-core x86_64 system with its I/O MMU >>>>>> enabled using the kernel command line options "intel_iommu=on >>>>>> iommu=strict". >>>>>> >>>>>> Recently I've noticed a significant (25-30%) loss in NFS throughput. >>>>>> I was able to bisect on my client to the following commits. >>>>>> >>>>>> Here's 65f746e8285f ("iommu: Add quirk for Intel graphic devices in >>>>>> map_sg"). This is about normal for this test. >>>>>> >>>>>> Children see throughput for 12 initial writers = 4732581.09 kB/sec >>>>>> Parent sees throughput for 12 initial writers = 4646810.21 kB/sec >>>>>> Min throughput per process = 387764.34 kB/sec >>>>>> Max throughput per process = 399655.47 kB/sec >>>>>> Avg throughput per process = 394381.76 kB/sec >>>>>> Min xfer= 1017344.00 kB >>>>>> CPU Utilization: Wall time2.671CPU time1.974CPU >>>>>> utilization 73.89 % >>>>>> Children see throughput for 12 rewriters= 4837741.94 kB/sec >>>>>> Parent sees throughput for 12 rewriters = 4833509.35 kB/sec >>>>>> Min throughput per process = 398983.72 kB/sec >>>>>> Max throughput per process = 406199.66 kB/sec >>>>>> Avg throughput per process = 403145.16 kB/sec >>>>>> Min xfer= 1030656.00 kB >>>>>> CPU utilization: Wall time2.584CPU time1.959CPU >>>>>> utilization 75.82 % >>>>>> Children see throughput for 12 readers = 5921370.94 kB/sec >>>>>> Parent sees throughput for 12 readers = 5914106.69 kB/sec >>>>>> Min throughput per process = 491812.38 kB/sec >>>>>> Max throughput per process = 494777.28 kB/sec >>>>>> Avg throughput per process = 493447.58 kB/sec >>>>>> Min xfer= 1042688.00 kB >>>>>> CPU utilization: Wall time2.122CPU time1.968CPU >>>>>> utilization 92.75 % >>>>>> Children see throughput for 12 re-readers = 5947985.69 kB/sec >>>>>> Parent sees throughput for 12 re-readers= 5941348.51 kB/sec >>>>>> Min throughput per process = 492805.81 kB/sec >>>>>> Max throughput per process = 497280.19 kB/sec >>>>>> Avg throughput per process = 495665.47 kB/sec >>>>>> Min xfer= 1039360.00 kB >>>>>> CPU utilization: Wall time2.111CPU time1.968CPU >>>>>> utilization 93
Re: performance regression noted in v5.11-rc after c062db039f40
> On Jan 22, 2021, at 12:38 PM, Robin Murphy wrote: > > On 2021-01-22 16:18, Chuck Lever wrote: >>> On Jan 21, 2021, at 10:00 PM, Lu Baolu wrote: >>> >>> +Isaac >>> >>> On 1/22/21 3:09 AM, Chuck Lever wrote: >>>>> On Jan 18, 2021, at 1:00 PM, Robin Murphy wrote: >>>>> >>>>> On 2021-01-18 16:18, Chuck Lever wrote: >>>>>>> On Jan 12, 2021, at 9:38 AM, Will Deacon wrote: >>>>>>> >>>>>>> [Expanding cc list to include DMA-IOMMU and intel IOMMU folks] >>>>>>> >>>>>>> On Fri, Jan 08, 2021 at 04:18:36PM -0500, Chuck Lever wrote: >>>>>>>> Hi- >>>>>>>> >>>>>>>> [ Please cc: me on replies, I'm not currently subscribed to >>>>>>>> iommu@lists ]. >>>>>>>> >>>>>>>> I'm running NFS performance tests on InfiniBand using CX-3 Pro cards >>>>>>>> at 56Gb/s. The test is iozone on an NFSv3/RDMA mount: >>>>>>>> >>>>>>>> /home/cel/bin/iozone -M -+u -i0 -i1 -s1g -r256k -t12 -I >>>>>>>> >>>>>>>> For those not familiar with the way storage protocols use RDMA, The >>>>>>>> initiator/client sets up memory regions and the target/server uses >>>>>>>> RDMA Read and Write to move data out of and into those regions. The >>>>>>>> initiator/client uses only RDMA memory registration and invalidation >>>>>>>> operations, and the target/server uses RDMA Read and Write. >>>>>>>> >>>>>>>> My NFS client is a two-socket 12-core x86_64 system with its I/O MMU >>>>>>>> enabled using the kernel command line options "intel_iommu=on >>>>>>>> iommu=strict". >>>>>>>> >>>>>>>> Recently I've noticed a significant (25-30%) loss in NFS throughput. >>>>>>>> I was able to bisect on my client to the following commits. >>>>>>>> >>>>>>>> Here's 65f746e8285f ("iommu: Add quirk for Intel graphic devices in >>>>>>>> map_sg"). This is about normal for this test. >>>>>>>> >>>>>>>>Children see throughput for 12 initial writers = 4732581.09 >>>>>>>> kB/sec >>>>>>>>Parent sees throughput for 12 initial writers = 4646810.21 >>>>>>>> kB/sec >>>>>>>>Min throughput per process = 387764.34 >>>>>>>> kB/sec >>>>>>>>Max throughput per process = 399655.47 >>>>>>>> kB/sec >>>>>>>>Avg throughput per process = 394381.76 >>>>>>>> kB/sec >>>>>>>>Min xfer= 1017344.00 kB >>>>>>>>CPU Utilization: Wall time2.671CPU time1.974CPU >>>>>>>> utilization 73.89 % >>>>>>>>Children see throughput for 12 rewriters= 4837741.94 >>>>>>>> kB/sec >>>>>>>>Parent sees throughput for 12 rewriters = 4833509.35 >>>>>>>> kB/sec >>>>>>>>Min throughput per process = 398983.72 >>>>>>>> kB/sec >>>>>>>>Max throughput per process = 406199.66 >>>>>>>> kB/sec >>>>>>>>Avg throughput per process = 403145.16 >>>>>>>> kB/sec >>>>>>>>Min xfer= 1030656.00 kB >>>>>>>>CPU utilization: Wall time2.584CPU time1.959CPU >>>>>>>> utilization 75.82 % >>>>>>>>Children see throughput for 12 readers = 5921370.94 >>>>>>>> kB/sec >>>>>>>>Parent sees throughput for 12 readers = 5914106.69 >>>>>>>> kB/sec >>>>>>>>Min throughput per process = 491812.38 >>>>&
Re: [RFT PATCH 0/3] Performance regression noted in v5.11-rc after c062db039f40
CPU utilization: Wall time2.144CPU time1.895CPU utilization 88.41 % Children see throughput for 12 re-readers = 5847438.62 kB/sec Parent sees throughput for 12 re-readers= 5839292.18 kB/sec Min throughput per process = 485835.03 kB/sec Max throughput per process = 488702.12 kB/sec Avg throughput per process = 487286.55 kB/sec Min xfer= 1042688.00 kB CPU utilization: Wall time2.148CPU time1.909CPU utilization 88.84 % NFS READ throughput is almost fully restored. A normal-looking throughput result, copied from the previous thread, is: Children see throughput for 12 readers = 5921370.94 kB/sec Parent sees throughput for 12 readers = 5914106.69 kB/sec The NFS WRITE throughput result appears to be unchanged, or slightly worse than before. I don't have an explanation for this result. I applied your patches on the NFS server also without noting improvement. -- Chuck Lever ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [RFT PATCH 0/3] Performance regression noted in v5.11-rc after c062db039f40
> On Jan 25, 2021, at 12:39 PM, Chuck Lever wrote: > > Hello Lu - > > Many thanks for your prototype. > > >> On Jan 24, 2021, at 9:38 PM, Lu Baolu wrote: >> >> This patch series is only for Request-For-Testing purpose. It aims to >> fix the performance regression reported here. >> >> https://lore.kernel.org/linux-iommu/d81314ed-5673-44a6-b597-090e3cb83...@oracle.com/ >> >> The first two patches are borrowed from here. >> >> https://lore.kernel.org/linux-iommu/20210107122909.16317-1-yong...@mediatek.com/ >> >> Please kindly help to verification. >> >> Best regards, >> baolu >> >> Lu Baolu (1): >> iommu/vt-d: Add iotlb_sync_map callback >> >> Yong Wu (2): >> iommu: Move iotlb_sync_map out from __iommu_map >> iommu: Add iova and size as parameters in iotlb_sync_map >> >> drivers/iommu/intel/iommu.c | 86 + >> drivers/iommu/iommu.c | 23 +++--- >> drivers/iommu/tegra-gart.c | 7 ++- >> include/linux/iommu.h | 3 +- >> 4 files changed, 83 insertions(+), 36 deletions(-) > > Here are results with the NFS client at stock v5.11-rc5 and the > NFS server at v5.10, showing the regression I reported earlier. > > Children see throughput for 12 initial writers = 4534582.00 kB/sec > Parent sees throughput for 12 initial writers = 4458145.56 kB/sec > Min throughput per process = 373101.59 kB/sec > Max throughput per process = 382669.50 kB/sec > Avg throughput per process = 377881.83 kB/sec > Min xfer= 1022720.00 kB > CPU Utilization: Wall time2.787CPU time1.922CPU > utilization 68.95 % > > > Children see throughput for 12 rewriters= 4542003.12 kB/sec > Parent sees throughput for 12 rewriters = 4538024.19 kB/sec > Min throughput per process = 374672.00 kB/sec > Max throughput per process = 383983.78 kB/sec > Avg throughput per process = 378500.26 kB/sec > Min xfer= 1022976.00 kB > CPU utilization: Wall time2.733CPU time1.947CPU > utilization 71.25 % > > > Children see throughput for 12 readers = 4568632.03 kB/sec > Parent sees throughput for 12 readers = 4563672.02 kB/sec > Min throughput per process = 376727.56 kB/sec > Max throughput per process = 383783.91 kB/sec > Avg throughput per process = 380719.34 kB/sec > Min xfer= 1029376.00 kB > CPU utilization: Wall time2.733CPU time1.898CPU > utilization 69.46 % > > > Children see throughput for 12 re-readers = 4610702.78 kB/sec > Parent sees throughput for 12 re-readers= 4606135.66 kB/sec > Min throughput per process = 381532.78 kB/sec > Max throughput per process = 387072.53 kB/sec > Avg throughput per process = 384225.23 kB/sec > Min xfer= 1034496.00 kB > CPU utilization: Wall time2.711CPU time1.910CPU > utilization 70.45 % > > Here's the NFS client at v5.11-rc5 with your series applied. > The NFS server remains at v5.10: > > Children see throughput for 12 initial writers = 4434778.81 kB/sec > Parent sees throughput for 12 initial writers = 4408190.69 kB/sec > Min throughput per process = 367865.28 kB/sec > Max throughput per process = 371134.38 kB/sec > Avg throughput per process = 369564.90 kB/sec > Min xfer= 1039360.00 kB > CPU Utilization: Wall time2.842CPU time1.904CPU > utilization 66.99 % > > > Children see throughput for 12 rewriters= 4476870.69 kB/sec > Parent sees throughput for 12 rewriters = 4471701.48 kB/sec > Min throughput per process = 370985.34 kB/sec > Max throughput per process = 374752.28 kB/sec > Avg throughput per process = 373072.56 kB/sec > Min xfer= 1038592.00 kB > CPU utilization: Wall time2.801CPU time1.902CPU > utilization 67.91 % > > &g