Re: [PATCH v4 7/8] linux/log2.h: Fix 64bit calculations in roundup/down_pow_two()

2019-12-03 Thread Chuck Lever
add5860e 100644
> --- a/drivers/net/ethernet/sfc/efx.h
> +++ b/drivers/net/ethernet/sfc/efx.h
> @@ -52,7 +52,7 @@ void efx_schedule_slow_fill(struct efx_rx_queue *rx_queue);
> 
> #define EFX_MAX_DMAQ_SIZE 4096UL
> #define EFX_DEFAULT_DMAQ_SIZE 1024UL
> -#define EFX_MIN_DMAQ_SIZE 512UL
> +#define EFX_MIN_DMAQ_SIZE 512ULL
> 
> #define EFX_MAX_EVQ_SIZE 16384UL
> #define EFX_MIN_EVQ_SIZE 512UL
> diff --git a/drivers/net/ethernet/sfc/falcon/efx.h 
> b/drivers/net/ethernet/sfc/falcon/efx.h
> index d3b4646545fa..0d16257156d6 100644
> --- a/drivers/net/ethernet/sfc/falcon/efx.h
> +++ b/drivers/net/ethernet/sfc/falcon/efx.h
> @@ -55,7 +55,7 @@ void ef4_schedule_slow_fill(struct ef4_rx_queue *rx_queue);
> 
> #define EF4_MAX_DMAQ_SIZE 4096UL
> #define EF4_DEFAULT_DMAQ_SIZE 1024UL
> -#define EF4_MIN_DMAQ_SIZE 512UL
> +#define EF4_MIN_DMAQ_SIZE 512ULL
> 
> #define EF4_MAX_EVQ_SIZE 16384UL
> #define EF4_MIN_EVQ_SIZE 512UL
> diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
> index c7709e49f0e4..f0391e88bc42 100644
> --- a/drivers/pci/msi.c
> +++ b/drivers/pci/msi.c
> @@ -578,7 +578,7 @@ msi_setup_entry(struct pci_dev *dev, int nvec, struct 
> irq_affinity *affd)
>   entry->msi_attrib.maskbit   = !!(control & PCI_MSI_FLAGS_MASKBIT);
>   entry->msi_attrib.default_irq   = dev->irq; /* Save IOAPIC IRQ */
>   entry->msi_attrib.multi_cap = (control & PCI_MSI_FLAGS_QMASK) >> 1;
> - entry->msi_attrib.multiple  = ilog2(__roundup_pow_of_two(nvec));
> + entry->msi_attrib.multiple  = ilog2(roundup_pow_of_two(nvec));
> 
>   if (control & PCI_MSI_FLAGS_64BIT)
>   entry->mask_pos = dev->msi_cap + PCI_MSI_MASK_64;
> diff --git a/include/linux/log2.h b/include/linux/log2.h
> index 83a4a3ca3e8a..53a727303dac 100644
> --- a/include/linux/log2.h
> +++ b/include/linux/log2.h
> @@ -47,26 +47,6 @@ bool is_power_of_2(unsigned long n)
>   return (n != 0 && ((n & (n - 1)) == 0));
> }
> 
> -/**
> - * __roundup_pow_of_two() - round up to nearest power of two
> - * @n: value to round up
> - */
> -static inline __attribute__((const))
> -unsigned long __roundup_pow_of_two(unsigned long n)
> -{
> - return 1UL << fls_long(n - 1);
> -}
> -
> -/**
> - * __rounddown_pow_of_two() - round down to nearest power of two
> - * @n: value to round down
> - */
> -static inline __attribute__((const))
> -unsigned long __rounddown_pow_of_two(unsigned long n)
> -{
> - return 1UL << (fls_long(n) - 1);
> -}
> -
> /**
>  * const_ilog2 - log base 2 of 32-bit or a 64-bit constant unsigned value
>  * @n: parameter
> @@ -170,14 +150,11 @@ unsigned long __rounddown_pow_of_two(unsigned long n)
>  * - the result is undefined when n == 0
>  * - this can be used to initialise global variables from constant data
>  */
> -#define roundup_pow_of_two(n)\
> -(\
> - __builtin_constant_p(n) ? ( \
> - (n == 1) ? 1 :  \
> - (1UL << (ilog2((n) - 1) + 1))   \
> -) :  \
> - __roundup_pow_of_two(n) \
> - )
> +#define roundup_pow_of_two(n)  \
> +(  \
> + (__builtin_constant_p(n) && ((n) == 1)) ? \
> + 1 : (1ULL << (ilog2((n) - 1) + 1))\
> +)
> 
> /**
>  * rounddown_pow_of_two - round the given value down to nearest power of two
> @@ -187,12 +164,11 @@ unsigned long __rounddown_pow_of_two(unsigned long n)
>  * - the result is undefined when n == 0
>  * - this can be used to initialise global variables from constant data
>  */
> -#define rounddown_pow_of_two(n)  \
> -(\
> - __builtin_constant_p(n) ? ( \
> - (1UL << ilog2(n))) :\
> - __rounddown_pow_of_two(n)   \
> - )
> +#define rounddown_pow_of_two(n)\
> +(  \
> + (__builtin_constant_p(n) && ((n) == 1)) ? \
> + 1 : (1ULL << (ilog2(n)))  \
> +)
> 
> static inline __attribute_const__
> int __order_base_2(unsigned long n)
> diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
> index 15d70a90b50d..bb9efc6944a4 100644
> --- a/kernel/kexec_core.c
> +++ b/kernel/kexec_core.c
> @@ -1094,7 +1094,8 @@ static int __init crash_notes_memory_init(void)
>* crash_notes is allocated inside one physical page.
>*/
>       size = sizeof(note_

Re: AMD IOMMU stops RDMA NFS from working since kernel 5.5 (bisected)

2020-02-11 Thread Chuck Lever
Andre-

Thank you for the detailed report!

Tom-

There is a rich set of trace points available in the RPC/RDMA implementation in 
5.4/5.5, fwiw.
Please keep me in the loop, let me know if there is anything I can do to help.


> On Feb 11, 2020, at 2:25 AM, Joerg Roedel  wrote:
> 
> Adding Tom's new email address.
> 
> Tom, can you have a look, please? 
> https://bugzilla.kernel.org/show_bug.cgi?id=206461 seems to be a similar
> issue.
> 
> On Tue, Feb 11, 2020 at 06:06:54AM +0100, Andre Tomt wrote:
>> Since upgrading my RDMA lab from kernel 5.4.x to 5.5.x, NFSv4 over RDMA
>> stopped working. But only on my AMD Ryzen systems. And so far only NFS,
>> curiously other RDMA diagnostic tools (like qperf  -cm1 rc_bw) work
>> fine.
>> 
>> A git bisect points to be62dbf554c5b50718a54a359372c148cd9975c7 iommu/amd:
>> Convert AMD iommu driver to the dma-iommu api
>> 
>> 5.5.3-rc1, 5.6-rc1 are also not working.
>> 
>> I verified it by booting with amd_iommu=off on the kernel cmdline - it makes
>> everything work again.
>> 
>> The NFS config is a pretty simple NFSv4.x only, sec=sys setup, running over
>> RoCEv1 on Mellanox mlx4 hardware (ConnectX-3 Pro, fw 2.42.5000). Nothing
>> fancy besides the RoCEv1 and related network bits like PFC and storage
>> VLAN. Bare metal, no virtualization.
>> 
>> The impacted systems are:
>> ASUS ROG STRIX X399-E GAMING, with a Threadripper 1950x, BIOS 1002
>> ASUS Pro WS X570-ACE, with a Ryzen 7 3700x, BIOS 1201
>> 
>> pcaps off a mirror port can be provided. They show that on 5.5.x, CM
>> succeeds, and then a couple of NFS NULL calls come through (over RoCE),
>> both acked, and then the rest just never goes out from the client until the
>> mount times out and CM is torn down.
>> 
>> No messages show up in the kernel log on either side. I was at least
>> expecting some scary IOMMU warnings.
>> 
>> More serious hardware is not available for RDMA testing currently, so I don't
>> know if an EPYC system or newer mlx5 cards would have similar issues. Intel
>> I've only tested as server so far, that worked fine, as expected given the
>> bisect result.
>> 
>> 
>>> git bisect start
>>> # bad: [d5226fa6dbae0569ee43ecfc08bdcd6770fc4755] Linux 5.5
>>> git bisect bad d5226fa6dbae0569ee43ecfc08bdcd6770fc4755
>>> # good: [219d54332a09e8d8741c1e1982f5eae56099de85] Linux 5.4
>>> git bisect good 219d54332a09e8d8741c1e1982f5eae56099de85
>>> # good: [8c39f71ee2019e77ee14f88b1321b2348db51820] Merge 
>>> git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net
>>> git bisect good 8c39f71ee2019e77ee14f88b1321b2348db51820
>>> # bad: [76bb8b05960c3d1668e6bee7624ed886cbd135ba] Merge tag 'kbuild-v5.5' 
>>> of git://git.kernel.org/pub/scm/linux/kernel/git/masahiroy/linux-kbuild
>>> git bisect bad 76bb8b05960c3d1668e6bee7624ed886cbd135ba
>>> # good: [21b26d2679584c6a60e861aa3e5ca09a6bab0633] Merge tag 
>>> '5.5-rc-smb3-fixes' of git://git.samba.org/sfrench/cifs-2.6
>>> git bisect good 21b26d2679584c6a60e861aa3e5ca09a6bab0633
>>> # good: [e5b3fc125d768eacd73bb4dc5019f0ce95635af4] Merge branch 
>>> 'x86-urgent-for-linus' of 
>>> git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
>>> git bisect good e5b3fc125d768eacd73bb4dc5019f0ce95635af4
>>> # bad: [937d6eefc716a9071f0e3bada19200de1bb9d048] Merge tag 'docs-5.5a' of 
>>> git://git.lwn.net/linux
>>> git bisect bad 937d6eefc716a9071f0e3bada19200de1bb9d048
>>> # bad: [1daa56bcfd8b329447e0c1b1e91c3925d08489b7] Merge tag 
>>> 'iommu-updates-v5.5' of 
>>> git://git.kernel.org/pub/scm/linux/kernel/git/joro/iommu
>>> git bisect bad 1daa56bcfd8b329447e0c1b1e91c3925d08489b7
>>> # good: [937790699be9c8100e5358625e7dfa8b32bd33f2] mm/page_io.c: annotate 
>>> refault stalls from swap_readpage
>>> git bisect good 937790699be9c8100e5358625e7dfa8b32bd33f2
>>> # good: [a5255bc31673c72e264d837cd13cd3085d72cb58] Merge tag 
>>> 'dmaengine-5.5-rc1' of git://git.infradead.org/users/vkoul/slave-dma
>>> git bisect good a5255bc31673c72e264d837cd13cd3085d72cb58
>>> # good: [34d1b0895dbd10713c73615d8f532e78509e12d9] iommu/arm-smmu: Remove 
>>> duplicate error message
>>> git bisect good 34d1b0895dbd10713c73615d8f532e78509e12d9
>>> # bad: [3c124435e8dd516df4b2fc983f4415386fd6edae] iommu/amd: Support 
>>> multiple PCI DMA aliases in IRQ Remapping
>>> git bisect bad 3c124435e8dd516df4b2fc983f4415386fd6edae
>>

Re: AMD IOMMU stops RDMA NFS from working since kernel 5.5 (bisected)

2020-02-11 Thread Chuck Lever



> On Feb 11, 2020, at 10:12 AM, Robin Murphy  wrote:
> 
> On 11/02/2020 1:48 pm, Chuck Lever wrote:
>> Andre-
>> Thank you for the detailed report!
>> Tom-
>> There is a rich set of trace points available in the RPC/RDMA implementation 
>> in 5.4/5.5, fwiw.
>> Please keep me in the loop, let me know if there is anything I can do to 
>> help.
> 
> One aspect that may be worth checking is whether there's anywhere that 
> assumes a successful return value from dma_map_sg() is always the same as the 
> number of entries passed in - that's the most obvious way the iommu-dma code 
> differs (legitimately) from the previous amd-iommu implementation.

net/sunrpc/xprtrdma/frwr_ops.c: frwr_map()

317 mr->mr_nents =
318 ib_dma_map_sg(ia->ri_id->device, mr->mr_sg, i, mr->mr_dir);
319 if (!mr->mr_nents)
320 goto out_dmamap_err;

Should that rather be "if (mr->mr_nents != i)" ?


> Robin.
> 
>>> On Feb 11, 2020, at 2:25 AM, Joerg Roedel  wrote:
>>> 
>>> Adding Tom's new email address.
>>> 
>>> Tom, can you have a look, please?
>>> https://bugzilla.kernel.org/show_bug.cgi?id=206461 seems to be a similar
>>> issue.
>>> 
>>> On Tue, Feb 11, 2020 at 06:06:54AM +0100, Andre Tomt wrote:
>>>> Since upgrading my RDMA lab from kernel 5.4.x to 5.5.x, NFSv4 over RDMA
>>>> stopped working. But only on my AMD Ryzen systems. And so far only NFS,
>>>> curiously other RDMA diagnostic tools (like qperf  -cm1 rc_bw) work
>>>> fine.
>>>> 
>>>> A git bisect points to be62dbf554c5b50718a54a359372c148cd9975c7 iommu/amd:
>>>> Convert AMD iommu driver to the dma-iommu api
>>>> 
>>>> 5.5.3-rc1, 5.6-rc1 are also not working.
>>>> 
>>>> I verified it by booting with amd_iommu=off on the kernel cmdline - it 
>>>> makes
>>>> everything work again.
>>>> 
>>>> The NFS config is a pretty simple NFSv4.x only, sec=sys setup, running over
>>>> RoCEv1 on Mellanox mlx4 hardware (ConnectX-3 Pro, fw 2.42.5000). Nothing
>>>> fancy besides the RoCEv1 and related network bits like PFC and storage
>>>> VLAN. Bare metal, no virtualization.
>>>> 
>>>> The impacted systems are:
>>>> ASUS ROG STRIX X399-E GAMING, with a Threadripper 1950x, BIOS 1002
>>>> ASUS Pro WS X570-ACE, with a Ryzen 7 3700x, BIOS 1201
>>>> 
>>>> pcaps off a mirror port can be provided. They show that on 5.5.x, CM
>>>> succeeds, and then a couple of NFS NULL calls come through (over RoCE),
>>>> both acked, and then the rest just never goes out from the client until the
>>>> mount times out and CM is torn down.
>>>> 
>>>> No messages show up in the kernel log on either side. I was at least
>>>> expecting some scary IOMMU warnings.
>>>> 
>>>> More serious hardware is not available for RDMA testing currently, so I don't
>>>> know if an EPYC system or newer mlx5 cards would have similar issues. Intel
>>>> I've only tested as server so far, that worked fine, as expected given the
>>>> bisect result.
>>>> 
>>>> 
>>>>> git bisect start
>>>>> # bad: [d5226fa6dbae0569ee43ecfc08bdcd6770fc4755] Linux 5.5
>>>>> git bisect bad d5226fa6dbae0569ee43ecfc08bdcd6770fc4755
>>>>> # good: [219d54332a09e8d8741c1e1982f5eae56099de85] Linux 5.4
>>>>> git bisect good 219d54332a09e8d8741c1e1982f5eae56099de85
>>>>> # good: [8c39f71ee2019e77ee14f88b1321b2348db51820] Merge 
>>>>> git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net
>>>>> git bisect good 8c39f71ee2019e77ee14f88b1321b2348db51820
>>>>> # bad: [76bb8b05960c3d1668e6bee7624ed886cbd135ba] Merge tag 'kbuild-v5.5' 
>>>>> of git://git.kernel.org/pub/scm/linux/kernel/git/masahiroy/linux-kbuild
>>>>> git bisect bad 76bb8b05960c3d1668e6bee7624ed886cbd135ba
>>>>> # good: [21b26d2679584c6a60e861aa3e5ca09a6bab0633] Merge tag 
>>>>> '5.5-rc-smb3-fixes' of git://git.samba.org/sfrench/cifs-2.6
>>>>> git bisect good 21b26d2679584c6a60e861aa3e5ca09a6bab0633
>>>>> # good: [e5b3fc125d768eacd73bb4dc5019f0ce95635af4] Merge branch 
>>>>> 'x86-urgent-for-linus' of 
>>>>> git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
>>>>> git bisect good 

Re: AMD IOMMU stops RDMA NFS from working since kernel 5.5 (bisected)

2020-02-11 Thread Chuck Lever



> On Feb 11, 2020, at 10:32 AM, Robin Murphy  wrote:
> 
> On 11/02/2020 3:24 pm, Chuck Lever wrote:
>>> On Feb 11, 2020, at 10:12 AM, Robin Murphy  wrote:
>>> 
>>> On 11/02/2020 1:48 pm, Chuck Lever wrote:
>>>> Andre-
>>>> Thank you for the detailed report!
>>>> Tom-
>>>> There is a rich set of trace points available in the RPC/RDMA 
>>>> implementation in 5.4/5.5, fwiw.
>>>> Please keep me in the loop, let me know if there is anything I can do to 
>>>> help.
>>> 
>>> One aspect that may be worth checking is whether there's anywhere that 
>>> assumes a successful return value from dma_map_sg() is always the same as 
>>> the number of entries passed in - that's the most obvious way the iommu-dma 
>>> code differs (legitimately) from the previous amd-iommu implementation.
>> net/sunrpc/xprtrdma/frwr_ops.c: frwr_map()
>> 317 mr->mr_nents =
>> 318 ib_dma_map_sg(ia->ri_id->device, mr->mr_sg, i, 
>> mr->mr_dir);
>> 319 if (!mr->mr_nents)
>> 320 goto out_dmamap_err;
>> Should that rather be "if (mr->mr_nents != i)" ?
> 
> No, that much is OK - the point is that dma_map_sg() may pack the DMA 
> addresses such that sg_dma_len(sg) > sg->length - however, subsequently 
> passing that mr->nents to dma_unmap_sg() in frwr_mr_recycle() (rather than 
> the original value of i) looks at a glance like an example of how things may 
> start to get out-of-whack.

Robin, your explanation makes sense to me. I can post a fix for this imbalance 
later today for Andre to try.
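
For reference, a minimal sketch of the dma_map_sg()/dma_unmap_sg() pairing
rule under discussion (hypothetical names, not the xprtrdma code): the count
passed to dma_map_sg() is what must be handed back to dma_unmap_sg(), while
its return value is only used for walking the mapped segments.

/* Hypothetical illustration of the dma_map_sg() contract; not xprtrdma code. */
static int example_map_and_unmap(struct device *dev, struct scatterlist *sgl,
				 int nents, enum dma_data_direction dir)
{
	struct scatterlist *sg;
	int dma_nents, i;

	dma_nents = dma_map_sg(dev, sgl, nents, dir);
	if (!dma_nents)
		return -EIO;

	/* Walk only the dma_nents segments the mapping produced ... */
	for_each_sg(sgl, sg, dma_nents, i)
		pr_debug("seg %d: %pad+%u\n", i, &sg_dma_address(sg),
			 sg_dma_len(sg));

	/* ... but always unmap with the original nents, never dma_nents. */
	dma_unmap_sg(dev, sgl, nents, dir);
	return 0;
}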


--
Chuck Lever





Re: AMD IOMMU stops RDMA NFS from working since kernel 5.5 (bisected)

2020-02-11 Thread Chuck Lever



> On Feb 11, 2020, at 11:36 AM, Robin Murphy  wrote:
> 
> On 11/02/2020 4:03 pm, Chuck Lever wrote:
>>> On Feb 11, 2020, at 10:32 AM, Robin Murphy  wrote:
>>> 
>>> On 11/02/2020 3:24 pm, Chuck Lever wrote:
>>>>> On Feb 11, 2020, at 10:12 AM, Robin Murphy  wrote:
>>>>> 
>>>>> On 11/02/2020 1:48 pm, Chuck Lever wrote:
>>>>>> Andre-
>>>>>> Thank you for the detailed report!
>>>>>> Tom-
>>>>>> There is a rich set of trace points available in the RPC/RDMA 
>>>>>> implementation in 5.4/5.5, fwiw.
>>>>>> Please keep me in the loop, let me know if there is anything I can do to 
>>>>>> help.
>>>>> 
>>>>> One aspect that may be worth checking is whether there's anywhere that 
>>>>> assumes a successful return value from dma_map_sg() is always the same as 
>>>>> the number of entries passed in - that's the most obvious way the 
>>>>> iommu-dma code differs (legitimately) from the previous amd-iommu 
>>>>> implementation.
>>>> net/sunrpc/xprtrdma/frwr_ops.c: frwr_map()
>>>> 317 mr->mr_nents =
>>>> 318 ib_dma_map_sg(ia->ri_id->device, mr->mr_sg, i, 
>>>> mr->mr_dir);
>>>> 319 if (!mr->mr_nents)
>>>> 320 goto out_dmamap_err;
>>>> Should that rather be "if (mr->mr_nents != i)" ?
>>> 
>>> No, that much is OK - the point is that dma_map_sg() may pack the DMA 
>>> addresses such that sg_dma_len(sg) > sg->length - however, subsequently 
>>> passing that mr->nents to dma_unmap_sg() in frwr_mr_recycle() (rather than 
>>> the original value of i) looks at a glance like an example of how things 
>>> may start to get out-of-whack.
>> Robin, your explanation makes sense to me. I can post a fix for this 
>> imbalance later today for Andre to try.
> 
> FWIW here's a quick hack which *should* suppress the concatenation behaviour 
> - if it makes Andre's system any happier then that would indeed point towards 
> dma_map_sg() handling being the culprit.

Even so, 1f541895dae9 ("xprtrdma: Don't defer MR recovery if ro_map fails")
looks like it introduced this problem.


> Robin.
> 
> ->8-
> diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
> index a2e96a5fd9a7..a6b71bad518e 100644
> --- a/drivers/iommu/dma-iommu.c
> +++ b/drivers/iommu/dma-iommu.c
> @@ -779,7 +779,7 @@ static int __finalise_sg(struct device *dev, struct 
> scatterlist *sg, int nents,
>* - but doesn't fall at a segment boundary
>* - and wouldn't make the resulting output segment too long
>*/
> -     if (cur_len && !s_iova_off && (dma_addr & seg_mask) &&
> + if (0 && cur_len && !s_iova_off && (dma_addr & seg_mask) &&
>   (max_len - cur_len >= s_length)) {
>   /* ...then concatenate it with the previous one */
>   cur_len += s_length;
> @@ -799,6 +799,7 @@ static int __finalise_sg(struct device *dev, struct 
> scatterlist *sg, int nents,
>   if (s_length + s_iova_off < s_iova_len)
>   cur_len = 0;
>   }
> + WARN_ON(count < nents);
>   return count;
> }

--
Chuck Lever





[PATCH v1] xprtrdma: Fix DMA scatter-gather list mapping imbalance

2020-02-11 Thread Chuck Lever
The @nents value that was passed to ib_dma_map_sg() has to be passed
to the matching ib_dma_unmap_sg() call. If ib_dma_map_sg() chooses to
concatenate sg entries, it will return a different nents value than
it was passed.

The bug was exposed by recent changes to the AMD IOMMU driver.

Reported-by: Andre Tomt 
Suggested-by: Robin Murphy 
Fixes: 1f541895dae9 ("xprtrdma: Don't defer MR recovery if ro_map fails")
Signed-off-by: Chuck Lever 
---
 net/sunrpc/xprtrdma/frwr_ops.c |5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

Hey Andre, please try this out. It just reverts the bit of brokenness that
Robin observed this morning. I've done basic testing here with Intel
IOMMU systems, no change in behavior (ie, all good to go).

diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index 095be887753e..449bb51e4fe8 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -313,10 +313,9 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt 
*r_xprt,
break;
}
mr->mr_dir = rpcrdma_data_dir(writing);
+   mr->mr_nents = i;
 
-   mr->mr_nents =
-   ib_dma_map_sg(ia->ri_id->device, mr->mr_sg, i, mr->mr_dir);
-   if (!mr->mr_nents)
+   if (!ib_dma_map_sg(ia->ri_id->device, mr->mr_sg, i, mr->mr_dir))
goto out_dmamap_err;
 
ibmr = mr->frwr.fr_mr;




Re: [PATCH v1] xprtrdma: Fix DMA scatter-gather list mapping imbalance

2020-02-11 Thread Chuck Lever
Hi Andre, thanks for trying this out.

> On Feb 11, 2020, at 3:50 PM, Andre Tomt  wrote:
> 
> On 11.02.2020 20:58, Chuck Lever wrote:
>> The @nents value that was passed to ib_dma_map_sg() has to be passed
>> to the matching ib_dma_unmap_sg() call. If ib_dma_map_sg() chooses to
>> concatenate sg entries, it will return a different nents value than
>> it was passed.
>> The bug was exposed by recent changes to the AMD IOMMU driver.
> 
> This seems to fail differently on my system; mount fails with:
> mount.nfs: mount system call failed
> 
> and the kernel log reports:
> [   38.890344] NFS: Registering the id_resolver key type
> [   38.890351] Key type id_resolver registered
> [   38.890352] Key type id_legacy registered
> [   38.901799] NFS: nfs4_discover_server_trunking unhandled error -5. Exiting 
> with error EIO
> [   38.901817] NFS4: Couldn't follow remote path
> 
> amd_iommu=off still works
> 
> One detail I accidentally left out of the original report is that the server 
> (intel system) is running Ubuntu 20.04 ("beta") userspace, and AMD clients 
> are Ubuntu 19.10 userspace. Although I don't believe this to matter at this 
> point.

Next thing to try:

# trace-cmd record -e sunrpc -e rpcrdma

then issue the mount command. Once it completes, ^C the trace-cmd and send me 
trace.dat.

Try this with both the v5.4 kernel and the v5.5 kernel (and note that trace-cmd 
overwrites trace.dat, so copy it out between tests).


>> Reported-by: Andre Tomt 
>> Suggested-by: Robin Murphy 
>> Fixes: 1f541895dae9 ("xprtrdma: Don't defer MR recovery if ro_map fails")
>> Signed-off-by: Chuck Lever 
>> ---
>>  net/sunrpc/xprtrdma/frwr_ops.c |5 ++---
>>  1 file changed, 2 insertions(+), 3 deletions(-)
>> Hey Andre, please try this out. It just reverts the bit of brokenness that
>> Robin observed this morning. I've done basic testing here with Intel
>> IOMMU systems, no change in behavior (ie, all good to go).
>> diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
>> index 095be887753e..449bb51e4fe8 100644
>> --- a/net/sunrpc/xprtrdma/frwr_ops.c
>> +++ b/net/sunrpc/xprtrdma/frwr_ops.c
>> @@ -313,10 +313,9 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt 
>> *r_xprt,
>>  break;
>>  }
>>  mr->mr_dir = rpcrdma_data_dir(writing);
>> +mr->mr_nents = i;
>>  -   mr->mr_nents =
>> -ib_dma_map_sg(ia->ri_id->device, mr->mr_sg, i, mr->mr_dir);
>> -if (!mr->mr_nents)
>> +if (!ib_dma_map_sg(ia->ri_id->device, mr->mr_sg, i, mr->mr_dir))
>>  goto out_dmamap_err;
>>  ibmr = mr->frwr.fr_mr;
> 

--
Chuck Lever





[PATCH v2] xprtrdma: Fix DMA scatter-gather list mapping imbalance

2020-02-12 Thread Chuck Lever
The @nents value that was passed to ib_dma_map_sg() has to be passed
to the matching ib_dma_unmap_sg() call. If ib_dma_map_sg() chooses to
concatenate sg entries, it will return a different nents value than
it was passed.

The bug was exposed by recent changes to the AMD IOMMU driver, which
enabled sg entry concatenation.

Looking all the way back to 4143f34e01e9 ("xprtrdma: Port to new
memory registration API") and reviewing other kernel ULPs, it's not
clear that the frwr_map() logic was ever correct for this case.

Reported-by: Andre Tomt 
Suggested-by: Robin Murphy 
Signed-off-by: Chuck Lever 
---
 include/trace/events/rpcrdma.h |6 --
 net/sunrpc/xprtrdma/frwr_ops.c |   13 +++--
 2 files changed, 11 insertions(+), 8 deletions(-)

Hi Andre, here's take 2, based on the trace data you sent me.
Please let me know if this one fares any better.

Changes since v1:
- Ensure the correct nents value is passed to ib_map_mr_sg
- Record the mr_nents value in the MR trace points

diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h
index c0e4c93324f5..023c5da45999 100644
--- a/include/trace/events/rpcrdma.h
+++ b/include/trace/events/rpcrdma.h
@@ -275,6 +275,7 @@ DECLARE_EVENT_CLASS(xprtrdma_mr,
 
TP_STRUCT__entry(
__field(const void *, mr)
+   __field(unsigned int, nents)
__field(u32, handle)
__field(u32, length)
__field(u64, offset)
@@ -283,14 +284,15 @@ DECLARE_EVENT_CLASS(xprtrdma_mr,
 
TP_fast_assign(
__entry->mr = mr;
+   __entry->nents = mr->mr_nents;
__entry->handle = mr->mr_handle;
__entry->length = mr->mr_length;
__entry->offset = mr->mr_offset;
__entry->dir= mr->mr_dir;
),
 
-   TP_printk("mr=%p %u@0x%016llx:0x%08x (%s)",
-   __entry->mr, __entry->length,
+   TP_printk("mr=%p %d %u@0x%016llx:0x%08x (%s)",
+   __entry->mr, __entry->mr_nents, __entry->length,
(unsigned long long)__entry->offset, __entry->handle,
xprtrdma_show_direction(__entry->dir)
)
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index 095be887753e..75617646702b 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -288,8 +288,8 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
 {
struct rpcrdma_ia *ia = &r_xprt->rx_ia;
struct ib_reg_wr *reg_wr;
+   int i, n, dma_nents;
struct ib_mr *ibmr;
-   int i, n;
u8 key;
 
if (nsegs > ia->ri_max_frwr_depth)
@@ -313,15 +313,16 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt 
*r_xprt,
break;
}
mr->mr_dir = rpcrdma_data_dir(writing);
+   mr->mr_nents = i;
 
-   mr->mr_nents =
-   ib_dma_map_sg(ia->ri_id->device, mr->mr_sg, i, mr->mr_dir);
-   if (!mr->mr_nents)
+   dma_nents = ib_dma_map_sg(ia->ri_id->device, mr->mr_sg,
+ mr->mr_nents, mr->mr_dir);
+   if (!dma_nents)
goto out_dmamap_err;
 
ibmr = mr->frwr.fr_mr;
-   n = ib_map_mr_sg(ibmr, mr->mr_sg, mr->mr_nents, NULL, PAGE_SIZE);
-   if (unlikely(n != mr->mr_nents))
+   n = ib_map_mr_sg(ibmr, mr->mr_sg, dma_nents, NULL, PAGE_SIZE);
+   if (n != dma_nents)
goto out_mapmr_err;
 
ibmr->iova &= 0x;




Re: [PATCH v2] xprtrdma: Fix DMA scatter-gather list mapping imbalance

2020-02-12 Thread Chuck Lever



> On Feb 12, 2020, at 8:43 AM, Chuck Lever  wrote:
> 
> The @nents value that was passed to ib_dma_map_sg() has to be passed
> to the matching ib_dma_unmap_sg() call. If ib_dma_map_sg() chooses to
> concatenate sg entries, it will return a different nents value than
> it was passed.
> 
> The bug was exposed by recent changes to the AMD IOMMU driver, which
> enabled sg entry concatenation.
> 
> Looking all the way back to 4143f34e01e9 ("xprtrdma: Port to new
> memory registration API") and reviewing other kernel ULPs, it's not
> clear that the frwr_map() logic was ever correct for this case.
> 
> Reported-by: Andre Tomt 
> Suggested-by: Robin Murphy 
> Signed-off-by: Chuck Lever 
> ---
> include/trace/events/rpcrdma.h |6 --
> net/sunrpc/xprtrdma/frwr_ops.c |   13 +++--
> 2 files changed, 11 insertions(+), 8 deletions(-)
> 
> Hi Andre, here's take 2, based on the trace data you sent me.
> Please let me know if this one fares any better.
> 
> Changes since v1:
> - Ensure the correct nents value is passed to ib_map_mr_sg
> - Record the mr_nents value in the MR trace points
> 
> diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h
> index c0e4c93324f5..023c5da45999 100644
> --- a/include/trace/events/rpcrdma.h
> +++ b/include/trace/events/rpcrdma.h
> @@ -275,6 +275,7 @@ DECLARE_EVENT_CLASS(xprtrdma_mr,
> 
>   TP_STRUCT__entry(
>   __field(const void *, mr)
> + __field(unsigned int, nents)
>   __field(u32, handle)
>   __field(u32, length)
>   __field(u64, offset)
> @@ -283,14 +284,15 @@ DECLARE_EVENT_CLASS(xprtrdma_mr,
> 
>   TP_fast_assign(
>   __entry->mr = mr;
> + __entry->nents = mr->mr_nents;
>   __entry->handle = mr->mr_handle;
>   __entry->length = mr->mr_length;
>   __entry->offset = mr->mr_offset;
>   __entry->dir= mr->mr_dir;
>   ),
> 
> - TP_printk("mr=%p %u@0x%016llx:0x%08x (%s)",
> - __entry->mr, __entry->length,
> + TP_printk("mr=%p %d %u@0x%016llx:0x%08x (%s)",
> + __entry->mr, __entry->mr_nents, __entry->length,

This should be:
__entry->mr, __entry->nents, __entry->length,

Sorry about that.


>   (unsigned long long)__entry->offset, __entry->handle,
>   xprtrdma_show_direction(__entry->dir)
>   )
> diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
> index 095be887753e..75617646702b 100644
> --- a/net/sunrpc/xprtrdma/frwr_ops.c
> +++ b/net/sunrpc/xprtrdma/frwr_ops.c
> @@ -288,8 +288,8 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt 
> *r_xprt,
> {
>   struct rpcrdma_ia *ia = &r_xprt->rx_ia;
>   struct ib_reg_wr *reg_wr;
> + int i, n, dma_nents;
>   struct ib_mr *ibmr;
> - int i, n;
>   u8 key;
> 
>   if (nsegs > ia->ri_max_frwr_depth)
> @@ -313,15 +313,16 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt 
> *r_xprt,
>   break;
>   }
>   mr->mr_dir = rpcrdma_data_dir(writing);
> + mr->mr_nents = i;
> 
> - mr->mr_nents =
> - ib_dma_map_sg(ia->ri_id->device, mr->mr_sg, i, mr->mr_dir);
> - if (!mr->mr_nents)
> + dma_nents = ib_dma_map_sg(ia->ri_id->device, mr->mr_sg,
> +   mr->mr_nents, mr->mr_dir);
> + if (!dma_nents)
>   goto out_dmamap_err;
> 
>   ibmr = mr->frwr.fr_mr;
> - n = ib_map_mr_sg(ibmr, mr->mr_sg, mr->mr_nents, NULL, PAGE_SIZE);
> - if (unlikely(n != mr->mr_nents))
> + n = ib_map_mr_sg(ibmr, mr->mr_sg, dma_nents, NULL, PAGE_SIZE);
> + if (n != dma_nents)
>   goto out_mapmr_err;
> 
>   ibmr->iova &= 0x;
> 
> 

--
Chuck Lever





Re: [PATCH v2] xprtrdma: Fix DMA scatter-gather list mapping imbalance

2020-02-12 Thread Chuck Lever



> On Feb 12, 2020, at 11:03 AM, Andre Tomt  wrote:
> 
> On 12.02.2020 14:48, Chuck Lever wrote:
>>> On Feb 12, 2020, at 8:43 AM, Chuck Lever  wrote:
>>> 
>>> The @nents value that was passed to ib_dma_map_sg() has to be passed
>>> to the matching ib_dma_unmap_sg() call. If ib_dma_map_sg() chooses to
>>> concatenate sg entries, it will return a different nents value than
>>> it was passed.
>>> 
>>> The bug was exposed by recent changes to the AMD IOMMU driver, which
>>> enabled sg entry concatenation.
>>> 
>>> Looking all the way back to 4143f34e01e9 ("xprtrdma: Port to new
>>> memory registration API") and reviewing other kernel ULPs, it's not
>>> clear that the frwr_map() logic was ever correct for this case.
>>> 
>>> Reported-by: Andre Tomt 
>>> Suggested-by: Robin Murphy 
>>> Signed-off-by: Chuck Lever 
>>> ---
>>> include/trace/events/rpcrdma.h |6 --
>>> net/sunrpc/xprtrdma/frwr_ops.c |   13 +++--
>>> 2 files changed, 11 insertions(+), 8 deletions(-)
>>> 
>>> Hi Andre, here's take 2, based on the trace data you sent me.
>>> Please let me know if this one fares any better.
>>> 
>>> Changes since v1:
>>> - Ensure the correct nents value is passed to ib_map_mr_sg
>>> - Record the mr_nents value in the MR trace points
> Verified working (with the patch correction) in my environment, with some 
> quick testing (mount + some random and bulk I/O)
> 
> client, 5.5.3 + patch + amd iommu on = OK
> client, 5.5.3 + patch + amd iommu off = OK
> client, 5.6-rc1 + patch + amd iommu on = OK
> 
> server, 5.5.3 + patch + intel iommu on = OK

Very good! I'll submit the fix through the NFS tree ASAP, and request backport 
to v5.5 stable.

--
Chuck Lever





Re: [RFT PATCH 0/3] Performance regression noted in v5.11-rc after c062db039f40

2021-01-26 Thread Chuck Lever



> On Jan 26, 2021, at 1:18 AM, Lu Baolu  wrote:
> 
> On 2021/1/26 3:31, Chuck Lever wrote:
>>> On Jan 25, 2021, at 12:39 PM, Chuck Lever  wrote:
>>> 
>>> Hello Lu -
>>> 
>>> Many thanks for your prototype.
>>> 
>>> 
>>>> On Jan 24, 2021, at 9:38 PM, Lu Baolu  wrote:
>>>> 
>>>> This patch series is only for Request-For-Testing purpose. It aims to
>>>> fix the performance regression reported here.
>>>> 
>>>> https://lore.kernel.org/linux-iommu/d81314ed-5673-44a6-b597-090e3cb83...@oracle.com/
>>>> 
>>>> The first two patches are borrowed from here.
>>>> 
>>>> https://lore.kernel.org/linux-iommu/20210107122909.16317-1-yong...@mediatek.com/
>>>> 
>>>> Please kindly help to verification.
>>>> 
>>>> Best regards,
>>>> baolu
>>>> 
>>>> Lu Baolu (1):
>>>> iommu/vt-d: Add iotlb_sync_map callback
>>>> 
>>>> Yong Wu (2):
>>>> iommu: Move iotlb_sync_map out from __iommu_map
>>>> iommu: Add iova and size as parameters in iotlb_sync_map
>>>> 
>>>> drivers/iommu/intel/iommu.c | 86 +
>>>> drivers/iommu/iommu.c   | 23 +++---
>>>> drivers/iommu/tegra-gart.c  |  7 ++-
>>>> include/linux/iommu.h   |  3 +-
>>>> 4 files changed, 83 insertions(+), 36 deletions(-)
>>> 
>>> Here are results with the NFS client at stock v5.11-rc5 and the
>>> NFS server at v5.10, showing the regression I reported earlier.
>>> 
>>> Children see throughput for 12 initial writers  = 4534582.00 kB/sec
>>> Parent sees throughput for 12 initial writers   = 4458145.56 kB/sec
>>> Min throughput per process  = 373101.59 kB/sec
>>> Max throughput per process  = 382669.50 kB/sec
>>> Avg throughput per process  = 377881.83 kB/sec
>>> Min xfer= 1022720.00 kB
>>> CPU Utilization: Wall time 2.787, CPU time 1.922, CPU utilization 68.95 %
>>> 
>>> 
>>> Children see throughput for 12 rewriters= 4542003.12 kB/sec
>>> Parent sees throughput for 12 rewriters = 4538024.19 kB/sec
>>> Min throughput per process  = 374672.00 kB/sec
>>> Max throughput per process  = 383983.78 kB/sec
>>> Avg throughput per process  = 378500.26 kB/sec
>>> Min xfer= 1022976.00 kB
>>> CPU utilization: Wall time 2.733, CPU time 1.947, CPU utilization 71.25 %
>>> 
>>> 
>>> Children see throughput for 12 readers  = 4568632.03 kB/sec
>>> Parent sees throughput for 12 readers   = 4563672.02 kB/sec
>>> Min throughput per process  = 376727.56 kB/sec
>>> Max throughput per process  = 383783.91 kB/sec
>>> Avg throughput per process  = 380719.34 kB/sec
>>> Min xfer= 1029376.00 kB
>>> CPU utilization: Wall time 2.733, CPU time 1.898, CPU utilization 69.46 %
>>> 
>>> 
>>> Children see throughput for 12 re-readers   = 4610702.78 kB/sec
>>> Parent sees throughput for 12 re-readers= 4606135.66 kB/sec
>>> Min throughput per process  = 381532.78 kB/sec
>>> Max throughput per process  = 387072.53 kB/sec
>>> Avg throughput per process  = 384225.23 kB/sec
>>> Min xfer= 1034496.00 kB
>>> CPU utilization: Wall time 2.711, CPU time 1.910, CPU utilization 70.45 %
>>> 
>>> Here's the NFS client at v5.11-rc5 with your series applied.
>>> The NFS server remains at v5.10:
>>> 
>>> Children see throughput for 12 initial writers  = 4434778.81 kB/sec
>>> Parent sees throughput for 12 initial writers   = 4408190.69 kB/sec
>>> Min throughput per process  = 367865.28 kB/sec
>>> Max throughput per process  = 371134.38 kB/sec
>>> Avg throughput per process  = 369564.90 kB/sec
>>> Min xfer  

Re: [RFT PATCH 0/3] Performance regression noted in v5.11-rc after c062db039f40

2021-01-26 Thread Chuck Lever
Hi Robin-

> On Jan 26, 2021, at 11:05 AM, Robin Murphy  wrote:
> 
> Implementing .iotlb_sync_map means that a single top-level 
> iommu_map()/iommu_map_sg() call should still only invoke a single "TLB flush" 
> (really, any maintenance required for the IOMMU to start using the new 
> mapping) at the end, regardless of how many internal __iommu_map() calls are 
> made to satisfy the overall request. If you're seeing something other than 
> that behaviour (with this series), that implies we've not got things quite 
> right yet.


The flush is expensive, but it's not the only cost. DMA-mapping a 120KB
SGL in a single domain_mapping() call vs. 30 calls is certainly going to
be a detectable difference.

Naively speaking, if there are more DMA mappings to keep track of because
the IOMMU driver isn't coalescing the SGLs the way it did before, that
might trigger TLB thrashing on the NIC.
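
As a hypothetical way to quantify that (not code from any posted patch), a
helper like the one below would report how much coalescing dma_map_sg()
achieved for a given scatterlist; comparing its output on v5.10 and v5.11-rc
would show whether the number of mappings the NIC has to track really grew.

/* Hypothetical debugging helper, not part of any posted patch. */
static void example_report_coalescing(struct device *dev,
				      struct scatterlist *sgl, int nents,
				      enum dma_data_direction dir)
{
	int dma_nents = dma_map_sg(dev, sgl, nents, dir);

	if (!dma_nents)
		return;

	dev_dbg(dev, "%d sg entries mapped into %d DMA segments\n",
		nents, dma_nents);

	/* Unmap with the original nents, as discussed in the earlier thread. */
	dma_unmap_sg(dev, sgl, nents, dir);
}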


> Is there any significant difference between how the NFS read and write paths 
> make their DMA API calls and/or get their scatterlists in the first place, 
> that might help shed some light on the curious half-recovery you got?

There isn't a difference in the RPC-over-RDMA code. Client-side DMA mapping
is handled in net/sunrpc/xprtrdma/frwr_ops.c :: frwr_map() which is used for
both I/O directions.

On the server, the RDMA core r/w API is used for mapping and then posting
RDMA Read and Write work requests. That API appears in
drivers/infiniband/core/rw.c , and as far as I understand, the same mapping
functions are used for both I/O directions.

It's possible that the NIC is doing something different for RDMA Read and
RDMA Write, but I don't have much visibility into that. Reads are very
different from Writes, which are posted.


--
Chuck Lever





Re: [RFT PATCH 0/3] Performance regression noted in v5.11-rc after c062db039f40

2021-01-26 Thread Chuck Lever



> On Jan 26, 2021, at 8:53 PM, Lu Baolu  wrote:
> 
> Hi Chuck,
> 
> On 1/26/21 11:52 PM, Chuck Lever wrote:
>>> On Jan 26, 2021, at 1:18 AM, Lu Baolu  wrote:
>>> 
>>> On 2021/1/26 3:31, Chuck Lever wrote:
>>>>> On Jan 25, 2021, at 12:39 PM, Chuck Lever  wrote:
>>>>> 
>>>>> Hello Lu -
>>>>> 
>>>>> Many thanks for your prototype.
>>>>> 
>>>>> 
>>>>>> On Jan 24, 2021, at 9:38 PM, Lu Baolu  wrote:
>>>>>> 
>>>>>> This patch series is only for Request-For-Testing purpose. It aims to
>>>>>> fix the performance regression reported here.
>>>>>> 
>>>>>> https://lore.kernel.org/linux-iommu/d81314ed-5673-44a6-b597-090e3cb83...@oracle.com/
>>>>>> 
>>>>>> The first two patches are borrowed from here.
>>>>>> 
>>>>>> https://lore.kernel.org/linux-iommu/20210107122909.16317-1-yong...@mediatek.com/
>>>>>> 
>>>>>> Please kindly help to verification.
>>>>>> 
>>>>>> Best regards,
>>>>>> baolu
>>>>>> 
>>>>>> Lu Baolu (1):
>>>>>> iommu/vt-d: Add iotlb_sync_map callback
>>>>>> 
>>>>>> Yong Wu (2):
>>>>>> iommu: Move iotlb_sync_map out from __iommu_map
>>>>>> iommu: Add iova and size as parameters in iotlb_sync_map
>>>>>> 
>>>>>> drivers/iommu/intel/iommu.c | 86 +
>>>>>> drivers/iommu/iommu.c   | 23 +++---
>>>>>> drivers/iommu/tegra-gart.c  |  7 ++-
>>>>>> include/linux/iommu.h   |  3 +-
>>>>>> 4 files changed, 83 insertions(+), 36 deletions(-)
>>>>> 
>>>>> Here are results with the NFS client at stock v5.11-rc5 and the
>>>>> NFS server at v5.10, showing the regression I reported earlier.
>>>>> 
>>>>>   Children see throughput for 12 initial writers  = 4534582.00 kB/sec
>>>>>   Parent sees throughput for 12 initial writers   = 4458145.56 kB/sec
>>>>>   Min throughput per process  = 373101.59 kB/sec
>>>>>   Max throughput per process  = 382669.50 kB/sec
>>>>>   Avg throughput per process  = 377881.83 kB/sec
>>>>>   Min xfer= 1022720.00 kB
>>>>>   CPU Utilization: Wall time 2.787, CPU time 1.922, CPU utilization 68.95 %
>>>>> 
>>>>> 
>>>>>   Children see throughput for 12 rewriters= 4542003.12 kB/sec
>>>>>   Parent sees throughput for 12 rewriters = 4538024.19 kB/sec
>>>>>   Min throughput per process  = 374672.00 kB/sec
>>>>>   Max throughput per process  = 383983.78 kB/sec
>>>>>   Avg throughput per process  = 378500.26 kB/sec
>>>>>   Min xfer= 1022976.00 kB
>>>>>   CPU utilization: Wall time 2.733, CPU time 1.947, CPU utilization 71.25 %
>>>>> 
>>>>> 
>>>>>   Children see throughput for 12 readers  = 4568632.03 kB/sec
>>>>>   Parent sees throughput for 12 readers   = 4563672.02 kB/sec
>>>>>   Min throughput per process  = 376727.56 kB/sec
>>>>>   Max throughput per process  = 383783.91 kB/sec
>>>>>   Avg throughput per process  = 380719.34 kB/sec
>>>>>   Min xfer= 1029376.00 kB
>>>>>   CPU utilization: Wall time 2.733, CPU time 1.898, CPU utilization 69.46 %
>>>>> 
>>>>> 
>>>>>   Children see throughput for 12 re-readers   = 4610702.78 kB/sec
>>>>>   Parent sees throughput for 12 re-readers= 4606135.66 kB/sec
>>>>>   Min throughput per process  = 381532.78 kB/sec
>>>>>   Max throughput per process  = 387072.53 kB/sec
>>>>>   Avg throughput per process  = 384225.23 kB/sec
>>>>>   Min xfer= 1034496.00 kB
>>>>>   CPU utilization: Wall time   

[PATCH RFC 0/9] Possible set of VT-d optimizations

2021-01-27 Thread Chuck Lever
Hi-

This collection of patches seems to get the best throughput results
so far. The NFS WRITE result is fully restored, and the NFS READ
result is very close to fully restored.

Children see throughput for 12 initial writers  = 5008474.03 kB/sec
Parent sees throughput for 12 initial writers   = 4996927.80 kB/sec
Min throughput per process  = 416956.88 kB/sec 
Max throughput per process  = 417910.22 kB/sec
Avg throughput per process  = 417372.84 kB/sec
Min xfer= 1046272.00 kB
CPU Utilization: Wall time 2.515, CPU time 1.996, CPU utilization 79.37 %


Children see throughput for 12 rewriters= 5020584.59 kB/sec
Parent sees throughput for 12 rewriters = 5012539.29 kB/sec
Min throughput per process  = 417799.00 kB/sec 
Max throughput per process  = 419082.22 kB/sec
Avg throughput per process  = 418382.05 kB/sec
Min xfer= 1046528.00 kB
CPU utilization: Wall time 2.507, CPU time 2.024, CPU utilization 80.73 %


Children see throughput for 12 readers  = 5805484.25 kB/sec
Parent sees throughput for 12 readers   = 5799535.68 kB/sec
Min throughput per process  = 482888.16 kB/sec 
Max throughput per process  = 48.16 kB/sec
Avg throughput per process  = 483790.35 kB/sec
Min xfer= 1045760.00 kB
CPU utilization: Wall time 2.167, CPU time 1.964, CPU utilization 90.63 %


Children see throughput for 12 re-readers   = 5812227.16 kB/sec
Parent sees throughput for 12 re-readers= 5803793.06 kB/sec
Min throughput per process  = 483242.97 kB/sec 
Max throughput per process  = 485724.41 kB/sec
Avg throughput per process  = 484352.26 kB/sec
Min xfer= 1043456.00 kB
CPU utilization: Wall time 2.161, CPU time 1.976, CPU utilization 91.45 %

I've included a simple-minded implementation of a map_sg op for
the Intel IOMMU. This is nothing more than a copy of the loop in
__iommu_map_sg() with the call to __iommu_map() replaced with a
call to intel_iommu_map().
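
Since that patch (9/9) is not reproduced below, here is an illustrative
sketch of the idea, reconstructed from the __iommu_map_sg() loop visible in
patches 5/9 and 6/9 and from the .map_sg op signature added in patch 7/9;
the posted code may differ in detail.

/*
 * Illustrative sketch only: mirrors the __iommu_map_sg() loop, but maps
 * each physically contiguous run of sg entries with intel_iommu_map()
 * directly, so the whole scatterlist is handled in one driver call.
 */
static int intel_iommu_map_sg(struct iommu_domain *domain, unsigned long iova,
			      struct scatterlist *sg, unsigned int nents,
			      int prot, gfp_t gfp, size_t *mapped)
{
	unsigned int i = 0;
	phys_addr_t start;
	size_t len = 0;
	int ret;

	*mapped = 0;

	while (i <= nents) {
		phys_addr_t s_phys = sg_phys(sg);

		/* A discontiguity ends the current run: map it in one go. */
		if (len && s_phys != start + len) {
			ret = intel_iommu_map(domain, iova + *mapped, start,
					      len, prot, gfp);
			if (ret)
				return ret;
			*mapped += len;
			len = 0;
		}

		if (len) {
			len += sg->length;
		} else {
			len = sg->length;
			start = s_phys;
		}

		if (++i < nents)
			sg = sg_next(sg);
	}

	return 0;
}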

---

Chuck Lever (1):
  iommu/vt-d: Introduce map_sg() for Intel IOMMUs

Isaac J. Manjarres (5):
  iommu/io-pgtable: Introduce map_sg() as a page table op
  iommu/io-pgtable-arm: Hook up map_sg()
  iommu/io-pgtable-arm-v7s: Hook up map_sg()
  iommu: Introduce map_sg() as an IOMMU op for IOMMU drivers
  iommu/arm-smmu: Hook up map_sg()

Lu Baolu (1):
  iommu/vt-d: Add iotlb_sync_map callback

Yong Wu (2):
  iommu: Move iotlb_sync_map out from __iommu_map
  iommu: Add iova and size as parameters in iotlb_sync_map


 drivers/iommu/arm/arm-smmu/arm-smmu.c |  19 
 drivers/iommu/intel/iommu.c   | 131 --
 drivers/iommu/io-pgtable-arm-v7s.c|  90 ++
 drivers/iommu/io-pgtable-arm.c|  86 +
 drivers/iommu/iommu.c |  47 +++--
 drivers/iommu/tegra-gart.c|   7 +-
 include/linux/iommu.h |  16 +++-
 7 files changed, 353 insertions(+), 43 deletions(-)

--
Chuck Lever



[PATCH RFC 1/9] iommu: Move iotlb_sync_map out from __iommu_map

2021-01-27 Thread Chuck Lever
From: Yong Wu 

At the end of __iommu_map(), iotlb_sync_map is always called.

This patch moves iotlb_sync_map out of __iommu_map, since it is
unnecessary to call it for each sg segment, especially while
iotlb_sync_map currently flushes the entire TLB. Add a little
helper, _iommu_map(), for this.

Signed-off-by: Yong Wu 
Reviewed-by: Robin Murphy 
Signed-off-by: Chuck Lever 
---
 drivers/iommu/iommu.c |   23 ++-
 1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index ffeebda8d6de..c304a6a30d42 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -2426,9 +2426,6 @@ static int __iommu_map(struct iommu_domain *domain, 
unsigned long iova,
size -= pgsize;
}
 
-   if (ops->iotlb_sync_map)
-   ops->iotlb_sync_map(domain);
-
/* unroll mapping in case something went wrong */
if (ret)
iommu_unmap(domain, orig_iova, orig_size - size);
@@ -2438,18 +2435,31 @@ static int __iommu_map(struct iommu_domain *domain, 
unsigned long iova,
return ret;
 }
 
+static int _iommu_map(struct iommu_domain *domain, unsigned long iova,
+ phys_addr_t paddr, size_t size, int prot, gfp_t gfp)
+{
+   const struct iommu_ops *ops = domain->ops;
+   int ret;
+
+   ret = __iommu_map(domain, iova, paddr, size, prot, GFP_KERNEL);
+   if (ret == 0 && ops->iotlb_sync_map)
+   ops->iotlb_sync_map(domain);
+
+   return ret;
+}
+
 int iommu_map(struct iommu_domain *domain, unsigned long iova,
  phys_addr_t paddr, size_t size, int prot)
 {
might_sleep();
-   return __iommu_map(domain, iova, paddr, size, prot, GFP_KERNEL);
+   return _iommu_map(domain, iova, paddr, size, prot, GFP_KERNEL);
 }
 EXPORT_SYMBOL_GPL(iommu_map);
 
 int iommu_map_atomic(struct iommu_domain *domain, unsigned long iova,
  phys_addr_t paddr, size_t size, int prot)
 {
-   return __iommu_map(domain, iova, paddr, size, prot, GFP_ATOMIC);
+   return _iommu_map(domain, iova, paddr, size, prot, GFP_ATOMIC);
 }
 EXPORT_SYMBOL_GPL(iommu_map_atomic);
 
@@ -2533,6 +2543,7 @@ static size_t __iommu_map_sg(struct iommu_domain *domain, 
unsigned long iova,
 struct scatterlist *sg, unsigned int nents, int 
prot,
 gfp_t gfp)
 {
+   const struct iommu_ops *ops = domain->ops;
size_t len = 0, mapped = 0;
phys_addr_t start;
unsigned int i = 0;
@@ -2563,6 +2574,8 @@ static size_t __iommu_map_sg(struct iommu_domain *domain, 
unsigned long iova,
sg = sg_next(sg);
}
 
+   if (ops->iotlb_sync_map)
+   ops->iotlb_sync_map(domain);
return mapped;
 
 out_err:




[PATCH RFC 2/9] iommu: Add iova and size as parameters in iotlb_sync_map

2021-01-27 Thread Chuck Lever
From: Yong Wu 

iotlb_sync_map allows IOMMU drivers to sync the TLB after completing
the whole mapping. This patch adds iova and size as parameters to it,
so the IOMMU driver can flush the TLB over the whole range at once
after the iova mapping completes, to improve performance.

Signed-off-by: Yong Wu 
Reviewed-by: Robin Murphy 
Signed-off-by: Chuck Lever 
---
 drivers/iommu/iommu.c  |4 ++--
 drivers/iommu/tegra-gart.c |7 +--
 include/linux/iommu.h  |3 ++-
 3 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index c304a6a30d42..3d099a31ddca 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -2443,7 +2443,7 @@ static int _iommu_map(struct iommu_domain *domain, 
unsigned long iova,
 
ret = __iommu_map(domain, iova, paddr, size, prot, GFP_KERNEL);
if (ret == 0 && ops->iotlb_sync_map)
-   ops->iotlb_sync_map(domain);
+   ops->iotlb_sync_map(domain, iova, size);
 
return ret;
 }
@@ -2575,7 +2575,7 @@ static size_t __iommu_map_sg(struct iommu_domain *domain, 
unsigned long iova,
}
 
if (ops->iotlb_sync_map)
-   ops->iotlb_sync_map(domain);
+   ops->iotlb_sync_map(domain, iova, mapped);
return mapped;
 
 out_err:
diff --git a/drivers/iommu/tegra-gart.c b/drivers/iommu/tegra-gart.c
index fac720273889..05e8e19b8269 100644
--- a/drivers/iommu/tegra-gart.c
+++ b/drivers/iommu/tegra-gart.c
@@ -261,7 +261,8 @@ static int gart_iommu_of_xlate(struct device *dev,
return 0;
 }
 
-static void gart_iommu_sync_map(struct iommu_domain *domain)
+static void gart_iommu_sync_map(struct iommu_domain *domain, unsigned long 
iova,
+   size_t size)
 {
FLUSH_GART_REGS(gart_handle);
 }
@@ -269,7 +270,9 @@ static void gart_iommu_sync_map(struct iommu_domain *domain)
 static void gart_iommu_sync(struct iommu_domain *domain,
struct iommu_iotlb_gather *gather)
 {
-   gart_iommu_sync_map(domain);
+   size_t length = gather->end - gather->start;
+
+   gart_iommu_sync_map(domain, gather->start, length);
 }
 
 static const struct iommu_ops gart_iommu_ops = {
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index b3f0e2018c62..9ce0aa9e236b 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -246,7 +246,8 @@ struct iommu_ops {
size_t (*unmap)(struct iommu_domain *domain, unsigned long iova,
 size_t size, struct iommu_iotlb_gather *iotlb_gather);
void (*flush_iotlb_all)(struct iommu_domain *domain);
-   void (*iotlb_sync_map)(struct iommu_domain *domain);
+   void (*iotlb_sync_map)(struct iommu_domain *domain, unsigned long iova,
+  size_t size);
void (*iotlb_sync)(struct iommu_domain *domain,
   struct iommu_iotlb_gather *iotlb_gather);
phys_addr_t (*iova_to_phys)(struct iommu_domain *domain, dma_addr_t 
iova);




[PATCH RFC 6/9] iommu/io-pgtable-arm-v7s: Hook up map_sg()

2021-01-27 Thread Chuck Lever
From: Isaac J. Manjarres 

Implement the map_sg io-pgtable op for the ARMv7s io-pgtable
code, so that IOMMU drivers can call it when they need to map
a scatter-gather list.

Signed-off-by: Isaac J. Manjarres 
Tested-by: Sai Prakash Ranjan 
Signed-off-by: Chuck Lever 
---
 drivers/iommu/io-pgtable-arm-v7s.c |   90 
 1 file changed, 90 insertions(+)

diff --git a/drivers/iommu/io-pgtable-arm-v7s.c 
b/drivers/iommu/io-pgtable-arm-v7s.c
index 1d92ac948db7..8665dabb753b 100644
--- a/drivers/iommu/io-pgtable-arm-v7s.c
+++ b/drivers/iommu/io-pgtable-arm-v7s.c
@@ -545,6 +545,95 @@ static int arm_v7s_map(struct io_pgtable_ops *ops, 
unsigned long iova,
return ret;
 }
 
+static int arm_v7s_map_by_pgsize(struct io_pgtable_ops *ops,
+unsigned long iova, phys_addr_t paddr,
+size_t size, int prot, gfp_t gfp,
+size_t *mapped)
+{
+   struct arm_v7s_io_pgtable *data = io_pgtable_ops_to_data(ops);
+   struct io_pgtable *iop = &data->iop;
+   struct io_pgtable_cfg *cfg = &iop->cfg;
+   unsigned int min_pagesz = 1 << __ffs(cfg->pgsize_bitmap);
+   int ret;
+   size_t pgsize;
+
+   if (!IS_ALIGNED(iova | paddr | size, min_pagesz)) {
+   pr_err("unaligned: iova 0x%lx pa %pa size 0x%zx min_pagesz 
0x%x\n",
+  iova, &paddr, size, min_pagesz);
+   return -EINVAL;
+   }
+
+   if (WARN_ON((iova + size - 1) >= (1ULL << cfg->ias) ||
+   (paddr + size - 1) >= (1ULL << cfg->oas)))
+   return -ERANGE;
+
+   while (size) {
+   pgsize = iommu_pgsize(cfg->pgsize_bitmap, iova | paddr, size);
+   ret = __arm_v7s_map(data, iova, paddr, pgsize, prot, 1,
+   data->pgd, gfp);
+
+   if (iop->cfg.quirks & IO_PGTABLE_QUIRK_TLBI_ON_MAP) {
+   io_pgtable_tlb_flush_walk(&data->iop, iova, size,
+ ARM_V7S_BLOCK_SIZE(2));
+   } else {
+   wmb();
+   }
+
+   if (ret)
+   return ret;
+
+   iova += pgsize;
+   paddr += pgsize;
+   *mapped += pgsize;
+   size -= pgsize;
+   }
+
+   return 0;
+}
+
+static int arm_v7s_map_sg(struct io_pgtable_ops *ops, unsigned long iova,
+ struct scatterlist *sg, unsigned int nents,
+ int iommu_prot, gfp_t gfp, size_t *mapped)
+{
+   size_t len = 0;
+   unsigned int i = 0;
+   int ret;
+   phys_addr_t start;
+
+   *mapped = 0;
+
+   /* If no access, then nothing to do */
+   if (!(iommu_prot & (IOMMU_READ | IOMMU_WRITE)))
+   return 0;
+
+   while (i <= nents) {
+   phys_addr_t s_phys = sg_phys(sg);
+
+   if (len && s_phys != start + len) {
+   ret = arm_v7s_map_by_pgsize(ops, iova + *mapped, start,
+   len, iommu_prot, gfp,
+   mapped);
+
+   if (ret)
+   return ret;
+
+   len = 0;
+   }
+
+   if (len) {
+   len += sg->length;
+   } else {
+   len = sg->length;
+   start = s_phys;
+   }
+
+   if (++i < nents)
+   sg = sg_next(sg);
+   }
+
+   return 0;
+}
+
 static void arm_v7s_free_pgtable(struct io_pgtable *iop)
 {
struct arm_v7s_io_pgtable *data = io_pgtable_to_data(iop);
@@ -783,6 +872,7 @@ static struct io_pgtable *arm_v7s_alloc_pgtable(struct 
io_pgtable_cfg *cfg,
 
data->iop.ops = (struct io_pgtable_ops) {
.map= arm_v7s_map,
+   .map_sg = arm_v7s_map_sg,
.unmap  = arm_v7s_unmap,
.iova_to_phys   = arm_v7s_iova_to_phys,
};




[PATCH RFC 5/9] iommu/io-pgtable-arm: Hook up map_sg()

2021-01-27 Thread Chuck Lever
From: Isaac J. Manjarres 

Implement the map_sg io-pgtable op for the ARM LPAE io-pgtable
code, so that IOMMU drivers can call it when they need to map
a scatter-gather list.

Signed-off-by: Isaac J. Manjarres 
Tested-by: Sai Prakash Ranjan 
Signed-off-by: Chuck Lever 
---
 drivers/iommu/io-pgtable-arm.c |   86 
 drivers/iommu/iommu.c  |   12 +++---
 include/linux/iommu.h  |8 
 3 files changed, 101 insertions(+), 5 deletions(-)

diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
index 87def58e79b5..0c11529442b8 100644
--- a/drivers/iommu/io-pgtable-arm.c
+++ b/drivers/iommu/io-pgtable-arm.c
@@ -473,6 +473,91 @@ static int arm_lpae_map(struct io_pgtable_ops *ops, 
unsigned long iova,
return ret;
 }
 
+static int arm_lpae_map_by_pgsize(struct io_pgtable_ops *ops,
+ unsigned long iova, phys_addr_t paddr,
+ size_t size, int iommu_prot, gfp_t gfp,
+ size_t *mapped)
+{
+   struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops);
+   struct io_pgtable_cfg *cfg = &data->iop.cfg;
+   arm_lpae_iopte *ptep = data->pgd;
+   int ret, lvl = data->start_level;
+   arm_lpae_iopte prot = arm_lpae_prot_to_pte(data, iommu_prot);
+   unsigned int min_pagesz = 1 << __ffs(cfg->pgsize_bitmap);
+   long iaext = (s64)(iova + size - 1) >> cfg->ias;
+   size_t pgsize;
+
+   if (!IS_ALIGNED(iova | paddr | size, min_pagesz)) {
+   pr_err("unaligned: iova 0x%lx pa %pa size 0x%zx min_pagesz 
0x%x\n",
+  iova, &paddr, size, min_pagesz);
+   return -EINVAL;
+   }
+
+   if (cfg->quirks & IO_PGTABLE_QUIRK_ARM_TTBR1)
+   iaext = ~iaext;
+   if (WARN_ON(iaext || (paddr + size - 1) >> cfg->oas))
+   return -ERANGE;
+
+   while (size) {
+   pgsize = iommu_pgsize(cfg->pgsize_bitmap, iova | paddr, size);
+   ret = __arm_lpae_map(data, iova, paddr, pgsize, prot, lvl, ptep,
+gfp);
+   if (ret)
+   return ret;
+
+   iova += pgsize;
+   paddr += pgsize;
+   *mapped += pgsize;
+   size -= pgsize;
+   }
+
+   return 0;
+}
+
+static int arm_lpae_map_sg(struct io_pgtable_ops *ops, unsigned long iova,
+  struct scatterlist *sg, unsigned int nents,
+  int iommu_prot, gfp_t gfp, size_t *mapped)
+{
+
+   size_t len = 0;
+   unsigned int i = 0;
+   int ret;
+   phys_addr_t start;
+
+   *mapped = 0;
+
+   /* If no access, then nothing to do */
+   if (!(iommu_prot & (IOMMU_READ | IOMMU_WRITE)))
+   return 0;
+
+   while (i <= nents) {
+   phys_addr_t s_phys = sg_phys(sg);
+
+   if (len && s_phys != start + len) {
+   ret = arm_lpae_map_by_pgsize(ops, iova + *mapped, start,
+len, iommu_prot, gfp,
+mapped);
+
+   if (ret)
+   return ret;
+
+   len = 0;
+   }
+
+   if (len) {
+   len += sg->length;
+   } else {
+   len = sg->length;
+   start = s_phys;
+   }
+
+   if (++i < nents)
+   sg = sg_next(sg);
+   }
+
+   return 0;
+}
+
 static void __arm_lpae_free_pgtable(struct arm_lpae_io_pgtable *data, int lvl,
arm_lpae_iopte *ptep)
 {
@@ -750,6 +835,7 @@ arm_lpae_alloc_pgtable(struct io_pgtable_cfg *cfg)
 
data->iop.ops = (struct io_pgtable_ops) {
.map= arm_lpae_map,
+   .map_sg = arm_lpae_map_sg,
.unmap  = arm_lpae_unmap,
.iova_to_phys   = arm_lpae_iova_to_phys,
};
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 3d099a31ddca..ed879a4d7fac 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -2346,8 +2346,8 @@ phys_addr_t iommu_iova_to_phys(struct iommu_domain 
*domain, dma_addr_t iova)
 }
 EXPORT_SYMBOL_GPL(iommu_iova_to_phys);
 
-static size_t iommu_pgsize(struct iommu_domain *domain,
-  unsigned long addr_merge, size_t size)
+size_t iommu_pgsize(unsigned long pgsize_bitmap, unsigned long addr_merge,
+   size_t size)
 {
unsigned int pgsize_idx;
size_t pgsize;
@@ -2366,7 +2366,7 @@ static size_t iommu_pgsize(struct iommu_domain *domain,
pgsize = (1UL << (pgsize_idx + 1)) - 1;
 
/* throw away page sizes not supp

[PATCH RFC 7/9] iommu: Introduce map_sg() as an IOMMU op for IOMMU drivers

2021-01-27 Thread Chuck Lever
From: Isaac J. Manjarres 

Add support for IOMMU drivers to have their own map_sg() callbacks.
This completes the path for having iommu_map_sg() invoke an IOMMU
driver's map_sg() callback, which can then invoke the io-pgtable
map_sg() callback with the entire scatter-gather list, so that it
can be processed entirely in the io-pgtable layer.

For IOMMU drivers that do not provide a callback, the default
implementation of iterating through the scatter-gather list while
calling iommu_map() will be used.

Signed-off-by: Isaac J. Manjarres 
Tested-by: Sai Prakash Ranjan 
[ cel: adjusted new iotlb_sync_map call site ]
Signed-off-by: Chuck Lever 
---
 drivers/iommu/iommu.c |   12 
 include/linux/iommu.h |5 +
 2 files changed, 17 insertions(+)

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index ed879a4d7fac..bd7adbd0339b 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -2551,6 +2551,18 @@ static size_t __iommu_map_sg(struct iommu_domain 
*domain, unsigned long iova,
unsigned int i = 0;
int ret;
 
+   if (ops->map_sg) {
+   ret = ops->map_sg(domain, iova, sg, nents, prot, gfp, &mapped);
+
+   if (ops->iotlb_sync_map)
+   ops->iotlb_sync_map(domain, iova, mapped);
+
+   if (ret)
+   goto out_err;
+
+   return mapped;
+   }
+
while (i <= nents) {
phys_addr_t s_phys = sg_phys(sg);
 
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index cd5f35022a25..667edc7b034a 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -192,6 +192,8 @@ struct iommu_iotlb_gather {
  * @attach_dev: attach device to an iommu domain
  * @detach_dev: detach device from an iommu domain
  * @map: map a physically contiguous memory region to an iommu domain
+ * @map_sg: map a scatter-gather list of physically contiguous chunks to
+ *  an iommu domain.
  * @unmap: unmap a physically contiguous memory region from an iommu domain
  * @flush_iotlb_all: Synchronously flush all hardware TLBs for this domain
  * @iotlb_sync_map: Sync mappings created recently using @map to the hardware
@@ -243,6 +245,9 @@ struct iommu_ops {
void (*detach_dev)(struct iommu_domain *domain, struct device *dev);
int (*map)(struct iommu_domain *domain, unsigned long iova,
   phys_addr_t paddr, size_t size, int prot, gfp_t gfp);
+   int (*map_sg)(struct iommu_domain *domain, unsigned long iova,
+ struct scatterlist *sg, unsigned int nents, int prot,
+ gfp_t gfp, size_t *mapped);
size_t (*unmap)(struct iommu_domain *domain, unsigned long iova,
 size_t size, struct iommu_iotlb_gather *iotlb_gather);
void (*flush_iotlb_all)(struct iommu_domain *domain);
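
To go with the commit message above, here is a hedged sketch of the caller
side: example_map_buffer() is a hypothetical illustration, not part of this
patch set. With the series applied, a driver that registers .map_sg receives
the whole scatterlist in one indirect call; everyone else keeps the
per-element iommu_map() fallback.

/*
 * Hypothetical caller of iommu_map_sg() (illustration only).
 * iommu_map_sg() returns the number of bytes mapped, or 0 on failure.
 */
static int example_map_buffer(struct iommu_domain *domain, unsigned long iova,
			      struct scatterlist *sgl, unsigned int nents)
{
	size_t mapped;

	mapped = iommu_map_sg(domain, iova, sgl, nents,
			      IOMMU_READ | IOMMU_WRITE);
	if (!mapped)
		return -ENOMEM;		/* nothing could be mapped */

	/* ... hand 'iova' and 'mapped' bytes to the device ... */
	return 0;
}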




[PATCH RFC 3/9] iommu/vt-d: Add iotlb_sync_map callback

2021-01-27 Thread Chuck Lever
From: Lu Baolu 

Some Intel VT-d hardware implementations don't support memory coherency
for page table walk (presented by the Page-Walk-coherency bit in the
ecap register), so that software must flush the corresponding CPU cache
lines explicitly after each page table entry update.

The iommu_map_sg() code iterates through the given scatter-gather list
and invokes iommu_map() for each element, which calls into the vendor
IOMMU driver through an iommu_ops callback. As a result, a single sg
mapping may lead to multiple cache line flushes, which degrades I/O
performance after commit c588072bba6b ("iommu/vt-d: Convert intel iommu
driver to the iommu ops").

Fix this by adding an iotlb_sync_map callback and centralizing the
clflush operations after all sg mappings.

Fixes: c588072bba6b5 ("iommu/vt-d: Convert intel iommu driver to the iommu ops")
Reported-by: Chuck Lever 
Link: 
https://lore.kernel.org/linux-iommu/d81314ed-5673-44a6-b597-090e3cb83...@oracle.com/
Signed-off-by: Lu Baolu 
Cc: Robin Murphy 
[ cel: removed @first_pte, which is no longer used ]
Signed-off-by: Chuck Lever 
---
 drivers/iommu/intel/iommu.c |   90 +--
 1 file changed, 60 insertions(+), 30 deletions(-)

diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index f665322a0991..013097b6d55f 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -2298,9 +2298,9 @@ static int
 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
 unsigned long phys_pfn, unsigned long nr_pages, int prot)
 {
-   struct dma_pte *first_pte = NULL, *pte = NULL;
unsigned int largepage_lvl = 0;
unsigned long lvl_pages = 0;
+   struct dma_pte *pte = NULL;
phys_addr_t pteval;
u64 attr;
 
@@ -2322,7 +2322,7 @@ __domain_mapping(struct dmar_domain *domain, unsigned 
long iov_pfn,
largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
phys_pfn, nr_pages);
 
-   first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, 
&largepage_lvl);
+   pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
if (!pte)
return -ENOMEM;
/* It is large page*/
@@ -2383,34 +2383,14 @@ __domain_mapping(struct dmar_domain *domain, unsigned 
long iov_pfn,
 * recalculate 'pte' and switch back to smaller pages for the
 * end of the mapping, if the trailing size is not enough to
 * use another superpage (i.e. nr_pages < lvl_pages).
+*
+* We leave clflush for the leaf pte changes to iotlb_sync_map()
+* callback.
 */
pte++;
if (!nr_pages || first_pte_in_page(pte) ||
-   (largepage_lvl > 1 && nr_pages < lvl_pages)) {
-   domain_flush_cache(domain, first_pte,
-  (void *)pte - (void *)first_pte);
+   (largepage_lvl > 1 && nr_pages < lvl_pages))
pte = NULL;
-   }
-   }
-
-   return 0;
-}
-
-static int
-domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
-  unsigned long phys_pfn, unsigned long nr_pages, int prot)
-{
-   int iommu_id, ret;
-   struct intel_iommu *iommu;
-
-   /* Do the real mapping first */
-   ret = __domain_mapping(domain, iov_pfn, phys_pfn, nr_pages, prot);
-   if (ret)
-   return ret;
-
-   for_each_domain_iommu(iommu_id, domain) {
-   iommu = g_iommus[iommu_id];
-   __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
}
 
return 0;
@@ -4943,7 +4923,6 @@ static int intel_iommu_map(struct iommu_domain *domain,
struct dmar_domain *dmar_domain = to_dmar_domain(domain);
u64 max_addr;
int prot = 0;
-   int ret;
 
if (iommu_prot & IOMMU_READ)
prot |= DMA_PTE_READ;
@@ -4969,9 +4948,8 @@ static int intel_iommu_map(struct iommu_domain *domain,
/* Round up size to next multiple of PAGE_SIZE, if it and
   the low bits of hpa would take us onto the next page */
size = aligned_nrpages(hpa, size);
-   ret = domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
-hpa >> VTD_PAGE_SHIFT, size, prot);
-   return ret;
+   return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
+   hpa >> VTD_PAGE_SHIFT, size, prot);
 }
 
 static size_t intel_iommu_unmap(struct iommu_domain *domain,
@@ -5478,6 +5456,57 @@ static bool risky_device(struct pci_dev *pdev)
return false;
 }
 
+static void clflush_sync_map(s
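
A rough, hypothetical sketch of the pattern this commit message describes:
defer the CPU cache flush of newly written leaf PTEs to the iotlb_sync_map()
callback (using the three-argument callback form adopted in this series), so
that an entire scatterlist costs one flush pass instead of one flush per
element. The example_* and to_example_domain() names are made up for
illustration; this is not the intel-iommu code.

static void example_iotlb_sync_map(struct iommu_domain *domain,
				   unsigned long iova, size_t size)
{
	struct example_domain *d = to_example_domain(domain);	/* hypothetical */

	if (d->page_walk_coherent)	/* hardware snoops the walk; nothing to do */
		return;

	/* Flush only the cache lines covering the just-written leaf PTEs. */
	example_flush_pte_range(d, iova >> PAGE_SHIFT, size >> PAGE_SHIFT);
}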

[PATCH RFC 8/9] iommu/arm-smmu: Hook up map_sg()

2021-01-27 Thread Chuck Lever
From: Isaac J. Manjarres 

Now that everything is in place for iommu_map_sg() to defer
mapping a scatter-gather list to the io-pgtable layer, implement
the map_sg() callback in the SMMU driver, so that iommu_map_sg()
can invoke it with the entire scatter-gather list that will be
mapped.

Signed-off-by: Isaac J. Manjarres 
Tested-by: Sai Prakash Ranjan 
Signed-off-by: Chuck Lever 
---
 drivers/iommu/arm/arm-smmu/arm-smmu.c |   19 +++
 1 file changed, 19 insertions(+)

diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c 
b/drivers/iommu/arm/arm-smmu/arm-smmu.c
index d8c6bfde6a61..52acc6858512 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c
@@ -1208,6 +1208,24 @@ static int arm_smmu_map(struct iommu_domain *domain, 
unsigned long iova,
return ret;
 }
 
+static int arm_smmu_map_sg(struct iommu_domain *domain, unsigned long iova,
+  struct scatterlist *sg, unsigned int nents, int prot,
+  gfp_t gfp, size_t *mapped)
+{
+   struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops;
+   struct arm_smmu_device *smmu = to_smmu_domain(domain)->smmu;
+   int ret;
+
+   if (!ops)
+   return -ENODEV;
+
+   arm_smmu_rpm_get(smmu);
+   ret = ops->map_sg(ops, iova, sg, nents, prot, gfp, mapped);
+   arm_smmu_rpm_put(smmu);
+
+   return ret;
+}
+
 static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova,
 size_t size, struct iommu_iotlb_gather *gather)
 {
@@ -1624,6 +1642,7 @@ static struct iommu_ops arm_smmu_ops = {
.domain_free= arm_smmu_domain_free,
.attach_dev = arm_smmu_attach_dev,
.map= arm_smmu_map,
+   .map_sg = arm_smmu_map_sg,
.unmap  = arm_smmu_unmap,
.flush_iotlb_all= arm_smmu_flush_iotlb_all,
.iotlb_sync = arm_smmu_iotlb_sync,




[PATCH RFC 4/9] iommu/io-pgtable: Introduce map_sg() as a page table op

2021-01-27 Thread Chuck Lever
From: Isaac J. Manjarres 

While mapping a scatter-gather list, iommu_map_sg() calls
into the IOMMU driver through an indirect call, which can
call into the io-pgtable code through another indirect call.

This sequence of going through the IOMMU core code, the IOMMU
driver, and finally the io-pgtable code occurs, in the worst case,
for every element in the scatter-gather list, which is not optimal.

Introduce a map_sg callback in the io-pgtable ops so that
IOMMU drivers can invoke it with the complete scatter-gather
list, allowing it to be processed entirely within the io-pgtable
code. This reduces the number of indirect calls and boosts
overall iommu_map_sg() performance.

Signed-off-by: Isaac J. Manjarres 
Tested-by: Sai Prakash Ranjan 
Signed-off-by: Chuck Lever 
---
 include/linux/io-pgtable.h |6 ++
 1 file changed, 6 insertions(+)

diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h
index ea727eb1a1a9..6d0e73172603 100644
--- a/include/linux/io-pgtable.h
+++ b/include/linux/io-pgtable.h
@@ -147,6 +147,9 @@ struct io_pgtable_cfg {
  * struct io_pgtable_ops - Page table manipulation API for IOMMU drivers.
  *
  * @map:  Map a physically contiguous memory region.
+ * @map_sg:   Map a scatter-gather list of physically contiguous memory
+ *chunks. The mapped pointer argument is used to store how
+ *many bytes are mapped.
  * @unmap:Unmap a physically contiguous memory region.
  * @iova_to_phys: Translate iova to physical address.
  *
@@ -156,6 +159,9 @@ struct io_pgtable_cfg {
 struct io_pgtable_ops {
int (*map)(struct io_pgtable_ops *ops, unsigned long iova,
   phys_addr_t paddr, size_t size, int prot, gfp_t gfp);
+   int (*map_sg)(struct io_pgtable_ops *ops, unsigned long iova,
+ struct scatterlist *sg, unsigned int nents, int prot,
+ gfp_t gfp, size_t *mapped);
size_t (*unmap)(struct io_pgtable_ops *ops, unsigned long iova,
size_t size, struct iommu_iotlb_gather *gather);
phys_addr_t (*iova_to_phys)(struct io_pgtable_ops *ops,




[PATCH RFC 9/9] iommu/vt-d: Introduce map_sg() for Intel IOMMUs

2021-01-27 Thread Chuck Lever
Attempt to reduce indirect call overhead when mapping a substantial
scatter-gather list.

Signed-off-by: Chuck Lever 
---
 drivers/iommu/intel/iommu.c |   37 +
 1 file changed, 37 insertions(+)

diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 013097b6d55f..deae39f1477a 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -4952,6 +4952,42 @@ static int intel_iommu_map(struct iommu_domain *domain,
hpa >> VTD_PAGE_SHIFT, size, prot);
 }
 
+static int intel_iommu_map_sg(struct iommu_domain *domain, unsigned long iova,
+ struct scatterlist *sg, unsigned int nents,
+ int prot, gfp_t gfp, size_t *mapped)
+{
+   unsigned int i = 0;
+   phys_addr_t start;
+   size_t len = 0;
+   int ret;
+
+   while (i <= nents) {
+   phys_addr_t s_phys = sg_phys(sg);
+
+   if (len && s_phys != start + len) {
+   ret = intel_iommu_map(domain, iova + *mapped, start,
+ len, prot, gfp);
+   if (ret)
+   return ret;
+
+   *mapped += len;
+   len = 0;
+   }
+
+   if (len) {
+   len += sg->length;
+   } else {
+   len = sg->length;
+   start = s_phys;
+   }
+
+   if (++i < nents)
+   sg = sg_next(sg);
+   }
+
+   return 0;
+}
+
 static size_t intel_iommu_unmap(struct iommu_domain *domain,
unsigned long iova, size_t size,
struct iommu_iotlb_gather *gather)
@@ -5519,6 +,7 @@ const struct iommu_ops intel_iommu_ops = {
.aux_detach_dev = intel_iommu_aux_detach_device,
.aux_get_pasid  = intel_iommu_aux_get_pasid,
.map= intel_iommu_map,
+   .map_sg = intel_iommu_map_sg,
.iotlb_sync_map = intel_iommu_iotlb_sync_map,
.unmap  = intel_iommu_unmap,
.flush_iotlb_all= intel_flush_iotlb_all,
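
The merging logic used by intel_iommu_map_sg() above (and by the arm-lpae
equivalent) can be shown with a small stand-alone program. This is only a
sketch of the coalescing idea, not kernel code: physically contiguous
scatterlist entries are merged so the mapping routine runs once per
contiguous run instead of once per element.

/* Stand-alone illustration; build with any C compiler. */
#include <stdio.h>

struct seg { unsigned long phys; unsigned long len; };

int main(void)
{
	/* Example "scatterlist": entries 0 and 1 are physically contiguous. */
	struct seg sg[] = {
		{ 0x100000, 0x1000 },
		{ 0x101000, 0x1000 },
		{ 0x200000, 0x2000 },
	};
	unsigned int nents = sizeof(sg) / sizeof(sg[0]), i = 0;
	unsigned long start = 0, len = 0, mapped = 0;

	while (i <= nents) {
		unsigned long s_phys = (i < nents) ? sg[i].phys : 0;

		if (len && (i == nents || s_phys != start + len)) {
			/* One "map" call covers the whole contiguous run. */
			printf("map at iova offset %#lx: phys %#lx, len %#lx\n",
			       mapped, start, len);
			mapped += len;
			len = 0;
		}
		if (i == nents)
			break;
		if (len) {
			len += sg[i].len;
		} else {
			len = sg[i].len;
			start = s_phys;
		}
		i++;
	}
	return 0;
}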




Re: [PATCH RFC 0/9] Possible set of VT-d optimizations

2021-01-28 Thread Chuck Lever



> On Jan 28, 2021, at 8:59 AM, Robin Murphy  wrote:
> 
> On 2021-01-27 20:00, Chuck Lever wrote:
>> Hi-
>> This collection of patches seems to get the best throughput results
>> so far. The NFS WRITE result is fully restored, and the NFS READ
>> result is very close to fully restored.
>>  Children see throughput for 12 initial writers  = 5008474.03 kB/sec
>>  Parent sees throughput for 12 initial writers   = 4996927.80 kB/sec
>>  Min throughput per process  = 416956.88 kB/sec
>>  Max throughput per process  = 417910.22 kB/sec
>>  Avg throughput per process  = 417372.84 kB/sec
>>  Min xfer= 1046272.00 kB
>>  CPU Utilization: Wall time2.515CPU time1.996CPU 
>> utilization  79.37 %
>>  Children see throughput for 12 rewriters= 5020584.59 kB/sec
>>  Parent sees throughput for 12 rewriters = 5012539.29 kB/sec
>>  Min throughput per process  = 417799.00 kB/sec
>>  Max throughput per process  = 419082.22 kB/sec
>>  Avg throughput per process  = 418382.05 kB/sec
>>  Min xfer= 1046528.00 kB
>>  CPU utilization: Wall time2.507CPU time2.024CPU 
>> utilization  80.73 %
>>  Children see throughput for 12 readers  = 5805484.25 kB/sec
>>  Parent sees throughput for 12 readers   = 5799535.68 kB/sec
>>  Min throughput per process  = 482888.16 kB/sec
>>  Max throughput per process  = 48.16 kB/sec
>>  Avg throughput per process  = 483790.35 kB/sec
>>  Min xfer= 1045760.00 kB
>>  CPU utilization: Wall time2.167CPU time1.964CPU 
>> utilization  90.63 %
>>  Children see throughput for 12 re-readers   = 5812227.16 kB/sec
>>  Parent sees throughput for 12 re-readers= 5803793.06 kB/sec
>>  Min throughput per process  = 483242.97 kB/sec
>>  Max throughput per process  = 485724.41 kB/sec
>>  Avg throughput per process  = 484352.26 kB/sec
>>  Min xfer= 1043456.00 kB
>>  CPU utilization: Wall time2.161CPU time1.976CPU 
>> utilization  91.45 %
>> I've included a simple-minded implementation of a map_sg op for
>> the Intel IOMMU. This is nothing more than a copy of the loop in
>> __iommu_map_sg() with the call to __iommu_map() replaced with a
>> call to intel_iommu_map().
> 
> ...which is the main reason I continue to strongly dislike patches #4-#9 (#3 
> definitely seems to make sense either way, now that #1 and #2 are going to 
> land). If a common operation is worth optimising anywhere, then it deserves 
> optimising everywhere, so we end up with a dozen diverging copies of 
> essentially the same code - particularly when the driver-specific 
> functionality *is* already in the drivers, so what gets duplicated is solely 
> the "generic" parts.

I don't disagree with that assessment, but I don't immediately see an
alternative API arrangement that would be more successful in the short
term. If 4/9 - 9/9 are not acceptable, then the responsible thing to
do would be to revert:

 - 58a8bb39490d ("iommu/vt-d: Cleanup after converting to dma-iommu ops")
 - c588072bba6b ("iommu/vt-d: Convert intel iommu driver to the iommu ops")

for v5.11, work out the proper API design, and then try the VT-d conversion
again.

IMHO.


> And if there's justification for pushing iommu_map_sg() entirely into 
> drivers, then it's verging on self-contradictory not to do the same for 
> iommu_map() and iommu_unmap(). Some IOMMU drivers - mainly intel-iommu, as it 
> happens - are already implementing hacks around the "one call per page" 
> interface being inherently inefficient, so the logical thing to do here is 
> take a step back and reconsider the fundamental design of the whole map/unmap 
> interface. Implementing hacks on top of hacks to make particular things 
> faster on particular systems that particular people care about is not going 
> to do us any favours in the long run.
> 
> As it stands, I can easily see a weird anti-pattern emerging where people 
> start adding code to fake up scatterlists in random drivers because they see 
> dma_map_sg() performing paradoxically better than dma_map_page().
> 
> Robin.
> 
>> ---
>> Chuck Lever (1):
&g

Re: [PATCH RFC 0/9] Possible set of VT-d optimizations

2021-02-01 Thread Chuck Lever



> On Jan 28, 2021, at 9:52 AM, Chuck Lever  wrote:
> 
> 
> 
>> On Jan 28, 2021, at 8:59 AM, Robin Murphy  wrote:
>> 
>> On 2021-01-27 20:00, Chuck Lever wrote:
>>> Hi-
>>> This collection of patches seems to get the best throughput results
>>> so far. The NFS WRITE result is fully restored, and the NFS READ
>>> result is very close to fully restored.
>>> Children see throughput for 12 initial writers  = 5008474.03 kB/sec
>>> Parent sees throughput for 12 initial writers   = 4996927.80 kB/sec
>>> Min throughput per process  = 416956.88 kB/sec
>>> Max throughput per process  = 417910.22 kB/sec
>>> Avg throughput per process  = 417372.84 kB/sec
>>> Min xfer= 1046272.00 kB
>>> CPU Utilization: Wall time2.515CPU time1.996CPU 
>>> utilization  79.37 %
>>> Children see throughput for 12 rewriters= 5020584.59 kB/sec
>>> Parent sees throughput for 12 rewriters = 5012539.29 kB/sec
>>> Min throughput per process  = 417799.00 kB/sec
>>> Max throughput per process  = 419082.22 kB/sec
>>> Avg throughput per process  = 418382.05 kB/sec
>>> Min xfer= 1046528.00 kB
>>> CPU utilization: Wall time2.507CPU time2.024CPU 
>>> utilization  80.73 %
>>> Children see throughput for 12 readers  = 5805484.25 kB/sec
>>> Parent sees throughput for 12 readers   = 5799535.68 kB/sec
>>> Min throughput per process  = 482888.16 kB/sec
>>> Max throughput per process  = 48.16 kB/sec
>>> Avg throughput per process  = 483790.35 kB/sec
>>> Min xfer= 1045760.00 kB
>>> CPU utilization: Wall time2.167CPU time1.964CPU 
>>> utilization  90.63 %
>>> Children see throughput for 12 re-readers   = 5812227.16 kB/sec
>>> Parent sees throughput for 12 re-readers= 5803793.06 kB/sec
>>> Min throughput per process  = 483242.97 kB/sec
>>> Max throughput per process  = 485724.41 kB/sec
>>> Avg throughput per process  = 484352.26 kB/sec
>>> Min xfer= 1043456.00 kB
>>> CPU utilization: Wall time2.161CPU time1.976CPU 
>>> utilization  91.45 %
>>> I've included a simple-minded implementation of a map_sg op for
>>> the Intel IOMMU. This is nothing more than a copy of the loop in
>>> __iommu_map_sg() with the call to __iommu_map() replaced with a
>>> call to intel_iommu_map().
>> 
>> ...which is the main reason I continue to strongly dislike patches #4-#9 (#3 
>> definitely seems to makes sense either way, now that #1 and #2 are going to 
>> land). If a common operation is worth optimising anywhere, then it deserves 
>> optimising everywhere, so we end up with a dozen diverging copies of 
>> essentially the same code - particularly when the driver-specific 
>> functionality *is* already in the drivers, so what gets duplicated is solely 
>> the "generic" parts.
> 
> I don't disagree with that assessment, but I don't immediately see an
> alternative API arrangement that would be more successful in the short
> term. If 4/9 - 9/9 are not acceptable, then the responsible thing to
> do would be to revert:
> 
> - 58a8bb39490d ("iommu/vt-d: Cleanup after converting to dma-iommu ops")
> - c588072bba6b ("iommu/vt-d: Convert intel iommu driver to the iommu ops")
> 
> for v5.11, work out the proper API design, and then try the VT-d conversion
> again.
> 
> IMHO.

Are all y'all waiting for me to post such patches? ;-)


>> And if there's justification for pushing iommu_map_sg() entirely into 
>> drivers, then it's verging on self-contradictory not to do the same for 
>> iommu_map() and iommu_unmap(). Some IOMMU drivers - mainly intel-iommu, as 
>> it happens - are already implementing hacks around the "one call per page" 
>> interface being inherently inefficient, so the logical thing to do here is 
>> take a step back and reconsider the fundamental design of the whole 
>> map/unmap interface. Implementing hacks on top of hacks to make particular 
>> things faster on particu

performance regression noted in v5.11-rc after c062db039f40

2021-01-08 Thread Chuck Lever
Children see throughput for 12 re-readers   = 5410601.12 kB/sec
Parent sees throughput for 12 re-readers= 5403504.40 kB/sec
Min throughput per process  =  449918.12 kB/sec
Max throughput per process  =  452489.28 kB/sec
Avg throughput per process  =  450883.43 kB/sec
Min xfer= 1043456.00 kB
CPU utilization: Wall time2.321CPU time1.978CPU 
utilization  85.21 %

And here's c588072bba6b ("iommu/vt-d: Convert intel iommu driver to
the iommu ops"). Significant throughput loss.

Children see throughput for 12 initial writers  = 3812036.91 kB/sec
Parent sees throughput for 12 initial writers   = 3753683.40 kB/sec
Min throughput per process  =  313672.25 kB/sec
Max throughput per process  =  321719.44 kB/sec
Avg throughput per process  =  317669.74 kB/sec
Min xfer= 1022464.00 kB
CPU Utilization: Wall time3.309CPU time1.986CPU 
utilization  60.02 %
Children see throughput for 12 rewriters= 3786831.94 kB/sec
Parent sees throughput for 12 rewriters = 3783205.58 kB/sec
Min throughput per process  =  313654.44 kB/sec
Max throughput per process  =  317844.50 kB/sec
Avg throughput per process  =  315569.33 kB/sec
Min xfer= 1035520.00 kB
CPU utilization: Wall time3.302CPU time1.945CPU 
utilization  58.90 %
Children see throughput for 12 readers  = 4265828.28 kB/sec
Parent sees throughput for 12 readers   = 4261844.88 kB/sec
Min throughput per process  =  352305.00 kB/sec
Max throughput per process  =  357726.22 kB/sec
Avg throughput per process  =  355485.69 kB/sec
Min xfer= 1032960.00 kB
CPU utilization: Wall time2.934CPU time1.942CPU 
utilization  66.20 %
Children see throughput for 12 re-readers   = 4220651.19 kB/sec
Parent sees throughput for 12 re-readers= 4216096.04 kB/sec
Min throughput per process  =  348677.16 kB/sec
Max throughput per process  =  353467.44 kB/sec
Avg throughput per process  =  351720.93 kB/sec
Min xfer= 1035264.00 kB
CPU utilization: Wall time2.969CPU time1.952CPU 
utilization  65.74 %

The regression appears to be 100% reproducible. 


--
Chuck Lever





Re: performance regression noted in v5.11-rc after c062db039f40

2021-01-13 Thread Chuck Lever



> On Jan 12, 2021, at 9:25 PM, Lu Baolu  wrote:
> 
> Hi,
> 
> On 1/12/21 10:38 PM, Will Deacon wrote:
>> [Expanding cc list to include DMA-IOMMU and intel IOMMU folks]
>> On Fri, Jan 08, 2021 at 04:18:36PM -0500, Chuck Lever wrote:
>>> Hi-
>>> 
>>> [ Please cc: me on replies, I'm not currently subscribed to
>>> iommu@lists ].
>>> 
>>> I'm running NFS performance tests on InfiniBand using CX-3 Pro cards
>>> at 56Gb/s. The test is iozone on an NFSv3/RDMA mount:
>>> 
>>> /home/cel/bin/iozone -M -+u -i0 -i1 -s1g -r256k -t12 -I
>>> 
>>> For those not familiar with the way storage protocols use RDMA, The
>>> initiator/client sets up memory regions and the target/server uses
>>> RDMA Read and Write to move data out of and into those regions. The
>>> initiator/client uses only RDMA memory registration and invalidation
>>> operations, and the target/server uses RDMA Read and Write.
>>> 
>>> My NFS client is a two-socket 12-core x86_64 system with its I/O MMU
>>> enabled using the kernel command line options "intel_iommu=on
>>> iommu=strict".
>>> 
>>> Recently I've noticed a significant (25-30%) loss in NFS throughput.
>>> I was able to bisect on my client to the following commits.
>>> 
>>> Here's 65f746e8285f ("iommu: Add quirk for Intel graphic devices in
>>> map_sg"). This is about normal for this test.
>>> 
>>> Children see throughput for 12 initial writers  = 4732581.09 kB/sec
>>> Parent sees throughput for 12 initial writers   = 4646810.21 kB/sec
>>> Min throughput per process  =  387764.34 kB/sec
>>> Max throughput per process  =  399655.47 kB/sec
>>> Avg throughput per process  =  394381.76 kB/sec
>>> Min xfer= 1017344.00 kB
>>> CPU Utilization: Wall time2.671CPU time1.974CPU 
>>> utilization  73.89 %
>>> Children see throughput for 12 rewriters= 4837741.94 kB/sec
>>> Parent sees throughput for 12 rewriters = 4833509.35 kB/sec
>>> Min throughput per process  =  398983.72 kB/sec
>>> Max throughput per process  =  406199.66 kB/sec
>>> Avg throughput per process  =  403145.16 kB/sec
>>> Min xfer= 1030656.00 kB
>>> CPU utilization: Wall time2.584CPU time1.959CPU 
>>> utilization  75.82 %
>>> Children see throughput for 12 readers  = 5921370.94 kB/sec
>>> Parent sees throughput for 12 readers   = 5914106.69 kB/sec
>>> Min throughput per process  =  491812.38 kB/sec
>>> Max throughput per process  =  494777.28 kB/sec
>>> Avg throughput per process  =  493447.58 kB/sec
>>> Min xfer= 1042688.00 kB
>>> CPU utilization: Wall time2.122CPU time1.968CPU 
>>> utilization  92.75 %
>>> Children see throughput for 12 re-readers   = 5947985.69 kB/sec
>>> Parent sees throughput for 12 re-readers= 5941348.51 kB/sec
>>> Min throughput per process  =  492805.81 kB/sec
>>> Max throughput per process  =  497280.19 kB/sec
>>> Avg throughput per process  =  495665.47 kB/sec
>>> Min xfer= 1039360.00 kB
>>> CPU utilization: Wall time2.111CPU time1.968CPU 
>>> utilization  93.22 %
>>> 
>>> Here's c062db039f40 ("iommu/vt-d: Update domain geometry in
>>> iommu_ops.at(de)tach_dev"). It's losing some steam here.
>>> 
>>> Children see throughput for 12 initial writers  = 4342419.12 kB/sec
>>> Parent sees throughput for 12 initial writers   = 4310612.79 kB/sec
>>> Min throughput per process  =  359299.06 kB/sec
>>> Max throughput per process  =  363866.16 kB/sec
>>> Avg throughput per process  =  361868.26 kB/sec
>>> Min xfer= 1035520.00 kB
>>> CPU Utilization: Wall time2.902CPU time1.951CPU 
>>> utilization  67.22 %
>>> Children see throughput for 12 rewriters   

Re: performance regression noted in v5.11-rc after c062db039f40

2021-01-13 Thread Chuck Lever



> On Jan 13, 2021, at 9:07 AM, Chuck Lever  wrote:
> 
> 
> 
>> On Jan 12, 2021, at 9:25 PM, Lu Baolu  wrote:
>> 
>> Hi,
>> 
>> On 1/12/21 10:38 PM, Will Deacon wrote:
>>> [Expanding cc list to include DMA-IOMMU and intel IOMMU folks]
>>> On Fri, Jan 08, 2021 at 04:18:36PM -0500, Chuck Lever wrote:
>>>> Hi-
>>>> 
>>>> [ Please cc: me on replies, I'm not currently subscribed to
>>>> iommu@lists ].
>>>> 
>>>> I'm running NFS performance tests on InfiniBand using CX-3 Pro cards
>>>> at 56Gb/s. The test is iozone on an NFSv3/RDMA mount:
>>>> 
>>>> /home/cel/bin/iozone -M -+u -i0 -i1 -s1g -r256k -t12 -I
>>>> 
>>>> For those not familiar with the way storage protocols use RDMA, The
>>>> initiator/client sets up memory regions and the target/server uses
>>>> RDMA Read and Write to move data out of and into those regions. The
>>>> initiator/client uses only RDMA memory registration and invalidation
>>>> operations, and the target/server uses RDMA Read and Write.
>>>> 
>>>> My NFS client is a two-socket 12-core x86_64 system with its I/O MMU
>>>> enabled using the kernel command line options "intel_iommu=on
>>>> iommu=strict".
>>>> 
>>>> Recently I've noticed a significant (25-30%) loss in NFS throughput.
>>>> I was able to bisect on my client to the following commits.
>>>> 
>>>> Here's 65f746e8285f ("iommu: Add quirk for Intel graphic devices in
>>>> map_sg"). This is about normal for this test.
>>>> 
>>>>Children see throughput for 12 initial writers  = 4732581.09 kB/sec
>>>>Parent sees throughput for 12 initial writers   = 4646810.21 kB/sec
>>>>Min throughput per process  =  387764.34 kB/sec
>>>>Max throughput per process  =  399655.47 kB/sec
>>>>Avg throughput per process  =  394381.76 kB/sec
>>>>Min xfer= 1017344.00 kB
>>>>CPU Utilization: Wall time2.671CPU time1.974CPU 
>>>> utilization  73.89 %
>>>>Children see throughput for 12 rewriters= 4837741.94 kB/sec
>>>>Parent sees throughput for 12 rewriters = 4833509.35 kB/sec
>>>>Min throughput per process  =  398983.72 kB/sec
>>>>Max throughput per process  =  406199.66 kB/sec
>>>>Avg throughput per process  =  403145.16 kB/sec
>>>>Min xfer= 1030656.00 kB
>>>>CPU utilization: Wall time2.584CPU time1.959CPU 
>>>> utilization  75.82 %
>>>>Children see throughput for 12 readers  = 5921370.94 kB/sec
>>>>Parent sees throughput for 12 readers   = 5914106.69 kB/sec
>>>>Min throughput per process  =  491812.38 kB/sec
>>>>Max throughput per process  =  494777.28 kB/sec
>>>>Avg throughput per process  =  493447.58 kB/sec
>>>>Min xfer= 1042688.00 kB
>>>>CPU utilization: Wall time2.122CPU time1.968CPU 
>>>> utilization  92.75 %
>>>>Children see throughput for 12 re-readers   = 5947985.69 kB/sec
>>>>Parent sees throughput for 12 re-readers= 5941348.51 kB/sec
>>>>Min throughput per process  =  492805.81 kB/sec
>>>>Max throughput per process  =  497280.19 kB/sec
>>>>Avg throughput per process  =  495665.47 kB/sec
>>>>Min xfer= 1039360.00 kB
>>>>CPU utilization: Wall time2.111CPU time1.968CPU 
>>>> utilization  93.22 %
>>>> 
>>>> Here's c062db039f40 ("iommu/vt-d: Update domain geometry in
>>>> iommu_ops.at(de)tach_dev"). It's losing some steam here.
>>>> 
>>>>Children see throughput for 12 initial writers  = 4342419.12 kB/sec
>>>>Parent sees throughput for 12 initial writers   = 4310612.79 kB/sec
>>>>Min throughput per process  =  359299.06 kB/sec
>>>>Max throughput per process  =  363866.16 kB/sec
>>

Re: performance regression noted in v5.11-rc after c062db039f40

2021-01-18 Thread Chuck Lever



> On Jan 12, 2021, at 9:38 AM, Will Deacon  wrote:
> 
> [Expanding cc list to include DMA-IOMMU and intel IOMMU folks]
> 
> On Fri, Jan 08, 2021 at 04:18:36PM -0500, Chuck Lever wrote:
>> Hi-
>> 
>> [ Please cc: me on replies, I'm not currently subscribed to
>> iommu@lists ].
>> 
>> I'm running NFS performance tests on InfiniBand using CX-3 Pro cards
>> at 56Gb/s. The test is iozone on an NFSv3/RDMA mount:
>> 
>> /home/cel/bin/iozone -M -+u -i0 -i1 -s1g -r256k -t12 -I
>> 
>> For those not familiar with the way storage protocols use RDMA, The
>> initiator/client sets up memory regions and the target/server uses
>> RDMA Read and Write to move data out of and into those regions. The
>> initiator/client uses only RDMA memory registration and invalidation
>> operations, and the target/server uses RDMA Read and Write.
>> 
>> My NFS client is a two-socket 12-core x86_64 system with its I/O MMU
>> enabled using the kernel command line options "intel_iommu=on
>> iommu=strict".
>> 
>> Recently I've noticed a significant (25-30%) loss in NFS throughput.
>> I was able to bisect on my client to the following commits.
>> 
>> Here's 65f746e8285f ("iommu: Add quirk for Intel graphic devices in
>> map_sg"). This is about normal for this test.
>> 
>>  Children see throughput for 12 initial writers  = 4732581.09 kB/sec
>>  Parent sees throughput for 12 initial writers   = 4646810.21 kB/sec
>>  Min throughput per process  =  387764.34 kB/sec
>>  Max throughput per process  =  399655.47 kB/sec
>>  Avg throughput per process  =  394381.76 kB/sec
>>  Min xfer= 1017344.00 kB
>>  CPU Utilization: Wall time2.671CPU time1.974CPU 
>> utilization  73.89 %
>>  Children see throughput for 12 rewriters= 4837741.94 kB/sec
>>  Parent sees throughput for 12 rewriters = 4833509.35 kB/sec
>>  Min throughput per process  =  398983.72 kB/sec
>>  Max throughput per process  =  406199.66 kB/sec
>>  Avg throughput per process  =  403145.16 kB/sec
>>  Min xfer= 1030656.00 kB
>>  CPU utilization: Wall time2.584CPU time1.959CPU 
>> utilization  75.82 %
>>  Children see throughput for 12 readers  = 5921370.94 kB/sec
>>  Parent sees throughput for 12 readers   = 5914106.69 kB/sec
>>  Min throughput per process  =  491812.38 kB/sec
>>  Max throughput per process  =  494777.28 kB/sec
>>  Avg throughput per process  =  493447.58 kB/sec
>>  Min xfer= 1042688.00 kB
>>  CPU utilization: Wall time2.122CPU time1.968CPU 
>> utilization  92.75 %
>>  Children see throughput for 12 re-readers   = 5947985.69 kB/sec
>>  Parent sees throughput for 12 re-readers= 5941348.51 kB/sec
>>  Min throughput per process  =  492805.81 kB/sec
>>  Max throughput per process  =  497280.19 kB/sec
>>  Avg throughput per process  =  495665.47 kB/sec
>>  Min xfer= 1039360.00 kB
>>  CPU utilization: Wall time2.111CPU time1.968CPU 
>> utilization  93.22 %
>> 
>> Here's c062db039f40 ("iommu/vt-d: Update domain geometry in
>> iommu_ops.at(de)tach_dev"). It's losing some steam here.
>> 
>>  Children see throughput for 12 initial writers  = 4342419.12 kB/sec
>>  Parent sees throughput for 12 initial writers   = 4310612.79 kB/sec
>>  Min throughput per process  =  359299.06 kB/sec
>>  Max throughput per process  =  363866.16 kB/sec
>>  Avg throughput per process  =  361868.26 kB/sec
>>  Min xfer= 1035520.00 kB
>>  CPU Utilization: Wall time2.902CPU time1.951CPU 
>> utilization  67.22 %
>>  Children see throughput for 12 rewriters= 4408576.66 kB/sec
>>  Parent sees throughput for 12 rewriters = 4404280.87 kB/sec
>>  Min throughput per process  =  364553.88 kB/sec
>>  Max throughput per process  =  370029.28 kB/sec
>>  Avg throughput per process 

Re: performance regression noted in v5.11-rc after c062db039f40

2021-01-18 Thread Chuck Lever



> On Jan 18, 2021, at 1:00 PM, Robin Murphy  wrote:
> 
> On 2021-01-18 16:18, Chuck Lever wrote:
>>> On Jan 12, 2021, at 9:38 AM, Will Deacon  wrote:
>>> 
>>> [Expanding cc list to include DMA-IOMMU and intel IOMMU folks]
>>> 
>>> On Fri, Jan 08, 2021 at 04:18:36PM -0500, Chuck Lever wrote:
>>>> Hi-
>>>> 
>>>> [ Please cc: me on replies, I'm not currently subscribed to
>>>> iommu@lists ].
>>>> 
>>>> I'm running NFS performance tests on InfiniBand using CX-3 Pro cards
>>>> at 56Gb/s. The test is iozone on an NFSv3/RDMA mount:
>>>> 
>>>> /home/cel/bin/iozone -M -+u -i0 -i1 -s1g -r256k -t12 -I
>>>> 
>>>> For those not familiar with the way storage protocols use RDMA, The
>>>> initiator/client sets up memory regions and the target/server uses
>>>> RDMA Read and Write to move data out of and into those regions. The
>>>> initiator/client uses only RDMA memory registration and invalidation
>>>> operations, and the target/server uses RDMA Read and Write.
>>>> 
>>>> My NFS client is a two-socket 12-core x86_64 system with its I/O MMU
>>>> enabled using the kernel command line options "intel_iommu=on
>>>> iommu=strict".
>>>> 
>>>> Recently I've noticed a significant (25-30%) loss in NFS throughput.
>>>> I was able to bisect on my client to the following commits.
>>>> 
>>>> Here's 65f746e8285f ("iommu: Add quirk for Intel graphic devices in
>>>> map_sg"). This is about normal for this test.
>>>> 
>>>>Children see throughput for 12 initial writers  = 4732581.09 kB/sec
>>>>Parent sees throughput for 12 initial writers   = 4646810.21 kB/sec
>>>>Min throughput per process  =  387764.34 kB/sec
>>>>Max throughput per process  =  399655.47 kB/sec
>>>>Avg throughput per process  =  394381.76 kB/sec
>>>>Min xfer= 1017344.00 kB
>>>>CPU Utilization: Wall time2.671CPU time1.974CPU 
>>>> utilization  73.89 %
>>>>Children see throughput for 12 rewriters= 4837741.94 kB/sec
>>>>Parent sees throughput for 12 rewriters = 4833509.35 kB/sec
>>>>Min throughput per process  =  398983.72 kB/sec
>>>>Max throughput per process  =  406199.66 kB/sec
>>>>Avg throughput per process  =  403145.16 kB/sec
>>>>Min xfer= 1030656.00 kB
>>>>CPU utilization: Wall time2.584CPU time1.959CPU 
>>>> utilization  75.82 %
>>>>Children see throughput for 12 readers  = 5921370.94 kB/sec
>>>>Parent sees throughput for 12 readers   = 5914106.69 kB/sec
>>>>Min throughput per process  =  491812.38 kB/sec
>>>>Max throughput per process  =  494777.28 kB/sec
>>>>Avg throughput per process  =  493447.58 kB/sec
>>>>Min xfer= 1042688.00 kB
>>>>CPU utilization: Wall time2.122CPU time1.968CPU 
>>>> utilization  92.75 %
>>>>Children see throughput for 12 re-readers   = 5947985.69 kB/sec
>>>>Parent sees throughput for 12 re-readers= 5941348.51 kB/sec
>>>>Min throughput per process  =  492805.81 kB/sec
>>>>Max throughput per process  =  497280.19 kB/sec
>>>>Avg throughput per process  =  495665.47 kB/sec
>>>>Min xfer= 1039360.00 kB
>>>>CPU utilization: Wall time2.111CPU time1.968CPU 
>>>> utilization  93.22 %
>>>> 
>>>> Here's c062db039f40 ("iommu/vt-d: Update domain geometry in
>>>> iommu_ops.at(de)tach_dev"). It's losing some steam here.
>>>> 
>>>>Children see throughput for 12 initial writers  = 4342419.12 kB/sec
>>>>Parent sees throughput for 12 initial writers   = 4310612.79 kB/sec
>>>>Min throughput per process  =  359299.06 kB/sec
>>>>Max throughput per process  =  363866.16 kB/sec
>>>&

Re: performance regression noted in v5.11-rc after c062db039f40

2021-01-19 Thread Chuck Lever



> On Jan 18, 2021, at 8:22 PM, Lu Baolu  wrote:
> 
> Do you mind posting the cap and ecap of the iommu used by your device?
> 
> You can get it via sysfs, for example:
> 
> /sys/bus/pci/devices/:00:14.0/iommu/intel-iommu# ls
> address  cap  domains_supported  domains_used  ecap  version

[root@manet intel-iommu]# lspci | grep Mellanox
03:00.0 Network controller: Mellanox Technologies MT27520 Family [ConnectX-3 
Pro]
[root@manet intel-iommu]# pwd
/sys/devices/pci:00/:00:03.0/:03:00.0/iommu/intel-iommu
[root@manet intel-iommu]# for i in *; do   echo -n $i ": ";   cat $i; done
address : c7ffc000
cap : d2078c106f0466
domains_supported : 65536
domains_used : 62
ecap : f020de
version : 1:0
[root@manet intel-iommu]#


>> Fwiw, this system uses the Intel C612 chipset with Intel(R) Xeon(R)
>> E5-2603 v3 @ 1.60GHz CPUs.
> 
> Can you please also hack a line of code to check the return value of
> iommu_dma_map_sg()?

diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index baca49fe83af..e811562ead0e 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -328,6 +328,7 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
 
dma_nents = ib_dma_map_sg(ep->re_id->device, mr->mr_sg, mr->mr_nents,
  mr->mr_dir);
+   trace_printk("ib_dma_map_sg(%d) returns %d\n", mr->mr_nents, dma_nents);
if (!dma_nents)
goto out_dmamap_err;
mr->mr_device = ep->re_id->device;

During the 256KB iozone test I used before, this trace log is generated:

   kworker/u28:3-1269  [000]   336.054743: bprint:   frwr_map: 
ib_dma_map_sg(30) returns 1
   kworker/u28:3-1269  [000]   336.054835: bprint:   frwr_map: 
ib_dma_map_sg(30) returns 1
   kworker/u28:3-1269  [000]   336.055022: bprint:   frwr_map: 
ib_dma_map_sg(4) returns 1
   kworker/u28:3-1269  [000]   336.055118: bprint:   frwr_map: 
ib_dma_map_sg(30) returns 1
   kworker/u28:3-1269  [000]   336.055312: bprint:   frwr_map: 
ib_dma_map_sg(30) returns 1
   kworker/u28:3-1269  [000]   336.055407: bprint:   frwr_map: 
ib_dma_map_sg(4) returns 1

--
Chuck Lever





Re: performance regression noted in v5.11-rc after c062db039f40

2021-01-20 Thread Chuck Lever



> On Jan 19, 2021, at 9:11 PM, Lu Baolu  wrote:
> 
> On 1/19/21 10:37 PM, Chuck Lever wrote:
>>> On Jan 18, 2021, at 8:22 PM, Lu Baolu  wrote:
>>> 
>>> Do you mind posting the cap and ecap of the iommu used by your device?
>>> 
>>> You can get it via sysfs, for example:
>>> 
>>> /sys/bus/pci/devices/:00:14.0/iommu/intel-iommu# ls
>>> address  cap  domains_supported  domains_used  ecap  version
>> [root@manet intel-iommu]# lspci | grep Mellanox
>> 03:00.0 Network controller: Mellanox Technologies MT27520 Family [ConnectX-3 
>> Pro]
>> [root@manet intel-iommu]# pwd
>> /sys/devices/pci:00/:00:03.0/:03:00.0/iommu/intel-iommu
>> [root@manet intel-iommu]# for i in *; do   echo -n $i ": ";   cat $i; done
>> address : c7ffc000
>> cap : d2078c106f0466
> 
> MGAW: 10 (supporting 48-bit address width)
> SAGAW: 00100 (supporting 48-bit 4-level page table)
> 
> So the calculation of domain->domain.geometry.aperture_end is right.
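
Taking the quoted values at face value, they can be decoded with a small
stand-alone program that uses the same bit positions as the intel-iommu
driver's cap_mgaw()/cap_sagaw()/ecap_coherent() macros (illustration only).
Note that ecap bit 0 (page-walk coherency) is clear here, which is consistent
with the clflush cost discussed elsewhere in this thread.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t cap  = 0xd2078c106f0466ULL;	/* values quoted above */
	uint64_t ecap = 0xf020deULL;

	printf("MGAW: %llu-bit max guest address width\n",
	       (unsigned long long)(((cap >> 16) & 0x3f) + 1));
	printf("SAGAW: %#llx (bit 2 => 48-bit, 4-level page table)\n",
	       (unsigned long long)((cap >> 8) & 0x1f));
	printf("Page-walk coherency: %s\n",
	       (ecap & 0x1) ? "supported" :
			      "not supported (CPU cache flushes needed)");
	return 0;
}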

I found the cause of the performance loss with c062db039f40: it was
a testing error on my part. I will begin looking at c588072bba6b
("iommu/vt-d: Convert intel iommu driver to the iommu ops").


--
Chuck Lever





Re: performance regression noted in v5.11-rc after c062db039f40

2021-01-21 Thread Chuck Lever



> On Jan 18, 2021, at 1:00 PM, Robin Murphy  wrote:
> 
> On 2021-01-18 16:18, Chuck Lever wrote:
>>> On Jan 12, 2021, at 9:38 AM, Will Deacon  wrote:
>>> 
>>> [Expanding cc list to include DMA-IOMMU and intel IOMMU folks]
>>> 
>>> On Fri, Jan 08, 2021 at 04:18:36PM -0500, Chuck Lever wrote:
>>>> Hi-
>>>> 
>>>> [ Please cc: me on replies, I'm not currently subscribed to
>>>> iommu@lists ].
>>>> 
>>>> I'm running NFS performance tests on InfiniBand using CX-3 Pro cards
>>>> at 56Gb/s. The test is iozone on an NFSv3/RDMA mount:
>>>> 
>>>> /home/cel/bin/iozone -M -+u -i0 -i1 -s1g -r256k -t12 -I
>>>> 
>>>> For those not familiar with the way storage protocols use RDMA, The
>>>> initiator/client sets up memory regions and the target/server uses
>>>> RDMA Read and Write to move data out of and into those regions. The
>>>> initiator/client uses only RDMA memory registration and invalidation
>>>> operations, and the target/server uses RDMA Read and Write.
>>>> 
>>>> My NFS client is a two-socket 12-core x86_64 system with its I/O MMU
>>>> enabled using the kernel command line options "intel_iommu=on
>>>> iommu=strict".
>>>> 
>>>> Recently I've noticed a significant (25-30%) loss in NFS throughput.
>>>> I was able to bisect on my client to the following commits.
>>>> 
>>>> Here's 65f746e8285f ("iommu: Add quirk for Intel graphic devices in
>>>> map_sg"). This is about normal for this test.
>>>> 
>>>>Children see throughput for 12 initial writers  = 4732581.09 kB/sec
>>>>Parent sees throughput for 12 initial writers   = 4646810.21 kB/sec
>>>>Min throughput per process  =  387764.34 kB/sec
>>>>Max throughput per process  =  399655.47 kB/sec
>>>>Avg throughput per process  =  394381.76 kB/sec
>>>>Min xfer= 1017344.00 kB
>>>>CPU Utilization: Wall time2.671CPU time1.974CPU 
>>>> utilization  73.89 %
>>>>Children see throughput for 12 rewriters= 4837741.94 kB/sec
>>>>Parent sees throughput for 12 rewriters = 4833509.35 kB/sec
>>>>Min throughput per process  =  398983.72 kB/sec
>>>>Max throughput per process  =  406199.66 kB/sec
>>>>Avg throughput per process  =  403145.16 kB/sec
>>>>Min xfer= 1030656.00 kB
>>>>CPU utilization: Wall time2.584CPU time1.959CPU 
>>>> utilization  75.82 %
>>>>Children see throughput for 12 readers  = 5921370.94 kB/sec
>>>>Parent sees throughput for 12 readers   = 5914106.69 kB/sec
>>>>Min throughput per process  =  491812.38 kB/sec
>>>>Max throughput per process  =  494777.28 kB/sec
>>>>Avg throughput per process  =  493447.58 kB/sec
>>>>Min xfer= 1042688.00 kB
>>>>CPU utilization: Wall time2.122CPU time1.968CPU 
>>>> utilization  92.75 %
>>>>Children see throughput for 12 re-readers   = 5947985.69 kB/sec
>>>>Parent sees throughput for 12 re-readers= 5941348.51 kB/sec
>>>>Min throughput per process  =  492805.81 kB/sec
>>>>Max throughput per process  =  497280.19 kB/sec
>>>>Avg throughput per process  =  495665.47 kB/sec
>>>>Min xfer= 1039360.00 kB
>>>>CPU utilization: Wall time2.111CPU time1.968CPU 
>>>> utilization  93.22 %
>>>> 
>>>> Here's c062db039f40 ("iommu/vt-d: Update domain geometry in
>>>> iommu_ops.at(de)tach_dev"). It's losing some steam here.
>>>> 
>>>>Children see throughput for 12 initial writers  = 4342419.12 kB/sec
>>>>Parent sees throughput for 12 initial writers   = 4310612.79 kB/sec
>>>>Min throughput per process  =  359299.06 kB/sec
>>>>Max throughput per process  =  363866.16 kB/sec
>>>&

Re: performance regression noted in v5.11-rc after c062db039f40

2021-01-22 Thread Chuck Lever



> On Jan 21, 2021, at 10:00 PM, Lu Baolu  wrote:
> 
> +Isaac
> 
> On 1/22/21 3:09 AM, Chuck Lever wrote:
>>> On Jan 18, 2021, at 1:00 PM, Robin Murphy  wrote:
>>> 
>>> On 2021-01-18 16:18, Chuck Lever wrote:
>>>>> On Jan 12, 2021, at 9:38 AM, Will Deacon  wrote:
>>>>> 
>>>>> [Expanding cc list to include DMA-IOMMU and intel IOMMU folks]
>>>>> 
>>>>> On Fri, Jan 08, 2021 at 04:18:36PM -0500, Chuck Lever wrote:
>>>>>> Hi-
>>>>>> 
>>>>>> [ Please cc: me on replies, I'm not currently subscribed to
>>>>>> iommu@lists ].
>>>>>> 
>>>>>> I'm running NFS performance tests on InfiniBand using CX-3 Pro cards
>>>>>> at 56Gb/s. The test is iozone on an NFSv3/RDMA mount:
>>>>>> 
>>>>>> /home/cel/bin/iozone -M -+u -i0 -i1 -s1g -r256k -t12 -I
>>>>>> 
>>>>>> For those not familiar with the way storage protocols use RDMA, The
>>>>>> initiator/client sets up memory regions and the target/server uses
>>>>>> RDMA Read and Write to move data out of and into those regions. The
>>>>>> initiator/client uses only RDMA memory registration and invalidation
>>>>>> operations, and the target/server uses RDMA Read and Write.
>>>>>> 
>>>>>> My NFS client is a two-socket 12-core x86_64 system with its I/O MMU
>>>>>> enabled using the kernel command line options "intel_iommu=on
>>>>>> iommu=strict".
>>>>>> 
>>>>>> Recently I've noticed a significant (25-30%) loss in NFS throughput.
>>>>>> I was able to bisect on my client to the following commits.
>>>>>> 
>>>>>> Here's 65f746e8285f ("iommu: Add quirk for Intel graphic devices in
>>>>>> map_sg"). This is about normal for this test.
>>>>>> 
>>>>>>  Children see throughput for 12 initial writers  = 4732581.09 kB/sec
>>>>>>  Parent sees throughput for 12 initial writers   = 4646810.21 kB/sec
>>>>>>  Min throughput per process  =  387764.34 kB/sec
>>>>>>  Max throughput per process  =  399655.47 kB/sec
>>>>>>  Avg throughput per process  =  394381.76 kB/sec
>>>>>>  Min xfer= 1017344.00 kB
>>>>>>  CPU Utilization: Wall time2.671CPU time1.974CPU 
>>>>>> utilization  73.89 %
>>>>>>  Children see throughput for 12 rewriters= 4837741.94 kB/sec
>>>>>>  Parent sees throughput for 12 rewriters = 4833509.35 kB/sec
>>>>>>  Min throughput per process  =  398983.72 kB/sec
>>>>>>  Max throughput per process  =  406199.66 kB/sec
>>>>>>  Avg throughput per process  =  403145.16 kB/sec
>>>>>>  Min xfer= 1030656.00 kB
>>>>>>  CPU utilization: Wall time2.584CPU time1.959CPU 
>>>>>> utilization  75.82 %
>>>>>>  Children see throughput for 12 readers  = 5921370.94 kB/sec
>>>>>>  Parent sees throughput for 12 readers   = 5914106.69 kB/sec
>>>>>>  Min throughput per process  =  491812.38 kB/sec
>>>>>>  Max throughput per process  =  494777.28 kB/sec
>>>>>>  Avg throughput per process  =  493447.58 kB/sec
>>>>>>  Min xfer= 1042688.00 kB
>>>>>>  CPU utilization: Wall time2.122CPU time1.968CPU 
>>>>>> utilization  92.75 %
>>>>>>  Children see throughput for 12 re-readers   = 5947985.69 kB/sec
>>>>>>  Parent sees throughput for 12 re-readers= 5941348.51 kB/sec
>>>>>>  Min throughput per process  =  492805.81 kB/sec
>>>>>>  Max throughput per process  =  497280.19 kB/sec
>>>>>>  Avg throughput per process  =  495665.47 kB/sec
>>>>>>  Min xfer= 1039360.00 kB
>>>>>>  CPU utilization: Wall time2.111CPU time1.968CPU 
>>>>>> utilization  93

Re: performance regression noted in v5.11-rc after c062db039f40

2021-01-22 Thread Chuck Lever



> On Jan 22, 2021, at 12:38 PM, Robin Murphy  wrote:
> 
> On 2021-01-22 16:18, Chuck Lever wrote:
>>> On Jan 21, 2021, at 10:00 PM, Lu Baolu  wrote:
>>> 
>>> +Isaac
>>> 
>>> On 1/22/21 3:09 AM, Chuck Lever wrote:
>>>>> On Jan 18, 2021, at 1:00 PM, Robin Murphy  wrote:
>>>>> 
>>>>> On 2021-01-18 16:18, Chuck Lever wrote:
>>>>>>> On Jan 12, 2021, at 9:38 AM, Will Deacon  wrote:
>>>>>>> 
>>>>>>> [Expanding cc list to include DMA-IOMMU and intel IOMMU folks]
>>>>>>> 
>>>>>>> On Fri, Jan 08, 2021 at 04:18:36PM -0500, Chuck Lever wrote:
>>>>>>>> Hi-
>>>>>>>> 
>>>>>>>> [ Please cc: me on replies, I'm not currently subscribed to
>>>>>>>> iommu@lists ].
>>>>>>>> 
>>>>>>>> I'm running NFS performance tests on InfiniBand using CX-3 Pro cards
>>>>>>>> at 56Gb/s. The test is iozone on an NFSv3/RDMA mount:
>>>>>>>> 
>>>>>>>> /home/cel/bin/iozone -M -+u -i0 -i1 -s1g -r256k -t12 -I
>>>>>>>> 
>>>>>>>> For those not familiar with the way storage protocols use RDMA, The
>>>>>>>> initiator/client sets up memory regions and the target/server uses
>>>>>>>> RDMA Read and Write to move data out of and into those regions. The
>>>>>>>> initiator/client uses only RDMA memory registration and invalidation
>>>>>>>> operations, and the target/server uses RDMA Read and Write.
>>>>>>>> 
>>>>>>>> My NFS client is a two-socket 12-core x86_64 system with its I/O MMU
>>>>>>>> enabled using the kernel command line options "intel_iommu=on
>>>>>>>> iommu=strict".
>>>>>>>> 
>>>>>>>> Recently I've noticed a significant (25-30%) loss in NFS throughput.
>>>>>>>> I was able to bisect on my client to the following commits.
>>>>>>>> 
>>>>>>>> Here's 65f746e8285f ("iommu: Add quirk for Intel graphic devices in
>>>>>>>> map_sg"). This is about normal for this test.
>>>>>>>> 
>>>>>>>>Children see throughput for 12 initial writers  = 4732581.09 
>>>>>>>> kB/sec
>>>>>>>>Parent sees throughput for 12 initial writers   = 4646810.21 
>>>>>>>> kB/sec
>>>>>>>>Min throughput per process  =  387764.34 
>>>>>>>> kB/sec
>>>>>>>>Max throughput per process  =  399655.47 
>>>>>>>> kB/sec
>>>>>>>>Avg throughput per process  =  394381.76 
>>>>>>>> kB/sec
>>>>>>>>Min xfer= 1017344.00 kB
>>>>>>>>CPU Utilization: Wall time2.671CPU time1.974CPU 
>>>>>>>> utilization  73.89 %
>>>>>>>>Children see throughput for 12 rewriters= 4837741.94 
>>>>>>>> kB/sec
>>>>>>>>Parent sees throughput for 12 rewriters = 4833509.35 
>>>>>>>> kB/sec
>>>>>>>>Min throughput per process  =  398983.72 
>>>>>>>> kB/sec
>>>>>>>>Max throughput per process  =  406199.66 
>>>>>>>> kB/sec
>>>>>>>>Avg throughput per process  =  403145.16 
>>>>>>>> kB/sec
>>>>>>>>Min xfer= 1030656.00 kB
>>>>>>>>CPU utilization: Wall time2.584CPU time1.959CPU 
>>>>>>>> utilization  75.82 %
>>>>>>>>Children see throughput for 12 readers  = 5921370.94 
>>>>>>>> kB/sec
>>>>>>>>Parent sees throughput for 12 readers   = 5914106.69 
>>>>>>>> kB/sec
>>>>>>>>Min throughput per process  =  491812.38 
>>>>&

Re: [RFT PATCH 0/3] Performance regression noted in v5.11-rc after c062db039f40

2021-01-25 Thread Chuck Lever
  CPU utilization: Wall time2.144CPU time1.895CPU 
utilization  88.41 %


Children see throughput for 12 re-readers   = 5847438.62 kB/sec
Parent sees throughput for 12 re-readers= 5839292.18 kB/sec
Min throughput per process  = 485835.03 kB/sec 
Max throughput per process  = 488702.12 kB/sec
Avg throughput per process  = 487286.55 kB/sec
Min xfer= 1042688.00 kB
CPU utilization: Wall time2.148CPU time1.909CPU 
utilization  88.84 %

NFS READ throughput is almost fully restored. A normal-looking throughput
result, copied from the previous thread, is:

Children see throughput for 12 readers  = 5921370.94 kB/sec
Parent sees throughput for 12 readers   = 5914106.69 kB/sec

The NFS WRITE throughput result appears to be unchanged, or slightly
worse than before. I don't have an explanation for this result. I also
applied your patches on the NFS server, without noting any improvement.


--
Chuck Lever





Re: [RFT PATCH 0/3] Performance regression noted in v5.11-rc after c062db039f40

2021-01-25 Thread Chuck Lever



> On Jan 25, 2021, at 12:39 PM, Chuck Lever  wrote:
> 
> Hello Lu -
> 
> Many thanks for your prototype.
> 
> 
>> On Jan 24, 2021, at 9:38 PM, Lu Baolu  wrote:
>> 
>> This patch series is only for Request-For-Testing purpose. It aims to
>> fix the performance regression reported here.
>> 
>> https://lore.kernel.org/linux-iommu/d81314ed-5673-44a6-b597-090e3cb83...@oracle.com/
>> 
>> The first two patches are borrowed from here.
>> 
>> https://lore.kernel.org/linux-iommu/20210107122909.16317-1-yong...@mediatek.com/
>> 
>> Please kindly help to verification.
>> 
>> Best regards,
>> baolu
>> 
>> Lu Baolu (1):
>> iommu/vt-d: Add iotlb_sync_map callback
>> 
>> Yong Wu (2):
>> iommu: Move iotlb_sync_map out from __iommu_map
>> iommu: Add iova and size as parameters in iotlb_sync_map
>> 
>> drivers/iommu/intel/iommu.c | 86 +
>> drivers/iommu/iommu.c   | 23 +++---
>> drivers/iommu/tegra-gart.c  |  7 ++-
>> include/linux/iommu.h   |  3 +-
>> 4 files changed, 83 insertions(+), 36 deletions(-)
> 
> Here are results with the NFS client at stock v5.11-rc5 and the
> NFS server at v5.10, showing the regression I reported earlier.
> 
>   Children see throughput for 12 initial writers  = 4534582.00 kB/sec
>   Parent sees throughput for 12 initial writers   = 4458145.56 kB/sec
>   Min throughput per process  = 373101.59 kB/sec 
>   Max throughput per process  = 382669.50 kB/sec
>   Avg throughput per process  = 377881.83 kB/sec
>   Min xfer= 1022720.00 kB
>   CPU Utilization: Wall time2.787CPU time1.922CPU 
> utilization  68.95 %
> 
> 
>   Children see throughput for 12 rewriters= 4542003.12 kB/sec
>   Parent sees throughput for 12 rewriters = 4538024.19 kB/sec
>   Min throughput per process  = 374672.00 kB/sec 
>   Max throughput per process  = 383983.78 kB/sec
>   Avg throughput per process  = 378500.26 kB/sec
>   Min xfer= 1022976.00 kB
>   CPU utilization: Wall time2.733CPU time1.947CPU 
> utilization  71.25 %
> 
> 
>   Children see throughput for 12 readers  = 4568632.03 kB/sec
>   Parent sees throughput for 12 readers   = 4563672.02 kB/sec
>   Min throughput per process  = 376727.56 kB/sec 
>   Max throughput per process  = 383783.91 kB/sec
>   Avg throughput per process  = 380719.34 kB/sec
>   Min xfer= 1029376.00 kB
>   CPU utilization: Wall time2.733CPU time1.898CPU 
> utilization  69.46 %
> 
> 
>   Children see throughput for 12 re-readers   = 4610702.78 kB/sec
>   Parent sees throughput for 12 re-readers= 4606135.66 kB/sec
>   Min throughput per process  = 381532.78 kB/sec 
>   Max throughput per process  = 387072.53 kB/sec
>   Avg throughput per process  = 384225.23 kB/sec
>   Min xfer= 1034496.00 kB
>   CPU utilization: Wall time2.711CPU time1.910CPU 
> utilization  70.45 %
> 
> Here's the NFS client at v5.11-rc5 with your series applied.
> The NFS server remains at v5.10:
> 
>   Children see throughput for 12 initial writers  = 4434778.81 kB/sec
>   Parent sees throughput for 12 initial writers   = 4408190.69 kB/sec
>   Min throughput per process  = 367865.28 kB/sec 
>   Max throughput per process  = 371134.38 kB/sec
>   Avg throughput per process  = 369564.90 kB/sec
>   Min xfer= 1039360.00 kB
>   CPU Utilization: Wall time2.842CPU time1.904CPU 
> utilization  66.99 %
> 
> 
>   Children see throughput for 12 rewriters= 4476870.69 kB/sec
>   Parent sees throughput for 12 rewriters = 4471701.48 kB/sec
>   Min throughput per process  = 370985.34 kB/sec 
>   Max throughput per process  = 374752.28 kB/sec
>   Avg throughput per process  = 373072.56 kB/sec
>   Min xfer= 1038592.00 kB
>   CPU utilization: Wall time2.801CPU time1.902CPU 
> utilization  67.91 %
> 
> 
&g