On 2021-03-23 07:34, Yang Yingliang wrote:
When copying more than 128 bytes, src/dst is incremented after
each ldp/stp instruction, which costs more time. To improve this,
only increment src/dst after each 64 bytes loaded or stored.
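
Roughly, in C terms, the change looks like the sketch below (illustrative
only: the copy64_* names are invented here, each 16-byte memcpy() stands in
for one ldp/stp pair, and the real code keeps the four pairs in registers):

#include <stdio.h>
#include <string.h>

/* Old scheme: advance dst/src after every 16-byte pair (post-indexed ldp/stp). */
static void copy64_post_index(unsigned char **dstp, const unsigned char **srcp)
{
        int i;

        for (i = 0; i < 4; i++) {
                memcpy(*dstp, *srcp, 16);
                *dstp += 16;
                *srcp += 16;
        }
}

/* New scheme: load/store at fixed offsets, advance dst/src once per 64 bytes. */
static void copy64_offsets(unsigned char **dstp, const unsigned char **srcp)
{
        int i;

        for (i = 0; i < 4; i++)
                memcpy(*dstp + 16 * i, *srcp + 16 * i, 16);
        *dstp += 64;
        *srcp += 64;
}

int main(void)
{
        unsigned char src_buf[128], dst_buf[128];
        const unsigned char *src = src_buf;
        unsigned char *dst = dst_buf;

        memset(src_buf, 0xab, sizeof(src_buf));
        copy64_post_index(&dst, &src);
        copy64_offsets(&dst, &src);
        printf("copied %d bytes\n", (int)(dst - dst_buf));     /* 128 */
        return 0;
}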

This breaks the required behaviour for copy_*_user(), since the fault handler expects the base address to be up-to-date at all times. Say you're copying 128 bytes and fault on the 4th store: it should return 80 bytes not copied; the code below would return 128 bytes not copied, even though 48 bytes have actually been written to the destination.
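
For concreteness, a minimal sketch of that arithmetic (assuming a fixup that
derives the return value from the destination pointer as it stood at the
fault; the bytes_not_copied() helper is hypothetical, not the actual arm64
exception-table code):

#include <stdio.h>

/* Hypothetical: report how much of the copy is left, given where dst
 * had got to when the fault was taken. */
static unsigned long bytes_not_copied(unsigned long dst_at_fault,
                                      unsigned long dst_start,
                                      unsigned long total)
{
        return total - (dst_at_fault - dst_start);
}

int main(void)
{
        unsigned long dst_start = 0x1000, total = 128;

        /* Per-pair updates: a fault on the 4th stp sees dst_start + 48. */
        printf("%lu\n", bytes_not_copied(dst_start + 48, dst_start, total)); /* 80 */

        /* Batched updates: dst has not moved yet, so the whole copy looks undone. */
        printf("%lu\n", bytes_not_copied(dst_start, dst_start, total)); /* 128 */

        return 0;
}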

We've had a couple of tries at updating this code (because the whole template is frankly a bit terrible, and a long way from the well-optimised code it was derived from), but getting the fault-handling behaviour right without making the handler itself ludicrously complex has proven tricky. And then it got bumped down the priority list while the uaccess behaviour in general was in flux - now that the dust has largely settled on that I should probably try to find time to pick this up again...

Robin.

Cost of copying 4096 bytes on Kunpeng920 (ms):
Without this patch:
memcpy: 143.85 copy_from_user: 172.69 copy_to_user: 199.23

With this patch:
memcpy: 107.12 copy_from_user: 157.50 copy_to_user: 198.85

It's about a 25% improvement in memcpy().

Signed-off-by: Yang Yingliang <yangyingli...@huawei.com>
---
  arch/arm64/lib/copy_template.S | 36 +++++++++++++++++++---------------
  1 file changed, 20 insertions(+), 16 deletions(-)

diff --git a/arch/arm64/lib/copy_template.S b/arch/arm64/lib/copy_template.S
index 488df234c49a..c3cd6f84c9c0 100644
--- a/arch/arm64/lib/copy_template.S
+++ b/arch/arm64/lib/copy_template.S
@@ -152,29 +152,33 @@ D_h       .req    x14
        .p2align        L1_CACHE_SHIFT
  .Lcpy_body_large:
        /* pre-get 64 bytes data. */
-       ldp1    A_l, A_h, src, #16
-       ldp1    B_l, B_h, src, #16
-       ldp1    C_l, C_h, src, #16
-       ldp1    D_l, D_h, src, #16
+       ldp2    A_l, A_h, src, #0,  #8
+       ldp2    B_l, B_h, src, #16, #24
+       ldp2    C_l, C_h, src, #32, #40
+       ldp2    D_l, D_h, src, #48, #56
+       add     src, src, #64
  1:
        /*
        * interlace the load of next 64 bytes data block with store of the last
        * loaded 64 bytes data.
        */
-       stp1    A_l, A_h, dst, #16
-       ldp1    A_l, A_h, src, #16
-       stp1    B_l, B_h, dst, #16
-       ldp1    B_l, B_h, src, #16
-       stp1    C_l, C_h, dst, #16
-       ldp1    C_l, C_h, src, #16
-       stp1    D_l, D_h, dst, #16
-       ldp1    D_l, D_h, src, #16
+       stp2    A_l, A_h, dst, #0,  #8
+       ldp2    A_l, A_h, src, #0,  #8
+       stp2    B_l, B_h, dst, #16, #24
+       ldp2    B_l, B_h, src, #16, #24
+       stp2    C_l, C_h, dst, #32, #40
+       ldp2    C_l, C_h, src, #32, #40
+       stp2    D_l, D_h, dst, #48, #56
+       ldp2    D_l, D_h, src, #48, #56
+       add     src, src, #64
+       add     dst, dst, #64
        subs    count, count, #64
        b.ge    1b
-       stp1    A_l, A_h, dst, #16
-       stp1    B_l, B_h, dst, #16
-       stp1    C_l, C_h, dst, #16
-       stp1    D_l, D_h, dst, #16
+       stp2    A_l, A_h, dst, #0,  #8
+       stp2    B_l, B_h, dst, #16, #24
+       stp2    C_l, C_h, dst, #32, #40
+       stp2    D_l, D_h, dst, #48, #56
+       add     dst, dst, #64
        tst     count, #0x3f
        b.ne    .Ltail63
