According to the Intel datasheet, the rep movsb instruction
has a significant setup cost - 50 ticks - which badly
affects short string copy operations.

This change tries to avoid that cost by jumping to the explicit
loop available in the unrolled code for strings shorter
than 64 bytes. That threshold has been selected through empirical
measurements as the largest value that still ensures a measurable
gain.

Micro benchmarks of the __copy_from_user() function with
lengths in the [0-63] range show this performance gain
(the shorter the string, the larger the gain):

- in the [55%-4%] range on Intel Xeon(R) CPU E5-2690 v4
- in the [72%-9%] range on Intel Core i7-4810MQ

Other tested CPUs - namely Intel Atom S1260 and AMD Opteron
8216 - show no difference, because they do not expose the
ERMS feature bit.

Signed-off-by: Paolo Abeni <pab...@redhat.com>
---
 arch/x86/lib/copy_user_64.S | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index c595957..020f75c 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -37,7 +37,7 @@ ENTRY(copy_user_generic_unrolled)
        movl %edx,%ecx
        andl $63,%edx
        shrl $6,%ecx
-       jz 17f
+       jz .L_copy_short_string
 1:     movq (%rsi),%r8
 2:     movq 1*8(%rsi),%r9
 3:     movq 2*8(%rsi),%r10
@@ -58,7 +58,8 @@ ENTRY(copy_user_generic_unrolled)
        leaq 64(%rdi),%rdi
        decl %ecx
        jnz 1b
-17:    movl %edx,%ecx
+.L_copy_short_string:
+       movl %edx,%ecx
        andl $7,%edx
        shrl $3,%ecx
        jz 20f
@@ -174,6 +175,8 @@ EXPORT_SYMBOL(copy_user_generic_string)
  */
 ENTRY(copy_user_enhanced_fast_string)
        ASM_STAC
+       cmpl $64,%edx
+       jb .L_copy_short_string /* less then 64 bytes, avoid the costly 'rep' */
        movl %edx,%ecx
 1:     rep
        movsb
-- 
2.9.4

Reply via email to