Rewrite clear_user() on the same principle as memset(0), making use
of dcbz to clear complete cache lines.

This code is a copy/paste of memset(), with some modifications
in order to retrieve the remaining number of bytes to be cleared,
as it needs to be returned in case of error.

On an MPC885, throughput is almost doubled:

Before:
~# dd if=/dev/zero of=/dev/null bs=1M count=1000
1048576000 bytes (1000.0MB) copied, 18.990779 seconds, 52.7MB/s

After:
~# dd if=/dev/zero of=/dev/null bs=1M count=1000
1048576000 bytes (1000.0MB) copied, 9.611468 seconds, 104.0MB/s

On an MPC8321, throughput is multiplied by 2.12:

Before:
root@vgoippro:~# dd if=/dev/zero of=/dev/null bs=1M count=1000
1048576000 bytes (1000.0MB) copied, 6.844352 seconds, 146.1MB/s

After:
root@vgoippro:~# dd if=/dev/zero of=/dev/null bs=1M count=1000
1048576000 bytes (1000.0MB) copied, 3.218854 seconds, 310.7MB/s

Signed-off-by: Christophe Leroy <christophe.le...@c-s.fr>
---
 arch/powerpc/lib/string_32.S | 85 +++++++++++++++++++++++++++++++-------------
 1 file changed, 60 insertions(+), 25 deletions(-)

diff --git a/arch/powerpc/lib/string_32.S b/arch/powerpc/lib/string_32.S
index ab8c4f5f31b6..2c11c2019b69 100644
--- a/arch/powerpc/lib/string_32.S
+++ b/arch/powerpc/lib/string_32.S
@@ -13,6 +13,7 @@
 #include <asm/errno.h>
 #include <asm/ppc_asm.h>
 #include <asm/export.h>
+#include <asm/cache.h>
 
        .text
 
@@ -31,44 +32,78 @@ _GLOBAL(memcmp)
        blr
 EXPORT_SYMBOL(memcmp)
 
+CACHELINE_BYTES = L1_CACHE_BYTES	/* step added to r6 per dcbz in __clear_user */
+LG_CACHELINE_BYTES = L1_CACHE_SHIFT	/* shift count / bit width used for in-line offsets */
+CACHELINE_MASK = (L1_CACHE_BYTES-1)	/* low-bit mask; used with ~3 to find the distance to the next line */
+
 _GLOBAL(__clear_user)
-       addi    r6,r3,-4
-       li      r3,0
-       li      r5,0
-       cmplwi  0,r4,4
+/*
+ * Use dcbz on the complete cache lines in the destination
+ * to set them to zero.  This requires that the destination
+ * area is cacheable.
+ */
+       cmplwi  cr0, r4, 4      /* cr0 = size (r4) compared with 4, unsigned */
+       mr      r10, r3         /* r10 = start address, kept intact for the fixup code */
+       li      r3, 0           /* r3 = 0: the value stored everywhere, and the result when no fault occurs */
        blt     7f
-       /* clear a single word */
-11:    stwu    r5,4(r6)
+
+11:    stw     r3, 0(r10)      /* zero the first word at the (possibly unaligned) start */
        beqlr
-       /* clear word sized chunks */
-       andi.   r0,r6,3
-       add     r4,r0,r4
-       subf    r6,r0,r6
-       srwi    r0,r4,2
-       andi.   r4,r4,3
+       andi.   r0, r10, 3      /* r0 = start & 3 */
+       add     r11, r0, r4     /* r11 = size + (start & 3) = bytes from r6 to the end */
+       subf    r6, r0, r10     /* r6 = start rounded down to a word; stores below target r6 + 4 */
+
+       clrlwi  r7, r6, 32 - LG_CACHELINE_BYTES /* r7 = offset of r6 within its cache line */
+       add     r8, r7, r11     /* r8 = r7 + r11 = end offset from that line's base */
+       srwi    r9, r8, LG_CACHELINE_BYTES      /* r9 = r8 >> LG_CACHELINE_BYTES */
+       addic.  r9, r9, -1      /* total number of complete cachelines */
+       ble     2f              /* no complete line: plain word/byte loops only */
+       xori    r0, r7, CACHELINE_MASK & ~3     /* r0 = (CACHELINE_BYTES - 4) - r7 = bytes from r6 + 4 to the next line */
+       srwi.   r0, r0, 2       /* ... converted to a word count */
+       beq     3f              /* r6 + 4 is already line-aligned */
+       mtctr   r0
+4:     stwu    r3, 4(r6)       /* zero words up to the first line boundary */
+       bdnz    4b
+3:     mtctr   r9              /* ctr = number of complete lines */
+       li      r7, 4           /* dcbz index: r6 trails the target address by 4 */
+10:    dcbz    r7, r6          /* zero the whole cache line at r6 + 4 */
+       addi    r6, r6, CACHELINE_BYTES
+       bdnz    10b
+       clrlwi  r11, r8, 32 - LG_CACHELINE_BYTES        /* r11 = bytes left after the last complete line */
+       addi    r11, r11, 4     /* +4 so r11 again counts from r6, as the code at 2: expects */
+
+2:     srwi    r0 ,r11 ,2      /* r0 = r11 / 4 */
        mtctr   r0
-       bdz     7f
-1:     stwu    r5,4(r6)
+       bdz     6f              /* ctr - 1 words remain (the word at r6 is already done); none: go to bytes */
+1:     stwu    r3, 4(r6)       /* zero the remaining whole words */
        bdnz    1b
-       /* clear byte sized chunks */
-7:     cmpwi   0,r4,0
+6:     andi.   r11, r11, 3     /* r11 = trailing byte count (0..3) */
        beqlr
-       mtctr   r4
-       addi    r6,r6,3
-8:     stbu    r5,1(r6)
+       mtctr   r11
+       addi    r6, r6, 3       /* bias r6 so that stbu 1(r6) hits the byte after the last word */
+8:     stbu    r3, 1(r6)       /* zero the trailing bytes */
        bdnz    8b
        blr
-90:    mr      r3,r4
+
+7:     cmpwi   cr0, r4, 0      /* size < 4: byte-only path */
+       beqlr
+       mtctr   r4
+       addi    r6, r10, -1     /* bias r6 so that stbu 1(r6) hits the start address */
+9:     stbu    r3, 1(r6)       /* zero one byte per iteration */
+       bdnz    9b
        blr
-91:    mfctr   r3
-       slwi    r3,r3,2
-       add     r3,r3,r4
+
+90:    mr      r3, r4          /* fault on the first word store at 11: report the full size */
        blr
-92:    mfctr   r3
+91:    add     r3, r10, r4     /* fault in any loop: r3 = start + size ... */
+       subf    r3, r6, r3      /* ... minus r6. NOTE(review): r6 trails the faulting address (by 4 in the word/dcbz loops, by 1 in the byte loops), so this can exceed the exact uncleared count, e.g. size + 1 if the first stbu at 9: faults - confirm this is acceptable to callers */
        blr
 
        EX_TABLE(11b, 90b)
+       EX_TABLE(4b, 91b)       /* word loop before the first line boundary */
+       EX_TABLE(10b, 91b)      /* dcbz loop */
        EX_TABLE(1b, 91b)
-       EX_TABLE(8b, 92b)
+       EX_TABLE(8b, 91b)       /* trailing byte loop */
+       EX_TABLE(9b, 91b)       /* short (size < 4) byte loop */
 
 EXPORT_SYMBOL(__clear_user)
-- 
2.13.3

Reply via email to