Rewrite __clear_user() on the same principle as memset() with a zero fill value, using dcbz to clear complete cache lines.
This code is a copy/paste of memset(), with some modifications to keep track of the number of bytes that remain to be cleared, since that count must be returned in case of error. On an MPC885, throughput is almost doubled: Before: ~# dd if=/dev/zero of=/dev/null bs=1M count=1000 1048576000 bytes (1000.0MB) copied, 18.990779 seconds, 52.7MB/s After: ~# dd if=/dev/zero of=/dev/null bs=1M count=1000 1048576000 bytes (1000.0MB) copied, 9.611468 seconds, 104.0MB/s On an MPC8321, throughput is multiplied by 2.12: Before: root@vgoippro:~# dd if=/dev/zero of=/dev/null bs=1M count=1000 1048576000 bytes (1000.0MB) copied, 6.844352 seconds, 146.1MB/s After: root@vgoippro:~# dd if=/dev/zero of=/dev/null bs=1M count=1000 1048576000 bytes (1000.0MB) copied, 3.218854 seconds, 310.7MB/s Signed-off-by: Christophe Leroy <christophe.le...@c-s.fr> --- arch/powerpc/lib/string_32.S | 85 +++++++++++++++++++++++++++++++------------- 1 file changed, 60 insertions(+), 25 deletions(-) diff --git a/arch/powerpc/lib/string_32.S b/arch/powerpc/lib/string_32.S index ab8c4f5f31b6..2c11c2019b69 100644 --- a/arch/powerpc/lib/string_32.S +++ b/arch/powerpc/lib/string_32.S @@ -13,6 +13,7 @@ #include <asm/errno.h> #include <asm/ppc_asm.h> #include <asm/export.h> +#include <asm/cache.h> .text @@ -31,44 +32,78 @@ _GLOBAL(memcmp) blr EXPORT_SYMBOL(memcmp) +CACHELINE_BYTES = L1_CACHE_BYTES +LG_CACHELINE_BYTES = L1_CACHE_SHIFT +CACHELINE_MASK = (L1_CACHE_BYTES-1) + _GLOBAL(__clear_user) - addi r6,r3,-4 - li r3,0 - li r5,0 - cmplwi 0,r4,4 +/* + * Use dcbz on the complete cache lines in the destination + * to set them to zero. This requires that the destination + * area is cacheable. + */ + cmplwi cr0, r4, 4 + mr r10, r3 + li r3, 0 blt 7f - /* clear a single word */ -11: stwu r5,4(r6) + +11: stw r3, 0(r10) beqlr - /* clear word sized chunks */ - andi. r0,r6,3 - add r4,r0,r4 - subf r6,r0,r6 - srwi r0,r4,2 - andi. r4,r4,3 + andi. 
r0, r10, 3 + add r11, r0, r4 + subf r6, r0, r10 + + clrlwi r7, r6, 32 - LG_CACHELINE_BYTES + add r8, r7, r11 + srwi r9, r8, LG_CACHELINE_BYTES + addic. r9, r9, -1 /* total number of complete cachelines */ + ble 2f + xori r0, r7, CACHELINE_MASK & ~3 + srwi. r0, r0, 2 + beq 3f + mtctr r0 +4: stwu r3, 4(r6) + bdnz 4b +3: mtctr r9 + li r7, 4 +10: dcbz r7, r6 + addi r6, r6, CACHELINE_BYTES + bdnz 10b + clrlwi r11, r8, 32 - LG_CACHELINE_BYTES + addi r11, r11, 4 + +2: srwi r0 ,r11 ,2 mtctr r0 - bdz 7f -1: stwu r5,4(r6) + bdz 6f +1: stwu r3, 4(r6) bdnz 1b - /* clear byte sized chunks */ -7: cmpwi 0,r4,0 +6: andi. r11, r11, 3 beqlr - mtctr r4 - addi r6,r6,3 -8: stbu r5,1(r6) + mtctr r11 + addi r6, r6, 3 +8: stbu r3, 1(r6) bdnz 8b blr -90: mr r3,r4 + +7: cmpwi cr0, r4, 0 + beqlr + mtctr r4 + addi r6, r10, -1 +9: stbu r3, 1(r6) + bdnz 9b blr -91: mfctr r3 - slwi r3,r3,2 - add r3,r3,r4 + +90: mr r3, r4 blr -92: mfctr r3 +91: add r3, r10, r4 + subf r3, r6, r3 blr EX_TABLE(11b, 90b) + EX_TABLE(4b, 91b) + EX_TABLE(10b, 91b) EX_TABLE(1b, 91b) - EX_TABLE(8b, 92b) + EX_TABLE(8b, 91b) + EX_TABLE(9b, 91b) EXPORT_SYMBOL(__clear_user) -- 2.13.3