Improve __csum_partial by interleaving loads and adds.

On an 8xx, it brings neither improvement nor degradation.
On an 83xx, it brings a 25% improvement.

Signed-off-by: Christophe Leroy <christophe.le...@c-s.fr>
---
 arch/powerpc/lib/checksum_32.S | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/lib/checksum_32.S b/arch/powerpc/lib/checksum_32.S
index d2238ea82209..aa224069f93a 100644
--- a/arch/powerpc/lib/checksum_32.S
+++ b/arch/powerpc/lib/checksum_32.S
@@ -47,16 +47,25 @@ _GLOBAL(__csum_partial)
        bdnz    2b
 21:    srwi.   r6,r4,4         /* # blocks of 4 words to do */
        beq     3f
+       lwz     r0,4(r3)
        mtctr   r6
-22:    lwz     r0,4(r3)
        lwz     r6,8(r3)
+       adde    r5,r5,r0
        lwz     r7,12(r3)
+       adde    r5,r5,r6
        lwzu    r8,16(r3)
+       adde    r5,r5,r7
+       bdz     23f
+22:    lwz     r0,4(r3)
+       adde    r5,r5,r8
+       lwz     r6,8(r3)
        adde    r5,r5,r0
+       lwz     r7,12(r3)
        adde    r5,r5,r6
+       lwzu    r8,16(r3)
        adde    r5,r5,r7
-       adde    r5,r5,r8
        bdnz    22b
+23:    adde    r5,r5,r8
 3:     andi.   r0,r4,2
        beq+    4f
        lhz     r0,4(r3)
-- 
2.13.3

Reply via email to