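On the 8xx, load latency is 2 cycles and a taken branch also costs
2 cycles. Unroll the word-summing loop in csum_partial() so that it
handles 4 words per iteration: the leftover words (word count modulo 4)
are summed one at a time first, then the rest in blocks of 4 words.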

Signed-off-by: Christophe Leroy <christophe.le...@c-s.fr>
---
 arch/powerpc/lib/checksum_32.S | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/lib/checksum_32.S b/arch/powerpc/lib/checksum_32.S
index 2e4879c..9c48ee0 100644
--- a/arch/powerpc/lib/checksum_32.S
+++ b/arch/powerpc/lib/checksum_32.S
@@ -75,10 +75,24 @@ _GLOBAL(csum_partial)
        srwi.   r6,r4,2         /* # words to do */
        adde    r5,r5,r0
        beq     3f
-1:     mtctr   r6
+1:     andi.   r6,r6,3         /* Prepare to handle words 4 by 4 */
+       beq     21f
+       mtctr   r6
 2:     lwzu    r0,4(r3)
        adde    r5,r5,r0
        bdnz    2b
+21:    srwi.   r6,r4,4         /* # blocks of 4 words to do */
+       beq     3f
+       mtctr   r6
+22:    lwzu    r0,4(r3)
+       lwzu    r6,4(r3)
+       lwzu    r7,4(r3)
+       lwzu    r8,4(r3)
+       adde    r5,r5,r0
+       adde    r5,r5,r6
+       adde    r5,r5,r7
+       adde    r5,r5,r8
+       bdnz    22b
 3:     andi.   r0,r4,2
        beq+    4f
        lhz     r0,4(r3)
-- 
2.1.0

_______________________________________________
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Reply via email to