Improve __csum_partial by interleaving loads and adds. On a 8xx, it brings neither improvement nor degradation. On a 83xx, it brings a 25% improvement.
Signed-off-by: Christophe Leroy <christophe.le...@c-s.fr> --- arch/powerpc/lib/checksum_32.S | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/lib/checksum_32.S b/arch/powerpc/lib/checksum_32.S index d2238ea82209..aa224069f93a 100644 --- a/arch/powerpc/lib/checksum_32.S +++ b/arch/powerpc/lib/checksum_32.S @@ -47,16 +47,25 @@ _GLOBAL(__csum_partial) bdnz 2b 21: srwi. r6,r4,4 /* # blocks of 4 words to do */ beq 3f + lwz r0,4(r3) mtctr r6 -22: lwz r0,4(r3) lwz r6,8(r3) + adde r5,r5,r0 lwz r7,12(r3) + adde r5,r5,r6 lwzu r8,16(r3) + adde r5,r5,r7 + bdz 23f +22: lwz r0,4(r3) + adde r5,r5,r8 + lwz r6,8(r3) adde r5,r5,r0 + lwz r7,12(r3) adde r5,r5,r6 + lwzu r8,16(r3) adde r5,r5,r7 - adde r5,r5,r8 bdnz 22b +23: adde r5,r5,r8 3: andi. r0,r4,2 beq+ 4f lhz r0,4(r3) -- 2.13.3