Today's implementation of csum_shift() leads to branching based on the
parity of 'offset':

        000002f8 <csum_block_add>:
             2f8:       70 a5 00 01     andi.   r5,r5,1
             2fc:       41 a2 00 08     beq     304 <csum_block_add+0xc>
             300:       54 84 c0 3e     rotlwi  r4,r4,24
             304:       7c 63 20 14     addc    r3,r3,r4
             308:       7c 63 01 94     addze   r3,r3
             30c:       4e 80 00 20     blr
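
The branch above comes from the generic csum_shift() helper in
include/net/checksum.h, inlined into csum_block_add(); it looks roughly
like this (the same function is visible in the hunk further down):

        static __always_inline __wsum csum_shift(__wsum sum, int offset)
        {
                /* rotate sum to align it with a 16b boundary */
                if (offset & 1)
                        return (__force __wsum)ror32((__force u32)sum, 8);
                return sum;
        }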

Use the first bit of 'offset' directly as input to the rotation instead
of branching:

        000002f8 <csum_block_add>:
             2f8:       54 a5 1f 38     rlwinm  r5,r5,3,28,28
             2fc:       20 a5 00 20     subfic  r5,r5,32
             300:       5c 84 28 3e     rotlw   r4,r4,r5
             304:       7c 63 20 14     addc    r3,r3,r4
             308:       7c 63 01 94     addze   r3,r3
             30c:       4e 80 00 20     blr
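
In C terms, the sequence above roughly corresponds to keeping the right
rotation but feeding it a rotation count computed from the low bit of
'offset' (the subfic converts that count into the left-rotation amount
that rotlw expects), along the lines of:

        return (__force __wsum)ror32((__force u32)sum, (offset & 1) << 3);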

And rotate left instead of right to skip one more instruction (the
subfic). This has no impact on the final sum, since rotating left by 8
rather than right by 8 merely swaps the two 16-bit halves of the 32-bit
sum, which the ones'-complement fold cancels out:

        000002f8 <csum_block_add>:
             2f8:       54 a5 1f 38     rlwinm  r5,r5,3,28,28
             2fc:       5c 84 28 3e     rotlw   r4,r4,r5
             300:       7c 63 20 14     addc    r3,r3,r4
             304:       7c 63 01 94     addze   r3,r3
             308:       4e 80 00 20     blr
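
As a quick sanity check (not part of this patch), a minimal userspace
sketch with local rol32/ror32 and fold helpers shows that both rotation
directions fold to the same 16-bit checksum:

        #include <stdint.h>
        #include <stdio.h>

        /* Demo-only helpers, not the kernel's rol32()/ror32(). */
        static uint32_t rol32(uint32_t x, unsigned int n)
        {
                return (x << n) | (x >> (32 - n));
        }

        static uint32_t ror32(uint32_t x, unsigned int n)
        {
                return (x >> n) | (x << (32 - n));
        }

        /* Fold a 32-bit ones'-complement sum down to 16 bits. */
        static uint16_t fold(uint32_t sum)
        {
                sum = (sum & 0xffff) + (sum >> 16);
                sum = (sum & 0xffff) + (sum >> 16);
                return (uint16_t)sum;
        }

        int main(void)
        {
                uint32_t sum = 0x12345678;

                /* rol32(sum, 8) and ror32(sum, 8) differ by a rotation of 16,
                 * i.e. a swap of the 16-bit halves, which fold() cancels. */
                printf("%04x %04x\n", fold(rol32(sum, 8)), fold(ror32(sum, 8)));
                return 0;
        }

Both values come out identical for any value of 'sum'.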

It seems that only powerpc benefits from a branchless implementation.
Other major architectures like ARM or X86 get better code with the
generic implementation and its branch.

Signed-off-by: Christophe Leroy <christophe.le...@csgroup.eu>
---
 arch/powerpc/include/asm/checksum.h | 7 +++++++
 include/net/checksum.h              | 2 ++
 2 files changed, 9 insertions(+)

diff --git a/arch/powerpc/include/asm/checksum.h b/arch/powerpc/include/asm/checksum.h
index 19eaa2b6d043..8321f6053a67 100644
--- a/arch/powerpc/include/asm/checksum.h
+++ b/arch/powerpc/include/asm/checksum.h
@@ -111,6 +111,13 @@ static __always_inline __wsum csum_add(__wsum csum, __wsum addend)
 #endif
 }
 
+#define HAVE_ARCH_CSUM_SHIFT
+static __always_inline __wsum csum_shift(__wsum sum, int offset)
+{
+       /* rotate sum to align it with a 16b boundary */
+       return (__force __wsum)rol32((__force u32)sum, (offset & 1) << 3);
+}
+
 /*
  * This is a version of ip_compute_csum() optimized for IP headers,
  * which always checksum on 4 octet boundaries.  ihl is the number
diff --git a/include/net/checksum.h b/include/net/checksum.h
index 79c67f14c448..6bc783b7a06c 100644
--- a/include/net/checksum.h
+++ b/include/net/checksum.h
@@ -80,6 +80,7 @@ static __always_inline __sum16 csum16_sub(__sum16 csum, __be16 addend)
        return csum16_add(csum, ~addend);
 }
 
+#ifndef HAVE_ARCH_CSUM_SHIFT
 static __always_inline __wsum csum_shift(__wsum sum, int offset)
 {
        /* rotate sum to align it with a 16b boundary */
@@ -87,6 +88,7 @@ static __always_inline __wsum csum_shift(__wsum sum, int offset)
                return (__force __wsum)ror32((__force u32)sum, 8);
        return sum;
 }
+#endif
 
 static __always_inline __wsum
 csum_block_add(__wsum csum, __wsum csum2, int offset)
-- 
2.34.1
