It turns out that the IP checksumming code is still exercised often,
even though one might expect that modern NICs with checksum offload
have no use for it. However, as Lingyan points out, there are
combinations of features where the network stack may still fall back
to software checksumming, so it makes sense to provide an optimized
software implementation as well.

So provide an implementation of do_csum() in scalar assembler, which,
unlike C, gives direct access to the carry flag, making the code run
substantially faster. The routine uses overlapping 64-byte loads for
all input sizes > 64 bytes, in order to reduce the number of branches
and improve performance on cores with deep pipelines.
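
For reference, the accumulate-and-fold scheme is roughly equivalent to
the following C model. This is only an illustrative sketch and not part
of the patch: csum_sketch is a made-up name, __builtin_add_overflow
stands in for the carry flag that the adcs chain gets for free, it
assumes a little-endian host, and it glosses over the alignment
handling.

#include <stdint.h>
#include <string.h>

static uint16_t csum_sketch(const unsigned char *buff, int len)
{
        uint64_t sum = 0;

        /*
         * Accumulate 64-bit words, folding each carry-out back into
         * the sum; the flag-based adcs chain does this in a single
         * instruction per word.
         */
        while (len >= 8) {
                uint64_t word, carry;

                memcpy(&word, buff, 8);
                carry = __builtin_add_overflow(sum, word, &sum);
                sum += carry;           /* cannot overflow again */
                buff += 8;
                len -= 8;
        }

        /*
         * Trailing 1..7 bytes: load them into a zero-padded word. On
         * a little-endian host the padding lands in the high bytes,
         * which is what the overlapping load plus shift achieves in
         * the assembly.
         */
        if (len > 0) {
                uint64_t word = 0, carry;

                memcpy(&word, buff, len);
                carry = __builtin_add_overflow(sum, word, &sum);
                sum += carry;
        }

        /* Fold the 64-bit sum down to 16 bits, re-adding the carries */
        sum = (sum & 0xffffffff) + (sum >> 32);
        sum = (sum & 0xffffffff) + (sum >> 32);
        sum = (sum & 0xffff) + (sum >> 16);
        sum = (sum & 0xffff) + (sum >> 16);

        return (uint16_t)sum;
}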

On Cortex-A57, this implementation is on par with Lingyan's NEON
implementation, and roughly 7x as fast as the generic C code.

Cc: "huanglingyan (A)" <huanglingy...@huawei.com>
Signed-off-by: Ard Biesheuvel <ard.biesheu...@linaro.org>
---
Test code after the patch.

 arch/arm64/include/asm/checksum.h |   3 +
 arch/arm64/lib/Makefile           |   2 +-
 arch/arm64/lib/csum.S             | 127 ++++++++++++++++++++
 3 files changed, 131 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/include/asm/checksum.h b/arch/arm64/include/asm/checksum.h
index 0b6f5a7d4027..e906b956c1fc 100644
--- a/arch/arm64/include/asm/checksum.h
+++ b/arch/arm64/include/asm/checksum.h
@@ -46,6 +46,9 @@ static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl)
 }
 #define ip_fast_csum ip_fast_csum
 
+extern unsigned int do_csum(const unsigned char *buff, int len);
+#define do_csum do_csum
+
 #include <asm-generic/checksum.h>
 
 #endif /* __ASM_CHECKSUM_H */
diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
index 5540a1638baf..a7606007a749 100644
--- a/arch/arm64/lib/Makefile
+++ b/arch/arm64/lib/Makefile
@@ -3,7 +3,7 @@ lib-y           := clear_user.o delay.o copy_from_user.o               \
                   copy_to_user.o copy_in_user.o copy_page.o            \
                   clear_page.o memchr.o memcpy.o memmove.o memset.o    \
                   memcmp.o strcmp.o strncmp.o strlen.o strnlen.o       \
-                  strchr.o strrchr.o tishift.o
+                  strchr.o strrchr.o tishift.o csum.o
 
 ifeq ($(CONFIG_KERNEL_MODE_NEON), y)
 obj-$(CONFIG_XOR_BLOCKS)       += xor-neon.o
diff --git a/arch/arm64/lib/csum.S b/arch/arm64/lib/csum.S
new file mode 100644
index 000000000000..534e2ebdc426
--- /dev/null
+++ b/arch/arm64/lib/csum.S
@@ -0,0 +1,127 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2019 Linaro, Ltd. <ard.biesheu...@linaro.org>
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+ENTRY(do_csum)
+       adds            x2, xzr, xzr            // clear x2 and C flag
+
+       // 64 bytes at a time
+       lsr             x3, x1, #6
+       and             x1, x1, #63
+       cbz             x3, 1f
+
+       // Eight 64-bit adds per iteration
+0:     ldp             x4, x5, [x0], #64
+       ldp             x6, x7, [x0, #-48]
+       ldp             x8, x9, [x0, #-32]
+       ldp             x10, x11, [x0, #-16]
+       adcs            x2, x2, x4
+       sub             x3, x3, #1
+       adcs            x2, x2, x5
+       adcs            x2, x2, x6
+       adcs            x2, x2, x7
+       adcs            x2, x2, x8
+       adcs            x2, x2, x9
+       adcs            x2, x2, x10
+       adcs            x2, x2, x11
+       cbnz            x3, 0b
+       adc             x2, x2, xzr
+
+       cbz             x1, 7f
+       bic             x3, x1, #1
+       add             x12, x0, x1
+       add             x0, x0, x3
+       neg             x3, x3
+       add             x3, x3, #64
+       lsl             x3, x3, #3
+
+       // Handle remaining 63 bytes or less using an overlapping 64-byte load
+       // and a branchless code path to complete the calculation
+       ldp             x4, x5, [x0, #-64]
+       ldp             x6, x7, [x0, #-48]
+       ldp             x8, x9, [x0, #-32]
+       ldp             x10, x11, [x0, #-16]
+       ldrb            w12, [x12, #-1]
+
+       .irp            reg, x4, x5, x6, x7, x8, x9, x10, x11
+       cmp             x3, #64
+       csel            \reg, \reg, xzr, lt
+       ccmp            x3, xzr, #0, lt
+       csel            x13, x3, xzr, gt
+       sub             x3, x3, #64
+CPU_LE(        lsr             \reg, \reg, x13         )
+CPU_BE(        lsl             \reg, \reg, x13         )
+       .endr
+
+       adds            x2, x2, x4
+       adcs            x2, x2, x5
+       adcs            x2, x2, x6
+       adcs            x2, x2, x7
+       adcs            x2, x2, x8
+       adcs            x2, x2, x9
+       adcs            x2, x2, x10
+       adcs            x2, x2, x11
+       adc             x2, x2, xzr
+
+CPU_LE(        adds            x12, x2, x12            )
+CPU_BE(        adds            x12, x2, x12, lsl #8    )
+       adc             x12, x12, xzr
+       tst             x1, #1
+       csel            x2, x2, x12, eq
+
+7:     lsr             x1, x2, #32
+       adds            w2, w2, w1
+       adc             w2, w2, wzr
+
+       lsr             w1, w2, #16
+       uxth            w2, w2
+       add             w2, w2, w1
+
+       lsr             w1, w2, #16             // handle the carry by hand
+       add             w2, w2, w1
+
+       uxth            w0, w2
+       ret
+
+       // Handle 63 bytes or less
+1:     tbz             x1, #5, 2f
+       ldp             x4, x5, [x0], #32
+       ldp             x6, x7, [x0, #-16]
+       adds            x2, x2, x4
+       adcs            x2, x2, x5
+       adcs            x2, x2, x6
+       adcs            x2, x2, x7
+       adc             x2, x2, xzr
+
+2:     tbz             x1, #4, 3f
+       ldp             x4, x5, [x0], #16
+       adds            x2, x2, x4
+       adcs            x2, x2, x5
+       adc             x2, x2, xzr
+
+3:     tbz             x1, #3, 4f
+       ldr             x4, [x0], #8
+       adds            x2, x2, x4
+       adc             x2, x2, xzr
+
+4:     tbz             x1, #2, 5f
+       ldr             w4, [x0], #4
+       adds            x2, x2, x4
+       adc             x2, x2, xzr
+
+5:     tbz             x1, #1, 6f
+       ldrh            w4, [x0], #2
+       adds            x2, x2, x4
+       adc             x2, x2, xzr
+
+6:     tbz             x1, #0, 7b
+       ldrb            w4, [x0]
+CPU_LE(        adds            x2, x2, x4              )
+CPU_BE(        adds            x2, x2, x4, lsl #8      )
+       adc             x2, x2, xzr
+       b               7b
+ENDPROC(do_csum)
-- 
2.20.1

  diff --git a/lib/checksum.c b/lib/checksum.c
  index d3ec93f9e5f3..7711f1186f71 100644
  --- a/lib/checksum.c
  +++ b/lib/checksum.c
  @@ -37,7 +37,7 @@
   
   #include <asm/byteorder.h>
   
  -#ifndef do_csum
  +#if 1 //ndef do_csum
   static inline unsigned short from32to16(unsigned int x)
   {
          /* add up 16-bit and 16-bit for 16+c bit */
  @@ -47,7 +47,7 @@ static inline unsigned short from32to16(unsigned int x)
          return x;
   }
   
  -static unsigned int do_csum(const unsigned char *buff, int len)
  +static unsigned int __do_csum(const unsigned char *buff, int len)
   {
          int odd;
          unsigned int result = 0;
  @@ -206,3 +206,23 @@ __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr,
   }
   EXPORT_SYMBOL(csum_tcpudp_nofold);
   #endif
  +
  +extern u8 crypto_ft_tab[];
  +
  +static int __init do_selftest(void)
  +{
  +       int i, j;
  +       u16 c1, c2;
  +
  +       for (i = 0; i < 1024; i++) {
  +               for (j = i + 1; j <= 1024; j++) {
  +                       c1 = __do_csum(crypto_ft_tab + i, j - i);
  +                       c2 = do_csum(crypto_ft_tab + i, j - i);
  +
  +                       if (c1 != c2)
  +                               pr_err("######### %d %d %x %x\n", i, j, c1, c2);
  +               }
  +       }
  +       return 0;
  +}
  +late_initcall(do_selftest);
