On Tue, Feb 19, 2019 at 12:08:42AM +0100, Ard Biesheuvel wrote:
> It turns out that the IP checksumming code is still exercised often,
> even though one might expect that modern NICs with checksum offload
> have no use for it. However, as Lingyan points out, there are
> combinations of features where the network stack may still fall back
> to software checksumming, and so it makes sense to provide an
> optimized implementation in software as well.
>
> So provide an implementation of do_csum() in scalar assembler, which,
> unlike C, gives direct access to the carry flag, making the code run
> substantially faster. The routine uses overlapping 64 byte loads for
> all input sizes > 64 bytes, in order to reduce the number of branches
> and improve performance on cores with deep pipelines.
>
> On Cortex-A57, this implementation is on par with Lingyan's NEON
> implementation, and roughly 7x as fast as the generic C code.
>
> Cc: "huanglingyan (A)" <huanglingy...@huawei.com>
> Signed-off-by: Ard Biesheuvel <ard.biesheu...@linaro.org>
> ---
> Test code after the patch.
>
>  arch/arm64/include/asm/checksum.h |   3 +
>  arch/arm64/lib/Makefile           |   2 +-
>  arch/arm64/lib/csum.S             | 127 ++++++++++++++++++++
>  3 files changed, 131 insertions(+), 1 deletion(-)
>
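
The "direct access to the carry flag" point is easy to see if you write the
same accumulation in portable C: the carry out of each 64-bit add has to be
re-derived with a compare, so the adds cannot simply be chained through the
flags. A minimal sketch (illustrative only; the helper name is made up here
and this is not the kernel's generic code):

        /*
         * Ones' complement 64-bit accumulate without a carry flag:
         * the wrapped carry has to be detected by hand.
         */
        static inline u64 accum64(u64 sum, u64 data)
        {
                sum += data;
                return sum + (sum < data);      /* end-around carry */
        }

In the assembly below each of those steps collapses into a single adcs, and
the leftover carry for a whole 64-byte block is folded in once by the adc
after the loop.
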
> diff --git a/arch/arm64/include/asm/checksum.h b/arch/arm64/include/asm/checksum.h
> index 0b6f5a7d4027..e906b956c1fc 100644
> --- a/arch/arm64/include/asm/checksum.h
> +++ b/arch/arm64/include/asm/checksum.h
> @@ -46,6 +46,9 @@ static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl)
>  }
>  #define ip_fast_csum ip_fast_csum
>
> +extern unsigned int do_csum(const unsigned char *buff, int len);
> +#define do_csum do_csum
> +
>  #include <asm-generic/checksum.h>
>
>  #endif /* __ASM_CHECKSUM_H */
> diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
> index 5540a1638baf..a7606007a749 100644
> --- a/arch/arm64/lib/Makefile
> +++ b/arch/arm64/lib/Makefile
> @@ -3,7 +3,7 @@ lib-y		:= clear_user.o delay.o copy_from_user.o	\
>  		   copy_to_user.o copy_in_user.o copy_page.o	\
>  		   clear_page.o memchr.o memcpy.o memmove.o memset.o	\
>  		   memcmp.o strcmp.o strncmp.o strlen.o strnlen.o	\
> -		   strchr.o strrchr.o tishift.o
> +		   strchr.o strrchr.o tishift.o csum.o
>
>  ifeq ($(CONFIG_KERNEL_MODE_NEON), y)
>  obj-$(CONFIG_XOR_BLOCKS)	+= xor-neon.o
> diff --git a/arch/arm64/lib/csum.S b/arch/arm64/lib/csum.S
> new file mode 100644
> index 000000000000..534e2ebdc426
> --- /dev/null
> +++ b/arch/arm64/lib/csum.S
> @@ -0,0 +1,127 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * Copyright (C) 2019 Linaro, Ltd. <ard.biesheu...@linaro.org>
> + */
> +
> +#include <linux/linkage.h>
> +#include <asm/assembler.h>
> +
> +ENTRY(do_csum)
> +	adds	x2, xzr, xzr		// clear x2 and C flag
> +
> +	// 64 bytes at a time
> +	lsr	x3, x1, #6
> +	and	x1, x1, #63
> +	cbz	x3, 1f
> +
> +	// Eight 64-bit adds per iteration
> +0:	ldp	x4, x5, [x0], #64
> +	ldp	x6, x7, [x0, #-48]
> +	ldp	x8, x9, [x0, #-32]
> +	ldp	x10, x11, [x0, #-16]
> +	adcs	x2, x2, x4
> +	sub	x3, x3, #1
> +	adcs	x2, x2, x5
> +	adcs	x2, x2, x6
> +	adcs	x2, x2, x7
> +	adcs	x2, x2, x8
> +	adcs	x2, x2, x9
> +	adcs	x2, x2, x10
> +	adcs	x2, x2, x11
> +	cbnz	x3, 0b
> +	adc	x2, x2, xzr
> +
> +	cbz	x1, 7f
> +	bic	x3, x1, #1
> +	add	x12, x0, x1
> +	add	x0, x0, x3
> +	neg	x3, x3
> +	add	x3, x3, #64
> +	lsl	x3, x3, #3
> +
> +	// Handle remaining 63 bytes or less using an overlapping 64-byte load
> +	// and a branchless code path to complete the calculation
> +	ldp	x4, x5, [x0, #-64]
> +	ldp	x6, x7, [x0, #-48]
> +	ldp	x8, x9, [x0, #-32]
> +	ldp	x10, x11, [x0, #-16]
> +	ldrb	w12, [x12, #-1]
> +
> +	.irp	reg, x4, x5, x6, x7, x8, x9, x10, x11
> +	cmp	x3, #64
> +	csel	\reg, \reg, xzr, lt
> +	ccmp	x3, xzr, #0, lt
> +	csel	x13, x3, xzr, gt
> +	sub	x3, x3, #64
> +CPU_LE(	lsr	\reg, \reg, x13	)
> +CPU_BE(	lsl	\reg, \reg, x13	)
> +	.endr
> +
> +	adds	x2, x2, x4
> +	adcs	x2, x2, x5
> +	adcs	x2, x2, x6
> +	adcs	x2, x2, x7
> +	adcs	x2, x2, x8
> +	adcs	x2, x2, x9
> +	adcs	x2, x2, x10
> +	adcs	x2, x2, x11
> +	adc	x2, x2, xzr
> +
> +CPU_LE(	adds	x12, x2, x12	)
> +CPU_BE(	adds	x12, x2, x12, lsl #8	)
> +	adc	x12, x12, xzr
> +	tst	x1, #1
> +	csel	x2, x2, x12, eq
> +
> +7:	lsr	x1, x2, #32
> +	adds	w2, w2, w1
> +	adc	w2, w2, wzr
> +
> +	lsr	w1, w2, #16
> +	uxth	w2, w2
> +	add	w2, w2, w1
> +
> +	lsr	w1, w2, #16		// handle the carry by hand
> +	add	w2, w2, w1
> +
> +	uxth	w0, w2
> +	ret
> +
> +	// Handle 63 bytes or less
> +1:	tbz	x1, #5, 2f
> +	ldp	x4, x5, [x0], #32
> +	ldp	x6, x7, [x0, #-16]
> +	adds	x2, x2, x4
> +	adcs	x2, x2, x5
> +	adcs	x2, x2, x6
> +	adcs	x2, x2, x7
> +	adc	x2, x2, xzr
> +
> +2:	tbz	x1, #4, 3f
> +	ldp	x4, x5, [x0], #16
> +	adds	x2, x2, x4
> +	adcs	x2, x2, x5
> +	adc	x2, x2, xzr
> +
> +3:	tbz	x1, #3, 4f
> +	ldr	x4, [x0], #8
> +	adds	x2, x2, x4
> +	adc	x2, x2, xzr
> +
> +4:	tbz	x1, #2, 5f
> +	ldr	w4, [x0], #4
> +	adds	x2, x2, x4
> +	adc	x2, x2, xzr
> +
> +5:	tbz	x1, #1, 6f
> +	ldrh	w4, [x0], #2
> +	adds	x2, x2, x4
> +	adc	x2, x2, xzr
> +
> +6:	tbz	x1, #0, 7b
> +	ldrb	w4, [x0]
> +CPU_LE(	adds	x2, x2, x4	)
> +CPU_BE(	adds	x2, x2, x4, lsl #8	)
> +	adc	x2, x2, xzr
> +	b	7b
> +ENDPROC(do_csum)
> --
> 2.20.1
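
Nice trick with the overlapping tail load and the .irp-generated csel masking.
For anyone reading along, the fold at local label 7 (the full 64-bit ones'
complement sum down to the 16-bit result) is roughly the following C, give or
take (sketch only, the helper name is invented here):

        /* Fold a 64-bit ones' complement sum down to 16 bits. */
        static inline unsigned int fold64(u64 sum)
        {
                unsigned int lo = (unsigned int)sum;
                unsigned int hi = (unsigned int)(sum >> 32);

                lo += hi;
                lo += (lo < hi);                  /* adc  w2, w2, wzr */
                lo  = (lo & 0xffff) + (lo >> 16); /* 32 -> 17 bits    */
                lo += lo >> 16;                   /* handle the carry */
                return lo & 0xffff;               /* uxth w0, w2      */
        }

(The ldrb/csel pair just before label 7 deals with an odd-length tail: the
final byte is only folded in when bit 0 of the length is set.)
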
> diff --git a/lib/checksum.c b/lib/checksum.c
> index d3ec93f9e5f3..7711f1186f71 100644
> --- a/lib/checksum.c
> +++ b/lib/checksum.c
> @@ -37,7 +37,7 @@
>
>  #include <asm/byteorder.h>
>
> -#ifndef do_csum
> +#if 1 //ndef do_csum
>  static inline unsigned short from32to16(unsigned int x)
>  {
>  	/* add up 16-bit and 16-bit for 16+c bit */
> @@ -47,7 +47,7 @@ static inline unsigned short from32to16(unsigned int x)
>  	return x;
>  }
>
> -static unsigned int do_csum(const unsigned char *buff, int len)
> +static unsigned int __do_csum(const unsigned char *buff, int len)
>  {
>  	int odd;
>  	unsigned int result = 0;
> @@ -206,3 +206,23 @@ __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr,
>  }
>  EXPORT_SYMBOL(csum_tcpudp_nofold);
>  #endif
> +
> +extern u8 crypto_ft_tab[];
> +
> +static int __init do_selftest(void)
> +{
> +	int i, j;
> +	u16 c1, c2;
> +
> +	for (i = 0; i < 1024; i++) {
> +		for (j = i + 1; j <= 1024; j++) {
> +			c1 = __do_csum(crypto_ft_tab + i, j - i);
> +			c2 = do_csum(crypto_ft_tab + i, j - i);
> +
> +			if (c1 != c2)
> +				pr_err("######### %d %d %x %x\n", i, j, c1, c2);
> +		}
> +	}
> +	return 0;
> +}
> +late_initcall(do_selftest);
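
The selftest is nice: it covers every (offset, length) pair within the first
1 KiB of crypto_ft_tab, so all the alignment and tail cases above get
exercised. If you also want a rough sanity check of the ~7x figure on other
cores, something along these lines could be dropped into the same initcall
(sketch only, not a rigorous benchmark):

        u64 t0, t1;
        unsigned int sum = 0;
        int k;

        t0 = ktime_get_ns();
        for (k = 0; k < 100000; k++)
                sum += __do_csum(crypto_ft_tab, 1024);
        t1 = ktime_get_ns();
        pr_info("generic C: %llu ns (%x)\n", t1 - t0, sum);

        t0 = ktime_get_ns();
        for (k = 0; k < 100000; k++)
                sum += do_csum(crypto_ft_tab, 1024);
        t1 = ktime_get_ns();
        pr_info("asm      : %llu ns (%x)\n", t1 - t0, sum);
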
Acked-by: Ilias Apalodimas <ilias.apalodi...@linaro.org>