Implement an assembly routine for csum_partial on 64-bit x86. This primarily speeds up checksum calculation for smaller lengths, such as those seen in skb_postpull_rcsum when receiving CHECKSUM_COMPLETE from a device or after CHECKSUM_UNNECESSARY conversion.
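For reference, here is a minimal C sketch of what csum_partial(buff, len, sum) computes: a 64-bit one's-complement accumulation over the buffer, folded down to a 32-bit partial sum. This is illustrative only -- it assumes little-endian, the function name is made up, and it is not the code in this patch:

#include <stdint.h>
#include <stddef.h>
#include <string.h>

static uint32_t csum_partial_sketch(const void *buff, size_t len, uint32_t sum)
{
	const uint8_t *p = buff;
	uint64_t acc = sum;

	/* 8 bytes at a time; a carry out of bit 63 is folded back in,
	 * which is what the adcq chains achieve in the assembly below.
	 */
	while (len >= 8) {
		uint64_t v;

		memcpy(&v, p, 8);	/* unaligned-safe load */
		acc += v;
		if (acc < v)		/* emulate add-with-carry */
			acc++;
		p += 8;
		len -= 8;
	}

	/* Remaining 0-7 bytes: 16-bit words, then a possible odd byte. */
	while (len >= 2) {
		uint16_t w;

		memcpy(&w, p, 2);
		acc += w;
		p += 2;
		len -= 2;
	}
	if (len)
		acc += *p;	/* odd trailing byte, zero-padded word */

	/* Fold the 64-bit sum to 32 bits with end-around carry. */
	acc = (acc & 0xffffffffULL) + (acc >> 32);
	acc = (acc & 0xffffffffULL) + (acc >> 32);
	return (uint32_t)acc;
}

The assembly below implements the same arithmetic, but dispatches sub-8-byte tails through a jump table and unrolls the 8-byte loop to 64 bytes per iteration.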
The new implementation is similar to csum_partial in checksum_32.S; however, since we are dealing with 8 bytes at a time there are more cases for small lengths, so we employ a jump table for them. Also, we don't do anything special for alignment, since unaligned accesses on x86 do not appear to be a performance issue.

Testing: Verified correctness on arbitrary-length buffers filled with random data. For each buffer I compared the computed checksum against the original algorithm for each possible starting alignment (0-7 bytes).

Checksum performance, isolating the old and new implementations for some common cases:

                         Old      New
Case                     nsecs    nsecs    Improvement
---------------------+--------+--------+-----------------------------
1400 bytes (0 align)     194.5    174.3    10%    (Big packet)
40 bytes (0 align)        13.8      5.8    57%    (Ipv6 hdr common case)
8 bytes (4 align)          8.4      2.9    65%    (UDP, VXLAN in IPv4)
14 bytes (0 align)        10.6      5.8    45%    (Eth hdr)
14 bytes (4 align)        10.8      5.8    46%    (Eth hdr in IPv4)

Signed-off-by: Tom Herbert <t...@herbertland.com>
---
 arch/x86/include/asm/checksum_64.h |   5 ++
 arch/x86/lib/csum-partial_64.S     | 147 ++++++++++++++++++++++++++++++++++++
 arch/x86/lib/csum-partial_64.c     | 148 ------------------------------------
 3 files changed, 152 insertions(+), 148 deletions(-)
 create mode 100644 arch/x86/lib/csum-partial_64.S
 delete mode 100644 arch/x86/lib/csum-partial_64.c

diff --git a/arch/x86/include/asm/checksum_64.h b/arch/x86/include/asm/checksum_64.h
index cd00e17..a888f65 100644
--- a/arch/x86/include/asm/checksum_64.h
+++ b/arch/x86/include/asm/checksum_64.h
@@ -128,6 +128,11 @@ static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
  */
 extern __wsum csum_partial(const void *buff, int len, __wsum sum);
 
+static inline __sum16 ip_compute_csum(const void *buff, int len)
+{
+	return csum_fold(csum_partial(buff, len, 0));
+}
+
 #define _HAVE_ARCH_COPY_AND_CSUM_FROM_USER 1
 #define HAVE_CSUM_COPY_USER 1
 
diff --git a/arch/x86/lib/csum-partial_64.S b/arch/x86/lib/csum-partial_64.S
new file mode 100644
index 0000000..8e387bb
--- /dev/null
+++ b/arch/x86/lib/csum-partial_64.S
@@ -0,0 +1,147 @@
+/* Copyright 2016 Tom Herbert <t...@herbertland.com>
+ *
+ * Checksum partial calculation
+ *
+ *	__wsum csum_partial(const void *buff, int len, __wsum sum)
+ *
+ * Computes the checksum of a memory block at buff, length len,
+ * and adds in "sum" (32-bit)
+ *
+ * Returns a 32-bit number suitable for feeding into itself
+ * or csum_tcpudp_magic
+ *
+ * Register usage:
+ *	%rdi: argument 1, buff
+ *	%rsi: argument 2, length
+ *	%rdx: argument 3, add in value
+ *	%rax,%eax: accumulator and return value
+ *	%rcx,%ecx: counter and tmp
+ *	%r11: tmp
+ *
+ * Basic algorithm:
+ *	1) Sum 8 bytes at a time using adcq (unroll main loop
+ *	   to do 64 bytes at a time)
+ *	2) Sum remaining length (less than 8 bytes)
+ *
+ * Note that buffer alignment is not considered, unaligned accesses on x86
+ * don't seem to be a performance hit (CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+ * is set).
+ */
+
+#include <linux/linkage.h>
+#include <asm/errno.h>
+#include <asm/asm.h>
+
+#define branch_tbl_len .L_branch_tbl_len
+
+ENTRY(csum_partial)
+	movl	%edx, %eax	/* Initialize with initial sum argument */
+
+	/* Check length */
+	cmpl	$8, %esi
+	jg	10f
+	jl	20f
+
+	/* Exactly 8 bytes length */
+	addl	(%rdi), %eax
+	adcl	4(%rdi), %eax
+	adcl	$0, %eax
+	ret
+
+	/* Less than 8 bytes length */
+20:	clc
+	jmpq	*branch_tbl_len(, %rsi, 8)
+
+	/* Greater than 8 bytes length. Determine number of quads (n). Sum
+	 * over first n % 8 quads
+	 */
+10:	movl	%esi, %ecx
+	shrl	$3, %ecx
+	andl	$0x7, %ecx
+	negq	%rcx
+	lea	20f(, %rcx, 4), %r11
+	clc
+	jmp	*%r11
+
+.align 8
+	adcq	6*8(%rdi),%rax
+	adcq	5*8(%rdi),%rax
+	adcq	4*8(%rdi),%rax
+	adcq	3*8(%rdi),%rax
+	adcq	2*8(%rdi),%rax
+	adcq	1*8(%rdi),%rax
+	adcq	0*8(%rdi),%rax
+	nop
+20:	/* #quads % 8 jump table base */
+
+	adcq	$0, %rax
+	shlq	$3, %rcx
+	subq	%rcx, %rdi	/* %rcx is already negative length */
+
+	/* Now determine number of blocks of 8 quads. Sum 64 bytes at a time
+	 * using unrolled loop.
+	 */
+	movl	%esi, %ecx
+	shrl	$6, %ecx
+	jz	30f
+	clc
+
+	/* Main loop */
+40:	adcq	0*8(%rdi),%rax
+	adcq	1*8(%rdi),%rax
+	adcq	2*8(%rdi),%rax
+	adcq	3*8(%rdi),%rax
+	adcq	4*8(%rdi),%rax
+	adcq	5*8(%rdi),%rax
+	adcq	6*8(%rdi),%rax
+	adcq	7*8(%rdi),%rax
+	lea	64(%rdi), %rdi
+	loop	40b
+
+	adcq	$0, %rax
+
+	/* Handle remaining length which is < 8 bytes */
+30:	andl	$0x7, %esi
+
+	/* Fold 64 bit sum to 32 bits */
+	movq	%rax, %rcx
+	shrq	$32, %rcx
+	addl	%ecx, %eax
+
+	jmpq	*branch_tbl_len(, %rsi, 8)
+
+/* Length table targets */
+
+107:	/* Length 7 */
+	adcw	4(%rdi), %ax
+105:	/* Length 5 */
+	adcw	2(%rdi), %ax
+103:	/* Length 3 */
+	adcw	(%rdi), %ax
+101:	/* Length 1, grab the odd byte */
+	adcb	-1(%rdi, %rsi), %al
+	adcb	$0, %ah
+	adcl	$0, %eax
+	ret
+106:	/* Length 6 */
+	adcw	4(%rdi), %ax
+104:	/* Length 4 */
+	adcl	(%rdi), %eax
+	adcl	$0, %eax
+	ret
+102:	/* Length 2 */
+	adcw	(%rdi), %ax
+100:	/* Length 0 */
+	adcl	$0, %eax
+	ret
+
+.section .rodata
+.align 64
+.L_branch_tbl_len:
+	.quad	100b
+	.quad	101b
+	.quad	102b
+	.quad	103b
+	.quad	104b
+	.quad	105b
+	.quad	106b
+	.quad	107b
diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c
deleted file mode 100644
index 9845371..0000000
--- a/arch/x86/lib/csum-partial_64.c
+++ /dev/null
@@ -1,148 +0,0 @@
-/*
- * arch/x86_64/lib/csum-partial.c
- *
- * This file contains network checksum routines that are better done
- * in an architecture-specific manner due to speed.
- */
-
-#include <linux/compiler.h>
-#include <linux/module.h>
-#include <asm/checksum.h>
-
-static inline unsigned short from32to16(unsigned a)
-{
-	unsigned short b = a >> 16;
-	asm("addw %w2,%w0\n\t"
-	    "adcw $0,%w0\n"
-	    : "=r" (b)
-	    : "0" (b), "r" (a));
-	return b;
-}
-
-/*
- * Do a 64-bit checksum on an arbitrary memory area.
- * Returns a 32bit checksum.
- *
- * This isn't as time critical as it used to be because many NICs
- * do hardware checksumming these days.
- *
- * Things tried and found to not make it faster:
- * Manual Prefetching
- * Unrolling to an 128 bytes inner loop.
- * Using interleaving with more registers to break the carry chains.
- */
-static unsigned do_csum(const unsigned char *buff, unsigned len)
-{
-	unsigned odd, count;
-	unsigned long result = 0;
-
-	if (unlikely(len == 0))
-		return result;
-	odd = 1 & (unsigned long) buff;
-	if (unlikely(odd)) {
-		result = *buff << 8;
-		len--;
-		buff++;
-	}
-	count = len >> 1;		/* nr of 16-bit words.. */
-	if (count) {
-		if (2 & (unsigned long) buff) {
-			result += *(unsigned short *)buff;
-			count--;
-			len -= 2;
-			buff += 2;
-		}
-		count >>= 1;		/* nr of 32-bit words.. */
-		if (count) {
-			unsigned long zero;
-			unsigned count64;
-			if (4 & (unsigned long) buff) {
-				result += *(unsigned int *) buff;
-				count--;
-				len -= 4;
-				buff += 4;
-			}
-			count >>= 1;	/* nr of 64-bit words.. */
-
-			/* main loop using 64byte blocks */
-			zero = 0;
-			count64 = count >> 3;
-			while (count64) {
-				asm("addq 0*8(%[src]),%[res]\n\t"
-				    "adcq 1*8(%[src]),%[res]\n\t"
-				    "adcq 2*8(%[src]),%[res]\n\t"
-				    "adcq 3*8(%[src]),%[res]\n\t"
-				    "adcq 4*8(%[src]),%[res]\n\t"
-				    "adcq 5*8(%[src]),%[res]\n\t"
-				    "adcq 6*8(%[src]),%[res]\n\t"
-				    "adcq 7*8(%[src]),%[res]\n\t"
-				    "adcq %[zero],%[res]"
-				    : [res] "=r" (result)
-				    : [src] "r" (buff), [zero] "r" (zero),
-				      "[res]" (result));
-				buff += 64;
-				count64--;
-			}
-
-			/* last up to 7 8byte blocks */
-			count %= 8;
-			while (count) {
-				asm("addq %1,%0\n\t"
-				    "adcq %2,%0\n"
-				    : "=r" (result)
-				    : "m" (*(unsigned long *)buff),
-				      "r" (zero), "0" (result));
-				--count;
-				buff += 8;
-			}
-			result = add32_with_carry(result>>32,
-						  result&0xffffffff);
-
-			if (len & 4) {
-				result += *(unsigned int *) buff;
-				buff += 4;
-			}
-		}
-		if (len & 2) {
-			result += *(unsigned short *) buff;
-			buff += 2;
-		}
-	}
-	if (len & 1)
-		result += *buff;
-	result = add32_with_carry(result>>32, result & 0xffffffff);
-	if (unlikely(odd)) {
-		result = from32to16(result);
-		result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
-	}
-	return result;
-}
-
-/*
- * computes the checksum of a memory block at buff, length len,
- * and adds in "sum" (32-bit)
- *
- * returns a 32-bit number suitable for feeding into itself
- * or csum_tcpudp_magic
- *
- * this function must be called with even lengths, except
- * for the last fragment, which may be odd
- *
- * it's best to have buff aligned on a 64-bit boundary
- */
-__wsum csum_partial(const void *buff, int len, __wsum sum)
-{
-	return (__force __wsum)add32_with_carry(do_csum(buff, len),
-						(__force u32)sum);
-}
-
-/*
- * this routine is used for miscellaneous IP-like checksums, mainly
- * in icmp.c
- */
-__sum16 ip_compute_csum(const void *buff, int len)
-{
-	return csum_fold(csum_partial(buff,len,0));
-}
-EXPORT_SYMBOL(ip_compute_csum);
-- 
2.4.6
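P.S. For anyone who wants to reproduce the kind of correctness check described above (random data, every starting alignment 0-7), here is a hedged userspace sketch. csum_ref() and csum_quads() are illustrative stand-ins written for this note, not kernel code; csum_quads() uses the same 8-bytes-at-a-time idea as the patch, and the two are compared after folding to the final 16-bit checksum, which is where the algorithms must agree:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Reference: sum little-endian 16-bit words with a 64-bit accumulator. */
static uint32_t csum_ref(const uint8_t *p, size_t len)
{
	uint64_t acc = 0;

	for (; len >= 2; p += 2, len -= 2)
		acc += (uint32_t)p[0] | ((uint32_t)p[1] << 8);
	if (len)
		acc += p[0];
	acc = (acc & 0xffffffffULL) + (acc >> 32);
	acc = (acc & 0xffffffffULL) + (acc >> 32);
	return (uint32_t)acc;
}

/* Candidate: 8 bytes at a time with carry folding, as in the patch. */
static uint32_t csum_quads(const uint8_t *p, size_t len)
{
	uint64_t acc = 0;

	for (; len >= 8; p += 8, len -= 8) {
		uint64_t v;

		memcpy(&v, p, 8);	/* unaligned-safe load */
		acc += v;
		if (acc < v)		/* end-around carry */
			acc++;
	}
	for (; len >= 2; p += 2, len -= 2) {
		uint16_t w;

		memcpy(&w, p, 2);
		acc += w;
	}
	if (len)
		acc += p[0];
	acc = (acc & 0xffffffffULL) + (acc >> 32);
	acc = (acc & 0xffffffffULL) + (acc >> 32);
	return (uint32_t)acc;
}

/* Fold a 32-bit partial sum to the final 16-bit checksum (csum_fold-like). */
static uint16_t fold16(uint32_t csum)
{
	csum = (csum & 0xffff) + (csum >> 16);
	csum = (csum & 0xffff) + (csum >> 16);
	return (uint16_t)~csum;
}

int main(void)
{
	static uint8_t buf[2048 + 8];
	size_t len, align, i;

	srand(1);
	for (len = 0; len <= 2048; len++) {
		for (align = 0; align < 8; align++) {
			for (i = 0; i < len + align; i++)
				buf[i] = rand();
			if (fold16(csum_ref(buf + align, len)) !=
			    fold16(csum_quads(buf + align, len))) {
				printf("mismatch: len=%zu align=%zu\n",
				       len, align);
				return 1;
			}
		}
	}
	printf("all lengths and alignments match\n");
	return 0;
}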