The branch main has been updated by mjg: URL: https://cgit.FreeBSD.org/src/commit/?id=5fa12fe0cd203efcbb2ac21e7c3e3fb9b2f801ae
commit 5fa12fe0cd203efcbb2ac21e7c3e3fb9b2f801ae Author: Mateusz Guzik <m...@freebsd.org> AuthorDate: 2021-02-21 00:42:26 +0000 Commit: Mateusz Guzik <m...@freebsd.org> CommitDate: 2021-02-21 00:43:05 +0000 amd64: implement strlen in assembly, take 2 Tested with glibc test suite. The C variant in libkern performs excessive branching to find the zero byte instead of using the bsfq instruction. The same code patched to use it is still slower than the routine implemented here as the compiler keeps neglecting to perform certain optimizations (like using leaq). On top of that the routine can be used as a starting point for copyinstr which operates on words intead of bytes. The previous attempt had an instance of swapped operands to andq when dealing with fully aligned case, which had a side effect of breaking the code for certain corner cases. Noted by jrtc27. Sample results: $(perl -e "print 'A' x 3"): stock: 211198039 patched:338626619 asm: 465609618 $(perl -e "print 'A' x 100"): stock: 83151997 patched: 98285919 asm: 120719888 Reviewed by: jhb, kib Differential Revision: https://reviews.freebsd.org/D28779 --- sys/amd64/amd64/support.S | 66 +++++++++++++++++++++++++++++++++++++++++++++++ sys/conf/files.amd64 | 1 - 2 files changed, 66 insertions(+), 1 deletion(-) diff --git a/sys/amd64/amd64/support.S b/sys/amd64/amd64/support.S index b623fba277db..4c0f7da87ef8 100644 --- a/sys/amd64/amd64/support.S +++ b/sys/amd64/amd64/support.S @@ -697,6 +697,72 @@ ENTRY(fillw) ret END(fillw) +/* + * strlen(string) + * %rdi + * + * Uses the ((x - 0x01....01) & ~x & 0x80....80) trick. + * + * 0x01....01 is replaced with 0x0 - 0x01....01 so that it can be added + * with leaq. + * + * For a description see either: + * - "Hacker's Delight" by Henry S. Warren, Jr. + * - "Optimizing subroutines in assembly language: An optimization guide for x86 platforms" + * by Agner Fog + * + * The latter contains a 32-bit variant of the same algorithm coded in assembly for i386. + */ +ENTRY(strlen) + PUSH_FRAME_POINTER + movabsq $0xfefefefefefefeff,%r8 + movabsq $0x8080808080808080,%r9 + + movq %rdi,%r10 + movq %rdi,%rcx + testb $7,%dil + jz 2f + + /* + * Handle misaligned reads: align to 8 and fill + * the spurious bytes. + */ + andq $~7,%rdi + movq (%rdi),%r11 + shlq $3,%rcx + movq $-1,%rdx + shlq %cl,%rdx + notq %rdx + orq %rdx,%r11 + + leaq (%r11,%r8),%rcx + notq %r11 + andq %r11,%rcx + andq %r9,%rcx + jnz 3f + + /* + * Main loop. + */ + ALIGN_TEXT +1: + leaq 8(%rdi),%rdi +2: + movq (%rdi),%r11 + leaq (%r11,%r8),%rcx + notq %r11 + andq %r11,%rcx + andq %r9,%rcx + jz 1b +3: + bsfq %rcx,%rcx + shrq $3,%rcx + leaq (%rcx,%rdi),%rax + subq %r10,%rax + POP_FRAME_POINTER + ret +END(strlen) + /*****************************************************************************/ /* copyout and fubyte family */ /*****************************************************************************/ diff --git a/sys/conf/files.amd64 b/sys/conf/files.amd64 index 98a78a8b1ef9..395f501198f8 100644 --- a/sys/conf/files.amd64 +++ b/sys/conf/files.amd64 @@ -390,7 +390,6 @@ isa/syscons_isa.c optional sc isa/vga_isa.c optional vga kern/imgact_aout.c optional compat_aout kern/link_elf_obj.c standard -libkern/strlen.c standard # # IA32 binary support # _______________________________________________ dev-commits-src-main@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/dev-commits-src-main To unsubscribe, send any mail to "dev-commits-src-main-unsubscr...@freebsd.org"