The branch main has been updated by fuz: URL: https://cgit.FreeBSD.org/src/commit/?id=4b15965daa99044daf184221b7c283bf7f2d7e66
commit 4b15965daa99044daf184221b7c283bf7f2d7e66 Author: Robert Clausecker <f...@freebsd.org> AuthorDate: 2025-07-29 20:12:32 +0000 Commit: Robert Clausecker <f...@freebsd.org> CommitDate: 2025-08-09 20:13:27 +0000 libc/amd64: rewrite memrchr() baseline impl. to read the string from the back This ensures O(1) behaviour if the character is a constant offset from the end of the string, regardless of how long the string is. Reported by: Mikael Simonsson <m...@mikaelsimonsson.com> Reviewed by: benni PR: 288321 MFC after: 1 month --- lib/libc/amd64/string/memrchr.S | 152 +++++++++++++++++++--------------------- 1 file changed, 74 insertions(+), 78 deletions(-) diff --git a/lib/libc/amd64/string/memrchr.S b/lib/libc/amd64/string/memrchr.S index 4f6c5a238daa..f1ba48d6bb41 100644 --- a/lib/libc/amd64/string/memrchr.S +++ b/lib/libc/amd64/string/memrchr.S @@ -1,7 +1,7 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * - * Copyright (c) 2023 Robert Clausecker + * Copyright (c) 2023, 2025 Robert Clausecker <f...@freebsd.org> */ #include <machine/asm.h> @@ -71,95 +71,91 @@ ARCHENTRY(memrchr, scalar) ARCHEND(memrchr, scalar) ARCHENTRY(memrchr, baseline) - movd %esi, %xmm4 - test %rdx, %rdx # empty buffer? - jz .L0 # if yes, return immediately + test %rdx, %rdx # empty input? + je .Lnomatchb + + + lea (%rdi, %rdx, 1), %ecx # pointer to end of buffer + lea -1(%rdi, %rdx, 1), %rdx # pointer to last char in buffer + movd %esi, %xmm2 + and $~0x1f, %rdx # pointer to final 32 buffer bytes + movdqa (%rdx), %xmm0 # load last 32 bytes + movdqa 16(%rdx), %xmm1 + + punpcklbw %xmm2, %xmm2 # c -> cc - punpcklbw %xmm4, %xmm4 # c -> cc - mov %edi, %ecx - punpcklwd %xmm4, %xmm4 # cc -> cccc - and $~0xf, %rdi # align source pointer - pshufd $0, %xmm4, %xmm4 # cccc -> cccccccccccccccc - and $0xf, %ecx - movdqa %xmm4, %xmm0 mov $-1, %r8d - pcmpeqb (%rdi), %xmm0 # compare aligned head - shl %cl, %r8d # mask of bytes in the head of the buffer - pmovmskb %xmm0, %eax + neg %ecx + mov %r8d, %r9d + shr %cl, %r8d # mask with zeroes after the string - sub $16, %rcx - and %r8d, %eax # match mask - add %rcx, %rdx # advance past head - cmc - jbe .Lrunt # did the string end in the buffer? + punpcklwd %xmm2, %xmm2 # cc -> cccc - mov %rdi, %rsi # pointer to matching chunk - add $16, %rdi - sub $16, %rdx # enough left for another round? - jbe 1f + mov %edi, %ecx + mov %r9d, %eax + shl %cl, %r9d # mask with zeroes before the string - /* main loop unrolled twice */ - ALIGN_TEXT -0: movdqa %xmm4, %xmm0 - pcmpeqb (%rdi), %xmm0 - pmovmskb %xmm0, %r8d + pshufd $0, %xmm2, %xmm2 # cccc -> cccccccccccccccc - cmp $16, %rdx # enough left for second chunk? - jbe 2f + cmp %rdx, %rdi # tail is beginning of buffer? + cmovae %r9d, %eax # if yes, do combined head/tail processing + and %r8d, %eax # mak of bytes in tail part of string - movdqa %xmm4, %xmm0 - pcmpeqb 16(%rdi), %xmm0 + /* process tail */ + pcmpeqb %xmm2, %xmm1 + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm1, %esi pmovmskb %xmm0, %ecx + shl $16, %esi + or %esi, %ecx # locations of matches + and %ecx, %eax # any match inside buffer? + jnz .Lprecisematchb - lea 16(%rdi), %r9 - test %ecx, %ecx # match found in second chunk? - cmovz %r8d, %ecx # if not, use match data from first chunk - cmovz %rdi, %r9 - - test %ecx, %ecx # any match found? - cmovnz %ecx, %eax # if yes, overwrite previously found match - cmovnz %r9, %rsi - - add $32, %rdi # advance to next iteration - sub $32, %rdx # advance to next chunks - ja 0b - - /* process remaining 1--16 bytes */ -1: pcmpeqb (%rdi), %xmm4 - mov $0xffff, %r8d - xor %ecx, %ecx - sub %edx, %ecx # number of bytes to be masked out - pmovmskb %xmm4, %r9d - shr %cl, %r8d # mask of bytes to be kept in the buffer - and %r9d, %r8d - cmovnz %r8d, %eax - cmovnz %rdi, %rsi - bsr %eax, %eax - lea (%rsi, %rax, 1), %rsi # pointer to match (or junk) - cmovnz %rsi, %rax # if any match was found, return it - ret + cmp %rdx, %rdi # did the buffer begin here? + jae .Lnomatchb # if yes, we are done - /* end of chunk reached within first half iteration */ -2: test %r8d, %r8d # match in previous chunk? - cmovnz %r8d, %eax # if yes, overwrite previous chunks - cmovnz %rdi, %rsi - add $16, %rdi # point to tail - sub $16, %edx - jmp 1b # handle tail the same otherwise - - /* runt: string ends within head, edx has negated amount of invalid head bytes */ -.Lrunt: mov $0xffff, %r8d - xor %ecx, %ecx - sub %edx, %ecx - shr %cl, %r8d - and %r8d, %eax - bsr %eax, %eax - lea (%rdi, %rax, 1), %rdi - cmovnz %rdi, %rax + /* main loop */ + ALIGN_TEXT +0: movdqa -32(%rdx), %xmm0 # load previous string chunk + movdqa -16(%rdx), %xmm1 + sub $32, %rdx # beginning of string reached? + cmp %rdx, %rdi + jae .Ltailb + + pcmpeqb %xmm2, %xmm0 + pcmpeqb %xmm2, %xmm1 + por %xmm1, %xmm0 # match in either half? + pmovmskb %xmm0, %eax + test %eax, %eax + jz 0b + +.Lmatchb: + pcmpeqb (%rdx), %xmm2 # redo comparison of first 16 bytes + pmovmskb %xmm1, %ecx + pmovmskb %xmm2, %eax + shl $16, %ecx + or %ecx, %eax # location of matches + +.Lprecisematchb: + bsr %eax, %eax # find location of match + add %rdx, %rax # point to matching byte ret - /* empty buffer: return a null pointer */ -.L0: xor %eax, %eax +.Ltailb: + pcmpeqb %xmm2, %xmm1 + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm1, %ecx + pmovmskb %xmm0, %eax + shl $16, %ecx + or %ecx, %eax # location of matches + and %r9d, %eax # mask out matches before buffer + bsr %eax, %edi # location of match + lea (%rdx, %rdi, 1), %rdx # pointer to match (if any) + cmovnz %rdx, %rax # point to match if present, + ret # else null pointer + +.Lnomatchb: + xor %eax, %eax # return null pointer ret ARCHEND(memrchr, baseline)