The branch main has been updated by fuz:

URL: https://cgit.FreeBSD.org/src/commit/?id=474408bb7933f0383a0da2b01e717bfe683ae77c

commit 474408bb7933f0383a0da2b01e717bfe683ae77c
Author:     Robert Clausecker <f...@freebsd.org>
AuthorDate: 2023-08-13 17:35:01 +0000
Commit:     Robert Clausecker <f...@freebsd.org>
CommitDate: 2023-09-08 21:20:19 +0000

    lib/libc/amd64/string: add strcspn(3) scalar, x86-64-v2 implementation
    
    This changeset adds both a scalar and an x86-64-v2 implementation
    of the strcspn(3) function to libc. A baseline implementation does not
    appear to be feasible given the requirements of the function.
    
    The scalar implementation is similar to the generic libc implementation,
    but expands the bit set into a byte set to reduce latency, improving
    performance. This approach could probably be backported to the generic
    C version to benefit other platforms.
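    
    As a hedged illustration (not code from this commit), the byte-set
    idea transplants back into portable C as in the sketch below: a
    bitmap pays for a shift and a mask on every membership test, while
    one byte per character makes the inner loop a single indexed load.
    
        #include <stddef.h>
    
        static size_t
        strcspn_byteset(const char *s, const char *set)
        {
                unsigned char table[256] = { 0 };
                const unsigned char *p;
    
                table[0] = 1;           /* the NUL always ends the scan */
                for (p = (const unsigned char *)set; *p != '\0'; p++)
                        table[*p] = 1;
    
                for (p = (const unsigned char *)s; table[*p] == 0; p++)
                        ;
    
                return ((size_t)(p - (const unsigned char *)s));
        }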
    
    The x86-64-v2 implementation is built around the infamous pcmpistri
    instruction. An alternative implementation based on the Muła/Langdale
    algorithm [1] was prototyped, but performed worse than the pcmpistri
    approach except for sets of more than 16 characters with long input
    strings.
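    
    For reference, what one pcmpistri step computes for a set of at
    most 16 bytes can be sketched with SSE4.2 intrinsics.  This is a
    simplified illustration, not the committed kernel: it assumes the
    string may be over-read in 16-byte chunks and omits the
    page-crossing fixup the assembly performs on the head of the
    string.  The mode constant is the one the assembly names in its
    comments.
    
        #include <nmmintrin.h>          /* SSE4.2 intrinsics */
        #include <stddef.h>
        #include <string.h>
    
        /* _SIDD_UBYTE_OPS|_SIDD_CMP_EQUAL_ANY|_SIDD_LEAST_SIGNIFICANT */
        #define MODE (_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | \
            _SIDD_LEAST_SIGNIFICANT)
    
        static size_t
        strcspn_small_set(const char *s, __m128i set)
        {
                size_t off;
    
                for (off = 0;; off += 16) {
                        __m128i chunk = _mm_loadu_si128(
                            (const __m128i *)(s + off));
                        /* index of the first string byte matching any
                           set byte, or 16 if there is none */
                        int idx = _mm_cmpistri(set, chunk, MODE);
                        /* ZF: chunk contains the terminating NUL */
                        int end = _mm_cmpistrz(set, chunk, MODE);
    
                        if (idx < 16)   /* match before end of string */
                                return (off + idx);
                        if (end)        /* NUL reached, no match */
                                return (off + strlen(s + off));
                }
        }
    
    The committed kernel instead locates the NUL with pcmpeqb, pmovmskb,
    and tzcnt, and copies a head that would cross a page onto the stack,
    padded with copies of its last byte.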
    
    All implementations provide special cases for the empty set (reduces to
    strlen) as well as single-character sets (reduces to strchr). The
    x86-64-v2 kernel falls back to the scalar implementation for sets of
    more than 32 characters. This limit could be raised by additional
    multiples of 16 through the use of additional pcmpistri code paths, but
    I consider this case to be too rare to be of importance.
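    
    In C, this dispatch would look roughly like the sketch below.  The
    kernel names are hypothetical stand-ins for the assembly code
    paths; the real x86-64-v2 entry point measures the set with SIMD
    compares while copying it to the stack instead of calling strlen
    on it, and the single-character case calls strchrnul(3), which
    returns a pointer to the NUL when the character is absent, so no
    NULL check is needed.
    
        #include <stddef.h>
        #include <string.h>
    
        /* hypothetical stand-ins for the assembly code paths */
        size_t strcspn_one_pcmpistri(const char *, const char *);
        size_t strcspn_two_pcmpistri(const char *, const char *);
        size_t strcspn_byte_table(const char *, const char *);
    
        size_t
        strcspn_dispatch(const char *s, const char *set)
        {
                size_t setlen = strlen(set);
    
                if (setlen == 0)        /* empty set: just strlen */
                        return (strlen(s));
                if (setlen == 1)        /* one character: strchrnul */
                        return ((size_t)(strchrnul(s, set[0]) - s));
                if (setlen <= 16)       /* one pcmpistri per 16 bytes */
                        return (strcspn_one_pcmpistri(s, set));
                if (setlen <= 32)       /* two pcmpistri per 16 bytes */
                        return (strcspn_two_pcmpistri(s, set));
                return (strcspn_byte_table(s, set));
        }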
    
    [1]: http://0x80.pl/articles/simd-byte-lookup.html
    
    Sponsored by:   The FreeBSD Foundation
    Approved by:    mjg
    MFC after:      1 week
    MFC to:         stable/14
    Differential Revision:  https://reviews.freebsd.org/D41557
---
 lib/libc/amd64/string/Makefile.inc |   1 +
 lib/libc/amd64/string/strcspn.S    | 368 +++++++++++++++++++++++++++++++++++++
 2 files changed, 369 insertions(+)

diff --git a/lib/libc/amd64/string/Makefile.inc b/lib/libc/amd64/string/Makefile.inc
index 4df4ff8f1417..f01a52d9ebea 100644
--- a/lib/libc/amd64/string/Makefile.inc
+++ b/lib/libc/amd64/string/Makefile.inc
@@ -10,5 +10,6 @@ MDSRCS+= \
        strcat.S \
        strchrnul.S \
        strcmp.S \
+       strcspn.S \
        strlen.S \
        strcpy.c
diff --git a/lib/libc/amd64/string/strcspn.S b/lib/libc/amd64/string/strcspn.S
new file mode 100644
index 000000000000..de409db6d472
--- /dev/null
+++ b/lib/libc/amd64/string/strcspn.S
@@ -0,0 +1,368 @@
+/*
+ * Copyright (c) 2023 The FreeBSD Foundation
+ *
+ * This software was developed by Robert Clausecker <f...@freebsd.org>
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <machine/asm.h>
+#include <machine/param.h>
+
+#include "amd64_archlevel.h"
+
+#define ALIGN_TEXT     .p2align 4,0x90 /* 16-byte alignment, nop filled */
+
+ARCHFUNCS(strcspn)
+       ARCHFUNC(strcspn, scalar)
+       NOARCHFUNC
+       ARCHFUNC(strcspn, x86_64_v2)
+ENDARCHFUNCS(strcspn)
+
+ARCHENTRY(strcspn, scalar)
+       push    %rbp                    # align stack to enable function call
+       mov     %rsp, %rbp
+       sub     $256, %rsp              # allocate space for lookup table
+
+       /* check for special cases */
+       movzbl  (%rsi), %eax            # first character in the set
+       test    %eax, %eax
+       jz      .Lstrlen
+
+       movzbl  1(%rsi), %edx           # second character in the set
+       test    %edx, %edx
+       jz      .Lstrchr
+
+       /* no special case matches -- prepare lookup table */
+       xor     %r8d, %r8d
+       mov     $28, %ecx
+0:     mov     %r8, (%rsp, %rcx, 8)
+       mov     %r8, 8(%rsp, %rcx, 8)
+       mov     %r8, 16(%rsp, %rcx, 8)
+       mov     %r8, 24(%rsp, %rcx, 8)
+       sub     $4, %ecx
+       jnc     0b
+
+       add     $2, %rsi
+       movb    $1, (%rsp, %rax, 1)     # register first chars in set
+       movb    $1, (%rsp, %rdx, 1)
+       mov     %rdi, %rax              # a copy of the source to iterate over
+
+       /* process remaining chars in set */
+       ALIGN_TEXT
+0:     movzbl  (%rsi), %ecx
+       movb    $1, (%rsp, %rcx, 1)
+       test    %ecx, %ecx
+       jz      1f
+
+       movzbl  1(%rsi), %ecx
+       movb    $1, (%rsp, %rcx, 1)
+       test    %ecx, %ecx
+       jz      1f
+
+       add     $2, %rsi
+       jmp     0b
+
+       /* find match */
+       ALIGN_TEXT
+1:     movzbl  (%rax), %ecx
+       cmpb    $0, (%rsp, %rcx, 1)
+       jne     2f
+
+       movzbl  1(%rax), %ecx
+       cmpb    $0, (%rsp, %rcx, 1)
+       jne     3f
+
+       movzbl  2(%rax), %ecx
+       cmpb    $0, (%rsp, %rcx, 1)
+       jne     4f
+
+       movzbl  3(%rax), %ecx
+       add     $4, %rax
+       cmpb    $0, (%rsp, %rcx, 1)
+       je      1b
+
+       sub     $3, %rax
+4:     dec     %rdi
+3:     inc     %rax
+2:     sub     %rdi, %rax              # number of characters preceding match
+       leave
+       ret
+
+       /* set is empty, degrades to strlen */
+.Lstrlen:
+       leave
+       jmp     CNAME(strlen)
+
+       /* just one character in set, degrades to strchr */
+.Lstrchr:
+       mov     %rdi, (%rsp)            # stash a copy of the string
+       mov     %eax, %esi              # find the character in the set
+       call    CNAME(strchrnul)
+       sub     (%rsp), %rax            # length of prefix before match
+       leave
+       ret
+ARCHEND(strcspn, scalar)
+
+       /*
+        * This kernel uses pcmpistri to do the heavy lifting.
+        * We provide five code paths, depending on set size:
+        *
+        *      0: call strlen()
+        *      1: call strchr()
+        *  2--16: one pcmpistri per 16 bytes of input
+        * 17--32: two pcmpistri per 16 bytes of input
+        *   >=33: fall back to lookup table
+        */
+ARCHENTRY(strcspn, x86_64_v2)
+       push            %rbp
+       mov             %rsp, %rbp
+       sub             $256, %rsp
+
+       /* check for special cases */
+       movzbl          (%rsi), %eax
+       test            %eax, %eax              # empty string?
+       jz              .Lstrlenv2
+
+       cmpb            $0, 1(%rsi)             # single character string?
+       jz              .Lstrchrv2
+
+       /* find set size and copy up to 32 bytes to (%rsp) */
+       mov             %esi, %ecx
+       and             $~0xf, %rsi             # align set pointer
+       movdqa          (%rsi), %xmm0
+       pxor            %xmm1, %xmm1
+       and             $0xf, %ecx              # number of bytes rsi is past alignment
+       xor             %edx, %edx
+       pcmpeqb         %xmm0, %xmm1            # end of string reached?
+       movdqa          %xmm0, 32(%rsp)         # transfer head of set to stack
+       pmovmskb        %xmm1, %eax
+       shr             %cl, %eax               # clear out junk before string
+       test            %eax, %eax              # end of set reached?
+       jnz             0f
+
+       movdqa          16(%rsi), %xmm0         # second chunk of the set
+       mov             $16, %edx
+       sub             %ecx, %edx              # length of set preceding xmm0
+       pxor            %xmm1, %xmm1
+       pcmpeqb         %xmm0, %xmm1
+       movdqa          %xmm0, 48(%rsp)
+       movdqu          32(%rsp, %rcx, 1), %xmm2 # head of set
+       pmovmskb        %xmm1, %eax
+       test            %eax, %eax
+       jnz             1f
+
+       movdqa          32(%rsi), %xmm0         # third chunk
+       add             $16, %edx
+       pxor            %xmm1, %xmm1
+       pcmpeqb         %xmm0, %xmm1
+       movdqa          %xmm0, 64(%rsp)
+       pmovmskb        %xmm1, %eax
+       test            %eax, %eax              # still not done?
+       jz              .Lgt32v2
+
+0:     movdqu          32(%rsp, %rcx, 1), %xmm2 # head of set
+1:     tzcnt           %eax, %eax
+       add             %eax, %edx              # length of set (excluding NUL byte)
+       cmp             $32, %edx               # above 32 bytes?
+       ja              .Lgt32v2
+
+       /*
+        * At this point we know that we want to use pcmpistri.
+        * One last problem remains: the head of the string is not
+        * aligned and may cross a page boundary.  If this is the case,
+        * we take the part before the page boundary and repeat the
+        * last byte to fill up the xmm register.
+        */
+       mov             %rdi, %rax              # save original string pointer
+       lea             15(%rdi), %esi          # last byte of the head
+       xor             %edi, %esi
+       test            $PAGE_SIZE, %esi        # does the head cross a page?
+       jz              0f
+
+       /* head crosses page: copy to stack to fix up */
+       and             $~0xf, %rax             # align head pointer temporarily
+       movzbl          15(%rax), %esi          # last head byte on the page
+       movdqa          (%rax), %xmm0
+       movabs          $0x0101010101010101, %r8
+       imul            %r8, %rsi               # repeated 8 times
+       movdqa          %xmm0, (%rsp)           # head word on stack
+       mov             %rsi, 16(%rsp)          # followed by filler (last byte x8)
+       mov             %rsi, 24(%rsp)
+       mov             %edi, %eax
+       and             $0xf, %eax              # offset of head from alignment
+       add             %rsp, %rax              # pointer to fake head
+
+0:     movdqu          (%rax), %xmm0           # load head (fake or real)
+       lea             16(%rdi), %rax
+       and             $~0xf, %rax             # second 16 bytes of string (aligned)
+1:     cmp             $16, %edx               # 16--32 bytes?
+       ja              .Lgt16v2
+
+
+       /* set is 2--16 bytes in size */
+
+       /* _SIDD_UBYTE_OPS|_SIDD_CMP_EQUAL_ANY|_SIDD_LEAST_SIGNIFICANT */
+       pcmpistri       $0, %xmm0, %xmm2        # match in head?
+       jbe             .Lheadmatchv2
+
+       ALIGN_TEXT
+0:     pcmpistri       $0, (%rax), %xmm2
+       jbe             1f                      # match or end of string?
+       pcmpistri       $0, 16(%rax), %xmm2
+       lea             32(%rax), %rax
+       ja              0b                      # match or end of string?
+
+3:     lea             -16(%rax), %rax         # go back to second half
+1:     jc              2f                      # jump if match found
+       movdqa          (%rax), %xmm0           # reload string piece
+       pxor            %xmm1, %xmm1
+       pcmpeqb         %xmm1, %xmm0            # where is the NUL byte?
+       pmovmskb        %xmm0, %ecx
+       tzcnt           %ecx, %ecx              # location of NUL byte in (%rax)
+2:     sub             %rdi, %rax              # offset of %xmm0 from beginning of string
+       add             %rcx, %rax              # prefix length before match/NUL
+       leave
+       ret
+
+.Lheadmatchv2:
+       jc              2f                      # jump if match found
+       pxor            %xmm1, %xmm1
+       pcmpeqb         %xmm1, %xmm0
+       pmovmskb        %xmm0, %ecx
+       tzcnt           %ecx, %ecx              # location of NUL byte
+2:     mov             %ecx, %eax              # prefix length before match/NUL
+       leave
+       ret
+
+.Lgt16v2:
+       movdqu          48(%rsp, %rcx, 1), %xmm3 # second part of set
+
+       /* set is 17--32 bytes in size */
+       pcmpistri       $0, %xmm0, %xmm2        # match in head?
+       jbe             .Lheadmatchv2
+       pcmpistri       $0, %xmm0, %xmm3        # ZF=1 not possible here
+       jb              .Lheadmatchv2
+
+       ALIGN_TEXT
+0:     movdqa          (%rax), %xmm0
+       pcmpistri       $0, %xmm0, %xmm2
+       jbe             1b
+       pcmpistri       $0, %xmm0, %xmm3
+       jb              1f                      # ZF=1 not possible here
+       movdqa          16(%rax), %xmm0
+       add             $32, %rax
+       pcmpistri       $0, %xmm0, %xmm2
+       jbe             3b
+       pcmpistri       $0, %xmm0, %xmm3
+       jae             0b                      # ZF=1 not possible here
+
+       sub             $16, %rax               # go back to second half
+1:     add             %rcx, %rax
+       sub             %rdi, %rax
+       leave
+       ret
+
+       /* set is empty, degrades to strlen */
+.Lstrlenv2:
+       leave
+       jmp     CNAME(strlen)
+
+       /* just one character in set, degrades to strchr */
+.Lstrchrv2:
+       mov     %rdi, (%rsp)            # stash a copy of the string
+       mov     %eax, %esi              # find this character
+       call    CNAME(strchrnul)
+       sub     (%rsp), %rax            # length of prefix before match
+       leave
+       ret
+
+       /* set is >=33 bytes in size */
+.Lgt32v2:
+       xorps   %xmm0, %xmm0
+       mov     $256-64, %edx
+
+       /* clear out lookup table */
+0:     movaps  %xmm0, (%rsp, %rdx, 1)
+       movaps  %xmm0, 16(%rsp, %rdx, 1)
+       movaps  %xmm0, 32(%rsp, %rdx, 1)
+       movaps  %xmm0, 48(%rsp, %rdx, 1)
+       sub     $64, %edx
+       jnc     0b
+
+       add     %rcx, %rsi              # restore string pointer
+       mov     %rdi, %rax              # keep a copy of the string
+
+       /* initialise lookup table */
+       ALIGN_TEXT
+0:     movzbl  (%rsi), %ecx
+       movb    $1, (%rsp, %rcx, 1)
+       test    %ecx, %ecx
+       jz      1f
+
+       movzbl  1(%rsi), %ecx
+       movb    $1, (%rsp, %rcx, 1)
+       test    %ecx, %ecx
+       jz      1f
+
+       movzbl  2(%rsi), %ecx
+       movb    $1, (%rsp, %rcx, 1)
+       test    %ecx, %ecx
+       jz      1f
+
+       movzbl  3(%rsi), %ecx
+       movb    $1, (%rsp, %rcx, 1)
+       test    %ecx, %ecx
+       jz      1f
+
+       add     $4, %rsi
+       jmp     0b
+
+       /* find match */
+       ALIGN_TEXT
+1:     movzbl  (%rax), %ecx
+       cmpb    $0, (%rsp, %rcx, 1)
+       jne     2f
+
+       movzbl  1(%rax), %ecx
+       cmpb    $0, (%rsp, %rcx, 1)
+       jne     3f
+
+       movzbl  2(%rax), %ecx
+       cmpb    $0, (%rsp, %rcx, 1)
+       jne     4f
+
+       movzbl  3(%rax), %ecx
+       add     $4, %rax
+       cmpb    $0, (%rsp, %rcx, 1)
+       je      1b
+
+       sub     $3, %rax
+4:     dec     %rdi
+3:     inc     %rax
+2:     sub     %rdi, %rax              # number of characters preceding match
+       leave
+       ret
+ARCHEND(strcspn, x86_64_v2)
+
+       .section .note.GNU-stack,"",%progbits
