The branch stable/14 has been updated by fuz:

URL: https://cgit.FreeBSD.org/src/commit/?id=9a6a587e672baaed3470f6cf4a27a0d1166ca372

commit 9a6a587e672baaed3470f6cf4a27a0d1166ca372
Author:     Robert Clausecker <f...@freebsd.org>
AuthorDate: 2023-10-15 19:25:53 +0000
Commit:     Robert Clausecker <f...@freebsd.org>
CommitDate: 2023-12-28 17:02:41 +0000

    lib/libc/amd64/string: add timingsafe_memcmp() assembly implementation
    
    Conceptually very similar to timingsafe_bcmp(), but with comparison
    logic inspired by Elijah Stone's fancy memcmp. A baseline (SSE)
    implementation was omitted this time, as I was not able to get it to
    perform adequately: the best I got was an 8% speedup over the scalar
    version for long inputs, while being slower for short inputs.
    
    Sponsored by:   The FreeBSD Foundation
    Approved by:    security (cperciva)
    Inspired by:    https://github.com/moon-chilled/fancy-memcmp
    Differential Revision:  https://reviews.freebsd.org/D41696
    
    (cherry picked from commit 5048c1b85506c5e0f441ee7dd98dd8d96d0a4a24)
---
 lib/libc/amd64/string/Makefile.inc        |   4 +-
 lib/libc/amd64/string/timingsafe_memcmp.S | 145 ++++++++++++++++++++++++++++++
 2 files changed, 147 insertions(+), 2 deletions(-)
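
For context, this machine-dependent assembly overrides the portable C
implementation of timingsafe_memcmp(3) on amd64.  As a reference for the
semantics being implemented, here is a minimal sketch along the lines of
the portable, OpenBSD-derived version (not the committed code; the _ref
name is illustrative): the sign of the result reflects the first
differing byte, and neither the branches taken nor the memory access
pattern depend on the buffer contents.

#include <limits.h>
#include <stddef.h>

int
timingsafe_memcmp_ref(const void *b1, const void *b2, size_t len)
{
        const unsigned char *p1 = b1, *p2 = b2;
        int res = 0, done = 0;
        size_t i;

        for (i = 0; i < len; i++) {
                /*
                 * lt is -1 if p1[i] < p2[i] and gt is -1 if
                 * p1[i] > p2[i]; this relies on arithmetic right
                 * shift of negative ints, as the portable version does.
                 */
                int lt = (p1[i] - p2[i]) >> CHAR_BIT;
                int gt = (p2[i] - p1[i]) >> CHAR_BIT;

                /* latch the first difference without branching on data */
                res |= (lt - gt) & ~done;
                done |= lt | gt;
        }

        return (res);
}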

diff --git a/lib/libc/amd64/string/Makefile.inc b/lib/libc/amd64/string/Makefile.inc
index fc420de0450e..09bf7c8f251e 100644
--- a/lib/libc/amd64/string/Makefile.inc
+++ b/lib/libc/amd64/string/Makefile.inc
@@ -1,4 +1,3 @@
-
 MDSRCS+= \
        amd64_archlevel.c \
        bcmp.S \
@@ -16,4 +15,5 @@ MDSRCS+= \
        strlen.S \
        strnlen.c \
        strspn.S \
-       timingsafe_bcmp.S
+       timingsafe_bcmp.S \
+       timingsafe_memcmp.S
diff --git a/lib/libc/amd64/string/timingsafe_memcmp.S b/lib/libc/amd64/string/timingsafe_memcmp.S
new file mode 100644
index 000000000000..3f1eccdbd640
--- /dev/null
+++ b/lib/libc/amd64/string/timingsafe_memcmp.S
@@ -0,0 +1,145 @@
+/*-
+ * Copyright (c) 2023 The FreeBSD Foundation
+ *
+ * This software was developed by Robert Clausecker <f...@freebsd.org>
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <machine/asm.h>
+
+#define ALIGN_TEXT     .p2align 4,0x90 /* 16-byte alignment, nop filled */
+
+/* int timingsafe_memcmp(const void *rdi, const void *rsi, size_t rdx) */
+ENTRY(timingsafe_memcmp)
+       cmp     $16, %rdx               # at least 17 bytes to process?
+       ja      .Lgt16
+
+       cmp     $8, %edx                # at least 9 bytes to process?
+       ja      .L0916
+
+       cmp     $4, %edx                # at least 5 bytes to process?
+       ja      .L0508
+
+       cmp     $2, %edx                # at least 3 bytes to process?
+       ja      .L0304
+
+       test    %edx, %edx              # buffer empty?
+       jnz     .L0102
+
+       xor     %eax, %eax              # empty buffer always matches
+       ret
+
+.L0102:        movzbl  -1(%rdi, %rdx, 1), %eax # load 1--2 bytes from first buffer
+       movzbl  -1(%rsi, %rdx, 1), %ecx
+       mov     (%rdi), %ah             # in big endian
+       mov     (%rsi), %ch
+       sub     %ecx, %eax
+       ret
+
+.L0304:        movzwl  -2(%rdi, %rdx, 1), %ecx
+       movzwl  -2(%rsi, %rdx, 1), %edx
+       movzwl  (%rdi), %eax
+       movzwl  (%rsi), %esi
+       bswap   %ecx                    # convert to big endian
+       bswap   %edx                    # ditto for edx, (e)ax, and (e)si
+       rol     $8, %ax                 # ROLW is used here so the upper two
+       rol     $8, %si                 # bytes stay clear, allowing us to
+       sub     %edx, %ecx              # save a SBB compared to .L0508
+       sbb     %esi, %eax
+       or      %eax, %ecx              # nonzero if not equal
+       setnz   %al
+       ret
+
+.L0508:        mov     -4(%rdi, %rdx, 1), %ecx
+       mov     -4(%rsi, %rdx, 1), %edx
+       mov     (%rdi), %edi
+       mov     (%rsi), %esi
+       bswap   %ecx                    # compare in big endian
+       bswap   %edx
+       bswap   %edi
+       bswap   %esi
+       sub     %edx, %ecx
+       sbb     %esi, %edi
+       sbb     %eax, %eax              # -1 if less, 0 if greater or equal
+       or      %edi, %ecx              # nonzero if not equal
+       setnz   %al                     # negative if <, 0 if =, 1 if >
+       ret
+
+.L0916:        mov     -8(%rdi, %rdx, 1), %rcx
+       mov     -8(%rsi, %rdx, 1), %rdx
+       mov     (%rdi), %rdi
+       mov     (%rsi), %rsi
+       bswap   %rcx                    # compare in big endian
+       bswap   %rdx
+       bswap   %rdi
+       bswap   %rsi
+       sub     %rdx, %rcx
+       sbb     %rsi, %rdi
+       sbb     %eax, %eax              # -1 if less, 0 if greater or equal
+       or      %rdi, %rcx              # nonzero if not equal
+       setnz   %al                     # negative if <, 0 if =, 1 if >
+       ret
+
+       /* compare 17+ bytes */
+.Lgt16:        mov     (%rdi), %r8             # process first 16 bytes
+       mov     (%rsi), %r9
+       mov     $32, %ecx
+       cmp     %r8, %r9                # mismatch in head?
+       cmove   8(%rdi), %r8            # if not, try second pair
+       cmove   8(%rsi), %r9
+       cmp     %rdx, %rcx
+       jae     .Ltail
+
+       /* main loop processing 16 bytes per iteration */
+       ALIGN_TEXT
+0:     mov     -16(%rdi, %rcx, 1), %r10
+       mov     -16(%rsi, %rcx, 1), %r11
+       cmp     %r10, %r11              # mismatch in first pair?
+       cmove   -8(%rdi, %rcx, 1), %r10 # if not, try second pair
+       cmove   -8(%rsi, %rcx, 1), %r11
+       cmp     %r8, %r9                # was there a mismatch previously?
+       cmove   %r10, %r8               # apply new pair if there was not
+       cmove   %r11, %r9
+       add     $16, %rcx
+       cmp     %rdx, %rcx
+       jb      0b
+
+.Ltail:        mov     -8(%rdi, %rdx, 1), %r10
+       mov     -8(%rsi, %rdx, 1), %r11
+       cmp     %r8, %r9
+       cmove   -16(%rdi, %rdx, 1), %r8
+       cmove   -16(%rsi, %rdx, 1), %r9
+       bswap   %r10                    # compare in big endian
+       bswap   %r11
+       bswap   %r8
+       bswap   %r9
+       sub     %r11, %r10
+       sbb     %r9, %r8
+       sbb     %eax, %eax              # -1 if less, 0 if greater or equal
+       or      %r10, %r8               # nonzero if not equal
+       setnz   %al                     # negative if <, 0 if =, 1 if >
+       ret
+END(timingsafe_memcmp)
+
+       .section .note.GNU-stack,"",%progbits
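
A note on the technique, for readers following the assembly: each size
class loads the first and the last machine word of each buffer, so the
two loads overlap and together cover every byte in the class's range.
The loads are byte-swapped (BSWAP, or assembled high-byte-first in the
1--2 byte case) so an integer comparison orders them like the bytes they
hold, and the SUB/SBB pair then performs one wide subtraction whose
borrow gives the sign while OR/SETNZ gives the inequality.  The main
loop applies the same idea with CMOVE, latching the first mismatching
pair of words in %r8/%r9 without data-dependent branches.  A rough C
rendering of the 9--16 byte case (.L0916) follows; the helper name is
hypothetical, and the GCC/Clang extensions __builtin_bswap64 and
unsigned __int128 stand in for BSWAP and SUB/SBB:

#include <stdint.h>
#include <string.h>

/* hypothetical sketch of the .L0916 path, for 9 <= len <= 16 */
static int
cmp9to16(const unsigned char *a, const unsigned char *b, size_t len)
{
        uint64_t ahead, atail, bhead, btail;

        memcpy(&ahead, a, 8);                   /* first 8 bytes */
        memcpy(&bhead, b, 8);
        memcpy(&atail, a + len - 8, 8);         /* last 8 bytes, overlapping */
        memcpy(&btail, b + len - 8, 8);

        /* BSWAP: make integer order match byte order */
        ahead = __builtin_bswap64(ahead);
        bhead = __builtin_bswap64(bhead);
        atail = __builtin_bswap64(atail);
        btail = __builtin_bswap64(btail);

        /* SUB/SBB: one 128-bit subtraction, head as the high word */
        unsigned __int128 av = ((unsigned __int128)ahead << 64) | atail;
        unsigned __int128 bv = ((unsigned __int128)bhead << 64) | btail;

        int lt = -(av < bv);    /* SBB %eax, %eax: -1 if a < b, else 0 */
        int ne = (av != bv);    /* SETNZ %al: 1 if the buffers differ */

        return (lt | ne);       /* negative if <, 0 if =, positive if > */
}

The overlap is harmless: any byte covered twice contributes equally to
both words, and the first differing byte still decides the comparison.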
