string: add timingsafe_bcmp(3) scalar, baseline implementations

Robert Clausecker Sun, 15 Oct 2023 12:31:28 -0700

The branch main has been updated by fuz:

URL: https://cgit.FreeBSD.org/src/commit/?id=76c2b331bcd9f73c5c8c43a06e328fa0c7b8c39a


commit 76c2b331bcd9f73c5c8c43a06e328fa0c7b8c39a
Author:     Robert Clausecker <f...@freebsd.org>
AuthorDate: 2023-08-30 15:37:26 +0000
Commit:     Robert Clausecker <f...@freebsd.org>
CommitDate: 2023-10-15 19:19:04 +0000

    lib/libc/amd64/string: add timingsafe_bcmp(3) scalar, baseline implementations
    
    Very straightforward and similar to memcmp(3). The code has
    been written to use only instructions specified as having
    data operand independent timing by Intel.
    
    Sponsored by:   The FreeBSD Foundation
    Approved by:    security (cperciva)
    Differential Revision:  https://reviews.freebsd.org/D41673
---
 lib/libc/amd64/string/Makefile.inc      |   3 +-
 lib/libc/amd64/string/timingsafe_bcmp.S | 232 ++++++++++++++++++++++++++++++++
 2 files changed, 234 insertions(+), 1 deletion(-)

diff --git a/lib/libc/amd64/string/Makefile.inc b/lib/libc/amd64/string/Makefile.inc
index 73973a6d69de..fc420de0450e 100644
--- a/lib/libc/amd64/string/Makefile.inc
+++ b/lib/libc/amd64/string/Makefile.inc
@@ -15,4 +15,5 @@ MDSRCS+= \
        strcspn.S \
        strlen.S \
        strnlen.c \
-       strspn.S
+       strspn.S \
+       timingsafe_bcmp.S
diff --git a/lib/libc/amd64/string/timingsafe_bcmp.S b/lib/libc/amd64/string/timingsafe_bcmp.S
new file mode 100644
index 000000000000..c003da2ea9a7
--- /dev/null
+++ b/lib/libc/amd64/string/timingsafe_bcmp.S
@@ -0,0 +1,232 @@
+/*-
+ * Copyright (c) 2023 The FreeBSD Foundation
+ *
+ * This software was developed by Robert Clausecker <f...@freebsd.org>
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE
+ */
+
+#include <machine/asm.h>
+
+#include "amd64_archlevel.h"
+
+#define ALIGN_TEXT      .p2align 4,0x90 /* 16-byte alignment, nop filled */
+
+ARCHFUNCS(timingsafe_bcmp)                     /* variant list; macros come from amd64_archlevel.h */
+       ARCHFUNC(timingsafe_bcmp, scalar)       /* general purpose register variant, see ARCHENTRY below */
+       ARCHFUNC(timingsafe_bcmp, baseline)     /* SSE (xmm register) variant, see ARCHENTRY below */
+ENDARCHFUNCS(timingsafe_bcmp)                  /* NOTE(review): presumably selects one variant at run time -- confirm in amd64_archlevel.h */
+
+ARCHENTRY(timingsafe_bcmp, scalar)     /* rdi, rsi: buffers; rdx: length in bytes; eax: 0 iff all bytes equal */
+       cmp     $16, %rdx               # at least 17 bytes to process?
+       ja      .Lgt16
+       /* rdx <= 16 from here on, so comparing the 32-bit edx is enough */
+       cmp     $8, %edx                # at least 9 bytes to process?
+       ja      .L0916
+
+       cmp     $4, %edx                # at least 5 bytes to process?
+       ja      .L0508
+
+       cmp     $2, %edx                # at least 3 bytes to process?
+       ja      .L0304
+
+       test    %edx, %edx              # buffer empty?
+       jnz     .L0102
+
+       xor     %eax, %eax              # empty buffer always matches
+       ret
+       /* short cases: xor the first and the last chunk of each buffer; the two chunks may overlap */
+.L0102:        movzbl  (%rdi), %eax            # load 1--2 bytes from first buffer
+       movzbl  -1(%rdi, %rdx, 1), %ecx # last byte (the same byte again if rdx == 1)
+       xor     (%rsi), %al             # xor in second buffer
+       xor     -1(%rsi, %rdx, 1), %cl
+       or      %ecx, %eax              # mismatch in any of the two?
+       ret
+
+.L0304:        movzwl  (%rdi), %eax            # 3--4 bytes: first two bytes, zero extended
+       movzwl  -2(%rdi, %rdx, 1), %ecx # last two bytes, zero extended
+       xor     (%rsi), %ax             # differences against the second buffer
+       xor     -2(%rsi, %rdx, 1), %cx
+       or      %ecx, %eax              # nonzero iff some byte differs
+       ret
+
+.L0508:        mov     (%rdi), %eax            # 5--8 bytes: first four bytes
+       mov     -4(%rdi, %rdx, 1), %ecx # last four bytes
+       xor     (%rsi), %eax            # differences against the second buffer
+       xor     -4(%rsi, %rdx, 1), %ecx
+       or      %ecx, %eax              # nonzero iff some byte differs
+       ret
+
+.L0916:        mov     (%rdi), %rax            # 9--16 bytes: first eight bytes
+       mov     -8(%rdi, %rdx, 1), %rcx # last eight bytes
+       xor     (%rsi), %rax            # differences against the second buffer
+       xor     -8(%rsi, %rdx, 1), %rcx
+       or      %rcx, %rax              # ZF clear iff some byte differs
+       setnz   %al                     # ensure EAX nonzero even if only
+       ret                             # high bits of RAX were set
+
+       /* more than 16 bytes: process buffer in a loop */
+.Lgt16:        mov     (%rdi), %rax            # process first 16 bytes
+       mov     8(%rdi), %r9
+       mov     $32, %ecx               # rcx = end offset of the next 16 byte chunk
+       xor     (%rsi), %rax
+       xor     8(%rsi), %r9
+       or      %r9, %rax               # rax accumulates all differences seen so far
+
+       cmp     %rdx, %rcx              # enough left for a full iteration?
+       jae     .Ltail
+
+       /* main loop processing 16 bytes per iteration */
+       ALIGN_TEXT
+0:     mov     -16(%rdi, %rcx, 1), %r8 # bytes [rcx-16, rcx) of the first buffer
+       mov     -8(%rdi, %rcx, 1), %r9
+       xor     -16(%rsi, %rcx, 1), %r8 # differences against the second buffer
+       xor     -8(%rsi, %rcx, 1), %r9
+       add     $16, %rcx               # advance to the next chunk
+       or      %r9, %r8
+       or      %r8, %rax               # fold this chunk into the accumulator
+
+       cmp     %rdx, %rcx              # loop while the next chunk ends before the buffer end
+       jb      0b
+
+       /* process last 16 bytes */
+.Ltail:        mov     -16(%rdi, %rdx, 1), %r8 # may overlap bytes that were already compared
+       mov     -8(%rdi, %rdx, 1), %r9
+       xor     -16(%rsi, %rdx, 1), %r8
+       xor     -8(%rsi, %rdx, 1), %r9
+       or      %r9, %r8
+       or      %r8, %rax               # ZF clear iff any byte differed anywhere
+       setnz   %al                     # make EAX nonzero even if only the high half of RAX is set
+       ret
+ARCHEND(timingsafe_bcmp, scalar)
+
+ARCHENTRY(timingsafe_bcmp, baseline)   /* rdi, rsi: buffers; rdx: length in bytes; eax: 0 iff all bytes equal */
+       cmp     $32, %rdx               # at least 33 bytes to process?
+       ja      .Lgt32b
+       /* rdx <= 32 from here on, so comparing the 32-bit edx is enough */
+       cmp     $16, %edx               # at least 17 bytes to process?
+       ja      .L1732b
+
+       cmp     $8, %edx                # at least 9 bytes to process?
+       ja      .L0916b
+
+       cmp     $4, %edx                # at least 5 bytes to process?
+       ja      .L0508b
+
+       cmp     $2, %edx                # at least 3 bytes to process?
+       ja      .L0304b
+
+       test    %edx, %edx              # buffer empty?
+       jnz     .L0102b
+
+       xor     %eax, %eax              # empty buffer always matches
+       ret
+       /* short cases: xor the first and the last chunk of each buffer; the two chunks may overlap */
+.L0102b:
+       movzbl  (%rdi), %eax            # load 1--2 bytes from first buffer
+       movzbl  -1(%rdi, %rdx, 1), %ecx # last byte (the same byte again if rdx == 1)
+       xor     (%rsi), %al             # xor in second buffer
+       xor     -1(%rsi, %rdx, 1), %cl
+       or      %ecx, %eax              # mismatch in any of the two?
+       ret
+
+.L0304b:
+       movzwl  (%rdi), %eax            # 3--4 bytes: first two bytes, zero extended
+       movzwl  -2(%rdi, %rdx, 1), %ecx # last two bytes, zero extended
+       xor     (%rsi), %ax             # differences against the second buffer
+       xor     -2(%rsi, %rdx, 1), %cx
+       or      %ecx, %eax              # nonzero iff some byte differs
+       ret
+
+.L0508b:
+       mov     (%rdi), %eax            # 5--8 bytes: first four bytes
+       mov     -4(%rdi, %rdx, 1), %ecx # last four bytes
+       xor     (%rsi), %eax            # differences against the second buffer
+       xor     -4(%rsi, %rdx, 1), %ecx
+       or      %ecx, %eax              # nonzero iff some byte differs
+       ret
+
+.L0916b:
+       mov     (%rdi), %rax            # 9--16 bytes: first eight bytes
+       mov     -8(%rdi, %rdx, 1), %rcx # last eight bytes
+       xor     (%rsi), %rax            # differences against the second buffer
+       xor     -8(%rsi, %rdx, 1), %rcx
+       or      %rcx, %rax              # ZF clear iff some byte differs
+       setnz   %al                     # ensure EAX nonzero even if only
+       ret                             # high bits of RAX were set
+       /* 17--32 bytes: compare the first and the last 16 bytes, which overlap unless rdx == 32 */
+.L1732b:
+       movdqu          (%rdi), %xmm0                   # first 16 bytes of first buffer
+       movdqu          (%rsi), %xmm2                   # first 16 bytes of second buffer
+       movdqu          -16(%rdi, %rdx, 1), %xmm1       # last 16 bytes of first buffer
+       movdqu          -16(%rsi, %rdx, 1), %xmm3       # last 16 bytes of second buffer
+       pcmpeqb         %xmm2, %xmm0    # 0xff in every byte lane that is equal
+       pcmpeqb         %xmm3, %xmm1
+       pand            %xmm1, %xmm0    # lane stays 0xff only if equal in both chunks
+       pmovmskb        %xmm0, %eax     # 1 where equal
+       xor             $0xffff, %eax   # 1 where not equal
+       ret
+
+       /* more than 32 bytes: process buffer in a loop */
+.Lgt32b:
+       movdqu          (%rdi), %xmm4                   # first 32 bytes of both buffers
+       movdqu          (%rsi), %xmm2
+       movdqu          16(%rdi), %xmm1
+       movdqu          16(%rsi), %xmm3
+       mov             $64, %ecx       # rcx = end offset of the next 32 byte chunk
+       pcmpeqb         %xmm2, %xmm4
+       pcmpeqb         %xmm3, %xmm1
+       pand            %xmm1, %xmm4    # xmm4 accumulates the lanes that were equal so far
+       cmp             %rdx, %rcx      # enough left for a full iteration?
+       jae             .Ltailb
+
+       /* main loop processing 32 bytes per iteration */
+       ALIGN_TEXT
+0:     movdqu          -32(%rdi, %rcx, 1), %xmm0       # bytes [rcx-32, rcx) of both buffers
+       movdqu          -32(%rsi, %rcx, 1), %xmm2
+       movdqu          -16(%rdi, %rcx, 1), %xmm1
+       movdqu          -16(%rsi, %rcx, 1), %xmm3
+       add             $32, %rcx       # advance to the next chunk
+       pcmpeqb         %xmm2, %xmm0
+       pcmpeqb         %xmm3, %xmm1
+       pand            %xmm1, %xmm0
+       pand            %xmm0, %xmm4    # fold this chunk into the accumulator
+       cmp             %rdx, %rcx      # loop while the next chunk ends before the buffer end
+       jb              0b
+
+       /* process last 32 bytes */
+.Ltailb:
+       movdqu          -32(%rdi, %rdx, 1), %xmm0       # may overlap bytes that were already compared
+       movdqu          -32(%rsi, %rdx, 1), %xmm2
+       movdqu          -16(%rdi, %rdx, 1), %xmm1
+       movdqu          -16(%rsi, %rdx, 1), %xmm3
+       pcmpeqb         %xmm2, %xmm0
+       pcmpeqb         %xmm3, %xmm1
+       pand            %xmm1, %xmm0
+       pand            %xmm4, %xmm0    # merge with the result of all earlier chunks
+       pmovmskb        %xmm0, %eax     # 1 where equal
+       xor             $0xffff, %eax   # 1 where not equal, so eax == 0 iff the buffers match
+       ret
+ARCHEND(timingsafe_bcmp, baseline)
+
+       .section .note.GNU-stack,"",%progbits

git: 76c2b331bcd9 - main - lib/libc/amd64/string: add timingsafe_bcmp(3) scalar, baseline implementations

Reply via email to