The branch stable/13 has been updated by andrew:

URL: https://cgit.FreeBSD.org/src/commit/?id=3af87126f68e539453dc530925d7e297ee261c7f

commit 3af87126f68e539453dc530925d7e297ee261c7f
Author:     Andrew Turner <and...@freebsd.org>
AuthorDate: 2022-09-07 11:12:30 +0000
Commit:     Andrew Turner <and...@freebsd.org>
CommitDate: 2022-09-21 09:45:53 +0000

    Import an optimized arm64 memcmp into the kernel
    
    Bring in a version of the Arm Optimized Routines memcmp from before
    the VFP registers were used, so it can be used in the kernel.
    
    Imported with modification from:
    
https://github.com/ARM-software/optimized-routines/blob/e823e3abf5f89ecb/string/aarch64/memcmp.S
    
    Sponsored by:   The FreeBSD Foundation
    
    (cherry picked from commit 51a1bf7ba7eb79c760161a2054c113978dce38cb)
---
 sys/arm64/arm64/memcmp.S | 136 +++++++++++++++++++++++++++++++++++++++++++++++
 sys/conf/files.arm64     |   3 +-
 2 files changed, 137 insertions(+), 2 deletions(-)
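
For reference, the generic libkern memcmp being replaced on arm64 compares one
byte at a time. The C sketch below approximates that reference behaviour (it is
illustrative only, not the exact libkern/memcmp.c source, and the name
ref_memcmp is invented here); it shows the contract the optimized assembly
below must preserve: a negative, zero or positive result determined by the
first differing byte, compared as unsigned chars.

#include <stddef.h>

static int
ref_memcmp(const void *s1, const void *s2, size_t len)
{
	const unsigned char *p1 = s1, *p2 = s2;

	/* Walk both buffers until a byte differs or len runs out. */
	for (; len != 0; len--, p1++, p2++) {
		if (*p1 != *p2)
			return (*p1 - *p2);
	}
	return (0);
}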

diff --git a/sys/arm64/arm64/memcmp.S b/sys/arm64/arm64/memcmp.S
new file mode 100644
index 000000000000..8517a181f3f3
--- /dev/null
+++ b/sys/arm64/arm64/memcmp.S
@@ -0,0 +1,136 @@
+/* memcmp - compare memory
+ *
+ * Copyright (c) 2013-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses.
+ */
+
+#include <machine/asm.h>
+
+#define L(l) .L ## l
+
+/* Parameters and result.  */
+#define src1           x0
+#define src2           x1
+#define limit          x2
+#define result         w0
+
+/* Internal variables.  */
+#define data1          x3
+#define data1w         w3
+#define data1h         x4
+#define data2          x5
+#define data2w         w5
+#define data2h         x6
+#define tmp1           x7
+#define tmp2           x8
+
+ENTRY (memcmp)
+       subs    limit, limit, 8
+       b.lo    L(less8)
+
+       ldr     data1, [src1], 8
+       ldr     data2, [src2], 8
+       cmp     data1, data2
+       b.ne    L(return)
+
+       subs    limit, limit, 8
+       b.gt    L(more16)
+
+       ldr     data1, [src1, limit]
+       ldr     data2, [src2, limit]
+       b       L(return)
+
+L(more16):
+       ldr     data1, [src1], 8
+       ldr     data2, [src2], 8
+       cmp     data1, data2
+       bne     L(return)
+
+       /* Jump directly to comparing the last 16 bytes for 32 byte (or less)
+          strings.  */
+       subs    limit, limit, 16
+       b.ls    L(last_bytes)
+
+       /* We overlap loads between 0-32 bytes at either side of SRC1 when we
+          try to align, so limit it only to strings larger than 128 bytes.  */
+       cmp     limit, 96
+       b.ls    L(loop16)
+
+       /* Align src1 and adjust src2 with bytes not yet done.  */
+       and     tmp1, src1, 15
+       add     limit, limit, tmp1
+       sub     src1, src1, tmp1
+       sub     src2, src2, tmp1
+
+       /* Loop performing 16 bytes per iteration using aligned src1.
+          Limit is pre-decremented by 16 and must be larger than zero.
+          Exit if <= 16 bytes left to do or if the data is not equal.  */
+       .p2align 4
+L(loop16):
+       ldp     data1, data1h, [src1], 16
+       ldp     data2, data2h, [src2], 16
+       subs    limit, limit, 16
+       ccmp    data1, data2, 0, hi
+       ccmp    data1h, data2h, 0, eq
+       b.eq    L(loop16)
+
+       cmp     data1, data2
+       bne     L(return)
+       mov     data1, data1h
+       mov     data2, data2h
+       cmp     data1, data2
+       bne     L(return)
+
+       /* Compare last 1-16 bytes using unaligned access.  */
+L(last_bytes):
+       add     src1, src1, limit
+       add     src2, src2, limit
+       ldp     data1, data1h, [src1]
+       ldp     data2, data2h, [src2]
+       cmp     data1, data2
+       bne     L(return)
+       mov     data1, data1h
+       mov     data2, data2h
+       cmp     data1, data2
+
+       /* Compare data bytes and set return value to 0, -1 or 1.  */
+L(return):
+#ifndef __AARCH64EB__
+       rev     data1, data1
+       rev     data2, data2
+#endif
+       cmp     data1, data2
+L(ret_eq):
+       cset    result, ne
+       cneg    result, result, lo
+       ret
+
+       .p2align 4
+       /* Compare up to 8 bytes.  Limit is [-8..-1].  */
+L(less8):
+       adds    limit, limit, 4
+       b.lo    L(less4)
+       ldr     data1w, [src1], 4
+       ldr     data2w, [src2], 4
+       cmp     data1w, data2w
+       b.ne    L(return)
+       sub     limit, limit, 4
+L(less4):
+       adds    limit, limit, 4
+       beq     L(ret_eq)
+L(byte_loop):
+       ldrb    data1w, [src1], 1
+       ldrb    data2w, [src2], 1
+       subs    limit, limit, 1
+       ccmp    data1w, data2w, 0, ne   /* NZCV = 0b0000.  */
+       b.eq    L(byte_loop)
+       sub     result, data1w, data2w
+       ret
+
+END (memcmp)
+
diff --git a/sys/conf/files.arm64 b/sys/conf/files.arm64
index 86ada6e4c924..963ee0aef8f0 100644
--- a/sys/conf/files.arm64
+++ b/sys/conf/files.arm64
@@ -10,8 +10,6 @@ kern/subr_devmap.c                            standard
 kern/subr_intr.c                               optional intrng
 kern/subr_physmem.c                            standard
 libkern/bcmp.c                                 standard
-libkern/memcmp.c                               standard        \
-       compile-with "${NORMAL_C:N-fsanitize*}"
 libkern/memset.c                               standard        \
        compile-with "${NORMAL_C:N-fsanitize*}"
 libkern/strlen.c               standard
@@ -60,6 +58,7 @@ arm64/arm64/locore.S                          standard no-obj
 arm64/arm64/machdep.c                          standard
 arm64/arm64/machdep_boot.c                     standard
 arm64/arm64/mem.c                              standard
+arm64/arm64/memcmp.S                           standard
 arm64/arm64/memcpy.S                           standard
 arm64/arm64/minidump_machdep.c                 standard
 arm64/arm64/mp_machdep.c                       optional smp

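For readers less familiar with AArch64 assembly, the core idea of the imported
routine can be restated in C. The sketch below is illustrative only
(word_result and sketch_memcmp are invented names, and the real routine works
16 bytes per iteration with ldp, aligns src1, and overlaps the final loads):
compare whole words per load, and on a mismatch byte-reverse both words on
little-endian, as the rev/cmp/cset/cneg sequence at L(return) does, so an
unsigned comparison orders them by the first differing byte and yields
-1, 0 or 1.

#include <stdint.h>
#include <string.h>

/*
 * Mismatch handling, as in L(return): on little-endian (mirroring the
 * assembly's #ifndef __AARCH64EB__), byte-reverse the words ("rev") so the
 * first differing byte becomes the most significant, then map the unsigned
 * comparison to -1/0/1 as "cset"/"cneg" do.
 */
static int
word_result(uint64_t data1, uint64_t data2)
{
#ifndef __AARCH64EB__
	data1 = __builtin_bswap64(data1);
	data2 = __builtin_bswap64(data2);
#endif
	if (data1 == data2)
		return (0);
	return (data1 < data2 ? -1 : 1);
}

/*
 * Word-at-a-time loop; the assembly processes 16 bytes per iteration and
 * handles alignment and tails more cleverly, omitted here for brevity.
 */
static int
sketch_memcmp(const void *s1, const void *s2, size_t limit)
{
	const unsigned char *src1 = s1, *src2 = s2;
	uint64_t data1, data2;

	while (limit >= 8) {
		memcpy(&data1, src1, 8);	/* ldr data1, [src1], 8 */
		memcpy(&data2, src2, 8);	/* ldr data2, [src2], 8 */
		if (data1 != data2)
			return (word_result(data1, data2));
		src1 += 8;
		src2 += 8;
		limit -= 8;
	}
	while (limit-- > 0) {
		if (*src1 != *src2)
			return (*src1 - *src2);	/* sub result, data1w, data2w */
		src1++;
		src2++;
	}
	return (0);
}

Returning -1, 0 or 1 instead of the raw byte difference is still a valid
memcmp result, which is why the assembly can use cset/cneg rather than
subtracting bytes in the word path.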