The branch stable/13 has been updated by andrew:

URL: https://cgit.FreeBSD.org/src/commit/?id=3a909754a10ead4e6d9e85f2f03ff8b8bcb5b29f

commit 3a909754a10ead4e6d9e85f2f03ff8b8bcb5b29f
Author:     Andrew Turner <[email protected]>
AuthorDate: 2022-09-07 11:41:13 +0000
Commit:     Andrew Turner <[email protected]>
CommitDate: 2022-09-21 09:45:53 +0000

    Import the arm64 Cortex Strings memset into the kernel
    
    Bring in the last version of the optimized memset from the Cortex
    Strings library that didn't use the VFP registers. While here clean up
    to use the ENTRY/END macros.
    
    This is imported from:
    
    https://git.linaro.org/toolchain/cortex-strings.git/tree/src/aarch64/memset.S?id=bc654e7a594385071b42db3c9c94bd0fd5fe0da5
    
    Sponsored by:   The FreeBSD Foundation
    
    (cherry picked from commit 949bc17cf28bb0a21b4688f1effad01de63d3318)
---
 sys/arm64/arm64/memset.S | 235 +++++++++++++++++++++++++++++++++++++++++++++++
 sys/conf/files.arm64     |   3 +-
 2 files changed, 236 insertions(+), 2 deletions(-)

diff --git a/sys/arm64/arm64/memset.S b/sys/arm64/arm64/memset.S
new file mode 100644
index 000000000000..aaa196d35278
--- /dev/null
+++ b/sys/arm64/arm64/memset.S
@@ -0,0 +1,235 @@
+/* Copyright (c) 2012, Linaro Limited
+   All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are met:
+       * Redistributions of source code must retain the above copyright
+         notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above copyright
+         notice, this list of conditions and the following disclaimer in the
+         documentation and/or other materials provided with the distribution.
+       * Neither the name of the Linaro nor the
+         names of its contributors may be used to endorse or promote products
+         derived from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * Unaligned accesses
+ *
+ */
+
+#include <machine/asm.h>
+
+
+/* By default we assume that the DC instruction can be used to zero
+   data blocks more efficiently.  In some circumstances this might be
+   unsafe, for example in an asymmetric multiprocessor environment with
+   different DC clear lengths (neither the upper nor lower lengths are
+   safe to use).  The feature can be disabled by defining DONT_USE_DC.
+
+   If code may be run in a virtualized environment, then define
+   MAYBE_VIRT.  This will cause the code to cache the system register
+   values rather than re-reading them each call.  */
+
+#define dstin          x0
+#define val            w1
+#define count          x2
+#define tmp1           x3
+#define tmp1w          w3
+#define tmp2           x4
+#define tmp2w          w4
+#define zva_len_x      x5
+#define zva_len                w5
+#define zva_bits_x     x6
+
+#define A_l            x7
+#define A_lw           w7
+#define dst            x8
+#define tmp3w          w9
+
+ENTRY(memset)
+
+       mov     dst, dstin              /* Preserve return value.  */
+       ands    A_lw, val, #255
+#ifndef DONT_USE_DC
+       b.eq    .Lzero_mem
+#endif
+       orr     A_lw, A_lw, A_lw, lsl #8
+       orr     A_lw, A_lw, A_lw, lsl #16
+       orr     A_l, A_l, A_l, lsl #32
+.Ltail_maybe_long:
+       cmp     count, #64
+       b.ge    .Lnot_short
+.Ltail_maybe_tiny:
+       cmp     count, #15
+       b.le    .Ltail15tiny
+.Ltail63:
+       ands    tmp1, count, #0x30
+       b.eq    .Ltail15
+       add     dst, dst, tmp1
+       cmp     tmp1w, #0x20
+       b.eq    1f
+       b.lt    2f
+       stp     A_l, A_l, [dst, #-48]
+1:
+       stp     A_l, A_l, [dst, #-32]
+2:
+       stp     A_l, A_l, [dst, #-16]
+
+.Ltail15:
+       and     count, count, #15
+       add     dst, dst, count
+       stp     A_l, A_l, [dst, #-16]   /* Repeat some/all of last store. */
+       ret
+
+.Ltail15tiny:
+       /* Set up to 15 bytes.  Does not assume earlier memory
+          being set.  */
+       tbz     count, #3, 1f
+       str     A_l, [dst], #8
+1:
+       tbz     count, #2, 1f
+       str     A_lw, [dst], #4
+1:
+       tbz     count, #1, 1f
+       strh    A_lw, [dst], #2
+1:
+       tbz     count, #0, 1f
+       strb    A_lw, [dst]
+1:
+       ret
+
+       /* Critical loop.  Start at a new cache line boundary.  Assuming
+        * 64 bytes per line, this ensures the entire loop is in one line.  */
+       .p2align 6
+.Lnot_short:
+       neg     tmp2, dst
+       ands    tmp2, tmp2, #15
+       b.eq    2f
+       /* Bring DST to 128-bit (16-byte) alignment.  We know that there's
+        * more than that to set, so we simply store 16 bytes and advance by
+        * the amount required to reach alignment.  */
+       sub     count, count, tmp2
+       stp     A_l, A_l, [dst]
+       add     dst, dst, tmp2
+       /* There may be less than 63 bytes to go now.  */
+       cmp     count, #63
+       b.le    .Ltail63
+2:
+       sub     dst, dst, #16           /* Pre-bias.  */
+       sub     count, count, #64
+1:
+       stp     A_l, A_l, [dst, #16]
+       stp     A_l, A_l, [dst, #32]
+       stp     A_l, A_l, [dst, #48]
+       stp     A_l, A_l, [dst, #64]!
+       subs    count, count, #64
+       b.ge    1b
+       tst     count, #0x3f
+       add     dst, dst, #16
+       b.ne    .Ltail63
+       ret
+
+#ifndef DONT_USE_DC
+       /* For zeroing memory, check to see if we can use the ZVA feature to
+        * zero entire 'cache' lines.  */
+.Lzero_mem:
+       mov     A_l, #0
+       cmp     count, #63
+       b.le    .Ltail_maybe_tiny
+       neg     tmp2, dst
+       ands    tmp2, tmp2, #15
+       b.eq    1f
+       sub     count, count, tmp2
+       stp     A_l, A_l, [dst]
+       add     dst, dst, tmp2
+       cmp     count, #63
+       b.le    .Ltail63
+1:
+       /* For zeroing small amounts of memory, it's not worth setting up
+        * the line-clear code.  */
+       cmp     count, #128
+       b.lt    .Lnot_short
+#ifdef MAYBE_VIRT
+       /* For efficiency when virtualized, we cache the ZVA capability.  */
+       adrp    tmp2, .Lcache_clear
+       ldr     zva_len, [tmp2, #:lo12:.Lcache_clear]
+       tbnz    zva_len, #31, .Lnot_short
+       cbnz    zva_len, .Lzero_by_line
+       mrs     tmp1, dczid_el0
+       tbz     tmp1, #4, 1f
+       /* ZVA not available.  Remember this for next time.  */
+       mov     zva_len, #~0
+       str     zva_len, [tmp2, #:lo12:.Lcache_clear]
+       b       .Lnot_short
+1:
+       mov     tmp3w, #4
+       and     zva_len, tmp1w, #15     /* Safety: other bits reserved.  */
+       lsl     zva_len, tmp3w, zva_len
+       str     zva_len, [tmp2, #:lo12:.Lcache_clear]
+#else
+       mrs     tmp1, dczid_el0
+       tbnz    tmp1, #4, .Lnot_short
+       mov     tmp3w, #4
+       and     zva_len, tmp1w, #15     /* Safety: other bits reserved.  */
+       lsl     zva_len, tmp3w, zva_len
+#endif
+
+.Lzero_by_line:
+       /* Compute how far we need to go to become suitably aligned.  We're
+        * already at quad-word alignment.  */
+       cmp     count, zva_len_x
+       b.lt    .Lnot_short             /* Not enough to reach alignment.  */
+       sub     zva_bits_x, zva_len_x, #1
+       neg     tmp2, dst
+       ands    tmp2, tmp2, zva_bits_x
+       b.eq    1f                      /* Already aligned.  */
+       /* Not aligned, check that there's enough to copy after alignment.  */
+       sub     tmp1, count, tmp2
+       cmp     tmp1, #64
+       ccmp    tmp1, zva_len_x, #8, ge /* NZCV=0b1000 */
+       b.lt    .Lnot_short
+       /* We know that there's at least 64 bytes to zero and that it's safe
+        * to overrun by 64 bytes.  */
+       mov     count, tmp1
+2:
+       stp     A_l, A_l, [dst]
+       stp     A_l, A_l, [dst, #16]
+       stp     A_l, A_l, [dst, #32]
+       subs    tmp2, tmp2, #64
+       stp     A_l, A_l, [dst, #48]
+       add     dst, dst, #64
+       b.ge    2b
+       /* We've overrun a bit, so adjust dst downwards.  */
+       add     dst, dst, tmp2
+1:
+       sub     count, count, zva_len_x
+3:
+       dc      zva, dst
+       add     dst, dst, zva_len_x
+       subs    count, count, zva_len_x
+       b.ge    3b
+       ands    count, count, zva_bits_x
+       b.ne    .Ltail_maybe_long
+       ret
+#ifdef MAYBE_VIRT
+       .bss
+       .p2align 2
+.Lcache_clear:
+       .space 4
+#endif
+#endif /* DONT_USE_DC */
+END(memset)
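
For reference, the DC ZVA probe at the top of the .Lzero_mem path above decodes
DCZID_EL0 before any "dc zva" is issued. A minimal userland C sketch of the same
computation (not part of the commit; field layout per the Arm architecture
manual, and DCZID_EL0 is readable from EL0):

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t dczid;

	/* Equivalent of "mrs tmp1, dczid_el0" in the assembly above. */
	__asm__("mrs %0, dczid_el0" : "=r" (dczid));

	/* Bit 4 (DZP) set means DC ZVA must not be used; memset.S then
	 * falls back to the plain stp store loop at .Lnot_short. */
	if (dczid & (1 << 4)) {
		printf("DC ZVA prohibited; store loop only\n");
		return (0);
	}

	/* BS (bits 3:0) is log2 of the block size in 4-byte words, so the
	 * byte length is 4 << BS -- the same computation as the
	 * "mov tmp3w, #4; lsl zva_len, tmp3w, zva_len" sequence. */
	printf("DC ZVA block size: %u bytes\n", 4U << (dczid & 0xf));
	return (0);
}

Under MAYBE_VIRT the assembly additionally caches this value in .bss
(.Lcache_clear), since repeated system-register reads may trap to a hypervisor;
defining DONT_USE_DC skips the DC ZVA path entirely.
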
diff --git a/sys/conf/files.arm64 b/sys/conf/files.arm64
index 963ee0aef8f0..221ec6d0e409 100644
--- a/sys/conf/files.arm64
+++ b/sys/conf/files.arm64
@@ -10,8 +10,6 @@ kern/subr_devmap.c                            standard
 kern/subr_intr.c                               optional intrng
 kern/subr_physmem.c                            standard
 libkern/bcmp.c                                 standard
-libkern/memset.c                               standard        \
-       compile-with "${NORMAL_C:N-fsanitize*}"
 libkern/strlen.c               standard
 libkern/arm64/crc32c_armv8.S                   standard
 
@@ -60,6 +58,7 @@ arm64/arm64/machdep_boot.c                    standard
 arm64/arm64/mem.c                              standard
 arm64/arm64/memcmp.S                           standard
 arm64/arm64/memcpy.S                           standard
+arm64/arm64/memset.S                           standard
 arm64/arm64/minidump_machdep.c                 standard
 arm64/arm64/mp_machdep.c                       optional smp
 arm64/arm64/nexus.c                            standard

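As a footnote on the ccmp idiom in .Lzero_by_line: the pair of comparisons
gates the DC ZVA loop on there being enough bytes left once dst has been
brought up to a block boundary. A rough C rendering of that check (again not
part of the commit; the names mirror the register #defines in memset.S):

#include <stdbool.h>
#include <stdint.h>

/*
 * Use DC ZVA only when, after aligning dst to a zva_len boundary,
 * at least 64 bytes and at least one full block remain to be zeroed.
 */
static bool
zva_worthwhile(uint64_t dst, uint64_t count, uint64_t zva_len)
{
	/* cmp count, zva_len_x; b.lt .Lnot_short */
	if (count < zva_len)
		return (false);

	/* neg tmp2, dst; ands tmp2, tmp2, zva_bits_x */
	uint64_t tmp2 = -dst & (zva_len - 1);
	if (tmp2 == 0)
		return (true);		/* already block-aligned */

	/*
	 * sub  tmp1, count, tmp2
	 * cmp  tmp1, #64
	 * ccmp tmp1, zva_len_x, #8, ge	-- NZCV=0b1000 forces "lt"
	 * b.lt .Lnot_short		   when the first test fails
	 */
	uint64_t tmp1 = count - tmp2;
	return (tmp1 >= 64 && tmp1 >= zva_len);
}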