The branch stable/13 has been updated by andrew:

URL: https://cgit.FreeBSD.org/src/commit/?id=61506f50ef20d08d7849164eff17f23a5d3526f3

commit 61506f50ef20d08d7849164eff17f23a5d3526f3
Author:     Andrew Turner <and...@freebsd.org>
AuthorDate: 2022-09-07 09:51:21 +0000
Commit:     Andrew Turner <and...@freebsd.org>
CommitDate: 2022-09-21 09:45:53 +0000

    Update the arm64 kernel memcpy/memmove
    
    Bring in the latest Arm Optimized Routines memcpy/memmove into the
    arm64 kernel. As these functions have been merged in the current
    version, remove the now-unneeded memmove.S.
    
    Sponsored by:   The FreeBSD Foundation
    
    (cherry picked from commit 01e478d9559566e666a7b758ede219ed23d9a833)
---
 sys/arm64/arm64/memcpy.S  | 265 +++++++++++++++++++++++++---------------------
 sys/arm64/arm64/memmove.S | 150 --------------------------
 sys/conf/files.arm64      |   1 -
 3 files changed, 144 insertions(+), 272 deletions(-)
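
For orientation before the diff below: the imported routine serves both
memcpy and memmove from a single entry point. Small and medium copies load
all of their data before storing any of it, so they tolerate overlap for
free; only copies larger than 128 bytes check for overlap and fall back to a
backwards copy. A minimal C sketch of that dispatch follows (illustrative
only; the names are hypothetical and plain byte loops stand in for the
16-byte ldp/stp pairs used by the assembly):

#include <stddef.h>
#include <stdint.h>

static void
copy_forward(unsigned char *d, const unsigned char *s, size_t n)
{
        /* Stand-in for the 64-bytes-per-iteration forward loop. */
        for (size_t i = 0; i < n; i++)
                d[i] = s[i];
}

static void
copy_backward(unsigned char *d, const unsigned char *s, size_t n)
{
        /* Stand-in for L(copy_long_backwards). */
        while (n-- > 0)
                d[n] = s[n];
}

void *
sketch_memmove(void *dst, const void *src, size_t n)
{
        unsigned char *d = dst;
        const unsigned char *s = src;

        if (n <= 128) {
                /* Small/medium copies: the assembly loads everything from
                 * both ends of the buffer before storing, so overlap needs
                 * no special case.  A bounce buffer has the same effect. */
                unsigned char tmp[128];
                for (size_t i = 0; i < n; i++)
                        tmp[i] = s[i];
                for (size_t i = 0; i < n; i++)
                        d[i] = tmp[i];
                return (dst);
        }
        /* Large copies: go backwards only when dst lies inside the source
         * region, i.e. a forward copy would clobber bytes not yet read.
         * (The assembly additionally early-outs when dst == src.) */
        if ((uintptr_t)d - (uintptr_t)s < n)
                copy_backward(d, s, n);
        else
                copy_forward(d, s, n);
        return (dst);
}

The unsigned compare (uintptr_t)d - (uintptr_t)s < n is the same test as the
assembly's sub/cmp/b.lo sequence: it is true exactly when dst falls within
[src, src + n).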

diff --git a/sys/arm64/arm64/memcpy.S b/sys/arm64/arm64/memcpy.S
index f98c2513fa58..d5fbfa64e0fa 100644
--- a/sys/arm64/arm64/memcpy.S
+++ b/sys/arm64/arm64/memcpy.S
@@ -1,66 +1,20 @@
-/* Copyright (c) 2012, Linaro Limited
-   All rights reserved.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are met:
-       * Redistributions of source code must retain the above copyright
-         notice, this list of conditions and the following disclaimer.
-       * Redistributions in binary form must reproduce the above copyright
-         notice, this list of conditions and the following disclaimer in the
-         documentation and/or other materials provided with the distribution.
-       * Neither the name of the Linaro nor the
-         names of its contributors may be used to endorse or promote products
-         derived from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
-
 /*
- * Copyright (c) 2015 ARM Ltd
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- * 3. The name of the company may not be used to endorse or promote
- *    products derived from this software without specific prior written
- *    permission.
+ * memcpy - copy memory area
  *
- * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
- * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
- * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * Copyright (c) 2012-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
  */
 
-#include <machine/asm.h>
-__FBSDID("$FreeBSD$");
-
 /* Assumptions:
  *
  * ARMv8-a, AArch64, unaligned accesses.
  *
  */
 
+#include <machine/asm.h>
+
+#define L(l) .L ## l
+
 #define dstin  x0
 #define src    x1
 #define count  x2
@@ -70,114 +24,136 @@ __FBSDID("$FreeBSD$");
 #define A_l    x6
 #define A_lw   w6
 #define A_h    x7
-#define A_hw   w7
 #define B_l    x8
 #define B_lw   w8
 #define B_h    x9
 #define C_l    x10
+#define C_lw   w10
 #define C_h    x11
 #define D_l    x12
 #define D_h    x13
-#define E_l    src
-#define E_h    count
-#define F_l    srcend
-#define F_h    dst
-#define tmp1   x9
-
-#define L(l) .L ## l
-
-/* Copies are split into 3 main cases: small copies of up to 16 bytes,
-   medium copies of 17..96 bytes which are fully unrolled. Large copies
-   of more than 96 bytes align the destination and use an unrolled loop
-   processing 64 bytes per iteration.
-   Small and medium copies read all data before writing, allowing any
-   kind of overlap, and memmove tailcalls memcpy for these cases as
-   well as non-overlapping copies.
+#define E_l    x14
+#define E_h    x15
+#define F_l    x16
+#define F_h    x17
+#define G_l    count
+#define G_h    dst
+#define H_l    src
+#define H_h    srcend
+#define tmp1   x14
+
+/* This implementation handles overlaps and supports both memcpy and memmove
+   from a single entry point.  It uses unaligned accesses and branchless
+   sequences to keep the code small, simple and improve performance.
+
+   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
+   copies of up to 128 bytes, and large copies.  The overhead of the overlap
+   check is negligible since it is only required for large copies.
+
+   Large copies use a software pipelined loop processing 64 bytes per iteration.
+   The destination pointer is 16-byte aligned to minimize unaligned accesses.
+   The loop tail is handled by always copying 64 bytes from the end.
 */
 
 ENTRY(memcpy)
-       prfm    PLDL1KEEP, [src]
+EENTRY(memmove)
        add     srcend, src, count
        add     dstend, dstin, count
-       cmp     count, 16
-       b.ls    L(copy16)
-       cmp     count, 96
+       cmp     count, 128
        b.hi    L(copy_long)
+       cmp     count, 32
+       b.hi    L(copy32_128)
 
-       /* Medium copies: 17..96 bytes.  */
-       sub     tmp1, count, 1
+       /* Small copies: 0..32 bytes.  */
+       cmp     count, 16
+       b.lo    L(copy16)
        ldp     A_l, A_h, [src]
-       tbnz    tmp1, 6, L(copy96)
        ldp     D_l, D_h, [srcend, -16]
-       tbz     tmp1, 5, 1f
-       ldp     B_l, B_h, [src, 16]
-       ldp     C_l, C_h, [srcend, -32]
-       stp     B_l, B_h, [dstin, 16]
-       stp     C_l, C_h, [dstend, -32]
-1:
        stp     A_l, A_h, [dstin]
        stp     D_l, D_h, [dstend, -16]
        ret
 
-       .p2align 4
-       /* Small copies: 0..16 bytes.  */
+       /* Copy 8-15 bytes.  */
 L(copy16):
-       cmp     count, 8
-       b.lo    1f
+       tbz     count, 3, L(copy8)
        ldr     A_l, [src]
        ldr     A_h, [srcend, -8]
        str     A_l, [dstin]
        str     A_h, [dstend, -8]
        ret
-       .p2align 4
-1:
-       tbz     count, 2, 1f
+
+       .p2align 3
+       /* Copy 4-7 bytes.  */
+L(copy8):
+       tbz     count, 2, L(copy4)
        ldr     A_lw, [src]
-       ldr     A_hw, [srcend, -4]
+       ldr     B_lw, [srcend, -4]
        str     A_lw, [dstin]
-       str     A_hw, [dstend, -4]
+       str     B_lw, [dstend, -4]
        ret
 
-       /* Copy 0..3 bytes.  Use a branchless sequence that copies the same
-          byte 3 times if count==1, or the 2nd byte twice if count==2.  */
-1:
-       cbz     count, 2f
+       /* Copy 0..3 bytes using a branchless sequence.  */
+L(copy4):
+       cbz     count, L(copy0)
        lsr     tmp1, count, 1
        ldrb    A_lw, [src]
-       ldrb    A_hw, [srcend, -1]
+       ldrb    C_lw, [srcend, -1]
        ldrb    B_lw, [src, tmp1]
        strb    A_lw, [dstin]
        strb    B_lw, [dstin, tmp1]
-       strb    A_hw, [dstend, -1]
-2:     ret
+       strb    C_lw, [dstend, -1]
+L(copy0):
+       ret
 
        .p2align 4
-       /* Copy 64..96 bytes.  Copy 64 bytes from the start and
-          32 bytes from the end.  */
-L(copy96):
+       /* Medium copies: 33..128 bytes.  */
+L(copy32_128):
+       ldp     A_l, A_h, [src]
        ldp     B_l, B_h, [src, 16]
-       ldp     C_l, C_h, [src, 32]
-       ldp     D_l, D_h, [src, 48]
-       ldp     E_l, E_h, [srcend, -32]
-       ldp     F_l, F_h, [srcend, -16]
+       ldp     C_l, C_h, [srcend, -32]
+       ldp     D_l, D_h, [srcend, -16]
+       cmp     count, 64
+       b.hi    L(copy128)
        stp     A_l, A_h, [dstin]
        stp     B_l, B_h, [dstin, 16]
-       stp     C_l, C_h, [dstin, 32]
-       stp     D_l, D_h, [dstin, 48]
-       stp     E_l, E_h, [dstend, -32]
-       stp     F_l, F_h, [dstend, -16]
+       stp     C_l, C_h, [dstend, -32]
+       stp     D_l, D_h, [dstend, -16]
        ret
 
-       /* Align DST to 16 byte alignment so that we don't cross cache line
-          boundaries on both loads and stores.  There are at least 96 bytes
-          to copy, so copy 16 bytes unaligned and then align.  The loop
-          copies 64 bytes per iteration and prefetches one iteration ahead.  */
+       .p2align 4
+       /* Copy 65..128 bytes.  */
+L(copy128):
+       ldp     E_l, E_h, [src, 32]
+       ldp     F_l, F_h, [src, 48]
+       cmp     count, 96
+       b.ls    L(copy96)
+       ldp     G_l, G_h, [srcend, -64]
+       ldp     H_l, H_h, [srcend, -48]
+       stp     G_l, G_h, [dstend, -64]
+       stp     H_l, H_h, [dstend, -48]
+L(copy96):
+       stp     A_l, A_h, [dstin]
+       stp     B_l, B_h, [dstin, 16]
+       stp     E_l, E_h, [dstin, 32]
+       stp     F_l, F_h, [dstin, 48]
+       stp     C_l, C_h, [dstend, -32]
+       stp     D_l, D_h, [dstend, -16]
+       ret
 
        .p2align 4
+       /* Copy more than 128 bytes.  */
 L(copy_long):
+       /* Use backwards copy if there is an overlap.  */
+       sub     tmp1, dstin, src
+       cbz     tmp1, L(copy0)
+       cmp     tmp1, count
+       b.lo    L(copy_long_backwards)
+
+       /* Copy 16 bytes and then align dst to 16-byte alignment.  */
+
+       ldp     D_l, D_h, [src]
        and     tmp1, dstin, 15
        bic     dst, dstin, 15
-       ldp     D_l, D_h, [src]
        sub     src, src, tmp1
        add     count, count, tmp1      /* Count is now 16 too large.  */
        ldp     A_l, A_h, [src, 16]
@@ -186,8 +162,9 @@ L(copy_long):
        ldp     C_l, C_h, [src, 48]
        ldp     D_l, D_h, [src, 64]!
        subs    count, count, 128 + 16  /* Test and readjust count.  */
-       b.ls    2f
-1:
+       b.ls    L(copy64_from_end)
+
+L(loop64):
        stp     A_l, A_h, [dst, 16]
        ldp     A_l, A_h, [src, 16]
        stp     B_l, B_h, [dst, 32]
@@ -197,12 +174,10 @@ L(copy_long):
        stp     D_l, D_h, [dst, 64]!
        ldp     D_l, D_h, [src, 64]!
        subs    count, count, 64
-       b.hi    1b
+       b.hi    L(loop64)
 
-       /* Write the last full set of 64 bytes.  The remainder is at most 64
-          bytes, so it is safe to always copy 64 bytes from the end even if
-          there is just 1 byte left.  */
-2:
+       /* Write the last iteration and copy 64 bytes from the end.  */
+L(copy64_from_end):
        ldp     E_l, E_h, [srcend, -64]
        stp     A_l, A_h, [dst, 16]
        ldp     A_l, A_h, [srcend, -48]
@@ -216,4 +191,52 @@ L(copy_long):
        stp     B_l, B_h, [dstend, -32]
        stp     C_l, C_h, [dstend, -16]
        ret
+
+       .p2align 4
+
+       /* Large backwards copy for overlapping copies.
+          Copy 16 bytes and then align dst to 16-byte alignment.  */
+L(copy_long_backwards):
+       ldp     D_l, D_h, [srcend, -16]
+       and     tmp1, dstend, 15
+       sub     srcend, srcend, tmp1
+       sub     count, count, tmp1
+       ldp     A_l, A_h, [srcend, -16]
+       stp     D_l, D_h, [dstend, -16]
+       ldp     B_l, B_h, [srcend, -32]
+       ldp     C_l, C_h, [srcend, -48]
+       ldp     D_l, D_h, [srcend, -64]!
+       sub     dstend, dstend, tmp1
+       subs    count, count, 128
+       b.ls    L(copy64_from_start)
+
+L(loop64_backwards):
+       stp     A_l, A_h, [dstend, -16]
+       ldp     A_l, A_h, [srcend, -16]
+       stp     B_l, B_h, [dstend, -32]
+       ldp     B_l, B_h, [srcend, -32]
+       stp     C_l, C_h, [dstend, -48]
+       ldp     C_l, C_h, [srcend, -48]
+       stp     D_l, D_h, [dstend, -64]!
+       ldp     D_l, D_h, [srcend, -64]!
+       subs    count, count, 64
+       b.hi    L(loop64_backwards)
+
+       /* Write the last iteration and copy 64 bytes from the start.  */
+L(copy64_from_start):
+       ldp     G_l, G_h, [src, 48]
+       stp     A_l, A_h, [dstend, -16]
+       ldp     A_l, A_h, [src, 32]
+       stp     B_l, B_h, [dstend, -32]
+       ldp     B_l, B_h, [src, 16]
+       stp     C_l, C_h, [dstend, -48]
+       ldp     C_l, C_h, [src]
+       stp     D_l, D_h, [dstend, -64]
+       stp     G_l, G_h, [dstin, 48]
+       stp     A_l, A_h, [dstin, 32]
+       stp     B_l, B_h, [dstin, 16]
+       stp     C_l, C_h, [dstin]
+       ret
+EEND(memmove)
 END(memcpy)
+
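
A note on the small-copy tail above: L(copy4) handles counts of 0..3 without
a per-byte branch. After the cbz for zero, it always loads the first byte,
the byte at count/2 and the last byte, then stores all three; the positions
simply coincide for the shorter counts. A hedged C rendering of the same
trick (illustrative only, hypothetical helper name):

#include <stddef.h>

static void
copy_upto_3(unsigned char *dst, const unsigned char *src, size_t count)
{
        if (count == 0)                    /* cbz count, L(copy0) */
                return;
        size_t half = count >> 1;          /* lsr  tmp1, count, 1 */
        unsigned char a = src[0];          /* ldrb A_lw, [src] */
        unsigned char c = src[count - 1];  /* ldrb C_lw, [srcend, -1] */
        unsigned char b = src[half];       /* ldrb B_lw, [src, tmp1] */
        dst[0] = a;                        /* strb A_lw, [dstin] */
        dst[half] = b;                     /* strb B_lw, [dstin, tmp1] */
        dst[count - 1] = c;                /* strb C_lw, [dstend, -1] */
        /* count == 1 writes the same byte three times, count == 2 writes
         * the second byte twice, count == 3 touches each byte once. */
}
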
diff --git a/sys/arm64/arm64/memmove.S b/sys/arm64/arm64/memmove.S
deleted file mode 100644
index 4b99dccc536e..000000000000
--- a/sys/arm64/arm64/memmove.S
+++ /dev/null
@@ -1,150 +0,0 @@
-/* Copyright (c) 2013, Linaro Limited
-   All rights reserved.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are met:
-       * Redistributions of source code must retain the above copyright
-         notice, this list of conditions and the following disclaimer.
-       * Redistributions in binary form must reproduce the above copyright
-         notice, this list of conditions and the following disclaimer in the
-         documentation and/or other materials provided with the distribution.
-       * Neither the name of the Linaro nor the
-         names of its contributors may be used to endorse or promote products
-         derived from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
-
-/*
- * Copyright (c) 2015 ARM Ltd
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- * 3. The name of the company may not be used to endorse or promote
- *    products derived from this software without specific prior written
- *    permission.
- *
- * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
- * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
- * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <machine/asm.h>
-__FBSDID("$FreeBSD$");
-
-/* Assumptions:
- *
- * ARMv8-a, AArch64, unaligned accesses
- */
-
-/* Parameters and result.  */
-#define dstin  x0
-#define src    x1
-#define count  x2
-#define srcend x3
-#define dstend x4
-#define tmp1   x5
-#define A_l    x6
-#define A_h    x7
-#define B_l    x8
-#define B_h    x9
-#define C_l    x10
-#define C_h    x11
-#define D_l    x12
-#define D_h    x13
-#define E_l    count
-#define E_h    tmp1
-
-/* All memmoves up to 96 bytes are done by memcpy as it supports overlaps.
-   Larger backwards copies are also handled by memcpy. The only remaining
-   case is forward large copies.  The destination is aligned, and an
-   unrolled loop processes 64 bytes per iteration.
-*/
-
-ENTRY(bcopy)
-       /* Switch the input pointers when called as bcopy */
-       mov     x3, x1
-       mov     x1, x0
-       mov     x0, x3
-EENTRY(memmove)
-       sub     tmp1, dstin, src
-       cmp     count, 96
-       ccmp    tmp1, count, 2, hi
-       b.hs    memcpy
-
-       cbz     tmp1, 3f
-       add     dstend, dstin, count
-       add     srcend, src, count
-
-       /* Align dstend to 16 byte alignment so that we don't cross cache line
-          boundaries on both loads and stores.  There are at least 96 bytes
-          to copy, so copy 16 bytes unaligned and then align.  The loop
-          copies 64 bytes per iteration and prefetches one iteration ahead.  */
-
-       and     tmp1, dstend, 15
-       ldp     D_l, D_h, [srcend, -16]
-       sub     srcend, srcend, tmp1
-       sub     count, count, tmp1
-       ldp     A_l, A_h, [srcend, -16]
-       stp     D_l, D_h, [dstend, -16]
-       ldp     B_l, B_h, [srcend, -32]
-       ldp     C_l, C_h, [srcend, -48]
-       ldp     D_l, D_h, [srcend, -64]!
-       sub     dstend, dstend, tmp1
-       subs    count, count, 128
-       b.ls    2f
-       nop
-1:
-       stp     A_l, A_h, [dstend, -16]
-       ldp     A_l, A_h, [srcend, -16]
-       stp     B_l, B_h, [dstend, -32]
-       ldp     B_l, B_h, [srcend, -32]
-       stp     C_l, C_h, [dstend, -48]
-       ldp     C_l, C_h, [srcend, -48]
-       stp     D_l, D_h, [dstend, -64]!
-       ldp     D_l, D_h, [srcend, -64]!
-       subs    count, count, 64
-       b.hi    1b
-
-       /* Write the last full set of 64 bytes.  The remainder is at most 64
-          bytes, so it is safe to always copy 64 bytes from the start even if
-          there is just 1 byte left.  */
-2:
-       ldp     E_l, E_h, [src, 48]
-       stp     A_l, A_h, [dstend, -16]
-       ldp     A_l, A_h, [src, 32]
-       stp     B_l, B_h, [dstend, -32]
-       ldp     B_l, B_h, [src, 16]
-       stp     C_l, C_h, [dstend, -48]
-       ldp     C_l, C_h, [src]
-       stp     D_l, D_h, [dstend, -64]
-       stp     E_l, E_h, [dstin, 48]
-       stp     A_l, A_h, [dstin, 32]
-       stp     B_l, B_h, [dstin, 16]
-       stp     C_l, C_h, [dstin]
-3:     ret
-EEND(memmove)
-END(bcopy)
diff --git a/sys/conf/files.arm64 b/sys/conf/files.arm64
index f9dc5c9740e7..86ada6e4c924 100644
--- a/sys/conf/files.arm64
+++ b/sys/conf/files.arm64
@@ -61,7 +61,6 @@ arm64/arm64/machdep.c                         standard
 arm64/arm64/machdep_boot.c                     standard
 arm64/arm64/mem.c                              standard
 arm64/arm64/memcpy.S                           standard
-arm64/arm64/memmove.S                          standard
 arm64/arm64/minidump_machdep.c                 standard
 arm64/arm64/mp_machdep.c                       optional smp
 arm64/arm64/nexus.c                            standard

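One more detail from the comments in the new memcpy.S: the large forward copy
aligns the destination down to 16 bytes, streams 64 bytes per iteration, and
finishes by unconditionally copying the last 64 bytes from the end of the
buffer, so any 1..64 byte remainder is absorbed without a separate tail loop.
A simplified C sketch of that shape, assuming non-overlapping buffers (the
real routine's software pipelining is what keeps this pattern safe under
overlap, and that is not reproduced here):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Illustrative only; assumes dst and src do not overlap and count > 128. */
static void
copy_large_forward(unsigned char *dstin, const unsigned char *src, size_t count)
{
        unsigned char *dstend = dstin + count;
        const unsigned char *srcend = src + count;
        size_t skew = (uintptr_t)dstin & 15;
        unsigned char *dst;

        /* Copy the first 16 bytes unaligned, then advance both pointers so
         * that dst is 16-byte aligned (the copied prefix covers the gap). */
        memcpy(dstin, src, 16);
        dst = dstin + 16 - skew;
        src = src + 16 - skew;

        /* Main loop: whole 64-byte blocks while more than 64 bytes remain. */
        while (dstend - dst > 64) {
                memcpy(dst, src, 64);
                dst += 64;
                src += 64;
        }

        /* Tail: always copy the last 64 bytes from the end.  Because
         * count > 128 this stays inside the buffers, and it covers whatever
         * remainder the loop left (possibly rewriting a few bytes with the
         * same data). */
        memcpy(dstend - 64, srcend - 64, 64);
}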