The branch main has been updated by jhb:

URL: https://cgit.FreeBSD.org/src/commit/?id=47d997021fbc7b662e9507deec1897d514d1224c

commit 47d997021fbc7b662e9507deec1897d514d1224c
Author:     John Baldwin <j...@freebsd.org>
AuthorDate: 2023-08-29 21:46:44 +0000
Commit:     John Baldwin <j...@freebsd.org>
CommitDate: 2023-08-29 21:46:44 +0000

    libcrypto: Switch back to the generated assembly in sys/crypto/openssl
    
    Reviewed by:    markj
    Differential Revision:  https://reviews.freebsd.org/D41569
---
 secure/lib/libcrypto/Makefile                      |     4 +-
 .../lib/libcrypto/arch/aarch64/aes-gcm-armv8_64.S  |  6390 ---------
 secure/lib/libcrypto/arch/aarch64/aesv8-armx.S     |  3181 -----
 secure/lib/libcrypto/arch/aarch64/arm64cpuid.S     |   130 -
 secure/lib/libcrypto/arch/aarch64/armv8-mont.S     |  2125 ---
 secure/lib/libcrypto/arch/aarch64/chacha-armv8.S   |  2035 ---
 .../libcrypto/arch/aarch64/ecp_nistz256-armv8.S    |  4243 ------
 secure/lib/libcrypto/arch/aarch64/ghashv8-armx.S   |   553 -
 .../lib/libcrypto/arch/aarch64/keccak1600-armv8.S  |  1010 --
 secure/lib/libcrypto/arch/aarch64/poly1305-armv8.S |   864 --
 secure/lib/libcrypto/arch/aarch64/sha1-armv8.S     |  1212 --
 secure/lib/libcrypto/arch/aarch64/sha256-armv8.S   |  2052 ---
 secure/lib/libcrypto/arch/aarch64/sha512-armv8.S   |  1607 ---
 secure/lib/libcrypto/arch/aarch64/vpaes-armv8.S    |  1197 --
 secure/lib/libcrypto/arch/amd64/aes-x86_64.S       |  2680 ----
 secure/lib/libcrypto/arch/amd64/aesni-gcm-x86_64.S |   811 --
 secure/lib/libcrypto/arch/amd64/aesni-mb-x86_64.S  |  1610 ---
 .../lib/libcrypto/arch/amd64/aesni-sha1-x86_64.S   |  3057 -----
 .../lib/libcrypto/arch/amd64/aesni-sha256-x86_64.S |  4457 ------
 secure/lib/libcrypto/arch/amd64/aesni-x86_64.S     |  4507 ------
 secure/lib/libcrypto/arch/amd64/bsaes-x86_64.S     |  2619 ----
 secure/lib/libcrypto/arch/amd64/chacha-x86_64.S    |  2215 ---
 secure/lib/libcrypto/arch/amd64/cmll-x86_64.S      |  1947 ---
 secure/lib/libcrypto/arch/amd64/e_padlock-x86_64.S |  1059 --
 .../lib/libcrypto/arch/amd64/ecp_nistz256-x86_64.S |  7365 ----------
 secure/lib/libcrypto/arch/amd64/ghash-x86_64.S     |  1875 ---
 .../lib/libcrypto/arch/amd64/keccak1600-x86_64.S   |   546 -
 secure/lib/libcrypto/arch/amd64/md5-x86_64.S       |   705 -
 secure/lib/libcrypto/arch/amd64/poly1305-x86_64.S  |  2090 ---
 secure/lib/libcrypto/arch/amd64/rc4-md5-x86_64.S   |  1303 --
 secure/lib/libcrypto/arch/amd64/rc4-x86_64.S       |   657 -
 secure/lib/libcrypto/arch/amd64/rsaz-avx2.S        |  1766 ---
 secure/lib/libcrypto/arch/amd64/rsaz-avx512.S      |   902 --
 secure/lib/libcrypto/arch/amd64/rsaz-x86_64.S      |  2037 ---
 secure/lib/libcrypto/arch/amd64/sha1-mb-x86_64.S   |  7325 ----------
 secure/lib/libcrypto/arch/amd64/sha1-x86_64.S      |  5472 --------
 secure/lib/libcrypto/arch/amd64/sha256-mb-x86_64.S |  8006 -----------
 secure/lib/libcrypto/arch/amd64/sha256-x86_64.S    |  5478 --------
 secure/lib/libcrypto/arch/amd64/sha512-x86_64.S    |  5483 --------
 secure/lib/libcrypto/arch/amd64/vpaes-x86_64.S     |   880 --
 secure/lib/libcrypto/arch/amd64/wp-x86_64.S        |   901 --
 secure/lib/libcrypto/arch/amd64/x25519-x86_64.S    |   824 --
 secure/lib/libcrypto/arch/amd64/x86_64-gf2m.S      |   333 -
 secure/lib/libcrypto/arch/amd64/x86_64-mont.S      |  1261 --
 secure/lib/libcrypto/arch/amd64/x86_64-mont5.S     |  3625 -----
 secure/lib/libcrypto/arch/amd64/x86_64cpuid.S      |   513 -
 secure/lib/libcrypto/arch/arm/aes-armv4.S          |  1198 --
 secure/lib/libcrypto/arch/arm/aesv8-armx.S         |  1088 --
 secure/lib/libcrypto/arch/arm/armv4-gf2m.S         |   236 -
 secure/lib/libcrypto/arch/arm/armv4-mont.S         |   961 --
 secure/lib/libcrypto/arch/arm/armv4cpuid.S         |   273 -
 secure/lib/libcrypto/arch/arm/bsaes-armv7.S        |  2561 ----
 secure/lib/libcrypto/arch/arm/chacha-armv4.S       |  1478 --
 secure/lib/libcrypto/arch/arm/ecp_nistz256-armv4.S |  4430 ------
 secure/lib/libcrypto/arch/arm/ghash-armv4.S        |   565 -
 secure/lib/libcrypto/arch/arm/ghashv8-armx.S       |   244 -
 secure/lib/libcrypto/arch/arm/keccak1600-armv4.S   |  2694 ----
 secure/lib/libcrypto/arch/arm/poly1305-armv4.S     |  1169 --
 secure/lib/libcrypto/arch/arm/sha1-armv4-large.S   |  1499 --
 secure/lib/libcrypto/arch/arm/sha256-armv4.S       |  2823 ----
 secure/lib/libcrypto/arch/arm/sha512-armv4.S       |  1877 ---
 secure/lib/libcrypto/arch/i386/aes-586.S           |  6644 ---------
 secure/lib/libcrypto/arch/i386/aesni-x86.S         |  6732 ---------
 secure/lib/libcrypto/arch/i386/bf-586.S            |  1928 ---
 secure/lib/libcrypto/arch/i386/bn-586.S            |  3157 -----
 secure/lib/libcrypto/arch/i386/cast-586.S          |  2002 ---
 secure/lib/libcrypto/arch/i386/chacha-x86.S        |  2084 ---
 secure/lib/libcrypto/arch/i386/cmll-x86.S          |  4896 -------
 secure/lib/libcrypto/arch/i386/co-586.S            |  2584 ----
 secure/lib/libcrypto/arch/i386/crypt586.S          |  1800 ---
 secure/lib/libcrypto/arch/i386/des-586.S           |  3932 ------
 secure/lib/libcrypto/arch/i386/e_padlock-x86.S     |  2300 ----
 secure/lib/libcrypto/arch/i386/ecp_nistz256-x86.S  | 10584 --------------
 secure/lib/libcrypto/arch/i386/ghash-x86.S         |  2636 ----
 secure/lib/libcrypto/arch/i386/md5-586.S           |  1404 --
 secure/lib/libcrypto/arch/i386/poly1305-x86.S      |  3938 ------
 secure/lib/libcrypto/arch/i386/rc4-586.S           |   819 --
 secure/lib/libcrypto/arch/i386/rc5-586.S           |  1264 --
 secure/lib/libcrypto/arch/i386/rmd-586.S           |  3976 ------
 secure/lib/libcrypto/arch/i386/sha1-586.S          |  8016 -----------
 secure/lib/libcrypto/arch/i386/sha256-586.S        | 13612 -------------------
 secure/lib/libcrypto/arch/i386/sha512-586.S        |  5704 --------
 secure/lib/libcrypto/arch/i386/vpaes-x86.S         |  1488 --
 secure/lib/libcrypto/arch/i386/wp-mmx.S            |  2260 ---
 secure/lib/libcrypto/arch/i386/x86-gf2m.S          |   755 -
 secure/lib/libcrypto/arch/i386/x86-mont.S          |   995 --
 secure/lib/libcrypto/arch/i386/x86cpuid.S          |  1217 --
 secure/lib/libcrypto/arch/powerpc/aes-ppc.S        |  1561 ---
 secure/lib/libcrypto/arch/powerpc/aesp8-ppc.S      |  3642 -----
 secure/lib/libcrypto/arch/powerpc/bn-ppc.S         |  1855 ---
 secure/lib/libcrypto/arch/powerpc/chacha-ppc.S     |  1492 --
 secure/lib/libcrypto/arch/powerpc/ghashp8-ppc.S    |   569 -
 secure/lib/libcrypto/arch/powerpc/poly1305-ppc.S   |  1301 --
 secure/lib/libcrypto/arch/powerpc/poly1305-ppcfp.S |   586 -
 secure/lib/libcrypto/arch/powerpc/ppc-mont.S       |  1787 ---
 secure/lib/libcrypto/arch/powerpc/ppc.S            |  1855 ---
 secure/lib/libcrypto/arch/powerpc/ppccpuid.S       |   356 -
 secure/lib/libcrypto/arch/powerpc/sha1-ppc.S       |  1118 --
 secure/lib/libcrypto/arch/powerpc/sha256-ppc.S     |  1321 --
 secure/lib/libcrypto/arch/powerpc/sha256p8-ppc.S   |   735 -
 secure/lib/libcrypto/arch/powerpc/sha512-ppc.S     |  3071 -----
 secure/lib/libcrypto/arch/powerpc/sha512p8-ppc.S   |   833 --
 secure/lib/libcrypto/arch/powerpc/vpaes-ppc.S      |  1468 --
 secure/lib/libcrypto/arch/powerpc64/aes-ppc.S      |  1533 ---
 secure/lib/libcrypto/arch/powerpc64/aesp8-ppc.S    |  3659 -----
 secure/lib/libcrypto/arch/powerpc64/bn-ppc.S       |  1876 ---
 secure/lib/libcrypto/arch/powerpc64/chacha-ppc.S   |  1499 --
 .../libcrypto/arch/powerpc64/ecp_nistp521-ppc64.S  |   354 -
 .../libcrypto/arch/powerpc64/ecp_nistz256-ppc64.S  |  4854 -------
 secure/lib/libcrypto/arch/powerpc64/ghashp8-ppc.S  |   576 -
 .../libcrypto/arch/powerpc64/keccak1600-ppc64.S    |   670 -
 secure/lib/libcrypto/arch/powerpc64/poly1305-ppc.S |  1142 --
 .../lib/libcrypto/arch/powerpc64/poly1305-ppcfp.S  |   596 -
 secure/lib/libcrypto/arch/powerpc64/ppc-mont.S     |  1790 ---
 secure/lib/libcrypto/arch/powerpc64/ppc.S          |  1876 ---
 secure/lib/libcrypto/arch/powerpc64/ppccpuid.S     |   387 -
 secure/lib/libcrypto/arch/powerpc64/sha1-ppc.S     |  1121 --
 secure/lib/libcrypto/arch/powerpc64/sha256-ppc.S   |  1324 --
 secure/lib/libcrypto/arch/powerpc64/sha256p8-ppc.S |   738 -
 secure/lib/libcrypto/arch/powerpc64/sha512-ppc.S   |  1420 --
 secure/lib/libcrypto/arch/powerpc64/sha512p8-ppc.S |   836 --
 secure/lib/libcrypto/arch/powerpc64/vpaes-ppc.S    |  1479 --
 secure/lib/libcrypto/arch/powerpc64/x25519-ppc64.S |   349 -
 secure/lib/libcrypto/arch/powerpc64le/aes-ppc.S    |  1581 ---
 secure/lib/libcrypto/arch/powerpc64le/aesp8-ppc.S  |  3659 -----
 secure/lib/libcrypto/arch/powerpc64le/bn-ppc.S     |  1876 ---
 secure/lib/libcrypto/arch/powerpc64le/chacha-ppc.S |  1371 --
 .../arch/powerpc64le/ecp_nistp521-ppc64.S          |   354 -
 .../arch/powerpc64le/ecp_nistz256-ppc64.S          |  4854 -------
 .../lib/libcrypto/arch/powerpc64le/ghashp8-ppc.S   |   576 -
 .../libcrypto/arch/powerpc64le/keccak1600-ppc64.S  |   670 -
 .../lib/libcrypto/arch/powerpc64le/poly1305-ppc.S  |  1128 --
 .../libcrypto/arch/powerpc64le/poly1305-ppcfp.S    |   591 -
 secure/lib/libcrypto/arch/powerpc64le/ppc-mont.S   |  1790 ---
 secure/lib/libcrypto/arch/powerpc64le/ppc.S        |  1876 ---
 secure/lib/libcrypto/arch/powerpc64le/ppccpuid.S   |   387 -
 secure/lib/libcrypto/arch/powerpc64le/sha1-ppc.S   |  1169 --
 secure/lib/libcrypto/arch/powerpc64le/sha256-ppc.S |  1372 --
 .../lib/libcrypto/arch/powerpc64le/sha256p8-ppc.S  |   746 -
 secure/lib/libcrypto/arch/powerpc64le/sha512-ppc.S |  1516 ---
 .../lib/libcrypto/arch/powerpc64le/sha512p8-ppc.S  |   848 --
 secure/lib/libcrypto/arch/powerpc64le/vpaes-ppc.S  |  1479 --
 .../lib/libcrypto/arch/powerpc64le/x25519-ppc64.S  |   349 -
 secure/lib/libcrypto/engines/padlock/Makefile      |     2 +-
 secure/lib/libcrypto/modules/fips/Makefile         |     4 +-
 145 files changed, 5 insertions(+), 310557 deletions(-)

diff --git a/secure/lib/libcrypto/Makefile b/secure/lib/libcrypto/Makefile
index ab9044ad67f9..585e89861815 100644
--- a/secure/lib/libcrypto/Makefile
+++ b/secure/lib/libcrypto/Makefile
@@ -618,12 +618,12 @@ buildasm cleanasm:
 PICFLAG+=      -DOPENSSL_PIC
 
 .if defined(ASM_${MACHINE_CPUARCH})
-.PATH: ${SRCTOP}/secure/lib/libcrypto/arch/${MACHINE_CPUARCH}
+.PATH: ${SRCTOP}/sys/crypto/openssl/${MACHINE_CPUARCH}
 .if defined(ASM_amd64)
 .PATH: ${LCRYPTO_SRC}/crypto/bn/asm
 .endif
 .elif defined(ASM_${MACHINE_ARCH})
-.PATH: ${SRCTOP}/secure/lib/libcrypto/arch/${MACHINE_ARCH}
+.PATH: ${SRCTOP}/sys/crypto/openssl/${MACHINE_ARCH}
 .endif
 
 .PATH: ${LCRYPTO_SRC}/crypto \
diff --git a/secure/lib/libcrypto/arch/aarch64/aes-gcm-armv8_64.S b/secure/lib/libcrypto/arch/aarch64/aes-gcm-armv8_64.S
deleted file mode 100644
index eb85dbc9f996..000000000000
--- a/secure/lib/libcrypto/arch/aarch64/aes-gcm-armv8_64.S
+++ /dev/null
@@ -1,6390 +0,0 @@
-/* Do not modify. This file is auto-generated from aes-gcm-armv8_64.pl. */
-#include "arm_arch.h"
-
-#if __ARM_MAX_ARCH__>=8
-.arch  armv8-a+crypto
-.text
-.globl aes_gcm_enc_128_kernel
-.type  aes_gcm_enc_128_kernel,%function
-.align 4
-aes_gcm_enc_128_kernel:
-       cbz     x1, .L128_enc_ret
-       stp     x19, x20, [sp, #-112]!
-       mov     x16, x4
-       mov     x8, x5
-       stp     x21, x22, [sp, #16]
-       stp     x23, x24, [sp, #32]
-       stp     d8, d9, [sp, #48]
-       stp     d10, d11, [sp, #64]
-       stp     d12, d13, [sp, #80]
-       stp     d14, d15, [sp, #96]
-
-       ldp     x10, x11, [x16]              //ctr96_b64, ctr96_t32
-#ifdef __AARCH64EB__
-       rev     x10, x10
-       rev     x11, x11
-#endif
-       ldp     x13, x14, [x8, #160]                     //load rk10
-#ifdef __AARCH64EB__
-       ror     x13, x13, #32
-       ror     x14, x14, #32
-#endif
-       ld1     {v11.16b}, [x3]
-       ext     v11.16b, v11.16b, v11.16b, #8
-       rev64   v11.16b, v11.16b
-       lsr     x5, x1, #3              //byte_len
-       mov     x15, x5
-
-       ld1     {v18.4s}, [x8], #16                                             
                  //load rk0
-       add     x4, x0, x1, lsr #3   //end_input_ptr
-       sub     x5, x5, #1      //byte_len - 1
-
-       lsr     x12, x11, #32
-       ldr     q15, [x3, #112]                        //load h4l | h4h
-#ifndef __AARCH64EB__
-       ext     v15.16b, v15.16b, v15.16b, #8
-#endif
-       fmov    d1, x10                               //CTR block 1
-       rev     w12, w12                                //rev_ctr32
-
-       add     w12, w12, #1                            //increment rev_ctr32
-       orr     w11, w11, w11
-       ld1     {v19.4s}, [x8], #16                                             
                  //load rk1
-
-       rev     w9, w12                                 //CTR block 1
-       add     w12, w12, #1                            //CTR block 1
-       fmov    d3, x10                               //CTR block 3
-
-       orr     x9, x11, x9, lsl #32            //CTR block 1
-       ld1     { v0.16b}, [x16]                             //special case 
vector load initial counter so we can start first AES block as quickly as 
possible
-
-       fmov    v1.d[1], x9                               //CTR block 1
-       rev     w9, w12                                 //CTR block 2
-
-       fmov    d2, x10                               //CTR block 2
-       orr     x9, x11, x9, lsl #32            //CTR block 2
-       add     w12, w12, #1                            //CTR block 2
-
-       fmov    v2.d[1], x9                               //CTR block 2
-       rev     w9, w12                                 //CTR block 3
-
-       orr     x9, x11, x9, lsl #32            //CTR block 3
-       ld1     {v20.4s}, [x8], #16                                             
                  //load rk2
-
-       add     w12, w12, #1                            //CTR block 3
-       fmov    v3.d[1], x9                               //CTR block 3
-
-       ldr     q14, [x3, #80]                         //load h3l | h3h
-#ifndef __AARCH64EB__
-       ext     v14.16b, v14.16b, v14.16b, #8
-#endif
-       aese    v1.16b, v18.16b
-       aesmc   v1.16b, v1.16b          //AES block 1 - round 0
-       ld1     {v21.4s}, [x8], #16                                             
                  //load rk3
-
-       aese    v2.16b, v18.16b
-       aesmc   v2.16b, v2.16b          //AES block 2 - round 0
-       ldr     q12, [x3, #32]                         //load h1l | h1h
-#ifndef __AARCH64EB__
-       ext     v12.16b, v12.16b, v12.16b, #8
-#endif
-
-       aese    v0.16b, v18.16b
-       aesmc   v0.16b, v0.16b          //AES block 0 - round 0
-       ld1     {v22.4s}, [x8], #16                                             
                  //load rk4
-
-       aese    v3.16b, v18.16b
-       aesmc   v3.16b, v3.16b          //AES block 3 - round 0
-       ld1     {v23.4s}, [x8], #16                                             
                  //load rk5
-
-       aese    v2.16b, v19.16b
-       aesmc   v2.16b, v2.16b          //AES block 2 - round 1
-       trn2    v17.2d,  v14.2d,    v15.2d                      //h4l | h3l
-
-       aese    v0.16b, v19.16b
-       aesmc   v0.16b, v0.16b          //AES block 0 - round 1
-       ld1     {v24.4s}, [x8], #16                                             
                  //load rk6
-
-       aese    v1.16b, v19.16b
-       aesmc   v1.16b, v1.16b          //AES block 1 - round 1
-       ld1     {v25.4s}, [x8], #16                                             
                  //load rk7
-
-       aese    v3.16b, v19.16b
-       aesmc   v3.16b, v3.16b          //AES block 3 - round 1
-       trn1    v9.2d, v14.2d,    v15.2d                      //h4h | h3h
-
-       aese    v0.16b, v20.16b
-       aesmc   v0.16b, v0.16b          //AES block 0 - round 2
-       ld1     {v26.4s}, [x8], #16                                             
                  //load rk8
-
-       aese    v1.16b, v20.16b
-       aesmc   v1.16b, v1.16b          //AES block 1 - round 2
-       ldr     q13, [x3, #64]                         //load h2l | h2h
-#ifndef __AARCH64EB__
-       ext     v13.16b, v13.16b, v13.16b, #8
-#endif
-
-       aese    v3.16b, v20.16b
-       aesmc   v3.16b, v3.16b          //AES block 3 - round 2
-
-       aese    v2.16b, v20.16b
-       aesmc   v2.16b, v2.16b          //AES block 2 - round 2
-       eor     v17.16b, v17.16b, v9.16b                  //h4k | h3k
-
-       aese    v0.16b, v21.16b
-       aesmc   v0.16b, v0.16b          //AES block 0 - round 3
-
-       aese    v1.16b, v21.16b
-       aesmc   v1.16b, v1.16b          //AES block 1 - round 3
-
-       aese    v2.16b, v21.16b
-       aesmc   v2.16b, v2.16b          //AES block 2 - round 3
-       ld1     {v27.4s}, [x8], #16                                             
                  //load rk9
-
-       aese    v3.16b, v21.16b
-       aesmc   v3.16b, v3.16b          //AES block 3 - round 3
-
-       and     x5, x5, #0xffffffffffffffc0    //number of bytes to be 
processed in main loop (at least 1 byte must be handled by tail)
-       trn2    v16.2d,  v12.2d,    v13.2d                      //h2l | h1l
-
-       aese    v3.16b, v22.16b
-       aesmc   v3.16b, v3.16b          //AES block 3 - round 4
-       add     x5, x5, x0
-
-       aese    v2.16b, v22.16b
-       aesmc   v2.16b, v2.16b          //AES block 2 - round 4
-       cmp     x0, x5                   //check if we have <= 4 blocks
-
-       aese    v0.16b, v22.16b
-       aesmc   v0.16b, v0.16b          //AES block 0 - round 4
-
-       aese    v3.16b, v23.16b
-       aesmc   v3.16b, v3.16b          //AES block 3 - round 5
-
-       aese    v2.16b, v23.16b
-       aesmc   v2.16b, v2.16b          //AES block 2 - round 5
-
-       aese    v0.16b, v23.16b
-       aesmc   v0.16b, v0.16b          //AES block 0 - round 5
-
-       aese    v3.16b, v24.16b
-       aesmc   v3.16b, v3.16b          //AES block 3 - round 6
-
-       aese    v1.16b, v22.16b
-       aesmc   v1.16b, v1.16b          //AES block 1 - round 4
-
-       aese    v2.16b, v24.16b
-       aesmc   v2.16b, v2.16b          //AES block 2 - round 6
-       trn1    v8.2d,    v12.2d,    v13.2d                      //h2h | h1h
-
-       aese    v0.16b, v24.16b
-       aesmc   v0.16b, v0.16b          //AES block 0 - round 6
-
-       aese    v1.16b, v23.16b
-       aesmc   v1.16b, v1.16b          //AES block 1 - round 5
-
-       aese    v3.16b, v25.16b
-       aesmc   v3.16b, v3.16b          //AES block 3 - round 7
-
-       aese    v0.16b, v25.16b
-       aesmc   v0.16b, v0.16b          //AES block 0 - round 7
-
-       aese    v1.16b, v24.16b
-       aesmc   v1.16b, v1.16b          //AES block 1 - round 6
-
-       aese    v2.16b, v25.16b
-       aesmc   v2.16b, v2.16b          //AES block 2 - round 7
-
-       aese    v0.16b, v26.16b
-       aesmc   v0.16b, v0.16b          //AES block 0 - round 8
-
-       aese    v1.16b, v25.16b
-       aesmc   v1.16b, v1.16b          //AES block 1 - round 7
-
-       aese    v2.16b, v26.16b
-       aesmc   v2.16b, v2.16b          //AES block 2 - round 8
-
-       aese    v3.16b, v26.16b
-       aesmc   v3.16b, v3.16b          //AES block 3 - round 8
-
-       aese    v1.16b, v26.16b
-       aesmc   v1.16b, v1.16b          //AES block 1 - round 8
-
-       aese    v2.16b, v27.16b                                      //AES 
block 2 - round 9
-
-       aese    v0.16b, v27.16b                                      //AES 
block 0 - round 9
-
-       eor     v16.16b, v16.16b, v8.16b                     //h2k | h1k
-
-       aese    v1.16b, v27.16b                                      //AES 
block 1 - round 9
-
-       aese    v3.16b, v27.16b                                      //AES 
block 3 - round 9
-       b.ge    .L128_enc_tail                                    //handle tail
-
-       ldp     x6, x7, [x0, #0]            //AES block 0 - load plaintext
-#ifdef __AARCH64EB__
-       rev     x6, x6
-       rev     x7, x7
-#endif
-       ldp     x21, x22, [x0, #32]           //AES block 2 - load plaintext
-#ifdef __AARCH64EB__
-       rev     x21, x21
-       rev     x22, x22
-#endif
-       ldp     x19, x20, [x0, #16]           //AES block 1 - load plaintext
-#ifdef __AARCH64EB__
-       rev     x19, x19
-       rev     x20, x20
-#endif
-       ldp     x23, x24, [x0, #48]           //AES block 3 - load plaintext
-#ifdef __AARCH64EB__
-       rev     x23, x23
-       rev     x24, x24
-#endif
-       eor     x6, x6, x13                     //AES block 0 - round 10 low
-       eor     x7, x7, x14                     //AES block 0 - round 10 high
-
-       eor     x21, x21, x13                     //AES block 2 - round 10 low
-       fmov    d4, x6                               //AES block 0 - mov low
-
-       eor     x19, x19, x13                     //AES block 1 - round 10 low
-       eor     x22, x22, x14                     //AES block 2 - round 10 high
-       fmov    v4.d[1], x7                           //AES block 0 - mov high
-
-       fmov    d5, x19                               //AES block 1 - mov low
-       eor     x20, x20, x14                     //AES block 1 - round 10 high
-
-       eor     x23, x23, x13                     //AES block 3 - round 10 low
-       fmov    v5.d[1], x20                           //AES block 1 - mov high
-
-       fmov    d6, x21                               //AES block 2 - mov low
-       eor     x24, x24, x14                     //AES block 3 - round 10 high
-       rev     w9, w12                                 //CTR block 4
-
-       fmov    v6.d[1], x22                           //AES block 2 - mov high
-       orr     x9, x11, x9, lsl #32            //CTR block 4
-
-       eor     v4.16b, v4.16b, v0.16b                          //AES block 0 - 
result
-       fmov    d0, x10                               //CTR block 4
-       add     w12, w12, #1                            //CTR block 4
-
-       fmov    v0.d[1], x9                               //CTR block 4
-       rev     w9, w12                                 //CTR block 5
-
-       eor     v5.16b, v5.16b, v1.16b                          //AES block 1 - 
result
-       fmov    d1, x10                               //CTR block 5
-       orr     x9, x11, x9, lsl #32            //CTR block 5
-
-       add     w12, w12, #1                            //CTR block 5
-       add     x0, x0, #64                       //AES input_ptr update
-       fmov    v1.d[1], x9                               //CTR block 5
-
-       fmov    d7, x23                               //AES block 3 - mov low
-       rev     w9, w12                                 //CTR block 6
-       st1     { v4.16b}, [x2], #16                     //AES block 0 - store 
result
-
-       fmov    v7.d[1], x24                           //AES block 3 - mov high
-       orr     x9, x11, x9, lsl #32            //CTR block 6
-
-       add     w12, w12, #1                            //CTR block 6
-       eor     v6.16b, v6.16b, v2.16b                          //AES block 2 - 
result
-       st1     { v5.16b}, [x2], #16                     //AES block 1 - store 
result
-
-       fmov    d2, x10                               //CTR block 6
-       cmp     x0, x5                   //check if we have <= 8 blocks
-
-       fmov    v2.d[1], x9                               //CTR block 6
-       rev     w9, w12                                 //CTR block 7
-       st1     { v6.16b}, [x2], #16                     //AES block 2 - store 
result
-
-       orr     x9, x11, x9, lsl #32            //CTR block 7
-
-       eor     v7.16b, v7.16b, v3.16b                          //AES block 3 - 
result
-       st1     { v7.16b}, [x2], #16                     //AES block 3 - store 
result
-       b.ge    .L128_enc_prepretail                              //do 
prepretail
-
-.L128_enc_main_loop:   //main  loop start
-       ldp     x23, x24, [x0, #48]           //AES block 4k+3 - load plaintext
-#ifdef __AARCH64EB__
-       rev     x23, x23
-       rev     x24, x24
-#endif
-       rev64   v4.16b, v4.16b                                    //GHASH block 
4k (only t0 is free)
-       rev64   v6.16b, v6.16b                                    //GHASH block 
4k+2 (t0, t1, and t2 free)
-
-       aese    v2.16b, v18.16b
-       aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 0
-       fmov    d3, x10                               //CTR block 4k+3
-
-       ext     v11.16b, v11.16b, v11.16b, #8                     //PRE 0
-       rev64   v5.16b, v5.16b                                    //GHASH block 
4k+1 (t0 and t1 free)
-
-       aese    v1.16b, v18.16b
-       aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 0
-       add     w12, w12, #1                            //CTR block 4k+3
-       fmov    v3.d[1], x9                               //CTR block 4k+3
-
-       aese    v0.16b, v18.16b
-       aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 0
-       mov     d31, v6.d[1]                                  //GHASH block 
4k+2 - mid
-
-       aese    v2.16b, v19.16b
-       aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 1
-       mov     d30, v5.d[1]                                  //GHASH block 
4k+1 - mid
-
-       aese    v1.16b, v19.16b
-       aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 1
-       eor     v4.16b, v4.16b, v11.16b                           //PRE 1
-
-       aese    v3.16b, v18.16b
-       aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 0
-       eor     x24, x24, x14                     //AES block 4k+3 - round 10 
high
-
-       pmull2  v28.1q, v5.2d, v14.2d                          //GHASH block 
4k+1 - high
-       eor     v31.8b, v31.8b, v6.8b                          //GHASH block 
4k+2 - mid
-       ldp     x6, x7, [x0, #0]            //AES block 4k+4 - load plaintext
-#ifdef __AARCH64EB__
-       rev     x6, x6
-       rev     x7, x7
-#endif
-       aese    v0.16b, v19.16b
-       aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 1
-       rev     w9, w12                                 //CTR block 4k+8
-
-       eor     v30.8b, v30.8b, v5.8b                          //GHASH block 
4k+1 - mid
-       mov     d8, v4.d[1]                                  //GHASH block 4k - 
mid
-       orr     x9, x11, x9, lsl #32            //CTR block 4k+8
-
-       pmull2  v9.1q, v4.2d, v15.2d                       //GHASH block 4k - 
high
-       add     w12, w12, #1                            //CTR block 4k+8
-       mov     d10, v17.d[1]                               //GHASH block 4k - 
mid
-
-       aese    v0.16b, v20.16b
-       aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 2
-
-       pmull   v11.1q, v4.1d, v15.1d                       //GHASH block 4k - 
low
-       eor     v8.8b, v8.8b, v4.8b                          //GHASH block 4k - 
mid
-
-       aese    v1.16b, v20.16b
-       aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 2
-
-       aese    v0.16b, v21.16b
-       aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 3
-       eor     v9.16b, v9.16b, v28.16b                         //GHASH block 
4k+1 - high
-
-       pmull   v28.1q, v6.1d, v13.1d                          //GHASH block 
4k+2 - low
-
-       pmull   v10.1q, v8.1d, v10.1d                      //GHASH block 4k - 
mid
-       rev64   v7.16b, v7.16b                                    //GHASH block 
4k+3 (t0, t1, t2 and t3 free)
-
-       pmull   v30.1q, v30.1d, v17.1d                          //GHASH block 
4k+1 - mid
-
-       pmull   v29.1q, v5.1d, v14.1d                          //GHASH block 
4k+1 - low
-       ins     v31.d[1], v31.d[0]                                //GHASH block 
4k+2 - mid
-
-       pmull2  v8.1q, v6.2d, v13.2d                          //GHASH block 
4k+2 - high
-       eor     x7, x7, x14                     //AES block 4k+4 - round 10 high
-
-       eor     v10.16b, v10.16b, v30.16b                         //GHASH block 
4k+1 - mid
-       mov     d30, v7.d[1]                                  //GHASH block 
4k+3 - mid
-
-       aese    v3.16b, v19.16b
-       aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 1
-       eor     v11.16b, v11.16b, v29.16b                         //GHASH block 
4k+1 - low
-
-       aese    v2.16b, v20.16b
-       aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 2
-       eor     x6, x6, x13                     //AES block 4k+4 - round 10 low
-
-       aese    v1.16b, v21.16b
-       aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 3
-       eor     v30.8b, v30.8b, v7.8b                          //GHASH block 
4k+3 - mid
-
-       pmull2  v4.1q, v7.2d, v12.2d                          //GHASH block 
4k+3 - high
-
-       aese    v2.16b, v21.16b
-       aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 3
-       eor     v9.16b, v9.16b, v8.16b                         //GHASH block 
4k+2 - high
-
-       pmull2  v31.1q, v31.2d, v16.2d                          //GHASH block 
4k+2 - mid
-
-       pmull   v29.1q, v7.1d, v12.1d                          //GHASH block 
4k+3 - low
-       movi    v8.8b, #0xc2
-
-       pmull   v30.1q, v30.1d, v16.1d                          //GHASH block 
4k+3 - mid
-       eor     v11.16b, v11.16b, v28.16b                         //GHASH block 
4k+2 - low
-
-       aese    v1.16b, v22.16b
-       aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 4
-
-       aese    v3.16b, v20.16b
-       aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 2
-       shl     d8, d8, #56               //mod_constant
-
-       aese    v0.16b, v22.16b
-       aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 4
-       eor     v9.16b, v9.16b, v4.16b                         //GHASH block 
4k+3 - high
-
-       aese    v1.16b, v23.16b
-       aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 5
-       ldp     x19, x20, [x0, #16]           //AES block 4k+5 - load plaintext
-#ifdef __AARCH64EB__
-       rev     x19, x19
-       rev     x20, x20
-#endif
-       aese    v3.16b, v21.16b
-       aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 3
-       eor     v10.16b, v10.16b, v31.16b                         //GHASH block 
4k+2 - mid
-
-       aese    v0.16b, v23.16b
-       aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 5
-       ldp     x21, x22, [x0, #32]           //AES block 4k+6 - load plaintext
-#ifdef __AARCH64EB__
-       rev     x21, x21
-       rev     x22, x22
-#endif
-       pmull   v31.1q, v9.1d, v8.1d            //MODULO - top 64b align with 
mid
-       eor     v11.16b, v11.16b, v29.16b                         //GHASH block 
4k+3 - low
-
-       aese    v2.16b, v22.16b
-       aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 4
-       eor     x19, x19, x13                     //AES block 4k+5 - round 10 
low
-
-       aese    v3.16b, v22.16b
-       aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 4
-       eor     v10.16b, v10.16b, v30.16b                         //GHASH block 
4k+3 - mid
-
-       aese    v1.16b, v24.16b
-       aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 6
-       eor     x23, x23, x13                     //AES block 4k+3 - round 10 
low
-
-       aese    v2.16b, v23.16b
-       aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 5
-       eor     v30.16b, v11.16b, v9.16b                         //MODULO - 
karatsuba tidy up
-
-       fmov    d4, x6                               //AES block 4k+4 - mov low
-       aese    v0.16b, v24.16b
-       aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 6
-       fmov    v4.d[1], x7                           //AES block 4k+4 - mov 
high
-
-       add     x0, x0, #64                       //AES input_ptr update
-       fmov    d7, x23                               //AES block 4k+3 - mov low
-       ext     v9.16b, v9.16b, v9.16b, #8                     //MODULO - other 
top alignment
-
-       aese    v3.16b, v23.16b
-       aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 5
-       fmov    d5, x19                               //AES block 4k+5 - mov low
-
-       aese    v0.16b, v25.16b
-       aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 7
-       eor     v10.16b, v10.16b, v30.16b                         //MODULO - karatsuba tidy up
-
-       aese    v2.16b, v24.16b
-       aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 6
-       eor     x20, x20, x14                     //AES block 4k+5 - round 10 high
-
-       aese    v1.16b, v25.16b
-       aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 7
-       fmov    v5.d[1], x20                           //AES block 4k+5 - mov high
-
-       aese    v0.16b, v26.16b
-       aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 8
-       fmov    v7.d[1], x24                           //AES block 4k+3 - mov high
-
-       aese    v3.16b, v24.16b
-       aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 6
-       cmp     x0, x5                   //.LOOP CONTROL
-
-       aese    v1.16b, v26.16b
-       aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 8
-       eor     v10.16b, v10.16b, v31.16b                      //MODULO - fold into mid
-
-       aese    v0.16b, v27.16b                                      //AES block 4k+4 - round 9
-       eor     x21, x21, x13                     //AES block 4k+6 - round 10 low
-       eor     x22, x22, x14                     //AES block 4k+6 - round 10 high
-
-       aese    v3.16b, v25.16b
-       aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 7
-       fmov    d6, x21                               //AES block 4k+6 - mov low
-
-       aese    v1.16b, v27.16b                                      //AES block 4k+5 - round 9
-       fmov    v6.d[1], x22                           //AES block 4k+6 - mov high
-
-       aese    v2.16b, v25.16b
-       aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 7
-       eor     v4.16b, v4.16b, v0.16b                          //AES block 4k+4 - result
-
-       fmov    d0, x10                               //CTR block 4k+8
-       aese    v3.16b, v26.16b
-       aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 8
-
-       fmov    v0.d[1], x9                               //CTR block 4k+8
-       rev     w9, w12                                 //CTR block 4k+9
-       eor     v10.16b, v10.16b, v9.16b                         //MODULO - fold into mid
-
-       aese    v2.16b, v26.16b
-       aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 8
-       eor     v5.16b, v5.16b, v1.16b                          //AES block 4k+5 - result
-
-       add     w12, w12, #1                            //CTR block 4k+9
-       orr     x9, x11, x9, lsl #32            //CTR block 4k+9
-       fmov    d1, x10                               //CTR block 4k+9
-
-       pmull   v9.1q, v10.1d, v8.1d            //MODULO - mid 64b align with low
-       fmov    v1.d[1], x9                               //CTR block 4k+9
-       rev     w9, w12                                 //CTR block 4k+10
-
-       aese    v2.16b, v27.16b                                      //AES block 4k+6 - round 9
-       st1     { v4.16b}, [x2], #16                     //AES block 4k+4 - store result
-       eor     v6.16b, v6.16b, v2.16b                          //AES block 4k+6 - result
-       orr     x9, x11, x9, lsl #32            //CTR block 4k+10
-
-       aese    v3.16b, v27.16b                                      //AES block 4k+7 - round 9
-       add     w12, w12, #1                            //CTR block 4k+10
-       ext     v10.16b, v10.16b, v10.16b, #8                     //MODULO - other mid alignment
-       fmov    d2, x10                               //CTR block 4k+10
-
-       eor     v11.16b, v11.16b, v9.16b                         //MODULO - fold into low
-       st1     { v5.16b}, [x2], #16                     //AES block 4k+5 - store result
-
-       fmov    v2.d[1], x9                               //CTR block 4k+10
-       st1     { v6.16b}, [x2], #16                     //AES block 4k+6 - store result
-       rev     w9, w12                                 //CTR block 4k+11
-
-       orr     x9, x11, x9, lsl #32            //CTR block 4k+11
-       eor     v7.16b, v7.16b, v3.16b                          //AES block 4k+3 - result
-
-       eor     v11.16b, v11.16b, v10.16b                         //MODULO - fold into low
-       st1     { v7.16b}, [x2], #16                     //AES block 4k+3 - store result
-       b.lt    .L128_enc_main_loop
-
-.L128_enc_prepretail:  //PREPRETAIL
-       rev64   v4.16b, v4.16b                                    //GHASH block 4k (only t0 is free)
-       fmov    d3, x10                               //CTR block 4k+3
-       rev64   v5.16b, v5.16b                                    //GHASH block 4k+1 (t0 and t1 free)
-
-       ext     v11.16b, v11.16b, v11.16b, #8                     //PRE 0
-       add     w12, w12, #1                            //CTR block 4k+3
-       fmov    v3.d[1], x9                               //CTR block 4k+3
-
-       aese    v1.16b, v18.16b
-       aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 0
-       rev64   v6.16b, v6.16b                                    //GHASH block 4k+2 (t0, t1, and t2 free)
-
-       pmull   v29.1q, v5.1d, v14.1d                          //GHASH block 4k+1 - low
-
-       rev64   v7.16b, v7.16b                                    //GHASH block 4k+3 (t0, t1, t2 and t3 free)
-       eor     v4.16b, v4.16b, v11.16b                           //PRE 1
-
-       pmull2  v28.1q, v5.2d, v14.2d                          //GHASH block 4k+1 - high
-
-       aese    v3.16b, v18.16b
-       aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 0
-       mov     d30, v5.d[1]                                  //GHASH block 4k+1 - mid
-
-       pmull   v11.1q, v4.1d, v15.1d                       //GHASH block 4k - low
-       mov     d8, v4.d[1]                                  //GHASH block 4k - mid
-
-       mov     d31, v6.d[1]                                  //GHASH block 4k+2 - mid
-       mov     d10, v17.d[1]                               //GHASH block 4k - mid
-
-       aese    v1.16b, v19.16b
-       aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 1
-       eor     v30.8b, v30.8b, v5.8b                          //GHASH block 4k+1 - mid
-
-       eor     v8.8b, v8.8b, v4.8b                          //GHASH block 4k - mid
-
-       pmull2  v9.1q, v4.2d, v15.2d                       //GHASH block 4k - high
-       eor     v31.8b, v31.8b, v6.8b                          //GHASH block 4k+2 - mid
-
-       aese    v3.16b, v19.16b
-       aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 1
-
-       pmull   v30.1q, v30.1d, v17.1d                          //GHASH block 4k+1 - mid
-       eor     v11.16b, v11.16b, v29.16b                         //GHASH block 4k+1 - low
-
-       pmull   v10.1q, v8.1d, v10.1d                      //GHASH block 4k - mid
-
-       aese    v0.16b, v18.16b
-       aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 0
-       ins     v31.d[1], v31.d[0]                                //GHASH block 4k+2 - mid
-
-       aese    v2.16b, v18.16b
-       aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 0
-
-       eor     v10.16b, v10.16b, v30.16b                         //GHASH block 4k+1 - mid
-       mov     d30, v7.d[1]                                  //GHASH block 4k+3 - mid
-
-       aese    v0.16b, v19.16b
-       aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 1
-       eor     v9.16b, v9.16b, v28.16b                         //GHASH block 4k+1 - high
-
-       pmull2  v31.1q, v31.2d, v16.2d                          //GHASH block 4k+2 - mid
-
-       pmull2  v8.1q, v6.2d, v13.2d                          //GHASH block 4k+2 - high
-       eor     v30.8b, v30.8b, v7.8b                          //GHASH block 4k+3 - mid
-
-       pmull2  v4.1q, v7.2d, v12.2d                          //GHASH block 4k+3 - high
-
-       pmull   v28.1q, v6.1d, v13.1d                          //GHASH block 4k+2 - low
-
-       aese    v2.16b, v19.16b
-       aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 1
-       eor     v9.16b, v9.16b, v8.16b                         //GHASH block 4k+2 - high
-
-       aese    v0.16b, v20.16b
-       aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 2
-
-       pmull   v29.1q, v7.1d, v12.1d                          //GHASH block 4k+3 - low
-       movi    v8.8b, #0xc2
-
-       aese    v2.16b, v20.16b
-       aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 2
-       eor     v11.16b, v11.16b, v28.16b                         //GHASH block 4k+2 - low
-
-       aese    v3.16b, v20.16b
-       aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 2
-
-       pmull   v30.1q, v30.1d, v16.1d                          //GHASH block 4k+3 - mid
-       eor     v10.16b, v10.16b, v31.16b                         //GHASH block 4k+2 - mid
-
-       aese    v2.16b, v21.16b
-       aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 3
-
-       aese    v1.16b, v20.16b
-       aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 2
-       eor     v9.16b, v9.16b, v4.16b                         //GHASH block 4k+3 - high
-
-       aese    v0.16b, v21.16b
-       aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 3
-
-       eor     v10.16b, v10.16b, v30.16b                         //GHASH block 4k+3 - mid
-       shl     d8, d8, #56               //mod_constant
-
-       aese    v1.16b, v21.16b
-       aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 3
-       eor     v11.16b, v11.16b, v29.16b                         //GHASH block 4k+3 - low
-
-       aese    v0.16b, v22.16b
-       aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 4
-
-       pmull   v28.1q, v9.1d, v8.1d
-       eor     v10.16b, v10.16b, v9.16b                         //karatsuba tidy up
-
-       aese    v1.16b, v22.16b
-       aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 4
-
-       aese    v0.16b, v23.16b
-       aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 5
-       ext     v9.16b, v9.16b, v9.16b, #8
-
-       aese    v3.16b, v21.16b
-       aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 3
-
-       aese    v2.16b, v22.16b
-       aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 4
-       eor     v10.16b, v10.16b, v11.16b
-
-       aese    v0.16b, v24.16b
-       aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 6
-
-       aese    v3.16b, v22.16b
-       aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 4
-
-       aese    v1.16b, v23.16b
-       aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 5
-
-       aese    v2.16b, v23.16b
-       aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 5
-       eor     v10.16b, v10.16b, v28.16b
-
-       aese    v3.16b, v23.16b
-       aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 5
-
-       aese    v1.16b, v24.16b
-       aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 6
-
-       aese    v2.16b, v24.16b
-       aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 6
-
-       aese    v3.16b, v24.16b
-       aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 6
-       eor     v10.16b, v10.16b, v9.16b
-
-       aese    v0.16b, v25.16b
-       aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 7
-
-       aese    v2.16b, v25.16b
-       aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 7
-
-       aese    v3.16b, v25.16b
-       aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 7
-
-       pmull   v28.1q, v10.1d, v8.1d
-
-       aese    v1.16b, v25.16b
-       aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 7
-       ext     v10.16b, v10.16b, v10.16b, #8
-
-       aese    v3.16b, v26.16b
-       aesmc   v3.16b, v3.16b          //AES block 4k+7 - round 8
-
-       aese    v0.16b, v26.16b
-       aesmc   v0.16b, v0.16b          //AES block 4k+4 - round 8
-       eor     v11.16b, v11.16b, v28.16b
-
-       aese    v1.16b, v26.16b
-       aesmc   v1.16b, v1.16b          //AES block 4k+5 - round 8
-
-       aese    v3.16b, v27.16b                                      //AES block 4k+7 - round 9
-
-       aese    v2.16b, v26.16b
-       aesmc   v2.16b, v2.16b          //AES block 4k+6 - round 8
-
-       aese    v0.16b, v27.16b                                      //AES block 4k+4 - round 9
-
-       aese    v1.16b, v27.16b                                      //AES block 4k+5 - round 9
-       eor     v11.16b, v11.16b, v10.16b
-
-       aese    v2.16b, v27.16b                                      //AES block 4k+6 - round 9
-.L128_enc_tail:        //TAIL
-
-       sub     x5, x4, x0   //main_end_input_ptr is number of bytes left to process
-       ldp     x6, x7, [x0], #16           //AES block 4k+4 - load plaintext
-#ifdef __AARCH64EB__
-       rev     x6, x6
-       rev     x7, x7
-#endif
-       cmp     x5, #48
-
-       ext     v8.16b, v11.16b, v11.16b, #8                     //prepare final partial tag
-       eor     x6, x6, x13                     //AES block 4k+4 - round 10 low
-       eor     x7, x7, x14                     //AES block 4k+4 - round 10 high
-
-       fmov    d4, x6                               //AES block 4k+4 - mov low
-
-       fmov    v4.d[1], x7                           //AES block 4k+4 - mov high
-
-       eor     v5.16b, v4.16b, v0.16b                          //AES block 4k+4 - result
-
-       b.gt    .L128_enc_blocks_more_than_3
-
-       sub     w12, w12, #1
-       movi    v11.8b, #0
-       mov     v3.16b, v2.16b
-
-       cmp     x5, #32
-       mov     v2.16b, v1.16b
-       movi    v9.8b, #0
-
-       movi    v10.8b, #0
-       b.gt    .L128_enc_blocks_more_than_2
-
-       mov     v3.16b, v1.16b
-       cmp     x5, #16
-
-       sub     w12, w12, #1
-       b.gt    .L128_enc_blocks_more_than_1
-
-       sub     w12, w12, #1
-       b       .L128_enc_blocks_less_than_1
-.L128_enc_blocks_more_than_3:  //blocks        left >  3
-       st1     { v5.16b}, [x2], #16                     //AES final-3 block  - store result
-
-       ldp     x6, x7, [x0], #16           //AES final-2 block - load input low & high
-#ifdef __AARCH64EB__
-       rev     x6, x6
-       rev     x7, x7
-#endif
-       rev64   v4.16b, v5.16b                                    //GHASH final-3 block
*** 310630 LINES SKIPPED ***

Reply via email to