Author: jkim
Date: Wed Oct 26 20:02:22 2016
New Revision: 307976
URL: https://svnweb.freebsd.org/changeset/base/307976

Log:
  Build OpenSSL assembly sources for aarch64.  Tested with ThunderX by andrew.

Added:
  head/secure/lib/libcrypto/aarch64/
  head/secure/lib/libcrypto/aarch64/aesv8-armx.S   (contents, props changed)
  head/secure/lib/libcrypto/aarch64/ghashv8-armx.S   (contents, props changed)
  head/secure/lib/libcrypto/aarch64/sha1-armv8.S   (contents, props changed)
  head/secure/lib/libcrypto/aarch64/sha256-armv8.S   (contents, props changed)
  head/secure/lib/libcrypto/aarch64/sha512-armv8.S   (contents, props changed)
Modified:
  head/crypto/openssl/crypto/aes/asm/aesv8-armx.pl
  head/crypto/openssl/crypto/arm64cpuid.S
  head/crypto/openssl/crypto/modes/asm/ghashv8-armx.pl
  head/secure/lib/libcrypto/Makefile
  head/secure/lib/libcrypto/Makefile.asm
  head/secure/lib/libcrypto/Makefile.inc
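
For context on how the new files are consumed at run time: libcrypto does not
call the ARMv8 routines unconditionally.  armcap.c (pulled in by the new
ASM_aarch64 branch below) probes the CPU once, records the result in
OPENSSL_armcap_P, and the C code keys off that word.  A minimal sketch of the
dispatch test, assuming the flag names from OpenSSL's arm_arch.h:

    #include "arm_arch.h"           /* defines ARMV8_AES, ARMV8_PMULL, ... */

    extern unsigned int OPENSSL_armcap_P;  /* filled in by armcap.c */

    static int
    have_armv8_aes(void)
    {
            /* Nonzero only when the CPU probe found the AES instructions,
             * i.e. when the aesv8-armx.S routines are safe to call. */
            return ((OPENSSL_armcap_P & ARMV8_AES) != 0);
    }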

Modified: head/crypto/openssl/crypto/aes/asm/aesv8-armx.pl
==============================================================================
--- head/crypto/openssl/crypto/aes/asm/aesv8-armx.pl    Wed Oct 26 18:47:47 2016        (r307975)
+++ head/crypto/openssl/crypto/aes/asm/aesv8-armx.pl    Wed Oct 26 20:02:22 2016        (r307976)
@@ -42,7 +42,7 @@ $code=<<___;
 #if __ARM_MAX_ARCH__>=7
 .text
 ___
-$code.=".arch  armv8-a+crypto\n"                       if ($flavour =~ /64/);
+# $code.=".arch        armv8-a+crypto\n"                       if ($flavour =~ 
/64/);
 $code.=".arch  armv7-a\n.fpu   neon\n.code     32\n"   if ($flavour !~ /64/);
                #^^^^^^ this is done to simplify adoption by not depending
                #       on latest binutils.

Modified: head/crypto/openssl/crypto/arm64cpuid.S
==============================================================================
--- head/crypto/openssl/crypto/arm64cpuid.S     Wed Oct 26 18:47:47 2016        (r307975)
+++ head/crypto/openssl/crypto/arm64cpuid.S     Wed Oct 26 20:02:22 2016        (r307976)
@@ -1,7 +1,6 @@
 #include "arm_arch.h"
 
 .text
-.arch  armv8-a+crypto
 
 .align 5
 .global        _armv7_neon_probe
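
The probes exported here (such as _armv7_neon_probe above) each execute a
single instruction from the feature being tested; armcap.c runs them under a
SIGILL handler and treats a trap as "feature absent".  A hedged sketch of
that pattern, with the handler plumbing assumed rather than copied from
armcap.c:

    #include <setjmp.h>
    #include <signal.h>
    #include <string.h>

    extern void _armv7_neon_probe(void);    /* from arm64cpuid.S */

    static sigjmp_buf probe_jmp;

    static void
    ill_handler(int sig)
    {
            (void)sig;
            siglongjmp(probe_jmp, 1);
    }

    static int
    have_neon(void)
    {
            struct sigaction sa, old;
            int ok;

            memset(&sa, 0, sizeof(sa));
            sa.sa_handler = ill_handler;
            sigaction(SIGILL, &sa, &old);
            ok = (sigsetjmp(probe_jmp, 1) == 0);
            if (ok)
                    _armv7_neon_probe();    /* traps if NEON is missing */
            sigaction(SIGILL, &old, NULL);
            return (ok);
    }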

Modified: head/crypto/openssl/crypto/modes/asm/ghashv8-armx.pl
==============================================================================
--- head/crypto/openssl/crypto/modes/asm/ghashv8-armx.pl        Wed Oct 26 18:47:47 2016        (r307975)
+++ head/crypto/openssl/crypto/modes/asm/ghashv8-armx.pl        Wed Oct 26 20:02:22 2016        (r307976)
@@ -49,7 +49,7 @@ $code=<<___;
 
 .text
 ___
-$code.=".arch  armv8-a+crypto\n"       if ($flavour =~ /64/);
+# $code.=".arch        armv8-a+crypto\n"       if ($flavour =~ /64/);
 $code.=".fpu   neon\n.code     32\n"   if ($flavour !~ /64/);
 
 
################################################################################

Modified: head/secure/lib/libcrypto/Makefile
==============================================================================
--- head/secure/lib/libcrypto/Makefile  Wed Oct 26 18:47:47 2016        (r307975)
+++ head/secure/lib/libcrypto/Makefile  Wed Oct 26 20:02:22 2016        (r307976)
@@ -22,7 +22,10 @@ MAN+=        config.5 des_modes.7
 # base sources
 SRCS=  cpt_err.c cryptlib.c cversion.c ex_data.c mem.c mem_dbg.c o_dir.c \
        o_fips.c o_init.c o_str.c o_time.c uid.c
-.if defined(ASM_amd64)
+.if defined(ASM_aarch64)
+SRCS+= arm64cpuid.S armcap.c mem_clr.c
+CFLAGS.arm64cpuid.S=   -march=armv8-a+crypto
+.elif defined(ASM_amd64)
 SRCS+= x86_64cpuid.S
 .elif defined(ASM_arm)
 SRCS+= armcap.c armv4cpuid.S
@@ -35,7 +38,10 @@ INCS+=       crypto.h ebcdic.h opensslv.h ossl
 
 # aes
 SRCS+= aes_cfb.c aes_ctr.c aes_ecb.c aes_ige.c aes_misc.c aes_ofb.c aes_wrap.c
-.if defined(ASM_amd64)
+.if defined(ASM_aarch64)
+SRCS+= aes_cbc.c aes_core.c aesv8-armx.S
+CFLAGS.aesv8-armx.S=   -march=armv8-a+crypto
+.elif defined(ASM_amd64)
 SRCS+= aes-x86_64.S aesni-mb-x86_64.S aesni-sha1-x86_64.S \
        aesni-sha256-x86_64.S aesni-x86_64.S bsaes-x86_64.S vpaes-x86_64.S
 .elif defined(ASM_arm)
@@ -238,7 +244,10 @@ INCS+=     mdc2.h
 # modes
 SRCS+= cbc128.c ccm128.c cfb128.c ctr128.c cts128.c gcm128.c ofb128.c \
        wrap128.c xts128.c
-.if defined(ASM_amd64)
+.if defined(ASM_aarch64)
+SRCS+= ghashv8-armx.S
+CFLAGS.ghashv8-armx.S= -march=armv8-a+crypto
+.elif defined(ASM_amd64)
 SRCS+= aesni-gcm-x86_64.S ghash-x86_64.S
 .elif defined(ASM_arm)
 SRCS+= ghash-armv4.S ghashv8-armx.S
@@ -324,7 +333,9 @@ INCS+=      seed.h
 
 # sha
 SRCS+= sha1_one.c sha1dgst.c sha256.c sha512.c sha_dgst.c sha_one.c
-.if defined(ASM_amd64)
+.if defined(ASM_aarch64)
+SRCS+= sha1-armv8.S sha256-armv8.S sha512-armv8.S
+.elif defined(ASM_amd64)
 SRCS+= sha1-mb-x86_64.S sha1-x86_64.S sha256-mb-x86_64.S sha256-x86_64.S \
        sha512-x86_64.S
 .elif defined(ASM_arm)
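
The per-file CFLAGS above take over from the ".arch armv8-a+crypto" directive
that the perlasm hunks comment out: older binutils reject that directive in
the source, but the compiler driver can pass the equivalent -march flag for
just the files that need the crypto extension (the sha*-armv8.S files are
added without it).  As a standalone illustration (a hypothetical test
program, not part of the tree), the ACLE intrinsics that map to the same
AESE/AESMC instructions also build only under that flag:

    /* cc -march=armv8-a+crypto aese.c */
    #include <arm_neon.h>
    #include <stdio.h>

    int
    main(void)
    {
            uint8x16_t block = vdupq_n_u8(0);
            uint8x16_t rkey = vdupq_n_u8(0);

            /* vaeseq_u8/vaesmcq_u8 emit AESE/AESMC, the instructions
             * aesv8-armx.S uses; without +crypto the compile fails. */
            block = vaesmcq_u8(vaeseq_u8(block, rkey));
            printf("%u\n", vgetq_lane_u8(block, 0));
            return (0);
    }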

Modified: head/secure/lib/libcrypto/Makefile.asm
==============================================================================
--- head/secure/lib/libcrypto/Makefile.asm      Wed Oct 26 18:47:47 2016        (r307975)
+++ head/secure/lib/libcrypto/Makefile.asm      Wed Oct 26 20:02:22 2016        (r307976)
@@ -6,7 +6,44 @@
 
 .include "Makefile.inc"
 
-.if defined(ASM_amd64)
+.if defined(ASM_aarch64)
+
+.PATH: ${LCRYPTO_SRC}/crypto \
+       ${LCRYPTO_SRC}/crypto/aes/asm \
+       ${LCRYPTO_SRC}/crypto/modes/asm \
+       ${LCRYPTO_SRC}/crypto/sha/asm
+
+PERLPATH=      -I${LCRYPTO_SRC}/crypto/perlasm
+
+# aes
+SRCS=  aesv8-armx.pl
+
+# modes
+SRCS+= ghashv8-armx.pl
+
+# sha
+SRCS+= sha1-armv8.pl sha512-armv8.pl
+
+ASM=   ${SRCS:R:S/$/.S/} sha256-armv8.S
+
+all:   ${ASM}
+
+CLEANFILES=    ${ASM} ${SRCS:R:S/$/.s/} sha256-armv8.s
+.SUFFIXES:     .pl
+
+sha256-armv8.S:        sha512-armv8.pl
+       env CC=cc perl ${.ALLSRC} 64 ${.TARGET:R:S/$/.s/}
+       ( echo '/* $$'FreeBSD'$$ */' ;\
+       echo '/* Do not modify. This file is auto-generated from ${.ALLSRC:T:R:S/$/.pl/}. */' ;\
+       cat ${.TARGET:R:S/$/.s/}) > ${.TARGET}
+
+.pl.S:
+       env CC=cc perl ${.IMPSRC} 64 ${.TARGET:R:S/$/.s/}
+       ( echo '/* $$'FreeBSD'$$ */' ;\
+       echo '/* Do not modify. This file is auto-generated from ${.IMPSRC:T:R:S/$/.pl/}. */' ;\
+       cat ${.TARGET:R:S/$/.s/}) > ${.TARGET}
+
+.elif defined(ASM_amd64)
 
 .PATH: ${LCRYPTO_SRC}/crypto \
        ${LCRYPTO_SRC}/crypto/aes/asm \

Modified: head/secure/lib/libcrypto/Makefile.inc
==============================================================================
--- head/secure/lib/libcrypto/Makefile.inc      Wed Oct 26 18:47:47 2016        (r307975)
+++ head/secure/lib/libcrypto/Makefile.inc      Wed Oct 26 20:02:22 2016        (r307976)
@@ -21,7 +21,9 @@ CFLAGS+=-DL_ENDIAN
 CFLAGS+=-DB_ENDIAN
 .endif
 
-.if ${MACHINE_CPUARCH} == "amd64" || ${MACHINE_CPUARCH} == "i386"
+.if ${MACHINE_CPUARCH} == "aarch64" || ${MACHINE_CPUARCH} == "arm"
+ASM_${MACHINE_CPUARCH}=
+.elif ${MACHINE_CPUARCH} == "amd64" || ${MACHINE_CPUARCH} == "i386"
 _ASM_AVX!=     { \
                    echo vzeroall | \
                    ${CC} -x assembler -o /dev/null -c - 2> /dev/null; \
@@ -29,11 +31,11 @@ _ASM_AVX!=  { \
 .if ${_ASM_AVX} == yes
 ASM_${MACHINE_CPUARCH}=
 .endif
-.elif ${MACHINE_CPUARCH} == "arm"
-ASM_arm=
 .endif
 
-.if defined(ASM_amd64)
+.if defined(ASM_aarch64)
+CFLAGS+=-DSHA1_ASM -DSHA256_ASM -DSHA512_ASM
+.elif defined(ASM_amd64)
 CFLAGS+=-DOPENSSL_IA32_SSE2
 CFLAGS+=-DAES_ASM -DBSAES_ASM -DVPAES_ASM
 CFLAGS+=-DECP_NISTZ256_ASM
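
The new -DSHA1_ASM/-DSHA256_ASM/-DSHA512_ASM defines switch the C digest code
from its portable compression function to an external one, which the
sha*-armv8.S files supply.  A sketch of the assumed contract (the prototype
follows OpenSSL's internal headers; callers keep using SHA1()/EVP unchanged):

    #include <stddef.h>
    #include <openssl/sha.h>

    /*
     * Provided by sha1-armv8.S when SHA1_ASM is defined: compresses
     * 'num' 64-byte blocks at 'p' into the context.  sha256-armv8.S
     * and sha512-armv8.S supply the analogous *_block_data_order
     * entry points.
     */
    void sha1_block_data_order(SHA_CTX *c, const void *p, size_t num);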

Added: head/secure/lib/libcrypto/aarch64/aesv8-armx.S
==============================================================================
--- /dev/null   00:00:00 1970   (empty, because file is newly added)
+++ head/secure/lib/libcrypto/aarch64/aesv8-armx.S      Wed Oct 26 20:02:22 2016        (r307976)
@@ -0,0 +1,748 @@
+/* $FreeBSD$ */
+/* Do not modify. This file is auto-generated from aesv8-armx.pl. */
+#include "arm_arch.h"
+
+#if __ARM_MAX_ARCH__>=7
+.text
+.align 5
+rcon:
+.long  0x01,0x01,0x01,0x01
+.long  0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d     // rotate-n-splat
+.long  0x1b,0x1b,0x1b,0x1b
+
+.globl aes_v8_set_encrypt_key
+.type  aes_v8_set_encrypt_key,%function
+.align 5
+aes_v8_set_encrypt_key:
+.Lenc_key:
+       stp     x29,x30,[sp,#-16]!
+       add     x29,sp,#0
+       mov     x3,#-1
+       cmp     x0,#0
+       b.eq    .Lenc_key_abort
+       cmp     x2,#0
+       b.eq    .Lenc_key_abort
+       mov     x3,#-2
+       cmp     w1,#128
+       b.lt    .Lenc_key_abort
+       cmp     w1,#256
+       b.gt    .Lenc_key_abort
+       tst     w1,#0x3f
+       b.ne    .Lenc_key_abort
+
+       adr     x3,rcon
+       cmp     w1,#192
+
+       eor     v0.16b,v0.16b,v0.16b
+       ld1     {v3.16b},[x0],#16
+       mov     w1,#8           // reuse w1
+       ld1     {v1.4s,v2.4s},[x3],#32
+
+       b.lt    .Loop128
+       b.eq    .L192
+       b       .L256
+
+.align 4
+.Loop128:
+       tbl     v6.16b,{v3.16b},v2.16b
+       ext     v5.16b,v0.16b,v3.16b,#12
+       st1     {v3.4s},[x2],#16
+       aese    v6.16b,v0.16b
+       subs    w1,w1,#1
+
+       eor     v3.16b,v3.16b,v5.16b
+       ext     v5.16b,v0.16b,v5.16b,#12
+       eor     v3.16b,v3.16b,v5.16b
+       ext     v5.16b,v0.16b,v5.16b,#12
+        eor    v6.16b,v6.16b,v1.16b
+       eor     v3.16b,v3.16b,v5.16b
+       shl     v1.16b,v1.16b,#1
+       eor     v3.16b,v3.16b,v6.16b
+       b.ne    .Loop128
+
+       ld1     {v1.4s},[x3]
+
+       tbl     v6.16b,{v3.16b},v2.16b
+       ext     v5.16b,v0.16b,v3.16b,#12
+       st1     {v3.4s},[x2],#16
+       aese    v6.16b,v0.16b
+
+       eor     v3.16b,v3.16b,v5.16b
+       ext     v5.16b,v0.16b,v5.16b,#12
+       eor     v3.16b,v3.16b,v5.16b
+       ext     v5.16b,v0.16b,v5.16b,#12
+        eor    v6.16b,v6.16b,v1.16b
+       eor     v3.16b,v3.16b,v5.16b
+       shl     v1.16b,v1.16b,#1
+       eor     v3.16b,v3.16b,v6.16b
+
+       tbl     v6.16b,{v3.16b},v2.16b
+       ext     v5.16b,v0.16b,v3.16b,#12
+       st1     {v3.4s},[x2],#16
+       aese    v6.16b,v0.16b
+
+       eor     v3.16b,v3.16b,v5.16b
+       ext     v5.16b,v0.16b,v5.16b,#12
+       eor     v3.16b,v3.16b,v5.16b
+       ext     v5.16b,v0.16b,v5.16b,#12
+        eor    v6.16b,v6.16b,v1.16b
+       eor     v3.16b,v3.16b,v5.16b
+       eor     v3.16b,v3.16b,v6.16b
+       st1     {v3.4s},[x2]
+       add     x2,x2,#0x50
+
+       mov     w12,#10
+       b       .Ldone
+
+.align 4
+.L192:
+       ld1     {v4.8b},[x0],#8
+       movi    v6.16b,#8                       // borrow v6.16b
+       st1     {v3.4s},[x2],#16
+       sub     v2.16b,v2.16b,v6.16b    // adjust the mask
+
+.Loop192:
+       tbl     v6.16b,{v4.16b},v2.16b
+       ext     v5.16b,v0.16b,v3.16b,#12
+       st1     {v4.8b},[x2],#8
+       aese    v6.16b,v0.16b
+       subs    w1,w1,#1
+
+       eor     v3.16b,v3.16b,v5.16b
+       ext     v5.16b,v0.16b,v5.16b,#12
+       eor     v3.16b,v3.16b,v5.16b
+       ext     v5.16b,v0.16b,v5.16b,#12
+       eor     v3.16b,v3.16b,v5.16b
+
+       dup     v5.4s,v3.s[3]
+       eor     v5.16b,v5.16b,v4.16b
+        eor    v6.16b,v6.16b,v1.16b
+       ext     v4.16b,v0.16b,v4.16b,#12
+       shl     v1.16b,v1.16b,#1
+       eor     v4.16b,v4.16b,v5.16b
+       eor     v3.16b,v3.16b,v6.16b
+       eor     v4.16b,v4.16b,v6.16b
+       st1     {v3.4s},[x2],#16
+       b.ne    .Loop192
+
+       mov     w12,#12
+       add     x2,x2,#0x20
+       b       .Ldone
+
+.align 4
+.L256:
+       ld1     {v4.16b},[x0]
+       mov     w1,#7
+       mov     w12,#14
+       st1     {v3.4s},[x2],#16
+
+.Loop256:
+       tbl     v6.16b,{v4.16b},v2.16b
+       ext     v5.16b,v0.16b,v3.16b,#12
+       st1     {v4.4s},[x2],#16
+       aese    v6.16b,v0.16b
+       subs    w1,w1,#1
+
+       eor     v3.16b,v3.16b,v5.16b
+       ext     v5.16b,v0.16b,v5.16b,#12
+       eor     v3.16b,v3.16b,v5.16b
+       ext     v5.16b,v0.16b,v5.16b,#12
+        eor    v6.16b,v6.16b,v1.16b
+       eor     v3.16b,v3.16b,v5.16b
+       shl     v1.16b,v1.16b,#1
+       eor     v3.16b,v3.16b,v6.16b
+       st1     {v3.4s},[x2],#16
+       b.eq    .Ldone
+
+       dup     v6.4s,v3.s[3]           // just splat
+       ext     v5.16b,v0.16b,v4.16b,#12
+       aese    v6.16b,v0.16b
+
+       eor     v4.16b,v4.16b,v5.16b
+       ext     v5.16b,v0.16b,v5.16b,#12
+       eor     v4.16b,v4.16b,v5.16b
+       ext     v5.16b,v0.16b,v5.16b,#12
+       eor     v4.16b,v4.16b,v5.16b
+
+       eor     v4.16b,v4.16b,v6.16b
+       b       .Loop256
+
+.Ldone:
+       str     w12,[x2]
+       mov     x3,#0
+
+.Lenc_key_abort:
+       mov     x0,x3                   // return value
+       ldr     x29,[sp],#16
+       ret
+.size  aes_v8_set_encrypt_key,.-aes_v8_set_encrypt_key
+
+.globl aes_v8_set_decrypt_key
+.type  aes_v8_set_decrypt_key,%function
+.align 5
+aes_v8_set_decrypt_key:
+       stp     x29,x30,[sp,#-16]!
+       add     x29,sp,#0
+       bl      .Lenc_key
+
+       cmp     x0,#0
+       b.ne    .Ldec_key_abort
+
+       sub     x2,x2,#240              // restore original x2
+       mov     x4,#-16
+       add     x0,x2,x12,lsl#4 // end of key schedule
+
+       ld1     {v0.4s},[x2]
+       ld1     {v1.4s},[x0]
+       st1     {v0.4s},[x0],x4
+       st1     {v1.4s},[x2],#16
+
+.Loop_imc:
+       ld1     {v0.4s},[x2]
+       ld1     {v1.4s},[x0]
+       aesimc  v0.16b,v0.16b
+       aesimc  v1.16b,v1.16b
+       st1     {v0.4s},[x0],x4
+       st1     {v1.4s},[x2],#16
+       cmp     x0,x2
+       b.hi    .Loop_imc
+
+       ld1     {v0.4s},[x2]
+       aesimc  v0.16b,v0.16b
+       st1     {v0.4s},[x0]
+
+       eor     x0,x0,x0                // return value
+.Ldec_key_abort:
+       ldp     x29,x30,[sp],#16
+       ret
+.size  aes_v8_set_decrypt_key,.-aes_v8_set_decrypt_key
+.globl aes_v8_encrypt
+.type  aes_v8_encrypt,%function
+.align 5
+aes_v8_encrypt:
+       ldr     w3,[x2,#240]
+       ld1     {v0.4s},[x2],#16
+       ld1     {v2.16b},[x0]
+       sub     w3,w3,#2
+       ld1     {v1.4s},[x2],#16
+
+.Loop_enc:
+       aese    v2.16b,v0.16b
+       aesmc   v2.16b,v2.16b
+       ld1     {v0.4s},[x2],#16
+       subs    w3,w3,#2
+       aese    v2.16b,v1.16b
+       aesmc   v2.16b,v2.16b
+       ld1     {v1.4s},[x2],#16
+       b.gt    .Loop_enc
+
+       aese    v2.16b,v0.16b
+       aesmc   v2.16b,v2.16b
+       ld1     {v0.4s},[x2]
+       aese    v2.16b,v1.16b
+       eor     v2.16b,v2.16b,v0.16b
+
+       st1     {v2.16b},[x1]
+       ret
+.size  aes_v8_encrypt,.-aes_v8_encrypt
+.globl aes_v8_decrypt
+.type  aes_v8_decrypt,%function
+.align 5
+aes_v8_decrypt:
+       ldr     w3,[x2,#240]
+       ld1     {v0.4s},[x2],#16
+       ld1     {v2.16b},[x0]
+       sub     w3,w3,#2
+       ld1     {v1.4s},[x2],#16
+
+.Loop_dec:
+       aesd    v2.16b,v0.16b
+       aesimc  v2.16b,v2.16b
+       ld1     {v0.4s},[x2],#16
+       subs    w3,w3,#2
+       aesd    v2.16b,v1.16b
+       aesimc  v2.16b,v2.16b
+       ld1     {v1.4s},[x2],#16
+       b.gt    .Loop_dec
+
+       aesd    v2.16b,v0.16b
+       aesimc  v2.16b,v2.16b
+       ld1     {v0.4s},[x2]
+       aesd    v2.16b,v1.16b
+       eor     v2.16b,v2.16b,v0.16b
+
+       st1     {v2.16b},[x1]
+       ret
+.size  aes_v8_decrypt,.-aes_v8_decrypt
+.globl aes_v8_cbc_encrypt
+.type  aes_v8_cbc_encrypt,%function
+.align 5
+aes_v8_cbc_encrypt:
+       stp     x29,x30,[sp,#-16]!
+       add     x29,sp,#0
+       subs    x2,x2,#16
+       mov     x8,#16
+       b.lo    .Lcbc_abort
+       csel    x8,xzr,x8,eq
+
+       cmp     w5,#0                   // en- or decrypting?
+       ldr     w5,[x3,#240]
+       and     x2,x2,#-16
+       ld1     {v6.16b},[x4]
+       ld1     {v0.16b},[x0],x8
+
+       ld1     {v16.4s-v17.4s},[x3]            // load key schedule...
+       sub     w5,w5,#6
+       add     x7,x3,x5,lsl#4  // pointer to last 7 round keys
+       sub     w5,w5,#2
+       ld1     {v18.4s-v19.4s},[x7],#32
+       ld1     {v20.4s-v21.4s},[x7],#32
+       ld1     {v22.4s-v23.4s},[x7],#32
+       ld1     {v7.4s},[x7]
+
+       add     x7,x3,#32
+       mov     w6,w5
+       b.eq    .Lcbc_dec
+
+       cmp     w5,#2
+       eor     v0.16b,v0.16b,v6.16b
+       eor     v5.16b,v16.16b,v7.16b
+       b.eq    .Lcbc_enc128
+
+       ld1     {v2.4s-v3.4s},[x7]
+       add     x7,x3,#16
+       add     x6,x3,#16*4
+       add     x12,x3,#16*5
+       aese    v0.16b,v16.16b
+       aesmc   v0.16b,v0.16b
+       add     x14,x3,#16*6
+       add     x3,x3,#16*7
+       b       .Lenter_cbc_enc
+
+.align 4
+.Loop_cbc_enc:
+       aese    v0.16b,v16.16b
+       aesmc   v0.16b,v0.16b
+        st1    {v6.16b},[x1],#16
+.Lenter_cbc_enc:
+       aese    v0.16b,v17.16b
+       aesmc   v0.16b,v0.16b
+       aese    v0.16b,v2.16b
+       aesmc   v0.16b,v0.16b
+       ld1     {v16.4s},[x6]
+       cmp     w5,#4
+       aese    v0.16b,v3.16b
+       aesmc   v0.16b,v0.16b
+       ld1     {v17.4s},[x12]
+       b.eq    .Lcbc_enc192
+
+       aese    v0.16b,v16.16b
+       aesmc   v0.16b,v0.16b
+       ld1     {v16.4s},[x14]
+       aese    v0.16b,v17.16b
+       aesmc   v0.16b,v0.16b
+       ld1     {v17.4s},[x3]
+       nop
+
+.Lcbc_enc192:
+       aese    v0.16b,v16.16b
+       aesmc   v0.16b,v0.16b
+        subs   x2,x2,#16
+       aese    v0.16b,v17.16b
+       aesmc   v0.16b,v0.16b
+        csel   x8,xzr,x8,eq
+       aese    v0.16b,v18.16b
+       aesmc   v0.16b,v0.16b
+       aese    v0.16b,v19.16b
+       aesmc   v0.16b,v0.16b
+        ld1    {v16.16b},[x0],x8
+       aese    v0.16b,v20.16b
+       aesmc   v0.16b,v0.16b
+        eor    v16.16b,v16.16b,v5.16b
+       aese    v0.16b,v21.16b
+       aesmc   v0.16b,v0.16b
+        ld1 {v17.4s},[x7]              // re-pre-load rndkey[1]
+       aese    v0.16b,v22.16b
+       aesmc   v0.16b,v0.16b
+       aese    v0.16b,v23.16b
+       eor     v6.16b,v0.16b,v7.16b
+       b.hs    .Loop_cbc_enc
+
+       st1     {v6.16b},[x1],#16
+       b       .Lcbc_done
+
+.align 5
+.Lcbc_enc128:
+       ld1     {v2.4s-v3.4s},[x7]
+       aese    v0.16b,v16.16b
+       aesmc   v0.16b,v0.16b
+       b       .Lenter_cbc_enc128
+.Loop_cbc_enc128:
+       aese    v0.16b,v16.16b
+       aesmc   v0.16b,v0.16b
+        st1    {v6.16b},[x1],#16
+.Lenter_cbc_enc128:
+       aese    v0.16b,v17.16b
+       aesmc   v0.16b,v0.16b
+        subs   x2,x2,#16
+       aese    v0.16b,v2.16b
+       aesmc   v0.16b,v0.16b
+        csel   x8,xzr,x8,eq
+       aese    v0.16b,v3.16b
+       aesmc   v0.16b,v0.16b
+       aese    v0.16b,v18.16b
+       aesmc   v0.16b,v0.16b
+       aese    v0.16b,v19.16b
+       aesmc   v0.16b,v0.16b
+        ld1    {v16.16b},[x0],x8
+       aese    v0.16b,v20.16b
+       aesmc   v0.16b,v0.16b
+       aese    v0.16b,v21.16b
+       aesmc   v0.16b,v0.16b
+       aese    v0.16b,v22.16b
+       aesmc   v0.16b,v0.16b
+        eor    v16.16b,v16.16b,v5.16b
+       aese    v0.16b,v23.16b
+       eor     v6.16b,v0.16b,v7.16b
+       b.hs    .Loop_cbc_enc128
+
+       st1     {v6.16b},[x1],#16
+       b       .Lcbc_done
+.align 5
+.Lcbc_dec:
+       ld1     {v18.16b},[x0],#16
+       subs    x2,x2,#32               // bias
+       add     w6,w5,#2
+       orr     v3.16b,v0.16b,v0.16b
+       orr     v1.16b,v0.16b,v0.16b
+       orr     v19.16b,v18.16b,v18.16b
+       b.lo    .Lcbc_dec_tail
+
+       orr     v1.16b,v18.16b,v18.16b
+       ld1     {v18.16b},[x0],#16
+       orr     v2.16b,v0.16b,v0.16b
+       orr     v3.16b,v1.16b,v1.16b
+       orr     v19.16b,v18.16b,v18.16b
+
+.Loop3x_cbc_dec:
+       aesd    v0.16b,v16.16b
+       aesimc  v0.16b,v0.16b
+       aesd    v1.16b,v16.16b
+       aesimc  v1.16b,v1.16b
+       aesd    v18.16b,v16.16b
+       aesimc  v18.16b,v18.16b
+       ld1     {v16.4s},[x7],#16
+       subs    w6,w6,#2
+       aesd    v0.16b,v17.16b
+       aesimc  v0.16b,v0.16b
+       aesd    v1.16b,v17.16b
+       aesimc  v1.16b,v1.16b
+       aesd    v18.16b,v17.16b
+       aesimc  v18.16b,v18.16b
+       ld1     {v17.4s},[x7],#16
+       b.gt    .Loop3x_cbc_dec
+
+       aesd    v0.16b,v16.16b
+       aesimc  v0.16b,v0.16b
+       aesd    v1.16b,v16.16b
+       aesimc  v1.16b,v1.16b
+       aesd    v18.16b,v16.16b
+       aesimc  v18.16b,v18.16b
+        eor    v4.16b,v6.16b,v7.16b
+        subs   x2,x2,#0x30
+        eor    v5.16b,v2.16b,v7.16b
+        csel   x6,x2,x6,lo                     // x6, w6, is zero at this point
+       aesd    v0.16b,v17.16b
+       aesimc  v0.16b,v0.16b
+       aesd    v1.16b,v17.16b
+       aesimc  v1.16b,v1.16b
+       aesd    v18.16b,v17.16b
+       aesimc  v18.16b,v18.16b
+        eor    v17.16b,v3.16b,v7.16b
+        add    x0,x0,x6                // x0 is adjusted in such way that
+                                       // at exit from the loop v1.16b-v18.16b
+                                       // are loaded with last "words"
+        orr    v6.16b,v19.16b,v19.16b
+        mov    x7,x3
+       aesd    v0.16b,v20.16b
+       aesimc  v0.16b,v0.16b
+       aesd    v1.16b,v20.16b
+       aesimc  v1.16b,v1.16b
+       aesd    v18.16b,v20.16b
+       aesimc  v18.16b,v18.16b
+        ld1    {v2.16b},[x0],#16
+       aesd    v0.16b,v21.16b
+       aesimc  v0.16b,v0.16b
+       aesd    v1.16b,v21.16b
+       aesimc  v1.16b,v1.16b
+       aesd    v18.16b,v21.16b
+       aesimc  v18.16b,v18.16b
+        ld1    {v3.16b},[x0],#16
+       aesd    v0.16b,v22.16b
+       aesimc  v0.16b,v0.16b
+       aesd    v1.16b,v22.16b
+       aesimc  v1.16b,v1.16b
+       aesd    v18.16b,v22.16b
+       aesimc  v18.16b,v18.16b
+        ld1    {v19.16b},[x0],#16
+       aesd    v0.16b,v23.16b
+       aesd    v1.16b,v23.16b
+       aesd    v18.16b,v23.16b
+        ld1 {v16.4s},[x7],#16  // re-pre-load rndkey[0]
+        add    w6,w5,#2
+       eor     v4.16b,v4.16b,v0.16b
+       eor     v5.16b,v5.16b,v1.16b
+       eor     v18.16b,v18.16b,v17.16b
+        ld1 {v17.4s},[x7],#16  // re-pre-load rndkey[1]
+       st1     {v4.16b},[x1],#16
+        orr    v0.16b,v2.16b,v2.16b
+       st1     {v5.16b},[x1],#16
+        orr    v1.16b,v3.16b,v3.16b
+       st1     {v18.16b},[x1],#16
+        orr    v18.16b,v19.16b,v19.16b
+       b.hs    .Loop3x_cbc_dec
+
+       cmn     x2,#0x30
+       b.eq    .Lcbc_done
+       nop
+
+.Lcbc_dec_tail:
+       aesd    v1.16b,v16.16b
+       aesimc  v1.16b,v1.16b
+       aesd    v18.16b,v16.16b
+       aesimc  v18.16b,v18.16b
+       ld1     {v16.4s},[x7],#16
+       subs    w6,w6,#2
+       aesd    v1.16b,v17.16b
+       aesimc  v1.16b,v1.16b
+       aesd    v18.16b,v17.16b
+       aesimc  v18.16b,v18.16b
+       ld1     {v17.4s},[x7],#16
+       b.gt    .Lcbc_dec_tail
+
+       aesd    v1.16b,v16.16b
+       aesimc  v1.16b,v1.16b
+       aesd    v18.16b,v16.16b
+       aesimc  v18.16b,v18.16b
+       aesd    v1.16b,v17.16b
+       aesimc  v1.16b,v1.16b
+       aesd    v18.16b,v17.16b
+       aesimc  v18.16b,v18.16b
+       aesd    v1.16b,v20.16b
+       aesimc  v1.16b,v1.16b
+       aesd    v18.16b,v20.16b
+       aesimc  v18.16b,v18.16b
+        cmn    x2,#0x20
+       aesd    v1.16b,v21.16b
+       aesimc  v1.16b,v1.16b
+       aesd    v18.16b,v21.16b
+       aesimc  v18.16b,v18.16b
+        eor    v5.16b,v6.16b,v7.16b
+       aesd    v1.16b,v22.16b
+       aesimc  v1.16b,v1.16b
+       aesd    v18.16b,v22.16b
+       aesimc  v18.16b,v18.16b
+        eor    v17.16b,v3.16b,v7.16b
+       aesd    v1.16b,v23.16b
+       aesd    v18.16b,v23.16b
+       b.eq    .Lcbc_dec_one
+       eor     v5.16b,v5.16b,v1.16b
+       eor     v17.16b,v17.16b,v18.16b
+        orr    v6.16b,v19.16b,v19.16b
+       st1     {v5.16b},[x1],#16
+       st1     {v17.16b},[x1],#16
+       b       .Lcbc_done
+
+.Lcbc_dec_one:
+       eor     v5.16b,v5.16b,v18.16b
+        orr    v6.16b,v19.16b,v19.16b
+       st1     {v5.16b},[x1],#16
+
+.Lcbc_done:
+       st1     {v6.16b},[x4]
+.Lcbc_abort:
+       ldr     x29,[sp],#16
+       ret
+.size  aes_v8_cbc_encrypt,.-aes_v8_cbc_encrypt
+.globl aes_v8_ctr32_encrypt_blocks
+.type  aes_v8_ctr32_encrypt_blocks,%function
+.align 5
+aes_v8_ctr32_encrypt_blocks:
+       stp             x29,x30,[sp,#-16]!
+       add             x29,sp,#0
+       ldr             w5,[x3,#240]
+
+       ldr             w8, [x4, #12]
+       ld1             {v0.4s},[x4]
+
+       ld1             {v16.4s-v17.4s},[x3]            // load key schedule...
+       sub             w5,w5,#4
+       mov             x12,#16
+       cmp             x2,#2
+       add             x7,x3,x5,lsl#4  // pointer to last 5 round keys
+       sub             w5,w5,#2
+       ld1             {v20.4s-v21.4s},[x7],#32
+       ld1             {v22.4s-v23.4s},[x7],#32
+       ld1             {v7.4s},[x7]
+       add             x7,x3,#32
+       mov             w6,w5
+       csel    x12,xzr,x12,lo
+#ifndef __ARMEB__
+       rev             w8, w8
+#endif
+       orr             v1.16b,v0.16b,v0.16b
+       add             w10, w8, #1
+       orr             v18.16b,v0.16b,v0.16b
+       add             w8, w8, #2
+       orr             v6.16b,v0.16b,v0.16b
+       rev             w10, w10
+       mov             v1.s[3],w10
+       b.ls            .Lctr32_tail
+       rev             w12, w8
+       sub             x2,x2,#3                // bias
+       mov             v18.s[3],w12
+       b               .Loop3x_ctr32
+
+.align 4
+.Loop3x_ctr32:
+       aese            v0.16b,v16.16b
+       aesmc           v0.16b,v0.16b
+       aese            v1.16b,v16.16b
+       aesmc           v1.16b,v1.16b
+       aese            v18.16b,v16.16b
+       aesmc           v18.16b,v18.16b
+       ld1             {v16.4s},[x7],#16
+       subs            w6,w6,#2
+       aese            v0.16b,v17.16b
+       aesmc           v0.16b,v0.16b
+       aese            v1.16b,v17.16b
+       aesmc           v1.16b,v1.16b
+       aese            v18.16b,v17.16b
+       aesmc           v18.16b,v18.16b
+       ld1             {v17.4s},[x7],#16
+       b.gt            .Loop3x_ctr32
+
+       aese            v0.16b,v16.16b
+       aesmc           v4.16b,v0.16b
+       aese            v1.16b,v16.16b
+       aesmc           v5.16b,v1.16b
+        ld1            {v2.16b},[x0],#16
+        orr            v0.16b,v6.16b,v6.16b
+       aese            v18.16b,v16.16b
+       aesmc           v18.16b,v18.16b
+        ld1            {v3.16b},[x0],#16
+        orr            v1.16b,v6.16b,v6.16b
+       aese            v4.16b,v17.16b
+       aesmc           v4.16b,v4.16b
+       aese            v5.16b,v17.16b
+       aesmc           v5.16b,v5.16b
+        ld1            {v19.16b},[x0],#16
+        mov            x7,x3
+       aese            v18.16b,v17.16b
+       aesmc           v17.16b,v18.16b
+        orr            v18.16b,v6.16b,v6.16b
+        add            w9,w8,#1
+       aese            v4.16b,v20.16b
+       aesmc           v4.16b,v4.16b
+       aese            v5.16b,v20.16b
+       aesmc           v5.16b,v5.16b
+        eor            v2.16b,v2.16b,v7.16b
+        add            w10,w8,#2
+       aese            v17.16b,v20.16b
+       aesmc           v17.16b,v17.16b
+        eor            v3.16b,v3.16b,v7.16b
+        add            w8,w8,#3
+       aese            v4.16b,v21.16b
+       aesmc           v4.16b,v4.16b
+       aese            v5.16b,v21.16b
+       aesmc           v5.16b,v5.16b
+        eor            v19.16b,v19.16b,v7.16b
+        rev            w9,w9
+       aese            v17.16b,v21.16b
+       aesmc           v17.16b,v17.16b
+        mov    v0.s[3], w9
+        rev            w10,w10
+       aese            v4.16b,v22.16b
+       aesmc           v4.16b,v4.16b
+       aese            v5.16b,v22.16b
+       aesmc           v5.16b,v5.16b
+        mov    v1.s[3], w10
+        rev            w12,w8
+       aese            v17.16b,v22.16b
+       aesmc           v17.16b,v17.16b
+        mov    v18.s[3], w12
+        subs           x2,x2,#3
+       aese            v4.16b,v23.16b
+       aese            v5.16b,v23.16b
+       aese            v17.16b,v23.16b
+
+       eor             v2.16b,v2.16b,v4.16b
+        ld1     {v16.4s},[x7],#16      // re-pre-load rndkey[0]
+       st1             {v2.16b},[x1],#16
+       eor             v3.16b,v3.16b,v5.16b
+        mov            w6,w5
+       st1             {v3.16b},[x1],#16
+       eor             v19.16b,v19.16b,v17.16b
+        ld1     {v17.4s},[x7],#16      // re-pre-load rndkey[1]
+       st1             {v19.16b},[x1],#16
+       b.hs            .Loop3x_ctr32
+
+       adds            x2,x2,#3
+       b.eq            .Lctr32_done
+       cmp             x2,#1
+       mov             x12,#16
+       csel    x12,xzr,x12,eq
+
+.Lctr32_tail:
+       aese            v0.16b,v16.16b
+       aesmc           v0.16b,v0.16b
+       aese            v1.16b,v16.16b
+       aesmc           v1.16b,v1.16b
+       ld1             {v16.4s},[x7],#16
+       subs            w6,w6,#2
+       aese            v0.16b,v17.16b
+       aesmc           v0.16b,v0.16b
+       aese            v1.16b,v17.16b
+       aesmc           v1.16b,v1.16b
+       ld1             {v17.4s},[x7],#16
+       b.gt            .Lctr32_tail
+
+       aese            v0.16b,v16.16b
+       aesmc           v0.16b,v0.16b
+       aese            v1.16b,v16.16b
+       aesmc           v1.16b,v1.16b
+       aese            v0.16b,v17.16b
+       aesmc           v0.16b,v0.16b
+       aese            v1.16b,v17.16b
+       aesmc           v1.16b,v1.16b
+        ld1            {v2.16b},[x0],x12
+       aese            v0.16b,v20.16b
+       aesmc           v0.16b,v0.16b
+       aese            v1.16b,v20.16b
+       aesmc           v1.16b,v1.16b
+        ld1            {v3.16b},[x0]
+       aese            v0.16b,v21.16b
+       aesmc           v0.16b,v0.16b
+       aese            v1.16b,v21.16b
+       aesmc           v1.16b,v1.16b
+        eor            v2.16b,v2.16b,v7.16b
+       aese            v0.16b,v22.16b
+       aesmc           v0.16b,v0.16b
+       aese            v1.16b,v22.16b
+       aesmc           v1.16b,v1.16b
+        eor            v3.16b,v3.16b,v7.16b
+       aese            v0.16b,v23.16b
+       aese            v1.16b,v23.16b
+
+       cmp             x2,#1
+       eor             v2.16b,v2.16b,v0.16b
+       eor             v3.16b,v3.16b,v1.16b
+       st1             {v2.16b},[x1],#16
+       b.eq            .Lctr32_done
+       st1             {v3.16b},[x1]
+
+.Lctr32_done:
+       ldr             x29,[sp],#16
+       ret
+.size  aes_v8_ctr32_encrypt_blocks,.-aes_v8_ctr32_encrypt_blocks
+#endif
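
The file above exports the key-schedule, single-block, CBC, and 32-bit-counter
CTR routines.  A hedged sketch of direct use, assuming the entry points mirror
the libcrypto AES_* prototypes (which is how the EVP layer consumes them);
normal code should stick to EVP and let the armcap dispatch select these
automatically:

    #include <stdio.h>
    #include <openssl/aes.h>

    /* Assumed prototypes for two of the exported routines. */
    int aes_v8_set_encrypt_key(const unsigned char *user_key, int bits,
        AES_KEY *key);
    void aes_v8_encrypt(const unsigned char *in, unsigned char *out,
        const AES_KEY *key);

    int
    main(void)
    {
            static const unsigned char k[16] = { 0 };   /* demo key */
            unsigned char in[16] = { 0 }, out[16];
            AES_KEY key;
            int i;

            /* Returns 0 on success, negative on bad arguments
             * (see the x3 return paths in .Lenc_key above). */
            if (aes_v8_set_encrypt_key(k, 128, &key) != 0)
                    return (1);
            aes_v8_encrypt(in, out, &key);  /* one 16-byte block */
            for (i = 0; i < 16; i++)
                    printf("%02x", out[i]);
            printf("\n");
            return (0);
    }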

Added: head/secure/lib/libcrypto/aarch64/ghashv8-armx.S
==============================================================================
--- /dev/null   00:00:00 1970   (empty, because file is newly added)
+++ head/secure/lib/libcrypto/aarch64/ghashv8-armx.S    Wed Oct 26 20:02:22 2016        (r307976)
@@ -0,0 +1,228 @@
+/* $FreeBSD$ */
+/* Do not modify. This file is auto-generated from ghashv8-armx.pl. */
+#include "arm_arch.h"
+
+.text
+.global        gcm_init_v8
+.type  gcm_init_v8,%function
+.align 4
+gcm_init_v8:
+       ld1             {v17.2d},[x1]           //load input H
+       movi            v19.16b,#0xe1
+       shl     v19.2d,v19.2d,#57               //0xc2.0
+       ext             v3.16b,v17.16b,v17.16b,#8
+       ushr    v18.2d,v19.2d,#63
+       dup             v17.4s,v17.s[1]
+       ext             v16.16b,v18.16b,v19.16b,#8              //t0=0xc2....01
+       ushr    v18.2d,v3.2d,#63
+       sshr    v17.4s,v17.4s,#31               //broadcast carry bit
+       and             v18.16b,v18.16b,v16.16b
+       shl     v3.2d,v3.2d,#1
+       ext             v18.16b,v18.16b,v18.16b,#8
+       and             v16.16b,v16.16b,v17.16b
+       orr             v3.16b,v3.16b,v18.16b           //H<<<=1
+       eor             v20.16b,v3.16b,v16.16b          //twisted H
+       st1             {v20.2d},[x0],#16               //store Htable[0]
+
+       //calculate H^2
+       ext             v16.16b,v20.16b,v20.16b,#8              //Karatsuba pre-processing
+       pmull   v0.1q,v20.1d,v20.1d
+       eor             v16.16b,v16.16b,v20.16b
+       pmull2  v2.1q,v20.2d,v20.2d
+       pmull   v1.1q,v16.1d,v16.1d
+
+       ext             v17.16b,v0.16b,v2.16b,#8                //Karatsuba post-processing
+       eor             v18.16b,v0.16b,v2.16b
+       eor             v1.16b,v1.16b,v17.16b
+       eor             v1.16b,v1.16b,v18.16b
+       pmull   v18.1q,v0.1d,v19.1d             //1st phase
+
+       ins     v2.d[0],v1.d[1]
+       ins     v1.d[1],v0.d[0]
+       eor             v0.16b,v1.16b,v18.16b
+
+       ext             v18.16b,v0.16b,v0.16b,#8                //2nd phase
+       pmull   v0.1q,v0.1d,v19.1d
+       eor             v18.16b,v18.16b,v2.16b
+       eor             v22.16b,v0.16b,v18.16b
+
+       ext             v17.16b,v22.16b,v22.16b,#8              //Karatsuba pre-processing
+       eor             v17.16b,v17.16b,v22.16b
+       ext             v21.16b,v16.16b,v17.16b,#8              //pack Karatsuba pre-processed
+       st1             {v21.2d-v22.2d},[x0]            //store Htable[1..2]
+
+       ret
+.size  gcm_init_v8,.-gcm_init_v8
+.global        gcm_gmult_v8
+.type  gcm_gmult_v8,%function
+.align 4
+gcm_gmult_v8:
+       ld1             {v17.2d},[x0]           //load Xi
+       movi            v19.16b,#0xe1
+       ld1             {v20.2d-v21.2d},[x1]    //load twisted H, ...
+       shl     v19.2d,v19.2d,#57
+#ifndef __ARMEB__
+       rev64   v17.16b,v17.16b
+#endif
+       ext             v3.16b,v17.16b,v17.16b,#8
+
+       pmull   v0.1q,v20.1d,v3.1d              //H.lo·Xi.lo
+       eor             v17.16b,v17.16b,v3.16b          //Karatsuba pre-processing
+       pmull2  v2.1q,v20.2d,v3.2d              //H.hi·Xi.hi
+       pmull   v1.1q,v21.1d,v17.1d             //(H.lo+H.hi)·(Xi.lo+Xi.hi)
+
+       ext             v17.16b,v0.16b,v2.16b,#8                //Karatsuba post-processing
+       eor             v18.16b,v0.16b,v2.16b
+       eor             v1.16b,v1.16b,v17.16b
+       eor             v1.16b,v1.16b,v18.16b
+       pmull   v18.1q,v0.1d,v19.1d             //1st phase of reduction
+
+       ins     v2.d[0],v1.d[1]
+       ins     v1.d[1],v0.d[0]
+       eor             v0.16b,v1.16b,v18.16b
+
+       ext             v18.16b,v0.16b,v0.16b,#8                //2nd phase of reduction
+       pmull   v0.1q,v0.1d,v19.1d
+       eor             v18.16b,v18.16b,v2.16b
+       eor             v0.16b,v0.16b,v18.16b
+
+#ifndef __ARMEB__

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
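
For the two routines visible before the truncation, a hedged sketch of the
interface the GCM code expects (prototypes assumed from OpenSSL's gcm128.c,
where u128 is a pair of 64-bit halves): gcm_init_v8 expands the hash key H
into the pre-twisted Htable, and gcm_gmult_v8 multiplies Xi by H using it.

    #include <stdint.h>

    /* Assumed layout of OpenSSL's internal u128 (see gcm128.c). */
    typedef struct { uint64_t hi, lo; } u128;

    /* Build the (twisted) multiplication table from the hash key H. */
    void gcm_init_v8(u128 Htable[16], const uint64_t H[2]);
    /* Xi = Xi * H in GF(2^128), using the table from gcm_init_v8. */
    void gcm_gmult_v8(uint64_t Xi[2], const u128 Htable[16]);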