From: Zbigniew Bodek <zbigniew.bodek@caviumnetworks.com> This patch adds AES-128-CBC + SHA1 low-level crypto operations for ARMv8 processors. The assembly code is a base for an optimized PMD and is currently excluded from the build.
This code is optimized to provide performance boost for combined operations such as encryption + HMAC generation, decryption + HMAC validation. Introduced operations add support for AES-128-CBC in combination with: SHA1 MAC, SHA1 HMAC Signed-off-by: Zbigniew Bodek <zbigniew.bo...@caviumnetworks.com> Signed-off-by: Emery Davis <emery.da...@caviumnetworks.com> --- drivers/crypto/armv8/asm/aes128cbc_sha1_hmac.S | 1719 ++++++++++++++++++++ drivers/crypto/armv8/asm/sha1_hmac_aes128cbc_dec.S | 1650 +++++++++++++++++++ 2 files changed, 3369 insertions(+) create mode 100644 drivers/crypto/armv8/asm/aes128cbc_sha1_hmac.S create mode 100644 drivers/crypto/armv8/asm/sha1_hmac_aes128cbc_dec.S diff --git a/drivers/crypto/armv8/asm/aes128cbc_sha1_hmac.S b/drivers/crypto/armv8/asm/aes128cbc_sha1_hmac.S new file mode 100644 index 0000000..8b8348a --- /dev/null +++ b/drivers/crypto/armv8/asm/aes128cbc_sha1_hmac.S @@ -0,0 +1,1719 @@ +/* + * BSD LICENSE + * + * Copyright (C) Cavium networks Ltd. 2016. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Cavium networks nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "assym.s" + +/* + * Description: + * + * Combined Enc/Auth Primitive = aes128cbc/sha1_hmac + * + * Operations: + * + * out = encrypt-AES128CBC(in) + * return_hash_ptr = SHA1(o_key_pad | SHA1(i_key_pad | out)) + * + * Prototype: + * void aes128cbc_sha1_hmac(uint8_t *csrc, uint8_t *cdst, + * uint8_t *dsrc, uint8_t *ddst, + * uint64_t len, crypto_arg_t *arg) + * + * Registers used: + * + * aes128cbc_sha1_hmac( + * csrc, x0 (cipher src address) + * cdst, x1 (cipher dst address) + * dsrc, x2 (digest src address - ignored) + * ddst, x3 (digest dst address) + * len, x4 (length) + * arg x5 : + * arg->cipher.key (round keys) + * arg->cipher.iv (initialization vector) + * arg->digest.hmac.i_key_pad (partially hashed i_key_pad) + * arg->digest.hmac.o_key_pad (partially hashed o_key_pad) + * ) + * + * Routine register definitions: + * + * v0 - v3 -- aes results + * v4 - v7 -- round consts for sha + * v8 - v18 -- round keys + * v19 -- temp register for SHA1 + * v20 -- ABCD copy (q20) + * v21 -- sha working state (q21) + * v22 -- sha working state (q22) + * v23 -- temp register for SHA1 + * v24 -- sha state ABCD + * v25 -- sha state E + * v26 -- sha block 0 + * v27 -- sha block 1 + * v28 -- sha block 2 + * v29 -- sha block 3 + * v30 -- reserved + * v31 -- reserved + * + * Constraints: + * + * The variable "len" must be a multiple of 16, otherwise results are not + * defined. 
For AES partial blocks the user is required to pad the input + * to modulus 16 = 0. + * + * Short lengths are not optimized at < 12 AES blocks + */ + + .file "aes128cbc_sha1_hmac.S" + .text + .cpu generic+fp+simd+crypto+crc + .global aes128cbc_sha1_hmac + .type aes128cbc_sha1_hmac,%function + + + .align 4 +.Lrcon: + .word 0x5a827999, 0x5a827999, 0x5a827999, 0x5a827999 + .word 0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1 + .word 0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc + .word 0xca62c1d6, 0xca62c1d6, 0xca62c1d6, 0xca62c1d6 + +aes128cbc_sha1_hmac: +/* fetch args */ + ldr x6, [x5, #HMAC_IKEYPAD] + /* init ABCD, E */ + ld1 {v24.4s, v25.4s},[x6] + /* save pointer to o_key_pad partial hash */ + ldr x6, [x5, #HMAC_OKEYPAD] + + ldr x2, [x5, #CIPHER_KEY] + ldr x5, [x5, #CIPHER_IV] + +/* + * init sha state, prefetch, check for small cases. + * Note that the output is prefetched as a load, for the in-place case + */ + prfm PLDL1KEEP,[x0,0] /* pref next aes_ptr_in */ + prfm PLDL1KEEP,[x1,0] /* pref next aes_ptr_out */ + lsr x10,x4,4 /* aes_blocks = len/16 */ + cmp x10,12 /* no main loop if <12 */ + b.lt .Lshort_cases /* branch if < 12 */ + + /* protect registers */ + sub sp,sp,8*16 + mov x9,sp /* copy for address mode */ + st1 {v8.16b - v11.16b},[x9],4*16 + st1 {v12.16b - v15.16b},[x9] + + /* proceed */ + ld1 {v3.16b},[x5] /* get 1st ivec */ + /* read first aes block, bump aes_ptr_in */ + ld1 {v0.16b},[x0],16 + mov x11,x4 /* len -> x11 needed at end */ + lsr x12,x11,6 /* total_blocks */ +/* + * now we can do the loop prolog, 1st aes sequence of 4 blocks + */ + ld1 {v8.16b},[x2],16 /* rk[0] */ + ld1 {v9.16b},[x2],16 /* rk[1] */ + eor v0.16b,v0.16b,v3.16b /* xor w/ ivec (modeop) */ + ld1 {v10.16b},[x2],16 /* rk[2] */ + +/* aes xform 0 */ + aese v0.16b,v8.16b + prfm PLDL1KEEP,[x0,64] /* pref next aes_ptr_in */ + aesmc v0.16b,v0.16b + ld1 {v11.16b},[x2],16 /* rk[3] */ + aese v0.16b,v9.16b + prfm PLDL1KEEP,[x1,64] /* pref next aes_ptr_out */ + /* base address for sha round 
consts */ + adr x8,.Lrcon + aesmc v0.16b,v0.16b + ld1 {v12.16b},[x2],16 /* rk[4] */ + aese v0.16b,v10.16b + /* read next aes block, update aes_ptr_in */ + ld1 {v1.16b},[x0],16 + aesmc v0.16b,v0.16b + ld1 {v13.16b},[x2],16 /* rk[5] */ + aese v0.16b,v11.16b + aesmc v0.16b,v0.16b + ld1 {v14.16b},[x2],16 /* rk[6] */ + aese v0.16b,v12.16b + aesmc v0.16b,v0.16b + ld1 {v15.16b},[x2],16 /* rk[7] */ + aese v0.16b,v13.16b + aesmc v0.16b,v0.16b + ld1 {v16.16b},[x2],16 /* rk[8] */ + aese v0.16b,v14.16b + aesmc v0.16b,v0.16b + ld1 {v17.16b},[x2],16 /* rk[9] */ + aese v0.16b,v15.16b + aesmc v0.16b,v0.16b + ld1 {v18.16b},[x2],16 /* rk[10] */ + aese v0.16b,v16.16b + mov x4,x1 /* sha_ptr_in = aes_ptr_out */ + aesmc v0.16b,v0.16b + aese v0.16b,v17.16b + eor v0.16b,v0.16b,v18.16b /* res 0 */ + + eor v1.16b,v1.16b,v0.16b /* xor w/ ivec (modeop) */ + +/* aes xform 1 */ + aese v1.16b,v8.16b + /* read next aes block, update aes_ptr_in */ + ld1 {v2.16b},[x0],16 + aesmc v1.16b,v1.16b + aese v1.16b,v9.16b + prfm PLDL1KEEP,[x8,0*64] /* rcon */ + aesmc v1.16b,v1.16b + aese v1.16b,v10.16b + aesmc v1.16b,v1.16b + aese v1.16b,v11.16b + /* save aes res, bump aes_out_ptr */ + st1 {v0.16b},[x1],16 + aesmc v1.16b,v1.16b + aese v1.16b,v12.16b + prfm PLDL1KEEP,[x8,2*64] /* rcon */ + aesmc v1.16b,v1.16b + aese v1.16b,v13.16b + aesmc v1.16b,v1.16b + aese v1.16b,v14.16b + prfm PLDL1KEEP,[x8,4*64] /* rcon */ + aesmc v1.16b,v1.16b + aese v1.16b,v15.16b + aesmc v1.16b,v1.16b + aese v1.16b,v16.16b + prfm PLDL1KEEP,[x8,6*64] /* rcon */ + aesmc v1.16b,v1.16b + aese v1.16b,v17.16b + prfm PLDL1KEEP,[x8,8*64] /* rcon */ + eor v1.16b,v1.16b,v18.16b /* res 1 */ + + eor v2.16b,v2.16b,v1.16b /* xor w/ivec (modeop) */ + +/* aes xform 2 */ + aese v2.16b,v8.16b + /* read next aes block, update aes_ptr_in */ + ld1 {v3.16b},[x0],16 + aesmc v2.16b,v2.16b + aese v2.16b,v9.16b + mov x2,x0 /* lead_ptr = aes_ptr_in */ + aesmc v2.16b,v2.16b + aese v2.16b,v10.16b + prfm PLDL1KEEP,[x8,10*64] /* rcon */ + aesmc v2.16b,v2.16b + 
aese v2.16b,v11.16b + /* save aes res, bump aes_out_ptr */ + st1 {v1.16b},[x1],16 + aesmc v2.16b,v2.16b + aese v2.16b,v12.16b + prfm PLDL1KEEP,[x8,12*64] /* rcon */ + aesmc v2.16b,v2.16b + aese v2.16b,v13.16b + aesmc v2.16b,v2.16b + aese v2.16b,v14.16b + prfm PLDL1KEEP,[x8,14*64] /* rcon */ + aesmc v2.16b,v2.16b + aese v2.16b,v15.16b + aesmc v2.16b,v2.16b + aese v2.16b,v16.16b + aesmc v2.16b,v2.16b + aese v2.16b,v17.16b + eor v2.16b,v2.16b,v18.16b /* res 2 */ + + eor v3.16b,v3.16b,v2.16b /* xor w/ ivec (modeop) */ + +/* aes xform 3 */ + aese v3.16b,v8.16b + aesmc v3.16b,v3.16b + aese v3.16b,v9.16b + aesmc v3.16b,v3.16b + aese v3.16b,v10.16b + aesmc v3.16b,v3.16b + aese v3.16b,v11.16b + /* save aes res, bump aes_out_ptr */ + st1 {v2.16b},[x1],16 + aesmc v3.16b,v3.16b + aese v3.16b,v12.16b + aesmc v3.16b,v3.16b + aese v3.16b,v13.16b + aesmc v3.16b,v3.16b + aese v3.16b,v14.16b + aesmc v3.16b,v3.16b + aese v3.16b,v15.16b + aesmc v3.16b,v3.16b + aese v3.16b,v16.16b + /* main_blocks = total_blocks - 1 */ + sub x7,x12,1 + and x13,x10,3 /* aes_blocks_left */ + aesmc v3.16b,v3.16b + aese v3.16b,v17.16b + eor v3.16b,v3.16b,v18.16b /* res 3 */ + +/* + * Note, aes_blocks_left := number after + * the main (sha) block is done. Can be 0 + */ + /* save aes res, bump aes_out_ptr */ + st1 {v3.16b},[x1],16 + + mov x9,x8 /* top of rcon */ + ld1 {v4.16b},[x9],16 /* key0 */ + ld1 {v5.16b},[x9],16 /* key1 */ + ld1 {v6.16b},[x9],16 /* key2 */ + ld1 {v7.16b},[x9],16 /* key3 */ +/* + * main combined loop CBC + */ +.Lmain_loop: +/* + * because both mov, rev32 and eor have a busy cycle, + * this takes longer than it looks. 
+ * Thats OK since there are 6 cycles before we can use the load anyway; + * so this goes as fast as it can without SW pipelining (too complicated + * given the code size) + */ + rev32 v26.16b,v0.16b /* fix endian w0, aes res 0 */ + /* next aes block, update aes_ptr_in */ + ld1 {v0.16b},[x0],16 + mov v20.16b,v24.16b /* working ABCD <- ABCD */ + prfm PLDL1KEEP,[x2,64] /* pref next lead_ptr */ + rev32 v27.16b,v1.16b /* fix endian w1, aes res 1 */ + /* pref next aes_ptr_out, streaming */ + prfm PLDL1KEEP,[x1,64] + eor v0.16b,v0.16b,v3.16b /* xor w/ prev value */ + +/* aes xform 0, sha quad 0 */ + aese v0.16b,v8.16b + rev32 v28.16b,v2.16b /* fix endian w2, aes res 2 */ + aesmc v0.16b,v0.16b + /* read next aes block, update aes_ptr_in */ + ld1 {v1.16b},[x0],16 + aese v0.16b,v9.16b + add v19.4s,v4.4s,v26.4s + aesmc v0.16b,v0.16b + sha1su0 v26.4s,v27.4s,v28.4s + aese v0.16b,v10.16b + sha1h s22,s24 + aesmc v0.16b,v0.16b + aese v0.16b,v11.16b + add v23.4s,v4.4s,v27.4s + /* no place to get rid of this stall */ + rev32 v29.16b,v3.16b /* fix endian w3, aes res 3 */ + aesmc v0.16b,v0.16b + sha1c q24,s25,v19.4s + aese v0.16b,v12.16b + sha1su1 v26.4s,v29.4s + aesmc v0.16b,v0.16b + sha1su0 v27.4s,v28.4s,v29.4s + aese v0.16b,v13.16b + sha1h s21,s24 + add v19.4s,v4.4s,v28.4s + aesmc v0.16b,v0.16b + sha1c q24,s22,v23.4s + aese v0.16b,v14.16b + add v23.4s,v4.4s,v29.4s + sha1su1 v27.4s,v26.4s + aesmc v0.16b,v0.16b + sha1su0 v28.4s,v29.4s,v26.4s + aese v0.16b,v15.16b + sha1h s22,s24 + aesmc v0.16b,v0.16b + sha1c q24,s21,v19.4s + aese v0.16b,v16.16b + sha1su1 v28.4s,v27.4s + sha1su0 v29.4s,v26.4s,v27.4s + aesmc v0.16b,v0.16b + sha1h s21,s24 + aese v0.16b,v17.16b + sha1c q24,s22,v23.4s + add v19.4s,v4.4s,v26.4s + sha1su1 v29.4s,v28.4s + eor v0.16b,v0.16b,v18.16b /* final res 0 */ + add v23.4s,v5.4s,v27.4s + sha1h s22,s24 + sha1c q24,s21,v19.4s + sha1su0 v26.4s,v27.4s,v28.4s + sha1su1 v26.4s,v29.4s +/* aes xform 1, sha quad 1 */ + eor v1.16b,v1.16b,v0.16b /* mode op 1 xor w/prev value */ + 
/* save aes res, bump aes_out_ptr */ + st1 {v0.16b},[x1],16 + aese v1.16b,v8.16b + add v19.4s,v5.4s,v28.4s + aesmc v1.16b,v1.16b + aese v1.16b,v9.16b + sha1su0 v27.4s,v28.4s,v29.4s + sha1h s21,s24 + aesmc v1.16b,v1.16b + sha1p q24,s22,v23.4s + aese v1.16b,v10.16b + /* read next aes block, update aes_ptr_in */ + ld1 {v2.16b},[x0],16 + add v23.4s,v5.4s,v29.4s + sha1su1 v27.4s,v26.4s + aesmc v1.16b,v1.16b + aese v1.16b,v11.16b + sha1su0 v28.4s,v29.4s,v26.4s + aesmc v1.16b,v1.16b + sha1h s22,s24 + aese v1.16b,v12.16b + sha1p q24,s21,v19.4s + sha1su1 v28.4s,v27.4s + aesmc v1.16b,v1.16b + sha1su0 v29.4s,v26.4s,v27.4s + aese v1.16b,v13.16b + sha1h s21,s24 + aesmc v1.16b,v1.16b + sha1p q24,s22,v23.4s + aese v1.16b,v14.16b + add v19.4s,v5.4s,v26.4s + sha1su1 v29.4s,v28.4s + aesmc v1.16b,v1.16b + add x2,x2,64 /* bump lead_ptr */ + sha1su0 v26.4s,v27.4s,v28.4s + aese v1.16b,v15.16b + sha1h s22,s24 + add v23.4s,v5.4s,v27.4s + aesmc v1.16b,v1.16b + sha1p q24,s21,v19.4s + aese v1.16b,v16.16b + sha1su1 v26.4s,v29.4s + aesmc v1.16b,v1.16b + sha1su0 v27.4s,v28.4s,v29.4s + aese v1.16b,v17.16b + sha1h s21,s24 + eor v1.16b,v1.16b,v18.16b /* res xf 1 */ + sha1p q24,s22,v23.4s + add v23.4s,v6.4s,v29.4s + sha1su1 v27.4s,v26.4s + +/* mode op 2 */ + eor v2.16b,v2.16b,v1.16b /* mode of 2 xor w/prev value */ + +/* aes xform 2, sha quad 2 */ + aese v2.16b,v8.16b + /* save aes res, bump aes_out_ptr */ + st1 {v1.16b},[x1],16 + aesmc v2.16b,v2.16b + add v19.4s,v6.4s,v28.4s + sha1su0 v28.4s,v29.4s,v26.4s + aese v2.16b,v9.16b + sha1h s22,s24 + aesmc v2.16b,v2.16b + sha1m q24,s21,v19.4s + aese v2.16b,v10.16b + sha1su1 v28.4s,v27.4s + aesmc v2.16b,v2.16b + + aese v2.16b,v11.16b + add v19.4s,v6.4s,v26.4s + aesmc v2.16b,v2.16b + sha1su0 v29.4s,v26.4s,v27.4s + aese v2.16b,v12.16b + sha1h s21,s24 + aesmc v2.16b,v2.16b + sha1m q24,s22,v23.4s + aese v2.16b,v13.16b + sha1su1 v29.4s,v28.4s + aesmc v2.16b,v2.16b + /* read next aes block, update aes_ptr_in */ + ld1 {v3.16b},[x0],16 + aese v2.16b,v14.16b + add 
v23.4s,v6.4s,v27.4s + aesmc v2.16b,v2.16b + sha1su0 v26.4s,v27.4s,v28.4s + aese v2.16b,v15.16b + sha1h s22,s24 + aesmc v2.16b,v2.16b + sha1m q24,s21,v19.4s + aese v2.16b,v16.16b + add v19.4s,v6.4s,v28.4s + aesmc v2.16b,v2.16b + sha1su1 v26.4s,v29.4s + aese v2.16b,v17.16b + eor v2.16b,v2.16b,v18.16b /* res 2 */ + sha1su0 v27.4s,v28.4s,v29.4s + sha1h s21,s24 + sha1m q24,s22,v23.4s + add v23.4s,v7.4s,v29.4s + sha1su1 v27.4s,v26.4s + sha1su0 v28.4s,v29.4s,v26.4s + sha1h s22,s24 + sha1m q24,s21,v19.4s + sha1su1 v28.4s,v27.4s + +/* mode op 3 */ + eor v3.16b,v3.16b,v2.16b /* xor w/prev value */ + +/* aes xform 3, sha quad 3 */ + aese v3.16b,v8.16b + sha1su0 v29.4s,v26.4s,v27.4s + aesmc v3.16b,v3.16b + /* save aes res, bump aes_out_ptr */ + st1 {v2.16b},[x1],16 + aese v3.16b,v9.16b + sha1h s21,s24 + aesmc v3.16b,v3.16b + sha1p q24,s22,v23.4s + aese v3.16b,v10.16b + sha1su1 v29.4s,v28.4s + aesmc v3.16b,v3.16b + add v19.4s,v7.4s,v26.4s + aese v3.16b,v11.16b + sha1h s22,s24 + aesmc v3.16b,v3.16b + sha1p q24,s21,v19.4s + aese v3.16b,v12.16b + aesmc v3.16b,v3.16b + add v23.4s,v7.4s,v27.4s + aese v3.16b,v13.16b + sha1h s21,s24 + aesmc v3.16b,v3.16b + sha1p q24,s22,v23.4s + aese v3.16b,v14.16b + sub x7,x7,1 /* dec block count */ + aesmc v3.16b,v3.16b + add v19.4s,v7.4s,v28.4s + aese v3.16b,v15.16b + sha1h s22,s24 + aesmc v3.16b,v3.16b + sha1p q24,s21,v19.4s + aese v3.16b,v16.16b + aesmc v3.16b,v3.16b + add v23.4s,v7.4s,v29.4s + aese v3.16b,v17.16b + sha1h s21,s24 + sha1p q24,s22,v23.4s + eor v3.16b,v3.16b,v18.16b /* aes res 3 */ + + add v25.4s,v25.4s,v21.4s + add v24.4s,v24.4s,v20.4s + /* save aes res, bump aes_out_ptr */ + st1 {v3.16b},[x1],16 + cbnz x7,.Lmain_loop /* loop if more to do */ + + +/* + * epilog, process remaining aes blocks and b-2 sha block + * do this inline (no loop) to overlap with the sha part + * note there are 0-3 aes blocks left. 
+ */ + rev32 v26.16b,v0.16b /* fix endian w0 */ + rev32 v27.16b,v1.16b /* fix endian w1 */ + rev32 v28.16b,v2.16b /* fix endian w2 */ + rev32 v29.16b,v3.16b /* fix endian w3 */ + mov v20.16b,v24.16b /* working ABCD <- ABCD */ + cbz x13, .Lbm2fromQ0 /* skip if none left */ + /* local copy of aes_blocks_left */ + subs x14,x13,1 + +/* + * mode op 0 + * read next aes block, update aes_ptr_in + */ + ld1 {v0.16b},[x0],16 + eor v0.16b,v0.16b,v3.16b /* xor w/ prev value */ + +/* aes xform 0, sha quad 0 */ + add v19.4s,v4.4s,v26.4s + aese v0.16b,v8.16b + add v23.4s,v4.4s,v27.4s + aesmc v0.16b,v0.16b + sha1su0 v26.4s,v27.4s,v28.4s + aese v0.16b,v9.16b + sha1h s22,s24 + aesmc v0.16b,v0.16b + sha1c q24,s25,v19.4s + aese v0.16b,v10.16b + sha1su1 v26.4s,v29.4s + add v19.4s,v4.4s,v28.4s + sha1su0 v27.4s,v28.4s,v29.4s + aesmc v0.16b,v0.16b + aese v0.16b,v11.16b + sha1h s21,s24 + aesmc v0.16b,v0.16b + sha1c q24,s22,v23.4s + aese v0.16b,v12.16b + sha1su1 v27.4s,v26.4s + add v23.4s,v4.4s,v29.4s + sha1su0 v28.4s,v29.4s,v26.4s + aesmc v0.16b,v0.16b + aese v0.16b,v13.16b + sha1h s22,s24 + aesmc v0.16b,v0.16b + sha1c q24,s21,v19.4s + aese v0.16b,v14.16b + sha1su1 v28.4s,v27.4s + add v19.4s,v4.4s,v26.4s + sha1su0 v29.4s,v26.4s,v27.4s + aesmc v0.16b,v0.16b + aese v0.16b,v15.16b + sha1h s21,s24 + aesmc v0.16b,v0.16b + aese v0.16b,v16.16b + sha1c q24,s22,v23.4s + sha1su1 v29.4s,v28.4s + aesmc v0.16b,v0.16b + aese v0.16b,v17.16b + eor v0.16b,v0.16b,v18.16b + sha1h s22,s24 + sha1c q24,s21,v19.4s + sha1su0 v26.4s,v27.4s,v28.4s + sha1su1 v26.4s,v29.4s + /* save aes res, bump aes_out_ptr */ + st1 {v0.16b},[x1],16 + /* if aes_blocks_left_count == 0 */ + beq .Lbm2fromQ1 +/* + * mode op 1 + * read next aes block, update aes_ptr_in + */ + ld1 {v1.16b},[x0],16 + + eor v1.16b,v1.16b,v0.16b /* xor w/ prev value */ + +/* aes xform 1, sha quad 1 */ + add v23.4s,v5.4s,v27.4s + aese v1.16b,v8.16b + add v19.4s,v5.4s,v28.4s + aesmc v1.16b,v1.16b + sha1su0 v27.4s,v28.4s,v29.4s + aese v1.16b,v9.16b + sha1h 
s21,s24 + aesmc v1.16b,v1.16b + sha1p q24,s22,v23.4s + aese v1.16b,v10.16b + sha1su1 v27.4s,v26.4s + add v23.4s,v5.4s,v29.4s + sha1su0 v28.4s,v29.4s,v26.4s + aesmc v1.16b,v1.16b + subs x14,x14,1 /* dec counter */ + aese v1.16b,v11.16b + sha1h s22,s24 + aesmc v1.16b,v1.16b + sha1p q24,s21,v19.4s + aese v1.16b,v12.16b + sha1su1 v28.4s,v27.4s + add v19.4s,v5.4s,v26.4s + sha1su0 v29.4s,v26.4s,v27.4s + aesmc v1.16b,v1.16b + aese v1.16b,v13.16b + sha1h s21,s24 + aesmc v1.16b,v1.16b + sha1p q24,s22,v23.4s + aese v1.16b,v14.16b + sha1su1 v29.4s,v28.4s + add v23.4s,v5.4s,v27.4s + sha1su0 v26.4s,v27.4s,v28.4s + aesmc v1.16b,v1.16b + aese v1.16b,v15.16b + sha1h s22,s24 + aesmc v1.16b,v1.16b + sha1p q24,s21,v19.4s + aese v1.16b,v16.16b + sha1su1 v26.4s,v29.4s + aesmc v1.16b,v1.16b + aese v1.16b,v17.16b + eor v1.16b,v1.16b,v18.16b + sha1h s21,s24 + sha1p q24,s22,v23.4s + sha1su0 v27.4s,v28.4s,v29.4s + sha1su1 v27.4s,v26.4s + /* save aes res, bump aes_out_ptr */ + st1 {v1.16b},[x1],16 + /* if aes_blocks_left_count == 0 */ + beq .Lbm2fromQ2 + +/* + * mode op 2 + * read next aes block, update aes_ptr_in + */ + ld1 {v2.16b},[x0],16 + eor v2.16b,v2.16b,v1.16b /* xor w/ prev value */ + +/* aes xform 2, sha quad 2 */ + add v19.4s,v6.4s,v28.4s + aese v2.16b,v8.16b + add v23.4s,v6.4s,v29.4s + aesmc v2.16b,v2.16b + sha1su0 v28.4s,v29.4s,v26.4s + aese v2.16b,v9.16b + sha1h s22,s24 + aesmc v2.16b,v2.16b + sha1m q24,s21,v19.4s + aese v2.16b,v10.16b + sha1su1 v28.4s,v27.4s + add v19.4s,v6.4s,v26.4s + sha1su0 v29.4s,v26.4s,v27.4s + aesmc v2.16b,v2.16b + aese v2.16b,v11.16b + sha1h s21,s24 + aesmc v2.16b,v2.16b + sha1m q24,s22,v23.4s + aese v2.16b,v12.16b + sha1su1 v29.4s,v28.4s + add v23.4s,v6.4s,v27.4s + sha1su0 v26.4s,v27.4s,v28.4s + aesmc v2.16b,v2.16b + aese v2.16b,v13.16b + sha1h s22,s24 + aesmc v2.16b,v2.16b + sha1m q24,s21,v19.4s + aese v2.16b,v14.16b + sha1su1 v26.4s,v29.4s + add v19.4s,v6.4s,v28.4s + sha1su0 v27.4s,v28.4s,v29.4s + aesmc v2.16b,v2.16b + aese v2.16b,v15.16b + sha1h 
s21,s24 + aesmc v2.16b,v2.16b + aese v2.16b,v16.16b + sha1m q24,s22,v23.4s + sha1su1 v27.4s,v26.4s + aesmc v2.16b,v2.16b + aese v2.16b,v17.16b + eor v2.16b,v2.16b,v18.16b + sha1h s22,s24 + sha1m q24,s21,v19.4s + sha1su0 v28.4s,v29.4s,v26.4s + sha1su1 v28.4s,v27.4s + /* save aes res, bump aes_out_ptr */ + st1 {v2.16b},[x1],16 + /* join common code at Quad 3 */ + b .Lbm2fromQ3 + +/* + * now there is the b-2 sha block before the final one. Execution takes over + * in the appropriate part of this depending on how many aes blocks were left. + * If there were none, the whole thing is executed. + */ +.Lbm2fromQ0: + add v19.4s,v4.4s,v26.4s + sha1h s22,s24 + sha1c q24,s25,v19.4s + sha1su0 v26.4s,v27.4s,v28.4s + sha1su1 v26.4s,v29.4s + + add v23.4s,v4.4s,v27.4s + sha1h s21,s24 + sha1c q24,s22,v23.4s + sha1su0 v27.4s,v28.4s,v29.4s + sha1su1 v27.4s,v26.4s + + add v19.4s,v4.4s,v28.4s + sha1h s22,s24 + sha1c q24,s21,v19.4s + sha1su0 v28.4s,v29.4s,v26.4s + sha1su1 v28.4s,v27.4s + + add v23.4s,v4.4s,v29.4s + sha1h s21,s24 + sha1c q24,s22,v23.4s + sha1su0 v29.4s,v26.4s,v27.4s + sha1su1 v29.4s,v28.4s + + add v19.4s,v4.4s,v26.4s + sha1h s22,s24 + sha1c q24,s21,v19.4s + sha1su0 v26.4s,v27.4s,v28.4s + sha1su1 v26.4s,v29.4s + +.Lbm2fromQ1: + add v23.4s,v5.4s,v27.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + sha1su0 v27.4s,v28.4s,v29.4s + sha1su1 v27.4s,v26.4s + + add v19.4s,v5.4s,v28.4s + sha1h s22,s24 + sha1p q24,s21,v19.4s + sha1su0 v28.4s,v29.4s,v26.4s + sha1su1 v28.4s,v27.4s + + add v23.4s,v5.4s,v29.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + sha1su0 v29.4s,v26.4s,v27.4s + sha1su1 v29.4s,v28.4s + + add v19.4s,v5.4s,v26.4s + sha1h s22,s24 + sha1p q24,s21,v19.4s + sha1su0 v26.4s,v27.4s,v28.4s + sha1su1 v26.4s,v29.4s + + add v23.4s,v5.4s,v27.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + sha1su0 v27.4s,v28.4s,v29.4s + sha1su1 v27.4s,v26.4s + +.Lbm2fromQ2: + add v19.4s,v6.4s,v28.4s + sha1h s22,s24 + sha1m q24,s21,v19.4s + sha1su0 v28.4s,v29.4s,v26.4s + sha1su1 v28.4s,v27.4s + + add 
v23.4s,v6.4s,v29.4s + sha1h s21,s24 + sha1m q24,s22,v23.4s + sha1su0 v29.4s,v26.4s,v27.4s + sha1su1 v29.4s,v28.4s + + add v19.4s,v6.4s,v26.4s + sha1h s22,s24 + sha1m q24,s21,v19.4s + sha1su0 v26.4s,v27.4s,v28.4s + sha1su1 v26.4s,v29.4s + + add v23.4s,v6.4s,v27.4s + sha1h s21,s24 + sha1m q24,s22,v23.4s + sha1su0 v27.4s,v28.4s,v29.4s + sha1su1 v27.4s,v26.4s + + add v19.4s,v6.4s,v28.4s + sha1h s22,s24 + sha1m q24,s21,v19.4s + sha1su0 v28.4s,v29.4s,v26.4s + sha1su1 v28.4s,v27.4s + +.Lbm2fromQ3: + add v23.4s,v7.4s,v29.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + sha1su0 v29.4s,v26.4s,v27.4s + sha1su1 v29.4s,v28.4s + + add v19.4s,v7.4s,v26.4s + sha1h s22,s24 + sha1p q24,s21,v19.4s + + add v23.4s,v7.4s,v27.4s + sha1h s21,s24 + eor v26.16b,v26.16b,v26.16b /* zero reg */ + sha1p q24,s22,v23.4s + + add v19.4s,v7.4s,v28.4s + sha1h s22,s24 + eor v27.16b,v27.16b,v27.16b /* zero reg */ + sha1p q24,s21,v19.4s + + add v23.4s,v7.4s,v29.4s + sha1h s21,s24 + eor v28.16b,v28.16b,v28.16b /* zero reg */ + sha1p q24,s22,v23.4s + + add v25.4s,v25.4s,v21.4s + add v24.4s,v24.4s,v20.4s + +/* + * now we can do the final block, either all padding or 1-3 aes blocks + * len in x11, aes_blocks_left in x13. should move the aes data setup of this + * to the last aes bit. 
+ */ + mov v20.16b,v24.16b /* working ABCD <- ABCD */ + mov w15,0x80 /* that's the 1 of the pad */ + /* Add one SHA-1 block since hash is calculated including i_key_pad */ + add x11, x11, #64 + lsr x12,x11,32 /* len_hi */ + and x9,x11,0xffffffff /* len_lo */ + mov v26.b[0],w15 /* assume block 0 is dst */ + lsl x12,x12,3 /* len_hi in bits */ + lsl x9,x9,3 /* len_lo in bits */ + eor v29.16b,v29.16b,v29.16b /* zero reg */ +/* + * places the 0x80 in the correct block, copies the appropriate data + */ + cbz x13,.Lpad100 /* no data to get */ + mov v26.16b,v0.16b + sub x14,x13,1 /* dec amount left */ + mov v27.b[0],w15 /* assume block 1 is dst */ + cbz x14,.Lpad100 /* branch if done */ + mov v27.16b,v1.16b + sub x14,x14,1 /* dec amount left */ + mov v28.b[0],w15 /* assume block 2 is dst */ + cbz x14,.Lpad100 /* branch if done */ + mov v28.16b,v2.16b + mov v29.b[3],w15 /* block 3, doesn't get rev'd */ +/* + * get the len_hi,LenLo in bits according to + * len_hi = (uint32_t)(((len>>32) & 0xffffffff)<<3); (x12) + * len_lo = (uint32_t)((len & 0xffffffff)<<3); (x9) + * this is done before the if/else above + */ +.Lpad100: + mov v29.s[3],w9 /* len_lo */ + mov v29.s[2],w12 /* len_hi */ +/* + * note that q29 is already built in the correct format, so no swap required + */ + rev32 v26.16b,v26.16b /* fix endian w0 */ + rev32 v27.16b,v27.16b /* fix endian w1 */ + rev32 v28.16b,v28.16b /* fix endian w2 */ +/* + * do last sha of pad block + */ + add v19.4s,v4.4s,v26.4s + sha1h s22,s24 + sha1c q24,s25,v19.4s + sha1su0 v26.4s,v27.4s,v28.4s + sha1su1 v26.4s,v29.4s + + add v23.4s,v4.4s,v27.4s + sha1h s21,s24 + sha1c q24,s22,v23.4s + sha1su0 v27.4s,v28.4s,v29.4s + sha1su1 v27.4s,v26.4s + + add v19.4s,v4.4s,v28.4s + sha1h s22,s24 + sha1c q24,s21,v19.4s + sha1su0 v28.4s,v29.4s,v26.4s + sha1su1 v28.4s,v27.4s + + add v23.4s,v4.4s,v29.4s + sha1h s21,s24 + sha1c q24,s22,v23.4s + sha1su0 v29.4s,v26.4s,v27.4s + sha1su1 v29.4s,v28.4s + + add v19.4s,v4.4s,v26.4s + sha1h s22,s24 + sha1c 
q24,s21,v19.4s + sha1su0 v26.4s,v27.4s,v28.4s + sha1su1 v26.4s,v29.4s + + add v23.4s,v5.4s,v27.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + sha1su0 v27.4s,v28.4s,v29.4s + sha1su1 v27.4s,v26.4s + + add v19.4s,v5.4s,v28.4s + sha1h s22,s24 + sha1p q24,s21,v19.4s + sha1su0 v28.4s,v29.4s,v26.4s + sha1su1 v28.4s,v27.4s + + add v23.4s,v5.4s,v29.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + sha1su0 v29.4s,v26.4s,v27.4s + sha1su1 v29.4s,v28.4s + + add v19.4s,v5.4s,v26.4s + sha1h s22,s24 + sha1p q24,s21,v19.4s + sha1su0 v26.4s,v27.4s,v28.4s + sha1su1 v26.4s,v29.4s + + add v23.4s,v5.4s,v27.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + sha1su0 v27.4s,v28.4s,v29.4s + sha1su1 v27.4s,v26.4s + + add v19.4s,v6.4s,v28.4s + sha1h s22,s24 + sha1m q24,s21,v19.4s + sha1su0 v28.4s,v29.4s,v26.4s + sha1su1 v28.4s,v27.4s + + add v23.4s,v6.4s,v29.4s + sha1h s21,s24 + sha1m q24,s22,v23.4s + sha1su0 v29.4s,v26.4s,v27.4s + sha1su1 v29.4s,v28.4s + + add v19.4s,v6.4s,v26.4s + sha1h s22,s24 + sha1m q24,s21,v19.4s + sha1su0 v26.4s,v27.4s,v28.4s + sha1su1 v26.4s,v29.4s + + add v23.4s,v6.4s,v27.4s + sha1h s21,s24 + sha1m q24,s22,v23.4s + sha1su0 v27.4s,v28.4s,v29.4s + sha1su1 v27.4s,v26.4s + + add v19.4s,v6.4s,v28.4s + sha1h s22,s24 + sha1m q24,s21,v19.4s + sha1su0 v28.4s,v29.4s,v26.4s + sha1su1 v28.4s,v27.4s + + add v23.4s,v7.4s,v29.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + sha1su0 v29.4s,v26.4s,v27.4s + sha1su1 v29.4s,v28.4s + + add v19.4s,v7.4s,v26.4s + sha1h s22,s24 + sha1p q24,s21,v19.4s + + add v23.4s,v7.4s,v27.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + + add v19.4s,v7.4s,v28.4s + sha1h s22,s24 + sha1p q24,s21,v19.4s + + add v23.4s,v7.4s,v29.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + + add v26.4s,v24.4s,v20.4s + add v27.4s,v25.4s,v21.4s + + /* Calculate final HMAC */ + eor v28.16b, v28.16b, v28.16b + eor v29.16b, v29.16b, v29.16b + /* load o_key_pad partial hash */ + ld1 {v24.16b,v25.16b}, [x6] + + mov v20.16b,v24.16b /* working ABCD <- ABCD */ + + /* Set padding 1 to the first reg */ + mov w11, 
#0x80 /* that's the 1 of the pad */ + mov v27.b[7], w11 + + mov x11, #64+20 /* size of o_key_pad + inner hash */ + lsl x11, x11, 3 + /* move length to the end of the block */ + mov v29.s[3], w11 + lsr x11, x11, 32 + mov v29.s[2], w11 /* and the higher part */ + + add v19.4s,v4.4s,v26.4s + sha1h s22,s24 + sha1c q24,s25,v19.4s + sha1su0 v26.4s,v27.4s,v28.4s + sha1su1 v26.4s,v29.4s + + add v23.4s,v4.4s,v27.4s + sha1h s21,s24 + sha1c q24,s22,v23.4s + sha1su0 v27.4s,v28.4s,v29.4s + sha1su1 v27.4s,v26.4s + + add v19.4s,v4.4s,v28.4s + sha1h s22,s24 + sha1c q24,s21,v19.4s + sha1su0 v28.4s,v29.4s,v26.4s + sha1su1 v28.4s,v27.4s + + add v23.4s,v4.4s,v29.4s + sha1h s21,s24 + sha1c q24,s22,v23.4s + sha1su0 v29.4s,v26.4s,v27.4s + sha1su1 v29.4s,v28.4s + + add v19.4s,v4.4s,v26.4s + sha1h s22,s24 + sha1c q24,s21,v19.4s + sha1su0 v26.4s,v27.4s,v28.4s + sha1su1 v26.4s,v29.4s + + add v23.4s,v5.4s,v27.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + sha1su0 v27.4s,v28.4s,v29.4s + sha1su1 v27.4s,v26.4s + + add v19.4s,v5.4s,v28.4s + sha1h s22,s24 + sha1p q24,s21,v19.4s + sha1su0 v28.4s,v29.4s,v26.4s + sha1su1 v28.4s,v27.4s + + add v23.4s,v5.4s,v29.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + sha1su0 v29.4s,v26.4s,v27.4s + sha1su1 v29.4s,v28.4s + + add v19.4s,v5.4s,v26.4s + sha1h s22,s24 + sha1p q24,s21,v19.4s + sha1su0 v26.4s,v27.4s,v28.4s + sha1su1 v26.4s,v29.4s + + add v23.4s,v5.4s,v27.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + sha1su0 v27.4s,v28.4s,v29.4s + sha1su1 v27.4s,v26.4s + + add v19.4s,v6.4s,v28.4s + sha1h s22,s24 + sha1m q24,s21,v19.4s + sha1su0 v28.4s,v29.4s,v26.4s + sha1su1 v28.4s,v27.4s + + add v23.4s,v6.4s,v29.4s + sha1h s21,s24 + sha1m q24,s22,v23.4s + sha1su0 v29.4s,v26.4s,v27.4s + sha1su1 v29.4s,v28.4s + + add v19.4s,v6.4s,v26.4s + sha1h s22,s24 + sha1m q24,s21,v19.4s + sha1su0 v26.4s,v27.4s,v28.4s + sha1su1 v26.4s,v29.4s + + add v23.4s,v6.4s,v27.4s + sha1h s21,s24 + sha1m q24,s22,v23.4s + sha1su0 v27.4s,v28.4s,v29.4s + sha1su1 v27.4s,v26.4s + + add v19.4s,v6.4s,v28.4s + 
sha1h s22,s24 + sha1m q24,s21,v19.4s + sha1su0 v28.4s,v29.4s,v26.4s + sha1su1 v28.4s,v27.4s + + add v23.4s,v7.4s,v29.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + sha1su0 v29.4s,v26.4s,v27.4s + sha1su1 v29.4s,v28.4s + + add v19.4s,v7.4s,v26.4s + sha1h s22,s24 + sha1p q24,s21,v19.4s + + add v23.4s,v7.4s,v27.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + + add v19.4s,v7.4s,v28.4s + sha1h s22,s24 + sha1p q24,s21,v19.4s + + add v23.4s,v7.4s,v29.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + + add v25.4s,v25.4s,v21.4s + add v24.4s,v24.4s,v20.4s + + rev32 v24.16b, v24.16b + rev32 v25.16b, v25.16b + + st1 {v24.16b}, [x3],16 + st1 {v25.s}[0], [x3] + + mov x9,sp + add sp,sp,8*16 + ld1 {v8.16b - v11.16b},[x9],4*16 + ld1 {v12.16b - v15.16b},[x9] + + ret + +/* + * These are the short cases (less efficient), here used for 1-11 aes blocks. + * x10 = aes_blocks + */ +.Lshort_cases: + sub sp,sp,8*16 + mov x9,sp /* copy for address mode */ + st1 {v8.16b - v11.16b},[x9],4*16 + st1 {v12.16b - v15.16b},[x9] + + ld1 {v3.16b},[x5] /* get ivec */ + ld1 {v8.16b-v11.16b},[x2],64 /* rk[0-3] */ + ld1 {v12.16b-v15.16b},[x2],64 /* rk[4-7] */ + ld1 {v16.16b-v18.16b},[x2] /* rk[8-10] */ + adr x8,.Lrcon /* rcon */ + mov w15,0x80 /* sha padding word */ + + lsl x11,x10,4 /* len = aes_blocks*16 */ + + eor v26.16b,v26.16b,v26.16b /* zero sha src 0 */ + eor v27.16b,v27.16b,v27.16b /* zero sha src 1 */ + eor v28.16b,v28.16b,v28.16b /* zero sha src 2 */ + eor v29.16b,v29.16b,v29.16b /* zero sha src 3 */ + + mov x9,x8 /* top of rcon */ + + ld1 {v4.16b},[x9],16 /* key0 */ + ld1 {v5.16b},[x9],16 /* key1 */ + ld1 {v6.16b},[x9],16 /* key2 */ + ld1 {v7.16b},[x9],16 /* key3 */ +/* + * the idea in the short loop (at least 1) is to break out with the padding + * already in place excepting the final word. 
+ */ +.Lshort_loop: + /* read next aes block, update aes_ptr_in */ + ld1 {v0.16b},[x0],16 + eor v0.16b,v0.16b,v3.16b /* xor w/ prev value */ + +/* aes xform 0 */ + aese v0.16b,v8.16b + aesmc v0.16b,v0.16b + aese v0.16b,v9.16b + aesmc v0.16b,v0.16b + aese v0.16b,v10.16b + aesmc v0.16b,v0.16b + aese v0.16b,v11.16b + aesmc v0.16b,v0.16b + aese v0.16b,v12.16b + aesmc v0.16b,v0.16b + aese v0.16b,v13.16b + aesmc v0.16b,v0.16b + aese v0.16b,v14.16b + aesmc v0.16b,v0.16b + aese v0.16b,v15.16b + aesmc v0.16b,v0.16b + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + aese v0.16b,v17.16b + eor v0.16b,v0.16b,v18.16b + /* assume this was final block */ + mov v27.b[3],w15 + /* save aes res, bump aes_out_ptr */ + st1 {v0.16b},[x1],16 + /* load res to sha 0, endian swap */ + rev32 v26.16b,v0.16b + sub x10,x10,1 /* dec num_blocks */ + cbz x10,.Lpost_short_loop /* break if no more */ + /* read next aes block, update aes_ptr_in */ + ld1 {v1.16b},[x0],16 + eor v1.16b,v1.16b,v0.16b /* xor w/ prev value */ + +/* aes xform 1 */ + aese v1.16b,v8.16b + aesmc v1.16b,v1.16b + aese v1.16b,v9.16b + aesmc v1.16b,v1.16b + aese v1.16b,v10.16b + aesmc v1.16b,v1.16b + aese v1.16b,v11.16b + aesmc v1.16b,v1.16b + aese v1.16b,v12.16b + aesmc v1.16b,v1.16b + aese v1.16b,v13.16b + aesmc v1.16b,v1.16b + aese v1.16b,v14.16b + aesmc v1.16b,v1.16b + aese v1.16b,v15.16b + aesmc v1.16b,v1.16b + aese v1.16b,v16.16b + aesmc v1.16b,v1.16b + aese v1.16b,v17.16b + eor v1.16b,v1.16b,v18.16b + /* assume this was final block */ + mov v28.b[3],w15 + /* save aes res, bump aes_out_ptr */ + st1 {v1.16b},[x1],16 + /* load res to sha 0, endian swap */ + rev32 v27.16b,v1.16b + sub x10,x10,1 /* dec num_blocks */ + cbz x10,.Lpost_short_loop /* break if no more */ + /* read next aes block, update aes_ptr_in */ + ld1 {v2.16b},[x0],16 + eor v2.16b,v2.16b,v1.16b /* xor w/ prev value */ + +/* aes xform 2 */ + aese v2.16b,v8.16b + aesmc v2.16b,v2.16b + aese v2.16b,v9.16b + aesmc v2.16b,v2.16b + aese v2.16b,v10.16b + aesmc v2.16b,v2.16b + 
aese v2.16b,v11.16b + aesmc v2.16b,v2.16b + aese v2.16b,v12.16b + aesmc v2.16b,v2.16b + aese v2.16b,v13.16b + aesmc v2.16b,v2.16b + aese v2.16b,v14.16b + aesmc v2.16b,v2.16b + aese v2.16b,v15.16b + aesmc v2.16b,v2.16b + aese v2.16b,v16.16b + aesmc v2.16b,v2.16b + aese v2.16b,v17.16b + eor v2.16b,v2.16b,v18.16b + /* assume this was final block */ + mov v29.b[3],w15 + /* save aes res, bump aes_out_ptr */ + st1 {v2.16b},[x1],16 + /* load res to sha 0, endian swap */ + rev32 v28.16b,v2.16b + sub x10,x10,1 /* dec num_blocks */ + cbz x10,.Lpost_short_loop /* break if no more */ + /* read next aes block, update aes_ptr_in */ + ld1 {v3.16b},[x0],16 + eor v3.16b,v3.16b,v2.16b /* xor w/prev value */ + +/* aes xform 3 */ + aese v3.16b,v8.16b + aesmc v3.16b,v3.16b + aese v3.16b,v9.16b + aesmc v3.16b,v3.16b + aese v3.16b,v10.16b + aesmc v3.16b,v3.16b + aese v3.16b,v11.16b + aesmc v3.16b,v3.16b + aese v3.16b,v12.16b + aesmc v3.16b,v3.16b + aese v3.16b,v13.16b + aesmc v3.16b,v3.16b + aese v3.16b,v14.16b + aesmc v3.16b,v3.16b + aese v3.16b,v15.16b + aesmc v3.16b,v3.16b + aese v3.16b,v16.16b + aesmc v3.16b,v3.16b + aese v3.16b,v17.16b + eor v3.16b,v3.16b,v18.16b + /* load res to sha 0, endian swap */ + rev32 v29.16b,v3.16b + /* save aes res, bump aes_out_ptr */ + st1 {v3.16b},[x1],16 +/* + * now we have the sha1 to do for these 4 aes blocks + */ + mov v20.16b,v24.16b /* working ABCD <- ABCD */ + + add v19.4s,v4.4s,v26.4s + sha1h s22,s24 + sha1c q24,s25,v19.4s + sha1su0 v26.4s,v27.4s,v28.4s + sha1su1 v26.4s,v29.4s + + add v23.4s,v4.4s,v27.4s + sha1h s21,s24 + sha1c q24,s22,v23.4s + sha1su0 v27.4s,v28.4s,v29.4s + sha1su1 v27.4s,v26.4s + + add v19.4s,v4.4s,v28.4s + sha1h s22,s24 + sha1c q24,s21,v19.4s + sha1su0 v28.4s,v29.4s,v26.4s + sha1su1 v28.4s,v27.4s + + add v23.4s,v4.4s,v29.4s + sha1h s21,s24 + sha1c q24,s22,v23.4s + sha1su0 v29.4s,v26.4s,v27.4s + sha1su1 v29.4s,v28.4s + + add v19.4s,v4.4s,v26.4s + sha1h s22,s24 + sha1c q24,s21,v19.4s + sha1su0 v26.4s,v27.4s,v28.4s + sha1su1 
v26.4s,v29.4s + + add v23.4s,v5.4s,v27.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + sha1su0 v27.4s,v28.4s,v29.4s + sha1su1 v27.4s,v26.4s + + add v19.4s,v5.4s,v28.4s + sha1h s22,s24 + sha1p q24,s21,v19.4s + sha1su0 v28.4s,v29.4s,v26.4s + sha1su1 v28.4s,v27.4s + + add v23.4s,v5.4s,v29.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + sha1su0 v29.4s,v26.4s,v27.4s + sha1su1 v29.4s,v28.4s + + add v19.4s,v5.4s,v26.4s + sha1h s22,s24 + sha1p q24,s21,v19.4s + sha1su0 v26.4s,v27.4s,v28.4s + sha1su1 v26.4s,v29.4s + + add v23.4s,v5.4s,v27.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + sha1su0 v27.4s,v28.4s,v29.4s + sha1su1 v27.4s,v26.4s + + add v19.4s,v6.4s,v28.4s + sha1h s22,s24 + sha1m q24,s21,v19.4s + sha1su0 v28.4s,v29.4s,v26.4s + sha1su1 v28.4s,v27.4s + + add v23.4s,v6.4s,v29.4s + sha1h s21,s24 + sha1m q24,s22,v23.4s + sha1su0 v29.4s,v26.4s,v27.4s + sha1su1 v29.4s,v28.4s + + add v19.4s,v6.4s,v26.4s + sha1h s22,s24 + sha1m q24,s21,v19.4s + sha1su0 v26.4s,v27.4s,v28.4s + sha1su1 v26.4s,v29.4s + + add v23.4s,v6.4s,v27.4s + sha1h s21,s24 + sha1m q24,s22,v23.4s + sha1su0 v27.4s,v28.4s,v29.4s + sha1su1 v27.4s,v26.4s + + add v19.4s,v6.4s,v28.4s + sha1h s22,s24 + sha1m q24,s21,v19.4s + sha1su0 v28.4s,v29.4s,v26.4s + sha1su1 v28.4s,v27.4s + + add v23.4s,v7.4s,v29.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + sha1su0 v29.4s,v26.4s,v27.4s + sha1su1 v29.4s,v28.4s + + add v19.4s,v7.4s,v26.4s + sha1h s22,s24 + sha1p q24,s21,v19.4s + + add v23.4s,v7.4s,v27.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + + add v19.4s,v7.4s,v28.4s + sha1h s22,s24 + sha1p q24,s21,v19.4s + + add v23.4s,v7.4s,v29.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + + add v25.4s,v25.4s,v21.4s + add v24.4s,v24.4s,v20.4s + + eor v26.16b,v26.16b,v26.16b /* zero sha src 0 */ + eor v27.16b,v27.16b,v27.16b /* zero sha src 1 */ + eor v28.16b,v28.16b,v28.16b /* zero sha src 2 */ + eor v29.16b,v29.16b,v29.16b /* zero sha src 3 */ + /* assume this was final block */ + mov v26.b[3],w15 + + sub x10,x10,1 /* dec num_blocks */ + cbnz 
x10,.Lshort_loop /* keep looping if more */ +/* + * there are between 0 and 3 aes blocks in the final sha1 blocks + */ +.Lpost_short_loop: + /* Add one SHA-2 block since hash is calculated including i_key_pad */ + add x11, x11, #64 + lsr x12,x11,32 /* len_hi */ + and x13,x11,0xffffffff /* len_lo */ + lsl x12,x12,3 /* len_hi in bits */ + lsl x13,x13,3 /* len_lo in bits */ + + mov v29.s[3],w13 /* len_lo */ + mov v29.s[2],w12 /* len_hi */ + + /* do final block */ + mov v20.16b,v24.16b /* working ABCD <- ABCD */ + + add v19.4s,v4.4s,v26.4s + sha1h s22,s24 + sha1c q24,s25,v19.4s + sha1su0 v26.4s,v27.4s,v28.4s + sha1su1 v26.4s,v29.4s + + add v23.4s,v4.4s,v27.4s + sha1h s21,s24 + sha1c q24,s22,v23.4s + sha1su0 v27.4s,v28.4s,v29.4s + sha1su1 v27.4s,v26.4s + + add v19.4s,v4.4s,v28.4s + sha1h s22,s24 + sha1c q24,s21,v19.4s + sha1su0 v28.4s,v29.4s,v26.4s + sha1su1 v28.4s,v27.4s + + add v23.4s,v4.4s,v29.4s + sha1h s21,s24 + sha1c q24,s22,v23.4s + sha1su0 v29.4s,v26.4s,v27.4s + sha1su1 v29.4s,v28.4s + + add v19.4s,v4.4s,v26.4s + sha1h s22,s24 + sha1c q24,s21,v19.4s + sha1su0 v26.4s,v27.4s,v28.4s + sha1su1 v26.4s,v29.4s + + add v23.4s,v5.4s,v27.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + sha1su0 v27.4s,v28.4s,v29.4s + sha1su1 v27.4s,v26.4s + + add v19.4s,v5.4s,v28.4s + sha1h s22,s24 + sha1p q24,s21,v19.4s + sha1su0 v28.4s,v29.4s,v26.4s + sha1su1 v28.4s,v27.4s + + add v23.4s,v5.4s,v29.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + sha1su0 v29.4s,v26.4s,v27.4s + sha1su1 v29.4s,v28.4s + + add v19.4s,v5.4s,v26.4s + sha1h s22,s24 + sha1p q24,s21,v19.4s + sha1su0 v26.4s,v27.4s,v28.4s + sha1su1 v26.4s,v29.4s + + add v23.4s,v5.4s,v27.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + sha1su0 v27.4s,v28.4s,v29.4s + sha1su1 v27.4s,v26.4s + + add v19.4s,v6.4s,v28.4s + sha1h s22,s24 + sha1m q24,s21,v19.4s + sha1su0 v28.4s,v29.4s,v26.4s + sha1su1 v28.4s,v27.4s + + add v23.4s,v6.4s,v29.4s + sha1h s21,s24 + sha1m q24,s22,v23.4s + sha1su0 v29.4s,v26.4s,v27.4s + sha1su1 v29.4s,v28.4s + + add 
v19.4s,v6.4s,v26.4s + sha1h s22,s24 + sha1m q24,s21,v19.4s + sha1su0 v26.4s,v27.4s,v28.4s + sha1su1 v26.4s,v29.4s + + add v23.4s,v6.4s,v27.4s + sha1h s21,s24 + sha1m q24,s22,v23.4s + sha1su0 v27.4s,v28.4s,v29.4s + sha1su1 v27.4s,v26.4s + + add v19.4s,v6.4s,v28.4s + sha1h s22,s24 + sha1m q24,s21,v19.4s + sha1su0 v28.4s,v29.4s,v26.4s + sha1su1 v28.4s,v27.4s + + add v23.4s,v7.4s,v29.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + sha1su0 v29.4s,v26.4s,v27.4s + sha1su1 v29.4s,v28.4s + + add v19.4s,v7.4s,v26.4s + sha1h s22,s24 + sha1p q24,s21,v19.4s + + add v23.4s,v7.4s,v27.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + + add v19.4s,v7.4s,v28.4s + sha1h s22,s24 + sha1p q24,s21,v19.4s + + add v23.4s,v7.4s,v29.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + + add v26.4s,v24.4s,v20.4s + add v27.4s,v25.4s,v21.4s + + /* Calculate final HMAC */ + eor v28.16b, v28.16b, v28.16b + eor v29.16b, v29.16b, v29.16b + /* load o_key_pad partial hash */ + ld1 {v24.16b,v25.16b}, [x6] + + mov v20.16b,v24.16b /* working ABCD <- ABCD */ + + /* Set padding 1 to the first reg */ + mov w11, #0x80 /* that's the 1 of the pad */ + mov v27.b[7], w11 + + mov x11, #64+20 /* size of o_key_pad + inner hash */ + lsl x11, x11, 3 + /* move length to the end of the block */ + mov v29.s[3], w11 + lsr x11, x11, 32 + mov v29.s[2], w11 /* and the higher part */ + + add v19.4s,v4.4s,v26.4s + sha1h s22,s24 + sha1c q24,s25,v19.4s + sha1su0 v26.4s,v27.4s,v28.4s + sha1su1 v26.4s,v29.4s + + add v23.4s,v4.4s,v27.4s + sha1h s21,s24 + sha1c q24,s22,v23.4s + sha1su0 v27.4s,v28.4s,v29.4s + sha1su1 v27.4s,v26.4s + + add v19.4s,v4.4s,v28.4s + sha1h s22,s24 + sha1c q24,s21,v19.4s + sha1su0 v28.4s,v29.4s,v26.4s + sha1su1 v28.4s,v27.4s + + add v23.4s,v4.4s,v29.4s + sha1h s21,s24 + sha1c q24,s22,v23.4s + sha1su0 v29.4s,v26.4s,v27.4s + sha1su1 v29.4s,v28.4s + + add v19.4s,v4.4s,v26.4s + sha1h s22,s24 + sha1c q24,s21,v19.4s + sha1su0 v26.4s,v27.4s,v28.4s + sha1su1 v26.4s,v29.4s + + add v23.4s,v5.4s,v27.4s + sha1h s21,s24 + sha1p 
q24,s22,v23.4s + sha1su0 v27.4s,v28.4s,v29.4s + sha1su1 v27.4s,v26.4s + + add v19.4s,v5.4s,v28.4s + sha1h s22,s24 + sha1p q24,s21,v19.4s + sha1su0 v28.4s,v29.4s,v26.4s + sha1su1 v28.4s,v27.4s + + add v23.4s,v5.4s,v29.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + sha1su0 v29.4s,v26.4s,v27.4s + sha1su1 v29.4s,v28.4s + + add v19.4s,v5.4s,v26.4s + sha1h s22,s24 + sha1p q24,s21,v19.4s + sha1su0 v26.4s,v27.4s,v28.4s + sha1su1 v26.4s,v29.4s + + add v23.4s,v5.4s,v27.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + sha1su0 v27.4s,v28.4s,v29.4s + sha1su1 v27.4s,v26.4s + + add v19.4s,v6.4s,v28.4s + sha1h s22,s24 + sha1m q24,s21,v19.4s + sha1su0 v28.4s,v29.4s,v26.4s + sha1su1 v28.4s,v27.4s + + add v23.4s,v6.4s,v29.4s + sha1h s21,s24 + sha1m q24,s22,v23.4s + sha1su0 v29.4s,v26.4s,v27.4s + sha1su1 v29.4s,v28.4s + + add v19.4s,v6.4s,v26.4s + sha1h s22,s24 + sha1m q24,s21,v19.4s + sha1su0 v26.4s,v27.4s,v28.4s + sha1su1 v26.4s,v29.4s + + add v23.4s,v6.4s,v27.4s + sha1h s21,s24 + sha1m q24,s22,v23.4s + sha1su0 v27.4s,v28.4s,v29.4s + sha1su1 v27.4s,v26.4s + + add v19.4s,v6.4s,v28.4s + sha1h s22,s24 + sha1m q24,s21,v19.4s + sha1su0 v28.4s,v29.4s,v26.4s + sha1su1 v28.4s,v27.4s + + add v23.4s,v7.4s,v29.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + sha1su0 v29.4s,v26.4s,v27.4s + sha1su1 v29.4s,v28.4s + + add v19.4s,v7.4s,v26.4s + sha1h s22,s24 + sha1p q24,s21,v19.4s + + add v23.4s,v7.4s,v27.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + + add v19.4s,v7.4s,v28.4s + sha1h s22,s24 + sha1p q24,s21,v19.4s + + add v23.4s,v7.4s,v29.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + + add v25.4s,v25.4s,v21.4s + add v24.4s,v24.4s,v20.4s + + rev32 v24.16b, v24.16b + rev32 v25.16b, v25.16b + + st1 {v24.16b}, [x3],16 + st1 {v25.s}[0], [x3] + + mov x9,sp + add sp,sp,8*16 + ld1 {v8.16b - v11.16b},[x9],4*16 + ld1 {v12.16b - v15.16b},[x9] + + ret + + .size aes128cbc_sha1_hmac, .-aes128cbc_sha1_hmac diff --git a/drivers/crypto/armv8/asm/sha1_hmac_aes128cbc_dec.S b/drivers/crypto/armv8/asm/sha1_hmac_aes128cbc_dec.S new file 
mode 100644 index 0000000..a5a9e85 --- /dev/null +++ b/drivers/crypto/armv8/asm/sha1_hmac_aes128cbc_dec.S @@ -0,0 +1,1650 @@ +/* + * BSD LICENSE + * + * Copyright (C) Cavium networks Ltd. 2016. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Cavium networks nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "assym.s" + +/* + * Description: + * + * Combined Auth/Dec Primitive = sha1_hmac/aes128cbc + * + * Operations: + * + * out = decrypt-AES128CBC(in) + * return_hash_ptr = SHA1(o_key_pad | SHA1(i_key_pad | in)) + * + * Prototype: + * + * void sha1_hmac_aes128cbc_dec(uint8_t *csrc, uint8_t *cdst, + * uint8_t *dsrc, uint8_t *ddst, + * uint64_t len, crypto_arg_t *arg) + * + * Registers used: + * + * sha1_hmac_aes128cbc_dec( + * csrc, x0 (cipher src address) + * cdst, x1 (cipher dst address) + * dsrc, x2 (digest src address - ignored) + * ddst, x3 (digest dst address) + * len, x4 (length) + * arg x5 : + * arg->cipher.key (round keys) + * arg->cipher.iv (initialization vector) + * arg->digest.hmac.i_key_pad (partially hashed i_key_pad) + * arg->digest.hmac.o_key_pad (partially hashed o_key_pad) + * ) + * + * Routine register definitions: + * + * v0 - v3 -- aes results + * v4 - v7 -- round consts for sha + * v8 - v18 -- round keys + * v19 -- temp register for SHA1 + * v20 -- ABCD copy (q20) + * v21 -- sha working state (q21) + * v22 -- sha working state (q22) + * v23 -- temp register for SHA1 + * v24 -- sha state ABCD + * v25 -- sha state E + * v26 -- sha block 0 + * v27 -- sha block 1 + * v28 -- sha block 2 + * v29 -- sha block 3 + * v30 -- reserved + * v31 -- reserved + * + * + * Constraints: + * + * The variable "len" must be a multiple of 16, + * otherwise results are not defined. For AES partial blocks the user + * is required to pad the input to modulus 16 = 0. + * + * Short lengths are less optimized at < 16 AES blocks, + * however they are somewhat optimized, and more so than the enc/auth versions.
+ */ + .file "sha1_hmac_aes128cbc_dec.S" + .text + .cpu generic+fp+simd+crypto+crc + .global sha1_hmac_aes128cbc_dec + .type sha1_hmac_aes128cbc_dec,%function + + + .align 4 +.Lrcon: + .word 0x5a827999, 0x5a827999, 0x5a827999, 0x5a827999 + .word 0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1 + .word 0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc + .word 0xca62c1d6, 0xca62c1d6, 0xca62c1d6, 0xca62c1d6 + +sha1_hmac_aes128cbc_dec: +/* fetch args */ + ldr x6, [x5, #HMAC_IKEYPAD] + /* init ABCD, E */ + ld1 {v24.4s, v25.4s},[x6] + /* save pointer to o_key_pad partial hash */ + ldr x6, [x5, #HMAC_OKEYPAD] + + ldr x2, [x5, #CIPHER_KEY] + ldr x5, [x5, #CIPHER_IV] +/* + * init sha state, prefetch, check for small cases. + * Note that the output is prefetched as a load, for the in-place case + */ + prfm PLDL1KEEP,[x0,0] /* pref next *in */ + prfm PLDL1KEEP,[x1,0] /* pref next aes_ptr_out */ + lsr x10,x4,4 /* aes_blocks = len/16 */ + cmp x10,16 /* no main loop if <16 */ + blt .Lshort_cases /* branch if < 12 */ + +/* protect registers */ + sub sp,sp,8*16 + mov x11,x4 /* len -> x11 needed at end */ + mov x7,sp /* copy for address mode */ + ld1 {v30.16b},[x5] /* get 1st ivec */ + lsr x12,x11,6 /* total_blocks (sha) */ + mov x4,x0 /* sha_ptr_in = *in */ + ld1 {v26.16b},[x4],16 /* next w0 */ + ld1 {v27.16b},[x4],16 /* next w1 */ + ld1 {v28.16b},[x4],16 /* next w2 */ + ld1 {v29.16b},[x4],16 /* next w3 */ + +/* + * now we can do the loop prolog, 1st sha1 block + */ + prfm PLDL1KEEP,[x0,64] /* pref next aes_ptr_in */ + prfm PLDL1KEEP,[x1,64] /* pref next aes_ptr_out */ + /* base address for sha round consts */ + adr x8,.Lrcon +/* + * do the first sha1 block on the plaintext + */ + mov v20.16b,v24.16b /* init working ABCD */ + st1 {v8.16b},[x7],16 + st1 {v9.16b},[x7],16 + rev32 v26.16b,v26.16b /* endian swap w0 */ + st1 {v10.16b},[x7],16 + rev32 v27.16b,v27.16b /* endian swap w1 */ + st1 {v11.16b},[x7],16 + rev32 v28.16b,v28.16b /* endian swap w2 */ + st1 {v12.16b},[x7],16 + rev32 
v29.16b,v29.16b /* endian swap w3 */ + st1 {v13.16b},[x7],16 + mov x9,x8 /* top of rcon */ + ld1 {v4.16b},[x9],16 /* key0 */ + ld1 {v5.16b},[x9],16 /* key1 */ + ld1 {v6.16b},[x9],16 /* key2 */ + ld1 {v7.16b},[x9],16 /* key3 */ + add v19.4s,v4.4s,v26.4s + st1 {v14.16b},[x7],16 + add v23.4s,v4.4s,v27.4s + st1 {v15.16b},[x7],16 +/* quad 0 */ + sha1su0 v26.4s,v27.4s,v28.4s + sha1h s22,s24 + ld1 {v8.16b},[x2],16 /* rk[0] */ + sha1c q24,s25,v19.4s + sha1su1 v26.4s,v29.4s + ld1 {v9.16b},[x2],16 /* rk[1] */ + sha1su0 v27.4s,v28.4s,v29.4s + sha1h s21,s24 + add v19.4s,v4.4s,v28.4s + ld1 {v10.16b},[x2],16 /* rk[2] */ + sha1c q24,s22,v23.4s + sha1su1 v27.4s,v26.4s + add v23.4s,v4.4s,v29.4s + sha1su0 v28.4s,v29.4s,v26.4s + sha1h s22,s24 + ld1 {v11.16b},[x2],16 /* rk[3] */ + sha1c q24,s21,v19.4s + sha1su1 v28.4s,v27.4s + sha1su0 v29.4s,v26.4s,v27.4s + sha1h s21,s24 + sha1c q24,s22,v23.4s + add v19.4s,v4.4s,v26.4s + sha1su1 v29.4s,v28.4s + add v23.4s,v5.4s,v27.4s + sha1su0 v26.4s,v27.4s,v28.4s + sha1h s22,s24 + ld1 {v12.16b},[x2],16 /* rk[4] */ + sha1c q24,s21,v19.4s + add v19.4s,v5.4s,v28.4s + sha1su1 v26.4s,v29.4s + ld1 {v13.16b},[x2],16 /* rk[5] */ +/* quad 1 */ + sha1su0 v27.4s,v28.4s,v29.4s + sha1h s21,s24 + ld1 {v14.16b},[x2],16 /* rk[6] */ + sha1p q24,s22,v23.4s + sha1su1 v27.4s,v26.4s + add v23.4s,v5.4s,v29.4s + sha1su0 v28.4s,v29.4s,v26.4s + sha1h s22,s24 + ld1 {v15.16b},[x2],16 /* rk[7] */ + sha1p q24,s21,v19.4s + sha1su1 v28.4s,v27.4s + sha1su0 v29.4s,v26.4s,v27.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + add v19.4s,v5.4s,v26.4s + sha1su1 v29.4s,v28.4s + add v23.4s,v5.4s,v27.4s + sha1su0 v26.4s,v27.4s,v28.4s + sha1h s22,s24 + ld1 {v16.16b},[x2],16 /* rk[8] */ + sha1p q24,s21,v19.4s + sha1su1 v26.4s,v29.4s + ld1 {v17.16b},[x2],16 /* rk[9] */ + add v19.4s,v6.4s,v28.4s + sha1su0 v27.4s,v28.4s,v29.4s + sha1h s21,s24 + ld1 {v18.16b},[x2],16 /* rk[10] */ + sha1p q24,s22,v23.4s + sha1su1 v27.4s,v26.4s +/* quad 2 */ + add v23.4s,v6.4s,v29.4s + sha1su0 v28.4s,v29.4s,v26.4s + 
sha1h s22,s24 + sha1m q24,s21,v19.4s + sha1su1 v28.4s,v27.4s + add v19.4s,v6.4s,v26.4s + sha1su0 v29.4s,v26.4s,v27.4s + sha1h s21,s24 + sha1m q24,s22,v23.4s + sha1su1 v29.4s,v28.4s + add v23.4s,v6.4s,v27.4s + sha1su0 v26.4s,v27.4s,v28.4s + sha1h s22,s24 + sha1m q24,s21,v19.4s + add v19.4s,v6.4s,v28.4s + sha1su1 v26.4s,v29.4s + sha1su0 v27.4s,v28.4s,v29.4s + sha1h s21,s24 + sha1m q24,s22,v23.4s + add v23.4s,v7.4s,v29.4s + sha1su1 v27.4s,v26.4s + sha1su0 v28.4s,v29.4s,v26.4s + sha1h s22,s24 + sha1m q24,s21,v19.4s + sha1su1 v28.4s,v27.4s +/* quad 3 */ + sha1su0 v29.4s,v26.4s,v27.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + sha1su1 v29.4s,v28.4s + add v19.4s,v7.4s,v26.4s + sha1h s22,s24 + ld1 {v26.16b},[x4],16 /* next w0 */ + sha1p q24,s21,v19.4s + add v23.4s,v7.4s,v27.4s + sha1h s21,s24 + ld1 {v27.16b},[x4],16 /* next w1 */ + sha1p q24,s22,v23.4s + add v19.4s,v7.4s,v28.4s + sha1h s22,s24 + ld1 {v28.16b},[x4],16 /* next w2 */ + sha1p q24,s21,v19.4s + add v23.4s,v7.4s,v29.4s + sha1h s21,s24 + ld1 {v29.16b},[x4],16 /* next w3 */ + sha1p q24,s22,v23.4s + +/* + * aes_blocks_left := number after the main (sha) block is done. + * can be 0 note we account for the extra unwind in main_blocks + */ + sub x7,x12,2 /* main_blocks=total_blocks-5 */ + add v24.4s,v24.4s,v20.4s + and x13,x10,3 /* aes_blocks_left */ + ld1 {v0.16b},[x0] /* next aes block, no update */ + add v25.4s,v25.4s,v21.4s + add x2,x0,128 /* lead_ptr = *in */ + /* next aes block, update aes_ptr_in */ + ld1 {v31.16b},[x0],16 + +/* + * main combined loop CBC, can be used by auth/enc version + */ +.Lmain_loop: +/* + * Because both mov, rev32 and eor have a busy cycle, + * this takes longer than it looks. 
+ */ + rev32 v26.16b,v26.16b /* fix endian w0 */ + mov v20.16b,v24.16b /* working ABCD <- ABCD */ + prfm PLDL1KEEP,[x2,64] /* pref next lead_ptr */ + rev32 v27.16b,v27.16b /* fix endian w1 */ + /* pref next aes_ptr_out, streaming */ + prfm PLDL1KEEP,[x1,64] +/* aes xform 0, sha quad 0 */ + aesd v0.16b,v8.16b + rev32 v28.16b,v28.16b /* fix endian w2 */ + aesimc v0.16b,v0.16b + aesd v0.16b,v9.16b + add v19.4s,v4.4s,v26.4s + aesimc v0.16b,v0.16b + sha1su0 v26.4s,v27.4s,v28.4s + aesd v0.16b,v10.16b + sha1h s22,s24 + aesimc v0.16b,v0.16b + aesd v0.16b,v11.16b + add v23.4s,v4.4s,v27.4s + rev32 v29.16b,v29.16b /* fix endian w3 */ + /* read next aes block, no update */ + ld1 {v1.16b},[x0] + aesimc v0.16b,v0.16b + sha1c q24,s25,v19.4s + aesd v0.16b,v12.16b + sha1su1 v26.4s,v29.4s + aesimc v0.16b,v0.16b + sha1su0 v27.4s,v28.4s,v29.4s + aesd v0.16b,v13.16b + sha1h s21,s24 + add v19.4s,v4.4s,v28.4s + aesimc v0.16b,v0.16b + sha1c q24,s22,v23.4s + aesd v0.16b,v14.16b + add v23.4s,v4.4s,v29.4s + sha1su1 v27.4s,v26.4s + aesimc v0.16b,v0.16b + sha1su0 v28.4s,v29.4s,v26.4s + aesd v0.16b,v15.16b + sha1h s22,s24 + aesimc v0.16b,v0.16b + sha1c q24,s21,v19.4s + aesd v0.16b,v16.16b + sha1su1 v28.4s,v27.4s + sha1su0 v29.4s,v26.4s,v27.4s + aesimc v0.16b,v0.16b + sha1h s21,s24 + aesd v0.16b,v17.16b + sha1c q24,s22,v23.4s + add v19.4s,v4.4s,v26.4s + sha1su1 v29.4s,v28.4s + eor v0.16b,v0.16b,v18.16b /* final res 0 */ + eor v0.16b,v0.16b,v30.16b /* xor w/ prev value */ + /* get next aes block, with update */ + ld1 {v30.16b},[x0],16 + sha1h s22,s24 + sha1c q24,s21,v19.4s + sha1su0 v26.4s,v27.4s,v28.4s + add v23.4s,v5.4s,v27.4s + sha1su1 v26.4s,v29.4s +/* aes xform 1, sha quad 1 */ + sha1su0 v27.4s,v28.4s,v29.4s + /* save aes res, bump aes_out_ptr */ + st1 {v0.16b},[x1],16 + aesd v1.16b,v8.16b + sha1h s21,s24 + add v19.4s,v5.4s,v28.4s + sha1p q24,s22,v23.4s + aesimc v1.16b,v1.16b + sha1su1 v27.4s,v26.4s + aesd v1.16b,v9.16b + sha1su0 v28.4s,v29.4s,v26.4s + sha1h s22,s24 + aesimc v1.16b,v1.16b + 
sha1p q24,s21,v19.4s + aesd v1.16b,v10.16b + /* read next aes block, no update */ + ld1 {v2.16b},[x0] + add v23.4s,v5.4s,v29.4s + sha1su1 v28.4s,v27.4s + aesimc v1.16b,v1.16b + aesd v1.16b,v11.16b + sha1su0 v29.4s,v26.4s,v27.4s + aesimc v1.16b,v1.16b + sha1h s21,s24 + aesd v1.16b,v12.16b + sha1p q24,s22,v23.4s + sha1su1 v29.4s,v28.4s + aesimc v1.16b,v1.16b + aesd v1.16b,v13.16b + sha1h s22,s24 + add v19.4s,v5.4s,v26.4s + aesimc v1.16b,v1.16b + sha1p q24,s21,v19.4s + aesd v1.16b,v14.16b + sha1su0 v26.4s,v27.4s,v28.4s + sha1su1 v26.4s,v29.4s + aesimc v1.16b,v1.16b + add x2,x2,64 /* bump lead_ptr */ + aesd v1.16b,v15.16b + add v23.4s,v5.4s,v27.4s + sha1su0 v27.4s,v28.4s,v29.4s + sha1h s21,s24 + aesimc v1.16b,v1.16b + sha1p q24,s22,v23.4s + aesd v1.16b,v16.16b + sha1su1 v27.4s,v26.4s + add v19.4s,v6.4s,v28.4s + aesimc v1.16b,v1.16b + aesd v1.16b,v17.16b + add v23.4s,v6.4s,v29.4s + eor v1.16b,v1.16b,v18.16b /* res xf 1 */ + eor v1.16b,v1.16b,v31.16b /* mode op 1 xor w/prev value */ + /* read next aes block, update aes_ptr_in */ + ld1 {v31.16b},[x0],16 +/* aes xform 2, sha quad 2 */ + sha1su0 v28.4s,v29.4s,v26.4s + aesd v2.16b,v8.16b + /* save aes res, bump aes_out_ptr */ + st1 {v1.16b},[x1],16 + sha1h s22,s24 + aesimc v2.16b,v2.16b + sha1m q24,s21,v19.4s + aesd v2.16b,v9.16b + sha1su1 v28.4s,v27.4s + aesimc v2.16b,v2.16b + sha1su0 v29.4s,v26.4s,v27.4s + aesd v2.16b,v10.16b + sha1h s21,s24 + aesimc v2.16b,v2.16b + sha1m q24,s22,v23.4s + aesd v2.16b,v11.16b + sha1su1 v29.4s,v28.4s + add v19.4s,v6.4s,v26.4s + sha1su0 v26.4s,v27.4s,v28.4s + aesimc v2.16b,v2.16b + aesd v2.16b,v12.16b + sha1h s22,s24 + aesimc v2.16b,v2.16b + sha1m q24,s21,v19.4s + aesd v2.16b,v13.16b + sha1su1 v26.4s,v29.4s + add v23.4s,v6.4s,v27.4s + sha1su0 v27.4s,v28.4s,v29.4s + aesimc v2.16b,v2.16b + /* read next aes block, no update */ + ld1 {v3.16b},[x0] + aesd v2.16b,v14.16b + sha1h s21,s24 + aesimc v2.16b,v2.16b + sha1m q24,s22,v23.4s + aesd v2.16b,v15.16b + sha1su1 v27.4s,v26.4s + add 
v19.4s,v6.4s,v28.4s + aesimc v2.16b,v2.16b + sha1h s22,s24 + aesd v2.16b,v16.16b + sha1m q24,s21,v19.4s + aesimc v2.16b,v2.16b + sha1su0 v28.4s,v29.4s,v26.4s + aesd v2.16b,v17.16b + sha1su1 v28.4s,v27.4s + add v23.4s,v7.4s,v29.4s + eor v2.16b,v2.16b,v18.16b /* res 2 */ + add v19.4s,v7.4s,v26.4s + eor v2.16b,v2.16b,v30.16b /* mode of 2 xor w/prev value */ + /* read next aes block, update aes_ptr_in */ + ld1 {v30.16b},[x0],16 +/* aes xform 3, sha quad 3 */ + aesd v3.16b,v8.16b + aesimc v3.16b,v3.16b + /* save aes res, bump aes_out_ptr */ + st1 {v2.16b},[x1],16 + aesd v3.16b,v9.16b + sha1h s21,s24 + aesimc v3.16b,v3.16b + sha1p q24,s22,v23.4s + aesd v3.16b,v10.16b + sha1su0 v29.4s,v26.4s,v27.4s + aesimc v3.16b,v3.16b + sha1su1 v29.4s,v28.4s + aesd v3.16b,v11.16b + sha1h s22,s24 + ld1 {v26.16b},[x4],16 /* next w0 */ + aesimc v3.16b,v3.16b + sha1p q24,s21,v19.4s + aesd v3.16b,v12.16b + aesimc v3.16b,v3.16b + add v23.4s,v7.4s,v27.4s + aesd v3.16b,v13.16b + sha1h s21,s24 + ld1 {v27.16b},[x4],16 /* next w1 */ + aesimc v3.16b,v3.16b + sha1p q24,s22,v23.4s + aesd v3.16b,v14.16b + sub x7,x7,1 /* dec block count */ + aesimc v3.16b,v3.16b + add v19.4s,v7.4s,v28.4s + aesd v3.16b,v15.16b + ld1 {v0.16b},[x0] /* next aes block, no update */ + sha1h s22,s24 + ld1 {v28.16b},[x4],16 /* next w2 */ + aesimc v3.16b,v3.16b + sha1p q24,s21,v19.4s + aesd v3.16b,v16.16b + aesimc v3.16b,v3.16b + add v23.4s,v7.4s,v29.4s + aesd v3.16b,v17.16b + sha1h s21,s24 + ld1 {v29.16b},[x4],16 /* next w3 */ + sha1p q24,s22,v23.4s + add v24.4s,v24.4s,v20.4s + eor v3.16b,v3.16b,v18.16b /* aes res 3 */ + eor v3.16b,v3.16b,v31.16b /* xor w/ prev value */ + /* next aes block, update aes_ptr_in */ + ld1 {v31.16b},[x0],16 + add v25.4s,v25.4s,v21.4s + /* save aes res, bump aes_out_ptr */ + st1 {v3.16b},[x1],16 + /* loop if more to do */ + cbnz x7,.Lmain_loop +/* + * now the loop epilog. Since the reads for sha have already been done + * in advance, we have to have an extra unwind. 
+ * This is why the test for the short cases is 16 and not 12. + * + * the unwind, which is just the main loop without the tests or final reads. + */ + rev32 v26.16b,v26.16b /* fix endian w0 */ + mov v20.16b,v24.16b /* working ABCD <- ABCD */ + prfm PLDL1KEEP,[x2,64] /* pref next lead_ptr */ + rev32 v27.16b,v27.16b /* fix endian w1 */ + /* pref next aes_ptr_out, streaming */ + prfm PLDL1KEEP,[x1,64] +/* aes xform 0, sha quad 0 */ + aesd v0.16b,v8.16b + add v19.4s,v4.4s,v26.4s + rev32 v28.16b,v28.16b /* fix endian w2 */ + aesimc v0.16b,v0.16b + sha1su0 v26.4s,v27.4s,v28.4s + /* read next aes block, no update */ + ld1 {v1.16b},[x0] + aesd v0.16b,v9.16b + sha1h s22,s24 + aesimc v0.16b,v0.16b + aesd v0.16b,v10.16b + add v23.4s,v4.4s,v27.4s + aesimc v0.16b,v0.16b + sha1c q24,s25,v19.4s + aesd v0.16b,v11.16b + rev32 v29.16b,v29.16b /* fix endian w3 */ + aesimc v0.16b,v0.16b + sha1su1 v26.4s,v29.4s + aesd v0.16b,v12.16b + aesimc v0.16b,v0.16b + sha1su0 v27.4s,v28.4s,v29.4s + aesd v0.16b,v13.16b + sha1h s21,s24 + add v19.4s,v4.4s,v28.4s + aesimc v0.16b,v0.16b + sha1c q24,s22,v23.4s + aesd v0.16b,v14.16b + add v23.4s,v4.4s,v29.4s + sha1su1 v27.4s,v26.4s + aesimc v0.16b,v0.16b + sha1su0 v28.4s,v29.4s,v26.4s + aesd v0.16b,v15.16b + sha1h s22,s24 + aesimc v0.16b,v0.16b + sha1c q24,s21,v19.4s + aesd v0.16b,v16.16b + sha1su1 v28.4s,v27.4s + sha1su0 v29.4s,v26.4s,v27.4s + aesimc v0.16b,v0.16b + sha1h s21,s24 + aesd v0.16b,v17.16b + sha1c q24,s22,v23.4s + add v19.4s,v4.4s,v26.4s + sha1su1 v29.4s,v28.4s + eor v0.16b,v0.16b,v18.16b /* final res 0 */ + add v23.4s,v5.4s,v27.4s + eor v0.16b,v0.16b,v30.16b /* xor w/ prev value */ + /* read next aes block, update aes_ptr_in */ + ld1 {v30.16b},[x0],16 + sha1su0 v26.4s,v27.4s,v28.4s + sha1h s22,s24 + sha1c q24,s21,v19.4s + sha1su1 v26.4s,v29.4s +/* aes xform 1, sha quad 1 */ + /* save aes res, bump aes_out_ptr */ + st1 {v0.16b},[x1],16 + sha1su0 v27.4s,v28.4s,v29.4s + aesd v1.16b,v8.16b + sha1h s21,s24 + add v19.4s,v5.4s,v28.4s + aesimc 
v1.16b,v1.16b + sha1p q24,s22,v23.4s + aesd v1.16b,v9.16b + aesimc v1.16b,v1.16b + add v23.4s,v5.4s,v29.4s + sha1su1 v27.4s,v26.4s + aesd v1.16b,v10.16b + sha1su0 v28.4s,v29.4s,v26.4s + /* read next aes block, no update */ + ld1 {v2.16b},[x0] + aesimc v1.16b,v1.16b + sha1h s22,s24 + aesd v1.16b,v11.16b + sha1p q24,s21,v19.4s + aesimc v1.16b,v1.16b + aesd v1.16b,v12.16b + sha1su1 v28.4s,v27.4s + aesimc v1.16b,v1.16b + sha1su0 v29.4s,v26.4s,v27.4s + aesd v1.16b,v13.16b + sha1h s21,s24 + aesimc v1.16b,v1.16b + sha1p q24,s22,v23.4s + aesd v1.16b,v14.16b + add v19.4s,v5.4s,v26.4s + sha1su1 v29.4s,v28.4s + aesimc v1.16b,v1.16b + add x2,x2,64 /* bump lead_ptr */ + aesd v1.16b,v15.16b + add v23.4s,v5.4s,v27.4s + aesimc v1.16b,v1.16b + sha1su0 v26.4s,v27.4s,v28.4s + aesd v1.16b,v16.16b + sha1h s22,s24 + aesimc v1.16b,v1.16b + sha1p q24,s21,v19.4s + aesd v1.16b,v17.16b + add v19.4s,v6.4s,v28.4s + eor v1.16b,v1.16b,v18.16b /* res xf 1 */ + sha1su1 v26.4s,v29.4s + eor v1.16b,v1.16b,v31.16b /* mode op 1 xor w/prev value */ + sha1su0 v27.4s,v28.4s,v29.4s + /* read next aes block, update aes_ptr_in */ + ld1 {v31.16b},[x0],16 + sha1h s21,s24 + sha1p q24,s22,v23.4s + add v23.4s,v6.4s,v29.4s + sha1su1 v27.4s,v26.4s +/* mode op 2 */ +/* aes xform 2, sha quad 2 */ + aesd v2.16b,v8.16b + sha1su0 v28.4s,v29.4s,v26.4s + /* save aes res, bump aes_out_ptr */ + st1 {v1.16b},[x1],16 + aesimc v2.16b,v2.16b + aesd v2.16b,v9.16b + sha1h s22,s24 + aesimc v2.16b,v2.16b + sha1m q24,s21,v19.4s + aesd v2.16b,v10.16b + sha1su1 v28.4s,v27.4s + aesimc v2.16b,v2.16b + add v19.4s,v6.4s,v26.4s + aesd v2.16b,v11.16b + sha1su0 v29.4s,v26.4s,v27.4s + aesimc v2.16b,v2.16b + aesd v2.16b,v12.16b + sha1h s21,s24 + aesimc v2.16b,v2.16b + sha1m q24,s22,v23.4s + aesd v2.16b,v13.16b + sha1su1 v29.4s,v28.4s + aesimc v2.16b,v2.16b + /* read next aes block, no update */ + ld1 {v3.16b},[x0] + aesd v2.16b,v14.16b + add v23.4s,v6.4s,v27.4s + aesimc v2.16b,v2.16b + sha1su0 v26.4s,v27.4s,v28.4s + aesd v2.16b,v15.16b + sha1h 
s22,s24 + aesimc v2.16b,v2.16b + sha1m q24,s21,v19.4s + aesd v2.16b,v16.16b + add v19.4s,v6.4s,v28.4s + aesimc v2.16b,v2.16b + sha1su1 v26.4s,v29.4s + aesd v2.16b,v17.16b + eor v2.16b,v2.16b,v18.16b /* res 2 */ + eor v2.16b,v2.16b,v30.16b /* mode of 2 xor w/prev value */ + /* read next aes block, update aes_ptr_in */ + ld1 {v30.16b},[x0],16 + sha1su0 v27.4s,v28.4s,v29.4s + sha1h s21,s24 + sha1m q24,s22,v23.4s + add v23.4s,v7.4s,v29.4s + sha1su1 v27.4s,v26.4s + sha1su0 v28.4s,v29.4s,v26.4s + sha1h s22,s24 + sha1m q24,s21,v19.4s + sha1su1 v28.4s,v27.4s +/* mode op 3 */ +/* aes xform 3, sha quad 3 */ + aesd v3.16b,v8.16b + sha1su0 v29.4s,v26.4s,v27.4s + aesimc v3.16b,v3.16b + /* save aes res, bump aes_out_ptr */ + st1 {v2.16b},[x1],16 + aesd v3.16b,v9.16b + sha1h s21,s24 + aesimc v3.16b,v3.16b + sha1p q24,s22,v23.4s + aesd v3.16b,v10.16b + sha1su1 v29.4s,v28.4s + aesimc v3.16b,v3.16b + add v19.4s,v7.4s,v26.4s + aesd v3.16b,v11.16b + sha1h s22,s24 + aesimc v3.16b,v3.16b + sha1p q24,s21,v19.4s + aesd v3.16b,v12.16b + /* read first aes block, no bump */ + ld1 {v0.16b},[x0] + aesimc v3.16b,v3.16b + add v23.4s,v7.4s,v27.4s + aesd v3.16b,v13.16b + sha1h s21,s24 + aesimc v3.16b,v3.16b + sha1p q24,s22,v23.4s + add v19.4s,v7.4s,v28.4s + aesd v3.16b,v14.16b + sha1h s22,s24 + aesimc v3.16b,v3.16b + sha1p q24,s21,v19.4s + aesd v3.16b,v15.16b + add v23.4s,v7.4s,v29.4s + aesimc v3.16b,v3.16b + aesd v3.16b,v16.16b + sha1h s21,s24 + aesimc v3.16b,v3.16b + sha1p q24,s22,v23.4s + aesd v3.16b,v17.16b + eor v3.16b,v3.16b,v18.16b /* aes res 3 */ + eor v3.16b,v3.16b,v31.16b /* xor w/ prev value */ + /* read first aes block, bump aes_ptr_in */ + ld1 {v31.16b},[x0],16 + + add v25.4s,v25.4s,v21.4s + add v24.4s,v24.4s,v20.4s + +/* + * now we have to do the 4 aes blocks (b-2) that catch up to where sha is + */ + +/* aes xform 0 */ + aesd v0.16b,v8.16b + /* save aes res, bump aes_out_ptr */ + st1 {v3.16b},[x1],16 + aesimc v0.16b,v0.16b + aesd v0.16b,v9.16b + /* read next aes block, no update */ 
+ ld1 {v1.16b},[x0] + aesimc v0.16b,v0.16b + aesd v0.16b,v10.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v11.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v12.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v13.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v14.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v15.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v16.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v17.16b + eor v0.16b,v0.16b,v18.16b /* res 0 */ + eor v0.16b,v0.16b,v30.16b /* xor w/ ivec (modeop) */ + /* read next aes block, update aes_ptr_in */ + ld1 {v30.16b},[x0],16 + /* save aes res, bump aes_out_ptr */ + st1 {v0.16b},[x1],16 + +/* aes xform 1 */ + aesd v1.16b,v8.16b + /* read next aes block, no update */ + ld1 {v2.16b},[x0] + aesimc v1.16b,v1.16b + aesd v1.16b,v9.16b + aesimc v1.16b,v1.16b + aesd v1.16b,v10.16b + aesimc v1.16b,v1.16b + aesd v1.16b,v11.16b + aesimc v1.16b,v1.16b + aesd v1.16b,v12.16b + aesimc v1.16b,v1.16b + aesd v1.16b,v13.16b + aesimc v1.16b,v1.16b + aesd v1.16b,v14.16b + aesimc v1.16b,v1.16b + aesd v1.16b,v15.16b + aesimc v1.16b,v1.16b + aesd v1.16b,v16.16b + aesimc v1.16b,v1.16b + aesd v1.16b,v17.16b + eor v1.16b,v1.16b,v18.16b /* res 1 */ + eor v1.16b,v1.16b,v31.16b /* xor w/ ivec (modeop) */ + /* read next aes block, update aes_ptr_in */ + ld1 {v31.16b},[x0],16 + /* save aes res, bump aes_out_ptr */ + st1 {v1.16b},[x1],16 + +/* aes xform 2 */ + aesd v2.16b,v8.16b + /* read next aes block, no update */ + ld1 {v3.16b},[x0] + aesimc v2.16b,v2.16b + aesd v2.16b,v9.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v10.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v11.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v12.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v13.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v14.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v15.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v16.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v17.16b + eor v2.16b,v2.16b,v18.16b /* res 2 */ + eor v2.16b,v2.16b,v30.16b /* xor w/ ivec (modeop) */ + /* read next aes block, update aes_ptr_in */ + ld1 
{v30.16b},[x0],16 + /* save aes res, bump aes_out_ptr */ + st1 {v2.16b},[x1],16 + +/* aes xform 3 */ + aesd v3.16b,v8.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v9.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v10.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v11.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v12.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v13.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v14.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v15.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v16.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v17.16b + eor v3.16b,v3.16b,v18.16b /* res 3 */ + eor v3.16b,v3.16b,v31.16b /* xor w/ ivec (modeop) */ + /* save aes res, bump aes_out_ptr */ + st1 {v3.16b},[x1],16 +/* + * Now, there is the final b-1 sha1 padded block. + * This contains between 0-3 aes blocks. We take some pains to avoid read spill + * by only reading the blocks that are actually defined. + * this is also the final sha block code for the short_cases. + */ +.Ljoin_common: + mov w15,0x80 /* that's the 1 of the pad */ + cbnz x13,.Lpad100 /* branch if there is some real data */ + eor v26.16b,v26.16b,v26.16b /* zero the rest */ + eor v27.16b,v27.16b,v27.16b /* zero the rest */ + eor v28.16b,v28.16b,v28.16b /* zero the rest */ + eor v29.16b,v29.16b,v29.16b /* zero the rest */ + mov v26.b[0],w15 /* all data is bogus */ + b .Lpad_done /* go do rest */ + +.Lpad100: + sub x14,x13,1 /* dec amount left */ + ld1 {v26.16b},[x4],16 /* next w0 */ + cbnz x14,.Lpad200 /* branch if there is some real data */ + eor v27.16b,v27.16b,v27.16b /* zero the rest */ + eor v28.16b,v28.16b,v28.16b /* zero the rest */ + eor v29.16b,v29.16b,v29.16b /* zero the rest */ + mov v27.b[0],w15 /* all data is bogus */ + b .Lpad_done /* go do rest */ + +.Lpad200: + sub x14,x14,1 /* dec amount left */ + ld1 {v27.16b},[x4],16 /* next w1 */ + cbnz x14,.Lpad300 /* branch if there is some real data */ + eor v28.16b,v28.16b,v28.16b /* zero the rest */ + eor v29.16b,v29.16b,v29.16b /* zero the rest */ + mov v28.b[0],w15 /* all data is 
bogus */ + b .Lpad_done /* go do rest */ + +.Lpad300: + ld1 {v28.16b},[x4],16 /* next w2 */ + eor v29.16b,v29.16b,v29.16b /* zero the rest */ + mov v29.b[3],w15 /* all data is bogus */ + +.Lpad_done: + /* Add one SHA-1 block since hash is calculated including i_key_pad */ + add x11, x11, #64 + lsr x12,x11,32 /* len_hi */ + and x14,x11,0xffffffff /* len_lo */ + lsl x12,x12,3 /* len_hi in bits */ + lsl x14,x14,3 /* len_lo in bits */ + + mov v29.s[3],w14 /* len_lo */ + mov v29.s[2],w12 /* len_hi */ + + rev32 v26.16b,v26.16b /* fix endian w0 */ + rev32 v27.16b,v27.16b /* fix endian w1 */ + rev32 v28.16b,v28.16b /* fix endian w2 */ + + mov v20.16b,v24.16b /* working ABCD <- ABCD */ +/* + * final sha block + * the strategy is to combine the 0-3 aes blocks, which is faster but + * a little gourmand on code space. + */ + cbz x13,.Lzero_aes_blocks_left /* none to do */ + /* read first aes block, bump aes_ptr_in */ + ld1 {v0.16b},[x0] + ld1 {v31.16b},[x0],16 + + aesd v0.16b,v8.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v9.16b + aesimc v0.16b,v0.16b + add v19.4s,v4.4s,v26.4s + aesd v0.16b,v10.16b + add v23.4s,v4.4s,v27.4s + aesimc v0.16b,v0.16b + aesd v0.16b,v11.16b + sha1su0 v26.4s,v27.4s,v28.4s + aesimc v0.16b,v0.16b + sha1h s22,s24 + aesd v0.16b,v12.16b + sha1c q24,s25,v19.4s + sha1su1 v26.4s,v29.4s + aesimc v0.16b,v0.16b + sha1su0 v27.4s,v28.4s,v29.4s + aesd v0.16b,v13.16b + sha1h s21,s24 + aesimc v0.16b,v0.16b + sha1c q24,s22,v23.4s + aesd v0.16b,v14.16b + sha1su1 v27.4s,v26.4s + add v19.4s,v4.4s,v28.4s + sha1su0 v28.4s,v29.4s,v26.4s + aesimc v0.16b,v0.16b + sha1h s22,s24 + aesd v0.16b,v15.16b + sha1c q24,s21,v19.4s + aesimc v0.16b,v0.16b + sha1su1 v28.4s,v27.4s + add v23.4s,v4.4s,v29.4s + aesd v0.16b,v16.16b + sha1su0 v29.4s,v26.4s,v27.4s + sha1h s21,s24 + aesimc v0.16b,v0.16b + sha1c q24,s22,v23.4s + aesd v0.16b,v17.16b + sha1su1 v29.4s,v28.4s + eor v3.16b,v0.16b,v18.16b /* res 0 */ + eor v3.16b,v3.16b,v30.16b /* xor w/ ivec (modeop) */ + add v19.4s,v4.4s,v26.4s + sha1h 
s22,s24 + sha1c q24,s21,v19.4s + sha1su0 v26.4s,v27.4s,v28.4s + sha1su1 v26.4s,v29.4s + /* dec counter */ + sub x13,x13,1 + /* save aes res, bump aes_out_ptr */ + st1 {v3.16b},[x1],16 + cbz x13,.Lfrmquad1 + +/* aes xform 1 */ + /* read first aes block, bump aes_ptr_in */ + ld1 {v0.16b},[x0] + ld1 {v30.16b},[x0],16 + add v23.4s,v5.4s,v27.4s + aesd v0.16b,v8.16b + add v19.4s,v5.4s,v28.4s + aesimc v0.16b,v0.16b + aesd v0.16b,v9.16b + sha1su0 v27.4s,v28.4s,v29.4s + aesimc v0.16b,v0.16b + aesd v0.16b,v10.16b + sha1h s21,s24 + aesimc v0.16b,v0.16b + sha1p q24,s22,v23.4s + aesd v0.16b,v11.16b + sha1su1 v27.4s,v26.4s + aesimc v0.16b,v0.16b + sha1su0 v28.4s,v29.4s,v26.4s + aesd v0.16b,v12.16b + sha1h s22,s24 + aesimc v0.16b,v0.16b + sha1p q24,s21,v19.4s + aesd v0.16b,v13.16b + sha1su1 v28.4s,v27.4s + add v23.4s,v5.4s,v29.4s + aesimc v0.16b,v0.16b + sha1su0 v29.4s,v26.4s,v27.4s + aesd v0.16b,v14.16b + sha1h s21,s24 + aesimc v0.16b,v0.16b + sha1p q24,s22,v23.4s + aesd v0.16b,v15.16b + sha1su1 v29.4s,v28.4s + aesimc v0.16b,v0.16b + add v19.4s,v5.4s,v26.4s + sha1su0 v26.4s,v27.4s,v28.4s + aesd v0.16b,v16.16b + sha1h s22,s24 + aesimc v0.16b,v0.16b + sha1p q24,s21,v19.4s + aesd v0.16b,v17.16b + sha1su1 v26.4s,v29.4s + eor v3.16b,v0.16b,v18.16b /* res 0 */ + eor v3.16b,v3.16b,v31.16b /* xor w/ ivec (modeop) */ + add v23.4s,v5.4s,v27.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + sha1su0 v27.4s,v28.4s,v29.4s + sha1su1 v27.4s,v26.4s + + sub x13,x13,1 /* dec counter */ + /* save aes res, bump aes_out_ptr */ + st1 {v3.16b},[x1],16 + cbz x13,.Lfrmquad2 + +/* aes xform 2 */ + /* read first aes block, bump aes_ptr_in */ + ld1 {v0.16b},[x0],16 + add v19.4s,v6.4s,v28.4s + aesd v0.16b,v8.16b + add v23.4s,v6.4s,v29.4s + aesimc v0.16b,v0.16b + aesd v0.16b,v9.16b + sha1su0 v28.4s,v29.4s,v26.4s + aesimc v0.16b,v0.16b + sha1h s22,s24 + aesd v0.16b,v10.16b + sha1m q24,s21,v19.4s + aesimc v0.16b,v0.16b + sha1su1 v28.4s,v27.4s + aesd v0.16b,v11.16b + sha1su0 v29.4s,v26.4s,v27.4s + aesimc v0.16b,v0.16b 
+ sha1h s21,s24 + aesd v0.16b,v12.16b + sha1m q24,s22,v23.4s + aesimc v0.16b,v0.16b + sha1su1 v29.4s,v28.4s + aesd v0.16b,v13.16b + add v19.4s,v6.4s,v26.4s + sha1su0 v26.4s,v27.4s,v28.4s + aesimc v0.16b,v0.16b + aesd v0.16b,v14.16b + sha1h s22,s24 + aesimc v0.16b,v0.16b + sha1m q24,s21,v19.4s + aesd v0.16b,v15.16b + sha1su1 v26.4s,v29.4s + aesimc v0.16b,v0.16b + add v23.4s,v6.4s,v27.4s + aesd v0.16b,v16.16b + sha1su0 v27.4s,v28.4s,v29.4s + aesimc v0.16b,v0.16b + sha1h s21,s24 + aesd v0.16b,v17.16b + sha1m q24,s22,v23.4s + eor v3.16b,v0.16b,v18.16b /* res 0 */ + sha1su1 v27.4s,v26.4s + eor v3.16b,v3.16b,v30.16b /* xor w/ ivec (modeop) */ + add v19.4s,v6.4s,v28.4s + sha1h s22,s24 + sha1m q24,s21,v19.4s + sha1su0 v28.4s,v29.4s,v26.4s + sha1su1 v28.4s,v27.4s + /* save aes res, bump aes_out_ptr */ + st1 {v3.16b},[x1],16 + b .Lfrmquad3 +/* + * the final block with no aes component, i.e from here there were zero blocks + */ + +.Lzero_aes_blocks_left: + + add v19.4s,v4.4s,v26.4s + sha1h s22,s24 + sha1c q24,s25,v19.4s + sha1su0 v26.4s,v27.4s,v28.4s + sha1su1 v26.4s,v29.4s + + add v23.4s,v4.4s,v27.4s + sha1h s21,s24 + sha1c q24,s22,v23.4s + sha1su0 v27.4s,v28.4s,v29.4s + sha1su1 v27.4s,v26.4s + + add v19.4s,v4.4s,v28.4s + sha1h s22,s24 + sha1c q24,s21,v19.4s + sha1su0 v28.4s,v29.4s,v26.4s + sha1su1 v28.4s,v27.4s + + add v23.4s,v4.4s,v29.4s + sha1h s21,s24 + sha1c q24,s22,v23.4s + sha1su0 v29.4s,v26.4s,v27.4s + sha1su1 v29.4s,v28.4s + + add v19.4s,v4.4s,v26.4s + sha1h s22,s24 + sha1c q24,s21,v19.4s + sha1su0 v26.4s,v27.4s,v28.4s + sha1su1 v26.4s,v29.4s + +/* quad 1 */ +.Lfrmquad1: + add v23.4s,v5.4s,v27.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + sha1su0 v27.4s,v28.4s,v29.4s + sha1su1 v27.4s,v26.4s + + add v19.4s,v5.4s,v28.4s + sha1h s22,s24 + sha1p q24,s21,v19.4s + sha1su0 v28.4s,v29.4s,v26.4s + sha1su1 v28.4s,v27.4s + + add v23.4s,v5.4s,v29.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + sha1su0 v29.4s,v26.4s,v27.4s + sha1su1 v29.4s,v28.4s + + add v19.4s,v5.4s,v26.4s + sha1h 
s22,s24 + sha1p q24,s21,v19.4s + sha1su0 v26.4s,v27.4s,v28.4s + sha1su1 v26.4s,v29.4s + + add v23.4s,v5.4s,v27.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + sha1su0 v27.4s,v28.4s,v29.4s + sha1su1 v27.4s,v26.4s + +/* quad 2 */ +.Lfrmquad2: + add v19.4s,v6.4s,v28.4s + sha1h s22,s24 + sha1m q24,s21,v19.4s + sha1su0 v28.4s,v29.4s,v26.4s + sha1su1 v28.4s,v27.4s + + add v23.4s,v6.4s,v29.4s + sha1h s21,s24 + sha1m q24,s22,v23.4s + sha1su0 v29.4s,v26.4s,v27.4s + sha1su1 v29.4s,v28.4s + + add v19.4s,v6.4s,v26.4s + sha1h s22,s24 + sha1m q24,s21,v19.4s + sha1su0 v26.4s,v27.4s,v28.4s + sha1su1 v26.4s,v29.4s + + add v23.4s,v6.4s,v27.4s + sha1h s21,s24 + sha1m q24,s22,v23.4s + sha1su0 v27.4s,v28.4s,v29.4s + sha1su1 v27.4s,v26.4s + + add v19.4s,v6.4s,v28.4s + sha1h s22,s24 + sha1m q24,s21,v19.4s + sha1su0 v28.4s,v29.4s,v26.4s + sha1su1 v28.4s,v27.4s + +/* quad 3 */ +.Lfrmquad3: + add v23.4s,v7.4s,v29.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + sha1su0 v29.4s,v26.4s,v27.4s + sha1su1 v29.4s,v28.4s + + add v19.4s,v7.4s,v26.4s + sha1h s22,s24 + sha1p q24,s21,v19.4s + + add v23.4s,v7.4s,v27.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + + add v19.4s,v7.4s,v28.4s + sha1h s22,s24 + sha1p q24,s21,v19.4s + + add v23.4s,v7.4s,v29.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + + add v26.4s,v24.4s,v20.4s + add v27.4s,v25.4s,v21.4s + + /* Calculate final HMAC */ + eor v28.16b, v28.16b, v28.16b + eor v29.16b, v29.16b, v29.16b + /* load o_key_pad partial hash */ + ld1 {v24.16b,v25.16b}, [x6] + /* working ABCD <- ABCD */ + mov v20.16b,v24.16b + + /* Set padding 1 to the first reg */ + mov w11, #0x80 /* that's the 1 of the pad */ + mov v27.b[7], w11 + /* size of o_key_pad + inner hash */ + mov x11, #64+20 + /* move length to the end of the block */ + lsl x11, x11, 3 + mov v29.s[3], w11 + lsr x11, x11, 32 + mov v29.s[2], w11 /* and the higher part */ + + add v19.4s,v4.4s,v26.4s + sha1h s22,s24 + sha1c q24,s25,v19.4s + sha1su0 v26.4s,v27.4s,v28.4s + sha1su1 v26.4s,v29.4s + + add v23.4s,v4.4s,v27.4s + sha1h 
s21,s24 + sha1c q24,s22,v23.4s + sha1su0 v27.4s,v28.4s,v29.4s + sha1su1 v27.4s,v26.4s + + add v19.4s,v4.4s,v28.4s + sha1h s22,s24 + sha1c q24,s21,v19.4s + sha1su0 v28.4s,v29.4s,v26.4s + sha1su1 v28.4s,v27.4s + + add v23.4s,v4.4s,v29.4s + sha1h s21,s24 + sha1c q24,s22,v23.4s + sha1su0 v29.4s,v26.4s,v27.4s + sha1su1 v29.4s,v28.4s + + add v19.4s,v4.4s,v26.4s + sha1h s22,s24 + sha1c q24,s21,v19.4s + sha1su0 v26.4s,v27.4s,v28.4s + sha1su1 v26.4s,v29.4s + + add v23.4s,v5.4s,v27.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + sha1su0 v27.4s,v28.4s,v29.4s + sha1su1 v27.4s,v26.4s + + add v19.4s,v5.4s,v28.4s + sha1h s22,s24 + sha1p q24,s21,v19.4s + sha1su0 v28.4s,v29.4s,v26.4s + sha1su1 v28.4s,v27.4s + + add v23.4s,v5.4s,v29.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + sha1su0 v29.4s,v26.4s,v27.4s + sha1su1 v29.4s,v28.4s + + add v19.4s,v5.4s,v26.4s + sha1h s22,s24 + sha1p q24,s21,v19.4s + sha1su0 v26.4s,v27.4s,v28.4s + sha1su1 v26.4s,v29.4s + + add v23.4s,v5.4s,v27.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + sha1su0 v27.4s,v28.4s,v29.4s + sha1su1 v27.4s,v26.4s + + add v19.4s,v6.4s,v28.4s + sha1h s22,s24 + sha1m q24,s21,v19.4s + sha1su0 v28.4s,v29.4s,v26.4s + sha1su1 v28.4s,v27.4s + + add v23.4s,v6.4s,v29.4s + sha1h s21,s24 + sha1m q24,s22,v23.4s + sha1su0 v29.4s,v26.4s,v27.4s + sha1su1 v29.4s,v28.4s + + add v19.4s,v6.4s,v26.4s + sha1h s22,s24 + sha1m q24,s21,v19.4s + sha1su0 v26.4s,v27.4s,v28.4s + sha1su1 v26.4s,v29.4s + + add v23.4s,v6.4s,v27.4s + sha1h s21,s24 + sha1m q24,s22,v23.4s + sha1su0 v27.4s,v28.4s,v29.4s + sha1su1 v27.4s,v26.4s + + add v19.4s,v6.4s,v28.4s + sha1h s22,s24 + sha1m q24,s21,v19.4s + sha1su0 v28.4s,v29.4s,v26.4s + sha1su1 v28.4s,v27.4s + + add v23.4s,v7.4s,v29.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + sha1su0 v29.4s,v26.4s,v27.4s + sha1su1 v29.4s,v28.4s + + add v19.4s,v7.4s,v26.4s + sha1h s22,s24 + sha1p q24,s21,v19.4s + + add v23.4s,v7.4s,v27.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + + add v19.4s,v7.4s,v28.4s + sha1h s22,s24 + sha1p q24,s21,v19.4s + + 
add v23.4s,v7.4s,v29.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + + add v25.4s,v25.4s,v21.4s + add v24.4s,v24.4s,v20.4s + + rev32 v24.16b, v24.16b + rev32 v25.16b, v25.16b + + st1 {v24.16b}, [x3],16 + st1 {v25.s}[0], [x3] + + mov x9,sp + add sp,sp,8*16 + ld1 {v8.16b - v11.16b},[x9],4*16 + ld1 {v12.16b - v15.16b},[x9] + + ret + +/* + * These are the short cases (less efficient), here used for 1-11 aes blocks. + * x10 = aes_blocks + */ +.Lshort_cases: + sub sp,sp,8*16 + mov x9,sp /* copy for address mode */ + st1 {v8.16b - v11.16b},[x9],4*16 + st1 {v12.16b - v15.16b},[x9] + + ld1 {v30.16b},[x5] /* get ivec */ + ld1 {v8.16b-v11.16b},[x2],64 /* rk[0-3] */ + ld1 {v12.16b-v15.16b},[x2],64 /* rk[4-7] */ + ld1 {v16.16b-v18.16b},[x2] /* rk[8-10] */ + adr x8,.Lrcon /* rcon */ + lsl x11,x10,4 /* len = aes_blocks*16 */ + mov x4,x0 /* sha_ptr_in = in */ + + mov x9,x8 /* top of rcon */ + + ld1 {v4.16b},[x9],16 /* key0 */ + ld1 {v5.16b},[x9],16 /* key1 */ + ld1 {v6.16b},[x9],16 /* key2 */ + ld1 {v7.16b},[x9],16 /* key3 */ + +/* + * This loop does 4 at a time, so that at the end there is a final sha block + * and 0-3 aes blocks. Note that everything is done serially + * to avoid complication. 
+ */ +.Lshort_loop: + cmp x10,4 /* check if 4 or more */ + /* if less, bail to last block */ + blt .Llast_sha_block + + ld1 {v31.16b},[x4] /* next w no update */ + /* read next aes block, update aes_ptr_in */ + ld1 {v0.16b},[x4],16 + rev32 v26.16b,v0.16b /* endian swap for sha */ + add x0,x0,64 + +/* aes xform 0 */ + aesd v0.16b,v8.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v9.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v10.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v11.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v12.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v13.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v14.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v15.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v16.16b + aesimc v0.16b,v0.16b + aesd v0.16b,v17.16b + eor v0.16b,v0.16b,v18.16b + eor v0.16b,v0.16b,v30.16b /* xor w/ prev value */ + + ld1 {v30.16b},[x4] /* read no update */ + /* read next aes block, update aes_ptr_in */ + ld1 {v1.16b},[x4],16 + rev32 v27.16b,v1.16b /* endian swap for sha */ + /* save aes res, bump aes_out_ptr */ + st1 {v0.16b},[x1],16 + +/* aes xform 1 */ + aesd v1.16b,v8.16b + aesimc v1.16b,v1.16b + aesd v1.16b,v9.16b + aesimc v1.16b,v1.16b + aesd v1.16b,v10.16b + aesimc v1.16b,v1.16b + aesd v1.16b,v11.16b + aesimc v1.16b,v1.16b + aesd v1.16b,v12.16b + aesimc v1.16b,v1.16b + aesd v1.16b,v13.16b + aesimc v1.16b,v1.16b + aesd v1.16b,v14.16b + aesimc v1.16b,v1.16b + aesd v1.16b,v15.16b + aesimc v1.16b,v1.16b + aesd v1.16b,v16.16b + aesimc v1.16b,v1.16b + aesd v1.16b,v17.16b + eor v1.16b,v1.16b,v18.16b + eor v1.16b,v1.16b,v31.16b /* xor w/ prev value */ + + ld1 {v31.16b},[x4] /* read no update */ + /* read next aes block, update aes_ptr_in */ + ld1 {v2.16b},[x4],16 + rev32 v28.16b,v2.16b /* endian swap for sha */ + /* save aes res, bump aes_out_ptr */ + st1 {v1.16b},[x1],16 + +/* aes xform 2 */ + aesd v2.16b,v8.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v9.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v10.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v11.16b + aesimc v2.16b,v2.16b + 
aesd v2.16b,v12.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v13.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v14.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v15.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v16.16b + aesimc v2.16b,v2.16b + aesd v2.16b,v17.16b + eor v2.16b,v2.16b,v18.16b + eor v2.16b,v2.16b,v30.16b /* xor w/ prev value */ + + ld1 {v30.16b},[x4] /* read no update */ + /* read next aes block, update aes_ptr_in */ + ld1 {v3.16b},[x4],16 + rev32 v29.16b,v3.16b /* endian swap for sha */ + /* save aes res, bump aes_out_ptr */ + st1 {v2.16b},[x1],16 + +/* aes xform 3 */ + aesd v3.16b,v8.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v9.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v10.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v11.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v12.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v13.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v14.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v15.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v16.16b + aesimc v3.16b,v3.16b + aesd v3.16b,v17.16b + eor v3.16b,v3.16b,v18.16b + eor v3.16b,v3.16b,v31.16b /* xor w/ prev value */ +/* + * now we have the sha1 to do for these 4 aes blocks. Note that. 
+ */ + + mov v20.16b,v24.16b /* working ABCD <- ABCD */ + /* save aes res, bump aes_out_ptr */ + st1 {v3.16b},[x1],16 +/* quad 0 */ + add v19.4s,v4.4s,v26.4s + sha1h s22,s24 + sha1c q24,s25,v19.4s + sha1su0 v26.4s,v27.4s,v28.4s + sha1su1 v26.4s,v29.4s + + add v23.4s,v4.4s,v27.4s + sha1h s21,s24 + sha1c q24,s22,v23.4s + sha1su0 v27.4s,v28.4s,v29.4s + sha1su1 v27.4s,v26.4s + + add v19.4s,v4.4s,v28.4s + sha1h s22,s24 + sha1c q24,s21,v19.4s + sha1su0 v28.4s,v29.4s,v26.4s + sha1su1 v28.4s,v27.4s + + add v23.4s,v4.4s,v29.4s + sha1h s21,s24 + sha1c q24,s22,v23.4s + sha1su0 v29.4s,v26.4s,v27.4s + sha1su1 v29.4s,v28.4s + + add v19.4s,v4.4s,v26.4s + sha1h s22,s24 + sha1c q24,s21,v19.4s + sha1su0 v26.4s,v27.4s,v28.4s + sha1su1 v26.4s,v29.4s +/* quad 1 */ + add v23.4s,v5.4s,v27.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + sha1su0 v27.4s,v28.4s,v29.4s + sha1su1 v27.4s,v26.4s + + add v19.4s,v5.4s,v28.4s + sha1h s22,s24 + sha1p q24,s21,v19.4s + sha1su0 v28.4s,v29.4s,v26.4s + sha1su1 v28.4s,v27.4s + + add v23.4s,v5.4s,v29.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + sha1su0 v29.4s,v26.4s,v27.4s + sha1su1 v29.4s,v28.4s + + add v19.4s,v5.4s,v26.4s + sha1h s22,s24 + sha1p q24,s21,v19.4s + sha1su0 v26.4s,v27.4s,v28.4s + sha1su1 v26.4s,v29.4s + + add v23.4s,v5.4s,v27.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + sha1su0 v27.4s,v28.4s,v29.4s + sha1su1 v27.4s,v26.4s +/* quad 2 */ + add v19.4s,v6.4s,v28.4s + sha1h s22,s24 + sha1m q24,s21,v19.4s + sha1su0 v28.4s,v29.4s,v26.4s + sha1su1 v28.4s,v27.4s + + add v23.4s,v6.4s,v29.4s + sha1h s21,s24 + sha1m q24,s22,v23.4s + sha1su0 v29.4s,v26.4s,v27.4s + sha1su1 v29.4s,v28.4s + + add v19.4s,v6.4s,v26.4s + sha1h s22,s24 + sha1m q24,s21,v19.4s + sha1su0 v26.4s,v27.4s,v28.4s + sha1su1 v26.4s,v29.4s + + add v23.4s,v6.4s,v27.4s + sha1h s21,s24 + sha1m q24,s22,v23.4s + sha1su0 v27.4s,v28.4s,v29.4s + sha1su1 v27.4s,v26.4s + + add v19.4s,v6.4s,v28.4s + sha1h s22,s24 + sha1m q24,s21,v19.4s + sha1su0 v28.4s,v29.4s,v26.4s + sha1su1 v28.4s,v27.4s + + add 
v23.4s,v7.4s,v29.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + sha1su0 v29.4s,v26.4s,v27.4s + sha1su1 v29.4s,v28.4s +/* quad 3 */ + add v19.4s,v7.4s,v26.4s + sha1h s22,s24 + sha1p q24,s21,v19.4s + + add v23.4s,v7.4s,v27.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + + add v19.4s,v7.4s,v28.4s + sha1h s22,s24 + sha1p q24,s21,v19.4s + + add v23.4s,v7.4s,v29.4s + sha1h s21,s24 + sha1p q24,s22,v23.4s + + add v25.4s,v25.4s,v21.4s + add v24.4s,v24.4s,v20.4s + + sub x10,x10,4 /* 4 less */ + b .Lshort_loop /* keep looping */ +/* + * this is arranged so that we can join the common unwind code + * that does the last sha block and the final 0-3 aes blocks + */ +.Llast_sha_block: + mov x13,x10 /* copy aes blocks for common */ + b .Ljoin_common /* join common code */ + + .size sha1_hmac_aes128cbc_dec, .-sha1_hmac_aes128cbc_dec -- 1.9.1