Module Name:	src
Committed By:	christos
Date:		Wed May 31 19:35:31 UTC 2023
Modified Files:
	src/crypto/external/bsd/openssl/lib/libcrypto/arch/aarch64:
	    aes-gcm-armv8_64.S aesv8-armx.S
	src/crypto/external/bsd/openssl/lib/libcrypto/arch/arm:
	    aes-gcm-armv8_64.S bsaes-armv7.S
	src/crypto/external/bsd/openssl/lib/libcrypto/arch/powerpc:
	    ecp_nistp521-ppc64.S
	src/crypto/external/bsd/openssl/lib/libcrypto/arch/powerpc64:
	    ecp_nistp521-ppc64.S

Log Message:
regen

To generate a diff of this commit:
cvs rdiff -u -r1.2 -r1.3 \
    src/crypto/external/bsd/openssl/lib/libcrypto/arch/aarch64/aes-gcm-armv8_64.S
cvs rdiff -u -r1.5 -r1.6 \
    src/crypto/external/bsd/openssl/lib/libcrypto/arch/aarch64/aesv8-armx.S
cvs rdiff -u -r1.1 -r1.2 \
    src/crypto/external/bsd/openssl/lib/libcrypto/arch/arm/aes-gcm-armv8_64.S
cvs rdiff -u -r1.6 -r1.7 \
    src/crypto/external/bsd/openssl/lib/libcrypto/arch/arm/bsaes-armv7.S
cvs rdiff -u -r1.1 -r1.2 \
    src/crypto/external/bsd/openssl/lib/libcrypto/arch/powerpc/ecp_nistp521-ppc64.S
cvs rdiff -u -r1.1 -r1.2 \
    src/crypto/external/bsd/openssl/lib/libcrypto/arch/powerpc64/ecp_nistp521-ppc64.S

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.
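A small usage sketch, not part of the original mail: cvs rdiff writes a
unified diff to standard output, so the commands above can be redirected to a
file and applied to a local source tree with patch(1). The checkout location
and the -p strip level used below are assumptions; the paths inside the diff
start with "src/", so it is applied from the directory containing the src
checkout (adjust -p to match your layout).

	$ cvs rdiff -u -r1.2 -r1.3 \
	    src/crypto/external/bsd/openssl/lib/libcrypto/arch/aarch64/aes-gcm-armv8_64.S \
	    > aes-gcm-armv8_64.S.diff
	$ cd /usr && patch -p0 < aes-gcm-armv8_64.S.diff	# assumes the tree is in /usr/src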
Modified files: Index: src/crypto/external/bsd/openssl/lib/libcrypto/arch/aarch64/aes-gcm-armv8_64.S diff -u src/crypto/external/bsd/openssl/lib/libcrypto/arch/aarch64/aes-gcm-armv8_64.S:1.2 src/crypto/external/bsd/openssl/lib/libcrypto/arch/aarch64/aes-gcm-armv8_64.S:1.3 --- src/crypto/external/bsd/openssl/lib/libcrypto/arch/aarch64/aes-gcm-armv8_64.S:1.2 Wed May 10 21:31:54 2023 +++ src/crypto/external/bsd/openssl/lib/libcrypto/arch/aarch64/aes-gcm-armv8_64.S Wed May 31 15:35:31 2023 @@ -19,28 +19,36 @@ aes_gcm_enc_128_kernel: stp d14, d15, [sp, #96] ldp x10, x11, [x16] //ctr96_b64, ctr96_t32 +#ifdef __AARCH64EB__ + rev x10, x10 + rev x11, x11 +#endif ldp x13, x14, [x8, #160] //load rk10 - +#ifdef __AARCH64EB__ + ror x13, x13, #32 + ror x14, x14, #32 +#endif ld1 {v11.16b}, [x3] ext v11.16b, v11.16b, v11.16b, #8 rev64 v11.16b, v11.16b lsr x5, x1, #3 //byte_len mov x15, x5 - ldr q27, [x8, #144] //load rk9 + ld1 {v18.4s}, [x8], #16 //load rk0 add x4, x0, x1, lsr #3 //end_input_ptr sub x5, x5, #1 //byte_len - 1 lsr x12, x11, #32 ldr q15, [x3, #112] //load h4l | h4h +#ifndef __AARCH64EB__ ext v15.16b, v15.16b, v15.16b, #8 - +#endif fmov d1, x10 //CTR block 1 rev w12, w12 //rev_ctr32 add w12, w12, #1 //increment rev_ctr32 orr w11, w11, w11 - ldr q18, [x8, #0] //load rk0 + ld1 {v19.4s}, [x8], #16 //load rk1 rev w9, w12 //CTR block 1 add w12, w12, #1 //CTR block 1 @@ -60,30 +68,33 @@ aes_gcm_enc_128_kernel: rev w9, w12 //CTR block 3 orr x9, x11, x9, lsl #32 //CTR block 3 - ldr q19, [x8, #16] //load rk1 + ld1 {v20.4s}, [x8], #16 //load rk2 add w12, w12, #1 //CTR block 3 fmov v3.d[1], x9 //CTR block 3 ldr q14, [x3, #80] //load h3l | h3h +#ifndef __AARCH64EB__ ext v14.16b, v14.16b, v14.16b, #8 - +#endif aese v1.16b, v18.16b aesmc v1.16b, v1.16b //AES block 1 - round 0 - ldr q20, [x8, #32] //load rk2 + ld1 {v21.4s}, [x8], #16 //load rk3 aese v2.16b, v18.16b aesmc v2.16b, v2.16b //AES block 2 - round 0 ldr q12, [x3, #32] //load h1l | h1h +#ifndef __AARCH64EB__ ext v12.16b, v12.16b, v12.16b, #8 +#endif aese v0.16b, v18.16b aesmc v0.16b, v0.16b //AES block 0 - round 0 - ldr q26, [x8, #128] //load rk8 + ld1 {v22.4s}, [x8], #16 //load rk4 aese v3.16b, v18.16b aesmc v3.16b, v3.16b //AES block 3 - round 0 - ldr q21, [x8, #48] //load rk3 + ld1 {v23.4s}, [x8], #16 //load rk5 aese v2.16b, v19.16b aesmc v2.16b, v2.16b //AES block 2 - round 1 @@ -91,11 +102,11 @@ aes_gcm_enc_128_kernel: aese v0.16b, v19.16b aesmc v0.16b, v0.16b //AES block 0 - round 1 - ldr q24, [x8, #96] //load rk6 + ld1 {v24.4s}, [x8], #16 //load rk6 aese v1.16b, v19.16b aesmc v1.16b, v1.16b //AES block 1 - round 1 - ldr q25, [x8, #112] //load rk7 + ld1 {v25.4s}, [x8], #16 //load rk7 aese v3.16b, v19.16b aesmc v3.16b, v3.16b //AES block 3 - round 1 @@ -103,12 +114,14 @@ aes_gcm_enc_128_kernel: aese v0.16b, v20.16b aesmc v0.16b, v0.16b //AES block 0 - round 2 - ldr q23, [x8, #80] //load rk5 + ld1 {v26.4s}, [x8], #16 //load rk8 aese v1.16b, v20.16b aesmc v1.16b, v1.16b //AES block 1 - round 2 ldr q13, [x3, #64] //load h2l | h2h +#ifndef __AARCH64EB__ ext v13.16b, v13.16b, v13.16b, #8 +#endif aese v3.16b, v20.16b aesmc v3.16b, v3.16b //AES block 3 - round 2 @@ -125,7 +138,7 @@ aes_gcm_enc_128_kernel: aese v2.16b, v21.16b aesmc v2.16b, v2.16b //AES block 2 - round 3 - ldr q22, [x8, #64] //load rk4 + ld1 {v27.4s}, [x8], #16 //load rk9 aese v3.16b, v21.16b aesmc v3.16b, v3.16b //AES block 3 - round 3 @@ -208,13 +221,25 @@ aes_gcm_enc_128_kernel: b.ge .L128_enc_tail //handle tail ldp x6, x7, [x0, #0] //AES block 0 - load plaintext - +#ifdef 
__AARCH64EB__ + rev x6, x6 + rev x7, x7 +#endif ldp x21, x22, [x0, #32] //AES block 2 - load plaintext - +#ifdef __AARCH64EB__ + rev x21, x21 + rev x22, x22 +#endif ldp x19, x20, [x0, #16] //AES block 1 - load plaintext - +#ifdef __AARCH64EB__ + rev x19, x19 + rev x20, x20 +#endif ldp x23, x24, [x0, #48] //AES block 3 - load plaintext - +#ifdef __AARCH64EB__ + rev x23, x23 + rev x24, x24 +#endif eor x6, x6, x13 //AES block 0 - round 10 low eor x7, x7, x14 //AES block 0 - round 10 high @@ -279,6 +304,10 @@ aes_gcm_enc_128_kernel: .L128_enc_main_loop: //main loop start ldp x23, x24, [x0, #48] //AES block 4k+3 - load plaintext +#ifdef __AARCH64EB__ + rev x23, x23 + rev x24, x24 +#endif rev64 v4.16b, v4.16b //GHASH block 4k (only t0 is free) rev64 v6.16b, v6.16b //GHASH block 4k+2 (t0, t1, and t2 free) @@ -313,7 +342,10 @@ aes_gcm_enc_128_kernel: pmull2 v28.1q, v5.2d, v14.2d //GHASH block 4k+1 - high eor v31.8b, v31.8b, v6.8b //GHASH block 4k+2 - mid ldp x6, x7, [x0, #0] //AES block 4k+4 - load plaintext - +#ifdef __AARCH64EB__ + rev x6, x6 + rev x7, x7 +#endif aese v0.16b, v19.16b aesmc v0.16b, v0.16b //AES block 4k+4 - round 1 rev w9, w12 //CTR block 4k+8 @@ -395,7 +427,10 @@ aes_gcm_enc_128_kernel: aese v1.16b, v23.16b aesmc v1.16b, v1.16b //AES block 4k+5 - round 5 ldp x19, x20, [x0, #16] //AES block 4k+5 - load plaintext - +#ifdef __AARCH64EB__ + rev x19, x19 + rev x20, x20 +#endif aese v3.16b, v21.16b aesmc v3.16b, v3.16b //AES block 4k+7 - round 3 eor v10.16b, v10.16b, v31.16b //GHASH block 4k+2 - mid @@ -403,7 +438,10 @@ aes_gcm_enc_128_kernel: aese v0.16b, v23.16b aesmc v0.16b, v0.16b //AES block 4k+4 - round 5 ldp x21, x22, [x0, #32] //AES block 4k+6 - load plaintext - +#ifdef __AARCH64EB__ + rev x21, x21 + rev x22, x22 +#endif pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid eor v11.16b, v11.16b, v29.16b //GHASH block 4k+3 - low @@ -712,7 +750,10 @@ aes_gcm_enc_128_kernel: sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process ldp x6, x7, [x0], #16 //AES block 4k+4 - load plaintext - +#ifdef __AARCH64EB__ + rev x6, x6 + rev x7, x7 +#endif cmp x5, #48 ext v8.16b, v11.16b, v11.16b, #8 //prepare final partial tag @@ -750,7 +791,10 @@ aes_gcm_enc_128_kernel: st1 { v5.16b}, [x2], #16 //AES final-3 block - store result ldp x6, x7, [x0], #16 //AES final-2 block - load input low & high - +#ifdef __AARCH64EB__ + rev x6, x6 + rev x7, x7 +#endif rev64 v4.16b, v5.16b //GHASH final-3 block eor v4.16b, v4.16b, v8.16b //feed in partial tag @@ -779,7 +823,10 @@ aes_gcm_enc_128_kernel: rev64 v4.16b, v5.16b //GHASH final-2 block ldp x6, x7, [x0], #16 //AES final-1 block - load input low & high - +#ifdef __AARCH64EB__ + rev x6, x6 + rev x7, x7 +#endif eor v4.16b, v4.16b, v8.16b //feed in partial tag eor x6, x6, x13 //AES final-1 block - round 10 low @@ -813,7 +860,10 @@ aes_gcm_enc_128_kernel: rev64 v4.16b, v5.16b //GHASH final-1 block ldp x6, x7, [x0], #16 //AES final block - load input low & high - +#ifdef __AARCH64EB__ + rev x6, x6 + rev x7, x7 +#endif eor v4.16b, v4.16b, v8.16b //feed in partial tag eor x7, x7, x14 //AES final block - round 10 high @@ -876,9 +926,11 @@ aes_gcm_enc_128_kernel: ld1 { v18.16b}, [x2] //load existing bytes where the possibly partial last block is to be stored eor v8.8b, v8.8b, v4.8b //GHASH final block - mid - +#ifndef __AARCH64EB__ rev w9, w12 - +#else + mov w9, w12 +#endif pmull2 v20.1q, v4.2d, v12.2d //GHASH final block - high pmull v8.1q, v8.1d, v16.1d //GHASH final block - mid @@ -951,20 +1003,29 @@ aes_gcm_dec_128_kernel: lsr x5, x1, 
#3 //byte_len mov x15, x5 ldp x10, x11, [x16] //ctr96_b64, ctr96_t32 - +#ifdef __AARCH64EB__ + rev x10, x10 + rev x11, x11 +#endif + ldp x13, x14, [x8, #160] //load rk10 +#ifdef __AARCH64EB__ + ror x14, x14, 32 + ror x13, x13, 32 +#endif sub x5, x5, #1 //byte_len - 1 - ldr q18, [x8, #0] //load rk0 + ld1 {v18.4s}, [x8], #16 //load rk0 and x5, x5, #0xffffffffffffffc0 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail) ld1 { v0.16b}, [x16] //special case vector load initial counter so we can start first AES block as quickly as possible ldr q13, [x3, #64] //load h2l | h2h +#ifndef __AARCH64EB__ ext v13.16b, v13.16b, v13.16b, #8 - +#endif lsr x12, x11, #32 fmov d2, x10 //CTR block 2 - ldr q19, [x8, #16] //load rk1 + ld1 {v19.4s}, [x8], #16 //load rk1 orr w11, w11, w11 rev w12, w12 //rev_ctr32 @@ -976,7 +1037,7 @@ aes_gcm_dec_128_kernel: rev w9, w12 //CTR block 1 orr x9, x11, x9, lsl #32 //CTR block 1 - ldr q20, [x8, #32] //load rk2 + ld1 {v20.4s}, [x8], #16 //load rk2 add w12, w12, #1 //CTR block 1 fmov v1.d[1], x9 //CTR block 1 @@ -999,19 +1060,19 @@ aes_gcm_dec_128_kernel: aese v1.16b, v18.16b aesmc v1.16b, v1.16b //AES block 1 - round 0 - ldr q21, [x8, #48] //load rk3 + ld1 {v21.4s}, [x8], #16 //load rk3 aese v0.16b, v20.16b aesmc v0.16b, v0.16b //AES block 0 - round 2 - ldr q24, [x8, #96] //load rk6 + ld1 {v22.4s}, [x8], #16 //load rk4 aese v2.16b, v18.16b aesmc v2.16b, v2.16b //AES block 2 - round 0 - ldr q25, [x8, #112] //load rk7 + ld1 {v23.4s}, [x8], #16 //load rk5 aese v1.16b, v19.16b aesmc v1.16b, v1.16b //AES block 1 - round 1 - ldr q22, [x8, #64] //load rk4 + ld1 {v24.4s}, [x8], #16 //load rk6 aese v3.16b, v18.16b aesmc v3.16b, v3.16b //AES block 3 - round 0 @@ -1021,7 +1082,6 @@ aes_gcm_dec_128_kernel: aese v1.16b, v20.16b aesmc v1.16b, v1.16b //AES block 1 - round 2 - ldp x13, x14, [x8, #160] //load rk10 aese v3.16b, v19.16b aesmc v3.16b, v3.16b //AES block 3 - round 1 @@ -1031,7 +1091,7 @@ aes_gcm_dec_128_kernel: aese v0.16b, v21.16b aesmc v0.16b, v0.16b //AES block 0 - round 3 - ldr q23, [x8, #80] //load rk5 + ld1 {v25.4s}, [x8], #16 //load rk7 aese v1.16b, v21.16b aesmc v1.16b, v1.16b //AES block 1 - round 3 @@ -1041,7 +1101,7 @@ aes_gcm_dec_128_kernel: aese v2.16b, v20.16b aesmc v2.16b, v2.16b //AES block 2 - round 2 - ldr q27, [x8, #144] //load rk9 + ld1 {v26.4s}, [x8], #16 //load rk8 aese v1.16b, v22.16b aesmc v1.16b, v1.16b //AES block 1 - round 4 @@ -1052,11 +1112,12 @@ aes_gcm_dec_128_kernel: aese v2.16b, v21.16b aesmc v2.16b, v2.16b //AES block 2 - round 3 ldr q14, [x3, #80] //load h3l | h3h +#ifndef __AARCH64EB__ ext v14.16b, v14.16b, v14.16b, #8 - +#endif aese v0.16b, v22.16b aesmc v0.16b, v0.16b //AES block 0 - round 4 - ldr q26, [x8, #128] //load rk8 + ld1 {v27.4s}, [x8], #16 //load rk9 aese v1.16b, v23.16b aesmc v1.16b, v1.16b //AES block 1 - round 5 @@ -1073,8 +1134,9 @@ aes_gcm_dec_128_kernel: aese v2.16b, v23.16b aesmc v2.16b, v2.16b //AES block 2 - round 5 ldr q12, [x3, #32] //load h1l | h1h +#ifndef __AARCH64EB__ ext v12.16b, v12.16b, v12.16b, #8 - +#endif aese v3.16b, v23.16b aesmc v3.16b, v3.16b //AES block 3 - round 5 @@ -1092,7 +1154,9 @@ aes_gcm_dec_128_kernel: trn1 v8.2d, v12.2d, v13.2d //h2h | h1h ldr q15, [x3, #112] //load h4l | h4h +#ifndef __AARCH64EB__ ext v15.16b, v15.16b, v15.16b, #8 +#endif trn2 v16.2d, v12.2d, v13.2d //h2l | h1l add x5, x5, x0 @@ -1134,12 +1198,10 @@ aes_gcm_dec_128_kernel: eor v17.16b, v17.16b, v9.16b //h4k | h3k b.ge .L128_dec_tail //handle tail - ldr q5, [x0, #16] //AES block 1 - load 
ciphertext - - ldr q4, [x0, #0] //AES block 0 - load ciphertext + ld1 {v4.16b, v5.16b}, [x0], #32 //AES block 0 - load ciphertext; AES block 1 - load ciphertext eor v1.16b, v5.16b, v1.16b //AES block 1 - result - ldr q6, [x0, #32] //AES block 2 - load ciphertext + ld1 {v6.16b}, [x0], #16 //AES block 2 - load ciphertext eor v0.16b, v4.16b, v0.16b //AES block 0 - result rev64 v4.16b, v4.16b //GHASH block 0 @@ -1147,10 +1209,9 @@ aes_gcm_dec_128_kernel: orr x9, x11, x9, lsl #32 //CTR block 4 add w12, w12, #1 //CTR block 4 - ldr q7, [x0, #48] //AES block 3 - load ciphertext + ld1 {v7.16b}, [x0], #16 //AES block 3 - load ciphertext rev64 v5.16b, v5.16b //GHASH block 1 - add x0, x0, #64 //AES input_ptr update mov x19, v1.d[0] //AES block 1 - mov low mov x20, v1.d[1] //AES block 1 - mov high @@ -1165,7 +1226,9 @@ aes_gcm_dec_128_kernel: fmov v0.d[1], x9 //CTR block 4 rev w9, w12 //CTR block 5 eor x19, x19, x13 //AES block 1 - round 10 low - +#ifdef __AARCH64EB__ + rev x19, x19 +#endif fmov d1, x10 //CTR block 5 add w12, w12, #1 //CTR block 5 orr x9, x11, x9, lsl #32 //CTR block 5 @@ -1177,10 +1240,19 @@ aes_gcm_dec_128_kernel: orr x9, x11, x9, lsl #32 //CTR block 6 eor x20, x20, x14 //AES block 1 - round 10 high +#ifdef __AARCH64EB__ + rev x20, x20 +#endif eor x6, x6, x13 //AES block 0 - round 10 low +#ifdef __AARCH64EB__ + rev x6, x6 +#endif eor v2.16b, v6.16b, v2.16b //AES block 2 - result eor x7, x7, x14 //AES block 0 - round 10 high +#ifdef __AARCH64EB__ + rev x7, x7 +#endif stp x6, x7, [x2], #16 //AES block 0 - store result stp x19, x20, [x2], #16 //AES block 1 - store result @@ -1248,9 +1320,14 @@ aes_gcm_dec_128_kernel: aese v3.16b, v19.16b aesmc v3.16b, v3.16b //AES block 4k+7 - round 1 eor x23, x23, x13 //AES block 4k+3 - round 10 low - +#ifdef __AARCH64EB__ + rev x23, x23 +#endif pmull v30.1q, v30.1d, v17.1d //GHASH block 4k+1 - mid eor x22, x22, x14 //AES block 4k+2 - round 10 high +#ifdef __AARCH64EB__ + rev x22, x22 +#endif mov d31, v6.d[1] //GHASH block 4k+2 - mid aese v0.16b, v19.16b @@ -1288,7 +1365,9 @@ aes_gcm_dec_128_kernel: pmull2 v31.1q, v31.2d, v16.2d //GHASH block 4k+2 - mid eor x24, x24, x14 //AES block 4k+3 - round 10 high - +#ifdef __AARCH64EB__ + rev x24, x24 +#endif aese v2.16b, v20.16b aesmc v2.16b, v2.16b //AES block 4k+6 - round 2 eor v30.8b, v30.8b, v7.8b //GHASH block 4k+3 - mid @@ -1296,7 +1375,9 @@ aes_gcm_dec_128_kernel: aese v1.16b, v23.16b aesmc v1.16b, v1.16b //AES block 4k+5 - round 5 eor x21, x21, x13 //AES block 4k+2 - round 10 low - +#ifdef __AARCH64EB__ + rev x21, x21 +#endif aese v0.16b, v23.16b aesmc v0.16b, v0.16b //AES block 4k+4 - round 5 movi v8.8b, #0xc2 @@ -1318,7 +1399,7 @@ aes_gcm_dec_128_kernel: pmull v30.1q, v30.1d, v16.1d //GHASH block 4k+3 - mid eor v9.16b, v9.16b, v4.16b //GHASH block 4k+3 - high - ldr q4, [x0, #0] //AES block 4k+4 - load ciphertext + ld1 {v4.16b}, [x0], #16 //AES block 4k+3 - load ciphertext aese v1.16b, v25.16b aesmc v1.16b, v1.16b //AES block 4k+5 - round 7 @@ -1345,7 +1426,7 @@ aes_gcm_dec_128_kernel: rev w9, w12 //CTR block 4k+8 pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid - ldr q5, [x0, #16] //AES block 4k+5 - load ciphertext + ld1 {v5.16b}, [x0], #16 //AES block 4k+4 - load ciphertext ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment aese v0.16b, v27.16b //AES block 4k+4 - round 9 @@ -1363,7 +1444,7 @@ aes_gcm_dec_128_kernel: aese v3.16b, v23.16b aesmc v3.16b, v3.16b //AES block 4k+7 - round 5 - ldr q6, [x0, #32] //AES block 4k+6 - load ciphertext + ld1 {v6.16b}, [x0], #16 //AES block 
4k+5 - load ciphertext add w12, w12, #1 //CTR block 4k+8 eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid @@ -1371,11 +1452,10 @@ aes_gcm_dec_128_kernel: aese v2.16b, v25.16b aesmc v2.16b, v2.16b //AES block 4k+6 - round 7 - ldr q7, [x0, #48] //AES block 4k+3 - load ciphertext + ld1 {v7.16b}, [x0], #16 //AES block 4k+6 - load ciphertext aese v3.16b, v24.16b aesmc v3.16b, v3.16b //AES block 4k+7 - round 6 - add x0, x0, #64 //AES input_ptr update rev64 v5.16b, v5.16b //GHASH block 4k+5 eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid @@ -1400,11 +1480,15 @@ aes_gcm_dec_128_kernel: aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 4k+7 - round 8 eor x7, x7, x14 //AES block 4k+4 - round 10 high - +#ifdef __AARCH64EB__ + rev x7, x7 +#endif eor v11.16b, v11.16b, v8.16b //MODULO - fold into low mov x20, v1.d[1] //AES block 4k+5 - mov high eor x6, x6, x13 //AES block 4k+4 - round 10 low - +#ifdef __AARCH64EB__ + rev x6, x6 +#endif eor v2.16b, v6.16b, v2.16b //AES block 4k+6 - result mov x19, v1.d[0] //AES block 4k+5 - mov low add w12, w12, #1 //CTR block 4k+9 @@ -1421,9 +1505,15 @@ aes_gcm_dec_128_kernel: add w12, w12, #1 //CTR block 4k+10 eor x20, x20, x14 //AES block 4k+5 - round 10 high +#ifdef __AARCH64EB__ + rev x20, x20 +#endif stp x6, x7, [x2], #16 //AES block 4k+4 - store result eor x19, x19, x13 //AES block 4k+5 - round 10 low +#ifdef __AARCH64EB__ + rev x19, x19 +#endif stp x19, x20, [x2], #16 //AES block 4k+5 - store result orr x9, x11, x9, lsl #32 //CTR block 4k+10 @@ -1528,9 +1618,14 @@ aes_gcm_dec_128_kernel: aese v3.16b, v20.16b aesmc v3.16b, v3.16b //AES block 4k+7 - round 2 eor x23, x23, x13 //AES block 4k+3 - round 10 low - +#ifdef __AARCH64EB__ + rev x23, x23 +#endif pmull v30.1q, v30.1d, v16.1d //GHASH block 4k+3 - mid eor x21, x21, x13 //AES block 4k+2 - round 10 low +#ifdef __AARCH64EB__ + rev x21, x21 +#endif eor v11.16b, v11.16b, v29.16b //GHASH block 4k+3 - low aese v2.16b, v21.16b @@ -1603,7 +1698,9 @@ aes_gcm_dec_128_kernel: pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low eor x24, x24, x14 //AES block 4k+3 - round 10 high - +#ifdef __AARCH64EB__ + rev x24, x24 +#endif aese v2.16b, v25.16b aesmc v2.16b, v2.16b //AES block 4k+6 - round 7 ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment @@ -1621,7 +1718,9 @@ aes_gcm_dec_128_kernel: aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 4k+7 - round 8 eor x22, x22, x14 //AES block 4k+2 - round 10 high - +#ifdef __AARCH64EB__ + rev x22, x22 +#endif aese v0.16b, v27.16b //AES block 4k+4 - round 9 stp x21, x22, [x2], #16 //AES block 4k+2 - store result @@ -1645,9 +1744,14 @@ aes_gcm_dec_128_kernel: cmp x5, #48 eor x7, x7, x14 //AES block 4k+4 - round 10 high - +#ifdef __AARCH64EB__ + rev x7, x7 +#endif ext v8.16b, v11.16b, v11.16b, #8 //prepare final partial tag eor x6, x6, x13 //AES block 4k+4 - round 10 low +#ifdef __AARCH64EB__ + rev x6, x6 +#endif b.gt .L128_dec_blocks_more_than_3 mov v3.16b, v2.16b @@ -1691,9 +1795,14 @@ aes_gcm_dec_128_kernel: movi v8.8b, #0 //suppress further partial tag feed in eor x7, x7, x14 //AES final-2 block - round 10 high - +#ifdef __AARCH64EB__ + rev x7, x7 +#endif pmull v10.1q, v22.1d, v10.1d //GHASH final-3 block - mid eor x6, x6, x13 //AES final-2 block - round 10 low +#ifdef __AARCH64EB__ + rev x6, x6 +#endif .L128_dec_blocks_more_than_2: //blocks left > 2 rev64 v4.16b, v5.16b //GHASH final-2 block @@ -1719,12 +1828,18 @@ aes_gcm_dec_128_kernel: pmull v22.1q, v22.1d, v17.1d //GHASH final-2 block - mid eor x6, x6, x13 //AES final-1 block - round 10 
low +#ifdef __AARCH64EB__ + rev x6, x6 +#endif eor v11.16b, v11.16b, v21.16b //GHASH final-2 block - low eor v9.16b, v9.16b, v20.16b //GHASH final-2 block - high eor v10.16b, v10.16b, v22.16b //GHASH final-2 block - mid eor x7, x7, x14 //AES final-1 block - round 10 high +#ifdef __AARCH64EB__ + rev x7, x7 +#endif .L128_dec_blocks_more_than_1: //blocks left > 1 rev64 v4.16b, v5.16b //GHASH final-1 block @@ -1755,8 +1870,13 @@ aes_gcm_dec_128_kernel: eor v9.16b, v9.16b, v20.16b //GHASH final-1 block - high eor x7, x7, x14 //AES final block - round 10 high - +#ifdef __AARCH64EB__ + rev x7, x7 +#endif eor x6, x6, x13 //AES final block - round 10 low +#ifdef __AARCH64EB__ + rev x6, x6 +#endif eor v10.16b, v10.16b, v22.16b //GHASH final-1 block - mid .L128_dec_blocks_less_than_1: //blocks left <= 1 @@ -1802,7 +1922,11 @@ aes_gcm_dec_128_kernel: bic x4, x4, x9 //mask out low existing bytes and x6, x6, x9 +#ifndef __AARCH64EB__ rev w9, w12 +#else + mov w9, w12 +#endif eor v10.16b, v10.16b, v8.16b //GHASH final block - mid movi v8.8b, #0xc2 @@ -1869,18 +1993,26 @@ aes_gcm_enc_192_kernel: stp d14, d15, [sp, #96] ldp x10, x11, [x16] //ctr96_b64, ctr96_t32 +#ifdef __AARCH64EB__ + rev x10, x10 + rev x11, x11 +#endif + ldp x13, x14, [x8, #192] //load rk12 +#ifdef __AARCH64EB__ + ror x13, x13, #32 + ror x14, x14, #32 +#endif + ld1 {v18.4s}, [x8], #16 //load rk0 - ldr q23, [x8, #80] //load rk5 - - ldr q22, [x8, #64] //load rk4 + ld1 {v19.4s}, [x8], #16 //load rk1 - ldr q26, [x8, #128] //load rk8 + ld1 {v20.4s}, [x8], #16 //load rk2 lsr x12, x11, #32 - ldr q24, [x8, #96] //load rk6 + ld1 {v21.4s}, [x8], #16 //load rk3 orr w11, w11, w11 - ldr q25, [x8, #112] //load rk7 + ld1 {v22.4s}, [x8], #16 //load rk4 rev w12, w12 //rev_ctr32 add w12, w12, #1 //increment rev_ctr32 @@ -1904,15 +2036,13 @@ aes_gcm_enc_192_kernel: rev w9, w12 //CTR block 3 orr x9, x11, x9, lsl #32 //CTR block 3 - ldr q18, [x8, #0] //load rk0 + ld1 {v23.4s}, [x8], #16 //load rk5 fmov v3.d[1], x9 //CTR block 3 - ldr q21, [x8, #48] //load rk3 + ld1 {v24.4s}, [x8], #16 //load rk6 - ldp x13, x14, [x8, #192] //load rk12 - - ldr q19, [x8, #16] //load rk1 + ld1 {v25.4s}, [x8], #16 //load rk7 aese v0.16b, v18.16b aesmc v0.16b, v0.16b //AES block 0 - round 0 @@ -1922,35 +2052,38 @@ aes_gcm_enc_192_kernel: aese v3.16b, v18.16b aesmc v3.16b, v3.16b //AES block 3 - round 0 - ldr q29, [x8, #176] //load rk11 + ld1 {v26.4s}, [x8], #16 //load rk8 aese v1.16b, v18.16b aesmc v1.16b, v1.16b //AES block 1 - round 0 ldr q15, [x3, #112] //load h4l | h4h +#ifndef __AARCH64EB__ ext v15.16b, v15.16b, v15.16b, #8 - +#endif aese v2.16b, v18.16b aesmc v2.16b, v2.16b //AES block 2 - round 0 - ldr q20, [x8, #32] //load rk2 + ld1 {v27.4s}, [x8], #16 //load rk9 aese v0.16b, v19.16b aesmc v0.16b, v0.16b //AES block 0 - round 1 - ldr q28, [x8, #160] //load rk10 + ld1 {v28.4s}, [x8], #16 //load rk10 aese v1.16b, v19.16b aesmc v1.16b, v1.16b //AES block 1 - round 1 ldr q12, [x3, #32] //load h1l | h1h +#ifndef __AARCH64EB__ ext v12.16b, v12.16b, v12.16b, #8 - +#endif aese v2.16b, v19.16b aesmc v2.16b, v2.16b //AES block 2 - round 1 - ldr q27, [x8, #144] //load rk9 + ld1 {v29.4s}, [x8], #16 //load rk11 aese v3.16b, v19.16b aesmc v3.16b, v3.16b //AES block 3 - round 1 ldr q14, [x3, #80] //load h3l | h3h +#ifndef __AARCH64EB__ ext v14.16b, v14.16b, v14.16b, #8 - +#endif aese v0.16b, v20.16b aesmc v0.16b, v0.16b //AES block 0 - round 2 @@ -2007,8 +2140,9 @@ aes_gcm_enc_192_kernel: aese v2.16b, v24.16b aesmc v2.16b, v2.16b //AES block 2 - round 6 ldr q13, [x3, #64] //load h2l | h2h 
+#ifndef __AARCH64EB__ ext v13.16b, v13.16b, v13.16b, #8 - +#endif aese v1.16b, v24.16b aesmc v1.16b, v1.16b //AES block 1 - round 6 @@ -2088,13 +2222,26 @@ aes_gcm_enc_192_kernel: rev w9, w12 //CTR block 4 ldp x6, x7, [x0, #0] //AES block 0 - load plaintext - +#ifdef __AARCH64EB__ + rev x6, x6 + rev x7, x7 +#endif orr x9, x11, x9, lsl #32 //CTR block 4 ldp x21, x22, [x0, #32] //AES block 2 - load plaintext - +#ifdef __AARCH64EB__ + rev x21, x21 + rev x22, x22 +#endif ldp x23, x24, [x0, #48] //AES block 3 - load plaintext - +#ifdef __AARCH64EB__ + rev x23, x23 + rev x24, x24 +#endif ldp x19, x20, [x0, #16] //AES block 1 - load plaintext +#ifdef __AARCH64EB__ + rev x19, x19 + rev x20, x20 +#endif add x0, x0, #64 //AES input_ptr update cmp x0, x5 //check if we have <= 8 blocks @@ -2166,7 +2313,10 @@ aes_gcm_enc_192_kernel: aese v1.16b, v18.16b aesmc v1.16b, v1.16b //AES block 4k+5 - round 0 ldp x19, x20, [x0, #16] //AES block 4k+5 - load plaintext - +#ifdef __AARCH64EB__ + rev x19, x19 + rev x20, x20 +#endif ext v11.16b, v11.16b, v11.16b, #8 //PRE 0 fmov d3, x10 //CTR block 4k+3 rev64 v4.16b, v4.16b //GHASH block 4k (only t0 is free) @@ -2178,11 +2328,17 @@ aes_gcm_enc_192_kernel: pmull2 v30.1q, v5.2d, v14.2d //GHASH block 4k+1 - high rev64 v7.16b, v7.16b //GHASH block 4k+3 (t0, t1, t2 and t3 free) ldp x21, x22, [x0, #32] //AES block 4k+6 - load plaintext - +#ifdef __AARCH64EB__ + rev x21, x21 + rev x22, x22 +#endif aese v0.16b, v18.16b aesmc v0.16b, v0.16b //AES block 4k+4 - round 0 ldp x23, x24, [x0, #48] //AES block 4k+3 - load plaintext - +#ifdef __AARCH64EB__ + rev x23, x23 + rev x24, x24 +#endif pmull v31.1q, v5.1d, v14.1d //GHASH block 4k+1 - low eor v4.16b, v4.16b, v11.16b //PRE 1 @@ -2275,7 +2431,10 @@ aes_gcm_enc_192_kernel: aese v1.16b, v22.16b aesmc v1.16b, v1.16b //AES block 4k+5 - round 4 ldp x6, x7, [x0, #0] //AES block 4k+4 - load plaintext - +#ifdef __AARCH64EB__ + rev x6, x6 + rev x7, x7 +#endif aese v0.16b, v24.16b aesmc v0.16b, v0.16b //AES block 4k+4 - round 6 eor v11.16b, v11.16b, v8.16b //GHASH block 4k+2 - low @@ -2640,7 +2799,10 @@ aes_gcm_enc_192_kernel: sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process ldp x6, x7, [x0], #16 //AES block 4k+4 - load plaintext - +#ifdef __AARCH64EB__ + rev x6, x6 + rev x7, x7 +#endif eor x6, x6, x13 //AES block 4k+4 - round 12 low eor x7, x7, x14 //AES block 4k+4 - round 12 high @@ -2677,7 +2839,10 @@ aes_gcm_enc_192_kernel: st1 { v5.16b}, [x2], #16 //AES final-3 block - store result ldp x6, x7, [x0], #16 //AES final-2 block - load input low & high - +#ifdef __AARCH64EB__ + rev x6, x6 + rev x7, x7 +#endif rev64 v4.16b, v5.16b //GHASH final-3 block eor x6, x6, x13 //AES final-2 block - round 12 low @@ -2708,7 +2873,10 @@ aes_gcm_enc_192_kernel: rev64 v4.16b, v5.16b //GHASH final-2 block ldp x6, x7, [x0], #16 //AES final-1 block - load input low & high - +#ifdef __AARCH64EB__ + rev x6, x6 + rev x7, x7 +#endif eor v4.16b, v4.16b, v8.16b //feed in partial tag eor x7, x7, x14 //AES final-1 block - round 12 high @@ -2739,7 +2907,10 @@ aes_gcm_enc_192_kernel: st1 { v5.16b}, [x2], #16 //AES final-1 block - store result ldp x6, x7, [x0], #16 //AES final block - load input low & high - +#ifdef __AARCH64EB__ + rev x6, x6 + rev x7, x7 +#endif rev64 v4.16b, v5.16b //GHASH final-1 block eor x6, x6, x13 //AES final block - round 12 low @@ -2771,7 +2942,11 @@ aes_gcm_enc_192_kernel: .L192_enc_blocks_less_than_1: //blocks left <= 1 ld1 { v18.16b}, [x2] //load existing bytes where the possibly partial last block is to be stored 
+#ifndef __AARCH64EB__ rev w9, w12 +#else + mov w9, w12 +#endif and x1, x1, #127 //bit_length %= 128 sub x1, x1, #128 //bit_length -= 128 @@ -2876,14 +3051,22 @@ aes_gcm_dec_192_kernel: add x4, x0, x1, lsr #3 //end_input_ptr ldp x10, x11, [x16] //ctr96_b64, ctr96_t32 - +#ifdef __AARCH64EB__ + rev x10, x10 + rev x11, x11 +#endif + ldp x13, x14, [x8, #192] //load rk12 +#ifdef __AARCH64EB__ + ror x13, x13, #32 + ror x14, x14, #32 +#endif ld1 { v0.16b}, [x16] //special case vector load initial counter so we can start first AES block as quickly as possible - ldr q18, [x8, #0] //load rk0 + ld1 {v18.4s}, [x8], #16 //load rk0 lsr x5, x1, #3 //byte_len mov x15, x5 - ldr q20, [x8, #32] //load rk2 + ld1 {v19.4s}, [x8], #16 //load rk1 lsr x12, x11, #32 orr w11, w11, w11 @@ -2893,7 +3076,7 @@ aes_gcm_dec_192_kernel: fmov d1, x10 //CTR block 1 add w12, w12, #1 //increment rev_ctr32 - ldr q19, [x8, #16] //load rk1 + ld1 {v20.4s}, [x8], #16 //load rk2 aese v0.16b, v18.16b aesmc v0.16b, v0.16b //AES block 0 - round 0 @@ -2901,7 +3084,7 @@ aes_gcm_dec_192_kernel: add w12, w12, #1 //CTR block 1 orr x9, x11, x9, lsl #32 //CTR block 1 - ldr q21, [x8, #48] //load rk3 + ld1 {v21.4s}, [x8], #16 //load rk3 fmov v1.d[1], x9 //CTR block 1 rev w9, w12 //CTR block 2 @@ -2919,54 +3102,57 @@ aes_gcm_dec_192_kernel: fmov v3.d[1], x9 //CTR block 3 - ldr q26, [x8, #128] //load rk8 + ld1 {v22.4s}, [x8], #16 //load rk4 aese v0.16b, v20.16b aesmc v0.16b, v0.16b //AES block 0 - round 2 aese v2.16b, v18.16b aesmc v2.16b, v2.16b //AES block 2 - round 0 - ldr q29, [x8, #176] //load rk11 + ld1 {v23.4s}, [x8], #16 //load rk5 aese v1.16b, v18.16b aesmc v1.16b, v1.16b //AES block 1 - round 0 ldr q15, [x3, #112] //load h4l | h4h +#ifndef __AARCH64EB__ ext v15.16b, v15.16b, v15.16b, #8 - +#endif aese v3.16b, v18.16b aesmc v3.16b, v3.16b //AES block 3 - round 0 ldr q13, [x3, #64] //load h2l | h2h +#ifndef __AARCH64EB__ ext v13.16b, v13.16b, v13.16b, #8 - +#endif aese v2.16b, v19.16b aesmc v2.16b, v2.16b //AES block 2 - round 1 ldr q14, [x3, #80] //load h3l | h3h +#ifndef __AARCH64EB__ ext v14.16b, v14.16b, v14.16b, #8 - +#endif aese v1.16b, v19.16b aesmc v1.16b, v1.16b //AES block 1 - round 1 - ldp x13, x14, [x8, #192] //load rk12 aese v3.16b, v19.16b aesmc v3.16b, v3.16b //AES block 3 - round 1 ldr q12, [x3, #32] //load h1l | h1h +#ifndef __AARCH64EB__ ext v12.16b, v12.16b, v12.16b, #8 - +#endif aese v2.16b, v20.16b aesmc v2.16b, v2.16b //AES block 2 - round 2 - ldr q28, [x8, #160] //load rk10 + ld1 {v24.4s}, [x8], #16 //load rk6 aese v0.16b, v21.16b aesmc v0.16b, v0.16b //AES block 0 - round 3 - ldr q27, [x8, #144] //load rk9 + ld1 {v25.4s}, [x8], #16 //load rk7 aese v1.16b, v20.16b aesmc v1.16b, v1.16b //AES block 1 - round 2 - ldr q25, [x8, #112] //load rk7 + ld1 {v26.4s}, [x8], #16 //load rk8 aese v3.16b, v20.16b aesmc v3.16b, v3.16b //AES block 3 - round 2 - ldr q22, [x8, #64] //load rk4 + ld1 {v27.4s}, [x8], #16 //load rk9 aese v2.16b, v21.16b aesmc v2.16b, v2.16b //AES block 2 - round 3 @@ -2984,7 +3170,7 @@ aes_gcm_dec_192_kernel: aese v0.16b, v22.16b aesmc v0.16b, v0.16b //AES block 0 - round 4 - ldr q23, [x8, #80] //load rk5 + ld1 {v28.4s}, [x8], #16 //load rk10 aese v1.16b, v22.16b aesmc v1.16b, v1.16b //AES block 1 - round 4 @@ -2999,7 +3185,7 @@ aes_gcm_dec_192_kernel: aese v0.16b, v23.16b aesmc v0.16b, v0.16b //AES block 0 - round 5 - ldr q24, [x8, #96] //load rk6 + ld1 {v29.4s}, [x8], #16 //load rk11 aese v1.16b, v23.16b aesmc v1.16b, v1.16b //AES block 1 - round 5 @@ -3086,17 +3272,13 @@ aes_gcm_dec_192_kernel: aese 
v0.16b, v29.16b //AES block 0 - round 11 b.ge .L192_dec_tail //handle tail - ldr q5, [x0, #16] //AES block 1 - load ciphertext - - ldr q4, [x0, #0] //AES block 0 - load ciphertext + ld1 {v4.16b, v5.16b}, [x0], #32 //AES block 0,1 - load ciphertext eor v1.16b, v5.16b, v1.16b //AES block 1 - result eor v0.16b, v4.16b, v0.16b //AES block 0 - result rev w9, w12 //CTR block 4 - ldr q7, [x0, #48] //AES block 3 - load ciphertext - - ldr q6, [x0, #32] //AES block 2 - load ciphertext + ld1 {v6.16b, v7.16b}, [x0], #32 //AES block 2,3 - load ciphertext mov x19, v1.d[0] //AES block 1 - mov low @@ -3108,27 +3290,35 @@ aes_gcm_dec_192_kernel: mov x7, v0.d[1] //AES block 0 - mov high rev64 v4.16b, v4.16b //GHASH block 0 - add x0, x0, #64 //AES input_ptr update fmov d0, x10 //CTR block 4 rev64 v5.16b, v5.16b //GHASH block 1 cmp x0, x5 //check if we have <= 8 blocks eor x19, x19, x13 //AES block 1 - round 12 low +#ifdef __AARCH64EB__ + rev x19, x19 +#endif fmov v0.d[1], x9 //CTR block 4 rev w9, w12 //CTR block 5 orr x9, x11, x9, lsl #32 //CTR block 5 fmov d1, x10 //CTR block 5 eor x20, x20, x14 //AES block 1 - round 12 high - +#ifdef __AARCH64EB__ + rev x20, x20 +#endif add w12, w12, #1 //CTR block 5 fmov v1.d[1], x9 //CTR block 5 eor x6, x6, x13 //AES block 0 - round 12 low - +#ifdef __AARCH64EB__ + rev x6, x6 +#endif rev w9, w12 //CTR block 6 eor x7, x7, x14 //AES block 0 - round 12 high - +#ifdef __AARCH64EB__ + rev x7, x7 +#endif stp x6, x7, [x2], #16 //AES block 0 - store result orr x9, x11, x9, lsl #32 //CTR block 6 @@ -3191,7 +3381,9 @@ aes_gcm_dec_192_kernel: aese v0.16b, v20.16b aesmc v0.16b, v0.16b //AES block 4k+4 - round 2 eor x22, x22, x14 //AES block 4k+2 - round 12 high - +#ifdef __AARCH64EB__ + rev x22, x22 +#endif aese v2.16b, v19.16b aesmc v2.16b, v2.16b //AES block 4k+6 - round 1 eor v4.8b, v4.8b, v5.8b //GHASH block 4k+1 - mid @@ -3208,7 +3400,9 @@ aes_gcm_dec_192_kernel: pmull v4.1q, v4.1d, v17.1d //GHASH block 4k+1 - mid eor v11.16b, v11.16b, v31.16b //GHASH block 4k+1 - low eor x21, x21, x13 //AES block 4k+2 - round 12 low - +#ifdef __AARCH64EB__ + rev x21, x21 +#endif aese v1.16b, v22.16b aesmc v1.16b, v1.16b //AES block 4k+5 - round 4 @@ -3310,16 +3504,18 @@ aes_gcm_dec_192_kernel: aese v2.16b, v24.16b aesmc v2.16b, v2.16b //AES block 4k+6 - round 6 - ldr q6, [x0, #32] //AES block 4k+6 - load ciphertext + ld1 {v4.16b}, [x0], #16 //AES block 4k+4 - load ciphertext aese v3.16b, v24.16b aesmc v3.16b, v3.16b //AES block 4k+7 - round 6 eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid - ldr q7, [x0, #48] //AES block 4k+7 - load ciphertext + ld1 {v5.16b}, [x0], #16 //AES block 4k+5 - load ciphertext eor x23, x23, x13 //AES block 4k+3 - round 12 low - +#ifdef __AARCH64EB__ + rev x23, x23 +#endif aese v2.16b, v25.16b aesmc v2.16b, v2.16b //AES block 4k+6 - round 7 ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment @@ -3333,10 +3529,10 @@ aes_gcm_dec_192_kernel: aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 4k+6 - round 8 - ldr q4, [x0, #0] //AES block 4k+4 - load ciphertext + ld1 {v6.16b}, [x0], #16 //AES block 4k+6 - load ciphertext aese v1.16b, v29.16b //AES block 4k+5 - round 11 - ldr q5, [x0, #16] //AES block 4k+5 - load ciphertext + ld1 {v7.16b}, [x0], #16 //AES block 4k+7 - load ciphertext rev w9, w12 //CTR block 4k+8 aese v3.16b, v26.16b @@ -3347,11 +3543,13 @@ aes_gcm_dec_192_kernel: aesmc v2.16b, v2.16b //AES block 4k+6 - round 9 eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid - add x0, 
x0, #64 //AES input_ptr update cmp x0, x5 //.LOOP CONTROL eor v0.16b, v4.16b, v0.16b //AES block 4k+4 - result eor x24, x24, x14 //AES block 4k+3 - round 12 high +#ifdef __AARCH64EB__ + rev x24, x24 +#endif eor v1.16b, v5.16b, v1.16b //AES block 4k+5 - result aese v2.16b, v28.16b @@ -3384,18 +3582,28 @@ aes_gcm_dec_192_kernel: rev w9, w12 //CTR block 4k+9 eor x6, x6, x13 //AES block 4k+4 - round 12 low +#ifdef __AARCH64EB__ + rev x6, x6 +#endif orr x9, x11, x9, lsl #32 //CTR block 4k+9 eor v11.16b, v11.16b, v8.16b //MODULO - fold into low fmov d1, x10 //CTR block 4k+9 add w12, w12, #1 //CTR block 4k+9 eor x19, x19, x13 //AES block 4k+5 - round 12 low - +#ifdef __AARCH64EB__ + rev x19, x19 +#endif fmov v1.d[1], x9 //CTR block 4k+9 rev w9, w12 //CTR block 4k+10 eor x20, x20, x14 //AES block 4k+5 - round 12 high - +#ifdef __AARCH64EB__ + rev x20, x20 +#endif eor x7, x7, x14 //AES block 4k+4 - round 12 high +#ifdef __AARCH64EB__ + rev x7, x7 +#endif stp x6, x7, [x2], #16 //AES block 4k+4 - store result eor v11.16b, v11.16b, v10.16b //MODULO - fold into low @@ -3449,18 +3657,29 @@ aes_gcm_dec_192_kernel: pmull v31.1q, v5.1d, v14.1d //GHASH block 4k+1 - low eor x24, x24, x14 //AES block 4k+3 - round 12 high +#ifdef __AARCH64EB__ + rev x24, x24 +#endif fmov v3.d[1], x9 //CTR block 4k+7 aese v0.16b, v20.16b aesmc v0.16b, v0.16b //AES block 4k+4 - round 2 eor x21, x21, x13 //AES block 4k+2 - round 12 low - +#ifdef __AARCH64EB__ + rev x21, x21 +#endif pmull2 v30.1q, v5.2d, v14.2d //GHASH block 4k+1 - high eor x22, x22, x14 //AES block 4k+2 - round 12 high +#ifdef __AARCH64EB__ + rev x22, x22 +#endif eor v4.8b, v4.8b, v5.8b //GHASH block 4k+1 - mid pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid eor x23, x23, x13 //AES block 4k+3 - round 12 low +#ifdef __AARCH64EB__ + rev x23, x23 +#endif stp x21, x22, [x2], #16 //AES block 4k+2 - store result rev64 v7.16b, v7.16b //GHASH block 4k+3 @@ -3650,8 +3869,13 @@ aes_gcm_dec_192_kernel: cmp x5, #48 eor x7, x7, x14 //AES block 4k+4 - round 12 high - +#ifdef __AARCH64EB__ + rev x7, x7 +#endif eor x6, x6, x13 //AES block 4k+4 - round 12 low +#ifdef __AARCH64EB__ + rev x6, x6 +#endif b.gt .L192_dec_blocks_more_than_3 movi v11.8b, #0 @@ -3695,10 +3919,16 @@ aes_gcm_dec_192_kernel: pmull2 v9.1q, v4.2d, v15.2d //GHASH final-3 block - high eor x6, x6, x13 //AES final-2 block - round 12 low +#ifdef __AARCH64EB__ + rev x6, x6 +#endif movi v8.8b, #0 //suppress further partial tag feed in pmull v10.1q, v22.1d, v10.1d //GHASH final-3 block - mid eor x7, x7, x14 //AES final-2 block - round 12 high +#ifdef __AARCH64EB__ + rev x7, x7 +#endif .L192_dec_blocks_more_than_2: //blocks left > 2 rev64 v4.16b, v5.16b //GHASH final-2 block @@ -3728,8 +3958,13 @@ aes_gcm_dec_192_kernel: eor v9.16b, v9.16b, v20.16b //GHASH final-2 block - high eor x7, x7, x14 //AES final-1 block - round 12 high - +#ifdef __AARCH64EB__ + rev x7, x7 +#endif eor x6, x6, x13 //AES final-1 block - round 12 low +#ifdef __AARCH64EB__ + rev x6, x6 +#endif eor v10.16b, v10.16b, v22.16b //GHASH final-2 block - mid .L192_dec_blocks_more_than_1: //blocks left > 1 @@ -3760,9 +3995,13 @@ aes_gcm_dec_192_kernel: movi v8.8b, #0 //suppress further partial tag feed in eor v11.16b, v11.16b, v21.16b //GHASH final-1 block - low eor x7, x7, x14 //AES final block - round 12 high - +#ifdef __AARCH64EB__ + rev x7, x7 +#endif eor x6, x6, x13 //AES final block - round 12 low - +#ifdef __AARCH64EB__ + rev x6, x6 +#endif eor v10.16b, v10.16b, v22.16b //GHASH final-1 block - mid .L192_dec_blocks_less_than_1: //blocks left <= 1 
@@ -3789,8 +4028,11 @@ aes_gcm_dec_192_kernel: orr x6, x6, x4 mov v0.d[1], x10 - +#ifndef __AARCH64EB__ rev w9, w12 +#else + mov w9, w12 +#endif and v5.16b, v5.16b, v0.16b //possibly partial last block has zeroes in highest bits str w9, [x16, #12] //store the updated counter @@ -3878,14 +4120,22 @@ aes_gcm_enc_256_kernel: lsr x5, x1, #3 //byte_len mov x15, x5 ldp x10, x11, [x16] //ctr96_b64, ctr96_t32 - +#ifdef __AARCH64EB__ + rev x10, x10 + rev x11, x11 +#endif + ldp x13, x14, [x8, #224] //load rk14 +#ifdef __AARCH64EB__ + ror x13, x13, #32 + ror x14, x14, #32 +#endif ld1 { v0.16b}, [x16] //special case vector load initial counter so we can start first AES block as quickly as possible sub x5, x5, #1 //byte_len - 1 - ldr q18, [x8, #0] //load rk0 + ld1 {v18.4s}, [x8], #16 //load rk0 and x5, x5, #0xffffffffffffffc0 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail) - ldr q25, [x8, #112] //load rk7 + ld1 {v19.4s}, [x8], #16 //load rk1 add x5, x5, x0 lsr x12, x11, #32 @@ -3905,14 +4155,14 @@ aes_gcm_enc_256_kernel: orr x9, x11, x9, lsl #32 //CTR block 1 add w12, w12, #1 //CTR block 1 - ldr q19, [x8, #16] //load rk1 + ld1 {v20.4s}, [x8], #16 //load rk2 fmov v1.d[1], x9 //CTR block 1 rev w9, w12 //CTR block 2 add w12, w12, #1 //CTR block 2 orr x9, x11, x9, lsl #32 //CTR block 2 - ldr q20, [x8, #32] //load rk2 + ld1 {v21.4s}, [x8], #16 //load rk3 fmov v2.d[1], x9 //CTR block 2 rev w9, w12 //CTR block 3 @@ -3925,50 +4175,53 @@ aes_gcm_enc_256_kernel: aese v1.16b, v18.16b aesmc v1.16b, v1.16b //AES block 1 - round 0 - ldr q21, [x8, #48] //load rk3 + ld1 {v22.4s}, [x8], #16 //load rk4 aese v0.16b, v20.16b aesmc v0.16b, v0.16b //AES block 0 - round 2 - ldr q24, [x8, #96] //load rk6 + ld1 {v23.4s}, [x8], #16 //load rk5 aese v2.16b, v18.16b aesmc v2.16b, v2.16b //AES block 2 - round 0 - ldr q23, [x8, #80] //load rk5 + ld1 {v24.4s}, [x8], #16 //load rk6 aese v1.16b, v19.16b aesmc v1.16b, v1.16b //AES block 1 - round 1 ldr q14, [x3, #80] //load h3l | h3h +#ifndef __AARCH64EB__ ext v14.16b, v14.16b, v14.16b, #8 - +#endif aese v3.16b, v18.16b aesmc v3.16b, v3.16b //AES block 3 - round 0 - ldr q31, [x8, #208] //load rk13 + ld1 {v25.4s}, [x8], #16 //load rk7 aese v2.16b, v19.16b aesmc v2.16b, v2.16b //AES block 2 - round 1 - ldr q22, [x8, #64] //load rk4 + ld1 {v26.4s}, [x8], #16 //load rk8 aese v1.16b, v20.16b aesmc v1.16b, v1.16b //AES block 1 - round 2 ldr q13, [x3, #64] //load h2l | h2h +#ifndef __AARCH64EB__ ext v13.16b, v13.16b, v13.16b, #8 - +#endif aese v3.16b, v19.16b aesmc v3.16b, v3.16b //AES block 3 - round 1 - ldr q30, [x8, #192] //load rk12 + ld1 {v27.4s}, [x8], #16 //load rk9 aese v2.16b, v20.16b aesmc v2.16b, v2.16b //AES block 2 - round 2 ldr q15, [x3, #112] //load h4l | h4h +#ifndef __AARCH64EB__ ext v15.16b, v15.16b, v15.16b, #8 - +#endif aese v1.16b, v21.16b aesmc v1.16b, v1.16b //AES block 1 - round 3 - ldr q29, [x8, #176] //load rk11 + ld1 {v28.4s}, [x8], #16 //load rk10 aese v3.16b, v20.16b aesmc v3.16b, v3.16b //AES block 3 - round 2 - ldr q26, [x8, #128] //load rk8 + ld1 {v29.4s}, [x8], #16 //load rk11 aese v2.16b, v21.16b aesmc v2.16b, v2.16b //AES block 2 - round 3 @@ -3976,7 +4229,6 @@ aes_gcm_enc_256_kernel: aese v0.16b, v21.16b aesmc v0.16b, v0.16b //AES block 0 - round 3 - ldp x13, x14, [x8, #224] //load rk14 aese v3.16b, v21.16b aesmc v3.16b, v3.16b //AES block 3 - round 3 @@ -4014,16 +4266,17 @@ aes_gcm_enc_256_kernel: aese v3.16b, v24.16b aesmc v3.16b, v3.16b //AES block 3 - round 6 - ldr q27, [x8, #144] //load rk9 + ld1 {v30.4s}, [x8], 
#16 //load rk12 aese v0.16b, v24.16b aesmc v0.16b, v0.16b //AES block 0 - round 6 ldr q12, [x3, #32] //load h1l | h1h +#ifndef __AARCH64EB__ ext v12.16b, v12.16b, v12.16b, #8 - +#endif aese v2.16b, v24.16b aesmc v2.16b, v2.16b //AES block 2 - round 6 - ldr q28, [x8, #160] //load rk10 + ld1 {v31.4s}, [x8], #16 //load rk13 aese v1.16b, v25.16b aesmc v1.16b, v1.16b //AES block 1 - round 7 @@ -4112,13 +4365,26 @@ aes_gcm_enc_256_kernel: b.ge .L256_enc_tail //handle tail ldp x19, x20, [x0, #16] //AES block 1 - load plaintext - +#ifdef __AARCH64EB__ + rev x19, x19 + rev x20, x20 +#endif rev w9, w12 //CTR block 4 ldp x6, x7, [x0, #0] //AES block 0 - load plaintext - +#ifdef __AARCH64EB__ + rev x6, x6 + rev x7, x7 +#endif ldp x23, x24, [x0, #48] //AES block 3 - load plaintext - +#ifdef __AARCH64EB__ + rev x23, x23 + rev x24, x24 +#endif ldp x21, x22, [x0, #32] //AES block 2 - load plaintext +#ifdef __AARCH64EB__ + rev x21, x21 + rev x22, x22 +#endif add x0, x0, #64 //AES input_ptr update eor x19, x19, x13 //AES block 1 - round 14 low @@ -4201,11 +4467,17 @@ aes_gcm_enc_256_kernel: aese v1.16b, v19.16b aesmc v1.16b, v1.16b //AES block 4k+5 - round 1 ldp x23, x24, [x0, #48] //AES block 4k+7 - load plaintext - +#ifdef __AARCH64EB__ + rev x23, x23 + rev x24, x24 +#endif aese v2.16b, v19.16b aesmc v2.16b, v2.16b //AES block 4k+6 - round 1 ldp x21, x22, [x0, #32] //AES block 4k+6 - load plaintext - +#ifdef __AARCH64EB__ + rev x21, x21 + rev x22, x22 +#endif aese v0.16b, v20.16b aesmc v0.16b, v0.16b //AES block 4k+4 - round 2 eor v4.16b, v4.16b, v11.16b //PRE 1 @@ -4315,7 +4587,10 @@ aes_gcm_enc_256_kernel: aese v3.16b, v24.16b aesmc v3.16b, v3.16b //AES block 4k+7 - round 6 ldp x19, x20, [x0, #16] //AES block 4k+5 - load plaintext - +#ifdef __AARCH64EB__ + rev x19, x19 + rev x20, x20 +#endif aese v1.16b, v26.16b aesmc v1.16b, v1.16b //AES block 4k+5 - round 8 mov d4, v7.d[1] //GHASH block 4k+3 - mid @@ -4352,7 +4627,10 @@ aes_gcm_enc_256_kernel: aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 4k+6 - round 8 ldp x6, x7, [x0, #0] //AES block 4k+4 - load plaintext - +#ifdef __AARCH64EB__ + rev x6, x6 + rev x7, x7 +#endif aese v0.16b, v28.16b aesmc v0.16b, v0.16b //AES block 4k+4 - round 10 shl d8, d8, #56 //mod_constant @@ -4714,7 +4992,10 @@ aes_gcm_enc_256_kernel: ext v8.16b, v11.16b, v11.16b, #8 //prepare final partial tag sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process ldp x6, x7, [x0], #16 //AES block 4k+4 - load plaintext - +#ifdef __AARCH64EB__ + rev x6, x6 + rev x7, x7 +#endif eor x6, x6, x13 //AES block 4k+4 - round 14 low eor x7, x7, x14 //AES block 4k+4 - round 14 high @@ -4749,7 +5030,10 @@ aes_gcm_enc_256_kernel: st1 { v5.16b}, [x2], #16 //AES final-3 block - store result ldp x6, x7, [x0], #16 //AES final-2 block - load input low & high - +#ifdef __AARCH64EB__ + rev x6, x6 + rev x7, x7 +#endif rev64 v4.16b, v5.16b //GHASH final-3 block eor x6, x6, x13 //AES final-2 block - round 14 low @@ -4778,7 +5062,10 @@ aes_gcm_enc_256_kernel: st1 { v5.16b}, [x2], #16 //AES final-2 block - store result ldp x6, x7, [x0], #16 //AES final-1 block - load input low & high - +#ifdef __AARCH64EB__ + rev x6, x6 + rev x7, x7 +#endif rev64 v4.16b, v5.16b //GHASH final-2 block eor x6, x6, x13 //AES final-1 block - round 14 low @@ -4814,7 +5101,10 @@ aes_gcm_enc_256_kernel: rev64 v4.16b, v5.16b //GHASH final-1 block ldp x6, x7, [x0], #16 //AES final block - load input low & high - +#ifdef __AARCH64EB__ + rev x6, x6 + rev x7, x7 +#endif eor v4.16b, v4.16b, v8.16b //feed in partial tag movi 
v8.8b, #0 //suppress further partial tag feed in @@ -4875,7 +5165,11 @@ aes_gcm_enc_256_kernel: pmull2 v20.1q, v4.2d, v12.2d //GHASH final block - high mov d8, v4.d[1] //GHASH final block - mid +#ifndef __AARCH64EB__ rev w9, w12 +#else + mov w9, w12 +#endif pmull v21.1q, v4.1d, v12.1d //GHASH final block - low @@ -4949,21 +5243,29 @@ aes_gcm_dec_256_kernel: lsr x5, x1, #3 //byte_len mov x15, x5 ldp x10, x11, [x16] //ctr96_b64, ctr96_t32 - - ldr q26, [x8, #128] //load rk8 +#ifdef __AARCH64EB__ + rev x10, x10 + rev x11, x11 +#endif + ldp x13, x14, [x8, #224] //load rk14 +#ifdef __AARCH64EB__ + ror x14, x14, #32 + ror x13, x13, #32 +#endif + ld1 {v18.4s}, [x8], #16 //load rk0 sub x5, x5, #1 //byte_len - 1 - ldr q25, [x8, #112] //load rk7 + ld1 {v19.4s}, [x8], #16 //load rk1 and x5, x5, #0xffffffffffffffc0 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail) add x4, x0, x1, lsr #3 //end_input_ptr - ldr q24, [x8, #96] //load rk6 + ld1 {v20.4s}, [x8], #16 //load rk2 lsr x12, x11, #32 - ldr q23, [x8, #80] //load rk5 + ld1 {v21.4s}, [x8], #16 //load rk3 orr w11, w11, w11 - ldr q21, [x8, #48] //load rk3 + ld1 {v22.4s}, [x8], #16 //load rk4 add x5, x5, x0 rev w12, w12 //rev_ctr32 @@ -4988,39 +5290,44 @@ aes_gcm_dec_256_kernel: rev w9, w12 //CTR block 3 orr x9, x11, x9, lsl #32 //CTR block 3 - ldr q18, [x8, #0] //load rk0 + ld1 {v23.4s}, [x8], #16 //load rk5 fmov v3.d[1], x9 //CTR block 3 add w12, w12, #1 //CTR block 3 - ldr q22, [x8, #64] //load rk4 + ld1 {v24.4s}, [x8], #16 //load rk6 - ldr q31, [x8, #208] //load rk13 + ld1 {v25.4s}, [x8], #16 //load rk7 - ldr q19, [x8, #16] //load rk1 + ld1 {v26.4s}, [x8], #16 //load rk8 aese v0.16b, v18.16b aesmc v0.16b, v0.16b //AES block 0 - round 0 ldr q14, [x3, #80] //load h3l | h3h +#ifndef __AARCH64EB__ ext v14.16b, v14.16b, v14.16b, #8 +#endif aese v3.16b, v18.16b aesmc v3.16b, v3.16b //AES block 3 - round 0 ldr q15, [x3, #112] //load h4l | h4h +#ifndef __AARCH64EB__ ext v15.16b, v15.16b, v15.16b, #8 +#endif aese v1.16b, v18.16b aesmc v1.16b, v1.16b //AES block 1 - round 0 ldr q13, [x3, #64] //load h2l | h2h +#ifndef __AARCH64EB__ ext v13.16b, v13.16b, v13.16b, #8 +#endif aese v2.16b, v18.16b aesmc v2.16b, v2.16b //AES block 2 - round 0 - ldr q20, [x8, #32] //load rk2 + ld1 {v27.4s}, [x8], #16 //load rk9 aese v0.16b, v19.16b aesmc v0.16b, v0.16b //AES block 0 - round 1 - ldp x13, x14, [x8, #224] //load rk14 aese v1.16b, v19.16b aesmc v1.16b, v1.16b //AES block 1 - round 1 @@ -5030,20 +5337,21 @@ aes_gcm_dec_256_kernel: aese v2.16b, v19.16b aesmc v2.16b, v2.16b //AES block 2 - round 1 - ldr q27, [x8, #144] //load rk9 + ld1 {v28.4s}, [x8], #16 //load rk10 aese v3.16b, v19.16b aesmc v3.16b, v3.16b //AES block 3 - round 1 - ldr q30, [x8, #192] //load rk12 + ld1 {v29.4s}, [x8], #16 //load rk11 aese v0.16b, v20.16b aesmc v0.16b, v0.16b //AES block 0 - round 2 ldr q12, [x3, #32] //load h1l | h1h +#ifndef __AARCH64EB__ ext v12.16b, v12.16b, v12.16b, #8 - +#endif aese v2.16b, v20.16b aesmc v2.16b, v2.16b //AES block 2 - round 2 - ldr q28, [x8, #160] //load rk10 + ld1 {v30.4s}, [x8], #16 //load rk12 aese v3.16b, v20.16b aesmc v3.16b, v3.16b //AES block 3 - round 2 @@ -5126,7 +5434,7 @@ aes_gcm_dec_256_kernel: aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 2 - round 8 - ldr q29, [x8, #176] //load rk11 + ld1 {v31.4s}, [x8], #16 //load rk13 aese v1.16b, v27.16b aesmc v1.16b, v1.16b //AES block 1 - round 9 @@ -5191,9 +5499,7 @@ aes_gcm_dec_256_kernel: aese v0.16b, v31.16b //AES block 0 - round 13 b.ge .L256_dec_tail //handle 
tail - ldr q4, [x0, #0] //AES block 0 - load ciphertext - - ldr q5, [x0, #16] //AES block 1 - load ciphertext + ld1 {v4.16b, v5.16b}, [x0], #32 //AES block 0,1 - load ciphertext rev w9, w12 //CTR block 4 @@ -5201,7 +5507,7 @@ aes_gcm_dec_256_kernel: eor v1.16b, v5.16b, v1.16b //AES block 1 - result rev64 v5.16b, v5.16b //GHASH block 1 - ldr q7, [x0, #48] //AES block 3 - load ciphertext + ld1 {v6.16b}, [x0], #16 //AES block 2 - load ciphertext mov x7, v0.d[1] //AES block 0 - mov high @@ -5221,22 +5527,32 @@ aes_gcm_dec_256_kernel: orr x9, x11, x9, lsl #32 //CTR block 5 mov x20, v1.d[1] //AES block 1 - mov high eor x7, x7, x14 //AES block 0 - round 14 high - +#ifdef __AARCH64EB__ + rev x7, x7 +#endif eor x6, x6, x13 //AES block 0 - round 14 low +#ifdef __AARCH64EB__ + rev x6, x6 +#endif stp x6, x7, [x2], #16 //AES block 0 - store result fmov d1, x10 //CTR block 5 - ldr q6, [x0, #32] //AES block 2 - load ciphertext - add x0, x0, #64 //AES input_ptr update + ld1 {v7.16b}, [x0], #16 //AES block 3 - load ciphertext fmov v1.d[1], x9 //CTR block 5 rev w9, w12 //CTR block 6 add w12, w12, #1 //CTR block 6 eor x19, x19, x13 //AES block 1 - round 14 low +#ifdef __AARCH64EB__ + rev x19, x19 +#endif orr x9, x11, x9, lsl #32 //CTR block 6 eor x20, x20, x14 //AES block 1 - round 14 high +#ifdef __AARCH64EB__ + rev x20, x20 +#endif stp x19, x20, [x2], #16 //AES block 1 - store result eor v2.16b, v6.16b, v2.16b //AES block 2 - result @@ -5287,7 +5603,9 @@ aes_gcm_dec_256_kernel: aese v0.16b, v21.16b aesmc v0.16b, v0.16b //AES block 4k+4 - round 3 eor x22, x22, x14 //AES block 4k+2 - round 14 high - +#ifdef __AARCH64EB__ + rev x22, x22 +#endif aese v2.16b, v19.16b aesmc v2.16b, v2.16b //AES block 4k+6 - round 1 mov d10, v17.d[1] //GHASH block 4k - mid @@ -5299,7 +5617,9 @@ aes_gcm_dec_256_kernel: aese v3.16b, v18.16b aesmc v3.16b, v3.16b //AES block 4k+7 - round 0 eor x21, x21, x13 //AES block 4k+2 - round 14 low - +#ifdef __AARCH64EB__ + rev x21, x21 +#endif aese v2.16b, v20.16b aesmc v2.16b, v2.16b //AES block 4k+6 - round 2 stp x21, x22, [x2], #16 //AES block 4k+2 - store result @@ -5314,9 +5634,14 @@ aes_gcm_dec_256_kernel: pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid eor x23, x23, x13 //AES block 4k+3 - round 14 low - +#ifdef __AARCH64EB__ + rev x23, x23 +#endif pmull v8.1q, v5.1d, v14.1d //GHASH block 4k+1 - low eor x24, x24, x14 //AES block 4k+3 - round 14 high +#ifdef __AARCH64EB__ + rev x24, x24 +#endif eor v9.16b, v9.16b, v4.16b //GHASH block 4k+1 - high aese v2.16b, v22.16b @@ -5437,7 +5762,7 @@ aes_gcm_dec_256_kernel: aese v1.16b, v27.16b aesmc v1.16b, v1.16b //AES block 4k+5 - round 9 - ldr q4, [x0, #0] //AES block 4k+4 - load ciphertext + ld1 {v4.16b}, [x0], #16 //AES block 4k+4 - load ciphertext aese v0.16b, v31.16b //AES block 4k+4 - round 13 ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment @@ -5448,7 +5773,7 @@ aes_gcm_dec_256_kernel: aese v2.16b, v27.16b aesmc v2.16b, v2.16b //AES block 4k+6 - round 9 - ldr q5, [x0, #16] //AES block 4k+5 - load ciphertext + ld1 {v5.16b}, [x0], #16 //AES block 4k+5 - load ciphertext aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 4k+7 - round 8 @@ -5464,11 +5789,11 @@ aes_gcm_dec_256_kernel: aese v3.16b, v27.16b aesmc v3.16b, v3.16b //AES block 4k+7 - round 9 - ldr q7, [x0, #48] //AES block 4k+7 - load ciphertext + ld1 {v6.16b}, [x0], #16 //AES block 4k+6 - load ciphertext aese v1.16b, v30.16b aesmc v1.16b, v1.16b //AES block 4k+5 - round 12 - ldr q6, [x0, #32] //AES block 4k+6 - load ciphertext + ld1 {v7.16b}, [x0], #16 //AES block 
4k+7 - load ciphertext aese v2.16b, v29.16b aesmc v2.16b, v2.16b //AES block 4k+6 - round 11 @@ -5479,7 +5804,6 @@ aes_gcm_dec_256_kernel: eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid aese v1.16b, v31.16b //AES block 4k+5 - round 13 - add x0, x0, #64 //AES input_ptr update mov x6, v0.d[0] //AES block 4k+4 - mov low aese v2.16b, v30.16b @@ -5501,8 +5825,13 @@ aes_gcm_dec_256_kernel: add w12, w12, #1 //CTR block 4k+9 eor x6, x6, x13 //AES block 4k+4 - round 14 low +#ifdef __AARCH64EB__ + rev x6, x6 +#endif eor x7, x7, x14 //AES block 4k+4 - round 14 high - +#ifdef __AARCH64EB__ + rev x7, x7 +#endif mov x20, v1.d[1] //AES block 4k+5 - mov high eor v2.16b, v6.16b, v2.16b //AES block 4k+6 - result eor v11.16b, v11.16b, v8.16b //MODULO - fold into low @@ -5523,9 +5852,15 @@ aes_gcm_dec_256_kernel: rev64 v5.16b, v5.16b //GHASH block 4k+5 eor x20, x20, x14 //AES block 4k+5 - round 14 high +#ifdef __AARCH64EB__ + rev x20, x20 +#endif stp x6, x7, [x2], #16 //AES block 4k+4 - store result eor x19, x19, x13 //AES block 4k+5 - round 14 low +#ifdef __AARCH64EB__ + rev x19, x19 +#endif stp x19, x20, [x2], #16 //AES block 4k+5 - store result rev64 v4.16b, v4.16b //GHASH block 4k+4 @@ -5732,11 +6067,15 @@ aes_gcm_dec_256_kernel: aese v0.16b, v28.16b aesmc v0.16b, v0.16b //AES block 4k+4 - round 10 eor x22, x22, x14 //AES block 4k+2 - round 14 high - +#ifdef __AARCH64EB__ + rev x22, x22 +#endif aese v1.16b, v28.16b aesmc v1.16b, v1.16b //AES block 4k+5 - round 10 eor x23, x23, x13 //AES block 4k+3 - round 14 low - +#ifdef __AARCH64EB__ + rev x23, x23 +#endif aese v2.16b, v29.16b aesmc v2.16b, v2.16b //AES block 4k+6 - round 11 eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid @@ -5748,12 +6087,18 @@ aes_gcm_dec_256_kernel: aese v1.16b, v29.16b aesmc v1.16b, v1.16b //AES block 4k+5 - round 11 eor x21, x21, x13 //AES block 4k+2 - round 14 low +#ifdef __AARCH64EB__ + rev x21, x21 +#endif aese v2.16b, v30.16b aesmc v2.16b, v2.16b //AES block 4k+6 - round 12 pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low eor x24, x24, x14 //AES block 4k+3 - round 14 high +#ifdef __AARCH64EB__ + rev x24, x24 +#endif aese v3.16b, v29.16b aesmc v3.16b, v3.16b //AES block 4k+7 - round 11 @@ -5794,8 +6139,14 @@ aes_gcm_dec_256_kernel: cmp x5, #48 eor x6, x6, x13 //AES block 4k+4 - round 14 low +#ifdef __AARCH64EB__ + rev x6, x6 +#endif eor x7, x7, x14 //AES block 4k+4 - round 14 high +#ifdef __AARCH64EB__ + rev x7, x7 +#endif b.gt .L256_dec_blocks_more_than_3 sub w12, w12, #1 @@ -5843,9 +6194,15 @@ aes_gcm_dec_256_kernel: pmull v10.1q, v22.1d, v10.1d //GHASH final-3 block - mid eor x6, x6, x13 //AES final-2 block - round 14 low +#ifdef __AARCH64EB__ + rev x6, x6 +#endif pmull v11.1q, v4.1d, v15.1d //GHASH final-3 block - low eor x7, x7, x14 //AES final-2 block - round 14 high +#ifdef __AARCH64EB__ + rev x7, x7 +#endif .L256_dec_blocks_more_than_2: //blocks left > 2 rev64 v4.16b, v5.16b //GHASH final-2 block @@ -5873,9 +6230,15 @@ aes_gcm_dec_256_kernel: eor v9.16b, v9.16b, v20.16b //GHASH final-2 block - high eor x6, x6, x13 //AES final-1 block - round 14 low +#ifdef __AARCH64EB__ + rev x6, x6 +#endif eor v10.16b, v10.16b, v22.16b //GHASH final-2 block - mid eor x7, x7, x14 //AES final-1 block - round 14 high +#ifdef __AARCH64EB__ + rev x7, x7 +#endif .L256_dec_blocks_more_than_1: //blocks left > 1 stp x6, x7, [x2], #16 //AES final-1 block - store result @@ -5903,13 +6266,18 @@ aes_gcm_dec_256_kernel: pmull2 v22.1q, v22.2d, v16.2d //GHASH final-1 block - mid eor x6, x6, x13 //AES final block - round 14 low 
- +#ifdef __AARCH64EB__ + rev x6, x6 +#endif eor v11.16b, v11.16b, v21.16b //GHASH final-1 block - low eor v9.16b, v9.16b, v20.16b //GHASH final-1 block - high eor v10.16b, v10.16b, v22.16b //GHASH final-1 block - mid eor x7, x7, x14 //AES final block - round 14 high +#ifdef __AARCH64EB__ + rev x7, x7 +#endif .L256_dec_blocks_less_than_1: //blocks left <= 1 and x1, x1, #127 //bit_length %= 128 @@ -5935,7 +6303,11 @@ aes_gcm_dec_256_kernel: mov v0.d[1], x10 bic x4, x4, x9 //mask out low existing bytes +#ifndef __AARCH64EB__ rev w9, w12 +#else + mov w9, w12 +#endif bic x5, x5, x10 //mask out high existing bytes Index: src/crypto/external/bsd/openssl/lib/libcrypto/arch/aarch64/aesv8-armx.S diff -u src/crypto/external/bsd/openssl/lib/libcrypto/arch/aarch64/aesv8-armx.S:1.5 src/crypto/external/bsd/openssl/lib/libcrypto/arch/aarch64/aesv8-armx.S:1.6 --- src/crypto/external/bsd/openssl/lib/libcrypto/arch/aarch64/aesv8-armx.S:1.5 Wed May 10 21:31:54 2023 +++ src/crypto/external/bsd/openssl/lib/libcrypto/arch/aarch64/aesv8-armx.S Wed May 31 15:35:31 2023 @@ -1859,10 +1859,10 @@ aes_v8_xts_encrypt: b.ne .Lxts_enc_big_size // Encrypt the iv with key2, as the first XEX iv. ldr w6,[x4,#240] - ld1 {v0.16b},[x4],#16 + ld1 {v0.4s},[x4],#16 ld1 {v6.16b},[x5] sub w6,w6,#2 - ld1 {v1.16b},[x4],#16 + ld1 {v1.4s},[x4],#16 .Loop_enc_iv_enc: aese v6.16b,v0.16b @@ -2462,9 +2462,9 @@ aes_v8_xts_encrypt: // Encrypt the composite block to get the last second encrypted text block ldr w6,[x3,#240] // load key schedule... - ld1 {v0.16b},[x3],#16 + ld1 {v0.4s},[x3],#16 sub w6,w6,#2 - ld1 {v1.16b},[x3],#16 // load key schedule... + ld1 {v1.4s},[x3],#16 // load key schedule... .Loop_final_enc: aese v26.16b,v0.16b aesmc v26.16b,v26.16b @@ -2500,10 +2500,10 @@ aes_v8_xts_decrypt: b.ne .Lxts_dec_big_size // Encrypt the iv with key2, as the first XEX iv. ldr w6,[x4,#240] - ld1 {v0.16b},[x4],#16 + ld1 {v0.4s},[x4],#16 ld1 {v6.16b},[x5] sub w6,w6,#2 - ld1 {v1.16b},[x4],#16 + ld1 {v1.4s},[x4],#16 .Loop_dec_small_iv_enc: aese v6.16b,v0.16b @@ -2581,10 +2581,10 @@ aes_v8_xts_decrypt: // Encrypt the iv with key2, as the first XEX iv ldr w6,[x4,#240] - ld1 {v0.16b},[x4],#16 + ld1 {v0.4s},[x4],#16 ld1 {v6.16b},[x5] sub w6,w6,#2 - ld1 {v1.16b},[x4],#16 + ld1 {v1.4s},[x4],#16 .Loop_dec_iv_enc: aese v6.16b,v0.16b @@ -2914,7 +2914,7 @@ aes_v8_xts_decrypt: .align 4 .Lxts_dec_tail4x: add x0,x0,#16 - ld1 {v0.4s},[x0],#16 + tst x21,#0xf eor v5.16b,v1.16b,v4.16b st1 {v5.16b},[x1],#16 eor v17.16b,v24.16b,v17.16b @@ -2923,6 +2923,8 @@ aes_v8_xts_decrypt: eor v31.16b,v26.16b,v31.16b st1 {v30.16b,v31.16b},[x1],#32 + b.eq .Lxts_dec_abort + ld1 {v0.16b},[x0],#16 b .Lxts_done .align 4 .Lxts_outer_dec_tail: @@ -3100,7 +3102,7 @@ aes_v8_xts_decrypt: // Processing the last two blocks with cipher stealing. 
mov x7,x3 cbnz x2,.Lxts_dec_1st_done - ld1 {v0.4s},[x0],#16 + ld1 {v0.16b},[x0],#16 // Decrypt the last secod block to get the last plain text block .Lxts_dec_1st_done: @@ -3145,9 +3147,9 @@ aes_v8_xts_decrypt: // Decrypt the composite block to get the last second plain text block ldr w6,[x7,#240] - ld1 {v0.16b},[x7],#16 + ld1 {v0.4s},[x7],#16 sub w6,w6,#2 - ld1 {v1.16b},[x7],#16 + ld1 {v1.4s},[x7],#16 .Loop_final_dec: aesd v26.16b,v0.16b aesimc v26.16b,v26.16b Index: src/crypto/external/bsd/openssl/lib/libcrypto/arch/arm/aes-gcm-armv8_64.S diff -u src/crypto/external/bsd/openssl/lib/libcrypto/arch/arm/aes-gcm-armv8_64.S:1.1 src/crypto/external/bsd/openssl/lib/libcrypto/arch/arm/aes-gcm-armv8_64.S:1.2 --- src/crypto/external/bsd/openssl/lib/libcrypto/arch/arm/aes-gcm-armv8_64.S:1.1 Tue May 9 13:22:43 2023 +++ src/crypto/external/bsd/openssl/lib/libcrypto/arch/arm/aes-gcm-armv8_64.S Wed May 31 15:35:31 2023 @@ -29,28 +29,36 @@ aes_gcm_enc_128_kernel: stp d14, d15, [sp, #96] ldp r10, r11, [r16] @ ctr96_b64, ctr96_t32 +#ifdef __ARMEB__ + rev r10, r10 + rev r11, r11 +#endif ldp r13, r14, [r8, #160] @ load rk10 - +#ifdef __ARMEB__ + ror r13, r13, #32 + ror r14, r14, #32 +#endif ld1 {v11.16b}, [r3] ext v11.16b, v11.16b, v11.16b, #8 rev64 v11.16b, v11.16b lsr r5, r1, #3 @ byte_len mov r15, r5 - ldr q27, [r8, #144] @ load rk9 + ld1 {v18.4s}, [r8], #16 @ load rk0 add r4, r0, r1, lsr #3 @ end_input_ptr sub r5, r5, #1 @ byte_len - 1 lsr r12, r11, #32 ldr q15, [r3, #112] @ load h4l | h4h +#ifndef __ARMEB__ ext v15.16b, v15.16b, v15.16b, #8 - +#endif fmov d1, r10 @ CTR block 1 rev r12, r12 @ rev_ctr32 add r12, r12, #1 @ increment rev_ctr32 orr r11, r11, r11 - ldr q18, [r8, #0] @ load rk0 + ld1 {v19.4s}, [r8], #16 @ load rk1 rev r9, r12 @ CTR block 1 add r12, r12, #1 @ CTR block 1 @@ -70,30 +78,33 @@ aes_gcm_enc_128_kernel: rev r9, r12 @ CTR block 3 orr r9, r11, r9, lsl #32 @ CTR block 3 - ldr q19, [r8, #16] @ load rk1 + ld1 {v20.4s}, [r8], #16 @ load rk2 add r12, r12, #1 @ CTR block 3 fmov v3.d[1], r9 @ CTR block 3 ldr q14, [r3, #80] @ load h3l | h3h +#ifndef __ARMEB__ ext v14.16b, v14.16b, v14.16b, #8 - +#endif aese q1, v18.16b aesmc q1, q1 @ AES block 1 - round 0 - ldr q20, [r8, #32] @ load rk2 + ld1 {v21.4s}, [r8], #16 @ load rk3 aese q2, v18.16b aesmc q2, q2 @ AES block 2 - round 0 ldr q12, [r3, #32] @ load h1l | h1h +#ifndef __ARMEB__ ext v12.16b, v12.16b, v12.16b, #8 +#endif aese q0, v18.16b aesmc q0, q0 @ AES block 0 - round 0 - ldr q26, [r8, #128] @ load rk8 + ld1 {v22.4s}, [r8], #16 @ load rk4 aese q3, v18.16b aesmc q3, q3 @ AES block 3 - round 0 - ldr q21, [r8, #48] @ load rk3 + ld1 {v23.4s}, [r8], #16 @ load rk5 aese q2, v19.16b aesmc q2, q2 @ AES block 2 - round 1 @@ -101,11 +112,11 @@ aes_gcm_enc_128_kernel: aese q0, v19.16b aesmc q0, q0 @ AES block 0 - round 1 - ldr q24, [r8, #96] @ load rk6 + ld1 {v24.4s}, [r8], #16 @ load rk6 aese q1, v19.16b aesmc q1, q1 @ AES block 1 - round 1 - ldr q25, [r8, #112] @ load rk7 + ld1 {v25.4s}, [r8], #16 @ load rk7 aese q3, v19.16b aesmc q3, q3 @ AES block 3 - round 1 @@ -113,12 +124,14 @@ aes_gcm_enc_128_kernel: aese q0, v20.16b aesmc q0, q0 @ AES block 0 - round 2 - ldr q23, [r8, #80] @ load rk5 + ld1 {v26.4s}, [r8], #16 @ load rk8 aese q1, v20.16b aesmc q1, q1 @ AES block 1 - round 2 ldr q13, [r3, #64] @ load h2l | h2h +#ifndef __ARMEB__ ext v13.16b, v13.16b, v13.16b, #8 +#endif aese q3, v20.16b aesmc q3, q3 @ AES block 3 - round 2 @@ -135,7 +148,7 @@ aes_gcm_enc_128_kernel: aese q2, v21.16b aesmc q2, q2 @ AES block 2 - round 3 - ldr q22, [r8, #64] @ 
load rk4 + ld1 {v27.4s}, [r8], #16 @ load rk9 aese q3, v21.16b aesmc q3, q3 @ AES block 3 - round 3 @@ -218,13 +231,25 @@ aes_gcm_enc_128_kernel: bge .L128_enc_tail @ handle tail ldp r6, r7, [r0, #0] @ AES block 0 - load plaintext - +#ifdef __ARMEB__ + rev r6, r6 + rev r7, r7 +#endif ldp r21, r22, [r0, #32] @ AES block 2 - load plaintext - +#ifdef __ARMEB__ + rev r21, r21 + rev r22, r22 +#endif ldp r19, r20, [r0, #16] @ AES block 1 - load plaintext - +#ifdef __ARMEB__ + rev r19, r19 + rev r20, r20 +#endif ldp r23, r24, [r0, #48] @ AES block 3 - load plaintext - +#ifdef __ARMEB__ + rev r23, r23 + rev r24, r24 +#endif eor r6, r6, r13 @ AES block 0 - round 10 low eor r7, r7, r14 @ AES block 0 - round 10 high @@ -289,6 +314,10 @@ aes_gcm_enc_128_kernel: .L128_enc_main_loop:@ main loop start ldp r23, r24, [r0, #48] @ AES block 4k+3 - load plaintext +#ifdef __ARMEB__ + rev r23, r23 + rev r24, r24 +#endif rev64 q4, q4 @ GHASH block 4k (only t0 is free) rev64 q6, q6 @ GHASH block 4k+2 (t0, t1, and t2 free) @@ -323,7 +352,10 @@ aes_gcm_enc_128_kernel: pmull2 v28.1q, q5, v14.2d @ GHASH block 4k+1 - high eor v31.8b, v31.8b, q6 @ GHASH block 4k+2 - mid ldp r6, r7, [r0, #0] @ AES block 4k+4 - load plaintext - +#ifdef __ARMEB__ + rev r6, r6 + rev r7, r7 +#endif aese q0, v19.16b aesmc q0, q0 @ AES block 4k+4 - round 1 rev r9, r12 @ CTR block 4k+8 @@ -405,7 +437,10 @@ aes_gcm_enc_128_kernel: aese q1, v23.16b aesmc q1, q1 @ AES block 4k+5 - round 5 ldp r19, r20, [r0, #16] @ AES block 4k+5 - load plaintext - +#ifdef __ARMEB__ + rev r19, r19 + rev r20, r20 +#endif aese q3, v21.16b aesmc q3, q3 @ AES block 4k+7 - round 3 eor v10.16b, v10.16b, v31.16b @ GHASH block 4k+2 - mid @@ -413,7 +448,10 @@ aes_gcm_enc_128_kernel: aese q0, v23.16b aesmc q0, q0 @ AES block 4k+4 - round 5 ldp r21, r22, [r0, #32] @ AES block 4k+6 - load plaintext - +#ifdef __ARMEB__ + rev r21, r21 + rev r22, r22 +#endif pmull v31.1q, q9, q8 @ MODULO - top 64b align with mid eor v11.16b, v11.16b, v29.16b @ GHASH block 4k+3 - low @@ -722,7 +760,10 @@ aes_gcm_enc_128_kernel: sub r5, r4, r0 @ main_end_input_ptr is number of bytes left to process ldp r6, r7, [r0], #16 @ AES block 4k+4 - load plaintext - +#ifdef __ARMEB__ + rev r6, r6 + rev r7, r7 +#endif cmp r5, #48 ext q8, v11.16b, v11.16b, #8 @ prepare final partial tag @@ -760,7 +801,10 @@ aes_gcm_enc_128_kernel: st1 { q5}, [r2], #16 @ AES final-3 block - store result ldp r6, r7, [r0], #16 @ AES final-2 block - load input low & high - +#ifdef __ARMEB__ + rev r6, r6 + rev r7, r7 +#endif rev64 q4, q5 @ GHASH final-3 block eor q4, q4, q8 @ feed in partial tag @@ -789,7 +833,10 @@ aes_gcm_enc_128_kernel: rev64 q4, q5 @ GHASH final-2 block ldp r6, r7, [r0], #16 @ AES final-1 block - load input low & high - +#ifdef __ARMEB__ + rev r6, r6 + rev r7, r7 +#endif eor q4, q4, q8 @ feed in partial tag eor r6, r6, r13 @ AES final-1 block - round 10 low @@ -823,7 +870,10 @@ aes_gcm_enc_128_kernel: rev64 q4, q5 @ GHASH final-1 block ldp r6, r7, [r0], #16 @ AES final block - load input low & high - +#ifdef __ARMEB__ + rev r6, r6 + rev r7, r7 +#endif eor q4, q4, q8 @ feed in partial tag eor r7, r7, r14 @ AES final block - round 10 high @@ -886,9 +936,11 @@ aes_gcm_enc_128_kernel: ld1 { v18.16b}, [r2] @ load existing bytes where the possibly partial last block is to be stored eor q8, q8, q4 @ GHASH final block - mid - +#ifndef __ARMEB__ rev r9, r12 - +#else + mov r9, r12 +#endif pmull2 v20.1q, q4, v12.2d @ GHASH final block - high pmull v8.1q, q8, v16.1d @ GHASH final block - mid @@ -961,20 +1013,29 @@ 
aes_gcm_dec_128_kernel: lsr r5, r1, #3 @ byte_len mov r15, r5 ldp r10, r11, [r16] @ ctr96_b64, ctr96_t32 - +#ifdef __ARMEB__ + rev r10, r10 + rev r11, r11 +#endif + ldp r13, r14, [r8, #160] @ load rk10 +#ifdef __ARMEB__ + ror r14, r14, 32 + ror r13, r13, 32 +#endif sub r5, r5, #1 @ byte_len - 1 - ldr q18, [r8, #0] @ load rk0 + ld1 {v18.4s}, [r8], #16 @ load rk0 and r5, r5, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail) ld1 { q0}, [r16] @ special case vector load initial counter so we can start first AES block as quickly as possible ldr q13, [r3, #64] @ load h2l | h2h +#ifndef __ARMEB__ ext v13.16b, v13.16b, v13.16b, #8 - +#endif lsr r12, r11, #32 fmov d2, r10 @ CTR block 2 - ldr q19, [r8, #16] @ load rk1 + ld1 {v19.4s}, [r8], #16 @ load rk1 orr r11, r11, r11 rev r12, r12 @ rev_ctr32 @@ -986,7 +1047,7 @@ aes_gcm_dec_128_kernel: rev r9, r12 @ CTR block 1 orr r9, r11, r9, lsl #32 @ CTR block 1 - ldr q20, [r8, #32] @ load rk2 + ld1 {v20.4s}, [r8], #16 @ load rk2 add r12, r12, #1 @ CTR block 1 fmov v1.d[1], r9 @ CTR block 1 @@ -1009,19 +1070,19 @@ aes_gcm_dec_128_kernel: aese q1, v18.16b aesmc q1, q1 @ AES block 1 - round 0 - ldr q21, [r8, #48] @ load rk3 + ld1 {v21.4s}, [r8], #16 @ load rk3 aese q0, v20.16b aesmc q0, q0 @ AES block 0 - round 2 - ldr q24, [r8, #96] @ load rk6 + ld1 {v22.4s}, [r8], #16 @ load rk4 aese q2, v18.16b aesmc q2, q2 @ AES block 2 - round 0 - ldr q25, [r8, #112] @ load rk7 + ld1 {v23.4s}, [r8], #16 @ load rk5 aese q1, v19.16b aesmc q1, q1 @ AES block 1 - round 1 - ldr q22, [r8, #64] @ load rk4 + ld1 {v24.4s}, [r8], #16 @ load rk6 aese q3, v18.16b aesmc q3, q3 @ AES block 3 - round 0 @@ -1031,7 +1092,6 @@ aes_gcm_dec_128_kernel: aese q1, v20.16b aesmc q1, q1 @ AES block 1 - round 2 - ldp r13, r14, [r8, #160] @ load rk10 aese q3, v19.16b aesmc q3, q3 @ AES block 3 - round 1 @@ -1041,7 +1101,7 @@ aes_gcm_dec_128_kernel: aese q0, v21.16b aesmc q0, q0 @ AES block 0 - round 3 - ldr q23, [r8, #80] @ load rk5 + ld1 {v25.4s}, [r8], #16 @ load rk7 aese q1, v21.16b aesmc q1, q1 @ AES block 1 - round 3 @@ -1051,7 +1111,7 @@ aes_gcm_dec_128_kernel: aese q2, v20.16b aesmc q2, q2 @ AES block 2 - round 2 - ldr q27, [r8, #144] @ load rk9 + ld1 {v26.4s}, [r8], #16 @ load rk8 aese q1, v22.16b aesmc q1, q1 @ AES block 1 - round 4 @@ -1062,11 +1122,12 @@ aes_gcm_dec_128_kernel: aese q2, v21.16b aesmc q2, q2 @ AES block 2 - round 3 ldr q14, [r3, #80] @ load h3l | h3h +#ifndef __ARMEB__ ext v14.16b, v14.16b, v14.16b, #8 - +#endif aese q0, v22.16b aesmc q0, q0 @ AES block 0 - round 4 - ldr q26, [r8, #128] @ load rk8 + ld1 {v27.4s}, [r8], #16 @ load rk9 aese q1, v23.16b aesmc q1, q1 @ AES block 1 - round 5 @@ -1083,8 +1144,9 @@ aes_gcm_dec_128_kernel: aese q2, v23.16b aesmc q2, q2 @ AES block 2 - round 5 ldr q12, [r3, #32] @ load h1l | h1h +#ifndef __ARMEB__ ext v12.16b, v12.16b, v12.16b, #8 - +#endif aese q3, v23.16b aesmc q3, q3 @ AES block 3 - round 5 @@ -1102,7 +1164,9 @@ aes_gcm_dec_128_kernel: trn1 q8, v12.2d, v13.2d @ h2h | h1h ldr q15, [r3, #112] @ load h4l | h4h +#ifndef __ARMEB__ ext v15.16b, v15.16b, v15.16b, #8 +#endif trn2 v16.2d, v12.2d, v13.2d @ h2l | h1l add r5, r5, r0 @@ -1144,12 +1208,10 @@ aes_gcm_dec_128_kernel: eor v17.16b, v17.16b, q9 @ h4k | h3k bge .L128_dec_tail @ handle tail - ldr q5, [r0, #16] @ AES block 1 - load ciphertext - - ldr q4, [r0, #0] @ AES block 0 - load ciphertext + ld1 {q4, q5}, [r0], #32 @ AES block 0 - load ciphertext; AES block 1 - load ciphertext eor q1, q5, q1 @ AES block 1 - result - ldr 
q6, [r0, #32] @ AES block 2 - load ciphertext + ld1 {q6}, [r0], #16 @ AES block 2 - load ciphertext eor q0, q4, q0 @ AES block 0 - result rev64 q4, q4 @ GHASH block 0 @@ -1157,10 +1219,9 @@ aes_gcm_dec_128_kernel: orr r9, r11, r9, lsl #32 @ CTR block 4 add r12, r12, #1 @ CTR block 4 - ldr q7, [r0, #48] @ AES block 3 - load ciphertext + ld1 {q7}, [r0], #16 @ AES block 3 - load ciphertext rev64 q5, q5 @ GHASH block 1 - add r0, r0, #64 @ AES input_ptr update mov r19, v1.d[0] @ AES block 1 - mov low mov r20, v1.d[1] @ AES block 1 - mov high @@ -1175,7 +1236,9 @@ aes_gcm_dec_128_kernel: fmov v0.d[1], r9 @ CTR block 4 rev r9, r12 @ CTR block 5 eor r19, r19, r13 @ AES block 1 - round 10 low - +#ifdef __ARMEB__ + rev r19, r19 +#endif fmov d1, r10 @ CTR block 5 add r12, r12, #1 @ CTR block 5 orr r9, r11, r9, lsl #32 @ CTR block 5 @@ -1187,10 +1250,19 @@ aes_gcm_dec_128_kernel: orr r9, r11, r9, lsl #32 @ CTR block 6 eor r20, r20, r14 @ AES block 1 - round 10 high +#ifdef __ARMEB__ + rev r20, r20 +#endif eor r6, r6, r13 @ AES block 0 - round 10 low +#ifdef __ARMEB__ + rev r6, r6 +#endif eor q2, q6, q2 @ AES block 2 - result eor r7, r7, r14 @ AES block 0 - round 10 high +#ifdef __ARMEB__ + rev r7, r7 +#endif stp r6, r7, [r2], #16 @ AES block 0 - store result stp r19, r20, [r2], #16 @ AES block 1 - store result @@ -1258,9 +1330,14 @@ aes_gcm_dec_128_kernel: aese q3, v19.16b aesmc q3, q3 @ AES block 4k+7 - round 1 eor r23, r23, r13 @ AES block 4k+3 - round 10 low - +#ifdef __ARMEB__ + rev r23, r23 +#endif pmull v30.1q, v30.1d, v17.1d @ GHASH block 4k+1 - mid eor r22, r22, r14 @ AES block 4k+2 - round 10 high +#ifdef __ARMEB__ + rev r22, r22 +#endif mov d31, v6.d[1] @ GHASH block 4k+2 - mid aese q0, v19.16b @@ -1298,7 +1375,9 @@ aes_gcm_dec_128_kernel: pmull2 v31.1q, v31.2d, v16.2d @ GHASH block 4k+2 - mid eor r24, r24, r14 @ AES block 4k+3 - round 10 high - +#ifdef __ARMEB__ + rev r24, r24 +#endif aese q2, v20.16b aesmc q2, q2 @ AES block 4k+6 - round 2 eor v30.8b, v30.8b, q7 @ GHASH block 4k+3 - mid @@ -1306,7 +1385,9 @@ aes_gcm_dec_128_kernel: aese q1, v23.16b aesmc q1, q1 @ AES block 4k+5 - round 5 eor r21, r21, r13 @ AES block 4k+2 - round 10 low - +#ifdef __ARMEB__ + rev r21, r21 +#endif aese q0, v23.16b aesmc q0, q0 @ AES block 4k+4 - round 5 movi q8, #0xc2 @@ -1328,7 +1409,7 @@ aes_gcm_dec_128_kernel: pmull v30.1q, v30.1d, v16.1d @ GHASH block 4k+3 - mid eor q9, q9, q4 @ GHASH block 4k+3 - high - ldr q4, [r0, #0] @ AES block 4k+4 - load ciphertext + ld1 {q4}, [r0], #16 @ AES block 4k+3 - load ciphertext aese q1, v25.16b aesmc q1, q1 @ AES block 4k+5 - round 7 @@ -1355,7 +1436,7 @@ aes_gcm_dec_128_kernel: rev r9, r12 @ CTR block 4k+8 pmull v31.1q, q9, q8 @ MODULO - top 64b align with mid - ldr q5, [r0, #16] @ AES block 4k+5 - load ciphertext + ld1 {q5}, [r0], #16 @ AES block 4k+4 - load ciphertext ext q9, q9, q9, #8 @ MODULO - other top alignment aese q0, v27.16b @ AES block 4k+4 - round 9 @@ -1373,7 +1454,7 @@ aes_gcm_dec_128_kernel: aese q3, v23.16b aesmc q3, q3 @ AES block 4k+7 - round 5 - ldr q6, [r0, #32] @ AES block 4k+6 - load ciphertext + ld1 {q6}, [r0], #16 @ AES block 4k+5 - load ciphertext add r12, r12, #1 @ CTR block 4k+8 eor v10.16b, v10.16b, v31.16b @ MODULO - fold into mid @@ -1381,11 +1462,10 @@ aes_gcm_dec_128_kernel: aese q2, v25.16b aesmc q2, q2 @ AES block 4k+6 - round 7 - ldr q7, [r0, #48] @ AES block 4k+3 - load ciphertext + ld1 {q7}, [r0], #16 @ AES block 4k+6 - load ciphertext aese q3, v24.16b aesmc q3, q3 @ AES block 4k+7 - round 6 - add r0, r0, #64 @ AES input_ptr update 
rev64 q5, q5 @ GHASH block 4k+5 eor v10.16b, v10.16b, q9 @ MODULO - fold into mid @@ -1410,11 +1490,15 @@ aes_gcm_dec_128_kernel: aese q3, v26.16b aesmc q3, q3 @ AES block 4k+7 - round 8 eor r7, r7, r14 @ AES block 4k+4 - round 10 high - +#ifdef __ARMEB__ + rev r7, r7 +#endif eor v11.16b, v11.16b, q8 @ MODULO - fold into low mov r20, v1.d[1] @ AES block 4k+5 - mov high eor r6, r6, r13 @ AES block 4k+4 - round 10 low - +#ifdef __ARMEB__ + rev r6, r6 +#endif eor q2, q6, q2 @ AES block 4k+6 - result mov r19, v1.d[0] @ AES block 4k+5 - mov low add r12, r12, #1 @ CTR block 4k+9 @@ -1431,9 +1515,15 @@ aes_gcm_dec_128_kernel: add r12, r12, #1 @ CTR block 4k+10 eor r20, r20, r14 @ AES block 4k+5 - round 10 high +#ifdef __ARMEB__ + rev r20, r20 +#endif stp r6, r7, [r2], #16 @ AES block 4k+4 - store result eor r19, r19, r13 @ AES block 4k+5 - round 10 low +#ifdef __ARMEB__ + rev r19, r19 +#endif stp r19, r20, [r2], #16 @ AES block 4k+5 - store result orr r9, r11, r9, lsl #32 @ CTR block 4k+10 @@ -1538,9 +1628,14 @@ aes_gcm_dec_128_kernel: aese q3, v20.16b aesmc q3, q3 @ AES block 4k+7 - round 2 eor r23, r23, r13 @ AES block 4k+3 - round 10 low - +#ifdef __ARMEB__ + rev r23, r23 +#endif pmull v30.1q, v30.1d, v16.1d @ GHASH block 4k+3 - mid eor r21, r21, r13 @ AES block 4k+2 - round 10 low +#ifdef __ARMEB__ + rev r21, r21 +#endif eor v11.16b, v11.16b, v29.16b @ GHASH block 4k+3 - low aese q2, v21.16b @@ -1613,7 +1708,9 @@ aes_gcm_dec_128_kernel: pmull v8.1q, v10.1d, q8 @ MODULO - mid 64b align with low eor r24, r24, r14 @ AES block 4k+3 - round 10 high - +#ifdef __ARMEB__ + rev r24, r24 +#endif aese q2, v25.16b aesmc q2, q2 @ AES block 4k+6 - round 7 ext v10.16b, v10.16b, v10.16b, #8 @ MODULO - other mid alignment @@ -1631,7 +1728,9 @@ aes_gcm_dec_128_kernel: aese q3, v26.16b aesmc q3, q3 @ AES block 4k+7 - round 8 eor r22, r22, r14 @ AES block 4k+2 - round 10 high - +#ifdef __ARMEB__ + rev r22, r22 +#endif aese q0, v27.16b @ AES block 4k+4 - round 9 stp r21, r22, [r2], #16 @ AES block 4k+2 - store result @@ -1655,9 +1754,14 @@ aes_gcm_dec_128_kernel: cmp r5, #48 eor r7, r7, r14 @ AES block 4k+4 - round 10 high - +#ifdef __ARMEB__ + rev r7, r7 +#endif ext q8, v11.16b, v11.16b, #8 @ prepare final partial tag eor r6, r6, r13 @ AES block 4k+4 - round 10 low +#ifdef __ARMEB__ + rev r6, r6 +#endif bgt .L128_dec_blocks_more_than_3 mov q3, q2 @@ -1701,9 +1805,14 @@ aes_gcm_dec_128_kernel: movi q8, #0 @ suppress further partial tag feed in eor r7, r7, r14 @ AES final-2 block - round 10 high - +#ifdef __ARMEB__ + rev r7, r7 +#endif pmull v10.1q, v22.1d, v10.1d @ GHASH final-3 block - mid eor r6, r6, r13 @ AES final-2 block - round 10 low +#ifdef __ARMEB__ + rev r6, r6 +#endif .L128_dec_blocks_more_than_2:@ blocks left > 2 rev64 q4, q5 @ GHASH final-2 block @@ -1729,12 +1838,18 @@ aes_gcm_dec_128_kernel: pmull v22.1q, v22.1d, v17.1d @ GHASH final-2 block - mid eor r6, r6, r13 @ AES final-1 block - round 10 low +#ifdef __ARMEB__ + rev r6, r6 +#endif eor v11.16b, v11.16b, v21.16b @ GHASH final-2 block - low eor q9, q9, v20.16b @ GHASH final-2 block - high eor v10.16b, v10.16b, v22.16b @ GHASH final-2 block - mid eor r7, r7, r14 @ AES final-1 block - round 10 high +#ifdef __ARMEB__ + rev r7, r7 +#endif .L128_dec_blocks_more_than_1:@ blocks left > 1 rev64 q4, q5 @ GHASH final-1 block @@ -1765,8 +1880,13 @@ aes_gcm_dec_128_kernel: eor q9, q9, v20.16b @ GHASH final-1 block - high eor r7, r7, r14 @ AES final block - round 10 high - +#ifdef __ARMEB__ + rev r7, r7 +#endif eor r6, r6, r13 @ AES final block - round 10 low 
+#ifdef __ARMEB__ + rev r6, r6 +#endif eor v10.16b, v10.16b, v22.16b @ GHASH final-1 block - mid .L128_dec_blocks_less_than_1:@ blocks left <= 1 @@ -1812,7 +1932,11 @@ aes_gcm_dec_128_kernel: bic r4, r4, r9 @ mask out low existing bytes and r6, r6, r9 +#ifndef __ARMEB__ rev r9, r12 +#else + mov r9, r12 +#endif eor v10.16b, v10.16b, q8 @ GHASH final block - mid movi q8, #0xc2 @@ -1879,18 +2003,26 @@ aes_gcm_enc_192_kernel: stp d14, d15, [sp, #96] ldp r10, r11, [r16] @ ctr96_b64, ctr96_t32 +#ifdef __ARMEB__ + rev r10, r10 + rev r11, r11 +#endif + ldp r13, r14, [r8, #192] @ load rk12 +#ifdef __ARMEB__ + ror r13, r13, #32 + ror r14, r14, #32 +#endif + ld1 {v18.4s}, [r8], #16 @ load rk0 - ldr q23, [r8, #80] @ load rk5 - - ldr q22, [r8, #64] @ load rk4 + ld1 {v19.4s}, [r8], #16 @ load rk1 - ldr q26, [r8, #128] @ load rk8 + ld1 {v20.4s}, [r8], #16 @ load rk2 lsr r12, r11, #32 - ldr q24, [r8, #96] @ load rk6 + ld1 {v21.4s}, [r8], #16 @ load rk3 orr r11, r11, r11 - ldr q25, [r8, #112] @ load rk7 + ld1 {v22.4s}, [r8], #16 @ load rk4 rev r12, r12 @ rev_ctr32 add r12, r12, #1 @ increment rev_ctr32 @@ -1914,15 +2046,13 @@ aes_gcm_enc_192_kernel: rev r9, r12 @ CTR block 3 orr r9, r11, r9, lsl #32 @ CTR block 3 - ldr q18, [r8, #0] @ load rk0 + ld1 {v23.4s}, [r8], #16 @ load rk5 fmov v3.d[1], r9 @ CTR block 3 - ldr q21, [r8, #48] @ load rk3 + ld1 {v24.4s}, [r8], #16 @ load rk6 - ldp r13, r14, [r8, #192] @ load rk12 - - ldr q19, [r8, #16] @ load rk1 + ld1 {v25.4s}, [r8], #16 @ load rk7 aese q0, v18.16b aesmc q0, q0 @ AES block 0 - round 0 @@ -1932,35 +2062,38 @@ aes_gcm_enc_192_kernel: aese q3, v18.16b aesmc q3, q3 @ AES block 3 - round 0 - ldr q29, [r8, #176] @ load rk11 + ld1 {v26.4s}, [r8], #16 @ load rk8 aese q1, v18.16b aesmc q1, q1 @ AES block 1 - round 0 ldr q15, [r3, #112] @ load h4l | h4h +#ifndef __ARMEB__ ext v15.16b, v15.16b, v15.16b, #8 - +#endif aese q2, v18.16b aesmc q2, q2 @ AES block 2 - round 0 - ldr q20, [r8, #32] @ load rk2 + ld1 {v27.4s}, [r8], #16 @ load rk9 aese q0, v19.16b aesmc q0, q0 @ AES block 0 - round 1 - ldr q28, [r8, #160] @ load rk10 + ld1 {v28.4s}, [r8], #16 @ load rk10 aese q1, v19.16b aesmc q1, q1 @ AES block 1 - round 1 ldr q12, [r3, #32] @ load h1l | h1h +#ifndef __ARMEB__ ext v12.16b, v12.16b, v12.16b, #8 - +#endif aese q2, v19.16b aesmc q2, q2 @ AES block 2 - round 1 - ldr q27, [r8, #144] @ load rk9 + ld1 {v29.4s}, [r8], #16 @ load rk11 aese q3, v19.16b aesmc q3, q3 @ AES block 3 - round 1 ldr q14, [r3, #80] @ load h3l | h3h +#ifndef __ARMEB__ ext v14.16b, v14.16b, v14.16b, #8 - +#endif aese q0, v20.16b aesmc q0, q0 @ AES block 0 - round 2 @@ -2017,8 +2150,9 @@ aes_gcm_enc_192_kernel: aese q2, v24.16b aesmc q2, q2 @ AES block 2 - round 6 ldr q13, [r3, #64] @ load h2l | h2h +#ifndef __ARMEB__ ext v13.16b, v13.16b, v13.16b, #8 - +#endif aese q1, v24.16b aesmc q1, q1 @ AES block 1 - round 6 @@ -2098,13 +2232,26 @@ aes_gcm_enc_192_kernel: rev r9, r12 @ CTR block 4 ldp r6, r7, [r0, #0] @ AES block 0 - load plaintext - +#ifdef __ARMEB__ + rev r6, r6 + rev r7, r7 +#endif orr r9, r11, r9, lsl #32 @ CTR block 4 ldp r21, r22, [r0, #32] @ AES block 2 - load plaintext - +#ifdef __ARMEB__ + rev r21, r21 + rev r22, r22 +#endif ldp r23, r24, [r0, #48] @ AES block 3 - load plaintext - +#ifdef __ARMEB__ + rev r23, r23 + rev r24, r24 +#endif ldp r19, r20, [r0, #16] @ AES block 1 - load plaintext +#ifdef __ARMEB__ + rev r19, r19 + rev r20, r20 +#endif add r0, r0, #64 @ AES input_ptr update cmp r0, r5 @ check if we have <= 8 blocks @@ -2176,7 +2323,10 @@ aes_gcm_enc_192_kernel: aese q1, 
v18.16b aesmc q1, q1 @ AES block 4k+5 - round 0 ldp r19, r20, [r0, #16] @ AES block 4k+5 - load plaintext - +#ifdef __ARMEB__ + rev r19, r19 + rev r20, r20 +#endif ext v11.16b, v11.16b, v11.16b, #8 @ PRE 0 fmov d3, r10 @ CTR block 4k+3 rev64 q4, q4 @ GHASH block 4k (only t0 is free) @@ -2188,11 +2338,17 @@ aes_gcm_enc_192_kernel: pmull2 v30.1q, q5, v14.2d @ GHASH block 4k+1 - high rev64 q7, q7 @ GHASH block 4k+3 (t0, t1, t2 and t3 free) ldp r21, r22, [r0, #32] @ AES block 4k+6 - load plaintext - +#ifdef __ARMEB__ + rev r21, r21 + rev r22, r22 +#endif aese q0, v18.16b aesmc q0, q0 @ AES block 4k+4 - round 0 ldp r23, r24, [r0, #48] @ AES block 4k+3 - load plaintext - +#ifdef __ARMEB__ + rev r23, r23 + rev r24, r24 +#endif pmull v31.1q, q5, v14.1d @ GHASH block 4k+1 - low eor q4, q4, v11.16b @ PRE 1 @@ -2285,7 +2441,10 @@ aes_gcm_enc_192_kernel: aese q1, v22.16b aesmc q1, q1 @ AES block 4k+5 - round 4 ldp r6, r7, [r0, #0] @ AES block 4k+4 - load plaintext - +#ifdef __ARMEB__ + rev r6, r6 + rev r7, r7 +#endif aese q0, v24.16b aesmc q0, q0 @ AES block 4k+4 - round 6 eor v11.16b, v11.16b, q8 @ GHASH block 4k+2 - low @@ -2650,7 +2809,10 @@ aes_gcm_enc_192_kernel: sub r5, r4, r0 @ main_end_input_ptr is number of bytes left to process ldp r6, r7, [r0], #16 @ AES block 4k+4 - load plaintext - +#ifdef __ARMEB__ + rev r6, r6 + rev r7, r7 +#endif eor r6, r6, r13 @ AES block 4k+4 - round 12 low eor r7, r7, r14 @ AES block 4k+4 - round 12 high @@ -2687,7 +2849,10 @@ aes_gcm_enc_192_kernel: st1 { q5}, [r2], #16 @ AES final-3 block - store result ldp r6, r7, [r0], #16 @ AES final-2 block - load input low & high - +#ifdef __ARMEB__ + rev r6, r6 + rev r7, r7 +#endif rev64 q4, q5 @ GHASH final-3 block eor r6, r6, r13 @ AES final-2 block - round 12 low @@ -2718,7 +2883,10 @@ aes_gcm_enc_192_kernel: rev64 q4, q5 @ GHASH final-2 block ldp r6, r7, [r0], #16 @ AES final-1 block - load input low & high - +#ifdef __ARMEB__ + rev r6, r6 + rev r7, r7 +#endif eor q4, q4, q8 @ feed in partial tag eor r7, r7, r14 @ AES final-1 block - round 12 high @@ -2749,7 +2917,10 @@ aes_gcm_enc_192_kernel: st1 { q5}, [r2], #16 @ AES final-1 block - store result ldp r6, r7, [r0], #16 @ AES final block - load input low & high - +#ifdef __ARMEB__ + rev r6, r6 + rev r7, r7 +#endif rev64 q4, q5 @ GHASH final-1 block eor r6, r6, r13 @ AES final block - round 12 low @@ -2781,7 +2952,11 @@ aes_gcm_enc_192_kernel: .L192_enc_blocks_less_than_1:@ blocks left <= 1 ld1 { v18.16b}, [r2] @ load existing bytes where the possibly partial last block is to be stored +#ifndef __ARMEB__ rev r9, r12 +#else + mov r9, r12 +#endif and r1, r1, #127 @ bit_length %= 128 sub r1, r1, #128 @ bit_length -= 128 @@ -2886,14 +3061,22 @@ aes_gcm_dec_192_kernel: add r4, r0, r1, lsr #3 @ end_input_ptr ldp r10, r11, [r16] @ ctr96_b64, ctr96_t32 - +#ifdef __ARMEB__ + rev r10, r10 + rev r11, r11 +#endif + ldp r13, r14, [r8, #192] @ load rk12 +#ifdef __ARMEB__ + ror r13, r13, #32 + ror r14, r14, #32 +#endif ld1 { q0}, [r16] @ special case vector load initial counter so we can start first AES block as quickly as possible - ldr q18, [r8, #0] @ load rk0 + ld1 {v18.4s}, [r8], #16 @ load rk0 lsr r5, r1, #3 @ byte_len mov r15, r5 - ldr q20, [r8, #32] @ load rk2 + ld1 {v19.4s}, [r8], #16 @ load rk1 lsr r12, r11, #32 orr r11, r11, r11 @@ -2903,7 +3086,7 @@ aes_gcm_dec_192_kernel: fmov d1, r10 @ CTR block 1 add r12, r12, #1 @ increment rev_ctr32 - ldr q19, [r8, #16] @ load rk1 + ld1 {v20.4s}, [r8], #16 @ load rk2 aese q0, v18.16b aesmc q0, q0 @ AES block 0 - round 0 @@ -2911,7 
+3094,7 @@ aes_gcm_dec_192_kernel: add r12, r12, #1 @ CTR block 1 orr r9, r11, r9, lsl #32 @ CTR block 1 - ldr q21, [r8, #48] @ load rk3 + ld1 {v21.4s}, [r8], #16 @ load rk3 fmov v1.d[1], r9 @ CTR block 1 rev r9, r12 @ CTR block 2 @@ -2929,54 +3112,57 @@ aes_gcm_dec_192_kernel: fmov v3.d[1], r9 @ CTR block 3 - ldr q26, [r8, #128] @ load rk8 + ld1 {v22.4s}, [r8], #16 @ load rk4 aese q0, v20.16b aesmc q0, q0 @ AES block 0 - round 2 aese q2, v18.16b aesmc q2, q2 @ AES block 2 - round 0 - ldr q29, [r8, #176] @ load rk11 + ld1 {v23.4s}, [r8], #16 @ load rk5 aese q1, v18.16b aesmc q1, q1 @ AES block 1 - round 0 ldr q15, [r3, #112] @ load h4l | h4h +#ifndef __ARMEB__ ext v15.16b, v15.16b, v15.16b, #8 - +#endif aese q3, v18.16b aesmc q3, q3 @ AES block 3 - round 0 ldr q13, [r3, #64] @ load h2l | h2h +#ifndef __ARMEB__ ext v13.16b, v13.16b, v13.16b, #8 - +#endif aese q2, v19.16b aesmc q2, q2 @ AES block 2 - round 1 ldr q14, [r3, #80] @ load h3l | h3h +#ifndef __ARMEB__ ext v14.16b, v14.16b, v14.16b, #8 - +#endif aese q1, v19.16b aesmc q1, q1 @ AES block 1 - round 1 - ldp r13, r14, [r8, #192] @ load rk12 aese q3, v19.16b aesmc q3, q3 @ AES block 3 - round 1 ldr q12, [r3, #32] @ load h1l | h1h +#ifndef __ARMEB__ ext v12.16b, v12.16b, v12.16b, #8 - +#endif aese q2, v20.16b aesmc q2, q2 @ AES block 2 - round 2 - ldr q28, [r8, #160] @ load rk10 + ld1 {v24.4s}, [r8], #16 @ load rk6 aese q0, v21.16b aesmc q0, q0 @ AES block 0 - round 3 - ldr q27, [r8, #144] @ load rk9 + ld1 {v25.4s}, [r8], #16 @ load rk7 aese q1, v20.16b aesmc q1, q1 @ AES block 1 - round 2 - ldr q25, [r8, #112] @ load rk7 + ld1 {v26.4s}, [r8], #16 @ load rk8 aese q3, v20.16b aesmc q3, q3 @ AES block 3 - round 2 - ldr q22, [r8, #64] @ load rk4 + ld1 {v27.4s}, [r8], #16 @ load rk9 aese q2, v21.16b aesmc q2, q2 @ AES block 2 - round 3 @@ -2994,7 +3180,7 @@ aes_gcm_dec_192_kernel: aese q0, v22.16b aesmc q0, q0 @ AES block 0 - round 4 - ldr q23, [r8, #80] @ load rk5 + ld1 {v28.4s}, [r8], #16 @ load rk10 aese q1, v22.16b aesmc q1, q1 @ AES block 1 - round 4 @@ -3009,7 +3195,7 @@ aes_gcm_dec_192_kernel: aese q0, v23.16b aesmc q0, q0 @ AES block 0 - round 5 - ldr q24, [r8, #96] @ load rk6 + ld1 {v29.4s}, [r8], #16 @ load rk11 aese q1, v23.16b aesmc q1, q1 @ AES block 1 - round 5 @@ -3096,17 +3282,13 @@ aes_gcm_dec_192_kernel: aese q0, v29.16b @ AES block 0 - round 11 bge .L192_dec_tail @ handle tail - ldr q5, [r0, #16] @ AES block 1 - load ciphertext - - ldr q4, [r0, #0] @ AES block 0 - load ciphertext + ld1 {q4, q5}, [r0], #32 @ AES block 0,1 - load ciphertext eor q1, q5, q1 @ AES block 1 - result eor q0, q4, q0 @ AES block 0 - result rev r9, r12 @ CTR block 4 - ldr q7, [r0, #48] @ AES block 3 - load ciphertext - - ldr q6, [r0, #32] @ AES block 2 - load ciphertext + ld1 {q6, q7}, [r0], #32 @ AES block 2,3 - load ciphertext mov r19, v1.d[0] @ AES block 1 - mov low @@ -3118,27 +3300,35 @@ aes_gcm_dec_192_kernel: mov r7, v0.d[1] @ AES block 0 - mov high rev64 q4, q4 @ GHASH block 0 - add r0, r0, #64 @ AES input_ptr update fmov d0, r10 @ CTR block 4 rev64 q5, q5 @ GHASH block 1 cmp r0, r5 @ check if we have <= 8 blocks eor r19, r19, r13 @ AES block 1 - round 12 low +#ifdef __ARMEB__ + rev r19, r19 +#endif fmov v0.d[1], r9 @ CTR block 4 rev r9, r12 @ CTR block 5 orr r9, r11, r9, lsl #32 @ CTR block 5 fmov d1, r10 @ CTR block 5 eor r20, r20, r14 @ AES block 1 - round 12 high - +#ifdef __ARMEB__ + rev r20, r20 +#endif add r12, r12, #1 @ CTR block 5 fmov v1.d[1], r9 @ CTR block 5 eor r6, r6, r13 @ AES block 0 - round 12 low - +#ifdef __ARMEB__ + rev 
r6, r6 +#endif rev r9, r12 @ CTR block 6 eor r7, r7, r14 @ AES block 0 - round 12 high - +#ifdef __ARMEB__ + rev r7, r7 +#endif stp r6, r7, [r2], #16 @ AES block 0 - store result orr r9, r11, r9, lsl #32 @ CTR block 6 @@ -3201,7 +3391,9 @@ aes_gcm_dec_192_kernel: aese q0, v20.16b aesmc q0, q0 @ AES block 4k+4 - round 2 eor r22, r22, r14 @ AES block 4k+2 - round 12 high - +#ifdef __ARMEB__ + rev r22, r22 +#endif aese q2, v19.16b aesmc q2, q2 @ AES block 4k+6 - round 1 eor q4, q4, q5 @ GHASH block 4k+1 - mid @@ -3218,7 +3410,9 @@ aes_gcm_dec_192_kernel: pmull v4.1q, q4, v17.1d @ GHASH block 4k+1 - mid eor v11.16b, v11.16b, v31.16b @ GHASH block 4k+1 - low eor r21, r21, r13 @ AES block 4k+2 - round 12 low - +#ifdef __ARMEB__ + rev r21, r21 +#endif aese q1, v22.16b aesmc q1, q1 @ AES block 4k+5 - round 4 @@ -3320,16 +3514,18 @@ aes_gcm_dec_192_kernel: aese q2, v24.16b aesmc q2, q2 @ AES block 4k+6 - round 6 - ldr q6, [r0, #32] @ AES block 4k+6 - load ciphertext + ld1 {q4}, [r0], #16 @ AES block 4k+4 - load ciphertext aese q3, v24.16b aesmc q3, q3 @ AES block 4k+7 - round 6 eor v10.16b, v10.16b, v30.16b @ MODULO - karatsuba tidy up pmull v31.1q, q9, q8 @ MODULO - top 64b align with mid - ldr q7, [r0, #48] @ AES block 4k+7 - load ciphertext + ld1 {q5}, [r0], #16 @ AES block 4k+5 - load ciphertext eor r23, r23, r13 @ AES block 4k+3 - round 12 low - +#ifdef __ARMEB__ + rev r23, r23 +#endif aese q2, v25.16b aesmc q2, q2 @ AES block 4k+6 - round 7 ext q9, q9, q9, #8 @ MODULO - other top alignment @@ -3343,10 +3539,10 @@ aes_gcm_dec_192_kernel: aese q2, v26.16b aesmc q2, q2 @ AES block 4k+6 - round 8 - ldr q4, [r0, #0] @ AES block 4k+4 - load ciphertext + ld1 {q6}, [r0], #16 @ AES block 4k+6 - load ciphertext aese q1, v29.16b @ AES block 4k+5 - round 11 - ldr q5, [r0, #16] @ AES block 4k+5 - load ciphertext + ld1 {q7}, [r0], #16 @ AES block 4k+7 - load ciphertext rev r9, r12 @ CTR block 4k+8 aese q3, v26.16b @@ -3357,11 +3553,13 @@ aes_gcm_dec_192_kernel: aesmc q2, q2 @ AES block 4k+6 - round 9 eor v10.16b, v10.16b, q9 @ MODULO - fold into mid - add r0, r0, #64 @ AES input_ptr update cmp r0, r5 @ .LOOP CONTROL eor q0, q4, q0 @ AES block 4k+4 - result eor r24, r24, r14 @ AES block 4k+3 - round 12 high +#ifdef __ARMEB__ + rev r24, r24 +#endif eor q1, q5, q1 @ AES block 4k+5 - result aese q2, v28.16b @@ -3394,18 +3592,28 @@ aes_gcm_dec_192_kernel: rev r9, r12 @ CTR block 4k+9 eor r6, r6, r13 @ AES block 4k+4 - round 12 low +#ifdef __ARMEB__ + rev r6, r6 +#endif orr r9, r11, r9, lsl #32 @ CTR block 4k+9 eor v11.16b, v11.16b, q8 @ MODULO - fold into low fmov d1, r10 @ CTR block 4k+9 add r12, r12, #1 @ CTR block 4k+9 eor r19, r19, r13 @ AES block 4k+5 - round 12 low - +#ifdef __ARMEB__ + rev r19, r19 +#endif fmov v1.d[1], r9 @ CTR block 4k+9 rev r9, r12 @ CTR block 4k+10 eor r20, r20, r14 @ AES block 4k+5 - round 12 high - +#ifdef __ARMEB__ + rev r20, r20 +#endif eor r7, r7, r14 @ AES block 4k+4 - round 12 high +#ifdef __ARMEB__ + rev r7, r7 +#endif stp r6, r7, [r2], #16 @ AES block 4k+4 - store result eor v11.16b, v11.16b, v10.16b @ MODULO - fold into low @@ -3459,18 +3667,29 @@ aes_gcm_dec_192_kernel: pmull v31.1q, q5, v14.1d @ GHASH block 4k+1 - low eor r24, r24, r14 @ AES block 4k+3 - round 12 high +#ifdef __ARMEB__ + rev r24, r24 +#endif fmov v3.d[1], r9 @ CTR block 4k+7 aese q0, v20.16b aesmc q0, q0 @ AES block 4k+4 - round 2 eor r21, r21, r13 @ AES block 4k+2 - round 12 low - +#ifdef __ARMEB__ + rev r21, r21 +#endif pmull2 v30.1q, q5, v14.2d @ GHASH block 4k+1 - high eor r22, r22, r14 @ AES block 
4k+2 - round 12 high +#ifdef __ARMEB__ + rev r22, r22 +#endif eor q4, q4, q5 @ GHASH block 4k+1 - mid pmull v10.1q, q8, v10.1d @ GHASH block 4k - mid eor r23, r23, r13 @ AES block 4k+3 - round 12 low +#ifdef __ARMEB__ + rev r23, r23 +#endif stp r21, r22, [r2], #16 @ AES block 4k+2 - store result rev64 q7, q7 @ GHASH block 4k+3 @@ -3660,8 +3879,13 @@ aes_gcm_dec_192_kernel: cmp r5, #48 eor r7, r7, r14 @ AES block 4k+4 - round 12 high - +#ifdef __ARMEB__ + rev r7, r7 +#endif eor r6, r6, r13 @ AES block 4k+4 - round 12 low +#ifdef __ARMEB__ + rev r6, r6 +#endif bgt .L192_dec_blocks_more_than_3 movi v11.8b, #0 @@ -3705,10 +3929,16 @@ aes_gcm_dec_192_kernel: pmull2 v9.1q, q4, v15.2d @ GHASH final-3 block - high eor r6, r6, r13 @ AES final-2 block - round 12 low +#ifdef __ARMEB__ + rev r6, r6 +#endif movi q8, #0 @ suppress further partial tag feed in pmull v10.1q, v22.1d, v10.1d @ GHASH final-3 block - mid eor r7, r7, r14 @ AES final-2 block - round 12 high +#ifdef __ARMEB__ + rev r7, r7 +#endif .L192_dec_blocks_more_than_2:@ blocks left > 2 rev64 q4, q5 @ GHASH final-2 block @@ -3738,8 +3968,13 @@ aes_gcm_dec_192_kernel: eor q9, q9, v20.16b @ GHASH final-2 block - high eor r7, r7, r14 @ AES final-1 block - round 12 high - +#ifdef __ARMEB__ + rev r7, r7 +#endif eor r6, r6, r13 @ AES final-1 block - round 12 low +#ifdef __ARMEB__ + rev r6, r6 +#endif eor v10.16b, v10.16b, v22.16b @ GHASH final-2 block - mid .L192_dec_blocks_more_than_1:@ blocks left > 1 @@ -3770,9 +4005,13 @@ aes_gcm_dec_192_kernel: movi q8, #0 @ suppress further partial tag feed in eor v11.16b, v11.16b, v21.16b @ GHASH final-1 block - low eor r7, r7, r14 @ AES final block - round 12 high - +#ifdef __ARMEB__ + rev r7, r7 +#endif eor r6, r6, r13 @ AES final block - round 12 low - +#ifdef __ARMEB__ + rev r6, r6 +#endif eor v10.16b, v10.16b, v22.16b @ GHASH final-1 block - mid .L192_dec_blocks_less_than_1:@ blocks left <= 1 @@ -3799,8 +4038,11 @@ aes_gcm_dec_192_kernel: orr r6, r6, r4 mov v0.d[1], r10 - +#ifndef __ARMEB__ rev r9, r12 +#else + mov r9, r12 +#endif and q5, q5, q0 @ possibly partial last block has zeroes in highest bits str r9, [r16, #12] @ store the updated counter @@ -3888,14 +4130,22 @@ aes_gcm_enc_256_kernel: lsr r5, r1, #3 @ byte_len mov r15, r5 ldp r10, r11, [r16] @ ctr96_b64, ctr96_t32 - +#ifdef __ARMEB__ + rev r10, r10 + rev r11, r11 +#endif + ldp r13, r14, [r8, #224] @ load rk14 +#ifdef __ARMEB__ + ror r13, r13, #32 + ror r14, r14, #32 +#endif ld1 { q0}, [r16] @ special case vector load initial counter so we can start first AES block as quickly as possible sub r5, r5, #1 @ byte_len - 1 - ldr q18, [r8, #0] @ load rk0 + ld1 {v18.4s}, [r8], #16 @ load rk0 and r5, r5, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail) - ldr q25, [r8, #112] @ load rk7 + ld1 {v19.4s}, [r8], #16 @ load rk1 add r5, r5, r0 lsr r12, r11, #32 @@ -3915,14 +4165,14 @@ aes_gcm_enc_256_kernel: orr r9, r11, r9, lsl #32 @ CTR block 1 add r12, r12, #1 @ CTR block 1 - ldr q19, [r8, #16] @ load rk1 + ld1 {v20.4s}, [r8], #16 @ load rk2 fmov v1.d[1], r9 @ CTR block 1 rev r9, r12 @ CTR block 2 add r12, r12, #1 @ CTR block 2 orr r9, r11, r9, lsl #32 @ CTR block 2 - ldr q20, [r8, #32] @ load rk2 + ld1 {v21.4s}, [r8], #16 @ load rk3 fmov v2.d[1], r9 @ CTR block 2 rev r9, r12 @ CTR block 3 @@ -3935,50 +4185,53 @@ aes_gcm_enc_256_kernel: aese q1, v18.16b aesmc q1, q1 @ AES block 1 - round 0 - ldr q21, [r8, #48] @ load rk3 + ld1 {v22.4s}, [r8], #16 @ load rk4 aese q0, v20.16b aesmc q0, q0 @ AES 
block 0 - round 2 - ldr q24, [r8, #96] @ load rk6 + ld1 {v23.4s}, [r8], #16 @ load rk5 aese q2, v18.16b aesmc q2, q2 @ AES block 2 - round 0 - ldr q23, [r8, #80] @ load rk5 + ld1 {v24.4s}, [r8], #16 @ load rk6 aese q1, v19.16b aesmc q1, q1 @ AES block 1 - round 1 ldr q14, [r3, #80] @ load h3l | h3h +#ifndef __ARMEB__ ext v14.16b, v14.16b, v14.16b, #8 - +#endif aese q3, v18.16b aesmc q3, q3 @ AES block 3 - round 0 - ldr q31, [r8, #208] @ load rk13 + ld1 {v25.4s}, [r8], #16 @ load rk7 aese q2, v19.16b aesmc q2, q2 @ AES block 2 - round 1 - ldr q22, [r8, #64] @ load rk4 + ld1 {v26.4s}, [r8], #16 @ load rk8 aese q1, v20.16b aesmc q1, q1 @ AES block 1 - round 2 ldr q13, [r3, #64] @ load h2l | h2h +#ifndef __ARMEB__ ext v13.16b, v13.16b, v13.16b, #8 - +#endif aese q3, v19.16b aesmc q3, q3 @ AES block 3 - round 1 - ldr q30, [r8, #192] @ load rk12 + ld1 {v27.4s}, [r8], #16 @ load rk9 aese q2, v20.16b aesmc q2, q2 @ AES block 2 - round 2 ldr q15, [r3, #112] @ load h4l | h4h +#ifndef __ARMEB__ ext v15.16b, v15.16b, v15.16b, #8 - +#endif aese q1, v21.16b aesmc q1, q1 @ AES block 1 - round 3 - ldr q29, [r8, #176] @ load rk11 + ld1 {v28.4s}, [r8], #16 @ load rk10 aese q3, v20.16b aesmc q3, q3 @ AES block 3 - round 2 - ldr q26, [r8, #128] @ load rk8 + ld1 {v29.4s}, [r8], #16 @ load rk11 aese q2, v21.16b aesmc q2, q2 @ AES block 2 - round 3 @@ -3986,7 +4239,6 @@ aes_gcm_enc_256_kernel: aese q0, v21.16b aesmc q0, q0 @ AES block 0 - round 3 - ldp r13, r14, [r8, #224] @ load rk14 aese q3, v21.16b aesmc q3, q3 @ AES block 3 - round 3 @@ -4024,16 +4276,17 @@ aes_gcm_enc_256_kernel: aese q3, v24.16b aesmc q3, q3 @ AES block 3 - round 6 - ldr q27, [r8, #144] @ load rk9 + ld1 {v30.4s}, [r8], #16 @ load rk12 aese q0, v24.16b aesmc q0, q0 @ AES block 0 - round 6 ldr q12, [r3, #32] @ load h1l | h1h +#ifndef __ARMEB__ ext v12.16b, v12.16b, v12.16b, #8 - +#endif aese q2, v24.16b aesmc q2, q2 @ AES block 2 - round 6 - ldr q28, [r8, #160] @ load rk10 + ld1 {v31.4s}, [r8], #16 @ load rk13 aese q1, v25.16b aesmc q1, q1 @ AES block 1 - round 7 @@ -4122,13 +4375,26 @@ aes_gcm_enc_256_kernel: bge .L256_enc_tail @ handle tail ldp r19, r20, [r0, #16] @ AES block 1 - load plaintext - +#ifdef __ARMEB__ + rev r19, r19 + rev r20, r20 +#endif rev r9, r12 @ CTR block 4 ldp r6, r7, [r0, #0] @ AES block 0 - load plaintext - +#ifdef __ARMEB__ + rev r6, r6 + rev r7, r7 +#endif ldp r23, r24, [r0, #48] @ AES block 3 - load plaintext - +#ifdef __ARMEB__ + rev r23, r23 + rev r24, r24 +#endif ldp r21, r22, [r0, #32] @ AES block 2 - load plaintext +#ifdef __ARMEB__ + rev r21, r21 + rev r22, r22 +#endif add r0, r0, #64 @ AES input_ptr update eor r19, r19, r13 @ AES block 1 - round 14 low @@ -4211,11 +4477,17 @@ aes_gcm_enc_256_kernel: aese q1, v19.16b aesmc q1, q1 @ AES block 4k+5 - round 1 ldp r23, r24, [r0, #48] @ AES block 4k+7 - load plaintext - +#ifdef __ARMEB__ + rev r23, r23 + rev r24, r24 +#endif aese q2, v19.16b aesmc q2, q2 @ AES block 4k+6 - round 1 ldp r21, r22, [r0, #32] @ AES block 4k+6 - load plaintext - +#ifdef __ARMEB__ + rev r21, r21 + rev r22, r22 +#endif aese q0, v20.16b aesmc q0, q0 @ AES block 4k+4 - round 2 eor q4, q4, v11.16b @ PRE 1 @@ -4325,7 +4597,10 @@ aes_gcm_enc_256_kernel: aese q3, v24.16b aesmc q3, q3 @ AES block 4k+7 - round 6 ldp r19, r20, [r0, #16] @ AES block 4k+5 - load plaintext - +#ifdef __ARMEB__ + rev r19, r19 + rev r20, r20 +#endif aese q1, v26.16b aesmc q1, q1 @ AES block 4k+5 - round 8 mov d4, v7.d[1] @ GHASH block 4k+3 - mid @@ -4362,7 +4637,10 @@ aes_gcm_enc_256_kernel: aese q2, v26.16b aesmc q2, 
q2 @ AES block 4k+6 - round 8 ldp r6, r7, [r0, #0] @ AES block 4k+4 - load plaintext - +#ifdef __ARMEB__ + rev r6, r6 + rev r7, r7 +#endif aese q0, v28.16b aesmc q0, q0 @ AES block 4k+4 - round 10 shl d8, d8, #56 @ mod_constant @@ -4724,7 +5002,10 @@ aes_gcm_enc_256_kernel: ext q8, v11.16b, v11.16b, #8 @ prepare final partial tag sub r5, r4, r0 @ main_end_input_ptr is number of bytes left to process ldp r6, r7, [r0], #16 @ AES block 4k+4 - load plaintext - +#ifdef __ARMEB__ + rev r6, r6 + rev r7, r7 +#endif eor r6, r6, r13 @ AES block 4k+4 - round 14 low eor r7, r7, r14 @ AES block 4k+4 - round 14 high @@ -4759,7 +5040,10 @@ aes_gcm_enc_256_kernel: st1 { q5}, [r2], #16 @ AES final-3 block - store result ldp r6, r7, [r0], #16 @ AES final-2 block - load input low & high - +#ifdef __ARMEB__ + rev r6, r6 + rev r7, r7 +#endif rev64 q4, q5 @ GHASH final-3 block eor r6, r6, r13 @ AES final-2 block - round 14 low @@ -4788,7 +5072,10 @@ aes_gcm_enc_256_kernel: st1 { q5}, [r2], #16 @ AES final-2 block - store result ldp r6, r7, [r0], #16 @ AES final-1 block - load input low & high - +#ifdef __ARMEB__ + rev r6, r6 + rev r7, r7 +#endif rev64 q4, q5 @ GHASH final-2 block eor r6, r6, r13 @ AES final-1 block - round 14 low @@ -4824,7 +5111,10 @@ aes_gcm_enc_256_kernel: rev64 q4, q5 @ GHASH final-1 block ldp r6, r7, [r0], #16 @ AES final block - load input low & high - +#ifdef __ARMEB__ + rev r6, r6 + rev r7, r7 +#endif eor q4, q4, q8 @ feed in partial tag movi q8, #0 @ suppress further partial tag feed in @@ -4885,7 +5175,11 @@ aes_gcm_enc_256_kernel: pmull2 v20.1q, q4, v12.2d @ GHASH final block - high mov d8, v4.d[1] @ GHASH final block - mid +#ifndef __ARMEB__ rev r9, r12 +#else + mov r9, r12 +#endif pmull v21.1q, q4, v12.1d @ GHASH final block - low @@ -4959,21 +5253,29 @@ aes_gcm_dec_256_kernel: lsr r5, r1, #3 @ byte_len mov r15, r5 ldp r10, r11, [r16] @ ctr96_b64, ctr96_t32 - - ldr q26, [r8, #128] @ load rk8 +#ifdef __ARMEB__ + rev r10, r10 + rev r11, r11 +#endif + ldp r13, r14, [r8, #224] @ load rk14 +#ifdef __ARMEB__ + ror r14, r14, #32 + ror r13, r13, #32 +#endif + ld1 {v18.4s}, [r8], #16 @ load rk0 sub r5, r5, #1 @ byte_len - 1 - ldr q25, [r8, #112] @ load rk7 + ld1 {v19.4s}, [r8], #16 @ load rk1 and r5, r5, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail) add r4, r0, r1, lsr #3 @ end_input_ptr - ldr q24, [r8, #96] @ load rk6 + ld1 {v20.4s}, [r8], #16 @ load rk2 lsr r12, r11, #32 - ldr q23, [r8, #80] @ load rk5 + ld1 {v21.4s}, [r8], #16 @ load rk3 orr r11, r11, r11 - ldr q21, [r8, #48] @ load rk3 + ld1 {v22.4s}, [r8], #16 @ load rk4 add r5, r5, r0 rev r12, r12 @ rev_ctr32 @@ -4998,39 +5300,44 @@ aes_gcm_dec_256_kernel: rev r9, r12 @ CTR block 3 orr r9, r11, r9, lsl #32 @ CTR block 3 - ldr q18, [r8, #0] @ load rk0 + ld1 {v23.4s}, [r8], #16 @ load rk5 fmov v3.d[1], r9 @ CTR block 3 add r12, r12, #1 @ CTR block 3 - ldr q22, [r8, #64] @ load rk4 + ld1 {v24.4s}, [r8], #16 @ load rk6 - ldr q31, [r8, #208] @ load rk13 + ld1 {v25.4s}, [r8], #16 @ load rk7 - ldr q19, [r8, #16] @ load rk1 + ld1 {v26.4s}, [r8], #16 @ load rk8 aese q0, v18.16b aesmc q0, q0 @ AES block 0 - round 0 ldr q14, [r3, #80] @ load h3l | h3h +#ifndef __ARMEB__ ext v14.16b, v14.16b, v14.16b, #8 +#endif aese q3, v18.16b aesmc q3, q3 @ AES block 3 - round 0 ldr q15, [r3, #112] @ load h4l | h4h +#ifndef __ARMEB__ ext v15.16b, v15.16b, v15.16b, #8 +#endif aese q1, v18.16b aesmc q1, q1 @ AES block 1 - round 0 ldr q13, [r3, #64] @ load h2l | h2h +#ifndef __ARMEB__ ext v13.16b, 
v13.16b, v13.16b, #8 +#endif aese q2, v18.16b aesmc q2, q2 @ AES block 2 - round 0 - ldr q20, [r8, #32] @ load rk2 + ld1 {v27.4s}, [r8], #16 @ load rk9 aese q0, v19.16b aesmc q0, q0 @ AES block 0 - round 1 - ldp r13, r14, [r8, #224] @ load rk14 aese q1, v19.16b aesmc q1, q1 @ AES block 1 - round 1 @@ -5040,20 +5347,21 @@ aes_gcm_dec_256_kernel: aese q2, v19.16b aesmc q2, q2 @ AES block 2 - round 1 - ldr q27, [r8, #144] @ load rk9 + ld1 {v28.4s}, [r8], #16 @ load rk10 aese q3, v19.16b aesmc q3, q3 @ AES block 3 - round 1 - ldr q30, [r8, #192] @ load rk12 + ld1 {v29.4s}, [r8], #16 @ load rk11 aese q0, v20.16b aesmc q0, q0 @ AES block 0 - round 2 ldr q12, [r3, #32] @ load h1l | h1h +#ifndef __ARMEB__ ext v12.16b, v12.16b, v12.16b, #8 - +#endif aese q2, v20.16b aesmc q2, q2 @ AES block 2 - round 2 - ldr q28, [r8, #160] @ load rk10 + ld1 {v30.4s}, [r8], #16 @ load rk12 aese q3, v20.16b aesmc q3, q3 @ AES block 3 - round 2 @@ -5136,7 +5444,7 @@ aes_gcm_dec_256_kernel: aese q2, v26.16b aesmc q2, q2 @ AES block 2 - round 8 - ldr q29, [r8, #176] @ load rk11 + ld1 {v31.4s}, [r8], #16 @ load rk13 aese q1, v27.16b aesmc q1, q1 @ AES block 1 - round 9 @@ -5201,9 +5509,7 @@ aes_gcm_dec_256_kernel: aese q0, v31.16b @ AES block 0 - round 13 bge .L256_dec_tail @ handle tail - ldr q4, [r0, #0] @ AES block 0 - load ciphertext - - ldr q5, [r0, #16] @ AES block 1 - load ciphertext + ld1 {q4, q5}, [r0], #32 @ AES block 0,1 - load ciphertext rev r9, r12 @ CTR block 4 @@ -5211,7 +5517,7 @@ aes_gcm_dec_256_kernel: eor q1, q5, q1 @ AES block 1 - result rev64 q5, q5 @ GHASH block 1 - ldr q7, [r0, #48] @ AES block 3 - load ciphertext + ld1 {q6}, [r0], #16 @ AES block 2 - load ciphertext mov r7, v0.d[1] @ AES block 0 - mov high @@ -5231,22 +5537,32 @@ aes_gcm_dec_256_kernel: orr r9, r11, r9, lsl #32 @ CTR block 5 mov r20, v1.d[1] @ AES block 1 - mov high eor r7, r7, r14 @ AES block 0 - round 14 high - +#ifdef __ARMEB__ + rev r7, r7 +#endif eor r6, r6, r13 @ AES block 0 - round 14 low +#ifdef __ARMEB__ + rev r6, r6 +#endif stp r6, r7, [r2], #16 @ AES block 0 - store result fmov d1, r10 @ CTR block 5 - ldr q6, [r0, #32] @ AES block 2 - load ciphertext - add r0, r0, #64 @ AES input_ptr update + ld1 {q7}, [r0], #16 @ AES block 3 - load ciphertext fmov v1.d[1], r9 @ CTR block 5 rev r9, r12 @ CTR block 6 add r12, r12, #1 @ CTR block 6 eor r19, r19, r13 @ AES block 1 - round 14 low +#ifdef __ARMEB__ + rev r19, r19 +#endif orr r9, r11, r9, lsl #32 @ CTR block 6 eor r20, r20, r14 @ AES block 1 - round 14 high +#ifdef __ARMEB__ + rev r20, r20 +#endif stp r19, r20, [r2], #16 @ AES block 1 - store result eor q2, q6, q2 @ AES block 2 - result @@ -5297,7 +5613,9 @@ aes_gcm_dec_256_kernel: aese q0, v21.16b aesmc q0, q0 @ AES block 4k+4 - round 3 eor r22, r22, r14 @ AES block 4k+2 - round 14 high - +#ifdef __ARMEB__ + rev r22, r22 +#endif aese q2, v19.16b aesmc q2, q2 @ AES block 4k+6 - round 1 mov d10, v17.d[1] @ GHASH block 4k - mid @@ -5309,7 +5627,9 @@ aes_gcm_dec_256_kernel: aese q3, v18.16b aesmc q3, q3 @ AES block 4k+7 - round 0 eor r21, r21, r13 @ AES block 4k+2 - round 14 low - +#ifdef __ARMEB__ + rev r21, r21 +#endif aese q2, v20.16b aesmc q2, q2 @ AES block 4k+6 - round 2 stp r21, r22, [r2], #16 @ AES block 4k+2 - store result @@ -5324,9 +5644,14 @@ aes_gcm_dec_256_kernel: pmull v10.1q, q8, v10.1d @ GHASH block 4k - mid eor r23, r23, r13 @ AES block 4k+3 - round 14 low - +#ifdef __ARMEB__ + rev r23, r23 +#endif pmull v8.1q, q5, v14.1d @ GHASH block 4k+1 - low eor r24, r24, r14 @ AES block 4k+3 - round 14 high +#ifdef 
__ARMEB__ + rev r24, r24 +#endif eor q9, q9, q4 @ GHASH block 4k+1 - high aese q2, v22.16b @@ -5447,7 +5772,7 @@ aes_gcm_dec_256_kernel: aese q1, v27.16b aesmc q1, q1 @ AES block 4k+5 - round 9 - ldr q4, [r0, #0] @ AES block 4k+4 - load ciphertext + ld1 {q4}, [r0], #16 @ AES block 4k+4 - load ciphertext aese q0, v31.16b @ AES block 4k+4 - round 13 ext q9, q9, q9, #8 @ MODULO - other top alignment @@ -5458,7 +5783,7 @@ aes_gcm_dec_256_kernel: aese q2, v27.16b aesmc q2, q2 @ AES block 4k+6 - round 9 - ldr q5, [r0, #16] @ AES block 4k+5 - load ciphertext + ld1 {q5}, [r0], #16 @ AES block 4k+5 - load ciphertext aese q3, v26.16b aesmc q3, q3 @ AES block 4k+7 - round 8 @@ -5474,11 +5799,11 @@ aes_gcm_dec_256_kernel: aese q3, v27.16b aesmc q3, q3 @ AES block 4k+7 - round 9 - ldr q7, [r0, #48] @ AES block 4k+7 - load ciphertext + ld1 {q6}, [r0], #16 @ AES block 4k+6 - load ciphertext aese q1, v30.16b aesmc q1, q1 @ AES block 4k+5 - round 12 - ldr q6, [r0, #32] @ AES block 4k+6 - load ciphertext + ld1 {q7}, [r0], #16 @ AES block 4k+7 - load ciphertext aese q2, v29.16b aesmc q2, q2 @ AES block 4k+6 - round 11 @@ -5489,7 +5814,6 @@ aes_gcm_dec_256_kernel: eor v10.16b, v10.16b, q9 @ MODULO - fold into mid aese q1, v31.16b @ AES block 4k+5 - round 13 - add r0, r0, #64 @ AES input_ptr update mov r6, v0.d[0] @ AES block 4k+4 - mov low aese q2, v30.16b @@ -5511,8 +5835,13 @@ aes_gcm_dec_256_kernel: add r12, r12, #1 @ CTR block 4k+9 eor r6, r6, r13 @ AES block 4k+4 - round 14 low +#ifdef __ARMEB__ + rev r6, r6 +#endif eor r7, r7, r14 @ AES block 4k+4 - round 14 high - +#ifdef __ARMEB__ + rev r7, r7 +#endif mov r20, v1.d[1] @ AES block 4k+5 - mov high eor q2, q6, q2 @ AES block 4k+6 - result eor v11.16b, v11.16b, q8 @ MODULO - fold into low @@ -5533,9 +5862,15 @@ aes_gcm_dec_256_kernel: rev64 q5, q5 @ GHASH block 4k+5 eor r20, r20, r14 @ AES block 4k+5 - round 14 high +#ifdef __ARMEB__ + rev r20, r20 +#endif stp r6, r7, [r2], #16 @ AES block 4k+4 - store result eor r19, r19, r13 @ AES block 4k+5 - round 14 low +#ifdef __ARMEB__ + rev r19, r19 +#endif stp r19, r20, [r2], #16 @ AES block 4k+5 - store result rev64 q4, q4 @ GHASH block 4k+4 @@ -5742,11 +6077,15 @@ aes_gcm_dec_256_kernel: aese q0, v28.16b aesmc q0, q0 @ AES block 4k+4 - round 10 eor r22, r22, r14 @ AES block 4k+2 - round 14 high - +#ifdef __ARMEB__ + rev r22, r22 +#endif aese q1, v28.16b aesmc q1, q1 @ AES block 4k+5 - round 10 eor r23, r23, r13 @ AES block 4k+3 - round 14 low - +#ifdef __ARMEB__ + rev r23, r23 +#endif aese q2, v29.16b aesmc q2, q2 @ AES block 4k+6 - round 11 eor v10.16b, v10.16b, q9 @ MODULO - fold into mid @@ -5758,12 +6097,18 @@ aes_gcm_dec_256_kernel: aese q1, v29.16b aesmc q1, q1 @ AES block 4k+5 - round 11 eor r21, r21, r13 @ AES block 4k+2 - round 14 low +#ifdef __ARMEB__ + rev r21, r21 +#endif aese q2, v30.16b aesmc q2, q2 @ AES block 4k+6 - round 12 pmull v8.1q, v10.1d, q8 @ MODULO - mid 64b align with low eor r24, r24, r14 @ AES block 4k+3 - round 14 high +#ifdef __ARMEB__ + rev r24, r24 +#endif aese q3, v29.16b aesmc q3, q3 @ AES block 4k+7 - round 11 @@ -5804,8 +6149,14 @@ aes_gcm_dec_256_kernel: cmp r5, #48 eor r6, r6, r13 @ AES block 4k+4 - round 14 low +#ifdef __ARMEB__ + rev r6, r6 +#endif eor r7, r7, r14 @ AES block 4k+4 - round 14 high +#ifdef __ARMEB__ + rev r7, r7 +#endif bgt .L256_dec_blocks_more_than_3 sub r12, r12, #1 @@ -5853,9 +6204,15 @@ aes_gcm_dec_256_kernel: pmull v10.1q, v22.1d, v10.1d @ GHASH final-3 block - mid eor r6, r6, r13 @ AES final-2 block - round 14 low +#ifdef __ARMEB__ + rev r6, r6 
+#endif pmull v11.1q, q4, v15.1d @ GHASH final-3 block - low eor r7, r7, r14 @ AES final-2 block - round 14 high +#ifdef __ARMEB__ + rev r7, r7 +#endif .L256_dec_blocks_more_than_2:@ blocks left > 2 rev64 q4, q5 @ GHASH final-2 block @@ -5883,9 +6240,15 @@ aes_gcm_dec_256_kernel: eor q9, q9, v20.16b @ GHASH final-2 block - high eor r6, r6, r13 @ AES final-1 block - round 14 low +#ifdef __ARMEB__ + rev r6, r6 +#endif eor v10.16b, v10.16b, v22.16b @ GHASH final-2 block - mid eor r7, r7, r14 @ AES final-1 block - round 14 high +#ifdef __ARMEB__ + rev r7, r7 +#endif .L256_dec_blocks_more_than_1:@ blocks left > 1 stp r6, r7, [r2], #16 @ AES final-1 block - store result @@ -5913,13 +6276,18 @@ aes_gcm_dec_256_kernel: pmull2 v22.1q, v22.2d, v16.2d @ GHASH final-1 block - mid eor r6, r6, r13 @ AES final block - round 14 low - +#ifdef __ARMEB__ + rev r6, r6 +#endif eor v11.16b, v11.16b, v21.16b @ GHASH final-1 block - low eor q9, q9, v20.16b @ GHASH final-1 block - high eor v10.16b, v10.16b, v22.16b @ GHASH final-1 block - mid eor r7, r7, r14 @ AES final block - round 14 high +#ifdef __ARMEB__ + rev r7, r7 +#endif .L256_dec_blocks_less_than_1:@ blocks left <= 1 and r1, r1, #127 @ bit_length %= 128 @@ -5945,7 +6313,11 @@ aes_gcm_dec_256_kernel: mov v0.d[1], r10 bic r4, r4, r9 @ mask out low existing bytes +#ifndef __ARMEB__ rev r9, r12 +#else + mov r9, r12 +#endif bic r5, r5, r10 @ mask out high existing bytes Index: src/crypto/external/bsd/openssl/lib/libcrypto/arch/arm/bsaes-armv7.S diff -u src/crypto/external/bsd/openssl/lib/libcrypto/arch/arm/bsaes-armv7.S:1.6 src/crypto/external/bsd/openssl/lib/libcrypto/arch/arm/bsaes-armv7.S:1.7 --- src/crypto/external/bsd/openssl/lib/libcrypto/arch/arm/bsaes-armv7.S:1.6 Tue May 9 13:21:16 2023 +++ src/crypto/external/bsd/openssl/lib/libcrypto/arch/arm/bsaes-armv7.S Wed May 31 15:35:31 2023 @@ -1,5 +1,5 @@ #include "arm_asm.h" -@ Copyright 2012-2021 The OpenSSL Project Authors. All Rights Reserved. +@ Copyright 2012-2023 The OpenSSL Project Authors. All Rights Reserved. @ @ Licensed under the Apache License 2.0 (the "License"). You may not use @ this file except in compliance with the License. You can obtain a copy @@ -14,7 +14,7 @@ @ details see http://www.openssl.org/~appro/cryptogams/. @ @ Specific modes and adaptation for Linux kernel by Ard Biesheuvel -@ of Linaro. Permission to use under GPL terms is granted. +@ of Linaro. 
@ ==================================================================== @ Bit-sliced AES for ARM NEON @@ -1394,7 +1394,7 @@ ossl_bsaes_ctr32_encrypt_blocks: .align 2 add r12, r3, #248 vld1.8 {q0}, [r8] @ load counter - adrl r8, .LREVM0SR @ borrow r8 + add r8, r6, #.LREVM0SR-.LM0 @ borrow r8 vldmia r12, {q4} @ load round0 key sub sp, #0x10 @ place for adjusted round0 key #endif Index: src/crypto/external/bsd/openssl/lib/libcrypto/arch/powerpc/ecp_nistp521-ppc64.S diff -u src/crypto/external/bsd/openssl/lib/libcrypto/arch/powerpc/ecp_nistp521-ppc64.S:1.1 src/crypto/external/bsd/openssl/lib/libcrypto/arch/powerpc/ecp_nistp521-ppc64.S:1.2 --- src/crypto/external/bsd/openssl/lib/libcrypto/arch/powerpc/ecp_nistp521-ppc64.S:1.1 Tue May 9 13:22:44 2023 +++ src/crypto/external/bsd/openssl/lib/libcrypto/arch/powerpc/ecp_nistp521-ppc64.S Wed May 31 15:35:31 2023 @@ -1,3 +1,4 @@ +.machine "any" .text .globl p521_felem_mul Index: src/crypto/external/bsd/openssl/lib/libcrypto/arch/powerpc64/ecp_nistp521-ppc64.S diff -u src/crypto/external/bsd/openssl/lib/libcrypto/arch/powerpc64/ecp_nistp521-ppc64.S:1.1 src/crypto/external/bsd/openssl/lib/libcrypto/arch/powerpc64/ecp_nistp521-ppc64.S:1.2 --- src/crypto/external/bsd/openssl/lib/libcrypto/arch/powerpc64/ecp_nistp521-ppc64.S:1.1 Tue May 9 13:22:44 2023 +++ src/crypto/external/bsd/openssl/lib/libcrypto/arch/powerpc64/ecp_nistp521-ppc64.S Wed May 31 15:35:31 2023 @@ -1,3 +1,4 @@ +.machine "any" .text .globl p521_felem_mul
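The __AARCH64EB__ and __ARMEB__ guards added throughout the regenerated AES-GCM kernels above exist because those kernels move plaintext, ciphertext and the final round key through plain 64-bit integer loads and stores (ldp/stp). On a big-endian build such loads see the bytes of the stream in the opposite order, so the code conditionally byte-reverses the data words (rev) and rotates the preloaded round-key halves (ror #32), and the remaining round keys are now fetched with ld1 {v.4s}, which loads 32-bit elements and stays correct on either endianness, instead of ldr q. Below is a minimal C sketch of the same idea; it is illustrative only, is not taken from the modified files, and the helper name load_block64 is invented for the example.

/*
 * Minimal illustrative sketch (not part of the diffs above): why code that
 * reads a byte stream through native 64-bit loads must byte-swap on a
 * big-endian target, mirroring the "#ifdef __AARCH64EB__ / rev" guards
 * added in the AES-GCM kernels.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint64_t
load_block64(const unsigned char *p)
{
	uint64_t v;

	memcpy(&v, p, sizeof v);	/* native-endian load, like ldp */
#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
	v = __builtin_bswap64(v);	/* plays the role of "rev xN, xN" */
#endif
	return v;
}

int
main(void)
{
	static const unsigned char block[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };

	/* Prints 0807060504030201 on any host once the swap is applied. */
	printf("%016llx\n", (unsigned long long)load_block64(block));
	return 0;
}

On a little-endian host the preprocessor branch compiles away entirely, just as the added #ifdef blocks in the assembly contribute no instructions there.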