From: Zbigniew Bodek <zbigniew.bo...@caviumnetworks.com>

This patch adds core low-level crypto operations for ARMv8 processors.
The assembly code is the base for an optimized PMD and is currently
excluded from the build.
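For context only (an illustration, not part of the patch): the new files export the entry points declared below. The SHA prototypes are quoted from the comment headers of sha1_core.S/sha256_core.S; the AES key-schedule signatures are assumptions inferred from the register-usage comments (x0 = expanded key area, x1 = user key). A usage sketch for the partial-hash routines follows after the patch.

/* Illustrative declarations only -- not part of the patch. */
#include <stdint.h>

/* Assumed from the register comments: each routine expands a 16-byte
 * user key into 11 round keys (176 bytes) at expanded_key. */
void aes128_key_sched_enc(uint8_t *expanded_key, const uint8_t *user_key);
void aes128_key_sched_dec(uint8_t *expanded_key, const uint8_t *user_key);

/* Quoted from the file headers: return 0 on success, -1 on bad length;
 * init == NULL selects the default initial hash state. */
int sha1_block_partial(uint8_t *init, uint8_t *dsrc, uint8_t *ddst,
		uint64_t len);
int sha1_block(uint8_t *init, uint8_t *dsrc, uint8_t *ddst, uint64_t len);
int sha256_block_partial(uint8_t *init, uint8_t *dsrc, uint8_t *ddst,
		uint64_t len);
int sha256_block(uint8_t *init, uint8_t *dsrc, uint8_t *ddst, uint64_t len);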
Standalone SHA1 and SHA256 are provided to support partial hashing of inner/outer key+padding and authentication keys longer than 160/256 bits. Optimized AES key schedule is also included. Signed-off-by: Zbigniew Bodek <zbigniew.bo...@caviumnetworks.com> Signed-off-by: Emery Davis <emery.da...@caviumnetworks.com> --- drivers/crypto/armv8/asm/aes_core.S | 151 ++++++++++ drivers/crypto/armv8/asm/sha1_core.S | 518 ++++++++++++++++++++++++++++++++ drivers/crypto/armv8/asm/sha256_core.S | 525 +++++++++++++++++++++++++++++++++ 3 files changed, 1194 insertions(+) create mode 100644 drivers/crypto/armv8/asm/aes_core.S create mode 100644 drivers/crypto/armv8/asm/sha1_core.S create mode 100644 drivers/crypto/armv8/asm/sha256_core.S diff --git a/drivers/crypto/armv8/asm/aes_core.S b/drivers/crypto/armv8/asm/aes_core.S new file mode 100644 index 0000000..b7ceae6 --- /dev/null +++ b/drivers/crypto/armv8/asm/aes_core.S @@ -0,0 +1,151 @@ +/* + * BSD LICENSE + * + * Copyright (C) Cavium networks Ltd. 2016. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Cavium networks nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + + .file "aes_core.S" + .text + .cpu generic+fp+simd+crypto+crc + .align 4 + .global aes128_key_sched_enc + .type aes128_key_sched_enc, %function + .global aes128_key_sched_dec + .type aes128_key_sched_dec, %function + + /* + * AES key expand algorithm for single round. 
+ */ + .macro key_expand res, key, shuffle_mask, rcon, tq0, tq1, td + /* temp = rotword(key[3]) */ + tbl \td\().8b,{\key\().16b},\shuffle_mask\().8b + dup \tq0\().2d,\td\().d[0] + /* temp = subbytes(temp) */ + aese \tq0\().16b,v19\().16b /* q19 := 0 */ + /* temp = temp + rcon */ + mov w11,\rcon + dup \tq1\().4s,w11 + eor \tq0\().16b,\tq0\().16b,\tq1\().16b + /* tq1 = [0, a, b, c] */ + ext \tq1\().16b,v19\().16b,\key\().16b,12 /* q19 := 0 */ + eor \res\().16b,\key\().16b,\tq1\().16b + /* tq1 = [0, 0, a, b] */ + ext \tq1\().16b,v19\().16b,\tq1\().16b,12 /* q19 := 0 */ + eor \res\().16b,\res\().16b,\tq1\().16b + /* tq1 = [0, 0, 0, a] */ + ext \tq1\().16b,v19\().16b,\tq1\().16b,12 /* q19 := 0 */ + eor \res\().16b,\res\().16b,\tq1\().16b + /* + temp */ + eor \res\().16b,\res\().16b,\tq0\().16b + .endm +/* + * *expanded_key, *user_key + */ + .align 4 +aes128_key_sched_enc: + sub sp,sp,4*16 + st1 {v8.16b - v11.16b},[sp] + ld1 {v0.16b},[x1] /* user_key */ + mov w10,0x0e0d /* form shuffle_word */ + mov w11,0x0c0f + orr w10,w10,w11,lsl 16 + dup v20.4s,w10 /* shuffle_mask */ + eor v19.16b,v19.16b,v19.16b /* zero */ + /* Expand key */ + key_expand v1,v0,v20,0x1,v21,v16,v17 + key_expand v2,v1,v20,0x2,v21,v16,v17 + key_expand v3,v2,v20,0x4,v21,v16,v17 + key_expand v4,v3,v20,0x8,v21,v16,v17 + key_expand v5,v4,v20,0x10,v21,v16,v17 + key_expand v6,v5,v20,0x20,v21,v16,v17 + key_expand v7,v6,v20,0x40,v21,v16,v17 + key_expand v8,v7,v20,0x80,v21,v16,v17 + key_expand v9,v8,v20,0x1b,v21,v16,v17 + key_expand v10,v9,v20,0x36,v21,v16,v17 + /* Store round keys in the correct order */ + st1 {v0.16b - v3.16b},[x0],64 + st1 {v4.16b - v7.16b},[x0],64 + st1 {v8.16b - v10.16b},[x0],48 + + ld1 {v8.16b - v11.16b},[sp] + add sp,sp,4*16 + ret + + .size aes128_key_sched_enc, .-aes128_key_sched_enc + +/* + * *expanded_key, *user_key + */ + .align 4 +aes128_key_sched_dec: + sub sp,sp,4*16 + st1 {v8.16b-v11.16b},[sp] + ld1 {v0.16b},[x1] /* user_key */ + mov w10,0x0e0d /* form shuffle_word */ + mov w11,0x0c0f + orr w10,w10,w11,lsl 16 + dup v20.4s,w10 /* shuffle_mask */ + eor v19.16b,v19.16b,v19.16b /* zero */ + /* + * Expand key. + * Intentionally reverse registers order to allow + * for multiple store later. + * (Store must be performed in the ascending registers' order) + */ + key_expand v10,v0,v20,0x1,v21,v16,v17 + key_expand v9,v10,v20,0x2,v21,v16,v17 + key_expand v8,v9,v20,0x4,v21,v16,v17 + key_expand v7,v8,v20,0x8,v21,v16,v17 + key_expand v6,v7,v20,0x10,v21,v16,v17 + key_expand v5,v6,v20,0x20,v21,v16,v17 + key_expand v4,v5,v20,0x40,v21,v16,v17 + key_expand v3,v4,v20,0x80,v21,v16,v17 + key_expand v2,v3,v20,0x1b,v21,v16,v17 + key_expand v1,v2,v20,0x36,v21,v16,v17 + /* Inverse mixcolumns for keys 1-9 (registers v10-v2) */ + aesimc v10.16b, v10.16b + aesimc v9.16b, v9.16b + aesimc v8.16b, v8.16b + aesimc v7.16b, v7.16b + aesimc v6.16b, v6.16b + aesimc v5.16b, v5.16b + aesimc v4.16b, v4.16b + aesimc v3.16b, v3.16b + aesimc v2.16b, v2.16b + /* Store round keys in the correct order */ + st1 {v1.16b - v4.16b},[x0],64 + st1 {v5.16b - v8.16b},[x0],64 + st1 {v9.16b, v10.16b},[x0],32 + st1 {v0.16b},[x0],16 + + ld1 {v8.16b - v11.16b},[sp] + add sp,sp,4*16 + ret + + .size aes128_key_sched_dec, .-aes128_key_sched_dec diff --git a/drivers/crypto/armv8/asm/sha1_core.S b/drivers/crypto/armv8/asm/sha1_core.S new file mode 100644 index 0000000..283c946 --- /dev/null +++ b/drivers/crypto/armv8/asm/sha1_core.S @@ -0,0 +1,518 @@ +/* + * BSD LICENSE + * + * Copyright (C) Cavium networks Ltd. 2016. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Cavium networks nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "assym.s" + +/* + * Description: + * + * Core SHA-1 Primitives + * + * Operations: + * sha1_block_partial: + * out = partial_sha1(init, in, len) <- no final block + * + * sha1_block: + * out = sha1(init, in, len) + * + * Prototype: + * + * int sha1_block_partial(uint8_t *init, + * uint8_t *dsrc, uint8_t *ddst, uint64_t len) + * + * int sha1_block(uint8_t *init, + * uint8_t *dsrc, uint8_t *ddst, uint64_t len) + * + * returns: 0 (success), -1 (failure) + * + * Registers used: + * + * sha1_block_partial( + * init, x0 (hash init state - NULL for default) + * dsrc, x1 (digest src address) + * ddst, x2 (digest dst address) + * len, x3 (length) + * ) + * + * sha1_block( + * init, x0 (hash init state - NULL for default) + * dsrc, x1 (digest src address) + * ddst, x2 (digest dst address) + * len, x3 (length) + * ) + * + * Routine register definitions: + * + * v4 - v7 -- round consts for sha + * v22 -- sha working state ABCD (q22) + * v24 -- reg_sha_stateABCD + * v25 -- reg_sha_stateEFGH + * v26 -- sha block 0 + * v27 -- sha block 1 + * v28 -- sha block 2 + * v29 -- sha block 3 + * v30 -- reserved + * v31 -- reserved + * + * Constraints: + * + * The variable "len" must be a multiple of 16 (+20 for the HMAC), + * otherwise error code is returned. 
+ * + */ + .file "sha1_core.S" + .text + .cpu generic+fp+simd+crypto+crc + .align 4 + .global sha1_block_partial + .type sha1_block_partial,%function + .global sha1_block + .type sha1_block,%function + + .align 4 +.Lrcon: + .word 0x5a827999, 0x5a827999, 0x5a827999, 0x5a827999 + .word 0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1 + .word 0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc + .word 0xca62c1d6, 0xca62c1d6, 0xca62c1d6, 0xca62c1d6 + + .align 4 +.Linit_sha_state: + .word 0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476 + .word 0xc3d2e1f0, 0x00000000, 0x00000000, 0x00000000 + + .align 4 + +sha1_block_partial: + mov x6, #1 /* indicate partial hash */ + ands x5, x3, #0x3f /* Check size mod 1 SHA block */ + b.ne .Lsha1_error + cbnz x0, 1f + /* address of sha init state consts */ + adr x0,.Linit_sha_state +1: + ld1 {v24.4s},[x0],16 /* init ABCD */ + ld1 {v25.4s},[x0] /* and E */ + + /* Load SHA-1 constants */ + adr x4,.Lrcon + ld1 {v4.16b},[x4],16 /* key0 */ + ld1 {v5.16b},[x4],16 /* key1 */ + ld1 {v6.16b},[x4],16 /* key2 */ + ld1 {v7.16b},[x4],16 /* key3 */ + + lsr x5, x3, 2 /* number of 4B blocks */ + b .Lsha1_loop + +sha1_block: + mov x6, xzr /* indicate full hash */ + and x5, x3, #0xf /* check size mod 16B block */ + cmp x5, #4 /* additional word is accepted */ + b.eq 1f + cbnz x5, .Lsha1_error +1: + cbnz x0, 2f + /* address of sha init state consts */ + adr x0,.Linit_sha_state +2: + ld1 {v24.4s},[x0],16 /* init ABCD */ + ld1 {v25.4s},[x0] /* and E */ + + /* Load SHA-1 constants */ + adr x4,.Lrcon + ld1 {v4.16b},[x4],16 /* key0 */ + ld1 {v5.16b},[x4],16 /* key1 */ + ld1 {v6.16b},[x4],16 /* key2 */ + ld1 {v7.16b},[x4],16 /* key3 */ + + lsr x5, x3, 2 /* number of 4B blocks */ + /* at least 16 4B blocks give 1 SHA block */ + cmp x5, #16 + b.lo .Lsha1_last + + .align 4 + +.Lsha1_loop: + sub x5, x5, #16 /* substract 1 SHA block */ + + ld1 {v26.16b},[x1],16 /* dsrc[0] */ + ld1 {v27.16b},[x1],16 /* dsrc[1] */ + ld1 {v28.16b},[x1],16 /* dsrc[2] */ + ld1 {v29.16b},[x1],16 /* dsrc[3] */ + + rev32 v26.16b,v26.16b /* fix endian w0 */ + rev32 v27.16b,v27.16b /* fix endian w1 */ + rev32 v28.16b,v28.16b /* fix endian w2 */ + rev32 v29.16b,v29.16b /* fix endian w3 */ + + mov v22.16b,v24.16b /* working ABCD <- ABCD */ +/* quad 0 */ + add v16.4s,v4.4s,v26.4s + sha1h s19,s24 + sha1c q24,s25,v16.4s + sha1su0 v26.4s,v27.4s,v28.4s + sha1su1 v26.4s,v29.4s + + add v17.4s,v4.4s,v27.4s + sha1h s18,s24 + sha1c q24,s19,v17.4s + sha1su0 v27.4s,v28.4s,v29.4s + sha1su1 v27.4s,v26.4s + + add v16.4s,v4.4s,v28.4s + sha1h s19,s24 + sha1c q24,s18,v16.4s + sha1su0 v28.4s,v29.4s,v26.4s + sha1su1 v28.4s,v27.4s + + add v17.4s,v4.4s,v29.4s + sha1h s18,s24 + sha1c q24,s19,v17.4s + sha1su0 v29.4s,v26.4s,v27.4s + sha1su1 v29.4s,v28.4s + + add v16.4s,v4.4s,v26.4s + sha1h s19,s24 + sha1c q24,s18,v16.4s + sha1su0 v26.4s,v27.4s,v28.4s + sha1su1 v26.4s,v29.4s +/* quad 1 */ + add v17.4s,v5.4s,v27.4s + sha1h s18,s24 + sha1p q24,s19,v17.4s + sha1su0 v27.4s,v28.4s,v29.4s + sha1su1 v27.4s,v26.4s + + add v16.4s,v5.4s,v28.4s + sha1h s19,s24 + sha1p q24,s18,v16.4s + sha1su0 v28.4s,v29.4s,v26.4s + sha1su1 v28.4s,v27.4s + + add v17.4s,v5.4s,v29.4s + sha1h s18,s24 + sha1p q24,s19,v17.4s + sha1su0 v29.4s,v26.4s,v27.4s + sha1su1 v29.4s,v28.4s + + add v16.4s,v5.4s,v26.4s + sha1h s19,s24 + sha1p q24,s18,v16.4s + sha1su0 v26.4s,v27.4s,v28.4s + sha1su1 v26.4s,v29.4s + + add v17.4s,v5.4s,v27.4s + sha1h s18,s24 + sha1p q24,s19,v17.4s + sha1su0 v27.4s,v28.4s,v29.4s + sha1su1 v27.4s,v26.4s +/* quad 2 */ + add v16.4s,v6.4s,v28.4s + sha1h s19,s24 + sha1m 
q24,s18,v16.4s + sha1su0 v28.4s,v29.4s,v26.4s + sha1su1 v28.4s,v27.4s + + add v17.4s,v6.4s,v29.4s + sha1h s18,s24 + sha1m q24,s19,v17.4s + sha1su0 v29.4s,v26.4s,v27.4s + sha1su1 v29.4s,v28.4s + + add v16.4s,v6.4s,v26.4s + sha1h s19,s24 + sha1m q24,s18,v16.4s + sha1su0 v26.4s,v27.4s,v28.4s + sha1su1 v26.4s,v29.4s + + add v17.4s,v6.4s,v27.4s + sha1h s18,s24 + sha1m q24,s19,v17.4s + sha1su0 v27.4s,v28.4s,v29.4s + sha1su1 v27.4s,v26.4s + + add v16.4s,v6.4s,v28.4s + sha1h s19,s24 + sha1m q24,s18,v16.4s + sha1su0 v28.4s,v29.4s,v26.4s + sha1su1 v28.4s,v27.4s +/* quad 3 */ + add v17.4s,v7.4s,v29.4s + sha1h s18,s24 + sha1p q24,s19,v17.4s + sha1su0 v29.4s,v26.4s,v27.4s + sha1su1 v29.4s,v28.4s + + add v16.4s,v7.4s,v26.4s + sha1h s19,s24 + sha1p q24,s18,v16.4s + + add v17.4s,v7.4s,v27.4s + sha1h s18,s24 + sha1p q24,s19,v17.4s + + add v16.4s,v7.4s,v28.4s + sha1h s19,s24 + sha1p q24,s18,v16.4s + + add v17.4s,v7.4s,v29.4s + sha1h s18,s24 + sha1p q24,s19,v17.4s + + add v24.4s,v24.4s,v22.4s + add v25.4s,v25.4s,v18.4s + + cmp x5, #16 + b.hs .Lsha1_loop + + /* Store partial hash and return or complete hash */ + cbz x6, .Lsha1_last + + st1 {v24.16b},[x2],16 + st1 {v25.16b},[x2] + + mov x0, xzr + ret + + /* + * Last block with padding. v24-v25[0] contain hash state. + */ +.Lsha1_last: + + eor v26.16b, v26.16b, v26.16b + eor v27.16b, v27.16b, v27.16b + eor v28.16b, v28.16b, v28.16b + eor v29.16b, v29.16b, v29.16b + + adr x4,.Lrcon + /* Number of bits in message */ + lsl x3, x3, 3 + + mov v22.16b,v24.16b /* working ABCD <- ABCD */ + /* move length to the end of the block */ + mov v29.s[3], w3 + lsr x3, x3, 32 + /* and the higher part */ + mov v29.s[2], w3 + + /* The remaining part is up to 3 16B blocks and up to 1 4B block */ + mov w6, #0x80 /* that's the 1 of the pad */ + mov v26.b[3], w6 + cbz x5,.Lsha1_final + /* Are there 3 16B blocks? */ + cmp x5, #12 + b.lo 1f + ld1 {v26.16b},[x1],16 + ld1 {v27.16b},[x1],16 + ld1 {v28.16b},[x1],16 + rev32 v26.16b, v26.16b + rev32 v27.16b, v27.16b + rev32 v28.16b, v28.16b + sub x5,x5,#12 + mov v29.b[7], w6 + cbz x5,.Lsha1_final + mov v29.b[7], wzr + ld1 {v29.s}[0],[x1],4 + rev32 v29.16b,v29.16b + mov v29.b[7], w6 + b .Lsha1_final +1: + /* Are there 2 16B blocks? */ + cmp x5, #8 + b.lo 2f + ld1 {v26.16b},[x1],16 + ld1 {v27.16b},[x1],16 + rev32 v26.16b,v26.16b + rev32 v27.16b,v27.16b + sub x5,x5,#8 + mov v28.b[7], w6 + cbz x5,.Lsha1_final + mov v28.b[7], wzr + ld1 {v28.s}[0],[x1],4 + rev32 v28.16b,v28.16b + mov v28.b[7], w6 + b .Lsha1_final +2: + /* Is there 1 16B block? 
*/ + cmp x5, #4 + b.lo 3f + ld1 {v26.16b},[x1],16 + rev32 v26.16b,v26.16b + sub x5,x5,#4 + mov v27.b[7], w6 + cbz x5,.Lsha1_final + mov v27.b[7], wzr + ld1 {v27.s}[0],[x1],4 + rev32 v27.16b,v27.16b + mov v27.b[7], w6 + b .Lsha1_final +3: + ld1 {v26.s}[0],[x1],4 + rev32 v26.16b,v26.16b + mov v26.b[7], w6 + +.Lsha1_final: + ld1 {v4.16b},[x4],16 /* key0 */ + ld1 {v5.16b},[x4],16 /* key1 */ + ld1 {v6.16b},[x4],16 /* key2 */ + ld1 {v7.16b},[x4],16 /* key3 */ +/* quad 0 */ + add v16.4s,v4.4s,v26.4s + sha1h s19,s24 + sha1c q24,s25,v16.4s + sha1su0 v26.4s,v27.4s,v28.4s + sha1su1 v26.4s,v29.4s + + add v17.4s,v4.4s,v27.4s + sha1h s18,s24 + sha1c q24,s19,v17.4s + sha1su0 v27.4s,v28.4s,v29.4s + sha1su1 v27.4s,v26.4s + + add v16.4s,v4.4s,v28.4s + sha1h s19,s24 + sha1c q24,s18,v16.4s + sha1su0 v28.4s,v29.4s,v26.4s + sha1su1 v28.4s,v27.4s + + add v17.4s,v4.4s,v29.4s + sha1h s18,s24 + sha1c q24,s19,v17.4s + sha1su0 v29.4s,v26.4s,v27.4s + sha1su1 v29.4s,v28.4s + + add v16.4s,v4.4s,v26.4s + sha1h s19,s24 + sha1c q24,s18,v16.4s + sha1su0 v26.4s,v27.4s,v28.4s + sha1su1 v26.4s,v29.4s +/* quad 1 */ + add v17.4s,v5.4s,v27.4s + sha1h s18,s24 + sha1p q24,s19,v17.4s + sha1su0 v27.4s,v28.4s,v29.4s + sha1su1 v27.4s,v26.4s + + add v16.4s,v5.4s,v28.4s + sha1h s19,s24 + sha1p q24,s18,v16.4s + sha1su0 v28.4s,v29.4s,v26.4s + sha1su1 v28.4s,v27.4s + + add v17.4s,v5.4s,v29.4s + sha1h s18,s24 + sha1p q24,s19,v17.4s + sha1su0 v29.4s,v26.4s,v27.4s + sha1su1 v29.4s,v28.4s + + add v16.4s,v5.4s,v26.4s + sha1h s19,s24 + sha1p q24,s18,v16.4s + sha1su0 v26.4s,v27.4s,v28.4s + sha1su1 v26.4s,v29.4s + + add v17.4s,v5.4s,v27.4s + sha1h s18,s24 + sha1p q24,s19,v17.4s + sha1su0 v27.4s,v28.4s,v29.4s + sha1su1 v27.4s,v26.4s +/* quad 2 */ + add v16.4s,v6.4s,v28.4s + sha1h s19,s24 + sha1m q24,s18,v16.4s + sha1su0 v28.4s,v29.4s,v26.4s + sha1su1 v28.4s,v27.4s + + add v17.4s,v6.4s,v29.4s + sha1h s18,s24 + sha1m q24,s19,v17.4s + sha1su0 v29.4s,v26.4s,v27.4s + sha1su1 v29.4s,v28.4s + + add v16.4s,v6.4s,v26.4s + sha1h s19,s24 + sha1m q24,s18,v16.4s + sha1su0 v26.4s,v27.4s,v28.4s + sha1su1 v26.4s,v29.4s + + add v17.4s,v6.4s,v27.4s + sha1h s18,s24 + sha1m q24,s19,v17.4s + sha1su0 v27.4s,v28.4s,v29.4s + sha1su1 v27.4s,v26.4s + + add v16.4s,v6.4s,v28.4s + sha1h s19,s24 + sha1m q24,s18,v16.4s + sha1su0 v28.4s,v29.4s,v26.4s + sha1su1 v28.4s,v27.4s +/* quad 3 */ + add v17.4s,v7.4s,v29.4s + sha1h s18,s24 + sha1p q24,s19,v17.4s + sha1su0 v29.4s,v26.4s,v27.4s + sha1su1 v29.4s,v28.4s + + add v16.4s,v7.4s,v26.4s + sha1h s19,s24 + sha1p q24,s18,v16.4s + + add v17.4s,v7.4s,v27.4s + sha1h s18,s24 + sha1p q24,s19,v17.4s + + add v16.4s,v7.4s,v28.4s + sha1h s19,s24 + sha1p q24,s18,v16.4s + + add v17.4s,v7.4s,v29.4s + sha1h s18,s24 + sha1p q24,s19,v17.4s + + add v25.4s,v25.4s,v18.4s + add v24.4s,v24.4s,v22.4s + + rev32 v24.16b,v24.16b + rev32 v25.16b,v25.16b + + st1 {v24.16b}, [x2],16 + st1 {v25.s}[0], [x2] + + mov x0, xzr + ret + +.Lsha1_error: + mov x0, #-1 + ret + + .size sha1_block_partial, .-sha1_block_partial + .size sha1_block, .-sha1_block diff --git a/drivers/crypto/armv8/asm/sha256_core.S b/drivers/crypto/armv8/asm/sha256_core.S new file mode 100644 index 0000000..2b2da7f --- /dev/null +++ b/drivers/crypto/armv8/asm/sha256_core.S @@ -0,0 +1,525 @@ +/* + * BSD LICENSE + * + * Copyright (C) Cavium networks Ltd. 2016. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Cavium networks nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "assym.s" + +/* + * Description: + * + * Core SHA-2 Primitives + * + * Operations: + * sha256_block_partial: + * out = partial_sha256(init, in, len) <- no final block + * + * sha256_block: + * out = sha256(init, in, len) + * + * Prototype: + * + * int sha256_block_partial(uint8_t *init, + * uint8_t *dsrc, uint8_t *ddst, uint64_t len) + * + * int sha256_block(uint8_t *init, + * uint8_t *dsrc, uint8_t *ddst, uint64_t len) + * + * returns: 0 (success), -1 (failure) + * + * Registers used: + * + * sha256_block_partial( + * init, x0 (hash init state - NULL for default) + * dsrc, x1 (digest src address) + * ddst, x2 (digest dst address) + * len, x3 (length) + * ) + * + * sha256_block( + * init, x0 (hash init state - NULL for default) + * dsrc, x1 (digest src address) + * ddst, x2 (digest dst address) + * len, x3 (length) + * ) + * + * Routine register definitions: + * + * v4 - v7 -- round consts for sha + * v21 -- ABCD tmp + * v22 -- sha working state ABCD (q22) + * v23 -- sha working state EFGH (q23) + * v24 -- reg_sha_stateABCD + * v25 -- reg_sha_stateEFGH + * v26 -- sha block 0 + * v27 -- sha block 1 + * v28 -- sha block 2 + * v29 -- sha block 3 + * v30 -- reserved + * v31 -- reserved + * + * Constraints: + * + * The variable "len" must be a multiple of 16, + * otherwise error code is returned. 
+ * + */ + .file "sha256_core.S" + .text + .cpu generic+fp+simd+crypto+crc + .align 4 + .global sha256_block_partial + .type sha256_block_partial,%function + .global sha256_block + .type sha256_block,%function + + .align 4 +.Lrcon: + .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 + .word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 + .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3 + .word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174 + .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc + .word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da + .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7 + .word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967 + .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13 + .word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85 + .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3 + .word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070 + .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5 + .word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3 + .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 + .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 + + .align 4 +.Linit_sha_state: + .word 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a + .word 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 + + .align 4 + +sha256_block_partial: + mov x6, #1 /* indicate partial hash */ + ands x5, x3, #0x3f /* check size mod 1 SHA block */ + b.ne .Lsha256_error + cbnz x0, 1f + /* address of sha init state consts */ + adr x0,.Linit_sha_state +1: + ld1 {v24.4s, v25.4s},[x0] /* init ABCD, EFGH */ + /* number of 16B blocks (will be at least 4) */ + lsr x5, x3, 4 + b .Lsha256_loop + +sha256_block: + mov x6, xzr /* indicate full hash */ + ands x5, x3, #0xf /* check size mod 16B block */ + b.ne .Lsha256_error + cbnz x0, 1f + /* address of sha init state consts */ + adr x0,.Linit_sha_state +1: + ld1 {v24.4s, v25.4s},[x0] /* init ABCD, EFGH. 
(2 cycs) */ + lsr x5, x3, 4 /* number of 16B blocks */ + cmp x5, #4 /* at least 4 16B blocks give 1 SHA block */ + b.lo .Lsha256_last + + .align 4 +.Lsha256_loop: + sub x5, x5, #4 /* substract 1 SHA block */ + adr x4,.Lrcon + + ld1 {v26.16b},[x1],16 /* dsrc[0] */ + ld1 {v27.16b},[x1],16 /* dsrc[1] */ + ld1 {v28.16b},[x1],16 /* dsrc[2] */ + ld1 {v29.16b},[x1],16 /* dsrc[3] */ + + rev32 v26.16b,v26.16b /* fix endian w0 */ + rev32 v27.16b,v27.16b /* fix endian w1 */ + rev32 v28.16b,v28.16b /* fix endian w2 */ + rev32 v29.16b,v29.16b /* fix endian w3 */ + + mov v22.16b,v24.16b /* working ABCD <- ABCD */ + mov v23.16b,v25.16b /* working EFGH <- EFGH */ + + ld1 {v4.16b},[x4],16 /* key0 */ + ld1 {v5.16b},[x4],16 /* key1 */ + ld1 {v6.16b},[x4],16 /* key2 */ + ld1 {v7.16b},[x4],16 /* key3 */ + + mov v21.16b, v22.16b /* copy abcd */ + + add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + sha256su0 v26.4s,v27.4s + sha256su1 v26.4s,v28.4s,v29.4s + + mov v21.16b, v22.16b /* copy abcd */ + + add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + sha256su0 v27.4s,v28.4s + sha256su1 v27.4s,v29.4s,v26.4s + + mov v21.16b, v22.16b /* copy abcd */ + + add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + sha256su0 v28.4s,v29.4s + sha256su1 v28.4s,v26.4s,v27.4s + + mov v21.16b, v22.16b /* copy abcd */ + + add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + sha256su0 v29.4s,v26.4s + sha256su1 v29.4s,v27.4s,v28.4s + + ld1 {v4.16b},[x4],16 /* key4 */ + ld1 {v5.16b},[x4],16 /* key5 */ + ld1 {v6.16b},[x4],16 /* key6 */ + ld1 {v7.16b},[x4],16 /* key7 */ + + mov v21.16b, v22.16b /* copy abcd */ + + add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + sha256su0 v26.4s,v27.4s + sha256su1 v26.4s,v28.4s,v29.4s + + mov v21.16b, v22.16b /* copy abcd */ + + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + sha256su0 v27.4s,v28.4s + sha256su1 v27.4s,v29.4s,v26.4s + + mov v21.16b, v22.16b /* copy abcd */ + + add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + sha256su0 v28.4s,v29.4s + sha256su1 v28.4s,v26.4s,v27.4s + + mov v21.16b, v22.16b /* copy abcd */ + + add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + sha256su0 v29.4s,v26.4s + sha256su1 v29.4s,v27.4s,v28.4s + + ld1 {v4.16b},[x4],16 /* key8 */ + ld1 {v5.16b},[x4],16 /* key9 */ + ld1 {v6.16b},[x4],16 /* key10 */ + ld1 {v7.16b},[x4],16 /* key11 */ + + mov v21.16b, v22.16b /* copy abcd */ + + add v4.4s,v4.4s,v26.4s /* wk = key8+w0 */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + sha256su0 v26.4s,v27.4s + sha256su1 v26.4s,v28.4s,v29.4s + + mov v21.16b, v22.16b /* copy abcd */ + + add v5.4s,v5.4s,v27.4s /* wk = key9+w1 */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + sha256su0 v27.4s,v28.4s + sha256su1 v27.4s,v29.4s,v26.4s + + mov v21.16b, v22.16b /* copy abcd */ + + add v6.4s,v6.4s,v28.4s /* wk = key10+w2 */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + sha256su0 v28.4s,v29.4s + sha256su1 v28.4s,v26.4s,v27.4s + + mov v21.16b, v22.16b /* copy abcd */ + + add v7.4s,v7.4s,v29.4s /* wk = key11+w3 */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + sha256su0 v29.4s,v26.4s + sha256su1 v29.4s,v27.4s,v28.4s + + ld1 {v4.16b},[x4],16 /* key12 */ + ld1 {v5.16b},[x4],16 /* key13 */ + ld1 
{v6.16b},[x4],16 /* key14 */ + ld1 {v7.16b},[x4],16 /* key15 */ + + mov v21.16b, v22.16b /* copy abcd */ + + add v4.4s,v4.4s,v26.4s /* wk = key12+w0 */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + + mov v21.16b, v22.16b /* copy abcd */ + + add v5.4s,v5.4s,v27.4s /* wk = key13+w1 */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + + mov v21.16b, v22.16b /* copy abcd */ + + add v6.4s,v6.4s,v28.4s /* wk = key14+w2 */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + + mov v21.16b, v22.16b /* copy abcd */ + + add v7.4s,v7.4s,v29.4s /* wk = key15+w3 */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + + add v24.4s,v24.4s,v22.4s /* ABCD += working copy */ + add v25.4s,v25.4s,v23.4s /* EFGH += working copy */ + + cmp x5, #4 + b.hs .Lsha256_loop + + /* Store partial hash and return or complete hash */ + cbz x6, .Lsha256_last + + st1 {v24.16b, v25.16b}, [x2] + + mov x0, xzr + ret + + /* + * Last block with padding. v24-v25 contain hash state. + */ +.Lsha256_last: + eor v26.16b, v26.16b, v26.16b + eor v27.16b, v27.16b, v27.16b + eor v28.16b, v28.16b, v28.16b + eor v29.16b, v29.16b, v29.16b + + adr x4,.Lrcon + lsl x3, x3, 3 + + mov v22.16b,v24.16b /* working ABCD <- ABCD */ + mov v23.16b,v25.16b /* working EFGH <- EFGH */ + + /* Fill out the first vector register and the end of the block */ + + /* move length to the end of the block */ + mov v29.s[3], w3 + lsr x3, x3, 32 + mov v29.s[2], w3 /* and the higher part */ + /* set padding 1 to the first reg */ + mov w6, #0x80 /* that's the 1 of the pad */ + mov v26.b[3], w6 + cbz x5,.Lsha256_final + + sub x5, x5, #1 + mov v27.16b, v26.16b + ld1 {v26.16b},[x1],16 + rev32 v26.16b,v26.16b /* fix endian w0 */ + cbz x5,.Lsha256_final + + sub x5, x5, #1 + mov v28.16b, v27.16b + ld1 {v27.16b},[x1],16 + rev32 v27.16b,v27.16b /* fix endian w1 */ + cbz x5,.Lsha256_final + + mov v29.b[0], w6 + ld1 {v28.16b},[x1],16 + rev32 v28.16b,v28.16b /* fix endian w2 */ + +.Lsha256_final: + + ld1 {v4.16b},[x4],16 /* key0 */ + ld1 {v5.16b},[x4],16 /* key1 */ + ld1 {v6.16b},[x4],16 /* key2 */ + ld1 {v7.16b},[x4],16 /* key3 */ + + mov v21.16b, v22.16b /* copy abcd */ + + add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + sha256su0 v26.4s,v27.4s + sha256su1 v26.4s,v28.4s,v29.4s + + mov v21.16b, v22.16b /* copy abcd */ + + add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + sha256su0 v27.4s,v28.4s + sha256su1 v27.4s,v29.4s,v26.4s + + mov v21.16b, v22.16b /* copy abcd */ + + add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + sha256su0 v28.4s,v29.4s + sha256su1 v28.4s,v26.4s,v27.4s + + mov v21.16b, v22.16b /* copy abcd */ + + add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + sha256su0 v29.4s,v26.4s + sha256su1 v29.4s,v27.4s,v28.4s + + ld1 {v4.16b},[x4],16 /* key4 */ + ld1 {v5.16b},[x4],16 /* key5 */ + ld1 {v6.16b},[x4],16 /* key6 */ + ld1 {v7.16b},[x4],16 /* key7 */ + + mov v21.16b, v22.16b /* copy abcd */ + + add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + sha256su0 v26.4s,v27.4s + sha256su1 v26.4s,v28.4s,v29.4s + + mov v21.16b, v22.16b /* copy abcd */ + + add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + sha256su0 v27.4s,v28.4s + sha256su1 v27.4s,v29.4s,v26.4s + + mov v21.16b, v22.16b /* copy abcd */ + + add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */ + sha256h q22, q23, v6.4s + sha256h2 
q23, q21, v6.4s + sha256su0 v28.4s,v29.4s + sha256su1 v28.4s,v26.4s,v27.4s + + mov v21.16b, v22.16b /* copy abcd */ + + add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + sha256su0 v29.4s,v26.4s + sha256su1 v29.4s,v27.4s,v28.4s + + ld1 {v4.16b},[x4],16 /* key8 */ + ld1 {v5.16b},[x4],16 /* key9 */ + ld1 {v6.16b},[x4],16 /* key10 */ + ld1 {v7.16b},[x4],16 /* key11 */ + + mov v21.16b, v22.16b /* copy abcd */ + + add v4.4s,v4.4s,v26.4s /* wk = key8+w0 */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + sha256su0 v26.4s,v27.4s + sha256su1 v26.4s,v28.4s,v29.4s + + mov v21.16b, v22.16b /* copy abcd */ + + add v5.4s,v5.4s,v27.4s /* wk = key9+w1 */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + sha256su0 v27.4s,v28.4s + sha256su1 v27.4s,v29.4s,v26.4s + + mov v21.16b, v22.16b /* copy abcd */ + + add v6.4s,v6.4s,v28.4s /* wk = key10+w2 */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + sha256su0 v28.4s,v29.4s + sha256su1 v28.4s,v26.4s,v27.4s + + mov v21.16b, v22.16b /* copy abcd */ + + add v7.4s,v7.4s,v29.4s /* wk = key11+w3 */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + sha256su0 v29.4s,v26.4s + sha256su1 v29.4s,v27.4s,v28.4s + + ld1 {v4.16b},[x4],16 /* key12 */ + ld1 {v5.16b},[x4],16 /* key13 */ + ld1 {v6.16b},[x4],16 /* key14 */ + ld1 {v7.16b},[x4],16 /* key15 */ + + mov v21.16b, v22.16b /* copy abcd */ + + add v4.4s,v4.4s,v26.4s /* wk = key12+w0 */ + sha256h q22, q23, v4.4s + sha256h2 q23, q21, v4.4s + + mov v21.16b, v22.16b /* copy abcd */ + + add v5.4s,v5.4s,v27.4s /* wk = key13+w1 */ + sha256h q22, q23, v5.4s + sha256h2 q23, q21, v5.4s + + mov v21.16b, v22.16b /* copy abcd */ + + add v6.4s,v6.4s,v28.4s /* wk = key14+w2 */ + sha256h q22, q23, v6.4s + sha256h2 q23, q21, v6.4s + + mov v21.16b, v22.16b /* copy abcd */ + + add v7.4s,v7.4s,v29.4s /* wk = key15+w3 */ + sha256h q22, q23, v7.4s + sha256h2 q23, q21, v7.4s + + add v24.4s,v24.4s,v22.4s /* ABCD += working copy */ + add v25.4s,v25.4s,v23.4s /* EFGH += working copy */ + + rev32 v24.16b, v24.16b + rev32 v25.16b, v25.16b + st1 {v24.4s,v25.4s},[x2] /* save them both */ + + mov x0, xzr + ret + +.Lsha256_error: + mov x0, #-1 + ret + + .size sha256_block_partial, .-sha256_block_partial -- 1.9.1
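For reference, an illustration added after the patch (not part of it): the partial-hash entry points are meant for precomputing HMAC inner/outer states, as the commit message describes. A minimal sketch, assuming a key already reduced to one 64-byte block and the 32-byte packed state (two 16-byte vectors) that sha1_block_partial()/sha256_block_partial() store; the helper name is hypothetical:

/* Illustrative sketch only -- not part of the patch. */
#include <stdint.h>

int sha1_block_partial(uint8_t *init, uint8_t *dsrc, uint8_t *ddst,
		uint64_t len);

#define SHA_BLOCK_SIZE 64	/* SHA-1 and SHA-256 both use 64-byte blocks */

/*
 * Hypothetical helper: precompute the HMAC-SHA1 inner and outer partial
 * states for a one-block key (longer keys would first be hashed with
 * sha1_block(), as the commit message notes). Passing init == NULL
 * selects the default initial state; a 64-byte input satisfies the
 * "multiple of one SHA block" constraint of the partial variant.
 * Each state buffer receives 32 bytes (the ABCD vector, then E).
 */
static int
hmac_sha1_precompute(const uint8_t key[SHA_BLOCK_SIZE],
		uint8_t inner[32], uint8_t outer[32])
{
	uint8_t pad[SHA_BLOCK_SIZE];
	int i;

	for (i = 0; i < SHA_BLOCK_SIZE; i++)
		pad[i] = key[i] ^ 0x36;		/* ipad */
	if (sha1_block_partial(NULL, pad, inner, sizeof(pad)) != 0)
		return -1;

	for (i = 0; i < SHA_BLOCK_SIZE; i++)
		pad[i] = key[i] ^ 0x5c;		/* opad */
	return sha1_block_partial(NULL, pad, outer, sizeof(pad));
}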