From: Zbigniew Bodek <zbigniew.bo...@caviumnetworks.com>

This patch adds core low-level crypto operations
for ARMv8 processors. The assembly code is the basis
for an optimized PMD and is currently excluded
from the build.

Standalone SHA1 and SHA256 routines are provided to
support partial hashing of the inner/outer key+padding
and of authentication keys longer than 160/256 bits.
An optimized AES key schedule is also included.

Signed-off-by: Zbigniew Bodek <zbigniew.bo...@caviumnetworks.com>
Signed-off-by: Emery Davis <emery.da...@caviumnetworks.com>
---
 drivers/crypto/armv8/asm/aes_core.S    | 151 ++++++++++
 drivers/crypto/armv8/asm/sha1_core.S   | 518 ++++++++++++++++++++++++++++++++
 drivers/crypto/armv8/asm/sha256_core.S | 525 +++++++++++++++++++++++++++++++++
 3 files changed, 1194 insertions(+)
 create mode 100644 drivers/crypto/armv8/asm/aes_core.S
 create mode 100644 drivers/crypto/armv8/asm/sha1_core.S
 create mode 100644 drivers/crypto/armv8/asm/sha256_core.S
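
For reviewers, a minimal C sketch of how these routines are meant to be
driven, based on the prototypes documented in the assembly sources below.
The exact C signatures, the return type of the key-schedule routine and
the HMAC wiring are assumptions for illustration, not part of this patch:

    #include <stdint.h>
    #include <string.h>

    /* Prototypes as documented in the .S file comments below. */
    void aes128_key_sched_enc(uint8_t *expanded_key, const uint8_t *user_key);
    int sha1_block_partial(uint8_t *init, uint8_t *dsrc, uint8_t *ddst,
                           uint64_t len);
    int sha1_block(uint8_t *init, uint8_t *dsrc, uint8_t *ddst, uint64_t len);

    /*
     * Hypothetical HMAC-SHA1 precomputation: hash the inner/outer
     * key+padding blocks once and keep the 32-byte partial states,
     * which can later be passed back through the "init" pointer.
     */
    static int
    hmac_sha1_precompute(const uint8_t *key, size_t key_len,
                         uint8_t ipad_state[32], uint8_t opad_state[32])
    {
        uint8_t block[64];
        unsigned int i;

        /* Keys longer than one block would be hashed down first. */
        if (key_len > sizeof(block))
            return -1;

        memset(block, 0, sizeof(block));
        memcpy(block, key, key_len);
        for (i = 0; i < sizeof(block); i++)
            block[i] ^= 0x36;                   /* inner pad */
        if (sha1_block_partial(NULL, block, ipad_state, sizeof(block)) != 0)
            return -1;

        for (i = 0; i < sizeof(block); i++)
            block[i] ^= 0x36 ^ 0x5c;            /* switch to outer pad */
        return sha1_block_partial(NULL, block, opad_state, sizeof(block));
    }

    /* AES-128 key schedule: the routine stores 11 round keys (176 bytes). */
    static void
    expand_aes128_enc_key(uint8_t round_keys[11 * 16], const uint8_t key[16])
    {
        aes128_key_sched_enc(round_keys, key);
    }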

diff --git a/drivers/crypto/armv8/asm/aes_core.S b/drivers/crypto/armv8/asm/aes_core.S
new file mode 100644
index 0000000..b7ceae6
--- /dev/null
+++ b/drivers/crypto/armv8/asm/aes_core.S
@@ -0,0 +1,151 @@
+/*
+ *   BSD LICENSE
+ *
+ *   Copyright (C) Cavium networks Ltd. 2016.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Cavium networks nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+       .file   "aes_core.S"
+       .text
+       .cpu generic+fp+simd+crypto+crc
+       .align  4
+       .global aes128_key_sched_enc
+       .type   aes128_key_sched_enc, %function
+       .global aes128_key_sched_dec
+       .type   aes128_key_sched_dec, %function
+
+       /*
+        * AES key expansion for a single round.
+        */
+       .macro  key_expand res, key, shuffle_mask, rcon, tq0, tq1, td
+       /* temp = rotword(key[3]) */
+       tbl     \td\().8b,{\key\().16b},\shuffle_mask\().8b
+       dup     \tq0\().2d,\td\().d[0]
+       /* temp = subbytes(temp) */
+       aese    \tq0\().16b,v19\().16b                  /* q19 := 0 */
+       /* temp = temp + rcon */
+       mov     w11,\rcon
+       dup     \tq1\().4s,w11
+       eor     \tq0\().16b,\tq0\().16b,\tq1\().16b
+       /* tq1 = [0, a, b, c] */
+       ext     \tq1\().16b,v19\().16b,\key\().16b,12   /* q19 := 0 */
+       eor     \res\().16b,\key\().16b,\tq1\().16b
+       /* tq1 = [0, 0, a, b] */
+       ext     \tq1\().16b,v19\().16b,\tq1\().16b,12   /* q19 := 0 */
+       eor     \res\().16b,\res\().16b,\tq1\().16b
+       /* tq1 = [0, 0, 0, a] */
+       ext     \tq1\().16b,v19\().16b,\tq1\().16b,12   /* q19 := 0 */
+       eor     \res\().16b,\res\().16b,\tq1\().16b
+       /* + temp */
+       eor     \res\().16b,\res\().16b,\tq0\().16b
+       .endm
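+
+/*
+ * For reference, the macro above implements one round of the standard
+ * AES-128 key expansion; in C-like pseudocode (names are illustrative):
+ *
+ *     temp    = SubWord(RotWord(prev[3])) ^ rcon;
+ *     next[0] = prev[0] ^ temp;
+ *     next[1] = prev[1] ^ next[0];
+ *     next[2] = prev[2] ^ next[1];
+ *     next[3] = prev[3] ^ next[2];
+ *
+ * The tbl/dup pair performs RotWord, aese with an all-zero round key
+ * performs SubBytes (ShiftRows has no effect here since all four lanes
+ * hold the same word), and the ext/eor chain accumulates the
+ * word-by-word XORs.
+ */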
+/*
+ * *expanded_key, *user_key
+ */
+       .align  4
+aes128_key_sched_enc:
+       sub     sp,sp,4*16
+       st1     {v8.16b - v11.16b},[sp]
+       ld1     {v0.16b},[x1]                           /* user_key */
+       mov     w10,0x0e0d                              /* form shuffle_word */
+       mov     w11,0x0c0f
+       orr     w10,w10,w11,lsl 16
+       dup     v20.4s,w10                              /* shuffle_mask */
+       eor     v19.16b,v19.16b,v19.16b                 /* zero */
+       /* Expand key */
+       key_expand v1,v0,v20,0x1,v21,v16,v17
+       key_expand v2,v1,v20,0x2,v21,v16,v17
+       key_expand v3,v2,v20,0x4,v21,v16,v17
+       key_expand v4,v3,v20,0x8,v21,v16,v17
+       key_expand v5,v4,v20,0x10,v21,v16,v17
+       key_expand v6,v5,v20,0x20,v21,v16,v17
+       key_expand v7,v6,v20,0x40,v21,v16,v17
+       key_expand v8,v7,v20,0x80,v21,v16,v17
+       key_expand v9,v8,v20,0x1b,v21,v16,v17
+       key_expand v10,v9,v20,0x36,v21,v16,v17
+       /* Store round keys in the correct order */
+       st1     {v0.16b - v3.16b},[x0],64
+       st1     {v4.16b - v7.16b},[x0],64
+       st1     {v8.16b - v10.16b},[x0],48
+
+       ld1     {v8.16b - v11.16b},[sp]
+       add     sp,sp,4*16
+       ret
+
+       .size   aes128_key_sched_enc, .-aes128_key_sched_enc
+
+/*
+ * *expanded_key, *user_key
+ */
+       .align  4
+aes128_key_sched_dec:
+       sub     sp,sp,4*16
+       st1     {v8.16b-v11.16b},[sp]
+       ld1     {v0.16b},[x1]                           /* user_key */
+       mov     w10,0x0e0d                              /* form shuffle_word */
+       mov     w11,0x0c0f
+       orr     w10,w10,w11,lsl 16
+       dup     v20.4s,w10                              /* shuffle_mask */
+       eor     v19.16b,v19.16b,v19.16b                 /* zero */
+       /*
+        * Expand key.
+        * The register order is intentionally reversed to allow
+        * multi-register stores later.
+        * (Stores must be performed in ascending register order.)
+        */
+       key_expand v10,v0,v20,0x1,v21,v16,v17
+       key_expand v9,v10,v20,0x2,v21,v16,v17
+       key_expand v8,v9,v20,0x4,v21,v16,v17
+       key_expand v7,v8,v20,0x8,v21,v16,v17
+       key_expand v6,v7,v20,0x10,v21,v16,v17
+       key_expand v5,v6,v20,0x20,v21,v16,v17
+       key_expand v4,v5,v20,0x40,v21,v16,v17
+       key_expand v3,v4,v20,0x80,v21,v16,v17
+       key_expand v2,v3,v20,0x1b,v21,v16,v17
+       key_expand v1,v2,v20,0x36,v21,v16,v17
+       /* Inverse mixcolumns for keys 1-9 (registers v10-v2) */
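+       /* (round keys 0 and 10 are used unchanged by the inverse cipher) */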
+       aesimc  v10.16b, v10.16b
+       aesimc  v9.16b, v9.16b
+       aesimc  v8.16b, v8.16b
+       aesimc  v7.16b, v7.16b
+       aesimc  v6.16b, v6.16b
+       aesimc  v5.16b, v5.16b
+       aesimc  v4.16b, v4.16b
+       aesimc  v3.16b, v3.16b
+       aesimc  v2.16b, v2.16b
+       /* Store round keys in the correct order */
+       st1     {v1.16b - v4.16b},[x0],64
+       st1     {v5.16b - v8.16b},[x0],64
+       st1     {v9.16b, v10.16b},[x0],32
+       st1     {v0.16b},[x0],16
+
+       ld1     {v8.16b - v11.16b},[sp]
+       add     sp,sp,4*16
+       ret
+
+       .size   aes128_key_sched_dec, .-aes128_key_sched_dec
diff --git a/drivers/crypto/armv8/asm/sha1_core.S b/drivers/crypto/armv8/asm/sha1_core.S
new file mode 100644
index 0000000..283c946
--- /dev/null
+++ b/drivers/crypto/armv8/asm/sha1_core.S
@@ -0,0 +1,518 @@
+/*
+ *   BSD LICENSE
+ *
+ *   Copyright (C) Cavium networks Ltd. 2016.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Cavium networks nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "assym.s"
+
+/*
+ * Description:
+ *
+ * Core SHA-1 Primitives
+ *
+ * Operations:
+ * sha1_block_partial:
+ *     out = partial_sha1(init, in, len)       <- no final block
+ *
+ * sha1_block:
+ *     out = sha1(init, in, len)
+ *
+ * Prototype:
+ *
+ * int sha1_block_partial(uint8_t *init,
+ *                     uint8_t *dsrc, uint8_t *ddst, uint64_t len)
+ *
+ * int sha1_block(uint8_t *init,
+ *                     uint8_t *dsrc, uint8_t *ddst, uint64_t len)
+ *
+ * returns: 0 (success), -1 (failure)
+ *
+ * Registers used:
+ *
+ * sha1_block_partial(
+ *     init,                   x0      (hash init state - NULL for default)
+ *     dsrc,                   x1      (digest src address)
+ *     ddst,                   x2      (digest dst address)
+ *     len,                    x3      (length)
+ *     )
+ *
+ * sha1_block(
+ *     init,                   x0      (hash init state - NULL for default)
+ *     dsrc,                   x1      (digest src address)
+ *     ddst,                   x2      (digest dst address)
+ *     len,                    x3      (length)
+ *     )
+ *
+ * Routine register definitions:
+ *
+ * v4 - v7 -- round consts for sha
+ * v22 -- sha working state ABCD (q22)
+ * v24 -- reg_sha_stateABCD
+ * v25 -- reg_sha_stateEFGH
+ * v26 -- sha block 0
+ * v27 -- sha block 1
+ * v28 -- sha block 2
+ * v29 -- sha block 3
+ * v30 -- reserved
+ * v31 -- reserved
+ *
+ * Constraints:
+ *
+ * The variable "len" must be a multiple of 16 (an additional 20 bytes is
+ * accepted for the HMAC case), otherwise an error code is returned.
+ *
+ */
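+
+/*
+ * Note on output layout: sha1_block_partial stores the raw 32-byte
+ * internal state (A..D in the first 16 bytes, E in the following word),
+ * which matches the layout expected through the "init" pointer, while
+ * sha1_block byte-swaps the state and stores the final 20-byte
+ * big-endian digest.
+ */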
+       .file "sha1_core.S"
+       .text
+       .cpu generic+fp+simd+crypto+crc
+       .align  4
+       .global sha1_block_partial
+       .type   sha1_block_partial,%function
+       .global sha1_block
+       .type   sha1_block,%function
+
+       .align  4
+.Lrcon:
+       .word           0x5a827999, 0x5a827999, 0x5a827999, 0x5a827999
+       .word           0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1
+       .word           0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc
+       .word           0xca62c1d6, 0xca62c1d6, 0xca62c1d6, 0xca62c1d6
+
+       .align  4
+.Linit_sha_state:
+       .word           0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476
+       .word           0xc3d2e1f0, 0x00000000, 0x00000000, 0x00000000
+
+       .align  4
+
+sha1_block_partial:
+       mov             x6, #1                  /* indicate partial hash */
+       ands            x5, x3, #0x3f           /* Check size mod 1 SHA block */
+       b.ne            .Lsha1_error
+       cbnz            x0, 1f
+       /* address of sha init state consts */
+       adr             x0,.Linit_sha_state
+1:
+       ld1             {v24.4s},[x0],16        /* init ABCD */
+       ld1             {v25.4s},[x0]           /* and E */
+
+       /* Load SHA-1 constants */
+       adr             x4,.Lrcon
+       ld1             {v4.16b},[x4],16        /* key0 */
+       ld1             {v5.16b},[x4],16        /* key1 */
+       ld1             {v6.16b},[x4],16        /* key2 */
+       ld1             {v7.16b},[x4],16        /* key3 */
+
+       lsr             x5, x3, 2               /* number of 4B blocks */
+       b               .Lsha1_loop
+
+sha1_block:
+       mov             x6, xzr         /* indicate full hash */
+       and             x5, x3, #0xf    /* check size mod 16B block */
+       cmp             x5, #4          /* additional word is accepted */
+       b.eq            1f
+       cbnz            x5, .Lsha1_error
+1:
+       cbnz            x0, 2f
+       /* address of sha init state consts */
+       adr             x0,.Linit_sha_state
+2:
+       ld1             {v24.4s},[x0],16        /* init ABCD */
+       ld1             {v25.4s},[x0]           /* and E */
+
+       /* Load SHA-1 constants */
+       adr             x4,.Lrcon
+       ld1             {v4.16b},[x4],16        /* key0 */
+       ld1             {v5.16b},[x4],16        /* key1 */
+       ld1             {v6.16b},[x4],16        /* key2 */
+       ld1             {v7.16b},[x4],16        /* key3 */
+
+       lsr             x5, x3, 2               /* number of 4B blocks */
+       /* at least 16 4B blocks give 1 SHA block */
+       cmp             x5, #16
+       b.lo            .Lsha1_last
+
+       .align  4
+
+.Lsha1_loop:
+       sub             x5, x5, #16             /* subtract 1 SHA block */
+
+       ld1             {v26.16b},[x1],16       /* dsrc[0] */
+       ld1             {v27.16b},[x1],16       /* dsrc[1] */
+       ld1             {v28.16b},[x1],16       /* dsrc[2] */
+       ld1             {v29.16b},[x1],16       /* dsrc[3] */
+
+       rev32           v26.16b,v26.16b         /* fix endian w0 */
+       rev32           v27.16b,v27.16b         /* fix endian w1 */
+       rev32           v28.16b,v28.16b         /* fix endian w2 */
+       rev32           v29.16b,v29.16b         /* fix endian w3 */
+
+       mov             v22.16b,v24.16b         /* working ABCD <- ABCD */
+/* quad 0 */
+       add             v16.4s,v4.4s,v26.4s
+       sha1h           s19,s24
+       sha1c           q24,s25,v16.4s
+       sha1su0         v26.4s,v27.4s,v28.4s
+       sha1su1         v26.4s,v29.4s
+
+       add             v17.4s,v4.4s,v27.4s
+       sha1h           s18,s24
+       sha1c           q24,s19,v17.4s
+       sha1su0         v27.4s,v28.4s,v29.4s
+       sha1su1         v27.4s,v26.4s
+
+       add             v16.4s,v4.4s,v28.4s
+       sha1h           s19,s24
+       sha1c           q24,s18,v16.4s
+       sha1su0         v28.4s,v29.4s,v26.4s
+       sha1su1         v28.4s,v27.4s
+
+       add             v17.4s,v4.4s,v29.4s
+       sha1h           s18,s24
+       sha1c           q24,s19,v17.4s
+       sha1su0         v29.4s,v26.4s,v27.4s
+       sha1su1         v29.4s,v28.4s
+
+       add             v16.4s,v4.4s,v26.4s
+       sha1h           s19,s24
+       sha1c           q24,s18,v16.4s
+       sha1su0         v26.4s,v27.4s,v28.4s
+       sha1su1         v26.4s,v29.4s
+/* quad 1 */
+       add             v17.4s,v5.4s,v27.4s
+       sha1h           s18,s24
+       sha1p           q24,s19,v17.4s
+       sha1su0         v27.4s,v28.4s,v29.4s
+       sha1su1         v27.4s,v26.4s
+
+       add             v16.4s,v5.4s,v28.4s
+       sha1h           s19,s24
+       sha1p           q24,s18,v16.4s
+       sha1su0         v28.4s,v29.4s,v26.4s
+       sha1su1         v28.4s,v27.4s
+
+       add             v17.4s,v5.4s,v29.4s
+       sha1h           s18,s24
+       sha1p           q24,s19,v17.4s
+       sha1su0         v29.4s,v26.4s,v27.4s
+       sha1su1         v29.4s,v28.4s
+
+       add             v16.4s,v5.4s,v26.4s
+       sha1h           s19,s24
+       sha1p           q24,s18,v16.4s
+       sha1su0         v26.4s,v27.4s,v28.4s
+       sha1su1         v26.4s,v29.4s
+
+       add             v17.4s,v5.4s,v27.4s
+       sha1h           s18,s24
+       sha1p           q24,s19,v17.4s
+       sha1su0         v27.4s,v28.4s,v29.4s
+       sha1su1         v27.4s,v26.4s
+/* quad 2 */
+       add             v16.4s,v6.4s,v28.4s
+       sha1h           s19,s24
+       sha1m           q24,s18,v16.4s
+       sha1su0         v28.4s,v29.4s,v26.4s
+       sha1su1         v28.4s,v27.4s
+
+       add             v17.4s,v6.4s,v29.4s
+       sha1h           s18,s24
+       sha1m           q24,s19,v17.4s
+       sha1su0         v29.4s,v26.4s,v27.4s
+       sha1su1         v29.4s,v28.4s
+
+       add             v16.4s,v6.4s,v26.4s
+       sha1h           s19,s24
+       sha1m           q24,s18,v16.4s
+       sha1su0         v26.4s,v27.4s,v28.4s
+       sha1su1         v26.4s,v29.4s
+
+       add             v17.4s,v6.4s,v27.4s
+       sha1h           s18,s24
+       sha1m           q24,s19,v17.4s
+       sha1su0         v27.4s,v28.4s,v29.4s
+       sha1su1         v27.4s,v26.4s
+
+       add             v16.4s,v6.4s,v28.4s
+       sha1h           s19,s24
+       sha1m           q24,s18,v16.4s
+       sha1su0         v28.4s,v29.4s,v26.4s
+       sha1su1         v28.4s,v27.4s
+/* quad 3 */
+       add             v17.4s,v7.4s,v29.4s
+       sha1h           s18,s24
+       sha1p           q24,s19,v17.4s
+       sha1su0         v29.4s,v26.4s,v27.4s
+       sha1su1         v29.4s,v28.4s
+
+       add             v16.4s,v7.4s,v26.4s
+       sha1h           s19,s24
+       sha1p           q24,s18,v16.4s
+
+       add             v17.4s,v7.4s,v27.4s
+       sha1h           s18,s24
+       sha1p           q24,s19,v17.4s
+
+       add             v16.4s,v7.4s,v28.4s
+       sha1h           s19,s24
+       sha1p           q24,s18,v16.4s
+
+       add             v17.4s,v7.4s,v29.4s
+       sha1h           s18,s24
+       sha1p           q24,s19,v17.4s
+
+       add             v24.4s,v24.4s,v22.4s
+       add             v25.4s,v25.4s,v18.4s
+
+       cmp             x5, #16
+       b.hs            .Lsha1_loop
+
+       /* Store partial hash and return or complete hash */
+       cbz             x6, .Lsha1_last
+
+       st1             {v24.16b},[x2],16
+       st1             {v25.16b},[x2]
+
+       mov             x0, xzr
+       ret
+
+       /*
+        * Last block with padding. v24-v25[0] contain hash state.
+        */
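+       /*
+        * The final block is padded in the usual SHA-1 fashion: the
+        * remaining data (up to three 16B blocks plus one 4B word, per the
+        * constraints above), a 0x80 pad marker after the data, zero fill,
+        * and the 64-bit bit length of this call's "len" argument in the
+        * last two words.
+        */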
+.Lsha1_last:
+
+       eor             v26.16b, v26.16b, v26.16b
+       eor             v27.16b, v27.16b, v27.16b
+       eor             v28.16b, v28.16b, v28.16b
+       eor             v29.16b, v29.16b, v29.16b
+
+       adr             x4,.Lrcon
+       /* Number of bits in message */
+       lsl             x3, x3, 3
+
+       mov             v22.16b,v24.16b         /* working ABCD <- ABCD */
+       /* move length to the end of the block */
+       mov             v29.s[3], w3
+       lsr             x3, x3, 32
+       /* and the higher part */
+       mov             v29.s[2], w3
+
+       /* The remaining part is up to 3 16B blocks and up to 1 4B block */
+       mov             w6, #0x80               /* that's the 1 of the pad */
+       mov             v26.b[3], w6
+       cbz             x5,.Lsha1_final
+       /* Are there 3 16B blocks? */
+       cmp             x5, #12
+       b.lo            1f
+       ld1             {v26.16b},[x1],16
+       ld1             {v27.16b},[x1],16
+       ld1             {v28.16b},[x1],16
+       rev32           v26.16b, v26.16b
+       rev32           v27.16b, v27.16b
+       rev32           v28.16b, v28.16b
+       sub             x5,x5,#12
+       mov             v29.b[7], w6
+       cbz             x5,.Lsha1_final
+       mov             v29.b[7], wzr
+       ld1             {v29.s}[0],[x1],4
+       rev32           v29.16b,v29.16b
+       mov             v29.b[7], w6
+       b               .Lsha1_final
+1:
+       /* Are there 2 16B blocks? */
+       cmp             x5, #8
+       b.lo            2f
+       ld1             {v26.16b},[x1],16
+       ld1             {v27.16b},[x1],16
+       rev32           v26.16b,v26.16b
+       rev32           v27.16b,v27.16b
+       sub             x5,x5,#8
+       mov             v28.b[7], w6
+       cbz             x5,.Lsha1_final
+       mov             v28.b[7], wzr
+       ld1             {v28.s}[0],[x1],4
+       rev32           v28.16b,v28.16b
+       mov             v28.b[7], w6
+       b               .Lsha1_final
+2:
+       /* Is there 1 16B block? */
+       cmp             x5, #4
+       b.lo            3f
+       ld1             {v26.16b},[x1],16
+       rev32           v26.16b,v26.16b
+       sub             x5,x5,#4
+       mov             v27.b[7], w6
+       cbz             x5,.Lsha1_final
+       mov             v27.b[7], wzr
+       ld1             {v27.s}[0],[x1],4
+       rev32           v27.16b,v27.16b
+       mov             v27.b[7], w6
+       b               .Lsha1_final
+3:
+       ld1             {v26.s}[0],[x1],4
+       rev32           v26.16b,v26.16b
+       mov             v26.b[7], w6
+
+.Lsha1_final:
+       ld1             {v4.16b},[x4],16        /* key0 */
+       ld1             {v5.16b},[x4],16        /* key1 */
+       ld1             {v6.16b},[x4],16        /* key2 */
+       ld1             {v7.16b},[x4],16        /* key3 */
+/* quad 0 */
+       add             v16.4s,v4.4s,v26.4s
+       sha1h           s19,s24
+       sha1c           q24,s25,v16.4s
+       sha1su0         v26.4s,v27.4s,v28.4s
+       sha1su1         v26.4s,v29.4s
+
+       add             v17.4s,v4.4s,v27.4s
+       sha1h           s18,s24
+       sha1c           q24,s19,v17.4s
+       sha1su0         v27.4s,v28.4s,v29.4s
+       sha1su1         v27.4s,v26.4s
+
+       add             v16.4s,v4.4s,v28.4s
+       sha1h           s19,s24
+       sha1c           q24,s18,v16.4s
+       sha1su0         v28.4s,v29.4s,v26.4s
+       sha1su1         v28.4s,v27.4s
+
+       add             v17.4s,v4.4s,v29.4s
+       sha1h           s18,s24
+       sha1c           q24,s19,v17.4s
+       sha1su0         v29.4s,v26.4s,v27.4s
+       sha1su1         v29.4s,v28.4s
+
+       add             v16.4s,v4.4s,v26.4s
+       sha1h           s19,s24
+       sha1c           q24,s18,v16.4s
+       sha1su0         v26.4s,v27.4s,v28.4s
+       sha1su1         v26.4s,v29.4s
+/* quad 1 */
+       add             v17.4s,v5.4s,v27.4s
+       sha1h           s18,s24
+       sha1p           q24,s19,v17.4s
+       sha1su0         v27.4s,v28.4s,v29.4s
+       sha1su1         v27.4s,v26.4s
+
+       add             v16.4s,v5.4s,v28.4s
+       sha1h           s19,s24
+       sha1p           q24,s18,v16.4s
+       sha1su0         v28.4s,v29.4s,v26.4s
+       sha1su1         v28.4s,v27.4s
+
+       add             v17.4s,v5.4s,v29.4s
+       sha1h           s18,s24
+       sha1p           q24,s19,v17.4s
+       sha1su0         v29.4s,v26.4s,v27.4s
+       sha1su1         v29.4s,v28.4s
+
+       add             v16.4s,v5.4s,v26.4s
+       sha1h           s19,s24
+       sha1p           q24,s18,v16.4s
+       sha1su0         v26.4s,v27.4s,v28.4s
+       sha1su1         v26.4s,v29.4s
+
+       add             v17.4s,v5.4s,v27.4s
+       sha1h           s18,s24
+       sha1p           q24,s19,v17.4s
+       sha1su0         v27.4s,v28.4s,v29.4s
+       sha1su1         v27.4s,v26.4s
+/* quad 2 */
+       add             v16.4s,v6.4s,v28.4s
+       sha1h           s19,s24
+       sha1m           q24,s18,v16.4s
+       sha1su0         v28.4s,v29.4s,v26.4s
+       sha1su1         v28.4s,v27.4s
+
+       add             v17.4s,v6.4s,v29.4s
+       sha1h           s18,s24
+       sha1m           q24,s19,v17.4s
+       sha1su0         v29.4s,v26.4s,v27.4s
+       sha1su1         v29.4s,v28.4s
+
+       add             v16.4s,v6.4s,v26.4s
+       sha1h           s19,s24
+       sha1m           q24,s18,v16.4s
+       sha1su0         v26.4s,v27.4s,v28.4s
+       sha1su1         v26.4s,v29.4s
+
+       add             v17.4s,v6.4s,v27.4s
+       sha1h           s18,s24
+       sha1m           q24,s19,v17.4s
+       sha1su0         v27.4s,v28.4s,v29.4s
+       sha1su1         v27.4s,v26.4s
+
+       add             v16.4s,v6.4s,v28.4s
+       sha1h           s19,s24
+       sha1m           q24,s18,v16.4s
+       sha1su0         v28.4s,v29.4s,v26.4s
+       sha1su1         v28.4s,v27.4s
+/* quad 3 */
+       add             v17.4s,v7.4s,v29.4s
+       sha1h           s18,s24
+       sha1p           q24,s19,v17.4s
+       sha1su0         v29.4s,v26.4s,v27.4s
+       sha1su1         v29.4s,v28.4s
+
+       add             v16.4s,v7.4s,v26.4s
+       sha1h           s19,s24
+       sha1p           q24,s18,v16.4s
+
+       add             v17.4s,v7.4s,v27.4s
+       sha1h           s18,s24
+       sha1p           q24,s19,v17.4s
+
+       add             v16.4s,v7.4s,v28.4s
+       sha1h           s19,s24
+       sha1p           q24,s18,v16.4s
+
+       add             v17.4s,v7.4s,v29.4s
+       sha1h           s18,s24
+       sha1p           q24,s19,v17.4s
+
+       add             v25.4s,v25.4s,v18.4s
+       add             v24.4s,v24.4s,v22.4s
+
+       rev32           v24.16b,v24.16b
+       rev32           v25.16b,v25.16b
+
+       st1             {v24.16b}, [x2],16
+       st1             {v25.s}[0], [x2]
+
+       mov             x0, xzr
+       ret
+
+.Lsha1_error:
+       mov             x0, #-1
+       ret
+
+       .size   sha1_block_partial, .-sha1_block_partial
+       .size   sha1_block, .-sha1_block
diff --git a/drivers/crypto/armv8/asm/sha256_core.S b/drivers/crypto/armv8/asm/sha256_core.S
new file mode 100644
index 0000000..2b2da7f
--- /dev/null
+++ b/drivers/crypto/armv8/asm/sha256_core.S
@@ -0,0 +1,525 @@
+/*
+ *   BSD LICENSE
+ *
+ *   Copyright (C) Cavium networks Ltd. 2016.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Cavium networks nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "assym.s"
+
+/*
+ * Description:
+ *
+ * Core SHA-2 Primitives
+ *
+ * Operations:
+ * sha256_block_partial:
+ *     out = partial_sha256(init, in, len)     <- no final block
+ *
+ * sha256_block:
+ *     out = sha256(init, in, len)
+ *
+ * Prototype:
+ *
+ * int sha256_block_partial(uint8_t *init,
+ *                     uint8_t *dsrc, uint8_t *ddst, uint64_t len)
+ *
+ * int sha256_block(uint8_t *init,
+ *                     uint8_t *dsrc, uint8_t *ddst, uint64_t len)
+ *
+ * returns: 0 (success), -1 (failure)
+ *
+ * Registers used:
+ *
+ * sha256_block_partial(
+ *     init,                   x0      (hash init state - NULL for default)
+ *     dsrc,                   x1      (digest src address)
+ *     ddst,                   x2      (digest dst address)
+ *     len,                    x3      (length)
+ *     )
+ *
+ * sha256_block(
+ *     init,                   x0      (hash init state - NULL for default)
+ *     dsrc,                   x1      (digest src address)
+ *     ddst,                   x2      (digest dst address)
+ *     len,                    x3      (length)
+ *     )
+ *
+ * Routine register definitions:
+ *
+ * v4 - v7 -- round consts for sha
+ * v21 -- ABCD tmp
+ * v22 -- sha working state ABCD (q22)
+ * v23 -- sha working state EFGH (q23)
+ * v24 -- reg_sha_stateABCD
+ * v25 -- reg_sha_stateEFGH
+ * v26 -- sha block 0
+ * v27 -- sha block 1
+ * v28 -- sha block 2
+ * v29 -- sha block 3
+ * v30 -- reserved
+ * v31 -- reserved
+ *
+ * Constraints:
+ *
+ * The variable "len" must be a multiple of 16,
+ * otherwise an error code is returned.
+ *
+ */
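+
+/*
+ * Note on output layout: sha256_block_partial stores the raw 32-byte
+ * internal state (ABCD followed by EFGH), which matches the layout
+ * expected through the "init" pointer, while sha256_block byte-swaps the
+ * state and stores the final 32-byte big-endian digest.
+ */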
+       .file "sha256_core.S"
+       .text
+       .cpu generic+fp+simd+crypto+crc
+       .align  4
+       .global sha256_block_partial
+       .type   sha256_block_partial,%function
+       .global sha256_block
+       .type   sha256_block,%function
+
+       .align  4
+.Lrcon:
+       .word           0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
+       .word           0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
+       .word           0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
+       .word           0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
+       .word           0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
+       .word           0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
+       .word           0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
+       .word           0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
+       .word           0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
+       .word           0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
+       .word           0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
+       .word           0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
+       .word           0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
+       .word           0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
+       .word           0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
+       .word           0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+
+       .align  4
+.Linit_sha_state:
+       .word           0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a
+       .word           0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
+
+       .align  4
+
+sha256_block_partial:
+       mov             x6, #1                  /* indicate partial hash */
+       ands            x5, x3, #0x3f           /* check size mod 1 SHA block */
+       b.ne            .Lsha256_error
+       cbnz            x0, 1f
+       /* address of sha init state consts */
+       adr             x0,.Linit_sha_state
+1:
+       ld1             {v24.4s, v25.4s},[x0]   /* init ABCD, EFGH */
+       /* number of 16B blocks (will be at least 4) */
+       lsr             x5, x3, 4
+       b               .Lsha256_loop
+
+sha256_block:
+       mov             x6, xzr                 /* indicate full hash */
+       ands            x5, x3, #0xf            /* check size mod 16B block */
+       b.ne            .Lsha256_error
+       cbnz            x0, 1f
+       /* address of sha init state consts */
+       adr             x0,.Linit_sha_state
+1:
+       ld1             {v24.4s, v25.4s},[x0]   /* init ABCD, EFGH. (2 cycs) */
+       lsr             x5, x3, 4               /* number of 16B blocks */
+       cmp             x5, #4  /* at least 4 16B blocks give 1 SHA block */
+       b.lo            .Lsha256_last
+
+       .align  4
+.Lsha256_loop:
+       sub             x5, x5, #4              /* subtract 1 SHA block */
+       adr             x4,.Lrcon
+
+       ld1             {v26.16b},[x1],16       /* dsrc[0] */
+       ld1             {v27.16b},[x1],16       /* dsrc[1] */
+       ld1             {v28.16b},[x1],16       /* dsrc[2] */
+       ld1             {v29.16b},[x1],16       /* dsrc[3] */
+
+       rev32           v26.16b,v26.16b         /* fix endian w0 */
+       rev32           v27.16b,v27.16b         /* fix endian w1 */
+       rev32           v28.16b,v28.16b         /* fix endian w2 */
+       rev32           v29.16b,v29.16b         /* fix endian w3 */
+
+       mov             v22.16b,v24.16b         /* working ABCD <- ABCD */
+       mov             v23.16b,v25.16b         /* working EFGH <- EFGH */
+
+       ld1             {v4.16b},[x4],16        /* key0 */
+       ld1             {v5.16b},[x4],16        /* key1 */
+       ld1             {v6.16b},[x4],16        /* key2 */
+       ld1             {v7.16b},[x4],16        /* key3 */
+
+       mov             v21.16b, v22.16b        /* copy abcd */
+
+       add             v4.4s,v4.4s,v26.4s      /* wk = key0+w0 */
+       sha256h         q22, q23, v4.4s
+       sha256h2        q23, q21, v4.4s
+       sha256su0       v26.4s,v27.4s
+       sha256su1       v26.4s,v28.4s,v29.4s
+
+       mov             v21.16b, v22.16b        /* copy abcd */
+
+       add             v5.4s,v5.4s,v27.4s      /* wk = key1+w1 */
+       sha256h         q22, q23, v5.4s
+       sha256h2        q23, q21, v5.4s
+       sha256su0       v27.4s,v28.4s
+       sha256su1       v27.4s,v29.4s,v26.4s
+
+       mov             v21.16b, v22.16b        /* copy abcd */
+
+       add             v6.4s,v6.4s,v28.4s      /* wk = key2+w2 */
+       sha256h         q22, q23, v6.4s
+       sha256h2        q23, q21, v6.4s
+       sha256su0       v28.4s,v29.4s
+       sha256su1       v28.4s,v26.4s,v27.4s
+
+       mov             v21.16b, v22.16b        /* copy abcd */
+
+       add             v7.4s,v7.4s,v29.4s      /* wk = key3+w3 */
+       sha256h         q22, q23, v7.4s
+       sha256h2        q23, q21, v7.4s
+       sha256su0       v29.4s,v26.4s
+       sha256su1       v29.4s,v27.4s,v28.4s
+
+       ld1             {v4.16b},[x4],16        /* key4 */
+       ld1             {v5.16b},[x4],16        /* key5 */
+       ld1             {v6.16b},[x4],16        /* key6 */
+       ld1             {v7.16b},[x4],16        /* key7 */
+
+       mov             v21.16b, v22.16b        /* copy abcd */
+
+       add             v4.4s,v4.4s,v26.4s      /* wk = key4+w0 */
+       sha256h         q22, q23, v4.4s
+       sha256h2        q23, q21, v4.4s
+       sha256su0       v26.4s,v27.4s
+       sha256su1       v26.4s,v28.4s,v29.4s
+
+       mov             v21.16b, v22.16b        /* copy abcd */
+
+       add             v5.4s,v5.4s,v27.4s      /* wk = key5+w1 */
+       sha256h         q22, q23, v5.4s
+       sha256h2        q23, q21, v5.4s
+       sha256su0       v27.4s,v28.4s
+       sha256su1       v27.4s,v29.4s,v26.4s
+
+       mov             v21.16b, v22.16b        /* copy abcd */
+
+       add             v6.4s,v6.4s,v28.4s      /* wk = key6+w2 */
+       sha256h         q22, q23, v6.4s
+       sha256h2        q23, q21, v6.4s
+       sha256su0       v28.4s,v29.4s
+       sha256su1       v28.4s,v26.4s,v27.4s
+
+       mov             v21.16b, v22.16b        /* copy abcd */
+
+       add             v7.4s,v7.4s,v29.4s      /* wk = key7+w3 */
+       sha256h         q22, q23, v7.4s
+       sha256h2        q23, q21, v7.4s
+       sha256su0       v29.4s,v26.4s
+       sha256su1       v29.4s,v27.4s,v28.4s
+
+       ld1             {v4.16b},[x4],16        /* key8 */
+       ld1             {v5.16b},[x4],16        /* key9 */
+       ld1             {v6.16b},[x4],16        /* key10 */
+       ld1             {v7.16b},[x4],16        /* key11 */
+
+       mov             v21.16b, v22.16b        /* copy abcd */
+
+       add             v4.4s,v4.4s,v26.4s      /* wk = key8+w0 */
+       sha256h         q22, q23, v4.4s
+       sha256h2        q23, q21, v4.4s
+       sha256su0       v26.4s,v27.4s
+       sha256su1       v26.4s,v28.4s,v29.4s
+
+       mov             v21.16b, v22.16b        /* copy abcd */
+
+       add             v5.4s,v5.4s,v27.4s      /* wk = key9+w1 */
+       sha256h         q22, q23, v5.4s
+       sha256h2        q23, q21, v5.4s
+       sha256su0       v27.4s,v28.4s
+       sha256su1       v27.4s,v29.4s,v26.4s
+
+       mov             v21.16b, v22.16b        /* copy abcd */
+
+       add             v6.4s,v6.4s,v28.4s      /* wk = key10+w2 */
+       sha256h         q22, q23, v6.4s
+       sha256h2        q23, q21, v6.4s
+       sha256su0       v28.4s,v29.4s
+       sha256su1       v28.4s,v26.4s,v27.4s
+
+       mov             v21.16b, v22.16b        /* copy abcd */
+
+       add             v7.4s,v7.4s,v29.4s      /* wk = key11+w3 */
+       sha256h         q22, q23, v7.4s
+       sha256h2        q23, q21, v7.4s
+       sha256su0       v29.4s,v26.4s
+       sha256su1       v29.4s,v27.4s,v28.4s
+
+       ld1             {v4.16b},[x4],16        /* key12 */
+       ld1             {v5.16b},[x4],16        /* key13 */
+       ld1             {v6.16b},[x4],16        /* key14 */
+       ld1             {v7.16b},[x4],16        /* key15 */
+
+       mov             v21.16b, v22.16b        /* copy abcd */
+
+       add             v4.4s,v4.4s,v26.4s      /* wk = key12+w0 */
+       sha256h         q22, q23, v4.4s
+       sha256h2        q23, q21, v4.4s
+
+       mov             v21.16b, v22.16b        /* copy abcd */
+
+       add             v5.4s,v5.4s,v27.4s      /* wk = key13+w1 */
+       sha256h         q22, q23, v5.4s
+       sha256h2        q23, q21, v5.4s
+
+       mov             v21.16b, v22.16b        /* copy abcd */
+
+       add             v6.4s,v6.4s,v28.4s      /* wk = key14+w2 */
+       sha256h         q22, q23, v6.4s
+       sha256h2        q23, q21, v6.4s
+
+       mov             v21.16b, v22.16b        /* copy abcd */
+
+       add             v7.4s,v7.4s,v29.4s      /* wk = key15+w3 */
+       sha256h         q22, q23, v7.4s
+       sha256h2        q23, q21, v7.4s
+
+       add             v24.4s,v24.4s,v22.4s    /* ABCD += working copy */
+       add             v25.4s,v25.4s,v23.4s    /* EFGH += working copy */
+
+       cmp             x5, #4
+       b.hs            .Lsha256_loop
+
+       /* Store partial hash and return or complete hash */
+       cbz             x6, .Lsha256_last
+
+       st1             {v24.16b, v25.16b}, [x2]
+
+       mov             x0, xzr
+       ret
+
+       /*
+        * Last block with padding. v24-v25 contain hash state.
+        */
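+       /*
+        * The final block is assembled as in the SHA-1 case, except that
+        * the remainder is handled at 16-byte granularity only (up to three
+        * 16B blocks), followed by the 0x80 pad marker, zero fill and the
+        * 64-bit bit length of this call's "len" in the last two words.
+        */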
+.Lsha256_last:
+       eor             v26.16b, v26.16b, v26.16b
+       eor             v27.16b, v27.16b, v27.16b
+       eor             v28.16b, v28.16b, v28.16b
+       eor             v29.16b, v29.16b, v29.16b
+
+       adr             x4,.Lrcon
+       lsl             x3, x3, 3
+
+       mov             v22.16b,v24.16b         /* working ABCD <- ABCD */
+       mov             v23.16b,v25.16b         /* working EFGH <- EFGH */
+
+       /* Fill out the first vector register and the end of the block */
+
+       /* move length to the end of the block */
+       mov             v29.s[3], w3
+       lsr             x3, x3, 32
+       mov             v29.s[2], w3            /* and the higher part */
+       /* set padding 1 to the first reg */
+       mov             w6, #0x80               /* that's the 1 of the pad */
+       mov             v26.b[3], w6
+       cbz             x5,.Lsha256_final
+
+       sub             x5, x5, #1
+       mov             v27.16b, v26.16b
+       ld1             {v26.16b},[x1],16
+       rev32           v26.16b,v26.16b         /* fix endian w0 */
+       cbz             x5,.Lsha256_final
+
+       sub             x5, x5, #1
+       mov             v28.16b, v27.16b
+       ld1             {v27.16b},[x1],16
+       rev32           v27.16b,v27.16b         /* fix endian w1 */
+       cbz             x5,.Lsha256_final
+
+       mov             v29.b[0], w6
+       ld1             {v28.16b},[x1],16
+       rev32           v28.16b,v28.16b         /* fix endian w2 */
+
+.Lsha256_final:
+
+       ld1             {v4.16b},[x4],16        /* key0 */
+       ld1             {v5.16b},[x4],16        /* key1 */
+       ld1             {v6.16b},[x4],16        /* key2 */
+       ld1             {v7.16b},[x4],16        /* key3 */
+
+       mov             v21.16b, v22.16b        /* copy abcd */
+
+       add             v4.4s,v4.4s,v26.4s      /* wk = key0+w0 */
+       sha256h         q22, q23, v4.4s
+       sha256h2        q23, q21, v4.4s
+       sha256su0       v26.4s,v27.4s
+       sha256su1       v26.4s,v28.4s,v29.4s
+
+       mov             v21.16b, v22.16b        /* copy abcd */
+
+       add             v5.4s,v5.4s,v27.4s      /* wk = key1+w1 */
+       sha256h         q22, q23, v5.4s
+       sha256h2        q23, q21, v5.4s
+       sha256su0       v27.4s,v28.4s
+       sha256su1       v27.4s,v29.4s,v26.4s
+
+       mov             v21.16b, v22.16b        /* copy abcd */
+
+       add             v6.4s,v6.4s,v28.4s      /* wk = key2+w2 */
+       sha256h         q22, q23, v6.4s
+       sha256h2        q23, q21, v6.4s
+       sha256su0       v28.4s,v29.4s
+       sha256su1       v28.4s,v26.4s,v27.4s
+
+       mov             v21.16b, v22.16b        /* copy abcd */
+
+       add             v7.4s,v7.4s,v29.4s      /* wk = key3+w3 */
+       sha256h         q22, q23, v7.4s
+       sha256h2        q23, q21, v7.4s
+       sha256su0       v29.4s,v26.4s
+       sha256su1       v29.4s,v27.4s,v28.4s
+
+       ld1             {v4.16b},[x4],16        /* key4 */
+       ld1             {v5.16b},[x4],16        /* key5 */
+       ld1             {v6.16b},[x4],16        /* key6 */
+       ld1             {v7.16b},[x4],16        /* key7 */
+
+       mov             v21.16b, v22.16b        /* copy abcd */
+
+       add             v4.4s,v4.4s,v26.4s      /* wk = key4+w0 */
+       sha256h         q22, q23, v4.4s
+       sha256h2        q23, q21, v4.4s
+       sha256su0       v26.4s,v27.4s
+       sha256su1       v26.4s,v28.4s,v29.4s
+
+       mov             v21.16b, v22.16b        /* copy abcd */
+
+       add             v5.4s,v5.4s,v27.4s      /* wk = key5+w1 */
+       sha256h         q22, q23, v5.4s
+       sha256h2        q23, q21, v5.4s
+       sha256su0       v27.4s,v28.4s
+       sha256su1       v27.4s,v29.4s,v26.4s
+
+       mov             v21.16b, v22.16b        /* copy abcd */
+
+       add             v6.4s,v6.4s,v28.4s      /* wk = key6+w2 */
+       sha256h         q22, q23, v6.4s
+       sha256h2        q23, q21, v6.4s
+       sha256su0       v28.4s,v29.4s
+       sha256su1       v28.4s,v26.4s,v27.4s
+
+       mov             v21.16b, v22.16b        /* copy abcd */
+
+       add             v7.4s,v7.4s,v29.4s      /* wk = key7+w3 */
+       sha256h         q22, q23, v7.4s
+       sha256h2        q23, q21, v7.4s
+       sha256su0       v29.4s,v26.4s
+       sha256su1       v29.4s,v27.4s,v28.4s
+
+       ld1             {v4.16b},[x4],16        /* key8 */
+       ld1             {v5.16b},[x4],16        /* key9 */
+       ld1             {v6.16b},[x4],16        /* key10 */
+       ld1             {v7.16b},[x4],16        /* key11 */
+
+       mov             v21.16b, v22.16b        /* copy abcd */
+
+       add             v4.4s,v4.4s,v26.4s      /* wk = key8+w0 */
+       sha256h         q22, q23, v4.4s
+       sha256h2        q23, q21, v4.4s
+       sha256su0       v26.4s,v27.4s
+       sha256su1       v26.4s,v28.4s,v29.4s
+
+       mov             v21.16b, v22.16b        /* copy abcd */
+
+       add             v5.4s,v5.4s,v27.4s      /* wk = key9+w1 */
+       sha256h         q22, q23, v5.4s
+       sha256h2        q23, q21, v5.4s
+       sha256su0       v27.4s,v28.4s
+       sha256su1       v27.4s,v29.4s,v26.4s
+
+       mov             v21.16b, v22.16b        /* copy abcd */
+
+       add             v6.4s,v6.4s,v28.4s      /* wk = key10+w2 */
+       sha256h         q22, q23, v6.4s
+       sha256h2        q23, q21, v6.4s
+       sha256su0       v28.4s,v29.4s
+       sha256su1       v28.4s,v26.4s,v27.4s
+
+       mov             v21.16b, v22.16b        /* copy abcd */
+
+       add             v7.4s,v7.4s,v29.4s      /* wk = key11+w3 */
+       sha256h         q22, q23, v7.4s
+       sha256h2        q23, q21, v7.4s
+       sha256su0       v29.4s,v26.4s
+       sha256su1       v29.4s,v27.4s,v28.4s
+
+       ld1             {v4.16b},[x4],16        /* key12 */
+       ld1             {v5.16b},[x4],16        /* key13 */
+       ld1             {v6.16b},[x4],16        /* key14 */
+       ld1             {v7.16b},[x4],16        /* key15 */
+
+       mov             v21.16b, v22.16b        /* copy abcd */
+
+       add             v4.4s,v4.4s,v26.4s      /* wk = key12+w0 */
+       sha256h         q22, q23, v4.4s
+       sha256h2        q23, q21, v4.4s
+
+       mov             v21.16b, v22.16b        /* copy abcd */
+
+       add             v5.4s,v5.4s,v27.4s      /* wk = key13+w1 */
+       sha256h         q22, q23, v5.4s
+       sha256h2        q23, q21, v5.4s
+
+       mov             v21.16b, v22.16b        /* copy abcd */
+
+       add             v6.4s,v6.4s,v28.4s      /* wk = key14+w2 */
+       sha256h         q22, q23, v6.4s
+       sha256h2        q23, q21, v6.4s
+
+       mov             v21.16b, v22.16b        /* copy abcd */
+
+       add             v7.4s,v7.4s,v29.4s      /* wk = key15+w3 */
+       sha256h         q22, q23, v7.4s
+       sha256h2        q23, q21, v7.4s
+
+       add             v24.4s,v24.4s,v22.4s    /* ABCD += working copy */
+       add             v25.4s,v25.4s,v23.4s    /* EFGH += working copy */
+
+       rev32           v24.16b, v24.16b
+       rev32           v25.16b, v25.16b
+       st1             {v24.4s,v25.4s},[x2]    /* save them both */
+
+       mov             x0, xzr
+       ret
+
+.Lsha256_error:
+       mov             x0, #-1
+       ret
+
+       .size   sha256_block_partial, .-sha256_block_partial
-- 
1.9.1
