On Mon, Oct 07, 2019 at 06:45:48PM +0200, Ard Biesheuvel wrote:
> diff --git a/arch/arm/crypto/chacha-scalar-core.S b/arch/arm/crypto/chacha-scalar-core.S
> index 2140319b64a0..0970ae107590 100644
> --- a/arch/arm/crypto/chacha-scalar-core.S
> +++ b/arch/arm/crypto/chacha-scalar-core.S
> @@ -41,14 +41,6 @@
> X14 .req r12
> X15 .req r14
>
> -.Lexpand_32byte_k:
> - // "expand 32-byte k"
> - .word 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
> -
> -#ifdef __thumb2__
> -# define adrl adr
> -#endif
> -
> .macro __rev out, in, t0, t1, t2
> .if __LINUX_ARM_ARCH__ >= 6
> rev \out, \in
> @@ -391,61 +383,65 @@
> .endm // _chacha
>
> /*
> - * void chacha20_arm(u8 *out, const u8 *in, size_t len, const u32 key[8],
> - * const u32 iv[4]);
> + * void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes,
> + * u32 *state, int nrounds);
> */
> -ENTRY(chacha20_arm)
> +ENTRY(chacha_doarm)
> cmp r2, #0 // len == 0?
> reteq lr
>
> + ldr ip, [sp]
> + cmp ip, #12
> +
> push {r0-r2,r4-r11,lr}
>
> // Push state x0-x15 onto stack.
> // Also store an extra copy of x10-x11 just before the state.
>
> - ldr r4, [sp, #48] // iv
> - mov r0, sp
> - sub sp, #80
> -
> - // iv: x12-x15
> - ldm r4, {X12,X13,X14,X15}
> - stmdb r0!, {X12,X13,X14,X15}
> + add X12, r3, #48
> + ldm X12, {X12,X13,X14,X15}
> + push {X12,X13,X14,X15}
> + sub sp, sp, #64
>
> - // key: x4-x11
> - __ldrd X8_X10, X9_X11, r3, 24
> + __ldrd X8_X10, X9_X11, r3, 40
> __strd X8_X10, X9_X11, sp, 8
> - stmdb r0!, {X8_X10, X9_X11}
> - ldm r3, {X4-X9_X11}
> - stmdb r0!, {X4-X9_X11}
> -
> - // constants: x0-x3
> - adrl X3, .Lexpand_32byte_k
> - ldm X3, {X0-X3}
> + __strd X8_X10, X9_X11, sp, 56
> + ldm r3, {X0-X9_X11}
> __strd X0, X1, sp, 16
> __strd X2, X3, sp, 24
> + __strd X4, X5, sp, 32
> + __strd X6, X7, sp, 40
> + __strd X8_X10, X9_X11, sp, 48
>
> + beq 1f
> _chacha 20
>
> - add sp, #76
> +0: add sp, #76
> pop {r4-r11, pc}
> -ENDPROC(chacha20_arm)
> +
> +1: _chacha 12
> + b 0b
> +ENDPROC(chacha_doarm)
>
> /*
> - * void hchacha20_arm(const u32 state[16], u32 out[8]);
> + * void hchacha_block_arm(const u32 state[16], u32 out[8], int nrounds);
> */
> -ENTRY(hchacha20_arm)
> +ENTRY(hchacha_block_arm)
> push {r1,r4-r11,lr}
>
> + cmp r2, #12 // ChaCha12 ?
> +
> mov r14, r0
> ldmia r14!, {r0-r11} // load x0-x11
> push {r10-r11} // store x10-x11 to stack
> ldm r14, {r10-r12,r14} // load x12-x15
> sub sp, #8
>
> + beq 1f
> _chacha_permute 20
>
> // Skip over (unused0-unused1, x10-x11)
> - add sp, #16
> +0: add sp, #16
>
> // Fix up rotations of x12-x15
> ror X12, X12, #drot
> @@ -458,4 +454,7 @@ ENTRY(hchacha20_arm)
> stm r4, {X0,X1,X2,X3,X12,X13,X14,X15}
>
> pop {r4-r11,pc}
> -ENDPROC(hchacha20_arm)
> +
> +1: _chacha_permute 12
> + b 0b
> +ENDPROC(hchacha_block_arm)
> --
FYI, I've also had a version of this code supporting both the 12-round and
20-round variants sitting around here:
https://git.kernel.org/pub/scm/linux/kernel/git/ebiggers/linux.git/commit/?h=chacha-arm-scalar&id=fc51d8012742f591da3204b327a865f6109d472a
I'll take a closer look at this later, but you might want to take a quick look
at what I did, just in case I happened to do anything in a better way.
- Eric