On 01/02/15 15:18, Ard Biesheuvel wrote: > The crypto emulation code in target-arm/crypto_helper.c never worked > correctly on big endian hosts, due to the fact that it uses a union > of array types to convert between the native VFP register size (64 > bits) and the types used in the algorithms (bytes and 32 bit words) > > We cannot just swab between LE and BE when reading and writing the > registers, as the SHA code performs word additions, so instead, add > array accessors for the CRYPTO_STATE type whose LE and BE specific > implementations ensure that the correct array elements are referenced. > > Signed-off-by: Ard Biesheuvel <ard.biesheu...@linaro.org> > --- > > Only tested on a LE (amd64) host, as I don't have access to a BE host. > > target-arm/crypto_helper.c | 114 > +++++++++++++++++++++++++-------------------- > 1 file changed, 63 insertions(+), 51 deletions(-) > > diff --git a/target-arm/crypto_helper.c b/target-arm/crypto_helper.c > index dd60d0b81a4c..1fe975d0f1e3 100644 > --- a/target-arm/crypto_helper.c > +++ b/target-arm/crypto_helper.c > @@ -22,6 +22,14 @@ union CRYPTO_STATE { > uint64_t l[2]; > }; > > +#ifdef HOST_WORDS_BIGENDIAN > +#define CR_ST_BYTE(state, i) (state.bytes[(15 - (i)) ^ 8])
Produces 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9 which I think is consistent with what Peter wrote. > +#define CR_ST_WORD(state, i) (state.words[(3 - (i)) ^ 2]) Hm. The indices look good: 1, 0 3, 2 But, assuming you're on a big-endian host, perhaps you should byte swap the uint32_t too?... > (ie if you store 0x112233445566778899aabbccddeeff00 as a 64 bit write > to VFP register D0 then regs[0] will be > 0x112233445566778899aabbccddeeff00 regardless of host endianness. That > is, the least significant 8 bits of D0 will be (regs[0] & 0xff). (This > isn't the same number as if you do the union-type-punning thing with > union { uint64_t l; uint8_t b[8]; } and look at b[0].) Assume the guest writes value 0x112233445566778899aabbccddeeff00 (LE representation). * Assuming a little endian host, we get: [00 ff ee dd cc bb aa 99 88 77 66 55 44 33 22 11] (full 128-bit LE vector). This is grouped into units of four bytes for the words array: [00 ff ee dd] [cc bb aa 99] [88 77 66 55] [44 33 22 11] And the meaning of the individual uint32_t's is: 0xddeeff00 0x99aabbcc 0x55667788 0x11223344 Which are the building blocks for the crypto primitives. * Assuming a big endian host, you get the following byte array for the same store (if I understand Peter right -- two uint64_t values are encoded as host endian, but the ordering between those remains little endian): [99 aa bb cc dd ee ff 00 11 22 33 44 55 66 77 88] (and your CR_ST_BYTE() macro looks right for this). Grouping this into units of four bytes for the words array, we get [99 aa bb cc] [dd ee ff 00] [11 22 33 44] [55 66 77 88] resulting in values 0x99aabbcc 0xddeeff00 0x11223344 0x55667788 And then your CR_ST_WORD macro permutes them (1, 0, 3, 2) to the following values: 0xddeeff00 0x99aabbcc 0x55667788 0x11223344 ... Which is identical to the LE host result. So this part looks fine to me. Regarding the rest -- in my opinion, getting the implementation of crypto primitivies *wrong* despite them succesfully passing an extensive test suite is exceedingly unlikely. You usually cannot get crypto primitives "halfway right" -- as long as you don't *combine* them to bigger building blocks, crypto primitives are 100% specified and there are no "corner cases" that are usual for other program logic. The implementation either works or is catastrophically broken (which is quickly visible). Hence, if Peter gets good test results for this patchset, then I think that *suffices* to apply the patch. ... Which is why I won't try to eyeball the rest of the patch where you put the macros to use. :) The macros themselves are sound. If you broke their application, they won't pass the test suite (in the kernel bootup code, or elsewhere). Acked-by: Laszlo Ersek <ler...@redhat.com> Thanks Laszlo > +#else > +#define CR_ST_BYTE(state, i) (state.bytes[i]) > +#define CR_ST_WORD(state, i) (state.words[i]) > +#endif > + > void HELPER(crypto_aese)(CPUARMState *env, uint32_t rd, uint32_t rm, > uint32_t decrypt) > { > @@ -46,7 +54,7 @@ void HELPER(crypto_aese)(CPUARMState *env, uint32_t rd, > uint32_t rm, > > /* combine ShiftRows operation and sbox substitution */ > for (i = 0; i < 16; i++) { > - st.bytes[i] = sbox[decrypt][rk.bytes[shift[decrypt][i]]]; > + CR_ST_BYTE(st, i) = sbox[decrypt][CR_ST_BYTE(rk, shift[decrypt][i])]; > } > > env->vfp.regs[rd] = make_float64(st.l[0]); > @@ -198,11 +206,11 @@ void HELPER(crypto_aesmc)(CPUARMState *env, uint32_t > rd, uint32_t rm, > assert(decrypt < 2); > > for (i = 0; i < 16; i += 4) { > - st.words[i >> 2] = cpu_to_le32( > - mc[decrypt][st.bytes[i]] ^ > - rol32(mc[decrypt][st.bytes[i + 1]], 8) ^ > - rol32(mc[decrypt][st.bytes[i + 2]], 16) ^ > - rol32(mc[decrypt][st.bytes[i + 3]], 24)); > + CR_ST_WORD(st, i >> 2) = > + mc[decrypt][CR_ST_BYTE(st, i)] ^ > + rol32(mc[decrypt][CR_ST_BYTE(st, i + 1)], 8) ^ > + rol32(mc[decrypt][CR_ST_BYTE(st, i + 2)], 16) ^ > + rol32(mc[decrypt][CR_ST_BYTE(st, i + 3)], 24); > } > > env->vfp.regs[rd] = make_float64(st.l[0]); > @@ -255,24 +263,25 @@ void HELPER(crypto_sha1_3reg)(CPUARMState *env, > uint32_t rd, uint32_t rn, > > switch (op) { > case 0: /* sha1c */ > - t = cho(d.words[1], d.words[2], d.words[3]); > + t = cho(CR_ST_WORD(d, 1), CR_ST_WORD(d, 2), CR_ST_WORD(d, > 3)); > break; > case 1: /* sha1p */ > - t = par(d.words[1], d.words[2], d.words[3]); > + t = par(CR_ST_WORD(d, 1), CR_ST_WORD(d, 2), CR_ST_WORD(d, > 3)); > break; > case 2: /* sha1m */ > - t = maj(d.words[1], d.words[2], d.words[3]); > + t = maj(CR_ST_WORD(d, 1), CR_ST_WORD(d, 2), CR_ST_WORD(d, > 3)); > break; > default: > g_assert_not_reached(); > } > - t += rol32(d.words[0], 5) + n.words[0] + m.words[i]; > - > - n.words[0] = d.words[3]; > - d.words[3] = d.words[2]; > - d.words[2] = ror32(d.words[1], 2); > - d.words[1] = d.words[0]; > - d.words[0] = t; > + t += rol32(CR_ST_WORD(d, 0), 5) + CR_ST_WORD(n, 0) > + + CR_ST_WORD(m, i); > + > + CR_ST_WORD(n, 0) = CR_ST_WORD(d, 3); > + CR_ST_WORD(d, 3) = CR_ST_WORD(d, 2); > + CR_ST_WORD(d, 2) = ror32(CR_ST_WORD(d, 1), 2); > + CR_ST_WORD(d, 1) = CR_ST_WORD(d, 0); > + CR_ST_WORD(d, 0) = t; > } > } > env->vfp.regs[rd] = make_float64(d.l[0]); > @@ -286,8 +295,8 @@ void HELPER(crypto_sha1h)(CPUARMState *env, uint32_t rd, > uint32_t rm) > float64_val(env->vfp.regs[rm + 1]) > } }; > > - m.words[0] = ror32(m.words[0], 2); > - m.words[1] = m.words[2] = m.words[3] = 0; > + CR_ST_WORD(m, 0) = ror32(CR_ST_WORD(m, 0), 2); > + CR_ST_WORD(m, 1) = CR_ST_WORD(m, 2) = CR_ST_WORD(m, 3) = 0; > > env->vfp.regs[rd] = make_float64(m.l[0]); > env->vfp.regs[rd + 1] = make_float64(m.l[1]); > @@ -304,10 +313,10 @@ void HELPER(crypto_sha1su1)(CPUARMState *env, uint32_t > rd, uint32_t rm) > float64_val(env->vfp.regs[rm + 1]) > } }; > > - d.words[0] = rol32(d.words[0] ^ m.words[1], 1); > - d.words[1] = rol32(d.words[1] ^ m.words[2], 1); > - d.words[2] = rol32(d.words[2] ^ m.words[3], 1); > - d.words[3] = rol32(d.words[3] ^ d.words[0], 1); > + CR_ST_WORD(d, 0) = rol32(CR_ST_WORD(d, 0) ^ CR_ST_WORD(m, 1), 1); > + CR_ST_WORD(d, 1) = rol32(CR_ST_WORD(d, 1) ^ CR_ST_WORD(m, 2), 1); > + CR_ST_WORD(d, 2) = rol32(CR_ST_WORD(d, 2) ^ CR_ST_WORD(m, 3), 1); > + CR_ST_WORD(d, 3) = rol32(CR_ST_WORD(d, 3) ^ CR_ST_WORD(d, 0), 1); > > env->vfp.regs[rd] = make_float64(d.l[0]); > env->vfp.regs[rd + 1] = make_float64(d.l[1]); > @@ -356,20 +365,22 @@ void HELPER(crypto_sha256h)(CPUARMState *env, uint32_t > rd, uint32_t rn, > int i; > > for (i = 0; i < 4; i++) { > - uint32_t t = cho(n.words[0], n.words[1], n.words[2]) + n.words[3] > - + S1(n.words[0]) + m.words[i]; > - > - n.words[3] = n.words[2]; > - n.words[2] = n.words[1]; > - n.words[1] = n.words[0]; > - n.words[0] = d.words[3] + t; > - > - t += maj(d.words[0], d.words[1], d.words[2]) + S0(d.words[0]); > - > - d.words[3] = d.words[2]; > - d.words[2] = d.words[1]; > - d.words[1] = d.words[0]; > - d.words[0] = t; > + uint32_t t = cho(CR_ST_WORD(n, 0), CR_ST_WORD(n, 1), CR_ST_WORD(n, > 2)) > + + CR_ST_WORD(n, 3) + S1(CR_ST_WORD(n, 0)) > + + CR_ST_WORD(m, i); > + > + CR_ST_WORD(n, 3) = CR_ST_WORD(n, 2); > + CR_ST_WORD(n, 2) = CR_ST_WORD(n, 1); > + CR_ST_WORD(n, 1) = CR_ST_WORD(n, 0); > + CR_ST_WORD(n, 0) = CR_ST_WORD(d, 3) + t; > + > + t += maj(CR_ST_WORD(d, 0), CR_ST_WORD(d, 1), CR_ST_WORD(d, 2)) > + + S0(CR_ST_WORD(d, 0)); > + > + CR_ST_WORD(d, 3) = CR_ST_WORD(d, 2); > + CR_ST_WORD(d, 2) = CR_ST_WORD(d, 1); > + CR_ST_WORD(d, 1) = CR_ST_WORD(d, 0); > + CR_ST_WORD(d, 0) = t; > } > > env->vfp.regs[rd] = make_float64(d.l[0]); > @@ -394,13 +405,14 @@ void HELPER(crypto_sha256h2)(CPUARMState *env, uint32_t > rd, uint32_t rn, > int i; > > for (i = 0; i < 4; i++) { > - uint32_t t = cho(d.words[0], d.words[1], d.words[2]) + d.words[3] > - + S1(d.words[0]) + m.words[i]; > - > - d.words[3] = d.words[2]; > - d.words[2] = d.words[1]; > - d.words[1] = d.words[0]; > - d.words[0] = n.words[3 - i] + t; > + uint32_t t = cho(CR_ST_WORD(d, 0), CR_ST_WORD(d, 1), CR_ST_WORD(d, > 2)) > + + CR_ST_WORD(d, 3) + S1(CR_ST_WORD(d, 0)) > + + CR_ST_WORD(m, i); > + > + CR_ST_WORD(d, 3) = CR_ST_WORD(d, 2); > + CR_ST_WORD(d, 2) = CR_ST_WORD(d, 1); > + CR_ST_WORD(d, 1) = CR_ST_WORD(d, 0); > + CR_ST_WORD(d, 0) = CR_ST_WORD(n, 3 - i) + t; > } > > env->vfp.regs[rd] = make_float64(d.l[0]); > @@ -418,10 +430,10 @@ void HELPER(crypto_sha256su0)(CPUARMState *env, > uint32_t rd, uint32_t rm) > float64_val(env->vfp.regs[rm + 1]) > } }; > > - d.words[0] += s0(d.words[1]); > - d.words[1] += s0(d.words[2]); > - d.words[2] += s0(d.words[3]); > - d.words[3] += s0(m.words[0]); > + CR_ST_WORD(d, 0) += s0(CR_ST_WORD(d, 1)); > + CR_ST_WORD(d, 1) += s0(CR_ST_WORD(d, 2)); > + CR_ST_WORD(d, 2) += s0(CR_ST_WORD(d, 3)); > + CR_ST_WORD(d, 3) += s0(CR_ST_WORD(m, 0)); > > env->vfp.regs[rd] = make_float64(d.l[0]); > env->vfp.regs[rd + 1] = make_float64(d.l[1]); > @@ -443,10 +455,10 @@ void HELPER(crypto_sha256su1)(CPUARMState *env, > uint32_t rd, uint32_t rn, > float64_val(env->vfp.regs[rm + 1]) > } }; > > - d.words[0] += s1(m.words[2]) + n.words[1]; > - d.words[1] += s1(m.words[3]) + n.words[2]; > - d.words[2] += s1(d.words[0]) + n.words[3]; > - d.words[3] += s1(d.words[1]) + m.words[0]; > + CR_ST_WORD(d, 0) += s1(CR_ST_WORD(m, 2)) + CR_ST_WORD(n, 1); > + CR_ST_WORD(d, 1) += s1(CR_ST_WORD(m, 3)) + CR_ST_WORD(n, 2); > + CR_ST_WORD(d, 2) += s1(CR_ST_WORD(d, 0)) + CR_ST_WORD(n, 3); > + CR_ST_WORD(d, 3) += s1(CR_ST_WORD(d, 1)) + CR_ST_WORD(m, 0); > > env->vfp.regs[rd] = make_float64(d.l[0]); > env->vfp.regs[rd + 1] = make_float64(d.l[1]); >