On Tuesday 19 April 2005 20:40, Chris Wright wrote: > * Denis Vlasenko ([EMAIL PROTECTED]) wrote: > > On Tuesday 19 April 2005 08:42, Denis Vlasenko wrote: > > > modprobe tcrypt hangs the box on both kernels. > > > The last printks are: > > > > > > <wp256 test runs ok> > > > > > > testing wp384 > > > NN<n>Unable to handle kernel paging request at virtual address eXXXXXXX > > > > > > Nothing is printed after this and system locks up solid. > > > No Sysrq-B. > > > > > > IIRC, 2.6.9 was okay. > > > > Update: it does not oops on another machine. CPU or .config related, > > I'll look into it... > > Any update? This is candidate for -stable fixing if it's an actual bug.
Yes. wp512_process_buffer() was using 3k of stack if compiled with -O2. The wp512.c I appended (sans table at top) is instrumented to show it. Use "make crypto/wp512.s". This is a suboptimal code generation by gcc, so I CC-ing gcc list for comments. Note that -Os compiled one (CONFIG_CC_OPTIMIZE_FOR_SIZE=y) does not have stack overflow problem and is significantly smaller, too. -- vda /** * The core Whirlpool transform. */ static void wp512_process_buffer(struct wp512_ctx *wctx) { int i, r; u64 K[8]; /* the round key */ u64 block[8]; /* mu(buffer) */ u64 state[8]; /* the cipher state */ u64 L[8]; for (i = 0; i < 8; i++) { block[i] = be64_to_cpu( ((__be64*)wctx->buffer)[i] ); } state[0] = block[0] ^ (K[0] = wctx->hash[0]); state[1] = block[1] ^ (K[1] = wctx->hash[1]); state[2] = block[2] ^ (K[2] = wctx->hash[2]); state[3] = block[3] ^ (K[3] = wctx->hash[3]); state[4] = block[4] ^ (K[4] = wctx->hash[4]); state[5] = block[5] ^ (K[5] = wctx->hash[5]); state[6] = block[6] ^ (K[6] = wctx->hash[6]); state[7] = block[7] ^ (K[7] = wctx->hash[7]); // gcc optimizer bug: first method is noticeably // worse than second: loads full u32, shifts and // zero-extends low u8 to u32 #if 0 #define BYTE7(v) ((u8)((v) >> 56)) #define BYTE6(v) ((u8)((v) >> 48)) #define BYTE5(v) ((u8)((v) >> 40)) #define BYTE4(v) ((u8)((v) >> 32)) // gcc optimizer bug: without (u32) below will emit // spurious shrd insns #define BYTE3(v) ((u8)((u32)(v) >> 24)) #define BYTE2(v) ((u8)((u32)(v) >> 16)) #define BYTE1(v) ((u8)((u32)(v) >> 8)) #define BYTE0(v) ((u8)(v)) #else // little-endian #define BYTE7(v) (((u8*)&v)[7]) #define BYTE6(v) (((u8*)&v)[6]) #define BYTE5(v) (((u8*)&v)[5]) #define BYTE4(v) (((u8*)&v)[4]) #define BYTE3(v) (((u8*)&v)[3]) #define BYTE2(v) (((u8*)&v)[2]) #define BYTE1(v) (((u8*)&v)[1]) #define BYTE0(v) (((u8*)&v)[0]) #endif // gcc -O2 optimizer bug: second method // causes excessive spills (~3K stack used) #if 1 #define X(a) a ^= #define XEND ; #else #define X(a) ^ #define XEND #endif for (r = 1; r <= WHIRLPOOL_ROUNDS; r++) { asm("#1"); L[0] = C0[BYTE7(K[0])] XEND X(L[0]) C1[BYTE6(K[7])] XEND X(L[0]) C2[BYTE5(K[6])] XEND X(L[0]) C3[BYTE4(K[5])] XEND X(L[0]) C4[BYTE3(K[4])] XEND X(L[0]) C5[BYTE2(K[3])] XEND X(L[0]) C6[BYTE1(K[2])] XEND X(L[0]) C7[BYTE0(K[1])] XEND X(L[0]) rc[r]; asm("#2"); L[1] = C0[BYTE7(K[1])] XEND X(L[1]) C1[BYTE6(K[0])] XEND X(L[1]) C2[BYTE5(K[7])] XEND X(L[1]) C3[BYTE4(K[6])] XEND X(L[1]) C4[BYTE3(K[5])] XEND X(L[1]) C5[BYTE2(K[4])] XEND X(L[1]) C6[BYTE1(K[3])] XEND X(L[1]) C7[BYTE0(K[2])]; L[2] = C0[BYTE7(K[2])] XEND X(L[2]) C1[BYTE6(K[1])] XEND X(L[2]) C2[BYTE5(K[0])] XEND X(L[2]) C3[BYTE4(K[7])] XEND X(L[2]) C4[BYTE3(K[6])] XEND X(L[2]) C5[BYTE2(K[5])] XEND X(L[2]) C6[BYTE1(K[4])] XEND X(L[2]) C7[BYTE0(K[3])]; L[3] = C0[BYTE7(K[3])] XEND X(L[3]) C1[BYTE6(K[2])] XEND X(L[3]) C2[BYTE5(K[1])] XEND X(L[3]) C3[BYTE4(K[0])] XEND X(L[3]) C4[BYTE3(K[7])] XEND X(L[3]) C5[BYTE2(K[6])] XEND X(L[3]) C6[BYTE1(K[5])] XEND X(L[3]) C7[BYTE0(K[4])]; L[4] = C0[BYTE7(K[4])] XEND X(L[4]) C1[BYTE6(K[3])] XEND X(L[4]) C2[BYTE5(K[2])] XEND X(L[4]) C3[BYTE4(K[1])] XEND X(L[4]) C4[BYTE3(K[0])] XEND X(L[4]) C5[BYTE2(K[7])] XEND X(L[4]) C6[BYTE1(K[6])] XEND X(L[4]) C7[BYTE0(K[5])]; L[5] = C0[BYTE7(K[5])] XEND X(L[5]) C1[BYTE6(K[4])] XEND X(L[5]) C2[BYTE5(K[3])] XEND X(L[5]) C3[BYTE4(K[2])] XEND X(L[5]) C4[BYTE3(K[1])] XEND X(L[5]) C5[BYTE2(K[0])] XEND X(L[5]) C6[BYTE1(K[7])] XEND X(L[5]) C7[BYTE0(K[6])]; L[6] = C0[BYTE7(K[6])] XEND X(L[6]) C1[BYTE6(K[5])] XEND X(L[6]) C2[BYTE5(K[4])] XEND X(L[6]) C3[BYTE4(K[3])] XEND X(L[6]) C4[BYTE3(K[2])] XEND X(L[6]) C5[BYTE2(K[1])] XEND X(L[6]) C6[BYTE1(K[0])] XEND X(L[6]) C7[BYTE0(K[7])]; L[7] = C0[BYTE7(K[7])] XEND X(L[7]) C1[BYTE6(K[6])] XEND X(L[7]) C2[BYTE5(K[5])] XEND X(L[7]) C3[BYTE4(K[4])] XEND X(L[7]) C4[BYTE3(K[3])] XEND X(L[7]) C5[BYTE2(K[2])] XEND X(L[7]) C6[BYTE1(K[1])] XEND X(L[7]) C7[BYTE0(K[0])]; K[0] = L[0]; K[1] = L[1]; K[2] = L[2]; K[3] = L[3]; K[4] = L[4]; K[5] = L[5]; K[6] = L[6]; K[7] = L[7]; L[0] = C0[BYTE7(state[0])] XEND X(L[0]) C1[BYTE6(state[7])] XEND X(L[0]) C2[BYTE5(state[6])] XEND X(L[0]) C3[BYTE4(state[5])] XEND X(L[0]) C4[BYTE3(state[4])] XEND X(L[0]) C5[BYTE2(state[3])] XEND X(L[0]) C6[BYTE1(state[2])] XEND X(L[0]) C7[BYTE0(state[1])] XEND X(L[0]) K[0]; L[1] = C0[BYTE7(state[1])] XEND X(L[1]) C1[BYTE6(state[0])] XEND X(L[1]) C2[BYTE5(state[7])] XEND X(L[1]) C3[BYTE4(state[6])] XEND X(L[1]) C4[BYTE3(state[5])] XEND X(L[1]) C5[BYTE2(state[4])] XEND X(L[1]) C6[BYTE1(state[3])] XEND X(L[1]) C7[BYTE0(state[2])] XEND X(L[1]) K[1]; L[2] = C0[BYTE7(state[2])] XEND X(L[2]) C1[BYTE6(state[1])] XEND X(L[2]) C2[BYTE5(state[0])] XEND X(L[2]) C3[BYTE4(state[7])] XEND X(L[2]) C4[BYTE3(state[6])] XEND X(L[2]) C5[BYTE2(state[5])] XEND X(L[2]) C6[BYTE1(state[4])] XEND X(L[2]) C7[BYTE0(state[3])] XEND X(L[2]) K[2]; L[3] = C0[BYTE7(state[3])] XEND X(L[3]) C1[BYTE6(state[2])] XEND X(L[3]) C2[BYTE5(state[1])] XEND X(L[3]) C3[BYTE4(state[0])] XEND X(L[3]) C4[BYTE3(state[7])] XEND X(L[3]) C5[BYTE2(state[6])] XEND X(L[3]) C6[BYTE1(state[5])] XEND X(L[3]) C7[BYTE0(state[4])] XEND X(L[3]) K[3]; L[4] = C0[BYTE7(state[4])] XEND X(L[4]) C1[BYTE6(state[3])] XEND X(L[4]) C2[BYTE5(state[2])] XEND X(L[4]) C3[BYTE4(state[1])] XEND X(L[4]) C4[BYTE3(state[0])] XEND X(L[4]) C5[BYTE2(state[7])] XEND X(L[4]) C6[BYTE1(state[6])] XEND X(L[4]) C7[BYTE0(state[5])] XEND X(L[4]) K[4]; L[5] = C0[BYTE7(state[5])] XEND X(L[5]) C1[BYTE6(state[4])] XEND X(L[5]) C2[BYTE5(state[3])] XEND X(L[5]) C3[BYTE4(state[2])] XEND X(L[5]) C4[BYTE3(state[1])] XEND X(L[5]) C5[BYTE2(state[0])] XEND X(L[5]) C6[BYTE1(state[7])] XEND X(L[5]) C7[BYTE0(state[6])] XEND X(L[5]) K[5]; L[6] = C0[BYTE7(state[6])] XEND X(L[6]) C1[BYTE6(state[5])] XEND X(L[6]) C2[BYTE5(state[4])] XEND X(L[6]) C3[BYTE4(state[3])] XEND X(L[6]) C4[BYTE3(state[2])] XEND X(L[6]) C5[BYTE2(state[1])] XEND X(L[6]) C6[BYTE1(state[0])] XEND X(L[6]) C7[BYTE0(state[7])] XEND X(L[6]) K[6]; L[7] = C0[BYTE7(state[7])] XEND X(L[7]) C1[BYTE6(state[6])] XEND X(L[7]) C2[BYTE5(state[5])] XEND X(L[7]) C3[BYTE4(state[4])] XEND X(L[7]) C4[BYTE3(state[3])] XEND X(L[7]) C5[BYTE2(state[2])] XEND X(L[7]) C6[BYTE1(state[1])] XEND X(L[7]) C7[BYTE0(state[0])] XEND X(L[7]) K[7]; state[0] = L[0]; state[1] = L[1]; state[2] = L[2]; state[3] = L[3]; state[4] = L[4]; state[5] = L[5]; state[6] = L[6]; state[7] = L[7]; } /* * apply the Miyaguchi-Preneel compression function: */ wctx->hash[0] ^= state[0] ^ block[0]; wctx->hash[1] ^= state[1] ^ block[1]; wctx->hash[2] ^= state[2] ^ block[2]; wctx->hash[3] ^= state[3] ^ block[3]; wctx->hash[4] ^= state[4] ^ block[4]; wctx->hash[5] ^= state[5] ^ block[5]; wctx->hash[6] ^= state[6] ^ block[6]; wctx->hash[7] ^= state[7] ^ block[7]; } static void wp512_init(void *ctx) { int i; struct wp512_ctx *wctx = ctx; memset(wctx->bitLength, 0, 32); wctx->bufferBits = wctx->bufferPos = 0; wctx->buffer[0] = 0; for (i = 0; i < 8; i++) { wctx->hash[i] = 0L; } } static void wp512_update(void *ctx, const u8 *source, unsigned int len) { struct wp512_ctx *wctx = ctx; int sourcePos = 0; unsigned int bits_len = len * 8; // convert to number of bits int sourceGap = (8 - ((int)bits_len & 7)) & 7; int bufferRem = wctx->bufferBits & 7; int i; u32 b, carry; u8 *buffer = wctx->buffer; u8 *bitLength = wctx->bitLength; int bufferBits = wctx->bufferBits; int bufferPos = wctx->bufferPos; u64 value = bits_len; for (i = 31, carry = 0; i >= 0 && (carry != 0 || value != 0ULL); i--) { carry += bitLength[i] + ((u32)value & 0xff); bitLength[i] = (u8)carry; carry >>= 8; value >>= 8; } while (bits_len > 8) { b = ((source[sourcePos] << sourceGap) & 0xff) | ((source[sourcePos + 1] & 0xff) >> (8 - sourceGap)); buffer[bufferPos++] |= (u8)(b >> bufferRem); bufferBits += 8 - bufferRem; if (bufferBits == WP512_BLOCK_SIZE * 8) { wp512_process_buffer(wctx); bufferBits = bufferPos = 0; } buffer[bufferPos] = b << (8 - bufferRem); bufferBits += bufferRem; bits_len -= 8; sourcePos++; } if (bits_len > 0) { b = (source[sourcePos] << sourceGap) & 0xff; buffer[bufferPos] |= b >> bufferRem; } else { b = 0; } if (bufferRem + bits_len < 8) { bufferBits += bits_len; } else { bufferPos++; bufferBits += 8 - bufferRem; bits_len -= 8 - bufferRem; if (bufferBits == WP512_BLOCK_SIZE * 8) { wp512_process_buffer(wctx); bufferBits = bufferPos = 0; } buffer[bufferPos] = b << (8 - bufferRem); bufferBits += (int)bits_len; } wctx->bufferBits = bufferBits; wctx->bufferPos = bufferPos; } static void wp512_final(void *ctx, u8 *out) { struct wp512_ctx *wctx = ctx; int i; u8 *buffer = wctx->buffer; u8 *bitLength = wctx->bitLength; int bufferBits = wctx->bufferBits; int bufferPos = wctx->bufferPos; buffer[bufferPos] |= 0x80U >> (bufferBits & 7); bufferPos++; if (bufferPos > WP512_BLOCK_SIZE - WP512_LENGTHBYTES) { if (bufferPos < WP512_BLOCK_SIZE) { memset(&buffer[bufferPos], 0, WP512_BLOCK_SIZE - bufferPos); } wp512_process_buffer(wctx); bufferPos = 0; } if (bufferPos < WP512_BLOCK_SIZE - WP512_LENGTHBYTES) { memset(&buffer[bufferPos], 0, (WP512_BLOCK_SIZE - WP512_LENGTHBYTES) - bufferPos); } bufferPos = WP512_BLOCK_SIZE - WP512_LENGTHBYTES; memcpy(&buffer[WP512_BLOCK_SIZE - WP512_LENGTHBYTES], bitLength, WP512_LENGTHBYTES); wp512_process_buffer(wctx); for (i = 0; i < WP512_DIGEST_SIZE/8; i++) { ((__be64*)out)[i] = cpu_to_be64(wctx->hash[i]); } wctx->bufferBits = bufferBits; wctx->bufferPos = bufferPos; } static void wp384_final(void *ctx, u8 *out) { struct wp512_ctx *wctx = ctx; u8 D[64]; wp512_final (wctx, D); memcpy (out, D, WP384_DIGEST_SIZE); memset (D, 0, WP512_DIGEST_SIZE); } static void wp256_final(void *ctx, u8 *out) { struct wp512_ctx *wctx = ctx; u8 D[64]; wp512_final (wctx, D); memcpy (out, D, WP256_DIGEST_SIZE); memset (D, 0, WP512_DIGEST_SIZE); } static struct crypto_alg wp512 = { .cra_name = "wp512", .cra_flags = CRYPTO_ALG_TYPE_DIGEST, .cra_blocksize = WP512_BLOCK_SIZE, .cra_ctxsize = sizeof(struct wp512_ctx), .cra_module = THIS_MODULE, .cra_list = LIST_HEAD_INIT(wp512.cra_list), .cra_u = { .digest = { .dia_digestsize = WP512_DIGEST_SIZE, .dia_init = wp512_init, .dia_update = wp512_update, .dia_final = wp512_final } } }; static struct crypto_alg wp384 = { .cra_name = "wp384", .cra_flags = CRYPTO_ALG_TYPE_DIGEST, .cra_blocksize = WP512_BLOCK_SIZE, .cra_ctxsize = sizeof(struct wp512_ctx), .cra_module = THIS_MODULE, .cra_list = LIST_HEAD_INIT(wp384.cra_list), .cra_u = { .digest = { .dia_digestsize = WP384_DIGEST_SIZE, .dia_init = wp512_init, .dia_update = wp512_update, .dia_final = wp384_final } } }; static struct crypto_alg wp256 = { .cra_name = "wp256", .cra_flags = CRYPTO_ALG_TYPE_DIGEST, .cra_blocksize = WP512_BLOCK_SIZE, .cra_ctxsize = sizeof(struct wp512_ctx), .cra_module = THIS_MODULE, .cra_list = LIST_HEAD_INIT(wp256.cra_list), .cra_u = { .digest = { .dia_digestsize = WP256_DIGEST_SIZE, .dia_init = wp512_init, .dia_update = wp512_update, .dia_final = wp256_final } } }; static int __init init(void) { int ret = 0; ret = crypto_register_alg(&wp512); if (ret < 0) goto out; ret = crypto_register_alg(&wp384); if (ret < 0) { crypto_unregister_alg(&wp512); goto out; } ret = crypto_register_alg(&wp256); if (ret < 0) { crypto_unregister_alg(&wp512); crypto_unregister_alg(&wp384); } out: return ret; } static void __exit fini(void) { crypto_unregister_alg(&wp512); crypto_unregister_alg(&wp384); crypto_unregister_alg(&wp256); } MODULE_ALIAS("wp384"); MODULE_ALIAS("wp256"); module_init(init); module_exit(fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Whirlpool Message Digest Algorithm");