Hello, I have attached a patch that optimizes the Twofish cipher. Please let me know if any further changes are required.
The following are the crypto_bench results after the optimization: lavu TWOFISH size: 1048576 runs: 1024 time: 18.799 +- 0.132 gcrypt TWOFISH size: 1048576 runs: 1024 time: 25.643 +- 0.096 tomcrypt TWOFISH size: 1048576 runs: 1024 time: 18.372 +- 0.082 Thanks, Supraja
From 8863d42845e52a7dcf5bffb875dacdf9a4449c70 Mon Sep 17 00:00:00 2001 From: Supraja Meedinti <supraja0...@gmail.com> Date: Mon, 16 Feb 2015 14:46:24 +0530 Subject: [PATCH] libavutil: optimize twofish cipher Signed-off-by: Supraja Meedinti <supraja0...@gmail.com> --- libavutil/twofish.c | 54 +++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 46 insertions(+), 8 deletions(-) diff --git a/libavutil/twofish.c b/libavutil/twofish.c index 337c099..a300290 100644 --- a/libavutil/twofish.c +++ b/libavutil/twofish.c @@ -26,6 +26,10 @@ #define LR(x, n) ((x) << (n) | (x) >> (32 - (n))) #define RR(x, n) ((x) >> (n) | (x) << (32 - (n))) +static uint32_t MDS[4][256]; + +#define MDS_mul(X) (MDS[0][(X) & 0xff] ^ MDS[1][((X) >> 8) & 0xff] ^ MDS[2][((X) >> 16) & 0xff] ^ MDS[3][(X) >> 24]) + typedef struct AVTWOFISH { uint32_t K[40]; uint32_t S[4]; @@ -174,6 +178,39 @@ static uint32_t tf_h(uint32_t X, uint32_t L[4], int k) return AV_RL32(l); } +static void precomputeMDS(uint32_t L[4], int k) +{ + uint8_t y[4], l[4]; + int i; + for (i = 0; i < 256; i++) { + y[0] = y[1] = y[2] = y[3] = i; + if (k == 4) { + AV_WL32(l, L[3]); + y[0] = q1[y[0]] ^ l[0]; + y[1] = q0[y[1]] ^ l[1]; + y[2] = q0[y[2]] ^ l[2]; + y[3] = q1[y[3]] ^ l[3]; + } + if (k >= 3) { + AV_WL32(l, L[2]); + y[0] = q1[y[0]] ^ l[0]; + y[1] = q1[y[1]] ^ l[1]; + y[2] = q0[y[2]] ^ l[2]; + y[3] = q0[y[3]] ^ l[3]; + } + AV_WL32(l, L[1]); + y[0] = q1[q0[q0[y[0]] ^ l[0]] ^ (L[0] & 0xff)]; + y[1] = q0[q0[q1[y[1]] ^ l[1]] ^ ((L[0] >> 8) & 0xff)]; + y[2] = q1[q1[q0[y[2]] ^ l[2]] ^ ((L[0] >> 16) & 0xff)]; + y[3] = q0[q1[q1[y[3]] ^ l[3]] ^ (L[0] >> 24)]; + + MDS[0][i] = ((uint32_t)y[0]) ^ ((uint32_t)MD1[y[0]] << 8) ^ ((uint32_t)MD2[y[0]] << 16) ^ ((uint32_t)MD2[y[0]] << 24); + MDS[1][i] = ((uint32_t)MD2[y[1]]) ^ ((uint32_t)MD2[y[1]] << 8) ^ ((uint32_t)MD1[y[1]] << 16) ^ ((uint32_t)y[1] << 24); + MDS[2][i] = ((uint32_t)MD1[y[2]]) ^ ((uint32_t)MD2[y[2]] << 8) ^ ((uint32_t)y[2] << 16) ^ ((uint32_t)MD2[y[2]] << 24); + 
MDS[3][i] = ((uint32_t)MD1[y[3]]) ^ ((uint32_t)y[3] << 8) ^ ((uint32_t)MD2[y[3]] << 16) ^ ((uint32_t)MD1[y[3]] << 24); + } +} + static void twofish_encrypt(AVTWOFISH *cs, uint8_t *dst, const uint8_t *src) { uint32_t P[4], t0, t1; @@ -183,12 +220,12 @@ static void twofish_encrypt(AVTWOFISH *cs, uint8_t *dst, const uint8_t *src) P[2] = AV_RL32(src + 8) ^ cs->K[2]; P[3] = AV_RL32(src + 12) ^ cs->K[3]; for (i = 0; i < 16; i += 2) { - t0 = tf_h(P[0], cs->S, cs->ksize); - t1 = tf_h(LR(P[1], 8), cs->S, cs->ksize); + t0 = MDS_mul(P[0]); + t1 = MDS_mul(LR(P[1], 8)); P[2] = RR(P[2] ^ (t0 + t1 + cs->K[2 * i + 8]), 1); P[3] = LR(P[3], 1) ^ (t0 + 2 * t1 + cs->K[2 * i + 9]); - t0 = tf_h(P[2], cs->S, cs->ksize); - t1 = tf_h(LR(P[3], 8), cs->S, cs->ksize); + t0 = MDS_mul(P[2]); + t1 = MDS_mul(LR(P[3], 8)); P[0] = RR(P[0] ^ (t0 + t1 + cs->K[2 * i + 10]), 1); P[1] = LR(P[1], 1) ^ (t0 + 2 * t1 + cs->K[2 * i + 11]); } @@ -211,12 +248,12 @@ static void twofish_decrypt(AVTWOFISH *cs, uint8_t *dst, const uint8_t *src, uin P[0] = AV_RL32(src + 8) ^ cs->K[6]; P[1] = AV_RL32(src + 12) ^ cs->K[7]; for (i = 15; i >= 0; i -= 2) { - t0 = tf_h(P[2], cs->S, cs->ksize); - t1 = tf_h(LR(P[3], 8), cs->S, cs->ksize); + t0 = MDS_mul(P[2]); + t1 = MDS_mul(LR(P[3], 8)); P[0] = LR(P[0], 1) ^ (t0 + t1 + cs->K[2 * i + 8]); P[1] = RR(P[1] ^ (t0 + 2 * t1 + cs->K[2 * i + 9]), 1); - t0 = tf_h(P[0], cs->S, cs->ksize); - t1 = tf_h(LR(P[1], 8), cs->S, cs->ksize); + t0 = MDS_mul(P[0]); + t1 = MDS_mul(LR(P[1], 8)); P[2] = LR(P[2], 1) ^ (t0 + t1 + cs->K[2 * i + 6]); P[3] = RR(P[3] ^ (t0 + 2 * t1 + cs->K[2 * i + 7]), 1); } @@ -265,6 +302,7 @@ av_cold int av_twofish_init(AVTWOFISH *cs, const uint8_t *key, int key_bits) Mo[i] = Key[2 * i + 1]; cs->S[cs->ksize - i - 1] = tf_RS(Me[i], Mo[i]); } + precomputeMDS(cs->S,cs->ksize); for (i = 0; i < 20; i++) { A = tf_h((2 * i) * rho, Me, cs->ksize); B = tf_h((2 * i + 1) * rho, Mo, cs->ksize); -- 1.8.3.2
_______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel