Hello,

I have attached a patch with an optimization for twofish. Please let me know
if any further changes are required.

The following are the crypto_bench results after the optimization.

lavu       TWOFISH      size: 1048576  runs:   1024  time:   18.799 +- 0.132
gcrypt     TWOFISH      size: 1048576  runs:   1024  time:   25.643 +- 0.096
tomcrypt   TWOFISH      size: 1048576  runs:   1024  time:   18.372 +- 0.082

Thanks,
Supraja
From 8863d42845e52a7dcf5bffb875dacdf9a4449c70 Mon Sep 17 00:00:00 2001
From: Supraja Meedinti <supraja0...@gmail.com>
Date: Mon, 16 Feb 2015 14:46:24 +0530
Subject: [PATCH] libavutil: optimize twofish cipher

Signed-off-by: Supraja Meedinti <supraja0...@gmail.com>
---
 libavutil/twofish.c | 54 +++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 46 insertions(+), 8 deletions(-)

diff --git a/libavutil/twofish.c b/libavutil/twofish.c
index 337c099..a300290 100644
--- a/libavutil/twofish.c
+++ b/libavutil/twofish.c
@@ -26,6 +26,10 @@
 #define LR(x, n) ((x) << (n) | (x) >> (32 - (n)))
 #define RR(x, n) ((x) >> (n) | (x) << (32 - (n)))
 
+static uint32_t MDS[4][256];
+
+#define MDS_mul(X) (MDS[0][(X) & 0xff] ^ MDS[1][((X) >> 8) & 0xff] ^ MDS[2][((X) >> 16) & 0xff] ^ MDS[3][(X) >> 24])
+
 typedef struct AVTWOFISH {
     uint32_t K[40];
     uint32_t S[4];
@@ -174,6 +178,39 @@ static uint32_t tf_h(uint32_t X, uint32_t L[4], int k)
     return AV_RL32(l);
 }
 
+static void precomputeMDS(uint32_t L[4], int k)
+{
+    uint8_t y[4], l[4];
+    int i;
+    for (i = 0; i < 256; i++) {
+        y[0] = y[1] = y[2] = y[3] = i;
+        if (k == 4) {
+            AV_WL32(l, L[3]);
+            y[0] = q1[y[0]] ^ l[0];
+            y[1] = q0[y[1]] ^ l[1];
+            y[2] = q0[y[2]] ^ l[2];
+            y[3] = q1[y[3]] ^ l[3];
+        }
+        if (k >= 3) {
+            AV_WL32(l, L[2]);
+            y[0] = q1[y[0]] ^ l[0];
+            y[1] = q1[y[1]] ^ l[1];
+            y[2] = q0[y[2]] ^ l[2];
+            y[3] = q0[y[3]] ^ l[3];
+        }
+        AV_WL32(l, L[1]);
+        y[0] = q1[q0[q0[y[0]] ^ l[0]] ^ (L[0] & 0xff)];
+        y[1] = q0[q0[q1[y[1]] ^ l[1]] ^ ((L[0] >> 8) & 0xff)];
+        y[2] = q1[q1[q0[y[2]] ^ l[2]] ^ ((L[0] >> 16) & 0xff)];
+        y[3] = q0[q1[q1[y[3]] ^ l[3]] ^ (L[0] >> 24)];
+
+        MDS[0][i] = ((uint32_t)y[0]) ^ ((uint32_t)MD1[y[0]] << 8) ^ ((uint32_t)MD2[y[0]] << 16) ^ ((uint32_t)MD2[y[0]] << 24);
+        MDS[1][i] = ((uint32_t)MD2[y[1]]) ^ ((uint32_t)MD2[y[1]] << 8) ^ ((uint32_t)MD1[y[1]] << 16) ^ ((uint32_t)y[1] << 24);
+        MDS[2][i] = ((uint32_t)MD1[y[2]]) ^ ((uint32_t)MD2[y[2]] << 8) ^ ((uint32_t)y[2] << 16) ^ ((uint32_t)MD2[y[2]] << 24);
+        MDS[3][i] = ((uint32_t)MD1[y[3]]) ^ ((uint32_t)y[3] << 8) ^ ((uint32_t)MD2[y[3]] << 16) ^ ((uint32_t)MD1[y[3]] << 24);
+    }
+}
+
 static void twofish_encrypt(AVTWOFISH *cs, uint8_t *dst, const uint8_t *src)
 {
     uint32_t P[4], t0, t1;
@@ -183,12 +220,12 @@ static void twofish_encrypt(AVTWOFISH *cs, uint8_t *dst, const uint8_t *src)
     P[2] = AV_RL32(src + 8) ^ cs->K[2];
     P[3] = AV_RL32(src + 12) ^ cs->K[3];
     for (i = 0; i < 16; i += 2) {
-        t0 = tf_h(P[0], cs->S, cs->ksize);
-        t1 = tf_h(LR(P[1], 8), cs->S, cs->ksize);
+        t0 = MDS_mul(P[0]);
+        t1 = MDS_mul(LR(P[1], 8));
         P[2] = RR(P[2] ^ (t0 + t1 + cs->K[2 * i + 8]), 1);
         P[3] = LR(P[3], 1) ^ (t0 + 2 * t1 + cs->K[2 * i + 9]);
-        t0 = tf_h(P[2], cs->S, cs->ksize);
-        t1 = tf_h(LR(P[3], 8), cs->S, cs->ksize);
+        t0 = MDS_mul(P[2]);
+        t1 = MDS_mul(LR(P[3], 8));
         P[0] = RR(P[0] ^ (t0 + t1 + cs->K[2 * i + 10]), 1);
         P[1] = LR(P[1], 1) ^ (t0 + 2 * t1 + cs->K[2 * i + 11]);
     }
@@ -211,12 +248,12 @@ static void twofish_decrypt(AVTWOFISH *cs, uint8_t *dst, const uint8_t *src, uin
     P[0] = AV_RL32(src + 8) ^ cs->K[6];
     P[1] = AV_RL32(src + 12) ^ cs->K[7];
     for (i = 15; i >= 0; i -= 2) {
-        t0 = tf_h(P[2], cs->S, cs->ksize);
-        t1 = tf_h(LR(P[3], 8), cs->S, cs->ksize);
+        t0 = MDS_mul(P[2]);
+        t1 = MDS_mul(LR(P[3], 8));
         P[0] = LR(P[0], 1) ^ (t0 + t1 + cs->K[2 * i + 8]);
         P[1] = RR(P[1] ^ (t0 + 2 * t1 + cs->K[2 * i + 9]), 1);
-        t0 = tf_h(P[0], cs->S, cs->ksize);
-        t1 = tf_h(LR(P[1], 8), cs->S, cs->ksize);
+        t0 = MDS_mul(P[0]);
+        t1 = MDS_mul(LR(P[1], 8));
         P[2] = LR(P[2], 1) ^ (t0 + t1 + cs->K[2 * i + 6]);
         P[3] = RR(P[3] ^ (t0 + 2 * t1 + cs->K[2 * i + 7]), 1);
     }
@@ -265,6 +302,7 @@ av_cold int av_twofish_init(AVTWOFISH *cs, const uint8_t *key, int key_bits)
         Mo[i] = Key[2 * i + 1];
         cs->S[cs->ksize - i - 1] = tf_RS(Me[i], Mo[i]);
     }
+    precomputeMDS(cs->S,cs->ksize);
     for (i = 0; i < 20; i++) {
         A = tf_h((2 * i) * rho, Me, cs->ksize);
         B = tf_h((2 * i + 1) * rho, Mo, cs->ksize);
-- 
1.8.3.2

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel

Reply via email to