Hi Pádraig,

Thank you for your answer.

2011/9/6 Pádraig Brady <p...@draigbrady.com>

> A few general points.
> You essentially used Linus' code (albeit by
> very helpfully isolating the significant differences).
> It might be easier/required to just include it in gnulib?
> There are a few files in gnulib that are not copyright of the FSF,
> so would Nicolas and Linus need to assign copyright?
>

Yes, this is what I did. I don't think that including Linus' code is easier as
the functions have a different prototype. Also, sha1, sha256 and sha512
share the same structure in gnulib, so changing one without changing the others
would be weird. But if you think it is required, I have no problem with
that.

By the way, I have done a test on sha512 and I have improved the speed on
the same 1Gb zero file from 4.5 to 3.9s. Please find the patch attached. So
I think that using the same techniques, we could improve the speed of all
the SHA implementations.

> For performance testing I've found gcc generates
> much more deterministic results with a -march
> as close to native as possible or otherwise
> the code is very susceptible to alignment issues etc.
> Your compiler supports -march=native.
> Note also gcc 4.6 has much better support for your sandy bridge CPU,
> either with -march=native or -march=corei7-avx
>

I tried using gcc-4.6.1 (I recompiled it under my ubuntu 10.10) but I
couldn't see any differences. For me, using any combination of -march=native
or not and gcc 4.4.5 or 4.6.1 doesn't make a difference, all the times are
in the measurement margin.

> As for the SSE version, I would also like to see that included,
> given the proportion of hardware supporting that these days.
> I previously noticed a coreutils SSE2 patch here:
> http://www.arctic.org/~dean/crypto/sha1.html
> Though we'd probably need some runtime SSE detection to include that.
>

Ok, I could try to work on this. The real problem is to test that
compilation and SSE detection are done correctly on several platforms. I only
have access to a few x86 machines; what is the usual way to test more
platforms?

Best regards
-- 
Loïc
--- lib/sha512.c.orig	2011-09-06 15:24:17.320209997 +0200
+++ lib/sha512.c	2011-09-06 14:54:35.503382001 +0200
@@ -498,19 +498,23 @@
 #define SS0(x) u64xor (u64rol (x, 36), u64xor (u64rol (x, 30), u64rol (x, 25)))
 #define SS1(x) u64xor (u64rol(x, 50), u64xor (u64rol (x, 46), u64rol (x, 23)))
 
-#define M(I) (x[(I) & 15]                                                 \
-              = u64plus (x[(I) & 15],                                     \
-                         u64plus (S1 (x[((I) - 2) & 15]),                 \
-                                  u64plus (x[((I) - 7) & 15],             \
-                                           S0 (x[((I) - 15) & 15])))))
+#include "endian.h"
+#define X(I) (be64toh(words[I]))
+#define M(I) (u64plus (x[(I) & 15],                                       \
+                       u64plus (S1 (x[((I) - 2) & 15]),                   \
+                                u64plus (x[((I) - 7) & 15],               \
+                                         S0 (x[((I) - 15) & 15])))))
 
-#define R(A, B, C, D, E, F, G, H, K, M)                                   \
+#define R(A, B, C, D, E, F, G, H, M, I)                                   \
   do                                                                      \
     {                                                                     \
+      u64 temp = M(I);                                                    \
       u64 t0 = u64plus (SS0 (A), F2 (A, B, C));                           \
       u64 t1 =                                                            \
         u64plus (H, u64plus (SS1 (E),                                     \
-                             u64plus (F1 (E, F, G), u64plus (K, M))));    \
+                             u64plus (F1 (E, F, G),                       \
+                                      u64plus (K(I), temp))));            \
+      *(volatile u64 *)&x[(I) & 15] = temp;                                                 \
       D = u64plus (D, t1);                                                \
       H = u64plus (t0, t1);                                               \
     }                                                                     \
@@ -518,94 +522,86 @@
 
   while (words < endp)
     {
-      int t;
-      /* FIXME: see sha1.c for a better implementation.  */
-      for (t = 0; t < 16; t++)
-        {
-          x[t] = SWAP (*words);
-          words++;
-        }
-
-      R( a, b, c, d, e, f, g, h, K( 0), x[ 0] );
-      R( h, a, b, c, d, e, f, g, K( 1), x[ 1] );
-      R( g, h, a, b, c, d, e, f, K( 2), x[ 2] );
-      R( f, g, h, a, b, c, d, e, K( 3), x[ 3] );
-      R( e, f, g, h, a, b, c, d, K( 4), x[ 4] );
-      R( d, e, f, g, h, a, b, c, K( 5), x[ 5] );
-      R( c, d, e, f, g, h, a, b, K( 6), x[ 6] );
-      R( b, c, d, e, f, g, h, a, K( 7), x[ 7] );
-      R( a, b, c, d, e, f, g, h, K( 8), x[ 8] );
-      R( h, a, b, c, d, e, f, g, K( 9), x[ 9] );
-      R( g, h, a, b, c, d, e, f, K(10), x[10] );
-      R( f, g, h, a, b, c, d, e, K(11), x[11] );
-      R( e, f, g, h, a, b, c, d, K(12), x[12] );
-      R( d, e, f, g, h, a, b, c, K(13), x[13] );
-      R( c, d, e, f, g, h, a, b, K(14), x[14] );
-      R( b, c, d, e, f, g, h, a, K(15), x[15] );
-      R( a, b, c, d, e, f, g, h, K(16), M(16) );
-      R( h, a, b, c, d, e, f, g, K(17), M(17) );
-      R( g, h, a, b, c, d, e, f, K(18), M(18) );
-      R( f, g, h, a, b, c, d, e, K(19), M(19) );
-      R( e, f, g, h, a, b, c, d, K(20), M(20) );
-      R( d, e, f, g, h, a, b, c, K(21), M(21) );
-      R( c, d, e, f, g, h, a, b, K(22), M(22) );
-      R( b, c, d, e, f, g, h, a, K(23), M(23) );
-      R( a, b, c, d, e, f, g, h, K(24), M(24) );
-      R( h, a, b, c, d, e, f, g, K(25), M(25) );
-      R( g, h, a, b, c, d, e, f, K(26), M(26) );
-      R( f, g, h, a, b, c, d, e, K(27), M(27) );
-      R( e, f, g, h, a, b, c, d, K(28), M(28) );
-      R( d, e, f, g, h, a, b, c, K(29), M(29) );
-      R( c, d, e, f, g, h, a, b, K(30), M(30) );
-      R( b, c, d, e, f, g, h, a, K(31), M(31) );
-      R( a, b, c, d, e, f, g, h, K(32), M(32) );
-      R( h, a, b, c, d, e, f, g, K(33), M(33) );
-      R( g, h, a, b, c, d, e, f, K(34), M(34) );
-      R( f, g, h, a, b, c, d, e, K(35), M(35) );
-      R( e, f, g, h, a, b, c, d, K(36), M(36) );
-      R( d, e, f, g, h, a, b, c, K(37), M(37) );
-      R( c, d, e, f, g, h, a, b, K(38), M(38) );
-      R( b, c, d, e, f, g, h, a, K(39), M(39) );
-      R( a, b, c, d, e, f, g, h, K(40), M(40) );
-      R( h, a, b, c, d, e, f, g, K(41), M(41) );
-      R( g, h, a, b, c, d, e, f, K(42), M(42) );
-      R( f, g, h, a, b, c, d, e, K(43), M(43) );
-      R( e, f, g, h, a, b, c, d, K(44), M(44) );
-      R( d, e, f, g, h, a, b, c, K(45), M(45) );
-      R( c, d, e, f, g, h, a, b, K(46), M(46) );
-      R( b, c, d, e, f, g, h, a, K(47), M(47) );
-      R( a, b, c, d, e, f, g, h, K(48), M(48) );
-      R( h, a, b, c, d, e, f, g, K(49), M(49) );
-      R( g, h, a, b, c, d, e, f, K(50), M(50) );
-      R( f, g, h, a, b, c, d, e, K(51), M(51) );
-      R( e, f, g, h, a, b, c, d, K(52), M(52) );
-      R( d, e, f, g, h, a, b, c, K(53), M(53) );
-      R( c, d, e, f, g, h, a, b, K(54), M(54) );
-      R( b, c, d, e, f, g, h, a, K(55), M(55) );
-      R( a, b, c, d, e, f, g, h, K(56), M(56) );
-      R( h, a, b, c, d, e, f, g, K(57), M(57) );
-      R( g, h, a, b, c, d, e, f, K(58), M(58) );
-      R( f, g, h, a, b, c, d, e, K(59), M(59) );
-      R( e, f, g, h, a, b, c, d, K(60), M(60) );
-      R( d, e, f, g, h, a, b, c, K(61), M(61) );
-      R( c, d, e, f, g, h, a, b, K(62), M(62) );
-      R( b, c, d, e, f, g, h, a, K(63), M(63) );
-      R( a, b, c, d, e, f, g, h, K(64), M(64) );
-      R( h, a, b, c, d, e, f, g, K(65), M(65) );
-      R( g, h, a, b, c, d, e, f, K(66), M(66) );
-      R( f, g, h, a, b, c, d, e, K(67), M(67) );
-      R( e, f, g, h, a, b, c, d, K(68), M(68) );
-      R( d, e, f, g, h, a, b, c, K(69), M(69) );
-      R( c, d, e, f, g, h, a, b, K(70), M(70) );
-      R( b, c, d, e, f, g, h, a, K(71), M(71) );
-      R( a, b, c, d, e, f, g, h, K(72), M(72) );
-      R( h, a, b, c, d, e, f, g, K(73), M(73) );
-      R( g, h, a, b, c, d, e, f, K(74), M(74) );
-      R( f, g, h, a, b, c, d, e, K(75), M(75) );
-      R( e, f, g, h, a, b, c, d, K(76), M(76) );
-      R( d, e, f, g, h, a, b, c, K(77), M(77) );
-      R( c, d, e, f, g, h, a, b, K(78), M(78) );
-      R( b, c, d, e, f, g, h, a, K(79), M(79) );
+      R( a, b, c, d, e, f, g, h, X,  0 );
+      R( h, a, b, c, d, e, f, g, X,  1 );
+      R( g, h, a, b, c, d, e, f, X,  2 );
+      R( f, g, h, a, b, c, d, e, X,  3 );
+      R( e, f, g, h, a, b, c, d, X,  4 );
+      R( d, e, f, g, h, a, b, c, X,  5 );
+      R( c, d, e, f, g, h, a, b, X,  6 );
+      R( b, c, d, e, f, g, h, a, X,  7 );
+      R( a, b, c, d, e, f, g, h, X,  8 );
+      R( h, a, b, c, d, e, f, g, X,  9 );
+      R( g, h, a, b, c, d, e, f, X, 10 );
+      R( f, g, h, a, b, c, d, e, X, 11 );
+      R( e, f, g, h, a, b, c, d, X, 12 );
+      R( d, e, f, g, h, a, b, c, X, 13 );
+      R( c, d, e, f, g, h, a, b, X, 14 );
+      R( b, c, d, e, f, g, h, a, X, 15 );
+      R( a, b, c, d, e, f, g, h, M, 16 );
+      R( h, a, b, c, d, e, f, g, M, 17 );
+      R( g, h, a, b, c, d, e, f, M, 18 );
+      R( f, g, h, a, b, c, d, e, M, 19 );
+      R( e, f, g, h, a, b, c, d, M, 20 );
+      R( d, e, f, g, h, a, b, c, M, 21 );
+      R( c, d, e, f, g, h, a, b, M, 22 );
+      R( b, c, d, e, f, g, h, a, M, 23 );
+      R( a, b, c, d, e, f, g, h, M, 24 );
+      R( h, a, b, c, d, e, f, g, M, 25 );
+      R( g, h, a, b, c, d, e, f, M, 26 );
+      R( f, g, h, a, b, c, d, e, M, 27 );
+      R( e, f, g, h, a, b, c, d, M, 28 );
+      R( d, e, f, g, h, a, b, c, M, 29 );
+      R( c, d, e, f, g, h, a, b, M, 30 );
+      R( b, c, d, e, f, g, h, a, M, 31 );
+      R( a, b, c, d, e, f, g, h, M, 32 );
+      R( h, a, b, c, d, e, f, g, M, 33 );
+      R( g, h, a, b, c, d, e, f, M, 34 );
+      R( f, g, h, a, b, c, d, e, M, 35 );
+      R( e, f, g, h, a, b, c, d, M, 36 );
+      R( d, e, f, g, h, a, b, c, M, 37 );
+      R( c, d, e, f, g, h, a, b, M, 38 );
+      R( b, c, d, e, f, g, h, a, M, 39 );
+      R( a, b, c, d, e, f, g, h, M, 40 );
+      R( h, a, b, c, d, e, f, g, M, 41 );
+      R( g, h, a, b, c, d, e, f, M, 42 );
+      R( f, g, h, a, b, c, d, e, M, 43 );
+      R( e, f, g, h, a, b, c, d, M, 44 );
+      R( d, e, f, g, h, a, b, c, M, 45 );
+      R( c, d, e, f, g, h, a, b, M, 46 );
+      R( b, c, d, e, f, g, h, a, M, 47 );
+      R( a, b, c, d, e, f, g, h, M, 48 );
+      R( h, a, b, c, d, e, f, g, M, 49 );
+      R( g, h, a, b, c, d, e, f, M, 50 );
+      R( f, g, h, a, b, c, d, e, M, 51 );
+      R( e, f, g, h, a, b, c, d, M, 52 );
+      R( d, e, f, g, h, a, b, c, M, 53 );
+      R( c, d, e, f, g, h, a, b, M, 54 );
+      R( b, c, d, e, f, g, h, a, M, 55 );
+      R( a, b, c, d, e, f, g, h, M, 56 );
+      R( h, a, b, c, d, e, f, g, M, 57 );
+      R( g, h, a, b, c, d, e, f, M, 58 );
+      R( f, g, h, a, b, c, d, e, M, 59 );
+      R( e, f, g, h, a, b, c, d, M, 60 );
+      R( d, e, f, g, h, a, b, c, M, 61 );
+      R( c, d, e, f, g, h, a, b, M, 62 );
+      R( b, c, d, e, f, g, h, a, M, 63 );
+      R( a, b, c, d, e, f, g, h, M, 64 );
+      R( h, a, b, c, d, e, f, g, M, 65 );
+      R( g, h, a, b, c, d, e, f, M, 66 );
+      R( f, g, h, a, b, c, d, e, M, 67 );
+      R( e, f, g, h, a, b, c, d, M, 68 );
+      R( d, e, f, g, h, a, b, c, M, 69 );
+      R( c, d, e, f, g, h, a, b, M, 70 );
+      R( b, c, d, e, f, g, h, a, M, 71 );
+      R( a, b, c, d, e, f, g, h, M, 72 );
+      R( h, a, b, c, d, e, f, g, M, 73 );
+      R( g, h, a, b, c, d, e, f, M, 74 );
+      R( f, g, h, a, b, c, d, e, M, 75 );
+      R( e, f, g, h, a, b, c, d, M, 76 );
+      R( d, e, f, g, h, a, b, c, M, 77 );
+      R( c, d, e, f, g, h, a, b, M, 78 );
+      R( b, c, d, e, f, g, h, a, M, 79 );
 
       a = ctx->state[0] = u64plus (ctx->state[0], a);
       b = ctx->state[1] = u64plus (ctx->state[1], b);
@@ -615,5 +611,6 @@
       f = ctx->state[5] = u64plus (ctx->state[5], f);
       g = ctx->state[6] = u64plus (ctx->state[6], g);
       h = ctx->state[7] = u64plus (ctx->state[7], h);
+      words += 16;
     }
 }

Reply via email to