This is some low-level crypto code: an MMX implementation of Dan
Bernstein's "ChaCha" pseudorandom function.  The input is a 4x4 array
of 32-bit words, and mixing proceeds down either columns or diagonals.
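
For reference, here's the quarter-round in plain scalar C (a sketch of
the standard definition, not part of the MMX code below, which applies
it to two columns at a time, one per v2si lane):

#define ROTL32(v, k)    (((v) << (k)) | ((v) >> (32 - (k))))
#define QUARTERROUND(a, b, c, d) do {                   \
                a += b; d ^= a; d = ROTL32(d, 16);      \
                c += d; b ^= c; b = ROTL32(b, 12);      \
                a += b; d ^= a; d = ROTL32(d,  8);      \
                c += d; b ^= c; b = ROTL32(b,  7);      \
        } while (0)

A column round applies that to each of the four columns; a diagonal
round applies it to each of the four diagonals.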

Thus, the implementation keeps each row in a pair of MMX registers,
mixes down the columns, then swizzles the rows (a shear), mixes down
the columns again, and finally unshears.
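
In scalar terms, that shear is the usual ChaCha diagonalization:
rotate row r of the 4x4 state left by r words so the diagonals line
up as columns.  A sketch on a plain array (the MMX code below gets
the equivalent effect with punpck shuffles plus the implicit swaps
described in its comments):

static void shear(uint32_t x[4][4])
{
        uint32_t row[4];
        int r, c;

        for (r = 1; r < 4; r++) {
                for (c = 0; c < 4; c++)
                        row[c] = x[r][(c + r) % 4];
                for (c = 0; c < 4; c++)
                        x[r][c] = row[c];
        }
}

Unshearing is the same loop with (c + 4 - r) % 4.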

It maps very nicely to SSE2 registers, but I was trying to write an MMX
implementation for completeness.  This is tricky because I really need
9 registers but have only 8.

I could of course write this in straight assembly, but I was trying to get
gcc to do the instruction scheduling for me.  I have progressively added
more and more "keep this in MMX registers, damn it!" hints to the source,
but GCC keeps generating preposterously large stack frames.
(The 516-byte frame in this example is better than the 2000+ bytes I started
with before adding all the explicit register specifications.)

I realize that the register pressure is extreme, but I'm handing gcc
statements that map directly to 2-address instructions, and I'm not sure
how much more I can do.
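
For example, each statement in the OP macro below is already in
destructive two-operand form, so I'd hope the first OP(d, a0, b0, 16)
expands more or less one-for-one, given the register assignments in
chacha1():

        a0 = __builtin_ia32_paddd(a0, b0);      /* paddd  %mm2, %mm0 */
        d  = __builtin_ia32_pxor(d, a0);        /* pxor   %mm0, %mm6 */
        t  = d;                                 /* movq   %mm6, %mm7 */
        d  = __builtin_ia32_pslldi(d, 16);      /* pslld  $16, %mm6  */
        t  = __builtin_ia32_psrldi(t, 16);      /* psrld  $16, %mm7  */
        d  = __builtin_ia32_por(d, t);          /* por    %mm7, %mm6 */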

Is there some elementary mistake I'm making?  Or should I just stop being cruel
to the compiler?

System is (32-bit) Debian Linux, gcc version 4.6.1 20110524 (prerelease)
(Debian 4.6.0-9).  The compile command is:

cc -W -Wall -Os -fomit-frame-pointer -march=pentium2 -mmmx -mno-sse -S chacha1.c

gcc -v reports:
Using built-in specs.
COLLECT_GCC=gcc
COLLECT_LTO_WRAPPER=/usr/lib/gcc/i486-linux-gnu/4.6.1/lto-wrapper
Target: i486-linux-gnu
Configured with: ../src/configure -v --with-pkgversion='Debian 4.6.0-9' 
--with-bugurl=file:///usr/share/doc/gcc-4.6/README.Bugs 
--enable-languages=c,c++,fortran,objc,obj-c++,go --prefix=/usr 
--program-suffix=-4.6 --enable-shared --enable-multiarch 
--with-multiarch-defaults=i386-linux-gnu --enable-linker-build-id 
--with-system-zlib --libexecdir=/usr/lib --without-included-gettext 
--enable-threads=posix --with-gxx-include-dir=/usr/include/c++/4.6 
--libdir=/usr/lib --enable-nls --enable-clocale=gnu --enable-libstdcxx-debug 
--enable-libstdcxx-time=yes --enable-plugin --enable-objc-gc 
--enable-targets=all --with-arch-32=i586 --with-tune=generic 
--enable-checking=release --build=i486-linux-gnu --host=i486-linux-gnu 
--target=i486-linux-gnu
Thread model: posix
gcc version 4.6.1 20110524 (prerelease) (Debian 4.6.0-9) 


Source is as follows, then generated assembly.

#include <stdint.h>

/* Some types and a round constant needed everywhere */
typedef int32_t v4si __attribute__ ((vector_size (16)));
typedef int32_t v4si_u __attribute__ ((vector_size (16), aligned(4)));
typedef int32_t v2si __attribute__ ((vector_size (8)));

extern v4si const sigma;

#define ROUNDS 12       /* 8, 12, or 20 */

void chacha1(uint32_t const key[8], uint32_t const iv[4], uint32_t *__restrict 
out);
void chacha2(uint32_t const key[8], uint32_t const iv[4], uint32_t *__restrict 
out);
void chacha3(uint32_t const key[8], uint32_t const iv[4], uint32_t *__restrict 
out);

/* Version 1: an mmx implementation */

/* The basic quarter round: x ^= y += z; x <<<= k; (rotate) */
#if 1
#define OP(x,y,z,k) do { \
                register v2si t  asm("%mm7");           \
                y = __builtin_ia32_paddd(y, z);         \
                x = __builtin_ia32_pxor(x, y);          \
                t = x;                                  \
                x = __builtin_ia32_pslldi(x, k);        \
                t = __builtin_ia32_psrldi(t, 32-k);     \
                x = __builtin_ia32_por(x, t);           \
        } while (0)
#else
#define OP(x,y,z,k) ( \
                x ^= y += z,                            \
                x = __builtin_ia32_pslldi(x, k) |       \
                    __builtin_ia32_psrldi(x, 32-k)      \
        )
#endif

/* Rotate words right 32 bits */
/* If the words of y:x are 3:2:1:0, rotate right to 0:3:2:1 */
/* Little-endian, that's 0123 -> 1230 */
#define ROTW(x,y) do { \
                register v2si t  asm("%mm7") = t;       \
                t = __builtin_ia32_punpckldq(t, x);     \
                x = __builtin_ia32_punpckhdq(x, x);     \
                x = __builtin_ia32_punpckldq(x, y);     \
                y = __builtin_ia32_punpckhdq(y, t);     \
        } while(0)
        
void
chacha1(uint32_t const key[8], uint32_t const iv[4], uint32_t *__restrict out)
{
        /*
         * There aren't enough MMX registers for all this, plus
         * temporaries, so the compiler will have to do some spilling.
         */
        register v2si a0 asm("%mm0") = ((v2si const *)&sigma)[0];
        register v2si a1 asm("%mm1") = ((v2si const *)&sigma)[1];
        register v2si b0 asm("%mm2") = ((v2si const *)key)[0];
        register v2si b1 asm("%mm3") = ((v2si const *)key)[1];
        register v2si c0 asm("%mm4") = ((v2si const *)key)[2];
        register v2si c1 asm("%mm5") = ((v2si const *)key)[3];
        register v2si d  asm("%mm6") = ((v2si const *)iv)[0];
        v2si dd[2];     /* On stack */
        int i;

        dd[1] = ((v2si const *)iv)[1];

        for (i = 0; i < ROUNDS/4; i++) {
//asm("# OP 1" :: "y" (d), "y" (a0), "y" (b0));
                OP(d, a0, b0, 16);
//asm("# OP 2" :: "y" (b0), "y" (c0), "y" (d));
                OP(b0, c0, d, 12);
//asm("# OP 3" :: "y" (d), "y" (a0), "y" (b0));
                OP(d, a0, b0, 8);
//asm("# OP 4" :: "y" (b0), "y" (c0), "y" (d));
                OP(b0, c0, d, 7);

                dd[0] = d;
                d = dd[1];

//asm("# OP 1" :: "y" (d), "y" (a1), "y" (b1));
                OP(d, a1, b1, 16);
//asm("# OP 2" :: "y" (b1), "y" (c1), "y" (d));
                OP(b1, c1, d, 12);
//asm("# OP 3" :: "y" (d), "y" (a1), "y" (b1));
                OP(d, a1, b1, 8);
//asm("# OP 4" :: "y" (b1), "y" (c1), "y" (d));
                OP(b1, c1, d, 7);

                /* Our ROTW converts 0123 to 1230.  To get
                 * the other orders, combine with implicit swaps.
                 *
                 * a: 0123 -> 3012      ROTW + swap halves
                 * b: 0123    0123      No change
                 * c: 0123    1230      ROTW
                 * d: 0123    2301      swap halves, i.e. DON'T reload
                 */
//asm("# Swap" :: "y" (a0), "y" (a1), "y" (c0), "y" (c1));
                ROTW(a0,a1);
                ROTW(c0,c1);

                /* Same as above, but with a0/a1 and d0/d1 swapped */
                OP(d, a1, b0, 16);
                OP(b0, c0, d, 12);
                OP(d, a1, b0, 8);
                OP(b0, c0, d, 7);

                dd[1] = d;
                d = dd[0];

                OP(d, a0, b1, 16);
                OP(b1, c1, d, 12);
                OP(d, a0, b1, 8);
                OP(b1, c1, d, 7);

                ROTW(a0,a1);
                ROTW(c0,c1);

                /* Now a and c are swapped, but not b or d */
                OP(d, a1, b0, 16);
                OP(b0, c1, d, 12);
                OP(d, a1, b0, 8);
                OP(b0, c1, d, 7);

                dd[0] = d;
                d = dd[1];

                OP(d, a0, b1, 16);
                OP(b1, c0, d, 12);
                OP(d, a0, b1, 8);
                OP(b1, c0, d, 7);

                ROTW(a0,a1);
                ROTW(c0,c1);
                /* And a fourth round.  c and d swapped */

                OP(d, a0, b0, 16);
                OP(b0, c1, d, 12);
                OP(d, a0, b0, 8);
                OP(b0, c1, d, 7);

                dd[1] = d;
                d = dd[0];

                OP(d, a1, b1, 16);
                OP(b1, c0, d, 12);
                OP(d, a1, b1, 8);
                OP(b1, c0, d, 7);

                ROTW(a0,a1);
                ROTW(c0,c1);
        }

        ((v2si *)out)[0] = a0   += ((v2si const *)&sigma)[0];
        ((v2si *)out)[1] = a1   += ((v2si const *)&sigma)[1];
        ((v2si *)out)[2] = b0   += ((v2si const *)key)[0];
        ((v2si *)out)[3] = b1   += ((v2si const *)key)[1];
        ((v2si *)out)[4] = c0   += ((v2si const *)key)[2];
        ((v2si *)out)[5] = c1   += ((v2si const *)key)[3];
        ((v2si *)out)[6] = d    += ((v2si const *)iv)[0];
        ((v2si *)out)[7] = dd[1] + ((v2si const *)iv)[1];
}


        .file   "chacha1.c"
        .text
        .globl  chacha1
        .type   chacha1, @function
chacha1:
.LFB0:
        .cfi_startproc
        pushl   %ebp
        .cfi_def_cfa_offset 8
        .cfi_offset 5, -8
        pushl   %edi
        .cfi_def_cfa_offset 12
        .cfi_offset 7, -12
        pushl   %esi
        .cfi_def_cfa_offset 16
        .cfi_offset 6, -16
        pushl   %ebx
        .cfi_def_cfa_offset 20
        .cfi_offset 3, -20
        subl    $516, %esp
        .cfi_def_cfa_offset 536
        movl    540(%esp), %edx
        movl    536(%esp), %ebx
        movl    540(%esp), %edi
        movl    sigma, %ebp
        movl    (%edx), %eax
        movl    4(%edx), %edx
        movl    8(%edi), %esi
        movl    12(%edi), %edi
        movl    %eax, 16(%esp)
        movl    (%ebx), %eax
        movl    %edx, 20(%esp)
        movl    4(%ebx), %edx
        movl    %esi, 8(%esp)
        movl    20(%ebx), %ecx
        movl    %edi, 12(%esp)
        movl    8(%ebx), %esi
        movl    %eax, (%esp)
        movl    12(%ebx), %edi
        movl    %edx, 4(%esp)
        movl    sigma+4, %edx
        movl    sigma+8, %eax
        movl    %ebp, 24(%esp)
        movl    %ecx, 44(%esp)
        movl    %edx, 28(%esp)
        movl    16(%ebx), %edx
        movl    %esi, 32(%esp)
        movl    %edi, 36(%esp)
        movl    %edx, 40(%esp)
        movl    sigma+12, %edx
        movl    %eax, 48(%esp)
        movl    24(%ebx), %eax
        movq    %mm7, 64(%esp)
        movq    %mm7, 72(%esp)
        movq    %mm7, 80(%esp)
        movq    %mm7, 88(%esp)
        movq    %mm7, 96(%esp)
        movq    %mm7, 104(%esp)
        movq    %mm7, 112(%esp)
        movl    %edx, 52(%esp)
        movl    28(%ebx), %edx
        movq    %mm7, 120(%esp)
        movl    %eax, 56(%esp)
        movl    $3, %eax
        movl    %edx, 60(%esp)
.L2:
        movq    24(%esp), %mm0
        paddd   (%esp), %mm0
        movq    16(%esp), %mm1
        movq    (%esp), %mm4
        movq    48(%esp), %mm6
        paddd   32(%esp), %mm6
        movq    32(%esp), %mm7
        decl    %eax
        pxor    %mm0, %mm1
        movq    %mm1, %mm3
        psrld   $16, %mm1
        pslld   $16, %mm3
        por     %mm1, %mm3
        movq    40(%esp), %mm1
        paddd   %mm3, %mm1
        pxor    %mm1, %mm4
        movq    %mm4, %mm2
        psrld   $20, %mm4
        pslld   $12, %mm2
        por     %mm4, %mm2
        paddd   %mm2, %mm0
        pxor    %mm0, %mm3
        movq    %mm3, %mm4
        psrld   $24, %mm3
        pslld   $8, %mm4
        por     %mm4, %mm3
        paddd   %mm3, %mm1
        movq    %mm3, (%esp)
        pxor    %mm1, %mm2
        movq    %mm2, %mm4
        psrld   $25, %mm2
        pslld   $7, %mm4
        por     %mm2, %mm4
        movq    8(%esp), %mm2
        pxor    %mm6, %mm2
        movq    %mm2, %mm5
        psrld   $16, %mm2
        pslld   $16, %mm5
        por     %mm2, %mm5
        movq    56(%esp), %mm2
        paddd   %mm5, %mm2
        pxor    %mm2, %mm7
        movq    %mm7, %mm3
        psrld   $20, %mm7
        pslld   $12, %mm3
        por     %mm7, %mm3
        movq    %mm5, %mm7
        paddd   %mm3, %mm6
        pxor    %mm6, %mm7
        movq    %mm7, %mm5
        psrld   $24, %mm7
        pslld   $8, %mm5
        por     %mm7, %mm5
        movq    %mm3, %mm7
        paddd   %mm5, %mm2
        pxor    %mm2, %mm7
        movq    %mm7, %mm3
        psrld   $25, %mm7
        pslld   $7, %mm3
        por     %mm7, %mm3
        movq    120(%esp), %mm7
        punpckldq       %mm0, %mm7
        punpckhdq       %mm0, %mm0
        movq    %mm7, 120(%esp)
        punpckldq       %mm6, %mm0
        punpckhdq       %mm7, %mm6
        movq    64(%esp), %mm7
        punpckldq       %mm1, %mm7
        punpckhdq       %mm1, %mm1
        paddd   %mm3, %mm0
        paddd   %mm4, %mm6
        movq    %mm7, 64(%esp)
        punpckldq       %mm2, %mm1
        punpckhdq       %mm7, %mm2
        movq    %mm5, %mm7
        pxor    %mm6, %mm7
        movq    %mm7, %mm5
        psrld   $16, %mm7
        pslld   $16, %mm5
        por     %mm7, %mm5
        movq    %mm4, %mm7
        paddd   %mm5, %mm1
        pxor    %mm1, %mm7
        movq    %mm7, %mm4
        psrld   $20, %mm7
        pslld   $12, %mm4
        por     %mm7, %mm4
        paddd   %mm4, %mm6
        pxor    %mm6, %mm5
        movq    %mm5, %mm7
        psrld   $24, %mm5
        pslld   $8, %mm7
        por     %mm7, %mm5
        paddd   %mm5, %mm1
        movq    %mm5, 8(%esp)
        pxor    %mm1, %mm4
        movq    %mm4, %mm5
        psrld   $25, %mm4
        pslld   $7, %mm5
        por     %mm4, %mm5
        movq    (%esp), %mm4
        pxor    %mm0, %mm4
        movq    %mm4, %mm7
        psrld   $16, %mm4
        pslld   $16, %mm7
        por     %mm4, %mm7
        paddd   %mm7, %mm2
        pxor    %mm2, %mm3
        movq    %mm3, %mm4
        psrld   $20, %mm3
        pslld   $12, %mm4
        por     %mm3, %mm4
        paddd   %mm4, %mm0
        pxor    %mm0, %mm7
        movq    %mm7, %mm3
        psrld   $24, %mm7
        pslld   $8, %mm3
        por     %mm7, %mm3
        paddd   %mm3, %mm2
        pxor    %mm2, %mm4
        movq    %mm4, %mm7
        psrld   $25, %mm4
        pslld   $7, %mm7
        por     %mm4, %mm7
        movq    72(%esp), %mm4
        punpckldq       %mm0, %mm4
        punpckhdq       %mm0, %mm0
        movq    %mm4, 72(%esp)
        punpckldq       %mm6, %mm0
        punpckhdq       %mm4, %mm6
        movq    80(%esp), %mm4
        punpckldq       %mm1, %mm4
        punpckhdq       %mm1, %mm1
        paddd   %mm7, %mm0
        paddd   %mm5, %mm6
        pxor    %mm6, %mm3
        movq    %mm4, 80(%esp)
        punpckldq       %mm2, %mm1
        punpckhdq       %mm4, %mm2
        movq    %mm3, %mm4
        psrld   $16, %mm3
        pslld   $16, %mm4
        por     %mm3, %mm4
        paddd   %mm4, %mm2
        pxor    %mm2, %mm5
        movq    %mm5, %mm3
        psrld   $20, %mm5
        pslld   $12, %mm3
        por     %mm5, %mm3
        paddd   %mm3, %mm6
        pxor    %mm6, %mm4
        movq    %mm4, %mm5
        psrld   $24, %mm4
        pslld   $8, %mm5
        por     %mm5, %mm4
        movq    8(%esp), %mm5
        pxor    %mm0, %mm5
        paddd   %mm4, %mm2
        movq    %mm4, 16(%esp)
        movq    %mm3, %mm4
        pxor    %mm2, %mm4
        movq    %mm4, %mm3
        psrld   $25, %mm4
        pslld   $7, %mm3
        por     %mm4, %mm3
        movq    %mm5, %mm4
        psrld   $16, %mm5
        pslld   $16, %mm4
        por     %mm5, %mm4
        paddd   %mm4, %mm1
        pxor    %mm1, %mm7
        movq    %mm7, %mm5
        psrld   $20, %mm7
        pslld   $12, %mm5
        por     %mm7, %mm5
        movq    %mm4, %mm7
        paddd   %mm5, %mm0
        pxor    %mm0, %mm7
        movq    %mm7, %mm4
        psrld   $24, %mm7
        pslld   $8, %mm4
        por     %mm7, %mm4
        paddd   %mm4, %mm1
        pxor    %mm1, %mm5
        movq    %mm5, %mm7
        psrld   $25, %mm5
        pslld   $7, %mm7
        por     %mm5, %mm7
        movq    88(%esp), %mm5
        punpckldq       %mm0, %mm5
        punpckhdq       %mm0, %mm0
        movq    %mm5, 88(%esp)
        punpckldq       %mm6, %mm0
        punpckhdq       %mm5, %mm6
        movq    96(%esp), %mm5
        punpckldq       %mm1, %mm5
        punpckhdq       %mm1, %mm1
        paddd   %mm3, %mm0
        pxor    %mm0, %mm4
        paddd   %mm7, %mm6
        movq    %mm5, 96(%esp)
        punpckldq       %mm2, %mm1
        punpckhdq       %mm5, %mm2
        movq    %mm4, %mm5
        psrld   $16, %mm4
        pslld   $16, %mm5
        por     %mm4, %mm5
        paddd   %mm5, %mm2
        pxor    %mm2, %mm3
        movq    %mm3, %mm4
        psrld   $20, %mm3
        pslld   $12, %mm4
        por     %mm3, %mm4
        paddd   %mm4, %mm0
        pxor    %mm0, %mm5
        movq    %mm5, %mm3
        psrld   $24, %mm5
        pslld   $8, %mm3
        por     %mm3, %mm5
        movq    16(%esp), %mm3
        pxor    %mm6, %mm3
        paddd   %mm5, %mm2
        movq    %mm5, 8(%esp)
        pxor    %mm2, %mm4
        movq    %mm4, %mm5
        psrld   $25, %mm4
        pslld   $7, %mm5
        por     %mm5, %mm4
        movq    104(%esp), %mm5
        punpckldq       %mm0, %mm5
        movq    %mm4, (%esp)
        movq    %mm3, %mm4
        psrld   $16, %mm3
        pslld   $16, %mm4
        por     %mm3, %mm4
        punpckhdq       %mm0, %mm0
        paddd   %mm4, %mm1
        pxor    %mm1, %mm7
        movq    %mm7, %mm3
        psrld   $20, %mm7
        pslld   $12, %mm3
        por     %mm7, %mm3
        paddd   %mm3, %mm6
        pxor    %mm6, %mm4
        punpckldq       %mm6, %mm0
        movq    %mm4, %mm7
        psrld   $24, %mm4
        pslld   $8, %mm7
        por     %mm7, %mm4
        punpckhdq       %mm5, %mm6
        paddd   %mm4, %mm1
        movq    %mm4, 16(%esp)
        pxor    %mm1, %mm3
        movq    %mm3, %mm4
        psrld   $25, %mm3
        pslld   $7, %mm4
        por     %mm4, %mm3
        movq    %mm3, 32(%esp)
        movq    %mm5, 104(%esp)
        movq    112(%esp), %mm7
        punpckldq       %mm1, %mm7
        movq    %mm0, 24(%esp)
        movq    %mm6, 48(%esp)
        punpckhdq       %mm1, %mm1
        movq    %mm7, 112(%esp)
        movq    %mm1, %mm0
        punpckldq       %mm2, %mm0
        punpckhdq       %mm7, %mm2
        movq    %mm0, 40(%esp)
        movq    %mm2, 56(%esp)
        jne     .L2
        movl    28(%esp), %edi
        movl    %ebp, %eax
        xorl    %ecx, %ecx
        movl    24(%esp), %edx
        movl    %ecx, 492(%esp)
        movl    24(%esp), %esi
        movl    %ebp, 472(%esp)
        movl    %edi, 484(%esp)
        movl    sigma+4, %edi
        addl    %edx, %eax
        movl    484(%esp), %ecx
        movl    %eax, 488(%esp)
        movl    488(%esp), %eax
        movl    %edi, 476(%esp)
        movl    544(%esp), %edi
        addl    476(%esp), %ecx
        movl    %esi, 480(%esp)
        movl    48(%esp), %esi
        movl    %eax, 464(%esp)
        movl    %ecx, 468(%esp)
        movl    464(%esp), %eax
        xorl    %ecx, %ecx
        movl    468(%esp), %edx
        movl    %eax, (%edi)
        movl    sigma+8, %eax
        movl    %edx, 4(%edi)
        movl    52(%esp), %edi
        movl    48(%esp), %edx
        movl    %esi, 432(%esp)
        movl    sigma+8, %esi
        movl    %ecx, 444(%esp)
        movl    %edi, 436(%esp)
        movl    sigma+12, %edi
        addl    %edx, %eax
        movl    436(%esp), %ecx
        movl    %eax, 440(%esp)
        movl    440(%esp), %eax
        movl    %esi, 424(%esp)
        movl    %edi, 428(%esp)
        movl    544(%esp), %edi
        addl    428(%esp), %ecx
        movl    %eax, 416(%esp)
        movl    (%esp), %esi
        movl    416(%esp), %eax
        movl    %ecx, 420(%esp)
        xorl    %ecx, %ecx
        movl    420(%esp), %edx
        movl    %eax, 8(%edi)
        movl    %edx, 12(%edi)
        movl    (%esp), %edx
        movl    (%ebx), %ebp
        movl    4(%esp), %edi
        movl    %ecx, 396(%esp)
        movl    %esi, 384(%esp)
        movl    32(%esp), %esi
        movl    %ebp, %eax
        addl    %edx, %eax
        movl    %eax, 392(%esp)
        movl    %edi, 388(%esp)
        movl    392(%esp), %eax
        movl    4(%ebx), %edi
        movl    388(%esp), %ecx
        movl    %ebp, 376(%esp)
        movl    %eax, 368(%esp)
        movl    %edi, 380(%esp)
        movl    368(%esp), %eax
        addl    380(%esp), %ecx
        movl    544(%esp), %edi
        movl    %ecx, 372(%esp)
        xorl    %ecx, %ecx
        movl    372(%esp), %edx
        movl    %eax, 16(%edi)
        movl    %edx, 20(%edi)
        movl    32(%esp), %edx
        movl    8(%ebx), %ebp
        movl    36(%esp), %edi
        movl    %ecx, 348(%esp)
        movl    %ebp, %eax
        addl    %edx, %eax
        movl    %eax, 344(%esp)
        movl    344(%esp), %eax
        movl    %esi, 336(%esp)
        movl    40(%esp), %esi
        movl    %edi, 340(%esp)
        movl    12(%ebx), %edi
        movl    340(%esp), %ecx
        movl    %eax, 320(%esp)
        movl    320(%esp), %eax
        movl    %edi, 332(%esp)
        movl    544(%esp), %edi
        addl    332(%esp), %ecx
        movl    %ebp, 328(%esp)
        movl    %ecx, 324(%esp)
        xorl    %ecx, %ecx
        movl    324(%esp), %edx
        movl    %eax, 24(%edi)
        movl    %edx, 28(%edi)
        movl    40(%esp), %edx
        movl    16(%ebx), %ebp
        movl    44(%esp), %edi
        movl    %ecx, 300(%esp)
        movl    %esi, 288(%esp)
        movl    %ebp, %eax
        addl    %edx, %eax
        movl    %eax, 296(%esp)
        movl    %edi, 292(%esp)
        movl    296(%esp), %eax
        movl    20(%ebx), %edi
        movl    292(%esp), %ecx
        movl    %ebp, 280(%esp)
        movl    %eax, 272(%esp)
        movl    %edi, 284(%esp)
        movl    272(%esp), %eax
        addl    284(%esp), %ecx
        movl    544(%esp), %edi
        movl    %ecx, 276(%esp)
        xorl    %ecx, %ecx
        movl    276(%esp), %edx
        movl    %eax, 32(%edi)
        movl    %edx, 36(%edi)
        movl    24(%ebx), %ebp
        movl    56(%esp), %edx
        movl    60(%esp), %edi
        movl    28(%ebx), %ebx
        movl    %ebp, %eax
        movl    56(%esp), %esi
        addl    %edx, %eax
        movl    %eax, 248(%esp)
        movl    248(%esp), %eax
        movl    %edi, 244(%esp)
        movl    544(%esp), %edi
        movl    %ecx, 252(%esp)
        movl    244(%esp), %ecx
        movl    %ebx, 236(%esp)
        movl    %eax, 224(%esp)
        addl    236(%esp), %ecx
        movl    224(%esp), %eax
        movl    %esi, 240(%esp)
        movl    16(%esp), %esi
        movl    %ecx, 228(%esp)
        xorl    %ecx, %ecx
        movl    228(%esp), %edx
        movl    %ebp, 232(%esp)
        movl    %eax, 40(%edi)
        movl    540(%esp), %eax
        movl    %edx, 44(%edi)
        movl    16(%esp), %edx
        movl    20(%esp), %edi
        movl    (%eax), %ebx
        movl    4(%eax), %ebp
        movl    %ecx, 204(%esp)
        movl    %edi, 196(%esp)
        movl    %ebx, %eax
        movl    196(%esp), %ecx
        addl    %edx, %eax
        movl    %eax, 200(%esp)
        movl    200(%esp), %eax
        movl    %esi, 192(%esp)
        movl    %ebx, 184(%esp)
        movl    544(%esp), %edi
        movl    %ebp, 188(%esp)
        movl    8(%esp), %esi
        movl    %eax, 176(%esp)
        addl    188(%esp), %ecx
        movl    176(%esp), %eax
        movl    %ecx, 180(%esp)
        xorl    %ecx, %ecx
        movl    180(%esp), %edx
        movl    %eax, 48(%edi)
        movl    540(%esp), %eax
        movl    %edx, 52(%edi)
        movl    8(%esp), %edx
        movl    8(%eax), %ebx
        movl    12(%esp), %edi
        movl    12(%eax), %ebp
        movl    %ecx, 156(%esp)
        movl    %ebx, %eax
        addl    %edx, %eax
        movl    %edi, 148(%esp)
        movl    544(%esp), %edi
        movl    %eax, 152(%esp)
        movl    148(%esp), %ecx
        movl    152(%esp), %eax
        movl    %ebp, 140(%esp)
        addl    140(%esp), %ecx
        movl    %esi, 144(%esp)
        movl    %eax, 128(%esp)
        movl    %ecx, 132(%esp)
        movl    128(%esp), %eax
        movl    132(%esp), %edx
        movl    %ebx, 136(%esp)
        movl    %eax, 56(%edi)
        movl    %edx, 60(%edi)
        addl    $516, %esp
        .cfi_def_cfa_offset 20
        popl    %ebx
        .cfi_def_cfa_offset 16
        .cfi_restore 3
        popl    %esi
        .cfi_def_cfa_offset 12
        .cfi_restore 6
        popl    %edi
        .cfi_def_cfa_offset 8
        .cfi_restore 7
        popl    %ebp
        .cfi_def_cfa_offset 4
        .cfi_restore 5
        ret
        .cfi_endproc
.LFE0:
        .size   chacha1, .-chacha1
        .ident  "GCC: (Debian 4.6.0-9) 4.6.1 20110524 (prerelease)"
        .section        .note.GNU-stack,"",@progbits
