This is some low-level crypto code: an MMX implementation of Dan Bernstein's "ChaCha" pseudorandom function. The state is a 4x4 array of 32-bit words, and mixing proceeds down either the columns or the diagonals.
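(For reference, this is the standard scalar ChaCha quarter-round and double-round that everything below computes four copies of in parallel; qround(), double_round() and ROTL32 are just illustrative names, not part of my code:)

    #include <stdint.h>

    #define ROTL32(v, n) (((v) << (n)) | ((v) >> (32 - (n))))

    /* One quarter-round: mix four words of the 16-word state. */
    static void qround(uint32_t x[16], int a, int b, int c, int d)
    {
        x[a] += x[b]; x[d] ^= x[a]; x[d] = ROTL32(x[d], 16);
        x[c] += x[d]; x[b] ^= x[c]; x[b] = ROTL32(x[b], 12);
        x[a] += x[b]; x[d] ^= x[a]; x[d] = ROTL32(x[d], 8);
        x[c] += x[d]; x[b] ^= x[c]; x[b] = ROTL32(x[b], 7);
    }

    /* One double-round: the four columns, then the four diagonals. */
    static void double_round(uint32_t x[16])
    {
        qround(x, 0, 4,  8, 12);        /* columns */
        qround(x, 1, 5,  9, 13);
        qround(x, 2, 6, 10, 14);
        qround(x, 3, 7, 11, 15);
        qround(x, 0, 5, 10, 15);        /* diagonals */
        qround(x, 1, 6, 11, 12);
        qround(x, 2, 7,  8, 13);
        qround(x, 3, 4,  9, 14);
    }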
Thus, the implementation keeps each row in a pair of MMX registers, mixes down the columns, swizzles the rows (a shear, so that the diagonals line up as columns), mixes down the columns again, and then unshears. This maps very nicely to SSE2 registers, but I was trying to write an MMX implementation for completeness.

This is tricky because I really need 9 registers but have only 8. I could of course write it in straight assembly, but I was trying to get GCC to do the instruction scheduling for me. I have progressively added more and more "keep this in MMX registers, damn it!" hints to the source, yet GCC keeps generating preposterously large stack frames. (The 516 bytes in this example is an improvement over the 2000+ bytes I started with, before adding all the explicit register specifications.)

I realize the register pressure is extreme, but I'm handing GCC statements that map directly to 2-address instructions, and I'm not sure what more I can do. Is there some elementary mistake I'm making? Or should I just stop being cruel to the compiler?

The system is (32-bit) Debian Linux, gcc version 4.6.1 20110524 (prerelease) (Debian 4.6.0-9), compiling with:

    $ cc -W -Wall -Os -fomit-frame-pointer -march=pentium2 -mmmx -mno-sse -S chacha1.c

    $ gcc -v
    Using built-in specs.
    COLLECT_GCC=gcc
    COLLECT_LTO_WRAPPER=/usr/lib/gcc/i486-linux-gnu/4.6.1/lto-wrapper
    Target: i486-linux-gnu
    Configured with: ../src/configure -v --with-pkgversion='Debian 4.6.0-9' --with-bugurl=file:///usr/share/doc/gcc-4.6/README.Bugs --enable-languages=c,c++,fortran,objc,obj-c++,go --prefix=/usr --program-suffix=-4.6 --enable-shared --enable-multiarch --with-multiarch-defaults=i386-linux-gnu --enable-linker-build-id --with-system-zlib --libexecdir=/usr/lib --without-included-gettext --enable-threads=posix --with-gxx-include-dir=/usr/include/c++/4.6 --libdir=/usr/lib --enable-nls --enable-clocale=gnu --enable-libstdcxx-debug --enable-libstdcxx-time=yes --enable-plugin --enable-objc-gc --enable-targets=all --with-arch-32=i586 --with-tune=generic --enable-checking=release --build=i486-linux-gnu --host=i486-linux-gnu --target=i486-linux-gnu
    Thread model: posix
    gcc version 4.6.1 20110524 (prerelease) (Debian 4.6.0-9)
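(To illustrate the "maps very nicely to SSE2" remark: with 128-bit registers each row fits in a single XMM register, each add/xor/rotate step works on a whole row at once, and the shear is a single pshufd per row. An untested sketch, with OP128/ROTW128 as illustrative names only; it of course wants -msse2 rather than -mno-sse:)

    #include <stdint.h>

    typedef int32_t v4si __attribute__ ((vector_size (16)));

    /* x ^= y += z; x <<<= k, on all four words of a row at once */
    #define OP128(x,y,z,k) do { \
        y += z; \
        x ^= y; \
        x = __builtin_ia32_pslldi128(x, k) | \
            __builtin_ia32_psrldi128(x, 32-(k)); \
    } while (0)

    /* Rotate a row's words 0123 -> 1230: a single shuffle */
    #define ROTW128(r) ((r) = __builtin_ia32_pshufd((r), 0x39))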
Source is as follows, then the generated assembly.

    #include <stdint.h>

    /* Some types and a round constant needed everywhere */
    typedef int32_t v4si __attribute__ ((vector_size (16)));
    typedef int32_t v4si_u __attribute__ ((vector_size (16), aligned(4)));
    typedef int32_t v2si __attribute__ ((vector_size (8)));

    extern v4si const sigma;

    #define ROUNDS 12   /* 8, 12, or 20 */

    void chacha1(uint32_t const key[8], uint32_t const iv[4], uint32_t *__restrict out);
    void chacha2(uint32_t const key[8], uint32_t const iv[4], uint32_t *__restrict out);
    void chacha3(uint32_t const key[8], uint32_t const iv[4], uint32_t *__restrict out);

    /* Version 1: an MMX implementation */

    /* The basic quarter round: x ^= y += z; x <<<= k; (rotate) */
    #if 1
    #define OP(x,y,z,k) do { \
        register v2si t asm("%mm7"); \
        y = __builtin_ia32_paddd(y, z); \
        x = __builtin_ia32_pxor(x, y); \
        t = x; \
        x = __builtin_ia32_pslldi(x, k); \
        t = __builtin_ia32_psrldi(t, 32-k); \
        x = __builtin_ia32_por(x, t); \
    } while (0)
    #else
    #define OP(x,y,z,k) ( \
        x ^= y += z, \
        x = __builtin_ia32_pslldi(x, k) | \
            __builtin_ia32_psrldi(x, 32-k) \
    )
    #endif

    /* Rotate words right 32 bits */
    /* If the words of y:x are 3:2:1:0, rotate right to 0:3:2:1 */
    /* Little-endian, that's 0123 -> 1230 */
    /* (t's self-initialization just quiets -Wuninitialized; its initial value is never used) */
    #define ROTW(x,y) do { \
        register v2si t asm("%mm7") = t; \
        t = __builtin_ia32_punpckldq(t, x); \
        x = __builtin_ia32_punpckhdq(x, x); \
        x = __builtin_ia32_punpckldq(x, y); \
        y = __builtin_ia32_punpckhdq(y, t); \
    } while(0)

    void
    chacha1(uint32_t const key[8], uint32_t const iv[4], uint32_t *__restrict out)
    {
        /*
         * There aren't enough MMX registers for all this, plus
         * temporaries, so the compiler will have to do some spilling.
         */
        register v2si a0 asm("%mm0") = ((v2si const *)&sigma)[0];
        register v2si a1 asm("%mm1") = ((v2si const *)&sigma)[1];
        register v2si b0 asm("%mm2") = ((v2si const *)key)[0];
        register v2si b1 asm("%mm3") = ((v2si const *)key)[1];
        register v2si c0 asm("%mm4") = ((v2si const *)key)[2];
        register v2si c1 asm("%mm5") = ((v2si const *)key)[3];
        register v2si d  asm("%mm6") = ((v2si const *)iv)[0];
        v2si dd[2];     /* On stack */
        int i;

        dd[1] = ((v2si const *)iv)[1];

        for (i = 0; i < ROUNDS/4; i++) {
            //asm("# OP 1" :: "y" (d), "y" (a0), "y" (b0));
            OP(d, a0, b0, 16);
            //asm("# OP 2" :: "y" (b0), "y" (c0), "y" (d));
            OP(b0, c0, d, 12);
            //asm("# OP 3" :: "y" (d), "y" (a0), "y" (b0));
            OP(d, a0, b0, 8);
            //asm("# OP 4" :: "y" (b0), "y" (c0), "y" (d));
            OP(b0, c0, d, 7);

            dd[0] = d;
            d = dd[1];

            //asm("# OP 1" :: "y" (d), "y" (a1), "y" (b1));
            OP(d, a1, b1, 16);
            //asm("# OP 2" :: "y" (b1), "y" (c1), "y" (d));
            OP(b1, c1, d, 12);
            //asm("# OP 3" :: "y" (d), "y" (a1), "y" (b1));
            OP(d, a1, b1, 8);
            //asm("# OP 4" :: "y" (b1), "y" (c1), "y" (d));
            OP(b1, c1, d, 7);

            /* Our ROTW converts 0123 to 1230.  To get
             * the other orders, combine with implicit swaps.
             *
             * a: 0123 -> 3012  ROTW + swap halves
             * b: 0123    0123  No change
             * c: 0123    1230  ROTW
             * d: 0123    2301  swap halves, i.e. DON'T reload
             */
            //asm("# Swap" :: "y" (a0), "y" (a1), "y" (c0), "y" (c1));
            ROTW(a0,a1);
            ROTW(c0,c1);

            /* Same as above, but with a0/a1 and d0/d1 swapped */
            OP(d, a1, b0, 16);
            OP(b0, c0, d, 12);
            OP(d, a1, b0, 8);
            OP(b0, c0, d, 7);

            dd[1] = d;
            d = dd[0];

            OP(d, a0, b1, 16);
            OP(b1, c1, d, 12);
            OP(d, a0, b1, 8);
            OP(b1, c1, d, 7);

            ROTW(a0,a1);
            ROTW(c0,c1);

            /* Now a and c are swapped, but not b or d */
            OP(d, a1, b0, 16);
            OP(b0, c1, d, 12);
            OP(d, a1, b0, 8);
            OP(b0, c1, d, 7);

            dd[0] = d;
            d = dd[1];

            OP(d, a0, b1, 16);
            OP(b1, c0, d, 12);
            OP(d, a0, b1, 8);
            OP(b1, c0, d, 7);

            ROTW(a0,a1);
            ROTW(c0,c1);
            /* And a fourth round.  c and d swapped */
            OP(d, a0, b0, 16);
            OP(b0, c1, d, 12);
            OP(d, a0, b0, 8);
            OP(b0, c1, d, 7);

            dd[1] = d;
            d = dd[0];

            OP(d, a1, b1, 16);
            OP(b1, c0, d, 12);
            OP(d, a1, b1, 8);
            OP(b1, c0, d, 7);

            ROTW(a0,a1);
            ROTW(c0,c1);
        }

        ((v2si *)out)[0] = a0 += ((v2si const *)&sigma)[0];
        ((v2si *)out)[1] = a1 += ((v2si const *)&sigma)[1];
        ((v2si *)out)[2] = b0 += ((v2si const *)key)[0];
        ((v2si *)out)[3] = b1 += ((v2si const *)key)[1];
        ((v2si *)out)[4] = c0 += ((v2si const *)key)[2];
        ((v2si *)out)[5] = c1 += ((v2si const *)key)[3];
        ((v2si *)out)[6] = d += ((v2si const *)iv)[0];
        ((v2si *)out)[7] = dd[1] + ((v2si const *)iv)[1];
    }

Generated assembly:

        .file "chacha1.c"
        .text
        .globl chacha1
        .type chacha1, @function
    chacha1:
    .LFB0:
        .cfi_startproc
        pushl %ebp
        .cfi_def_cfa_offset 8
        .cfi_offset 5, -8
        pushl %edi
        .cfi_def_cfa_offset 12
        .cfi_offset 7, -12
        pushl %esi
        .cfi_def_cfa_offset 16
        .cfi_offset 6, -16
        pushl %ebx
        .cfi_def_cfa_offset 20
        .cfi_offset 3, -20
        subl $516, %esp
        .cfi_def_cfa_offset 536
        movl 540(%esp), %edx
        movl 536(%esp), %ebx
        movl 540(%esp), %edi
        movl sigma, %ebp
        movl (%edx), %eax
        movl 4(%edx), %edx
        movl 8(%edi), %esi
        movl 12(%edi), %edi
        movl %eax, 16(%esp)
        movl (%ebx), %eax
        movl %edx, 20(%esp)
        movl 4(%ebx), %edx
        movl %esi, 8(%esp)
        movl 20(%ebx), %ecx
        movl %edi, 12(%esp)
        movl 8(%ebx), %esi
        movl %eax, (%esp)
        movl 12(%ebx), %edi
        movl %edx, 4(%esp)
        movl sigma+4, %edx
        movl sigma+8, %eax
        movl %ebp, 24(%esp)
        movl %ecx, 44(%esp)
        movl %edx, 28(%esp)
        movl 16(%ebx), %edx
        movl %esi, 32(%esp)
        movl %edi, 36(%esp)
        movl %edx, 40(%esp)
        movl sigma+12, %edx
        movl %eax, 48(%esp)
        movl 24(%ebx), %eax
        movq %mm7, 64(%esp)
        movq %mm7, 72(%esp)
        movq %mm7, 80(%esp)
        movq %mm7, 88(%esp)
        movq %mm7, 96(%esp)
        movq %mm7, 104(%esp)
        movq %mm7, 112(%esp)
        movl %edx, 52(%esp)
        movl 28(%ebx), %edx
        movq %mm7, 120(%esp)
        movl %eax, 56(%esp)
        movl $3, %eax
        movl %edx, 60(%esp)
    .L2:
        movq 24(%esp), %mm0
        paddd (%esp), %mm0
        movq 16(%esp), %mm1
        movq (%esp), %mm4
        movq 48(%esp), %mm6
        paddd 32(%esp), %mm6
        movq 32(%esp), %mm7
        decl %eax
        pxor %mm0, %mm1
        movq %mm1, %mm3
        psrld $16, %mm1
        pslld $16, %mm3
        por %mm1, %mm3
        movq 40(%esp), %mm1
        paddd %mm3, %mm1
        pxor %mm1, %mm4
        movq %mm4, %mm2
        psrld $20, %mm4
        pslld $12, %mm2
        por %mm4, %mm2
        paddd %mm2, %mm0
        pxor %mm0, %mm3
        movq %mm3, %mm4
        psrld $24, %mm3
        pslld $8, %mm4
        por %mm4, %mm3
        paddd %mm3, %mm1
        movq %mm3, (%esp)
        pxor %mm1, %mm2
        movq %mm2, %mm4
        psrld $25, %mm2
        pslld $7, %mm4
        por %mm2, %mm4
        movq 8(%esp), %mm2
        pxor %mm6, %mm2
        movq %mm2, %mm5
        psrld $16, %mm2
        pslld $16, %mm5
        por %mm2, %mm5
        movq 56(%esp), %mm2
        paddd %mm5, %mm2
        pxor %mm2, %mm7
        movq %mm7, %mm3
        psrld $20, %mm7
        pslld $12, %mm3
        por %mm7, %mm3
        movq %mm5, %mm7
        paddd %mm3, %mm6
        pxor %mm6, %mm7
        movq %mm7, %mm5
        psrld $24, %mm7
        pslld $8, %mm5
        por %mm7, %mm5
        movq %mm3, %mm7
        paddd %mm5, %mm2
        pxor %mm2, %mm7
        movq %mm7, %mm3
        psrld $25, %mm7
        pslld $7, %mm3
        por %mm7, %mm3
        movq 120(%esp), %mm7
        punpckldq %mm0, %mm7
        punpckhdq %mm0, %mm0
        movq %mm7, 120(%esp)
        punpckldq %mm6, %mm0
        punpckhdq %mm7, %mm6
        movq 64(%esp), %mm7
        punpckldq %mm1, %mm7
        punpckhdq %mm1, %mm1
        paddd %mm3, %mm0
        paddd %mm4, %mm6
        movq %mm7, 64(%esp)
        punpckldq %mm2, %mm1
        punpckhdq %mm7, %mm2
        movq %mm5, %mm7
        pxor %mm6, %mm7
        movq %mm7, %mm5
        psrld $16, %mm7
        pslld $16, %mm5
        por %mm7, %mm5
        movq %mm4, %mm7
        paddd %mm5, %mm1
        pxor %mm1, %mm7
        movq %mm7, %mm4
        psrld $20, %mm7
        pslld $12, %mm4
        por %mm7, %mm4
        paddd %mm4, %mm6
        pxor %mm6, %mm5
        movq %mm5, %mm7
        psrld $24, %mm5
        pslld $8, %mm7
        por %mm7, %mm5
        paddd %mm5, %mm1
        movq %mm5, 8(%esp)
        pxor %mm1, %mm4
        movq %mm4, %mm5
        psrld $25, %mm4
        pslld $7, %mm5
        por %mm4, %mm5
        movq (%esp), %mm4
        pxor %mm0, %mm4
        movq %mm4, %mm7
        psrld $16, %mm4
        pslld $16, %mm7
        por %mm4, %mm7
        paddd %mm7, %mm2
        pxor %mm2, %mm3
        movq %mm3, %mm4
        psrld $20, %mm3
        pslld $12, %mm4
        por %mm3, %mm4
        paddd %mm4, %mm0
        pxor %mm0, %mm7
        movq %mm7, %mm3
        psrld $24, %mm7
        pslld $8, %mm3
        por %mm7, %mm3
        paddd %mm3, %mm2
        pxor %mm2, %mm4
        movq %mm4, %mm7
        psrld $25, %mm4
        pslld $7, %mm7
        por %mm4, %mm7
        movq 72(%esp), %mm4
        punpckldq %mm0, %mm4
        punpckhdq %mm0, %mm0
        movq %mm4, 72(%esp)
        punpckldq %mm6, %mm0
        punpckhdq %mm4, %mm6
        movq 80(%esp), %mm4
        punpckldq %mm1, %mm4
        punpckhdq %mm1, %mm1
        paddd %mm7, %mm0
        paddd %mm5, %mm6
        pxor %mm6, %mm3
        movq %mm4, 80(%esp)
        punpckldq %mm2, %mm1
        punpckhdq %mm4, %mm2
        movq %mm3, %mm4
        psrld $16, %mm3
        pslld $16, %mm4
        por %mm3, %mm4
        paddd %mm4, %mm2
        pxor %mm2, %mm5
        movq %mm5, %mm3
        psrld $20, %mm5
        pslld $12, %mm3
        por %mm5, %mm3
        paddd %mm3, %mm6
        pxor %mm6, %mm4
        movq %mm4, %mm5
        psrld $24, %mm4
        pslld $8, %mm5
        por %mm5, %mm4
        movq 8(%esp), %mm5
        pxor %mm0, %mm5
        paddd %mm4, %mm2
        movq %mm4, 16(%esp)
        movq %mm3, %mm4
        pxor %mm2, %mm4
        movq %mm4, %mm3
        psrld $25, %mm4
        pslld $7, %mm3
        por %mm4, %mm3
        movq %mm5, %mm4
        psrld $16, %mm5
        pslld $16, %mm4
        por %mm5, %mm4
        paddd %mm4, %mm1
        pxor %mm1, %mm7
        movq %mm7, %mm5
        psrld $20, %mm7
        pslld $12, %mm5
        por %mm7, %mm5
        movq %mm4, %mm7
        paddd %mm5, %mm0
        pxor %mm0, %mm7
        movq %mm7, %mm4
        psrld $24, %mm7
        pslld $8, %mm4
        por %mm7, %mm4
        paddd %mm4, %mm1
        pxor %mm1, %mm5
        movq %mm5, %mm7
        psrld $25, %mm5
        pslld $7, %mm7
        por %mm5, %mm7
        movq 88(%esp), %mm5
        punpckldq %mm0, %mm5
        punpckhdq %mm0, %mm0
        movq %mm5, 88(%esp)
        punpckldq %mm6, %mm0
        punpckhdq %mm5, %mm6
        movq 96(%esp), %mm5
        punpckldq %mm1, %mm5
        punpckhdq %mm1, %mm1
        paddd %mm3, %mm0
        pxor %mm0, %mm4
        paddd %mm7, %mm6
        movq %mm5, 96(%esp)
        punpckldq %mm2, %mm1
        punpckhdq %mm5, %mm2
        movq %mm4, %mm5
        psrld $16, %mm4
        pslld $16, %mm5
        por %mm4, %mm5
        paddd %mm5, %mm2
        pxor %mm2, %mm3
        movq %mm3, %mm4
        psrld $20, %mm3
        pslld $12, %mm4
        por %mm3, %mm4
        paddd %mm4, %mm0
        pxor %mm0, %mm5
        movq %mm5, %mm3
        psrld $24, %mm5
        pslld $8, %mm3
        por %mm3, %mm5
        movq 16(%esp), %mm3
        pxor %mm6, %mm3
        paddd %mm5, %mm2
        movq %mm5, 8(%esp)
        pxor %mm2, %mm4
        movq %mm4, %mm5
        psrld $25, %mm4
        pslld $7, %mm5
        por %mm5, %mm4
        movq 104(%esp), %mm5
        punpckldq %mm0, %mm5
        movq %mm4, (%esp)
        movq %mm3, %mm4
        psrld $16, %mm3
        pslld $16, %mm4
        por %mm3, %mm4
        punpckhdq %mm0, %mm0
        paddd %mm4, %mm1
        pxor %mm1, %mm7
        movq %mm7, %mm3
        psrld $20, %mm7
        pslld $12, %mm3
        por %mm7, %mm3
        paddd %mm3, %mm6
        pxor %mm6, %mm4
        punpckldq %mm6, %mm0
        movq %mm4, %mm7
        psrld $24, %mm4
        pslld $8, %mm7
        por %mm7, %mm4
        punpckhdq %mm5, %mm6
        paddd %mm4, %mm1
        movq %mm4, 16(%esp)
        pxor %mm1, %mm3
        movq %mm3, %mm4
        psrld $25, %mm3
        pslld $7, %mm4
        por %mm4, %mm3
        movq %mm3, 32(%esp)
        movq %mm5, 104(%esp)
        movq 112(%esp), %mm7
        punpckldq %mm1, %mm7
        movq %mm0, 24(%esp)
        movq %mm6, 48(%esp)
        punpckhdq %mm1, %mm1
        movq %mm7, 112(%esp)
        movq %mm1, %mm0
        punpckldq %mm2, %mm0
        punpckhdq %mm7, %mm2
        movq %mm0, 40(%esp)
        movq %mm2, 56(%esp)
        jne .L2
        movl 28(%esp), %edi
        movl %ebp, %eax
        xorl %ecx, %ecx
        movl 24(%esp), %edx
        movl %ecx, 492(%esp)
        movl 24(%esp), %esi
        movl %ebp, 472(%esp)
        movl %edi, 484(%esp)
        movl sigma+4, %edi
        addl %edx, %eax
        movl 484(%esp), %ecx
        movl %eax, 488(%esp)
        movl 488(%esp), %eax
        movl %edi, 476(%esp)
        movl 544(%esp), %edi
        addl 476(%esp), %ecx
        movl %esi, 480(%esp)
        movl 48(%esp), %esi
        movl %eax, 464(%esp)
        movl %ecx, 468(%esp)
        movl 464(%esp), %eax
        xorl %ecx, %ecx
        movl 468(%esp), %edx
        movl %eax, (%edi)
        movl sigma+8, %eax
        movl %edx, 4(%edi)
        movl 52(%esp), %edi
        movl 48(%esp), %edx
        movl %esi, 432(%esp)
        movl sigma+8, %esi
        movl %ecx, 444(%esp)
        movl %edi, 436(%esp)
        movl sigma+12, %edi
        addl %edx, %eax
        movl 436(%esp), %ecx
        movl %eax, 440(%esp)
        movl 440(%esp), %eax
        movl %esi, 424(%esp)
        movl %edi, 428(%esp)
        movl 544(%esp), %edi
        addl 428(%esp), %ecx
        movl %eax, 416(%esp)
        movl (%esp), %esi
        movl 416(%esp), %eax
        movl %ecx, 420(%esp)
        xorl %ecx, %ecx
        movl 420(%esp), %edx
        movl %eax, 8(%edi)
        movl %edx, 12(%edi)
        movl (%esp), %edx
        movl (%ebx), %ebp
        movl 4(%esp), %edi
        movl %ecx, 396(%esp)
        movl %esi, 384(%esp)
        movl 32(%esp), %esi
        movl %ebp, %eax
        addl %edx, %eax
        movl %eax, 392(%esp)
        movl %edi, 388(%esp)
        movl 392(%esp), %eax
        movl 4(%ebx), %edi
        movl 388(%esp), %ecx
        movl %ebp, 376(%esp)
        movl %eax, 368(%esp)
        movl %edi, 380(%esp)
        movl 368(%esp), %eax
        addl 380(%esp), %ecx
        movl 544(%esp), %edi
        movl %ecx, 372(%esp)
        xorl %ecx, %ecx
        movl 372(%esp), %edx
        movl %eax, 16(%edi)
        movl %edx, 20(%edi)
        movl 32(%esp), %edx
        movl 8(%ebx), %ebp
        movl 36(%esp), %edi
        movl %ecx, 348(%esp)
        movl %ebp, %eax
        addl %edx, %eax
        movl %eax, 344(%esp)
        movl 344(%esp), %eax
        movl %esi, 336(%esp)
        movl 40(%esp), %esi
        movl %edi, 340(%esp)
        movl 12(%ebx), %edi
        movl 340(%esp), %ecx
        movl %eax, 320(%esp)
        movl 320(%esp), %eax
        movl %edi, 332(%esp)
        movl 544(%esp), %edi
        addl 332(%esp), %ecx
        movl %ebp, 328(%esp)
        movl %ecx, 324(%esp)
        xorl %ecx, %ecx
        movl 324(%esp), %edx
        movl %eax, 24(%edi)
        movl %edx, 28(%edi)
        movl 40(%esp), %edx
        movl 16(%ebx), %ebp
        movl 44(%esp), %edi
        movl %ecx, 300(%esp)
        movl %esi, 288(%esp)
        movl %ebp, %eax
        addl %edx, %eax
        movl %eax, 296(%esp)
        movl %edi, 292(%esp)
        movl 296(%esp), %eax
        movl 20(%ebx), %edi
        movl 292(%esp), %ecx
        movl %ebp, 280(%esp)
        movl %eax, 272(%esp)
        movl %edi, 284(%esp)
        movl 272(%esp), %eax
        addl 284(%esp), %ecx
        movl 544(%esp), %edi
        movl %ecx, 276(%esp)
        xorl %ecx, %ecx
        movl 276(%esp), %edx
        movl %eax, 32(%edi)
        movl %edx, 36(%edi)
        movl 24(%ebx), %ebp
        movl 56(%esp), %edx
        movl 60(%esp), %edi
        movl 28(%ebx), %ebx
        movl %ebp, %eax
        movl 56(%esp), %esi
        addl %edx, %eax
        movl %eax, 248(%esp)
        movl 248(%esp), %eax
        movl %edi, 244(%esp)
        movl 544(%esp), %edi
        movl %ecx, 252(%esp)
        movl 244(%esp), %ecx
        movl %ebx, 236(%esp)
        movl %eax, 224(%esp)
        addl 236(%esp), %ecx
        movl 224(%esp), %eax
        movl %esi, 240(%esp)
        movl 16(%esp), %esi
        movl %ecx, 228(%esp)
        xorl %ecx, %ecx
        movl 228(%esp), %edx
        movl %ebp, 232(%esp)
        movl %eax, 40(%edi)
        movl 540(%esp), %eax
        movl %edx, 44(%edi)
        movl 16(%esp), %edx
        movl 20(%esp), %edi
        movl (%eax), %ebx
        movl 4(%eax), %ebp
        movl %ecx, 204(%esp)
        movl %edi, 196(%esp)
        movl %ebx, %eax
        movl 196(%esp), %ecx
        addl %edx, %eax
        movl %eax, 200(%esp)
        movl 200(%esp), %eax
        movl %esi, 192(%esp)
        movl %ebx, 184(%esp)
        movl 544(%esp), %edi
        movl %ebp, 188(%esp)
        movl 8(%esp), %esi
        movl %eax, 176(%esp)
        addl 188(%esp), %ecx
        movl 176(%esp), %eax
        movl %ecx, 180(%esp)
        xorl %ecx, %ecx
        movl 180(%esp), %edx
        movl %eax, 48(%edi)
        movl 540(%esp), %eax
        movl %edx, 52(%edi)
        movl 8(%esp), %edx
        movl 8(%eax), %ebx
        movl 12(%esp), %edi
        movl 12(%eax), %ebp
        movl %ecx, 156(%esp)
        movl %ebx, %eax
        addl %edx, %eax
        movl %edi, 148(%esp)
        movl 544(%esp), %edi
        movl %eax, 152(%esp)
        movl 148(%esp), %ecx
        movl 152(%esp), %eax
        movl %ebp, 140(%esp)
        addl 140(%esp), %ecx
        movl %esi, 144(%esp)
        movl %eax, 128(%esp)
        movl %ecx, 132(%esp)
        movl 128(%esp), %eax
        movl 132(%esp), %edx
        movl %ebx, 136(%esp)
        movl %eax, 56(%edi)
        movl %edx, 60(%edi)
        addl $516, %esp
        .cfi_def_cfa_offset 20
        popl %ebx
        .cfi_def_cfa_offset 16
        .cfi_restore 3
        popl %esi
        .cfi_def_cfa_offset 12
        .cfi_restore 6
        popl %edi
        .cfi_def_cfa_offset 8
        .cfi_restore 7
        popl %ebp
        .cfi_def_cfa_offset 4
        .cfi_restore 5
        ret
        .cfi_endproc
    .LFE0:
        .size chacha1, .-chacha1
        .ident "GCC: (Debian 4.6.0-9) 4.6.1 20110524 (prerelease)"
        .section .note.GNU-stack,"",@progbits