I changed the asm a bit and made it about 1 cycle faster on Haswell and slightly smaller (-48 bytes overall incl. alignment on 64-bit Linux).
%macro AES_CRYPT 1
cglobal aes_%1rypt, 6,6,2
    shl        r3d, 4              ; block count -> byte count
    add        r5d, r5d            ; rounds*2 (scaled for the key addressing and the cmp below)
    add         r0, 0x60           ; bias the key pointer so every round key fits in a disp8
    add         r2, r3
    add         r1, r3
    neg         r3
    pxor        m1, m1
    test        r4, r4
    je .block
    movu        m1, [r4]           ; iv
.block:
    movu        m0, [r2+r3]        ; state
%ifidn %1, enc
    pxor        m0, m1
%endif
    pxor        m0, [r0+8*r5-0x60] ; initial AddRoundKey
    cmp        r5d, 24             ; 12 rounds (AES-192)?
    je .rounds12
    jl .rounds10
    aes%1       m0, [r0+0x70]      ; AES-256 only
    aes%1       m0, [r0+0x60]
.rounds12:                         ; AES-192
    aes%1       m0, [r0+0x50]
    aes%1       m0, [r0+0x40]
.rounds10:                         ; AES-128
    aes%1       m0, [r0+0x30]
    aes%1       m0, [r0+0x20]
    aes%1       m0, [r0+0x10]
    aes%1       m0, [r0+0x00]
    aes%1       m0, [r0-0x10]
    aes%1       m0, [r0-0x20]
    aes%1       m0, [r0-0x30]
    aes%1       m0, [r0-0x40]
    aes%1       m0, [r0-0x50]
    aes%1last   m0, [r0-0x60]
    test        r4, r4
    je .noiv
%ifidn %1, enc
    mova        m1, m0
%else
    pxor        m0, m1
    movu        m1, [r2+r3]
%endif
.noiv:
    movu   [r1+r3], m0
    add         r3, 16
    jl .block
%ifidn %1, enc
    test        r4, r4
    je .ret
    movu      [r4], m0
.ret:
%endif
    REP_RET
%endmacro

%if HAVE_AESNI_EXTERNAL
INIT_XMM aesni
AES_CRYPT enc
AES_CRYPT dec
%endif
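For reference, the encrypt path is roughly equivalent to the following C, written with AES-NI intrinsics. This is an untested sketch, not part of the patch: the function name, argument order and forward keys[0..rounds] layout are assumptions of mine, and the asm walks the key schedule in the AVAES context with different addressing. It only mirrors the control flow: optional CBC chaining when an IV pointer is passed, the 10/12/14-round dispatch, and writing the final chain value back to the IV.

/* Rough sketch only; build with -maes. Names and key layout are illustrative. */
#include <stdint.h>
#include <immintrin.h>

/* keys[0..rounds]: expanded round keys, 16 bytes each (assumed forward layout) */
static void aes_encrypt_cbc_sketch(const __m128i *keys, int rounds,
                                   uint8_t *dst, const uint8_t *src,
                                   int count, uint8_t *iv)
{
    __m128i chain = _mm_setzero_si128();
    if (iv)
        chain = _mm_loadu_si128((const __m128i *)iv);

    for (int i = 0; i < count; i++) {
        __m128i state = _mm_loadu_si128((const __m128i *)(src + 16 * i));

        if (iv)                                  /* CBC: XOR with previous ciphertext */
            state = _mm_xor_si128(state, chain);
        state = _mm_xor_si128(state, keys[0]);   /* initial AddRoundKey */

        /* same idea as the cmp/je/jl dispatch: extra rounds for 192/256-bit keys */
        for (int r = 1; r < rounds; r++)
            state = _mm_aesenc_si128(state, keys[r]);
        state = _mm_aesenclast_si128(state, keys[rounds]);

        if (iv)
            chain = state;                       /* next block chains on this ciphertext */
        _mm_storeu_si128((__m128i *)(dst + 16 * i), state);
    }

    if (iv)                                      /* write the updated IV back, as the asm does */
        _mm_storeu_si128((__m128i *)iv, chain);
}

The decrypt path differs only in that the chaining XOR happens after the aesdec rounds, the next chain value is reloaded from the input ciphertext, and the IV is not written back.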