On Fri, Nov 15, 2013 at 11:26:06PM +0100, Ondřej Bílka wrote: Minor correction: a mutt read replaced the set1.s file with one that I later used for the AVX2 variant. The correct file follows.
# Syntax: AT&T / GNU as, x86-64, ELF.  Compiler output (see .ident below),
# reformatted one statement per line and annotated.
	.file	"set1.c"

	.equ	SET_FACTOR, 42			# every element is multiplied by this
	.equ	SET_NELEMS, 128			# number of 32-bit elements processed

	.text
	.p2align 4,,15
	.globl	set
	.type	set, @function

#-----------------------------------------------------------------------
# set -- multiply SET_NELEMS consecutive 32-bit integers by SET_FACTOR,
#        in place.  Presumably `void set(int *x)` in set1.c (TODO confirm
#        against the C source, which is not shown here).
#
# In:    rdi = address of the first element
# Out:   nothing meaningful; rax is used as scratch and holds an
#        index/address on return
# Clobb: rax, rcx, rdx, rsi, r8, r9, r10, r11, xmm0, xmm1, flags
#        (rdi is never written)
#
# Shape: scalar peel (up to three elements) until the address reaches a
#        16-byte boundary, then an SSE2 loop handling four elements per
#        iteration, then a scalar tail (up to three elements).
#
# Alignment: the peel count is derived from (rdi & 15) >> 2, so the
#        movdqa accesses are 16-byte aligned only when rdi itself is
#        4-byte aligned.
#-----------------------------------------------------------------------
set:
.LFB0:
	.cfi_startproc

	# eax = peel = (-((rdi & 15) >> 2)) & 3
	#     = elements to process before the next 16-byte boundary.
	movq	%rdi, %rax
	andl	$15, %eax
	shrq	$2, %rax
	negq	%rax
	andl	$3, %eax
	je	.Lpeel_none			# peel == 0: go straight to vector setup

	# Peel element [0].
	movl	(%rdi), %edx
	movl	$SET_FACTOR, %esi
	imull	%esi, %edx
	cmpl	$1, %eax			# flags survive the store below
	movl	%edx, (%rdi)
	jbe	.Lpeel_one			# peel == 1

	# Peel element [1].
	movl	4(%rdi), %edx
	movl	$SET_FACTOR, %ecx
	imull	%ecx, %edx
	cmpl	$2, %eax
	movl	%edx, 4(%rdi)
	jbe	.Lpeel_two			# peel == 2

	# Peel element [2]; peel == 3 falls through into the vector setup.
	movl	8(%rdi), %edx
	movl	$SET_FACTOR, %r11d		# r11d is the multiplier only until reloaded below
	movl	$SET_NELEMS-3, %r10d		# r10d = elements left after the peel
	imull	%r11d, %edx
	movl	$3, %r11d			# r11d = index of first unpeeled element
	movl	%edx, 8(%rdi)

	# Invariant on entry: eax = peel, r10d = SET_NELEMS - peel, r11d = peel.
.Lvector_setup:
	movl	$SET_NELEMS, %r8d
	xorl	%edx, %edx			# edx = vector iterations completed
	subl	%eax, %r8d			# r8d = SET_NELEMS - peel
	movl	%eax, %eax			# zero-extend peel into rax for addressing
	movl	%r8d, %esi
	leaq	(%rdi,%rax,4), %rcx		# rcx = address of first vector element
	xorl	%eax, %eax			# rax = byte offset inside the vector region
	shrl	$2, %esi			# esi = number of 4-element iterations
	leal	0(,%rsi,4), %r9d		# r9d = elements covered by the vector loop

	.p2align 4,,10
	.p2align 3
.Lvector_loop:
	# Multiply four elements by 42 using shifts and subtracts:
	#   2x -> 8x -> 8x-2x = 6x -> 48x -> 48x-6x = 42x
	movdqa	(%rcx,%rax), %xmm1
	addl	$1, %edx
	pslld	$1, %xmm1			# xmm1 = 2x
	movdqa	%xmm1, %xmm0
	pslld	$2, %xmm0			# xmm0 = 8x
	psubd	%xmm1, %xmm0			# xmm0 = 6x
	movdqa	%xmm0, %xmm1
	pslld	$3, %xmm1			# xmm1 = 48x
	psubd	%xmm0, %xmm1			# xmm1 = 42x
	movdqa	%xmm1, (%rcx,%rax)
	addq	$16, %rax
	cmpl	%edx, %esi
	ja	.Lvector_loop			# unsigned: loop while esi > edx

	# Scalar tail: ecx = elements still to do, eax = index of the first one.
	movl	%r10d, %ecx
	leal	(%r11,%r9), %eax
	subl	%r9d, %ecx
	cmpl	%r9d, %r8d
	je	.Ldone				# vector loop covered everything

	# Tail element #1.
	movslq	%eax, %rdx
	movl	$SET_FACTOR, %r9d
	leaq	(%rdi,%rdx,4), %rdx
	movl	(%rdx), %esi
	imull	%r9d, %esi
	cmpl	$1, %ecx			# mov/lea below leave the flags intact
	movl	%esi, (%rdx)
	leal	1(%rax), %edx
	je	.Ldone

	# Tail element #2.
	movslq	%edx, %rdx
	movl	$SET_FACTOR, %r8d
	addl	$2, %eax			# eax = index of the possible third tail element
	leaq	(%rdi,%rdx,4), %rdx
	movl	(%rdx), %esi
	imull	%r8d, %esi
	cmpl	$2, %ecx
	movl	%esi, (%rdx)
	je	.Ldone

	# Tail element #3.
	cltq
	movl	$SET_FACTOR, %r10d
	leaq	(%rdi,%rax,4), %rax
	movl	(%rax), %edx
	imull	%r10d, %edx
	movl	%edx, (%rax)
	ret

	.p2align 4,,10
	.p2align 3
.Ldone:
	rep ret					# two-byte return form, kept exactly as the compiler emitted it

	# Out-of-line entries into .Lvector_setup for peel = 0, 2 and 1:
	# each sets r10d = SET_NELEMS - peel and r11d = peel.
	.p2align 4,,10
	.p2align 3
.Lpeel_none:
	movl	$SET_NELEMS, %r10d
	xorl	%r11d, %r11d
	jmp	.Lvector_setup

	.p2align 4,,10
	.p2align 3
.Lpeel_two:
	movl	$SET_NELEMS-2, %r10d
	movl	$2, %r11d
	jmp	.Lvector_setup

	.p2align 4,,10
	.p2align 3
.Lpeel_one:
	movl	$SET_NELEMS-1, %r10d
	movl	$1, %r11d
	jmp	.Lvector_setup

	.cfi_endproc
.LFE0:
	.size	set, .-set
	.ident	"GCC: (Debian 4.8.1-10) 4.8.1"
	.section	.note.GNU-stack,"",@progbits