On Fri, Nov 15, 2013 at 11:26:06PM +0100, Ondřej Bílka wrote:
Minor correction: a mutt read replaced the set1.s file with one that I later
used for the avx2 variant. The correct file follows:
        #-----------------------------------------------------------------
        # set -- compiler-generated code (see the .ident line at the end)
        # for set1.c.  Syntax: GAS, AT&T operand order, x86-64.
        #
        # In:     %rdi = address of an array of 32-bit elements.
        # Effect: each of the 128 dwords starting at (%rdi) is multiplied
        #         by 42 in place (low 32 bits of the product are kept).
        # Out:    nothing meaningful appears to be left in %rax
        #         (presumably a void function -- confirm against set1.c).
        # Clobb:  rax, rcx, rdx, rsi, r8, r9, r10, r11, xmm0, xmm1, flags.
        #
        # Structure:
        #   - peel 0..3 leading dwords with scalar imull until the pointer
        #     is 16-byte aligned,
        #   - process whole 16-byte vectors (4 dwords each) with SSE2,
        #   - finish 0..3 trailing dwords with scalar imull.
        #
        # NOTE(review): the peel count is derived from bits 2..3 of the
        # address only, so the movdqa accesses are 16-byte aligned only
        # when %rdi is a multiple of 4 -- presumably guaranteed by the
        # element type in the C source; confirm.
        #-----------------------------------------------------------------
        .file   "set1.c"
        .text
        .p2align 4,,15
        .globl  set
        .type   set, @function
set:
.LFB0:
        .cfi_startproc
        # eax = (-((rdi & 15) >> 2)) & 3
        #     = number of dwords (0..3) in front of the next 16-byte boundary.
        movq    %rdi, %rax
        andl    $15, %eax
        shrq    $2, %rax
        negq    %rax
        andl    $3, %eax
        # ZF comes from the andl above: no peeling needed when the count is 0.
        je      .L9
        # Peeled element 0: a[0] *= 42.
        movl    (%rdi), %edx
        movl    $42, %esi
        imull   %esi, %edx
        # The compare is issued before the store; movl does not modify
        # flags, so the jbe below still tests "peel count <= 1".
        cmpl    $1, %eax
        movl    %edx, (%rdi)
        jbe     .L10
        # Peeled element 1: a[1] *= 42, then test "peel count <= 2".
        movl    4(%rdi), %edx
        movl    $42, %ecx
        imull   %ecx, %edx
        cmpl    $2, %eax
        movl    %edx, 4(%rdi)
        jbe     .L11
        # Peeled element 2: a[2] *= 42.  Three elements were peeled, so
        # r10d = 125 elements remain and r11d = 3 is the index reached.
        movl    8(%rdi), %edx
        movl    $42, %r11d
        movl    $125, %r10d
        imull   %r11d, %edx
        movl    $3, %r11d
        movl    %edx, 8(%rdi)
        # Vector-loop setup.  On entry from every path:
        #   eax  = number of peeled elements (0..3)
        #   r10d = 128 - eax  (elements still to process)
        #   r11d = eax        (index of the first unprocessed element)
.L2:
        # r8d = 128 - peel count; edx = vector iteration counter (starts at 0).
        movl    $128, %r8d
        xorl    %edx, %edx
        subl    %eax, %r8d
        # 32-bit self-move zero-extends the peel count into rax for the leaq.
        movl    %eax, %eax
        movl    %r8d, %esi
        # rcx = address of the first element handled by the vector loop.
        leaq    (%rdi,%rax,4), %rcx
        # rax is reused as the byte offset into the vector region.
        xorl    %eax, %eax
        # esi = number of full 4-dword vectors; r9d = elements they cover.
        shrl    $2, %esi
        leal    0(,%rsi,4), %r9d
        .p2align 4,,10
        .p2align 3
        # Vector body: multiply four dwords by 42 using shifts and subtracts:
        #   xmm1 = x << 1            = 2x
        #   xmm0 = (2x << 2) - 2x    = 6x
        #   xmm1 = (6x << 3) - 6x    = 42x
.L8:
        movdqa  (%rcx,%rax), %xmm1
        addl    $1, %edx
        pslld   $1, %xmm1
        movdqa  %xmm1, %xmm0
        pslld   $2, %xmm0
        psubd   %xmm1, %xmm0
        movdqa  %xmm0, %xmm1
        pslld   $3, %xmm1
        psubd   %xmm0, %xmm1
        movdqa  %xmm1, (%rcx,%rax)
        addq    $16, %rax
        # Unsigned compare: loop while vector count (esi) > iterations done (edx).
        cmpl    %edx, %esi
        ja      .L8
        # Tail setup:
        #   ecx = r10d - r9d = elements left after the vector loop (0..3)
        #   eax = r11d + r9d = index of the first leftover element
        movl    %r10d, %ecx
        leal    (%r11,%r9), %eax
        subl    %r9d, %ecx
        # Done when the vector loop covered everything after the peel.
        cmpl    %r9d, %r8d
        je      .L1
        # Tail element 0: a[eax] *= 42.
        movslq  %eax, %rdx
        movl    $42, %r9d
        leaq    (%rdi,%rdx,4), %rdx
        movl    (%rdx), %esi
        imull   %r9d, %esi
        # Flags from this compare survive the movl store and the leal
        # (neither modifies flags) and are consumed by the je below.
        cmpl    $1, %ecx
        movl    %esi, (%rdx)
        leal    1(%rax), %edx
        je      .L1
        # Tail element 1: a[eax+1] *= 42; eax is advanced to the index of
        # the possible third tail element.
        movslq  %edx, %rdx
        movl    $42, %r8d
        addl    $2, %eax
        leaq    (%rdi,%rdx,4), %rdx
        movl    (%rdx), %esi
        imull   %r8d, %esi
        cmpl    $2, %ecx
        movl    %esi, (%rdx)
        je      .L1
        # Tail element 2: a[eax] *= 42 (cltq sign-extends the index to rax).
        cltq
        movl    $42, %r10d
        leaq    (%rdi,%rax,4), %rax
        movl    (%rax), %edx
        imull   %r10d, %edx
        movl    %edx, (%rax)
        ret
        .p2align 4,,10
        .p2align 3
        # Common exit for the paths that finish with fewer than three tail
        # elements.
.L1:
        rep ret
        .p2align 4,,10
        .p2align 3
        # No elements peeled: 128 remain, next index is 0.
.L9:
        movl    $128, %r10d
        xorl    %r11d, %r11d
        jmp     .L2
        .p2align 4,,10
        .p2align 3
        # Two elements peeled: 126 remain, next index is 2.
.L11:
        movl    $126, %r10d
        movl    $2, %r11d
        jmp     .L2
        .p2align 4,,10
        .p2align 3
        # One element peeled: 127 remain, next index is 1.
.L10:
        movl    $127, %r10d
        movl    $1, %r11d
        jmp     .L2
        .cfi_endproc
.LFE0:
        .size   set, .-set
        .ident  "GCC: (Debian 4.8.1-10) 4.8.1"
        .section        .note.GNU-stack,"",@progbits

Reply via email to