https://gcc.gnu.org/bugzilla/show_bug.cgi?id=94037

Richard Biener <rguenth at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
           Keywords|                            |missed-optimization
             Target|                            |x86_64-*-*, i?86-*-*
             Status|UNCONFIRMED                 |NEW
   Last reconfirmed|                            |2020-03-05
          Component|rtl-optimization            |target
     Ever confirmed|0                           |1

--- Comment #1 from Richard Biener <rguenth at gcc dot gnu.org> ---
The only apparent difference is

        setge   %sil
        movzbl  %sil, %esi
...
        setl    %sil
        movzbl  %sil, %esi
...

vs. clang's

        xorl    %edx, %edx
        xorl    %esi, %esi
...
        setle   %dl
        setg    %sil

where the xors end up essentially "free" while GCC's setcc/movzbl
(zero-extend) pairs add latency on the critical path.  But it's all quite
ugly, and there has to be a better way to conditionally exchange two values
in memory (each fitting in a register) without branches (branches are
presumably avoided to prevent mispredicts).
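
For illustration, a minimal C sketch of the conditional exchange being
discussed (hypothetical shape and names, not the testcase from this PR),
written as ternaries so the compiler is free to lower it to cmov instead
of a setcc/movzbl indexing sequence:

  /* Hypothetical sketch, not the attached testcase: exchange *p and *q
     so that *p <= *q afterwards, in a form a compiler can lower without
     branches (e.g. via cmov).  */
  static inline void cond_exchange (int *p, int *q)
  {
    int a = *p, b = *q;
    *p = (a <= b) ? a : b;   /* smaller value */
    *q = (a <= b) ? b : a;   /* larger value */
  }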

Note that with GCC 10 you'll see the v[2] = { a, b } store "vectorized";
-DFAST is still faster for me (Haswell): 7.4s vs. 10s.
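
For reference, a sketch of the store-and-index idiom visible in the loop
bodies below (assumed shape only, identifiers made up): the pair is written
to a two-element stack array, which is the store GCC 10 now does as a single
vector store, and the comparison result is reused as a load index:

  /* Hypothetical sketch of the idiom behind the generated code,
     not the actual source.  */
  static inline void cond_exchange_idx (int *p, int *q)
  {
    int v[2] = { *p, *q };   /* paired store; GCC 10 emits one vector store */
    int ge = (*q >= *p);     /* materialized as setge + zero-extend */
    *p = v[1 - ge];          /* smaller value */
    *q = v[ge];              /* larger value */
  }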

Fast loop body:

.L12:
        movl    (%rax), %ecx
        vmovd   (%r11), %xmm1
        cmpl    %ecx, %esi
        setge   %dl
        movzbl  %dl, %edx
        vpinsrd $1, %ecx, %xmm1, %xmm0
        movl    %r8d, %ecx
        setge   %dil
        subl    %edx, %ecx
        vmovq   %xmm0, 120(%rsp)
        movslq  %ecx, %rcx
        movl    120(%rsp,%rdx,4), %edx
        movl    120(%rsp,%rcx,4), %ecx
        addq    $4, %rax
        movl    %ecx, -4(%rax)
        movl    %edx, (%r11)
        movzbl  %dil, %edx
        leaq    (%r11,%rdx,4), %r11
        cmpq    24(%rsp), %rax
        jb      .L12

slow one:

.L12:
        movl    (%rax), %esi
        vmovd   (%r10), %xmm1
        cmpl    %esi, %edx
        vpinsrd $1, %esi, %xmm1, %xmm0
        setge   %dil
        setl    %sil
        vmovq   %xmm0, 120(%rsp)
        movzbl  %dil, %edi
        movzbl  %sil, %esi
        movl    120(%rsp,%rdi,4), %edi
        movl    120(%rsp,%rsi,4), %esi
        setge   %cl
        movl    %edi, (%r10)
        movzbl  %cl, %ecx
        movl    %esi, (%rax)
        addq    $4, %rax
        leaq    (%r10,%rcx,4), %r10
        cmpq    8(%rsp), %rax
        jb      .L12
