https://gcc.gnu.org/bugzilla/show_bug.cgi?id=69564

--- Comment #14 from Jakub Jelinek <jakub at gcc dot gnu.org> ---
With -Ofast -g -march=native -funroll-loops we actually (for C) beat clang on
LU.
The SOR difference is most likely an IVOPTs/scheduling/RA thing; there is nothing
to actually vectorize because of the loop-carried dependencies.
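For reference, the inner SOR loop we are talking about looks roughly like this in
the SciMark C sources (a from-memory sketch; identifier names are approximate and
the outer iteration loop is omitted):

  static void SOR_kernel (int M, int N, double omega, double **G)
  {
    double omega_over_four = omega * 0.25;
    double one_minus_omega = 1.0 - omega;
    int i, j;

    for (i = 1; i < M - 1; i++)
      {
        double *Gi = G[i], *Gim1 = G[i - 1], *Gip1 = G[i + 1];
        for (j = 1; j < N - 1; j++)
          /* Gi[j-1] is the value stored by the previous iteration,
             so every iteration depends on the previous one's result.  */
          Gi[j] = omega_over_four * (Gim1[j] + Gip1[j] + Gi[j - 1] + Gi[j + 1])
                  + one_minus_omega * Gi[j];
      }
  }

The Gi[j-1] operand is the element the previous iteration has just stored, and that
recurrence is the serial add/FMA chain visible in all three listings below.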
clang 3.8 (whether or not -funroll-loops is used) unrolls the main SOR loop 4 times,
and we get roughly:
        .align  16, 0x90
.LBB1_12:
        vmovsd  -24(%r13,%rax,8), %xmm4
        vaddsd  -24(%r12,%rax,8), %xmm4, %xmm4
        vmovsd  -24(%rbx,%rax,8), %xmm5
        vaddsd  %xmm5, %xmm3, %xmm3
        vaddsd  %xmm3, %xmm4, %xmm3
        vmulsd  %xmm0, %xmm2, %xmm2
        vfmadd231sd     %xmm3, %xmm1, %xmm2
        vmovsd  %xmm2, -32(%rbx,%rax,8)
        vmovsd  -16(%r13,%rax,8), %xmm3
        vaddsd  -16(%r12,%rax,8), %xmm3, %xmm3
        vmovsd  -16(%rbx,%rax,8), %xmm4
        vaddsd  %xmm4, %xmm3, %xmm3
        vaddsd  %xmm3, %xmm2, %xmm2
        vmulsd  %xmm0, %xmm5, %xmm3
        vfmadd231sd     %xmm2, %xmm1, %xmm3
        vmovsd  %xmm3, -24(%rbx,%rax,8)
        vmovsd  -8(%r13,%rax,8), %xmm2 
        vaddsd  -8(%r12,%rax,8), %xmm2, %xmm2
        vmovsd  -8(%rbx,%rax,8), %xmm5 
        vaddsd  %xmm5, %xmm2, %xmm2
        vaddsd  %xmm2, %xmm3, %xmm2
        vmulsd  %xmm0, %xmm4, %xmm3
        vfmadd231sd     %xmm2, %xmm1, %xmm3
        vmovsd  %xmm3, -16(%rbx,%rax,8)
        vmovsd  (%r13,%rax,8), %xmm2   
        vaddsd  (%r12,%rax,8), %xmm2, %xmm4
        vmovsd  (%rbx,%rax,8), %xmm2   
        vaddsd  %xmm2, %xmm4, %xmm4
        vaddsd  %xmm4, %xmm3, %xmm4
        vmulsd  %xmm0, %xmm5, %xmm3
        vfmadd231sd     %xmm4, %xmm1, %xmm3
        vmovsd  %xmm3, -8(%rbx,%rax,8)
        addq    $4, %rax
        cmpl    %eax, %r15d
        jne     .LBB1_12
gcc -Ofast -g -march=native unrolls just twice:
        .p2align 4,,10
        .p2align 3
.L9:
        vmovsd  -8(%r8,%rdi,8), %xmm5
        vmovsd  -16(%r9,%rdi,8), %xmm1
        vmulsd  %xmm4, %xmm2, %xmm4
        movslq  %edi, %rax
        vaddsd  -16(%r10,%rdi,8), %xmm1, %xmm1
        vaddsd  %xmm0, %xmm5, %xmm0
        vmulsd  %xmm5, %xmm2, %xmm5
        vaddsd  %xmm0, %xmm1, %xmm0
        vmovapd %xmm0, %xmm1
        vfmadd132sd     %xmm3, %xmm4, %xmm1
        vmovsd  (%r8,%rdi,8), %xmm4
        vmovsd  %xmm1, -16(%r8,%rdi,8)
        vaddsd  %xmm4, %xmm1, %xmm1
        vmovsd  -8(%r10,%rdi,8), %xmm0
        vaddsd  -8(%r9,%rdi,8), %xmm0, %xmm0
        vaddsd  %xmm1, %xmm0, %xmm0
        vfmadd132sd     %xmm3, %xmm5, %xmm0
        vmovsd  %xmm0, -8(%r8,%rdi,8)
        addq    $2, %rdi
        cmpq    %rcx, %rdi
        jne     .L9
and with -funroll-loops added on top we unroll 8 times instead:
.L9:
        vmovsd  -8(%rax,%rdi,8), %xmm6
        vmovsd  -16(%r8,%rdi,8), %xmm5
        vmulsd  %xmm7, %xmm1, %xmm8
        leaq    2(%rdi), %r14
        vaddsd  -16(%r9,%rdi,8), %xmm5, %xmm9
        vmovsd  (%rax,%rdi,8), %xmm12
        leaq    4(%rdi), %r10
        vaddsd  %xmm3, %xmm6, %xmm10
        vmulsd  %xmm6, %xmm1, %xmm7
        vmulsd  %xmm12, %xmm1, %xmm0
        vaddsd  %xmm10, %xmm9, %xmm11
        vfmadd132sd     %xmm2, %xmm8, %xmm11
        vmovsd  %xmm11, -16(%rax,%rdi,8)
        vaddsd  %xmm12, %xmm11, %xmm15
        vmovsd  -8(%r9,%rdi,8), %xmm13
        vaddsd  -8(%r8,%rdi,8), %xmm13, %xmm14
        vaddsd  %xmm15, %xmm14, %xmm4
        vfmadd132sd     %xmm2, %xmm7, %xmm4
        vmovsd  %xmm4, -8(%rax,%rdi,8)
        vmovsd  -8(%rax,%r14,8), %xmm6
        vmovsd  -16(%r8,%r14,8), %xmm3
        vaddsd  -16(%r9,%r14,8), %xmm3, %xmm8
        vmovsd  (%rax,%r14,8), %xmm10
        vaddsd  %xmm4, %xmm6, %xmm5
        vmulsd  %xmm6, %xmm1, %xmm11
        vmulsd  %xmm10, %xmm1, %xmm4
        vaddsd  %xmm5, %xmm8, %xmm9
        vfmadd132sd     %xmm2, %xmm0, %xmm9
        vmovsd  %xmm9, -16(%rax,%r14,8)
        vaddsd  %xmm10, %xmm9, %xmm13
        vmovsd  -8(%r9,%r14,8), %xmm12
        vaddsd  -8(%r8,%r14,8), %xmm12, %xmm7
        vaddsd  %xmm13, %xmm7, %xmm14
        vfmadd132sd     %xmm2, %xmm11, %xmm14
        vmovsd  %xmm14, -8(%rax,%r14,8)
        vmovsd  -8(%rax,%r10,8), %xmm15
        vmovsd  -16(%r8,%r10,8), %xmm6
        vaddsd  -16(%r9,%r10,8), %xmm6, %xmm0
        vmovsd  (%rax,%r10,8), %xmm9
        vaddsd  %xmm14, %xmm15, %xmm3
        vmulsd  %xmm15, %xmm1, %xmm5
        vmulsd  %xmm9, %xmm1, %xmm14
        vaddsd  %xmm3, %xmm0, %xmm8
        vfmadd132sd     %xmm2, %xmm4, %xmm8
        vmovsd  %xmm8, -16(%rax,%r10,8)
        vaddsd  %xmm9, %xmm8, %xmm12
        vmovsd  -8(%r9,%r10,8), %xmm10
        vaddsd  -8(%r8,%r10,8), %xmm10, %xmm11
        vaddsd  %xmm12, %xmm11, %xmm7
        vfmadd132sd     %xmm2, %xmm5, %xmm7
        vmovsd  %xmm7, -8(%rax,%r10,8)
        leaq    6(%rdi), %r10
        addq    $8, %rdi
        vmovsd  -8(%rax,%r10,8), %xmm13
        vmovsd  -16(%r8,%r10,8), %xmm15
        vaddsd  -16(%r9,%r10,8), %xmm15, %xmm6
        vaddsd  %xmm7, %xmm13, %xmm4
        vmovsd  (%rax,%r10,8), %xmm7
        vmulsd  %xmm13, %xmm1, %xmm8
        vaddsd  %xmm4, %xmm6, %xmm3
        vfmadd132sd     %xmm2, %xmm14, %xmm3
        vmovsd  %xmm3, -16(%rax,%r10,8)
        vaddsd  %xmm7, %xmm3, %xmm5
        vmovsd  -8(%r9,%r10,8), %xmm0
        vaddsd  -8(%r8,%r10,8), %xmm0, %xmm9
        vaddsd  %xmm5, %xmm9, %xmm0
        vfmadd132sd     %xmm2, %xmm8, %xmm0
        vmovapd %xmm0, %xmm3
        vmovsd  %xmm0, -8(%rax,%r10,8)
        cmpq    %rcx, %rdi
        jne     .L9

The number of v*sd instructions matches across all 3 versions in proportion to the
unroll factor, but the version without -funroll-loops has a weird extra move in
there (the movslq %edi, %rax) even though %rax isn't used anywhere else in the
loop, and the -funroll-loops version is just too weird: it emits many leaq insns
where the constant could instead be folded into the displacements of the
corresponding addresses (e.g. after the leaq 2(%rdi), %r14, the access
-16(%rax,%r14,8) could simply be (%rax,%rdi,8)).
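
Just for illustration, the trunk codegen without -funroll-loops corresponds to
something like the following manual 2x unroll of the inner loop from the sketch
above (again only a sketch; remainder iterations omitted):

  for (j = 1; j < N - 2; j += 2)
    {
      /* Gi[j-1] is the result stored by the previous iteration.  */
      Gi[j] = omega_over_four * (Gim1[j] + Gip1[j] + Gi[j - 1] + Gi[j + 1])
              + one_minus_omega * Gi[j];
      /* The Gi[j] just stored feeds directly into this statement.  */
      Gi[j + 1] = omega_over_four * (Gim1[j + 1] + Gip1[j + 1] + Gi[j] + Gi[j + 2])
                  + one_minus_omega * Gi[j + 1];
    }

Each store feeds the next statement's sum, so the add/FMA chain stays serial no
matter how far we unroll; unrolling only amortizes the loop control, which is why
the interesting differences are the addressing and scheduling ones noted above.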
