https://gcc.gnu.org/bugzilla/show_bug.cgi?id=69564
--- Comment #14 from Jakub Jelinek <jakub at gcc dot gnu.org> ---

With -Ofast -g -march=native -funroll-loops we actually (for C) beat clang on
LU.  The SOR difference is most likely an IVOPTs/scheduling/RA thing; there is
nothing to actually vectorize because of the dependencies.
clang 3.8 (no matter if -funroll-loops is used or not) unrolls the main SOR
loop 4 times and we get roughly:
        .align  16, 0x90
.LBB1_12:
        vmovsd  -24(%r13,%rax,8), %xmm4
        vaddsd  -24(%r12,%rax,8), %xmm4, %xmm4
        vmovsd  -24(%rbx,%rax,8), %xmm5
        vaddsd  %xmm5, %xmm3, %xmm3
        vaddsd  %xmm3, %xmm4, %xmm3
        vmulsd  %xmm0, %xmm2, %xmm2
        vfmadd231sd     %xmm3, %xmm1, %xmm2
        vmovsd  %xmm2, -32(%rbx,%rax,8)
        vmovsd  -16(%r13,%rax,8), %xmm3
        vaddsd  -16(%r12,%rax,8), %xmm3, %xmm3
        vmovsd  -16(%rbx,%rax,8), %xmm4
        vaddsd  %xmm4, %xmm3, %xmm3
        vaddsd  %xmm3, %xmm2, %xmm2
        vmulsd  %xmm0, %xmm5, %xmm3
        vfmadd231sd     %xmm2, %xmm1, %xmm3
        vmovsd  %xmm3, -24(%rbx,%rax,8)
        vmovsd  -8(%r13,%rax,8), %xmm2
        vaddsd  -8(%r12,%rax,8), %xmm2, %xmm2
        vmovsd  -8(%rbx,%rax,8), %xmm5
        vaddsd  %xmm5, %xmm2, %xmm2
        vaddsd  %xmm2, %xmm3, %xmm2
        vmulsd  %xmm0, %xmm4, %xmm3
        vfmadd231sd     %xmm2, %xmm1, %xmm3
        vmovsd  %xmm3, -16(%rbx,%rax,8)
        vmovsd  (%r13,%rax,8), %xmm2
        vaddsd  (%r12,%rax,8), %xmm2, %xmm4
        vmovsd  (%rbx,%rax,8), %xmm2
        vaddsd  %xmm2, %xmm4, %xmm4
        vaddsd  %xmm4, %xmm3, %xmm4
        vmulsd  %xmm0, %xmm5, %xmm3
        vfmadd231sd     %xmm4, %xmm1, %xmm3
        vmovsd  %xmm3, -8(%rbx,%rax,8)
        addq    $4, %rax
        cmpl    %eax, %r15d
        jne     .LBB1_12
gcc -Ofast -g -march=native unrolls just twice:
        .p2align 4,,10
        .p2align 3
.L9:
        vmovsd  -8(%r8,%rdi,8), %xmm5
        vmovsd  -16(%r9,%rdi,8), %xmm1
        vmulsd  %xmm4, %xmm2, %xmm4
        movslq  %edi, %rax
        vaddsd  -16(%r10,%rdi,8), %xmm1, %xmm1
        vaddsd  %xmm0, %xmm5, %xmm0
        vmulsd  %xmm5, %xmm2, %xmm5
        vaddsd  %xmm0, %xmm1, %xmm0
        vmovapd %xmm0, %xmm1
        vfmadd132sd     %xmm3, %xmm4, %xmm1
        vmovsd  (%r8,%rdi,8), %xmm4
        vmovsd  %xmm1, -16(%r8,%rdi,8)
        vaddsd  %xmm4, %xmm1, %xmm1
        vmovsd  -8(%r10,%rdi,8), %xmm0
        vaddsd  -8(%r9,%rdi,8), %xmm0, %xmm0
        vaddsd  %xmm1, %xmm0, %xmm0
        vfmadd132sd     %xmm3, %xmm5, %xmm0
        vmovsd  %xmm0, -8(%r8,%rdi,8)
        addq    $2, %rdi
        cmpq    %rcx, %rdi
        jne     .L9
and with additional -funroll-loops we unroll 8 times instead:
.L9:
        vmovsd  -8(%rax,%rdi,8), %xmm6
        vmovsd  -16(%r8,%rdi,8), %xmm5
        vmulsd  %xmm7, %xmm1, %xmm8
        leaq    2(%rdi), %r14
        vaddsd  -16(%r9,%rdi,8), %xmm5, %xmm9
        vmovsd  (%rax,%rdi,8), %xmm12
        leaq    4(%rdi), %r10
        vaddsd  %xmm3, %xmm6, %xmm10
        vmulsd  %xmm6, %xmm1, %xmm7
        vmulsd  %xmm12, %xmm1, %xmm0
        vaddsd  %xmm10, %xmm9, %xmm11
        vfmadd132sd     %xmm2, %xmm8, %xmm11
        vmovsd  %xmm11, -16(%rax,%rdi,8)
        vaddsd  %xmm12, %xmm11, %xmm15
        vmovsd  -8(%r9,%rdi,8), %xmm13
        vaddsd  -8(%r8,%rdi,8), %xmm13, %xmm14
        vaddsd  %xmm15, %xmm14, %xmm4
        vfmadd132sd     %xmm2, %xmm7, %xmm4
        vmovsd  %xmm4, -8(%rax,%rdi,8)
        vmovsd  -8(%rax,%r14,8), %xmm6
        vmovsd  -16(%r8,%r14,8), %xmm3
        vaddsd  -16(%r9,%r14,8), %xmm3, %xmm8
        vmovsd  (%rax,%r14,8), %xmm10
        vaddsd  %xmm4, %xmm6, %xmm5
        vmulsd  %xmm6, %xmm1, %xmm11
        vmulsd  %xmm10, %xmm1, %xmm4
        vaddsd  %xmm5, %xmm8, %xmm9
        vfmadd132sd     %xmm2, %xmm0, %xmm9
        vmovsd  %xmm9, -16(%rax,%r14,8)
        vaddsd  %xmm10, %xmm9, %xmm13
        vmovsd  -8(%r9,%r14,8), %xmm12
        vaddsd  -8(%r8,%r14,8), %xmm12, %xmm7
        vaddsd  %xmm13, %xmm7, %xmm14
        vfmadd132sd     %xmm2, %xmm11, %xmm14
        vmovsd  %xmm14, -8(%rax,%r14,8)
        vmovsd  -8(%rax,%r10,8), %xmm15
        vmovsd  -16(%r8,%r10,8), %xmm6
        vaddsd  -16(%r9,%r10,8), %xmm6, %xmm0
        vmovsd  (%rax,%r10,8), %xmm9
        vaddsd  %xmm14, %xmm15, %xmm3
        vmulsd  %xmm15, %xmm1, %xmm5
        vmulsd  %xmm9, %xmm1, %xmm14
        vaddsd  %xmm3, %xmm0, %xmm8
        vfmadd132sd     %xmm2, %xmm4, %xmm8
        vmovsd  %xmm8, -16(%rax,%r10,8)
        vaddsd  %xmm9, %xmm8, %xmm12
        vmovsd  -8(%r9,%r10,8), %xmm10
        vaddsd  -8(%r8,%r10,8), %xmm10, %xmm11
        vaddsd  %xmm12, %xmm11, %xmm7
        vfmadd132sd     %xmm2, %xmm5, %xmm7
        vmovsd  %xmm7, -8(%rax,%r10,8)
        leaq    6(%rdi), %r10
        addq    $8, %rdi
        vmovsd  -8(%rax,%r10,8), %xmm13
        vmovsd  -16(%r8,%r10,8), %xmm15
        vaddsd  -16(%r9,%r10,8), %xmm15, %xmm6
        vaddsd  %xmm7, %xmm13, %xmm4
        vmovsd  (%rax,%r10,8), %xmm7
        vmulsd  %xmm13, %xmm1, %xmm8
        vaddsd  %xmm4, %xmm6, %xmm3
        vfmadd132sd     %xmm2, %xmm14, %xmm3
        vmovsd  %xmm3, -16(%rax,%r10,8)
        vaddsd  %xmm7, %xmm3, %xmm5
        vmovsd  -8(%r9,%r10,8), %xmm0
        vaddsd  -8(%r8,%r10,8), %xmm0, %xmm9
        vaddsd  %xmm5, %xmm9, %xmm0
        vfmadd132sd     %xmm2, %xmm8, %xmm0
        vmovapd %xmm0, %xmm3
        vmovsd  %xmm0, -8(%rax,%r10,8)
        cmpq    %rcx, %rdi
        jne     .L9
Scaled by the unroll factor, the number of v*sd instructions matches between
all 3 versions, but the version without -funroll-loops has a weird extra
instruction in there (the movslq %edi, %rax) whose result isn't used anywhere
in the loop, and the -funroll-loops version is just too weird: many leaq insns
where the constants could have been folded into the displacements of the
corresponding addressing modes instead.
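For reference, a minimal sketch of the SOR loop being discussed, assuming the
plain C SciMark 2 SOR_execute kernel (written out from memory here, not copied
verbatim from the benchmark sources):

/* Sketch of the SciMark 2 SOR kernel, not verbatim.  The Gi[j-1] operand
   reads the value stored in the previous j iteration, so the inner loop
   carries a true dependence and cannot be vectorized; only unrolling,
   scheduling and register allocation can make a difference here.  */
static void
SOR_execute (int M, int N, double omega, double **G, int num_iterations)
{
  double omega_over_four = omega * 0.25;
  double one_minus_omega = 1.0 - omega;
  int Mm1 = M - 1, Nm1 = N - 1;

  for (int p = 0; p < num_iterations; p++)
    for (int i = 1; i < Mm1; i++)
      {
        double *Gi = G[i], *Gim1 = G[i - 1], *Gip1 = G[i + 1];
        for (int j = 1; j < Nm1; j++)
          Gi[j] = omega_over_four * (Gim1[j] + Gip1[j] + Gi[j - 1] + Gi[j + 1])
                  + one_minus_omega * Gi[j];
      }
}

In the assembly above, the loads from the three distinct base registers would
then be Gim1[j], Gip1[j] and Gi[j+1], while Gi[j-1] and the old Gi[j] are
carried in registers from the previous iteration, which is the recurrence that
blocks vectorization.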