https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81303
--- Comment #2 from Richard Biener <rguenth at gcc dot gnu.org> ---
Without peeling for alignment the numbers improve, but we still regress from
176s to 205s. The innermost (unrolled) vectorized loop is:
.L11:
vmovsd (%rdi,%r15,2), %xmm2
vmovsd (%rsi,%r15,2), %xmm1
movq -56(%rbp), %rbx
vmovhpd (%rdi,%r14), %xmm2, %xmm0
vmovsd (%rdi), %xmm2
vmovhpd (%rsi,%r14), %xmm1, %xmm6
vmovsd (%rsi), %xmm1
vmovhpd (%rdi,%r15), %xmm2, %xmm2
vmovhpd (%rsi,%r15), %xmm1, %xmm1
addq %r11, %rdi
addq %r11, %rsi
vinsertf128 $0x1, %xmm0, %ymm2, %ymm2
vmovsd (%rcx,%r15,2), %xmm0
vinsertf128 $0x1, %xmm6, %ymm1, %ymm1
vmulpd (%rbx,%rax), %ymm2, %ymm3
movq -72(%rbp), %rbx
vmovapd %ymm1, %ymm2
vmovhpd (%rcx,%r14), %xmm0, %xmm6
vmovsd (%rcx), %xmm0
vfmadd132pd (%rbx,%rax), %ymm3, %ymm2
movq -88(%rbp), %rbx
vmovhpd (%rcx,%r15), %xmm0, %xmm0
addq %r11, %rcx
vinsertf128 $0x1, %xmm6, %ymm0, %ymm0
vmulpd (%rbx,%rax), %ymm0, %ymm3
vmovsd (%rdx,%r15,2), %xmm0
movq -64(%rbp), %rbx
vmovhpd (%rdx,%r14), %xmm0, %xmm6
vmovsd (%rdx), %xmm0
vmovhpd (%rdx,%r15), %xmm0, %xmm0
addq %r11, %rdx
vinsertf128 $0x1, %xmm6, %ymm0, %ymm0
vfmadd132pd (%rbx,%rax), %ymm3, %ymm0
vaddpd %ymm0, %ymm2, %ymm1
vmovsd (%r9,%r15,2), %xmm0
vmovhpd (%r9,%r14), %xmm0, %xmm3
vmovsd (%r9), %xmm0
vmovhpd (%r9,%r15), %xmm0, %xmm0
addq %r11, %r9
vinsertf128 $0x1, %xmm3, %ymm0, %ymm0
vmulpd (%r12,%rax), %ymm0, %ymm2
vmovsd (%r8,%r15,2), %xmm0
vmovhpd (%r8,%r14), %xmm0, %xmm3
vmovsd (%r8), %xmm0
vmovhpd (%r8,%r15), %xmm0, %xmm0
movq -80(%rbp), %rbx
addq %r11, %r8
vinsertf128 $0x1, %xmm3, %ymm0, %ymm0
vfmadd132pd (%rbx,%rax), %ymm2, %ymm0
vaddpd %ymm0, %ymm1, %ymm0
vmovsd (%r10,%r15,2), %xmm1
vmovhpd (%r10,%r14), %xmm1, %xmm2
vmovsd (%r10), %xmm1
vmovhpd (%r10,%r15), %xmm1, %xmm1
addq %r11, %r10
vinsertf128 $0x1, %xmm2, %ymm1, %ymm1
vfmadd231pd 0(%r13,%rax), %ymm1, %ymm4
addq $32, %rax
vaddpd %ymm4, %ymm0, %ymm4
cmpq -96(%rbp), %rax
jne .L11
versus the scalar loop:
.L10:
vmovsd (%rax,%rbx,8), %xmm0
vmulsd (%r15,%rdx), %xmm0, %xmm0
vmovsd (%r8,%rdx), %xmm1
vfmadd132sd (%rax,%r11,8), %xmm0, %xmm1
vmovsd (%rax,%rsi,8), %xmm0
vmulsd (%r12,%rdx), %xmm0, %xmm0
vmovsd 0(%rbp,%rdx), %xmm4
vfmadd231sd (%rax), %xmm4, %xmm0
vmovsd (%r14,%rdx), %xmm5
vmovsd (%rdi,%rdx), %xmm6
vfmadd231sd (%rax,%r9,8), %xmm6, %xmm2
vaddsd %xmm0, %xmm1, %xmm0
vmovsd (%rax,%r10,8), %xmm1
vmulsd 0(%r13,%rdx), %xmm1, %xmm1
vfmadd231sd (%rax,%rcx,8), %xmm5, %xmm1
addq -112(%rsp), %rdx
addq $8, %rax
vaddsd %xmm2, %xmm1, %xmm2
vaddsd %xmm2, %xmm0, %xmm2
cmpq -120(%rsp), %rax
jne .L10
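
For reference, the vmovsd/vmovhpd/vinsertf128 groups in the vectorized loop
are how the vectorizer assembles a 256-bit vector from a strided stream it
cannot fetch with a single vector load: four scalar loads plus one insert per
ymm. A minimal sketch of a loop with that access shape (hypothetical names,
not the actual benchmark source):

/* One term of a strided reduction: because x is accessed with a runtime
   stride, each four-lane vector must be built element-wise (vmovsd +
   vmovhpd pairs, then vinsertf128), the pattern seen in the dump above.
   The contiguous stream a can use a plain full-width vector load.  */
double
strided_term (const double *restrict a, const double *restrict x,
              long n, long stride)
{
  double sum = 0.0;
  for (long i = 0; i < n; i++)
    sum += a[i] * x[i * stride];
  return sum;
}

Both dumps accumulate seven such product terms per iteration (three mul plus
four fma), which is where the many live base registers come from.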
It looks like register pressure is high and IVOPTs (induction-variable
optimization) doesn't do the best job either. The vectorized loop might also
run into CPU microarchitectural limits with respect to the loop cache (it is
310 bytes long).
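
To illustrate the register-pressure point, consider a hypothetical
seven-term shape (again, not the benchmark source): seven strided input
streams each need a live base pointer advanced every iteration (rdi, rsi,
rcx, rdx, r9, r8, r10 above), and the contiguous coefficient streams need
bases too (r12, r13 and the reloaded rbx). Together with the stride
registers and counters that is more live integer values than x86-64's 16
GPRs can hold (rsp, and here rbp, are unavailable), so some pointers get
spilled and reloaded from the stack each iteration, hence the
movq -56(%rbp), %rbx style instructions inside the loop.

/* Hypothetical seven-term shape: once the inner loop is fully unrolled
   (which the compiler typically does for a constant 7-trip loop),
   fourteen base pointers are live at once, more than the GPR file can
   hold, so the register allocator has to spill.  */
double
seven_terms (const double *c[7], const double *x[7], long n, long stride)
{
  double sum = 0.0;
  for (long i = 0; i < n; i++)
    for (int t = 0; t < 7; t++)   /* expect full unrolling at -O3 */
      sum += c[t][i] * x[t][i * stride];
  return sum;
}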