https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61338
Bug ID: 61338
Summary: too many permutation in a vectorized "reverse loop"
Product: gcc
Version: 4.9.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: tree-optimization
Assignee: unassigned at gcc dot gnu.org
Reporter: vincenzo.innocente at cern dot ch
In this example gcc generates 4 permutations for foo (while none is required).
On the positive side, the code for bar (which is a more realistic use case)
seems optimal.
// Three global arrays of 1024 floats each, used by both foo() and bar() below.
float x[1024];
float y[1024];
float z[1024];
// Test case 1: all three arrays are walked downwards as i increases.
// x and y are indexed by 1023-i (1023 down to 512), z by 512-i (512 down to 1).
// Runs 512 iterations, accumulating y*z into x.
void foo() {
for (int i=0; i<512; ++i)
x[1023-i] += y[1023-i]*z[512-i];
}
// Test case 2: only x is walked downwards (1023-i: 1023 down to 512).
// y is indexed by i (0 up to 511) and z by i+512 (512 up to 1023), so the
// destination runs in the opposite direction to both sources.
void bar() {
for (int i=0; i<512; ++i)
x[1023-i] += y[i]*z[i+512];
}
c++ -Ofast -march=haswell -S revloop.cc; cat revloop.s
# Compiler output for foo() (mangled name __Z3foov), x86-64 AT&T syntax.
# Register roles:
#   ymm2 = 32-byte constant loaded from LC0, used as the index operand of every vpermd
#          (LC0's contents are not shown in this listing; presumably the
#          lane-reversal indices -- confirm against the full .s file)
#   rax  = byte offset; starts at 0 and decreases by 32 each iteration
#   rdx  = _x + 4064 bytes = &x[1016]
#   rsi  = _y + 4064 bytes = &y[1016]
#   rcx  = _z + 2020 bytes = &z[505]
__Z3foov:
LFB0:
vmovdqa LC0(%rip), %ymm2
xorl %eax, %eax
leaq 4064+_x(%rip), %rdx
leaq 4064+_y(%rip), %rsi
leaq 2020+_z(%rip), %rcx
.align 4,0x90
L2:
# Permutations 1-3: load 8 floats each from x, z and y through vpermd.
vpermd (%rdx,%rax), %ymm2, %ymm0
vpermd (%rcx,%rax), %ymm2, %ymm1
vpermd (%rsi,%rax), %ymm2, %ymm3
# ymm0 = ymm3 * ymm1 + ymm0, i.e. x += y * z on the permuted lanes.
vfmadd231ps %ymm1, %ymm3, %ymm0
# Permutation 4: permute the result again, then store it back to the same
# x address the first vpermd read from.
vpermd %ymm0, %ymm2, %ymm0
vmovaps %ymm0, (%rdx,%rax)
# Step 32 bytes (8 floats) downwards; the loop exits once the offset reaches
# -2048, i.e. after 2048/32 = 64 iterations (64 * 8 = 512 elements).
subq $32, %rax
cmpq $-2048, %rax
jne L2
vzeroupper
ret
LFE0:
# Section switches and labels emitted after the function body.
# NOTE(review): LCOLDE1 / LHOTE1 look like the compiler's cold/hot end markers
# for this function -- confirm; no instructions are placed in __text_cold here.
.section __TEXT,__text_cold,regular,pure_instructions
LCOLDE1:
.text
LHOTE1:
# Section switches and labels emitted before the function body.
# NOTE(review): LCOLDB2 / LHOTB2 look like the compiler's cold/hot begin markers
# for this function -- confirm; no instructions are placed in __text_cold here.
.section __TEXT,__text_cold,regular,pure_instructions
LCOLDB2:
.text
LHOTB2:
.align 4,0x90
.globl __Z3barv
# Compiler output for bar() (mangled name __Z3barv), x86-64 AT&T syntax.
# Register roles:
#   ymm1 = 32-byte constant loaded from LC0, used as the index operand of both vpermd
#          (LC0's contents are not shown in this listing; presumably the
#          lane-reversal indices -- confirm against the full .s file)
#   rdx  = _z + 2048 bytes = &z[512], advances by 32 per iteration
#   rcx  = _y            = &y[0],   advances by 32 per iteration
#   rax  = _x + 4064 bytes = &x[1016], retreats by 32 per iteration
#   rsi  = _z + 4096 bytes = one past the end of z; loop bound for rdx
__Z3barv:
LFB1:
vmovdqa LC0(%rip), %ymm1
leaq 2048+_z(%rip), %rdx
leaq _y(%rip), %rcx
leaq 4064+_x(%rip), %rax
leaq 4096+_z(%rip), %rsi
.align 4,0x90
L6:
# Plain (unpermuted) load of 8 floats from z.
vmovaps (%rdx), %ymm2
addq $32, %rdx
# Permutation 1: load 8 floats from x through vpermd.
vpermd (%rax), %ymm1, %ymm0
addq $32, %rcx
# ymm0 = ymm2 * y + ymm0; -32(%rcx) undoes the addq above, so this reads y at
# the pointer value from the start of the iteration, without a permutation.
vfmadd231ps -32(%rcx), %ymm2, %ymm0
subq $32, %rax
# Permutation 2: permute the result again; 32(%rax) undoes the subq above, so
# the store goes back to the x address that was loaded in this iteration.
vpermd %ymm0, %ymm1, %ymm0
vmovaps %ymm0, 32(%rax)
# Loop until the z pointer reaches the end of the array:
# (4096 - 2048) / 32 = 64 iterations (64 * 8 = 512 elements).
cmpq %rsi, %rdx
jne L6
vzeroupper
ret
LFE1: