https://gcc.gnu.org/bugzilla/show_bug.cgi?id=79102
Bug ID: 79102
Summary: gcc fails to auto-vectorise the product of an array of
complex floats
Product: gcc
Version: 7.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: c
Assignee: unassigned at gcc dot gnu.org
Reporter: drraph at gmail dot com
Target Milestone: ---
Consider this simple piece of code.
#include <complex.h>
/* Reproducer: returns the product of the first 128 elements of x.
 * The accumulator p starts at 1.0 and is multiplied by each x[i] in
 * index order. The loop bound is the compile-time constant 128. */
complex float f(complex float x[]) {
complex float p = 1.0;
for (int i = 0; i < 128; i++)
p *= x[i];
return p;
}
If I compile it with -O3 -march=bdver2 -ffast-math I get
# GCC output for the complex-float f: a scalar loop handling one complex
# element (8 bytes) per iteration.
#   rdi  = pointer to the current element of x
#   rax  = end pointer
#   xmm2 = real part of the running product p (initialised from .LC1)
#   xmm1 = imaginary part of the running product p (initialised to zero)
# NOTE(review): the end pointer is rdi+256 and rdi advances by 8 per
# iteration, i.e. 32 iterations, whereas the C source above loops to 128.
# This listing was presumably produced from a 32-element variant - confirm.
f:
vmovss xmm2, DWORD PTR .LC1[rip]
vxorps xmm1, xmm1, xmm1
lea rax, [rdi+256]
.L2:
# xmm0 = second float of the element (imaginary part of x[i]).
vmovss xmm0, DWORD PTR [rdi+4]
# rdi is advanced early, so the first float of the same element (its real
# part) is addressed as [rdi-8] below.
add rdi, 8
# xmm3 = im(x) * re(p), xmm0 = im(x) * im(p), both from the old p.
vmulss xmm3, xmm0, xmm2
vmulss xmm0, xmm0, xmm1
# new im(p) = im(p) * re(x) + im(x) * re(p)
vfmadd132ss xmm1, xmm3, DWORD PTR [rdi-8]
# new re(p) = re(p) * re(x) - im(x) * im(p)
vfmsub132ss xmm2, xmm0, DWORD PTR [rdi-8]
cmp rax, rdi
jne .L2
# Write re(p) and im(p) to adjacent slots just below rsp (rsp itself is not
# adjusted) and reload them as one 64-bit value so both halves are returned
# together in xmm0.
vmovss DWORD PTR [rsp-8], xmm2
vmovss DWORD PTR [rsp-4], xmm1
vmovq xmm0, QWORD PTR [rsp-8]
ret
# 1065353216 == 0x3f800000, the IEEE-754 single-precision encoding of 1.0.
.LC1:
.long 1065353216
This is unvectorised code. However, if I do the same using float instead, that
is, with:
/* Reproducer (real-valued variant): returns the product of the first 32
 * elements of x, starting from 1.0.
 * The parameter n is accepted but never read; the loop bound is the
 * compile-time constant 32. */
float f(float x[], int n ) {
float p = 1.0;
for (int i = 0; i < 32; i++)
p *= x[i];
return p;
}
I get
# GCC output for the float f: fully unrolled and vectorised.
# Eight 16-byte groups (offsets 0..112 from rdi, 4 floats each = 32 floats)
# are multiplied lane-wise into xmm0, leaving four partial products.
vmovups xmm2, XMMWORD PTR [rdi]
vmulps xmm0, xmm2, XMMWORD PTR [rdi+16]
vmulps xmm0, xmm0, XMMWORD PTR [rdi+32]
vmulps xmm0, xmm0, XMMWORD PTR [rdi+48]
vmulps xmm0, xmm0, XMMWORD PTR [rdi+64]
vmulps xmm0, xmm0, XMMWORD PTR [rdi+80]
vmulps xmm0, xmm0, XMMWORD PTR [rdi+96]
vmulps xmm0, xmm0, XMMWORD PTR [rdi+112]
# Horizontal reduction: shift the upper two lanes down by 8 bytes and
# multiply, then shift lane 1 down by 4 bytes and multiply, so lane 0 of
# xmm0 ends up holding the product of all four partial products.
vpsrldq xmm1, xmm0, 8
vmulps xmm0, xmm0, xmm1
vpsrldq xmm1, xmm0, 4
vmulps xmm0, xmm0, xmm1
ret
This is vectorised.
As a test I also tried the Intel C compiler version 17. In this case, however, the
assembly you get using complex float is vectorised, giving:
# Intel compiler output for the complex-float f: a vectorised loop that
# handles four complex elements per iteration, followed by a one-element-
# at-a-time remainder loop.
#   rdi  = x
#   rax  = current element index in the vector loop
#   rdx  = first (x & 15), then the vector-loop end index, which is also the
#          start index of the remainder loop
#   xmm0 = running product(s); each 64-bit half holds one complex value as
#          {real, imag}
# Complex multiply pattern used throughout (per 64-bit half, a * b):
#   shufps ..., 160  selects lanes {0,0,2,2} -> real part duplicated
#   shufps ..., 245  selects lanes {1,1,3,3} -> imaginary part duplicated
#   xorps with .L_2il0floatpacket.1 flips the sign of the imaginary lanes
#   shufps ..., 177  selects lanes {1,0,3,2} -> swaps real/imag in each pair
#   so {re(a)*re(b), re(a)*im(b)} + {-im(a)*im(b), im(a)*re(b)} = a * b.
f:
mov rdx, rdi #4.3
and rdx, 15 #4.3
# xmm0 low half = {1.0f, 0.0f}, the initial value of p.
movsd xmm0, QWORD PTR p.152.0.0.1[rip] #3.19
# x & 15 == 0: go straight to the vector setup with start index 0.
test dl, dl #4.3
je ..B1.4 # Prob 50% #4.3
# x is not a multiple of 8: skip the vector loop entirely (see ..B1.12).
test dl, 7 #4.3
jne ..B1.12 # Prob 10% #4.3
# Otherwise x is 8 mod 16: take x[0] as the running product and start the
# vector loop at index 1.
movsd xmm0, QWORD PTR [rdi] #5.10
mov dl, 1 #4.3
..B1.4: # Preds ..B1.3 ..B1.1
# rax = start index (0 or 1); rdx = 128 - ((-start) & 3), i.e. 128 or 125.
movzx eax, dl #4.3
neg dl #4.3
and dl, 3 #4.3
movzx edx, dl #4.3
movss xmm1, DWORD PTR .L_2il0floatpacket.0[rip] #3.19
neg rdx #4.3
# Upper half of xmm0 = {1.0f, 0.0f}: a second accumulator starting at 1.
movlhps xmm0, xmm1 #3.19
add rdx, 128 #4.3
..B1.5: # Preds ..B1.5 ..B1.4
# First pair: xmm2 = xmm0 * {x[rax], x[rax+1]} (complex, per half).
movaps xmm2, xmm0 #5.5
movups xmm1, XMMWORD PTR [rdi+rax*8] #5.10
shufps xmm2, xmm0, 160 #5.5
mulps xmm2, xmm1 #5.5
xorps xmm1, XMMWORD PTR .L_2il0floatpacket.1[rip] #5.5
shufps xmm1, xmm1, 177 #5.5
shufps xmm0, xmm0, 245 #5.5
mulps xmm1, xmm0 #5.5
# The load for the second pair is issued before rax is advanced.
movups xmm3, XMMWORD PTR [16+rdi+rax*8] #5.10
add rax, 4 #4.3
addps xmm2, xmm1 #5.5
# Second pair: xmm0 = xmm2 * {x[rax+2], x[rax+3]} (indices before the add).
movaps xmm0, xmm2 #5.5
shufps xmm0, xmm2, 160 #5.5
mulps xmm0, xmm3 #5.5
xorps xmm3, XMMWORD PTR .L_2il0floatpacket.1[rip] #5.5
shufps xmm3, xmm3, 177 #5.5
shufps xmm2, xmm2, 245 #5.5
mulps xmm3, xmm2 #5.5
addps xmm0, xmm3 #5.5
cmp rax, rdx #4.3
jb ..B1.5 # Prob 99% #4.3
# Reduction: copy the upper accumulator into both halves of xmm1 and
# complex-multiply it with xmm0; the low half of xmm0 then holds the
# product of the two accumulators.
movaps xmm1, xmm0 #3.19
movhlps xmm1, xmm0 #3.19
movaps xmm2, xmm1 #3.19
shufps xmm2, xmm1, 160 #3.19
mulps xmm2, xmm0 #3.19
xorps xmm0, XMMWORD PTR .L_2il0floatpacket.1[rip] #3.19
shufps xmm0, xmm0, 177 #3.19
shufps xmm1, xmm1, 245 #3.19
mulps xmm0, xmm1 #3.19
addps xmm0, xmm2 #3.19
..B1.7: # Preds ..B1.6 ..B1.12
# Remainder: nothing left to do when rdx has already reached 128.
cmp rdx, 128 #4.3
jae ..B1.11 # Prob 0% #4.3
..B1.9: # Preds ..B1.7 ..B1.9
# One element per iteration: xmm0 = xmm0 * x[rdx] using the same
# shuffle/xor/multiply/add pattern on the low 64 bits.
movsd xmm1, QWORD PTR [rdi+rdx*8] #5.10
inc rdx #4.3
movaps xmm2, xmm1 #5.5
shufps xmm2, xmm1, 160 #5.5
mulps xmm2, xmm0 #5.5
xorps xmm0, XMMWORD PTR .L_2il0floatpacket.1[rip] #5.5
shufps xmm0, xmm0, 177 #5.5
shufps xmm1, xmm1, 245 #5.5
mulps xmm0, xmm1 #5.5
addps xmm0, xmm2 #5.5
cmp rdx, 128 #4.3
jb ..B1.9 # Prob 99% #4.3
..B1.11: # Preds ..B1.9 ..B1.7
ret #6.10
..B1.12: # Preds ..B1.2
# Fallback taken when x is not a multiple of 8: rdx = 0, so the remainder
# loop processes all 128 elements.
xor edx, edx #4.3
jmp ..B1.7 # Prob 100% #4.3
# Initial value of p: {1.0f, 0.0f}.
p.152.0.0.1:
.long 0x3f800000,0x00000000
# Sign-bit mask for lanes 1 and 3 (the imaginary lanes).
.L_2il0floatpacket.1:
.long 0x00000000,0x80000000,0x00000000,0x80000000
# 1.0f.
.L_2il0floatpacket.0:
.long 0x3f800000