https://llvm.org/bugs/show_bug.cgi?id=31677
Bug ID: 31677
Summary: clang/llvm fails to vectorize the product of a complex array
Product: libraries
Version: trunk
Hardware: PC
OS: Linux
Status: NEW
Severity: normal
Priority: P
Component: Loop Optimizer
Assignee: unassignedb...@nondot.org
Reporter: drr...@gmail.com
CC: llvm-bugs@lists.llvm.org
Classification: Unclassified

Consider this simple piece of code, which takes the product of an array of complex numbers.

#include <complex.h>
complex float f(complex float x[]) {
  complex float p = 1.0;
  for (int i = 0; i < 32; i++)
    p *= x[i];
  return p;
}

If I compile it with -O3 -march=bdver2 -ffast-math using clang 3.9.1, I get unvectorised assembly.

.LCPI0_0:
        .long   1065353216              # float 1
f:                                      # @f
        vxorps  xmm1, xmm1, xmm1
        vmovss  xmm0, dword ptr [rip + .LCPI0_0] # xmm0 = mem[0],zero,zero,zero
        xor     eax, eax
.LBB0_1:                                # =>This Inner Loop Header: Depth=1
        vmovss  xmm2, dword ptr [rdi + 8*rax] # xmm2 = mem[0],zero,zero,zero
        vmovss  xmm3, dword ptr [rdi + 8*rax + 4] # xmm3 = mem[0],zero,zero,zero
        vmulss  xmm4, xmm2, xmm1
        vmulss  xmm5, xmm3, xmm1
        vfmaddss        xmm1, xmm3, xmm0, xmm4
        vfmsubss        xmm0, xmm2, xmm0, xmm5
        inc     rax
        cmp     rax, 32
        jne     .LBB0_1
        vinsertps       xmm0, xmm0, xmm1, 16 # xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
        ret

As a test, I also tried icc (the Intel Compiler), which does appear to give vectorised code.
f:
        xor       eax, eax                                      #4.3
        movups    xmm1, XMMWORD PTR .L_2il0floatpacket.0[rip]   #3.19
..B1.2:                         # Preds ..B1.2 ..B1.1
        movups    xmm0, XMMWORD PTR [rdi+rax*8]                 #5.10
        movups    xmm4, XMMWORD PTR [32+rdi+rax*8]              #5.10
        movups    xmm6, XMMWORD PTR [48+rdi+rax*8]              #5.10
        movups    xmm8, XMMWORD PTR [64+rdi+rax*8]              #5.10
        movups    xmm10, XMMWORD PTR [80+rdi+rax*8]             #5.10
        movups    xmm12, XMMWORD PTR [96+rdi+rax*8]             #5.10
        movups    xmm14, XMMWORD PTR [112+rdi+rax*8]            #5.10
        movaps    xmm2, xmm0                                    #5.5
        shufps    xmm2, xmm0, 160                               #5.5
        mulps     xmm2, xmm1                                    #5.5
        xorps     xmm1, XMMWORD PTR .L_2il0floatpacket.1[rip]   #5.5
        shufps    xmm1, xmm1, 177                               #5.5
        shufps    xmm0, xmm0, 245                               #5.5
        mulps     xmm1, xmm0                                    #5.5
        addps     xmm2, xmm1                                    #5.5
        movups    xmm1, XMMWORD PTR [16+rdi+rax*8]              #5.10
        movaps    xmm3, xmm2                                    #5.5
        add       rax, 16                                       #4.3
        shufps    xmm3, xmm2, 160                               #5.5
        mulps     xmm3, xmm1                                    #5.5
        xorps     xmm1, XMMWORD PTR .L_2il0floatpacket.1[rip]   #5.5
        shufps    xmm1, xmm1, 177                               #5.5
        shufps    xmm2, xmm2, 245                               #5.5
        mulps     xmm1, xmm2                                    #5.5
        addps     xmm3, xmm1                                    #5.5
        movaps    xmm5, xmm3                                    #5.5
        shufps    xmm5, xmm3, 160                               #5.5
        mulps     xmm5, xmm4                                    #5.5
        xorps     xmm4, XMMWORD PTR .L_2il0floatpacket.1[rip]   #5.5
        shufps    xmm4, xmm4, 177                               #5.5
        shufps    xmm3, xmm3, 245                               #5.5
        mulps     xmm4, xmm3                                    #5.5
        addps     xmm5, xmm4                                    #5.5
        movaps    xmm7, xmm5                                    #5.5
        shufps    xmm7, xmm5, 160                               #5.5
        mulps     xmm7, xmm6                                    #5.5
        xorps     xmm6, XMMWORD PTR .L_2il0floatpacket.1[rip]   #5.5
        shufps    xmm6, xmm6, 177                               #5.5
        shufps    xmm5, xmm5, 245                               #5.5
        mulps     xmm6, xmm5                                    #5.5
        addps     xmm7, xmm6                                    #5.5
        movaps    xmm9, xmm7                                    #5.5
        shufps    xmm9, xmm7, 160                               #5.5
        mulps     xmm9, xmm8                                    #5.5
        xorps     xmm8, XMMWORD PTR .L_2il0floatpacket.1[rip]   #5.5
        shufps    xmm8, xmm8, 177                               #5.5
        shufps    xmm7, xmm7, 245                               #5.5
        mulps     xmm8, xmm7                                    #5.5
        addps     xmm9, xmm8                                    #5.5
        movaps    xmm11, xmm9                                   #5.5
        shufps    xmm11, xmm9, 160                              #5.5
        mulps     xmm11, xmm10                                  #5.5
        xorps     xmm10, XMMWORD PTR .L_2il0floatpacket.1[rip]  #5.5
        shufps    xmm10, xmm10, 177                             #5.5
        shufps    xmm9, xmm9, 245                               #5.5
        mulps     xmm10, xmm9                                   #5.5
        addps     xmm11, xmm10                                  #5.5
        movaps    xmm13, xmm11                                  #5.5
        shufps    xmm13, xmm11, 160                             #5.5
        mulps     xmm13, xmm12                                  #5.5
        xorps     xmm12, XMMWORD PTR .L_2il0floatpacket.1[rip]  #5.5
        shufps    xmm12, xmm12, 177                             #5.5
        shufps    xmm11, xmm11, 245                             #5.5
        mulps     xmm12, xmm11                                  #5.5
        addps     xmm13, xmm12                                  #5.5
        movaps    xmm1, xmm13                                   #5.5
        shufps    xmm1, xmm13, 160                              #5.5
        mulps     xmm1, xmm14                                   #5.5
        xorps     xmm14, XMMWORD PTR .L_2il0floatpacket.1[rip]  #5.5
        shufps    xmm14, xmm14, 177                             #5.5
        shufps    xmm13, xmm13, 245                             #5.5
        mulps     xmm14, xmm13                                  #5.5
        addps     xmm1, xmm14                                   #5.5
        cmp       rax, 32                                       #4.3
        jb        ..B1.2        # Prob 96%                      #4.3
        movaps    xmm2, xmm1                                    #3.19
        movhlps   xmm2, xmm1                                    #3.19
        movaps    xmm0, xmm2                                    #3.19
        shufps    xmm0, xmm2, 160                               #3.19
        mulps     xmm0, xmm1                                    #3.19
        xorps     xmm1, XMMWORD PTR .L_2il0floatpacket.1[rip]   #3.19
        shufps    xmm1, xmm1, 177                               #3.19
        shufps    xmm2, xmm2, 245                               #3.19
        mulps     xmm1, xmm2                                    #3.19
        addps     xmm0, xmm1                                    #3.19
        ret                                                     #6.10
.L_2il0floatpacket.0:
        .long     0x3f800000,0x00000000,0x3f800000,0x00000000
.L_2il0floatpacket.1:
        .long     0x00000000,0x80000000,0x00000000,0x80000000

Interestingly, clang *can* vectorise the same loop when it computes a sum instead of a product:

#include <complex.h>
complex float f(complex float x[]) {
  complex float p = 1.0;
  for (int i = 0; i < 32; i++)
    p += x[i]; /* <--- + instead of * */
  return p;
}

--
You are receiving this mail because:
You are on the CC list for the bug.
_______________________________________________ llvm-bugs mailing list llvm-bugs@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs