https://gcc.gnu.org/bugzilla/show_bug.cgi?id=79709
Bug ID: 79709
Summary: Suboptimal code with -mavx and explicit vector
Product: gcc
Version: 7.0.1
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: target
Assignee: unassigned at gcc dot gnu.org
Reporter: tkoenig at gcc dot gnu.org
Target Milestone: ---

For the following code

typedef double v4do __attribute__((vector_size (32)));
typedef long int v4i __attribute__((vector_size (32)));

#define VSET(vect,val) \
  do { vect[0]=val; vect[1]=val; vect[2]=val; vect[3]=val; } while (0)

void foo (v4do cx, v4do cy, v4i *r)
{
  v4do x, y, xn, yn;
  v4i add, res;
  v4do two, four;

  VSET(res, 0L);
  VSET(two, 2.0);
  VSET(four, 4.0);
  x = cx;
  y = cy;

  while (1)
    {
      xn = x*x - y*y + cx;
      yn = two*x*y + cy;
      /* Per-lane compare: each lane of add is -1 (true) or 0 (false).  */
      add = xn+xn + yn*yn < four;
      res += add;
      if (add[0] == 0 || add[1] == 0 || add[2] || add[3])
        break;
      x = xn;
      y = yn;
    }
  *r = res;
}

gcc creates strange code. With 7.0.1 20170212 and "gcc -O3 -S -mavx v.c", the loop is translated into

.L14:
        vpextrq $1, %xmm2, %rax
        testq   %rax, %rax
        je      .L2
        vmovdqa -48(%rbp), %ymm5
        vextractf128    $0x1, %ymm5, %xmm2
        vmovq   %xmm2, %rax
        testq   %rax, %rax
        jne     .L2
        vpextrq $1, %xmm2, %rax
        vmovapd %ymm3, %ymm5
        testq   %rax, %rax
        jne     .L2
.L3:
        vmulpd  %ymm5, %ymm5, %ymm3
        vmulpd  %ymm8, %ymm5, %ymm5
        vsubpd  %ymm6, %ymm3, %ymm3
        vmulpd  %ymm4, %ymm5, %ymm4
        vaddpd  %ymm0, %ymm3, %ymm3
        vaddpd  %ymm1, %ymm4, %ymm4
        vaddpd  %ymm3, %ymm3, %ymm2
        vmulpd  %ymm4, %ymm4, %ymm6
        vaddpd  %ymm6, %ymm2, %ymm2
        vcmpltpd        %ymm7, %ymm2, %ymm5
        vmovapd %ymm5, -48(%rbp)
        vmovdqa -48(%rbp), %xmm5
        vpaddq  -112(%rbp), %xmm5, %xmm5
        vmovaps %xmm5, -80(%rbp)
        vmovdqa -32(%rbp), %xmm5
        vpaddq  -96(%rbp), %xmm5, %xmm2
        vmovaps %xmm2, -64(%rbp)
        vmovdqa -80(%rbp), %ymm2
        vmovdqa %ymm2, -112(%rbp)
        vmovdqa -48(%rbp), %xmm2
        vmovq   %xmm2, %rax
        testq   %rax, %rax
        jne     .L14

which contains quite a few unnecessary instructions for moving data around: the compare mask and the running count are spilled to the stack (-48(%rbp), -112(%rbp) and so on) and reloaded on every iteration instead of staying in registers.

By comparison, clang translates the inner loop to

.LBB0_1:                                # =>This Inner Loop Header: Depth=1
        vmulpd  %ymm5, %ymm5, %ymm6
        vmulpd  %ymm4, %ymm4, %ymm7
        vsubpd  %ymm7, %ymm6, %ymm6
        vaddpd  %ymm5, %ymm5, %ymm7
        vaddpd  %ymm0, %ymm6, %ymm5
        vmulpd  %ymm7, %ymm4, %ymm4
        vaddpd  %ymm1, %ymm4, %ymm4
        vaddpd  %ymm5, %ymm5, %ymm6
        vmulpd  %ymm4, %ymm4, %ymm7
        vaddpd  %ymm7, %ymm6, %ymm6
        vcmpltpd        %ymm8, %ymm6, %ymm6
        vextractf128    $1, %ymm6, %xmm7
        vextractf128    $1, %ymm2, %xmm3
        vpaddq  %xmm3, %xmm7, %xmm3
        vpaddq  %xmm2, %xmm6, %xmm2
        vinsertf128     $1, %xmm3, %ymm2, %ymm2
        vmovq   %xmm7, %rax
        vpextrq $1, %xmm7, %rcx
        orq     %rax, %rcx
        jne     .LBB0_4
# BB#2:                                 # in Loop: Header=BB0_1 Depth=1
        vpextrq $1, %xmm6, %rax
        testq   %rax, %rax
        je      .LBB0_4
# BB#3:                                 # in Loop: Header=BB0_1 Depth=1
        vmovq   %xmm6, %rax
        testq   %rax, %rax
        jne     .LBB0_1

which looks much more straightforward and should be faster: everything stays in registers. (Plain -mavx has no 256-bit vpaddq, so both compilers have to split the long-int addition into two 128-bit halves; clang at least does so without going through the stack.)
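Incidentally, all of those per-lane movq/vpextrq tests encode the same information as the sign-bit mask of the compare result, so the exit condition could in principle be collected with a single vmovmskpd. A minimal sketch using the AVX intrinsic _mm256_movemask_pd (the helper name and the cast are just for illustration, not part of the testcase):

#include <immintrin.h>

typedef long int v4i __attribute__((vector_size (32)));

/* Sketch: gather the sign bit of each 64-bit lane with one
   vmovmskpd.  The vector compare yields -1 or 0 per lane, so
   bit i of the mask is 1 exactly when add[i] != 0.  */
static inline int
exit_now (v4i add)
{
  int m = _mm256_movemask_pd ((__m256d) add);
  /* Same condition as in foo:
     add[0] == 0 || add[1] == 0 || add[2] || add[3].  */
  return !(m & 1) || !(m & 2) || (m & 4) || (m & 8);
}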
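To run the testcase standalone, a minimal driver might look like this (the input values are arbitrary, chosen only so that the loop exits after the first iteration):

#include <stdio.h>

typedef double v4do __attribute__((vector_size (32)));
typedef long int v4i __attribute__((vector_size (32)));

void foo (v4do cx, v4do cy, v4i *r);

int main (void)
{
  /* Arbitrary inputs; lane 2 passes the compare on the first
     iteration, so add[2] != 0 and foo returns immediately.  */
  v4do cx = { -0.5, 0.25, 0.3, 1.5 };
  v4do cy = {  0.6, 0.25, 0.4, 1.5 };
  v4i r;

  foo (cx, cy, &r);
  printf ("%ld %ld %ld %ld\n", r[0], r[1], r[2], r[3]);
  return 0;
}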