[Bug c/79102] New: gcc fails to auto-vectorise the product of an array of complex floats

drraph at gmail dot com Mon, 16 Jan 2017 06:11:40 -0800

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=79102


            Bug ID: 79102
           Summary: gcc fails to auto-vectorise the product of an array of
                    complex floats
           Product: gcc
           Version: 7.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: c
          Assignee: unassigned at gcc dot gnu.org
          Reporter: drraph at gmail dot com
  Target Milestone: ---

Consider this simple piece of code.

#include <complex.h>
/* Reproduction case: returns the product of the 128 complex floats
 * x[0]..x[127], accumulated into p starting from 1.0. */
complex float f(complex float x[]) {
  complex float p = 1.0;
  for (int i = 0; i < 128; i++)
    p *= x[i];  /* complex multiply; each iteration depends on the previous p */
  return p;
}

If I compile it with -O3 -march=bdver2 -ffast-math, I get

# Scalar complex-product loop as emitted by GCC (Intel syntax).
# In:  rdi = pointer to the array of 8-byte elements (two floats each).
# Out: xmm0 low 64 bits = {xmm2, xmm1} packed via the stack.
# xmm2 carries the running real part, xmm1 the running imaginary part.
# NOTE(review): the end pointer is rdi+256, i.e. 32 elements of 8 bytes,
# whereas the C source above loops 128 times; this listing was presumably
# generated with a loop bound of 32 -- confirm with the reporter.
f:
        vmovss  xmm2, DWORD PTR .LC1[rip]       # real accumulator = 1.0f (see .LC1)
        vxorps  xmm1, xmm1, xmm1                # imaginary accumulator = 0
        lea     rax, [rdi+256]                  # rax = one-past-the-end pointer
.L2:
        vmovss  xmm0, DWORD PTR [rdi+4]         # xmm0 = second float of the element (imaginary part)
        add     rdi, 8                          # advance first; the element's first float is now at [rdi-8]
        vmulss  xmm3, xmm0, xmm2                # xmm3 = x.im * p.re
        vmulss  xmm0, xmm0, xmm1                # xmm0 = x.im * p.im (uses p.im before it is updated)
        vfmadd132ss     xmm1, xmm3, DWORD PTR [rdi-8]   # p.im = p.im * x.re + x.im * p.re
        vfmsub132ss     xmm2, xmm0, DWORD PTR [rdi-8]   # p.re = p.re * x.re - x.im * p.im
        cmp     rax, rdi
        jne     .L2                             # loop until rdi reaches the end pointer
        # Pack {re, im} into the low 64 bits of xmm0 through two stack slots
        # below rsp (no rsp adjustment; presumably relies on the SysV red zone).
        vmovss  DWORD PTR [rsp-8], xmm2
        vmovss  DWORD PTR [rsp-4], xmm1
        vmovq   xmm0, QWORD PTR [rsp-8]
        ret
.LC1:
        .long   1065353216                      # 0x3f800000, the bit pattern of 1.0f


This is unvectorised code. However, if I do the same using float instead,
that is, with:

/* Real-valued counterpart of the reproduction case: returns the product of
 * x[0]..x[31], accumulated into p starting from 1.0.
 * The parameter n is never referenced in the body; the bound is the literal 32. */
float f(float x[], int n ) {
  float p = 1.0;
  for (int i = 0; i < 32; i++)
    p *= x[i];  /* scalar float multiply-accumulate */
  return p;
}

I get

        # Fully unrolled: eight 16-byte (4-float) vectors at [rdi]..[rdi+112]
        # are multiplied lane-wise into xmm0, covering 32 floats in total.
        vmovups xmm2, XMMWORD PTR [rdi]
        vmulps  xmm0, xmm2, XMMWORD PTR [rdi+16]
        vmulps  xmm0, xmm0, XMMWORD PTR [rdi+32]
        vmulps  xmm0, xmm0, XMMWORD PTR [rdi+48]
        vmulps  xmm0, xmm0, XMMWORD PTR [rdi+64]
        vmulps  xmm0, xmm0, XMMWORD PTR [rdi+80]
        vmulps  xmm0, xmm0, XMMWORD PTR [rdi+96]
        vmulps  xmm0, xmm0, XMMWORD PTR [rdi+112]
        # Horizontal reduction of the four lane products: shift the upper
        # 8 bytes down and multiply, then shift by 4 bytes and multiply again.
        vpsrldq xmm1, xmm0, 8
        vmulps  xmm0, xmm0, xmm1
        vpsrldq xmm1, xmm0, 4
        vmulps  xmm0, xmm0, xmm1        # lane 0 of xmm0 now holds the product of all lanes
        ret

This is vectorised.

As a test I also tried the Intel C compiler version 17. In this case, however,
the assembly you get using complex float is vectorised, giving:

# Vectorised complex-product loop as emitted by ICC (Intel syntax).
# In:  rdi = pointer to the array of 8-byte elements (two floats each).
# Out: xmm0 low 64 bits hold the accumulated product at the ret in ..B1.11.
# Trailing "#line.col" comments are the compiler's own source positions.
#
# Shuffle/constant meanings used throughout (derived from the immediates):
#   shufps a, a, 160 (0xA0) -> lanes [0,0,2,2]: duplicate the even lanes
#   shufps a, a, 245 (0xF5) -> lanes [1,1,3,3]: duplicate the odd lanes
#   shufps a, a, 177 (0xB1) -> lanes [1,0,3,2]: swap each adjacent pair
#   xorps  a, .L_2il0floatpacket.1          : flip the sign bit of the odd lanes
f:
# Prologue: rdx = rdi & 15. If zero, go straight to ..B1.4 with nothing peeled.
# If the pointer is not a multiple of 8, fall back to the one-element loop
# via ..B1.12. Otherwise peel one element: xmm0 = first element, dl = 1.
        mov       rdx, rdi                                      #4.3
        and       rdx, 15                                       #4.3
        movsd     xmm0, QWORD PTR p.152.0.0.1[rip]              #3.19
        test      dl, dl                                        #4.3
        je        ..B1.4        # Prob 50%                      #4.3
        test      dl, 7                                         #4.3
        jne       ..B1.12       # Prob 10%                      #4.3
        movsd     xmm0, QWORD PTR [rdi]                         #5.10
        mov       dl, 1                                         #4.3
# Loop setup: rax = number of peeled elements (0 or 1) = starting index.
# rdx = 128 - ((-dl) & 3), i.e. 128 when nothing was peeled and 125 when one
# element was peeled; this is the index at which the 4-element loop stops.
# movlhps seeds the upper half of xmm0 with {1.0f, 0}, so xmm0 carries two
# partial products (low half and high half).
..B1.4:                         # Preds ..B1.3 ..B1.1
        movzx     eax, dl                                       #4.3
        neg       dl                                            #4.3
        and       dl, 3                                         #4.3
        movzx     edx, dl                                       #4.3
        movss     xmm1, DWORD PTR .L_2il0floatpacket.0[rip]     #3.19
        neg       rdx                                           #4.3
        movlhps   xmm0, xmm1                                    #3.19
        add       rdx, 128                                      #4.3
# Main loop: each iteration loads two 16-byte vectors (four elements) at
# [rdi+rax*8] and [16+rdi+rax*8] and applies the same shuffle / multiply /
# sign-flip / add sequence to each in turn, then advances rax by 4.
..B1.5:                         # Preds ..B1.5 ..B1.4
        movaps    xmm2, xmm0                                    #5.5
        movups    xmm1, XMMWORD PTR [rdi+rax*8]                 #5.10
        shufps    xmm2, xmm0, 160                               #5.5
        mulps     xmm2, xmm1                                    #5.5
        xorps     xmm1, XMMWORD PTR .L_2il0floatpacket.1[rip]   #5.5
        shufps    xmm1, xmm1, 177                               #5.5
        shufps    xmm0, xmm0, 245                               #5.5
        mulps     xmm1, xmm0                                    #5.5
        movups    xmm3, XMMWORD PTR [16+rdi+rax*8]              #5.10
        add       rax, 4                                        #4.3
        addps     xmm2, xmm1                                    #5.5
        movaps    xmm0, xmm2                                    #5.5
        shufps    xmm0, xmm2, 160                               #5.5
        mulps     xmm0, xmm3                                    #5.5
        xorps     xmm3, XMMWORD PTR .L_2il0floatpacket.1[rip]   #5.5
        shufps    xmm3, xmm3, 177                               #5.5
        shufps    xmm2, xmm2, 245                               #5.5
        mulps     xmm3, xmm2                                    #5.5
        addps     xmm0, xmm3                                    #5.5
        cmp       rax, rdx                                      #4.3
        jb        ..B1.5        # Prob 99%                      #4.3
# Reduction: movhlps copies the upper half of xmm0 into xmm1, and the same
# multiply sequence combines the two partial products into xmm0.
        movaps    xmm1, xmm0                                    #3.19
        movhlps   xmm1, xmm0                                    #3.19
        movaps    xmm2, xmm1                                    #3.19
        shufps    xmm2, xmm1, 160                               #3.19
        mulps     xmm2, xmm0                                    #3.19
        xorps     xmm0, XMMWORD PTR .L_2il0floatpacket.1[rip]   #3.19
        shufps    xmm0, xmm0, 177                               #3.19
        shufps    xmm1, xmm1, 245                               #3.19
        mulps     xmm0, xmm1                                    #3.19
        addps     xmm0, xmm2                                    #3.19
# Remainder check: rdx is the next unprocessed index; skip the tail loop when
# it has already reached 128.
..B1.7:                         # Preds ..B1.6 ..B1.12
        cmp       rdx, 128                                      #4.3
        jae       ..B1.11       # Prob 0%                       #4.3
# Tail loop: one 8-byte element per iteration at [rdi+rdx*8], multiplied into
# xmm0 with the same sequence, until rdx reaches 128.
..B1.9:                         # Preds ..B1.7 ..B1.9
        movsd     xmm1, QWORD PTR [rdi+rdx*8]                   #5.10
        inc       rdx                                           #4.3
        movaps    xmm2, xmm1                                    #5.5
        shufps    xmm2, xmm1, 160                               #5.5
        mulps     xmm2, xmm0                                    #5.5
        xorps     xmm0, XMMWORD PTR .L_2il0floatpacket.1[rip]   #5.5
        shufps    xmm0, xmm0, 177                               #5.5
        shufps    xmm1, xmm1, 245                               #5.5
        mulps     xmm0, xmm1                                    #5.5
        addps     xmm0, xmm2                                    #5.5
        cmp       rdx, 128                                      #4.3
        jb        ..B1.9        # Prob 99%                      #4.3
..B1.11:                        # Preds ..B1.9 ..B1.7
        ret                                                     #6.10
# Fallback entry: start the one-element tail loop at index 0, so all 128
# elements go through ..B1.9.
..B1.12:                        # Preds ..B1.2
        xor       edx, edx                                      #4.3
        jmp       ..B1.7        # Prob 100%                     #4.3
# Constants: initial product {1.0f, 0.0f}.
p.152.0.0.1:
        .long   0x3f800000,0x00000000
# Sign-bit mask for the odd lanes of a 4-float vector.
.L_2il0floatpacket.1:
        .long   0x00000000,0x80000000,0x00000000,0x80000000
# Scalar 1.0f used to seed the upper half of the accumulator.
.L_2il0floatpacket.0:
        .long   0x3f800000

[Bug c/79102] New: gcc fails to auto-vectorise the product of an array of complex floats

Reply via email to