http://gcc.gnu.org/bugzilla/show_bug.cgi?id=50162

             Bug #: 50162
           Summary: Wrong vectorization
    Classification: Unclassified
           Product: gcc
           Version: 4.7.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
        AssignedTo: unassig...@gcc.gnu.org
        ReportedBy: ubiz...@gmail.com
            Target: x86


The following testcase is vectorized incorrectly:

--cut here--
double a[256];
int b[256];
unsigned short c[256];

extern long lrint (double);

/* Reproducer: a single loop over all 256 elements that touches three
   global arrays of different element types (double a[], int b[],
   unsigned short c[]).  Takes no arguments and returns nothing; its
   only effects are the stores to b[] and c[].  */
void bar(void)
{
  int i;

  for (i=0; i<256; ++i)
    {
      /* Convert a[i] with lrint (declared as returning long) and store
         the result into the int array b.  */
      b[i] = lrint (a[i]);
      /* Add each element of c to itself, in place.  */
      c[i] += c[i];
    }
}
--cut here--

gcc -O2 -ffast-math -ftree-vectorize -m32 -msse2 -mfpmath=sse

.L2:
    cvtpd2dq    a+16(,%eax,4), %xmm0
    cvtpd2dq    a(,%eax,4), %xmm1
    cvtpd2dq    a+32(,%eax,4), %xmm2
    punpcklqdq    %xmm0, %xmm1
    punpcklqdq    %xmm2, %xmm0
    movdqa    %xmm0, b+16(%eax,%eax)
    movdqa    c(%eax), %xmm0
    paddw    %xmm0, %xmm0
    movdqa    %xmm1, b(%eax,%eax)
    movdqa    %xmm0, c(%eax)
    addl    $16, %eax
    cmpl    $512, %eax
    jne    .L2

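One cvtpd2dq is missing: only three conversions are emitted (a, a+16, a+32), there is no conversion of a+48, and the a+16 result in %xmm0 is reused as an input of the second punpcklqdq.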

GCC 4.6.1 compiles this to:

.L2:
    cvtpd2dq    a+16(,%eax,4), %xmm0
    cvtpd2dq    a(,%eax,4), %xmm1
    cvtpd2dq    a+48(,%eax,4), %xmm2
    punpcklqdq    %xmm0, %xmm1
    cvtpd2dq    a+32(,%eax,4), %xmm0
    punpcklqdq    %xmm2, %xmm0
    movdqa    %xmm0, b+16(%eax,%eax)
    movdqa    c(%eax), %xmm0
    paddw    %xmm0, %xmm0
    movdqa    %xmm1, b(%eax,%eax)
    movdqa    %xmm0, c(%eax)
    addl    $16, %eax
    cmpl    $512, %eax
    jne    .L2

The problem is already visible in the .optimized tree dump:

  vect_var_.13_22 = MEM[symbol: a, index: ivtmp.41_17, step: 4, offset: 0B];
  vect_var_.14_24 = MEM[symbol: a, index: ivtmp.41_17, step: 4, offset: 16B];
  vect_var_.15_26 = MEM[symbol: a, index: ivtmp.41_17, step: 4, offset: 32B];
  vect_var_.17_29 = __builtin_ia32_vec_pack_sfix (vect_var_.13_22,
vect_var_.14_24);
  vect_var_.17_30 = __builtin_ia32_vec_pack_sfix (vect_var_.14_24,
vect_var_.15_26);
  MEM[symbol: b, index: ivtmp.41_17, step: 2, offset: 0B] = vect_var_.17_29;
  MEM[symbol: b, index: ivtmp.41_17, step: 2, offset: 16B] = vect_var_.17_30;
  vect_var_.26_38 = MEM[symbol: c, index: ivtmp.41_17, offset: 0B];
  vect_var_.27_39 = vect_var_.26_38 + vect_var_.26_38;
  MEM[symbol: c, index: ivtmp.41_17, offset: 0B] = vect_var_.27_39;

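Note that there is no access at offset 48B, and that vect_var_.14_24 (offset 16B) is used as an operand of both __builtin_ia32_vec_pack_sfix calls.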

GCC 4.6.1 generates:

  vect_var_.39 = __builtin_ia32_vec_pack_sfix (MEM[symbol: a, index: ivtmp.69,
step: 4], MEM[symbol: a, index: ivtmp.69, step: 4, offset: 16]);
  vect_var_.77 = __builtin_ia32_vec_pack_sfix (MEM[symbol: a, index: ivtmp.69,
step: 4, offset: 32], MEM[symbol: a, index: ivtmp.69, step: 4, offset: 48]);
  MEM[symbol: b, index: ivtmp.69, step: 2] = vect_var_.39;
  MEM[symbol: b, index: ivtmp.69, step: 2, offset: 16] = vect_var_.77;
  vect_var_.52 = MEM[symbol: c, index: ivtmp.69];
  MEM[symbol: c, index: ivtmp.69] = [plus_expr] vect_var_.52 + vect_var_.52;

Reply via email to