https://gcc.gnu.org/bugzilla/show_bug.cgi?id=84106
--- Comment #2 from Daniel Fruzynski <bugzi...@poradnik-webmastera.com> ---
The test included in comment 0 is part of a bigger test which I performed. In the full version the code also computed a bitmask and stored it in a third array. For test1, gcc was able to vectorize the inner loop into a series of load-shift-store-store operations. For test2 it distributed the loop into two: the first one performing the copy as a memcpy using "rep movsq", the second one calculating the bitmasks using vector instructions (see the C sketch after the output below). Here is the full code and output:

[code]
#include <stdint.h>

#define N 9

int a1[N][N];
int a2[N][N];
int a3[N][N];

int b1[N*N];
int b2[N*N];
int b3[N*N];

void test1()
{
    for (int i = 0; i < N; ++i)
    {
        for (int j = 0; j < N; ++j)
        {
            a2[i][j] = a1[i][j];
            a3[i][j] = 1u << (uint8_t)a1[i][j];
        }
    }
}

void test2()
{
    for (int i = 0; i < N*N; ++i)
    {
        b2[i] = b1[i];
        b3[i] = 1u << b1[i];
    }
}
[/code]

[out]
test1():
        vmovdqa ymm0, YMMWORD PTR .LC0[rip]
        vpsllvd ymm1, ymm0, YMMWORD PTR a1[rip]
        mov     eax, 1
        vmovdqa ymm5, YMMWORD PTR a1[rip+96]
        vmovdqa ymm6, YMMWORD PTR a1[rip+128]
        vmovdqa ymm7, YMMWORD PTR a1[rip+160]
        vmovdqa ymm2, YMMWORD PTR a1[rip]
        vmovdqa YMMWORD PTR a3[rip], ymm1
        vpsllvd ymm1, ymm0, YMMWORD PTR a1[rip+32]
        vmovdqa ymm3, YMMWORD PTR a1[rip+32]
        vmovdqa YMMWORD PTR a2[rip], ymm2
        vmovdqa ymm2, YMMWORD PTR a1[rip+192]
        vmovdqa ymm4, YMMWORD PTR a1[rip+64]
        vmovdqa YMMWORD PTR a2[rip+32], ymm3
        vmovdqa ymm3, YMMWORD PTR a1[rip+224]
        vmovdqa YMMWORD PTR a3[rip+32], ymm1
        vpsllvd ymm1, ymm0, YMMWORD PTR a1[rip+64]
        vmovdqa YMMWORD PTR a2[rip+64], ymm4
        vmovdqa ymm4, YMMWORD PTR a1[rip+256]
        vmovdqa YMMWORD PTR a2[rip+96], ymm5
        vmovdqa YMMWORD PTR a3[rip+64], ymm1
        vpsllvd ymm1, ymm0, ymm5
        vmovdqa ymm5, YMMWORD PTR a1[rip+288]
        vmovdqa YMMWORD PTR a2[rip+128], ymm6
        vmovdqa YMMWORD PTR a3[rip+96], ymm1
        vpsllvd ymm1, ymm0, ymm6
        vmovdqa YMMWORD PTR a2[rip+160], ymm7
        vmovdqa YMMWORD PTR a3[rip+128], ymm1
        vpsllvd ymm1, ymm0, ymm7
        vmovdqa YMMWORD PTR a2[rip+192], ymm2
        vmovdqa YMMWORD PTR a3[rip+160], ymm1
        vpsllvd ymm1, ymm0, ymm2
        vmovdqa YMMWORD PTR a2[rip+224], ymm3
        vmovdqa YMMWORD PTR a3[rip+192], ymm1
        vpsllvd ymm1, ymm0, ymm3
        vmovdqa YMMWORD PTR a2[rip+256], ymm4
        vmovdqa YMMWORD PTR a3[rip+224], ymm1
        vpsllvd ymm1, ymm0, ymm4
        vpsllvd ymm0, ymm0, ymm5
        vmovdqa YMMWORD PTR a3[rip+256], ymm1
        vmovdqa YMMWORD PTR a2[rip+288], ymm5
        mov     ecx, DWORD PTR a1[rip+320]
        vmovdqa YMMWORD PTR a3[rip+288], ymm0
        sal     eax, cl
        mov     DWORD PTR a2[rip+320], ecx
        mov     DWORD PTR a3[rip+320], eax
        vzeroupper
        ret
test2():
        mov     esi, OFFSET FLAT:b1
        mov     edi, OFFSET FLAT:b2
        mov     ecx, 40
        vmovdqa ymm0, YMMWORD PTR .LC0[rip]
        rep movsq
        vpsllvd ymm1, ymm0, YMMWORD PTR b1[rip]
        mov     ecx, DWORD PTR b1[rip+320]
        vmovdqa YMMWORD PTR b3[rip], ymm1
        vpsllvd ymm1, ymm0, YMMWORD PTR b1[rip+32]
        vmovdqa YMMWORD PTR b3[rip+32], ymm1
        vpsllvd ymm1, ymm0, YMMWORD PTR b1[rip+64]
        mov     eax, DWORD PTR [rsi]
        mov     DWORD PTR [rdi], eax
        mov     eax, 1
        vmovdqa YMMWORD PTR b3[rip+64], ymm1
        vpsllvd ymm1, ymm0, YMMWORD PTR b1[rip+96]
        sal     eax, cl
        mov     DWORD PTR b3[rip+320], eax
        vmovdqa YMMWORD PTR b3[rip+96], ymm1
        vpsllvd ymm1, ymm0, YMMWORD PTR b1[rip+128]
        vmovdqa YMMWORD PTR b3[rip+128], ymm1
        vpsllvd ymm1, ymm0, YMMWORD PTR b1[rip+160]
        vmovdqa YMMWORD PTR b3[rip+160], ymm1
        vpsllvd ymm1, ymm0, YMMWORD PTR b1[rip+192]
        vmovdqa YMMWORD PTR b3[rip+192], ymm1
        vpsllvd ymm1, ymm0, YMMWORD PTR b1[rip+224]
        vmovdqa YMMWORD PTR b3[rip+224], ymm1
        vpsllvd ymm1, ymm0, YMMWORD PTR b1[rip+256]
        vpsllvd ymm0, ymm0, YMMWORD PTR b1[rip+288]
        vmovdqa YMMWORD PTR b3[rip+256], ymm1
        vmovdqa YMMWORD PTR b3[rip+288], ymm0
        vzeroupper
        ret
b3:
        .zero   324
b2:
        .zero   324
b1:
        .zero   324
a3:
        .zero   324
a2:
        .zero   324
a1:
        .zero   324
.LC0:
        .long   1
        .long   1
        .long   1
        .long   1
        .long   1
        .long   1
        .long   1
        .long   1
[/out]
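
For illustration, here is a rough hand-written C sketch of what the distributed form of test2() corresponds to, based on the output above; the name test2_distributed is made up here, and the globals are the same ones declared in the test. The copy half is recognized as a memcpy (emitted as "rep movsq") and the shift half is vectorized with vpsllvd:

[code]
/* Hypothetical sketch only - not actual compiler output.  Roughly what
   loop distribution turns test2() into: a copy loop that becomes a
   memcpy ("rep movsq") plus a separate shift loop that gets vectorized. */
#include <string.h>

#define N 9

extern int b1[N*N];
extern int b2[N*N];
extern int b3[N*N];

void test2_distributed()
{
    /* distributed copy loop, recognized as memcpy -> "rep movsq" */
    memcpy(b2, b1, sizeof(int) * N * N);

    /* distributed shift loop, vectorized with vpsllvd */
    for (int i = 0; i < N*N; ++i)
        b3[i] = 1u << b1[i];
}
[/code]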