https://gcc.gnu.org/bugzilla/show_bug.cgi?id=84106
--- Comment #2 from Daniel Fruzynski <bugzi...@poradnik-webmastera.com> ---
The test included in comment 0 is part of a bigger test which I performed. In the full version the code also computed a bitmask and stored it in a third array. For test1, gcc was able to vectorize the inner loop into a series of load-shift-store-store operations. For test2 it distributed the loop into two: the first one performing the copy as a memcpy using "rep movsq", the second one calculating the bitmasks using vector instructions (see the C sketch after the output below). Here is the full code and output:

[code]
#include <stdint.h>

#define N 9

int a1[N][N];
int a2[N][N];
int a3[N][N];

int b1[N*N];
int b2[N*N];
int b3[N*N];

void test1()
{
    for (int i = 0; i < N; ++i)
    {
        for (int j = 0; j < N; ++j)
        {
            a2[i][j] = a1[i][j];
            a3[i][j] = 1u << (uint8_t)a1[i][j];
        }
    }
}

void test2()
{
    for (int i = 0; i < N*N; ++i)
    {
        b2[i] = b1[i];
        b3[i] = 1u << b1[i];
    }
}
[/code]

[out]
test1():
        vmovdqa ymm0, YMMWORD PTR .LC0[rip]
        vpsllvd ymm1, ymm0, YMMWORD PTR a1[rip]
        mov     eax, 1
        vmovdqa ymm5, YMMWORD PTR a1[rip+96]
        vmovdqa ymm6, YMMWORD PTR a1[rip+128]
        vmovdqa ymm7, YMMWORD PTR a1[rip+160]
        vmovdqa ymm2, YMMWORD PTR a1[rip]
        vmovdqa YMMWORD PTR a3[rip], ymm1
        vpsllvd ymm1, ymm0, YMMWORD PTR a1[rip+32]
        vmovdqa ymm3, YMMWORD PTR a1[rip+32]
        vmovdqa YMMWORD PTR a2[rip], ymm2
        vmovdqa ymm2, YMMWORD PTR a1[rip+192]
        vmovdqa ymm4, YMMWORD PTR a1[rip+64]
        vmovdqa YMMWORD PTR a2[rip+32], ymm3
        vmovdqa ymm3, YMMWORD PTR a1[rip+224]
        vmovdqa YMMWORD PTR a3[rip+32], ymm1
        vpsllvd ymm1, ymm0, YMMWORD PTR a1[rip+64]
        vmovdqa YMMWORD PTR a2[rip+64], ymm4
        vmovdqa ymm4, YMMWORD PTR a1[rip+256]
        vmovdqa YMMWORD PTR a2[rip+96], ymm5
        vmovdqa YMMWORD PTR a3[rip+64], ymm1
        vpsllvd ymm1, ymm0, ymm5
        vmovdqa ymm5, YMMWORD PTR a1[rip+288]
        vmovdqa YMMWORD PTR a2[rip+128], ymm6
        vmovdqa YMMWORD PTR a3[rip+96], ymm1
        vpsllvd ymm1, ymm0, ymm6
        vmovdqa YMMWORD PTR a2[rip+160], ymm7
        vmovdqa YMMWORD PTR a3[rip+128], ymm1
        vpsllvd ymm1, ymm0, ymm7
        vmovdqa YMMWORD PTR a2[rip+192], ymm2
        vmovdqa YMMWORD PTR a3[rip+160], ymm1
        vpsllvd ymm1, ymm0, ymm2
        vmovdqa YMMWORD PTR a2[rip+224], ymm3
        vmovdqa YMMWORD PTR a3[rip+192], ymm1
        vpsllvd ymm1, ymm0, ymm3
        vmovdqa YMMWORD PTR a2[rip+256], ymm4
        vmovdqa YMMWORD PTR a3[rip+224], ymm1
        vpsllvd ymm1, ymm0, ymm4
        vpsllvd ymm0, ymm0, ymm5
        vmovdqa YMMWORD PTR a3[rip+256], ymm1
        vmovdqa YMMWORD PTR a2[rip+288], ymm5
        mov     ecx, DWORD PTR a1[rip+320]
        vmovdqa YMMWORD PTR a3[rip+288], ymm0
        sal     eax, cl
        mov     DWORD PTR a2[rip+320], ecx
        mov     DWORD PTR a3[rip+320], eax
        vzeroupper
        ret
test2():
        mov     esi, OFFSET FLAT:b1
        mov     edi, OFFSET FLAT:b2
        mov     ecx, 40
        vmovdqa ymm0, YMMWORD PTR .LC0[rip]
        rep movsq
        vpsllvd ymm1, ymm0, YMMWORD PTR b1[rip]
        mov     ecx, DWORD PTR b1[rip+320]
        vmovdqa YMMWORD PTR b3[rip], ymm1
        vpsllvd ymm1, ymm0, YMMWORD PTR b1[rip+32]
        vmovdqa YMMWORD PTR b3[rip+32], ymm1
        vpsllvd ymm1, ymm0, YMMWORD PTR b1[rip+64]
        mov     eax, DWORD PTR [rsi]
        mov     DWORD PTR [rdi], eax
        mov     eax, 1
        vmovdqa YMMWORD PTR b3[rip+64], ymm1
        vpsllvd ymm1, ymm0, YMMWORD PTR b1[rip+96]
        sal     eax, cl
        mov     DWORD PTR b3[rip+320], eax
        vmovdqa YMMWORD PTR b3[rip+96], ymm1
        vpsllvd ymm1, ymm0, YMMWORD PTR b1[rip+128]
        vmovdqa YMMWORD PTR b3[rip+128], ymm1
        vpsllvd ymm1, ymm0, YMMWORD PTR b1[rip+160]
        vmovdqa YMMWORD PTR b3[rip+160], ymm1
        vpsllvd ymm1, ymm0, YMMWORD PTR b1[rip+192]
        vmovdqa YMMWORD PTR b3[rip+192], ymm1
        vpsllvd ymm1, ymm0, YMMWORD PTR b1[rip+224]
        vmovdqa YMMWORD PTR b3[rip+224], ymm1
        vpsllvd ymm1, ymm0, YMMWORD PTR b1[rip+256]
        vpsllvd ymm0, ymm0, YMMWORD PTR b1[rip+288]
        vmovdqa YMMWORD PTR b3[rip+256], ymm1
        vmovdqa YMMWORD PTR b3[rip+288], ymm0
        vzeroupper
        ret
b3:
        .zero   324
b2:
        .zero   324
b1:
        .zero   324
a3:
        .zero   324
a2:
        .zero   324
a1:
        .zero   324
.LC0:
        .long   1
        .long   1
        .long   1
        .long   1
        .long   1
        .long   1
        .long   1
        .long   1
[/out]
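
For illustration, here is a rough hand-written C sketch of what the distributed form of test2() corresponds to, based on the output above; the name test2_distributed is made up here, and the globals are the same ones declared in the test. The copy half is recognized as a memcpy (emitted as "rep movsq") and the shift half is vectorized with vpsllvd:

[code]
/* Hypothetical sketch only - not actual compiler output.  Roughly what
   loop distribution turns test2() into: a copy loop that becomes a
   memcpy ("rep movsq") plus a separate shift loop that gets vectorized. */
#include <string.h>

#define N 9

extern int b1[N*N];
extern int b2[N*N];
extern int b3[N*N];

void test2_distributed()
{
    /* distributed copy loop, recognized as memcpy -> "rep movsq" */
    memcpy(b2, b1, sizeof(int) * N * N);

    /* distributed shift loop, vectorized with vpsllvd */
    for (int i = 0; i < N*N; ++i)
        b3[i] = 1u << b1[i];
}
[/code]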