https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109812
--- Comment #9 from Jan Hubicka <hubicka at gcc dot gnu.org> ---
Oddly enough, a simplified version of the loop does get SLP vectorized for me:
/* Source pixel: three unsigned char channels. rgbs is indexed as rgbs[j] in sum(). */
struct rgb {unsigned char r,g,b;} *rgbs;
/* Index array: sum() reads addr[i] and uses it as the index into rgbs. */
int *addr;
/* Weight array: sum() reads weights[i] and multiplies each channel by it. */
double *weights;
/* Result type of sum(): one double accumulator per channel. */
struct drgb {double r,g,b;};
/*
 * Reduced testcase: for i in [0, 100000), fetch pixel rgbs[addr[i]] and add
 * each of its channels, scaled by weights[i], to the matching field of r.
 *
 * Returns r by value.
 *
 * NOTE(review): r is declared without an initializer, so the += statements
 * read indeterminate values. Left as-is on purpose: the listing quoted after
 * this function was presumably produced from exactly this source, and adding
 * an initializer could change the generated code.
 */
struct drgb sum()
{
struct drgb r;
for (int i = 0; i < 100000; i++)
{
/* Indirect (gather-style) access: the pixel index comes from addr[]. */
int j = addr[i];
double w = weights[i];
/* Three same-shaped multiply-adds, one per channel. */
r.r += rgbs[j].r * w;
r.g += rgbs[j].g * w;
r.b += rgbs[j].b * w;
}
return r;
}
I get:
.L2:
# Loop body only; register setup is outside this listing.
# NOTE(review): %r9, %r8 and %rsi are presumably the addr, weights and rgbs
# base pointers (scaled by 4, by 8, and added to index*3 respectively) - confirm
# against the full function.
# Load the 32-bit index (sign-extended) and the 64-bit weight, then bump the counter.
movslq (%r9,%rdx,4), %rax
vmovsd (%r8,%rdx,8), %xmm1
incq %rdx
# %rax = index * 3, then add the base in %rsi: address of a 3-byte element.
leaq (%rax,%rax,2), %rax
addq %rsi, %rax
# Byte 0 -> lane 0 of %xmm0; the weight is duplicated into both lanes of %xmm4.
movzbl (%rax), %ecx
vmovddup %xmm1, %xmm4
vmovd %ecx, %xmm0
# Byte 1 -> lane 1 of %xmm0; byte 2 stays in %eax for the scalar path.
movzbl 1(%rax), %ecx
movzbl 2(%rax), %eax
vpinsrd $1, %ecx, %xmm0, %xmm0
# Two-lane path: convert both ints to double, then %xmm2 += %xmm0 * %xmm4 (packed).
vcvtdq2pd %xmm0, %xmm0
vfmadd231pd %xmm4, %xmm0, %xmm2
# Scalar path for the third byte: %xmm3 += (double)%eax * %xmm1.
vcvtsi2sdl %eax, %xmm5, %xmm0
vfmadd231sd %xmm1, %xmm0, %xmm3
# Loop until the counter reaches 100000.
cmpq $100000, %rdx
jne .L2
I think the actual loop is:
<bb 53> [local count: 44202554]:
_106 = _262->pixel;
_109 = *source_231(D).columns;
<bb 54> [local count: 401841405]:
# pixel$green_332 = PHI <_124(89), pixel$green_265(53)>
# i_357 = PHI <i_298(89), 0(53)>
# pixel$red_371 = PHI <_119(89), pixel$red_263(53)>
# pixel$blue_377 = PHI <_129(89), pixel$blue_267(53)>
i.51_102 = (long unsigned int) i_357;
_103 = i.51_102 * 16;
_104 = _262 + _103;
_105 = _104->pixel;
_107 = _105 - _106;
_108 = (long unsigned int) _107;
_110 = _108 * _109;
_112 = _110 + _621;
weight_297 = _104->weight;
_113 = _112 * 4;
_114 = _276 + _113;
_115 = _114->red;
_116 = (int) _115;
_117 = (double) _116;
_118 = _117 * weight_297;
_119 = _118 + pixel$red_371;
_120 = _114->green;
_121 = (int) _120;
_122 = (double) _121;
_123 = _122 * weight_297;
_124 = _123 + pixel$green_332;
_125 = _114->blue;
_126 = (int) _125;
_127 = (double) _126;
_128 = _127 * weight_297;
_129 = _128 + pixel$blue_377;
i_298 = i_357 + 1;
if (n_195 > i_298)
goto <bb 89>; [89.00%]
else
goto <bb 118>; [11.00%]
<bb 118> [local count: 44202554]:
# _607 = PHI <_124(54)>
# _606 = PHI <_119(54)>
# _605 = PHI <_129(54)>
goto <bb 55>; [100.00%]
<bb 89> [local count: 357638851]:
goto <bb 54>; [100.00%]
and the SLP vectorizer seems to claim:
../magick/resize.c:1284:52: note: _125 = _114->blue;
../magick/resize.c:1284:52: note: _120 = _114->green;
../magick/resize.c:1284:52: note: _115 = _114->red;
../magick/resize.c:1284:52: missed: not consecutive access weight_297 =
_104->weight;
../magick/resize.c:1284:52: missed: not consecutive access _105 =
_104->pixel;
../magick/resize.c:1284:52: missed: not consecutive access _134->red =
iftmp.57_207;
../magick/resize.c:1284:52: missed: not consecutive access _134->green =
iftmp.60_208;
../magick/resize.c:1284:52: missed: not consecutive access _134->blue =
iftmp.63_209;
../magick/resize.c:1284:52: missed: not consecutive access _134->opacity = 0;
../magick/resize.c:1284:52: missed: not consecutive access _63 =
*source_231(D).columns;
../magick/resize.c:1284:52: missed: not consecutive access _60 = _262->pixel;
Not sure if that is related to the real testcase, but adding a fourth field `o` to struct drgb:
/* Source pixel: three unsigned char channels. rgbs is indexed as rgbs[j] in sum(). */
struct rgb {unsigned char r,g,b;} *rgbs;
/* Index array: sum() reads addr[i] and uses it as the index into rgbs. */
int *addr;
/* Weight array: sum() reads weights[i] and multiplies each channel by it. */
double *weights;
/* Result type of sum(): four doubles. The member o is not referenced by sum(). */
struct drgb {double r,g,b,o;};
/*
 * Reduced testcase: for i in [0, 100000), fetch pixel rgbs[addr[i]] and add
 * each of its channels, scaled by weights[i], to r.r, r.g and r.b.
 * The fourth member, r.o, is never assigned in this function.
 *
 * Returns r by value.
 *
 * NOTE(review): r is declared without an initializer, so the += statements
 * read indeterminate values and r.o is returned unset. Left as-is on purpose:
 * the listing quoted after this function was presumably produced from exactly
 * this source, and adding an initializer could change the generated code.
 */
struct drgb sum()
{
struct drgb r;
for (int i = 0; i < 100000; i++)
{
/* Indirect (gather-style) access: the pixel index comes from addr[]. */
int j = addr[i];
double w = weights[i];
/* Three same-shaped multiply-adds, one per channel; nothing touches r.o. */
r.r += rgbs[j].r * w;
r.g += rgbs[j].g * w;
r.b += rgbs[j].b * w;
}
return r;
}
makes us miss the vectorization even though nothing uses drgb->o:
# Compiler output for sum() (x86-64, AT&T syntax). Every multiply-add in the
# loop is a scalar vfmadd231sd; no packed FMA appears.
sum:
.LFB0:
.cfi_startproc
# Keep the incoming %rdi in %r8; it is the address the result is stored to at the end.
movq %rdi, %r8
# Load the three global pointers: %rsi = weights, %rdi = addr, %rcx = rgbs.
movq weights(%rip), %rsi
movq addr(%rip), %rdi
# Zero %xmm2; it is only used as the merge source operand of vcvtsi2sdl below.
vxorps %xmm2, %xmm2, %xmm2
movq rgbs(%rip), %rcx
# %rdx = loop counter, starting at 0.
xorl %edx, %edx
.p2align 4
.p2align 3
# Note: the accumulators %xmm3, %xmm5 and %xmm4 are not written before the loop
# in this listing.
.L2:
# Load the 32-bit index (sign-extended) and the 64-bit weight, then bump the counter.
movslq (%rdi,%rdx,4), %rax
vmovsd (%rsi,%rdx,8), %xmm0
incq %rdx
# %rax = rgbs + index * 3: address of the 3-byte element.
leaq (%rax,%rax,2), %rax
addq %rcx, %rax
# Byte 0: convert to double, %xmm3 += value * weight.
movzbl (%rax), %r9d
vcvtsi2sdl %r9d, %xmm2, %xmm1
movzbl 1(%rax), %r9d
movzbl 2(%rax), %eax
vfmadd231sd %xmm0, %xmm1, %xmm3
# Byte 1: convert to double, %xmm5 += value * weight.
vcvtsi2sdl %r9d, %xmm2, %xmm1
vfmadd231sd %xmm0, %xmm1, %xmm5
# Byte 2: convert to double, %xmm4 += value * weight.
vcvtsi2sdl %eax, %xmm2, %xmm1
vfmadd231sd %xmm0, %xmm1, %xmm4
# Loop until the counter reaches 100000.
cmpq $100000, %rdx
jne .L2
# Epilogue: clear the upper 64 bits of %xmm4, pack the low doubles of %xmm3 and
# %xmm5 into %xmm0, and place %xmm4 in the upper 128 bits of %ymm0.
vmovq %xmm4, %xmm4
vunpcklpd %xmm5, %xmm3, %xmm0
# Return the saved result address in %rax.
movq %r8, %rax
vinsertf128 $0x1, %xmm4, %ymm0, %ymm0
# Store all four doubles of the result with one unaligned 32-byte write.
vmovupd %ymm0, (%r8)
vzeroupper
ret