https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106081
Bug ID: 106081 Summary: missed vectorization Product: gcc Version: 13.0 Status: UNCONFIRMED Severity: normal Priority: P3 Component: middle-end Assignee: unassigned at gcc dot gnu.org Reporter: hubicka at gcc dot gnu.org Target Milestone: --- This testcase (derived from ImageMagick) struct pixels { short a,b,c,d; } *pixels; struct dpixels { double a,b,c,d; }; double test(double *k) { struct dpixels results={}; for (int u=0; u<10000;u++,k--) { results.a += *k*pixels[u].a; results.b += *k*pixels[u].b; results.c += *k*pixels[u].c; results.d += *k*pixels[u].d; } return results.a+results.b*2+results.c*3+results.d*4; } gets vectorized by clang: test: # @test .cfi_startproc # %bb.0: movq pixels(%rip), %rax vxorpd %xmm0, %xmm0, %xmm0 xorl %ecx, %ecx .p2align 4, 0x90 .LBB0_1: # =>This Inner Loop Header: Depth=1 vpmovsxwd (%rax), %xmm1 vbroadcastsd (%rdi,%rcx,8), %ymm2 addq $8, %rax decq %rcx vcvtdq2pd %xmm1, %ymm1 vfmadd231pd %ymm2, %ymm1, %ymm0 # ymm0 = (ymm1 * ymm2) + ymm0 cmpq $-10000, %rcx # imm = 0xD8F0 jne .LBB0_1 # %bb.2: vpermilpd $1, %xmm0, %xmm1 # xmm1 = xmm0[1,0] vfmadd132sd .LCPI0_0(%rip), %xmm0, %xmm1 # xmm1 = (xmm1 * mem) + xmm0 vextractf128 $1, %ymm0, %xmm0 vfmadd231sd .LCPI0_1(%rip), %xmm0, %xmm1 # xmm1 = (xmm0 * mem) + xmm1 vpermilpd $1, %xmm0, %xmm0 # xmm0 = xmm0[1,0] vfmadd132sd .LCPI0_2(%rip), %xmm1, %xmm0 # xmm0 = (xmm0 * mem) + xmm1 vzeroupper retq but not by GCC. 
Original loop is: 0.94 : 423cb0: vmovdqu (%rsi,%rdi,8),%xmm5 // morphology.c:2984 : 2983 if ( IsNaN(*k) ) continue; 0.29 : 423cb5: vpermilpd $0x1,(%rcx),%xmm4 : 2982 for (u=0; u < (ssize_t) kernel->width; u++, k--) { 0.46 : 423cbb: add $0x2,%rdi 0.07 : 423cbf: add $0xfffffffffffffff0,%rcx : 2984 result.red += (*k)*k_pixels[u].red; 0.03 : 423cc3: vpshufb %xmm12,%xmm5,%xmm6 6.81 : 423cc8: vcvtdq2pd %xmm6,%xmm6 13.05 : 423ccc: vfmadd231pd %xmm6,%xmm4,%xmm1 : 2985 result.green += (*k)*k_pixels[u].green; 17.45 : 423cd1: vpshufb %xmm15,%xmm5,%xmm6 // morphology.c:2985 0.33 : 423cd6: vcvtdq2pd %xmm6,%xmm6 0.00 : 423cda: vfmadd231pd %xmm6,%xmm4,%xmm3 : 2986 result.blue += (*k)*k_pixels[u].blue; 15.28 : 423cdf: vpshufb %xmm13,%xmm5,%xmm6 // morphology.c:2986 : 2987 result.opacity += (*k)*k_pixels[u].opacity; 0.00 : 423ce4: vpshufb %xmm8,%xmm5,%xmm5 : 2986 result.blue += (*k)*k_pixels[u].blue; 0.00 : 423ce9: vcvtdq2pd %xmm6,%xmm6 : 2987 result.opacity += (*k)*k_pixels[u].opacity; 0.21 : 423ced: vcvtdq2pd %xmm5,%xmm5 : 2986 result.blue += (*k)*k_pixels[u].blue; 0.97 : 423cf1: vfmadd231pd %xmm6,%xmm4,%xmm0 : 2987 result.opacity += (*k)*k_pixels[u].opacity; 19.16 : 423cf6: vfmadd231pd %xmm5,%xmm4,%xmm2 // morphology.c:2987 : 2982 for (u=0; u < (ssize_t) kernel->width; u++, k--) { 14.51 : 423cfb: cmp %rdi,%rbp // morphology.c:2982 0.00 : 423cfe: jne 423cb0 <MorphologyApply.6136+0x20c0> Changing short to double makes it vectorized: .L2: vmovupd (%rax), %ymm4 vmovupd 64(%rax), %ymm2 subq $-128, %rax subq $32, %rdx vunpcklpd -96(%rax), %ymm4, %ymm1 vunpckhpd -96(%rax), %ymm4, %ymm0 vmovupd -64(%rax), %ymm4 vunpckhpd -32(%rax), %ymm2, %ymm2 vunpcklpd -32(%rax), %ymm4, %ymm4 vpermpd $27, 32(%rdx), %ymm3 vpermpd $216, %ymm1, %ymm1 vpermpd $216, %ymm0, %ymm0 vpermpd $216, %ymm2, %ymm2 vpermpd $216, %ymm4, %ymm4 vunpcklpd %ymm2, %ymm0, %ymm10 vunpckhpd %ymm2, %ymm0, %ymm0 vunpckhpd %ymm4, %ymm1, %ymm9 vunpcklpd %ymm4, %ymm1, %ymm1 vpermpd $216, %ymm10, %ymm10 vpermpd $216, %ymm0, %ymm0 
vfmadd231pd %ymm3, %ymm10, %ymm6 vfmadd231pd %ymm3, %ymm0, %ymm8 vpermpd $216, %ymm9, %ymm9 vpermpd $216, %ymm1, %ymm1 vfmadd231pd %ymm3, %ymm1, %ymm5 vfmadd231pd %ymm3, %ymm9, %ymm7 cmpq %rax, %rcx jne .L2 However, clang's code looks shorter: .LBB0_1: # =>This Inner Loop Header: Depth=1 vbroadcastsd (%rdi,%rcx,8), %ymm1 vfmadd231pd (%rax), %ymm1, %ymm0 # ymm0 = (ymm1 * mem) + ymm0 addq $32, %rax decq %rcx cmpq $-10000, %rcx # imm = 0xD8F0 jne .LBB0_1 It seems that we (GCC) loop-vectorize this, while clang SLP-vectorizes it.