https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105816
Richard Biener <rguenth at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
Last reconfirmed|2024-09-18 00:00:00 |2026-03-23 00:00:00
--- Comment #3 from Richard Biener <rguenth at gcc dot gnu.org> ---
(In reply to Richard Biener from comment #1)
> For example for
>
> void __attribute__((noipa)) test_lo (short * __restrict dst, short *src1,
> short *src2, int n)
> {
> for (int i = 0; i < n; ++i)
> {
> dst[0] = src1[0];
> dst[1] = src1[1];
> dst[2] = src1[2];
> dst[3] = src1[3];
> dst[4] = src2[0];
> dst[5] = src2[1];
> dst[6] = src2[2];
> dst[7] = src2[3];
> dst+=8;
> src1+=4;
> src2+=4;
> }
> }
>
> we generate
>
> .L4:
> movdqu (%rsi,%rax), %xmm0
> movq %xmm0, (%rdi,%rax,2)
> movhps %xmm0, 16(%rdi,%rax,2)
> movdqu (%rdx,%rax), %xmm0
> movq %xmm0, 8(%rdi,%rax,2)
> movhps %xmm0, 24(%rdi,%rax,2)
> addq $16, %rax
> cmpq %r8, %rax
> jne .L4
>
> but ideally we'd interleave two V2DImode vectors and perform two SSE vector
> stores.
We now do this:
.L4:
vmovdqu (%rsi,%rax), %xmm0
vmovdqu (%r8,%rax), %xmm1
vpunpcklqdq %xmm1, %xmm0, %xmm2
vpunpckhqdq %xmm1, %xmm0, %xmm0
vmovdqu %xmm2, (%rdi,%rax,2)
vmovdqu %xmm0, 16(%rdi,%rax,2)
addq $16, %rax
cmpq %rdx, %rax
jne .L4
Also the BB vectorization case is handled, producing
test_lo:
.LFB0:
.cfi_startproc
movaps %xmm0, (%rdi)
movaps %xmm1, 16(%rdi)
ret
by splitting the store. But with AVX it shows the underlying issue
is still present with regard to SLP discovery:
test_lo:
.LFB0:
.cfi_startproc
vpsrldq $12, %xmm0, %xmm2
vmovdqa %xmm1, %xmm3
vpsrldq $8, %xmm0, %xmm4
vpsrldq $8, %xmm1, %xmm5
vpextrd $1, %xmm1, %edx
vpextrd $1, %xmm0, %eax
vpsrldq $12, %xmm1, %xmm1
vpunpckldq %xmm1, %xmm5, %xmm5
vpunpckldq %xmm2, %xmm4, %xmm4
vpinsrd $1, %edx, %xmm3, %xmm1
vpinsrd $1, %eax, %xmm0, %xmm0
vpunpcklqdq %xmm5, %xmm1, %xmm1
vpunpcklqdq %xmm4, %xmm0, %xmm0
vinserti128 $0x1, %xmm1, %ymm0, %ymm0
vmovdqa %ymm0, (%rdi)
vzeroupper
ret
where we build the vector from scalars (and fail to reject this via costing):
_1 = BIT_FIELD_REF <src1_9(D), 32, 0>;
_2 = BIT_FIELD_REF <src1_9(D), 32, 32>;
_3 = BIT_FIELD_REF <src1_9(D), 32, 64>;
_4 = BIT_FIELD_REF <src1_9(D), 32, 96>;
_5 = BIT_FIELD_REF <src2_16(D), 32, 0>;
_6 = BIT_FIELD_REF <src2_16(D), 32, 32>;
_7 = BIT_FIELD_REF <src2_16(D), 32, 64>;
_8 = BIT_FIELD_REF <src2_16(D), 32, 96>;
_21 = {_1, _2, _3, _4, _5, _6, _7, _8};
vectp.4_22 = &BIT_FIELD_REF <*dst_11(D), 32, 0>;
t.c:6:13: note: Cost model analysis for part in loop 0:
Vector cost: 48
Scalar cost: 96
t.c:6:13: note: Basic block will be vectorized using SLP
Thus re-confirmed.