https://gcc.gnu.org/bugzilla/show_bug.cgi?id=120233
Jakub Jelinek <jakub at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
CC| |jakub at gcc dot gnu.org
--- Comment #5 from Jakub Jelinek <jakub at gcc dot gnu.org> ---
For -m64, it is really r16-531; nothing else changed the generated code for the
testcase between r16-1 and r16-529, nor between r16-531 and latest trunk.
--- pr108938-3.s2 2026-02-16 03:57:25.187722927 -0500
+++ pr108938-3.s3 2026-02-16 03:57:31.349788268 -0500
@@ -5,10 +5,12 @@
.type foo1, @function
foo1:
.LFB0:
- movq (%rsi), %rax
- bswap %rax
- rolq $32, %rax
- movq %rax, (%rdi)
+ movl (%rsi), %eax
+ bswap %eax
+ movl %eax, (%rdi)
+ movl 4(%rsi), %eax
+ bswap %eax
+ movl %eax, 4(%rdi)
ret
.LFE0:
.size foo1, .-foo1
@@ -17,10 +19,26 @@ foo1:
.type foo2, @function
foo2:
.LFB1:
- movl (%rsi), %eax
- bswap %eax
- roll $16, %eax
- movl %eax, (%rdi)
+ movzwl (%rsi), %eax
+ movzwl 2(%rsi), %edx
+ movl %eax, %ecx
+ sall $16, %eax
+ sarw $8, %cx
+ movzwl %cx, %ecx
+ orl %ecx, %eax
+ movd %eax, %xmm0
+ movl %edx, %eax
+ sall $16, %edx
+ sarw $8, %ax
+ movdqa %xmm0, %xmm2
+ movzwl %ax, %eax
+ orl %eax, %edx
+ movd %edx, %xmm1
+ punpcklbw %xmm1, %xmm2
+ punpcklbw %xmm1, %xmm0
+ pshufd $65, %xmm2, %xmm2
+ punpcklbw %xmm2, %xmm0
+ movd %xmm0, (%rdi)
ret
.LFE1:
.size foo2, .-foo2
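For reference, a rough C reconstruction of the two functions, derived from the
IL dump further below; the actual pr108938-3.c testcase may spell them
differently, so treat the exact signatures as an assumption:

void
foo1 (char *a, unsigned int *restrict b)
{
  unsigned int x = b[0], y = b[1];
  /* Store the bytes of each 32-bit word in big-endian order.  */
  a[0] = x >> 24; a[1] = x >> 16; a[2] = x >> 8; a[3] = x;
  a[4] = y >> 24; a[5] = y >> 16; a[6] = y >> 8; a[7] = y;
}

void
foo2 (char *a, short *restrict b)
{
  short x = b[0], y = b[1];
  /* Store the bytes of each 16-bit word in big-endian order.  */
  a[0] = x >> 8; a[1] = x;
  a[2] = y >> 8; a[3] = y;
}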
Before slp2 there is no difference in the IL; before store_merging, which used
to detect the bswap with rotate in both cases, the difference is:
--- pr108938-3.c.219t.widening_mul_ 2026-02-16 04:10:06.592790016 -0500
+++ pr108938-3.c.219t.widening_mul 2026-02-16 04:10:43.114176421 -0500
@@ -19,27 +19,32 @@ void foo1 (char * a, unsigned int * rest
unsigned int _14;
char _15;
char _16;
- vector(8) char _36;
<bb 2> [local count: 1073741824]:
_1 = *b_18(D);
_2 = _1 >> 24;
_3 = (char) _2;
+ *a_19(D) = _3;
_4 = _1 >> 16;
_5 = (char) _4;
+ MEM[(char *)a_19(D) + 1B] = _5;
_6 = _1 >> 8;
_7 = (char) _6;
+ MEM[(char *)a_19(D) + 2B] = _7;
_8 = (char) _1;
+ MEM[(char *)a_19(D) + 3B] = _8;
_9 = MEM[(unsigned int *)b_18(D) + 4B];
_10 = _9 >> 24;
_11 = (char) _10;
+ MEM[(char *)a_19(D) + 4B] = _11;
_12 = _9 >> 16;
_13 = (char) _12;
+ MEM[(char *)a_19(D) + 5B] = _13;
_14 = _9 >> 8;
_15 = (char) _14;
+ MEM[(char *)a_19(D) + 6B] = _15;
_16 = (char) _9;
- _36 = {_3, _5, _7, _8, _11, _13, _15, _16};
- MEM <vector(8) char> [(char *)a_19(D)] = _36;
+ MEM[(char *)a_19(D) + 7B] = _16;
return;
}
@@ -50,27 +55,23 @@ void foo1 (char * a, unsigned int * rest
void foo2 (char * a, short int * restrict b)
{
+ vector(4) char vect__3.7;
short int _1;
short int _2;
- char _3;
- char _4;
short int _5;
short int _6;
- char _7;
- char _8;
- vector(4) char _16;
+ vector(2) short int _16;
+ vector(2) short int _17;
<bb 2> [local count: 1073741824]:
_1 = *b_10(D);
_2 = _1 >> 8;
- _3 = (char) _2;
- _4 = (char) _1;
+ _17 = {_2, _1};
_5 = MEM[(short int *)b_10(D) + 2B];
_6 = _5 >> 8;
- _7 = (char) _6;
- _8 = (char) _5;
- _16 = {_3, _4, _7, _8};
- MEM <vector(4) char> [(char *)a_11(D)] = _16;
+ _16 = {_6, _5};
+ vect__3.7_18 = VEC_PACK_TRUNC_EXPR <_17, _16>;
+ MEM <vector(4) char> [(char *)a_11(D)] = vect__3.7_18;
return;
}
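As an illustration of my own (not from the testcase): the byte shuffle foo2
performs, swapping the two bytes of each 16-bit half, is the same operation as
a bswap32 followed by a 16-bit rotate, which is exactly the pattern the old
store merging detection recovered (movl; bswap; roll $16; movl):

#include <stdint.h>

/* Swap the bytes within each 16-bit half, the shuffle foo2 does byte by byte.  */
static inline uint32_t
swap_bytes_per_half (uint32_t x)
{
  uint32_t lo = __builtin_bswap16 ((uint16_t) x);
  uint32_t hi = __builtin_bswap16 ((uint16_t) (x >> 16));
  return (hi << 16) | lo;
}

/* The equivalent bswap + rotate form the old code generated.  */
static inline uint32_t
bswap32_rol16 (uint32_t x)
{
  x = __builtin_bswap32 (x);
  return (x << 16) | (x >> 16);
}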
I guess for foo2 store merging could be extended to handle
VEC_PACK_TRUNC_EXPR, but where does that end?
No idea about foo1 though; in that case store merging now finds 2 separate
bswap32s rather than a bswap64 + rotate.
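Similarly for foo1 (again my own illustration): byte-reversing each 32-bit
half of a 64-bit word in place, which is what the two separate bswap32s do, is
equivalent to a single bswap64 followed by a 32-bit rotate, i.e. the old
movq; bswap; rolq $32; movq sequence:

#include <stdint.h>

/* What store merging finds now: an independent bswap32 per 32-bit half.  */
static inline uint64_t
two_bswap32 (uint64_t x)
{
  uint64_t lo = __builtin_bswap32 ((uint32_t) x);
  uint64_t hi = __builtin_bswap32 ((uint32_t) (x >> 32));
  return (hi << 32) | lo;
}

/* What it used to find: bswap64 + rotate by 32.  */
static inline uint64_t
bswap64_rol32 (uint64_t x)
{
  x = __builtin_bswap64 (x);
  return (x << 32) | (x >> 32);
}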