https://gcc.gnu.org/bugzilla/show_bug.cgi?id=120233

Jakub Jelinek <jakub at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
                 CC|                            |jakub at gcc dot gnu.org

--- Comment #5 from Jakub Jelinek <jakub at gcc dot gnu.org> ---
For -m64, it really is r16-531: nothing else changed for the testcase between
r16-1 and r16-529, nor between r16-531 and the latest trunk.
--- pr108938-3.s2       2026-02-16 03:57:25.187722927 -0500
+++ pr108938-3.s3       2026-02-16 03:57:31.349788268 -0500
@@ -5,10 +5,12 @@
        .type   foo1, @function
 foo1:
 .LFB0:
-       movq    (%rsi), %rax
-       bswap   %rax
-       rolq    $32, %rax
-       movq    %rax, (%rdi)
+       movl    (%rsi), %eax
+       bswap   %eax
+       movl    %eax, (%rdi)
+       movl    4(%rsi), %eax
+       bswap   %eax
+       movl    %eax, 4(%rdi)
        ret
 .LFE0:
        .size   foo1, .-foo1
@@ -17,10 +19,26 @@ foo1:
        .type   foo2, @function
 foo2:
 .LFB1:
-       movl    (%rsi), %eax
-       bswap   %eax
-       roll    $16, %eax
-       movl    %eax, (%rdi)
+       movzwl  (%rsi), %eax
+       movzwl  2(%rsi), %edx
+       movl    %eax, %ecx
+       sall    $16, %eax
+       sarw    $8, %cx
+       movzwl  %cx, %ecx
+       orl     %ecx, %eax
+       movd    %eax, %xmm0
+       movl    %edx, %eax
+       sall    $16, %edx
+       sarw    $8, %ax
+       movdqa  %xmm0, %xmm2
+       movzwl  %ax, %eax
+       orl     %eax, %edx
+       movd    %edx, %xmm1
+       punpcklbw       %xmm1, %xmm2
+       punpcklbw       %xmm1, %xmm0
+       pshufd  $65, %xmm2, %xmm2
+       punpcklbw       %xmm2, %xmm0
+       movd    %xmm0, (%rdi)
        ret
 .LFE1:
        .size   foo2, .-foo2
Before slp2 there is no difference in the IL.  Before store_merging, which used
to detect the bswap with rotate in both cases, the difference is:
--- pr108938-3.c.219t.widening_mul_     2026-02-16 04:10:06.592790016 -0500
+++ pr108938-3.c.219t.widening_mul      2026-02-16 04:10:43.114176421 -0500
@@ -19,27 +19,32 @@ void foo1 (char * a, unsigned int * rest
   unsigned int _14;
   char _15;
   char _16;
-  vector(8) char _36;

   <bb 2> [local count: 1073741824]:
   _1 = *b_18(D);
   _2 = _1 >> 24;
   _3 = (char) _2;
+  *a_19(D) = _3;
   _4 = _1 >> 16;
   _5 = (char) _4;
+  MEM[(char *)a_19(D) + 1B] = _5;
   _6 = _1 >> 8;
   _7 = (char) _6;
+  MEM[(char *)a_19(D) + 2B] = _7;
   _8 = (char) _1;
+  MEM[(char *)a_19(D) + 3B] = _8;
   _9 = MEM[(unsigned int *)b_18(D) + 4B];
   _10 = _9 >> 24;
   _11 = (char) _10;
+  MEM[(char *)a_19(D) + 4B] = _11;
   _12 = _9 >> 16;
   _13 = (char) _12;
+  MEM[(char *)a_19(D) + 5B] = _13;
   _14 = _9 >> 8;
   _15 = (char) _14;
+  MEM[(char *)a_19(D) + 6B] = _15;
   _16 = (char) _9;
-  _36 = {_3, _5, _7, _8, _11, _13, _15, _16};
-  MEM <vector(8) char> [(char *)a_19(D)] = _36;
+  MEM[(char *)a_19(D) + 7B] = _16;
   return;

 }
@@ -50,27 +55,23 @@ void foo1 (char * a, unsigned int * rest

 void foo2 (char * a, short int * restrict b)
 {
+  vector(4) char vect__3.7;
   short int _1;
   short int _2;
-  char _3;
-  char _4;
   short int _5;
   short int _6;
-  char _7;
-  char _8;
-  vector(4) char _16;
+  vector(2) short int _16;
+  vector(2) short int _17;

   <bb 2> [local count: 1073741824]:
   _1 = *b_10(D);
   _2 = _1 >> 8;
-  _3 = (char) _2;
-  _4 = (char) _1;
+  _17 = {_2, _1};
   _5 = MEM[(short int *)b_10(D) + 2B];
   _6 = _5 >> 8;
-  _7 = (char) _6;
-  _8 = (char) _5;
-  _16 = {_3, _4, _7, _8};
-  MEM <vector(4) char> [(char *)a_11(D)] = _16;
+  _16 = {_6, _5};
+  vect__3.7_18 = VEC_PACK_TRUNC_EXPR <_17, _16>;
+  MEM <vector(4) char> [(char *)a_11(D)] = vect__3.7_18;
   return;

 }
I guess for foo2 store merging could be extended to handle
VEC_PACK_TRUNC_EXPR, but where does that end?
No idea about foo1 though; in that case store merging now finds two separate
bswap32 operations rather than a bswap64 + rotate.

Reply via email to