https://gcc.gnu.org/bugzilla/show_bug.cgi?id=79938
--- Comment #6 from postmaster at raasu dot org ---
I tried identical code using intrinsics with both clang and gcc:
clang:
# clang's code for haddd_epu8, Intel operand order (destination first).
# xmm0 is read as the input and ends up holding the final sum.
# Only two pshufb are emitted here; the other two terms come from pand and psrld.
# The three memory constants are not shown in this listing.
movdqa xmm1,XMMWORD PTR [rip+0xd98] # 402050 <_IO_stdin_used+0x50>
# xmm1 = constant AND input (presumably a mask keeping one byte per dword -- confirm against the constant)
pand xmm1,xmm0
# xmm2 = input shuffled by the second constant
movdqa xmm2,xmm0
pshufb xmm2,XMMWORD PTR [rip+0xd97] # 402060 <_IO_stdin_used+0x60>
# xmm3 = input shuffled by the third constant
movdqa xmm3,xmm0
pshufb xmm3,XMMWORD PTR [rip+0xd9a] # 402070 <_IO_stdin_used+0x70>
# xmm2 = shuffled term + masked term
paddd xmm2,xmm1
# Logical right shift of every 32-bit lane by 24 bits: the lane's top byte lands in its low byte
psrld xmm0,0x18
# Fold the remaining terms into xmm0 with 32-bit lane adds
paddd xmm0,xmm3
paddd xmm0,xmm2
gcc:
# gcc's code for haddd_epu8, AT&T operand order (source first, destination last).
# xmm0 is read as the input and ends up holding the final sum.
# Make three copies of the input so that each of the four shuffles has its own register.
movdqa %xmm0, %xmm1
movdqa %xmm0, %xmm2
movdqa %xmm0, %xmm3
# Four byte shuffles, one per constant .LC0-.LC3 (constant contents are not shown in this listing)
pshufb .LC0(%rip), %xmm1
pshufb .LC1(%rip), %xmm2
pshufb .LC2(%rip), %xmm3
pshufb .LC3(%rip), %xmm0
# Accumulate the four shuffled vectors into xmm0 with 32-bit lane adds
paddd %xmm3, %xmm0
paddd %xmm2, %xmm0
paddd %xmm1, %xmm0
This is the function using intrinsics:
static __m128i __attribute__((noinline)) haddd_epu8(__m128i a)
{
__m128i b1 = _mm_shuffle_epi8(a, _mm_set_epi8(0x80, 0x80, 0x80, 12, 0x80,
0x80, 0x80, 8, 0x80, 0x80, 0x80, 4, 0x80, 0x80, 0x80, 0));
__m128i b2 = _mm_shuffle_epi8(a, _mm_set_epi8(0x80, 0x80, 0x80, 13, 0x80,
0x80, 0x80, 9, 0x80, 0x80, 0x80, 5, 0x80, 0x80, 0x80, 1));
__m128i b3 = _mm_shuffle_epi8(a, _mm_set_epi8(0x80, 0x80, 0x80, 14, 0x80,
0x80, 0x80, 10, 0x80, 0x80, 0x80, 6, 0x80, 0x80, 0x80, 2));
__m128i b4 = _mm_shuffle_epi8(a, _mm_set_epi8(0x80, 0x80, 0x80, 15, 0x80,
0x80, 0x80, 11, 0x80, 0x80, 0x80, 7, 0x80, 0x80, 0x80, 3));
__m128i c = _mm_add_epi32(b1, _mm_add_epi32(b2, _mm_add_epi32(b3, b4)));
return c;
}