Testcase:

#include <tmmintrin.h>

__m128i intrin() {
    __m128i a = _mm_setzero_si128();
    a = _mm_cmpeq_epi8(a, a);
    __m128i count = _mm_sad_epu8(_mm_abs_epi8(a), _mm_setzero_si128());
    return _mm_sll_epi64(a, count);
}

__m128i assem() {
    register __m128i r asm("xmm0");
    asm(
        "pxor %%xmm1,%%xmm1\n\t"    // 0x0000
        "pcmpeqb %%xmm0,%%xmm0\n\t" // 0xffff
        "pabsb %%xmm0,%%xmm2\n\t"   // 0x0101
        "psadbw %%xmm1,%%xmm2\n\t"  // 0x0008
        "psllq %%xmm2,%%xmm0\n\t"
        ::: "xmm0", "xmm1", "xmm2");
    return r;
}

int main() {
    const __m128i a = intrin();
    const __m128i b = assem();
    return (_mm_movemask_epi8(_mm_cmpeq_epi32(a, b)) == 0xffff) ? 0 : -1;
}

Compiled with "g++ -march=core2 -O3".

GCC 4.4 and 4.5 (4.5.0 20100311) compile the intrin() function to include an
unnecessary movdqa -> mov -> movd round trip through the stack:

    psadbw %xmm1,%xmm2
    movdqa %xmm2,-0x18(%rsp)
    mov    -0x18(%rsp),%rax
    movd   %eax,%xmm1
    psllq  %xmm1,%xmm0

This round trip is not present in my inline asm version. Still, both functions
return the same result, which is expected: the documentation for the psllq
instruction says that only the lower 64 bits of the shift argument are used.
Thus the zeroing of the upper 64 bits while moving the count from xmm2 to
xmm1 is just wasted cycles.

--
           Summary: [missed optimization] use of SSE shift intrinsics
                    introduces unnecessary moves to the stack and back
           Product: gcc
           Version: 4.5.0
            Status: UNCONFIRMED
          Severity: minor
          Priority: P3
         Component: middle-end
        AssignedTo: unassigned at gcc dot gnu dot org
        ReportedBy: kretz at kde dot org
 GCC build triplet: x86_64-unknown-linux-gnu
  GCC host triplet: x86_64-unknown-linux-gnu
GCC target triplet: x86_64-unknown-linux-gnu


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=43514
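For reference, a minimal standalone sketch (my own addition, not part of the
submitted testcase) that prints the value both intrin() and assem() are
expected to return. Since psadbw of sixteen 0x01 bytes against zero yields 8
in each 64-bit lane, psllq shifts the all-ones vector left by 8, so each lane
should come out as 0xffffffffffffff00:

#include <tmmintrin.h>
#include <cstdio>

int main() {
    __m128i a = _mm_setzero_si128();
    a = _mm_cmpeq_epi8(a, a);                    // every byte 0xff
    const __m128i ones = _mm_abs_epi8(a);        // every byte 0x01
    const __m128i count =
        _mm_sad_epu8(ones, _mm_setzero_si128()); // 8 in each 64-bit lane
    const __m128i r = _mm_sll_epi64(a, count);   // all-ones shifted left by 8
    // extract both 64-bit lanes (x86_64 only)
    const unsigned long long lo = _mm_cvtsi128_si64(r);
    const unsigned long long hi = _mm_cvtsi128_si64(_mm_unpackhi_epi64(r, r));
    std::printf("%016llx %016llx\n", hi, lo);
    return 0;
}

Built with the same "g++ -march=core2 -O3" line as above, this should print
"ffffffffffffff00 ffffffffffffff00".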