https://gcc.gnu.org/bugzilla/show_bug.cgi?id=103611
Bug ID: 103611
Summary: GCC generates suboptimal code for SSE2/SSE4.1 64-bit integer element extraction on 32-bit x86 targets
Product: gcc
Version: 11.2.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: target
Assignee: unassigned at gcc dot gnu.org
Reporter: john_platts at hotmail dot com
Target Milestone: ---

Here is some code for extracting 64-bit integers from an SSE2 vector:

#include <cstdint>
#include <immintrin.h>

template<int ElemIdx>
std::int64_t SSE2ExtractInt64(__m128i vect) noexcept {
    static_assert(ElemIdx == (ElemIdx & 1), "ElemIdx must be between 0 and 1");

    __m128i vect2;
    if constexpr(ElemIdx == 0) {
        vect2 = _mm_shuffle_epi32(vect, 1);
    } else {
        vect2 = _mm_shuffle_epi32(vect, 3);
        vect = _mm_shuffle_epi32(vect, 2);
    }

    auto loVal = std::uint32_t(_mm_cvtsi128_si32(vect));
    auto hiVal = std::uint32_t(_mm_cvtsi128_si32(vect2));

    return std::int64_t(loVal) | std::int64_t(std::uint64_t(hiVal) << 32);
}

template std::int64_t SSE2ExtractInt64<0>(__m128i vect) noexcept;
template std::int64_t SSE2ExtractInt64<1>(__m128i vect) noexcept;

Here is the assembly code that is generated when the above C++ code is compiled with the -O2 -std=c++17 -march=nocona -mtune=skylake -m32 options:

_Z16SSE2ExtractInt64ILi0EExDv2_x:
        pushl   %ebx
        pshufd  $1, %xmm0, %xmm1
        xorl    %ebx, %ebx
        movd    %xmm1, %edx
        movd    %xmm0, %eax
        orl     %ebx, %edx
        orb     $0, %ah
        popl    %ebx
        ret
_Z16SSE2ExtractInt64ILi1EExDv2_x:
        pushl   %esi
        pshufd  $3, %xmm0, %xmm1
        xorl    %esi, %esi
        pushl   %ebx
        pshufd  $2, %xmm0, %xmm0
        movl    %esi, %edx
        movd    %xmm1, %ecx
        movd    %xmm0, %eax
        popl    %ebx
        orb     $0, %ah
        orl     %ecx, %edx
        popl    %esi
        ret

Here is a more optimal implementation of the above functions:

_Z16SSE2ExtractInt64ILi0EExDv2_x:
        pshufd  $1, %xmm0, %xmm1
        movd    %xmm1, %edx
        movd    %xmm0, %eax
        ret
_Z16SSE2ExtractInt64ILi1EExDv2_x:
        pshufd  $3, %xmm0, %xmm1
        pshufd  $2, %xmm0, %xmm0
        movd    %xmm1, %edx
        movd    %xmm0, %eax
        ret

Here is the code that is generated when the above C++ code is compiled with clang 13.0.0 with the -O2 -std=c++17 -march=nocona
-mtune=skylake -m32 options:

_Z16SSE2ExtractInt64ILi0EExDv2_x:       # @_Z16SSE2ExtractInt64ILi0EExDv2_x
        movd    %xmm0, %eax
        pshufd  $85, %xmm0, %xmm0       # xmm0 = xmm0[1,1,1,1]
        movd    %xmm0, %edx
        retl
_Z16SSE2ExtractInt64ILi1EExDv2_x:       # @_Z16SSE2ExtractInt64ILi1EExDv2_x
        pshufd  $238, %xmm0, %xmm1      # xmm1 = xmm0[2,3,2,3]
        movd    %xmm1, %eax
        pshufd  $255, %xmm0, %xmm0      # xmm0 = xmm0[3,3,3,3]
        movd    %xmm0, %edx
        retl