https://gcc.gnu.org/bugzilla/show_bug.cgi?id=103611
--- Comment #1 from John Platts <john_platts at hotmail dot com> ---
Here is some C++ code for extracting 64-bit integers from a __m128i vector using SSE4.1:

#include <cstdint>
#include <immintrin.h>

template<int ElemIdx>
std::int64_t SSE41ExtractInt64(__m128i vect) noexcept {
    static_assert(ElemIdx == (ElemIdx & 1), "ElemIdx must be between 0 and 1");

    std::uint32_t loVal;
    std::uint32_t hiVal;
    if constexpr(ElemIdx == 0) {
        loVal = std::uint32_t(_mm_extract_epi32(vect, 0));
        hiVal = std::uint32_t(_mm_extract_epi32(vect, 1));
    } else {
        loVal = std::uint32_t(_mm_extract_epi32(vect, 2));
        hiVal = std::uint32_t(_mm_extract_epi32(vect, 3));
    }

    return std::int64_t(loVal) | std::int64_t(std::uint64_t(hiVal) << 32);
}

template std::int64_t SSE41ExtractInt64<0>(__m128i vect) noexcept;
template std::int64_t SSE41ExtractInt64<1>(__m128i vect) noexcept;

Here is the assembly code that is generated when the above C++ code is compiled with the -O2 -std=c++17 -march=core2 -msse4.1 -mtune=skylake -m32 options:

_Z17SSE41ExtractInt64ILi0EExDv2_x:
    subl $28, %esp
    pmovzxdq %xmm0, %xmm1
    movq %xmm1, 8(%esp)
    pextrd $1, %xmm0, %eax
    movl %eax, %edx
    movl 8(%esp), %eax
    orl 12(%esp), %edx
    orb $0, %ah
    addl $28, %esp
    ret
_Z17SSE41ExtractInt64ILi1EExDv2_x:
    pushl %ebx
    pextrd $2, %xmm0, %ecx
    psrldq $12, %xmm0
    xorl %ebx, %ebx
    movd %xmm0, %edx
    movl %ecx, %eax
    orl %ebx, %edx
    orb $0, %ah
    popl %ebx
    ret

Here is more nearly optimal code for the above functions:

_Z17SSE41ExtractInt64ILi0EExDv2_x:
    movd %xmm0, %eax
    pextrd $1, %xmm0, %edx
    ret
_Z17SSE41ExtractInt64ILi1EExDv2_x:
    pextrd $2, %xmm0, %eax
    pextrd $3, %xmm0, %edx
    ret

Here is the code that is generated when the above C++ code is compiled with clang 13.0.0 with the -O2 -std=c++17 -march=core2 -msse4.1 -mtune=skylake -m32 options:

_Z17SSE41ExtractInt64ILi0EExDv2_x: # @_Z17SSE41ExtractInt64ILi0EExDv2_x
    movd %xmm0, %eax
    pextrd $1, %xmm0, %edx
    retl
_Z17SSE41ExtractInt64ILi1EExDv2_x: # @_Z17SSE41ExtractInt64ILi1EExDv2_x
    extractps $2, %xmm0, %eax
    extractps $3, %xmm0, %edx
    retl