https://gcc.gnu.org/bugzilla/show_bug.cgi?id=103611

--- Comment #1 from John Platts <john_platts at hotmail dot com> ---
Here is some C++ code for extracting 64-bit integers from a __m128i vector
using SSE4.1:
#include <cstdint>
#include <immintrin.h>

// Extracts one 64-bit lane of a 128-bit integer vector as a signed 64-bit
// integer, built from two 32-bit element extractions.
//
// ElemIdx selects the lane: 0 reads 32-bit elements 0 and 1 of `vect`,
// 1 reads 32-bit elements 2 and 3. The lower-numbered element of the pair
// supplies bits 0-31 of the result and the higher-numbered one bits 32-63.
template<int ElemIdx>
std::int64_t SSE41ExtractInt64(__m128i vect) noexcept {
    // (ElemIdx & 1) equals ElemIdx only when ElemIdx is 0 or 1, so every
    // other index (including negative ones) is rejected at compile time.
    static_assert(ElemIdx == (ElemIdx & 1), "ElemIdx must be between 0 and 1");

    std::uint32_t loVal;
    std::uint32_t hiVal;
    // Both halves are assigned on every path: the branch is chosen at compile
    // time and each arm sets loVal and hiVal.
    if constexpr(ElemIdx == 0) {
        // Lane 0: 32-bit elements 0 (low half) and 1 (high half).
        loVal = std::uint32_t(_mm_extract_epi32(vect, 0));
        hiVal = std::uint32_t(_mm_extract_epi32(vect, 1));
    } else {
        // Lane 1: 32-bit elements 2 (low half) and 3 (high half).
        loVal = std::uint32_t(_mm_extract_epi32(vect, 2));
        hiVal = std::uint32_t(_mm_extract_epi32(vect, 3));
    }

    // loVal is unsigned, so widening it zero-extends and leaves bits 32-63
    // clear. hiVal is widened to 64 bits *before* the shift so that the shift
    // by 32 is done on a 64-bit value; the two halves therefore never overlap.
    return std::int64_t(loVal) | std::int64_t(std::uint64_t(hiVal) << 32);
}

// Explicit instantiation definitions for both valid lane indices, so that the
// compiler emits a standalone function for each one.
template std::int64_t SSE41ExtractInt64<0>(__m128i vect) noexcept;
template std::int64_t SSE41ExtractInt64<1>(__m128i vect) noexcept;

Here is the assembly code that GCC generates when the above C++ code is compiled
with the -O2 -std=c++17 -march=core2 -msse4.1 -mtune=skylake -m32 options:
# SSE41ExtractInt64<0>: input vector in %xmm0; on return %eax holds element 0
# (result bits 0-31) and %edx holds element 1 (result bits 32-63).
_Z17SSE41ExtractInt64ILi0EExDv2_x:
        subl    $28, %esp               # reserve 28 bytes of stack scratch space
        pmovzxdq        %xmm0, %xmm1    # zero-extend 32-bit elements 0 and 1 into the two 64-bit lanes of %xmm1
        movq    %xmm1, 8(%esp)          # spill the low lane (zero-extended element 0) to the stack
        pextrd  $1, %xmm0, %eax         # %eax = element 1
        movl    %eax, %edx              # %edx = element 1
        movl    8(%esp), %eax           # %eax = low half of the spilled lane = element 0
        orl     12(%esp), %edx          # OR in the high half of the spilled lane, which the zero-extension made 0
        orb     $0, %ah                 # OR with constant 0: leaves %eax unchanged
        addl    $28, %esp               # release the scratch space
        ret
# SSE41ExtractInt64<1>: input vector in %xmm0; on return %eax holds element 2
# (result bits 0-31) and %edx holds element 3 (result bits 32-63).
_Z17SSE41ExtractInt64ILi1EExDv2_x:
        pushl   %ebx                    # save %ebx, which is overwritten below
        pextrd  $2, %xmm0, %ecx         # %ecx = element 2
        psrldq  $12, %xmm0              # shift the vector right by 12 bytes: element 3 moves to the low 32 bits
        xorl    %ebx, %ebx              # %ebx = 0
        movd    %xmm0, %edx             # %edx = element 3
        movl    %ecx, %eax              # %eax = element 2
        orl     %ebx, %edx              # OR with the zero in %ebx: leaves %edx unchanged
        orb     $0, %ah                 # OR with constant 0: leaves %eax unchanged
        popl    %ebx                    # restore %ebx
        ret

Here is more efficient code that could be generated for the above functions:
# SSE41ExtractInt64<0>: two register-to-register extractions, no stack use.
_Z17SSE41ExtractInt64ILi0EExDv2_x:
        movd    %xmm0, %eax             # %eax = element 0 (low 32 bits of the vector)
        pextrd  $1, %xmm0, %edx         # %edx = element 1
        ret
# SSE41ExtractInt64<1>: two register-to-register extractions, no stack use and
# no extra general-purpose register to save.
_Z17SSE41ExtractInt64ILi1EExDv2_x:
        pextrd  $2, %xmm0, %eax         # %eax = element 2
        pextrd  $3, %xmm0, %edx         # %edx = element 3
        ret

Here is the code that is generated when the above C++ code is compiled with
clang 13.0.0 with the -O2 -std=c++17 -march=core2 -msse4.1 -mtune=skylake -m32
options:
_Z17SSE41ExtractInt64ILi0EExDv2_x:      # @_Z17SSE41ExtractInt64ILi0EExDv2_x
        movd    %xmm0, %eax             # %eax = element 0 (low 32 bits of the vector)
        pextrd  $1, %xmm0, %edx         # %edx = element 1
        retl
_Z17SSE41ExtractInt64ILi1EExDv2_x:      # @_Z17SSE41ExtractInt64ILi1EExDv2_x
        extractps       $2, %xmm0, %eax # %eax = the 32 bits of element 2
        extractps       $3, %xmm0, %edx # %edx = the 32 bits of element 3
        retl

Reply via email to