https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114944
Alexander Monakov <amonakov at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
                 CC|                            |amonakov at gcc dot gnu.org

--- Comment #3 from Alexander Monakov <amonakov at gcc dot gnu.org> ---
Throughput-wise, the code in comment 2 has a significant bottleneck on port 5
on Haswell and Skylake (31 uops out of 70 go to port 5). Straightforward code
that does 16x movzx-movzx-mov for each byte should fare better, even
considering the load-store penalty for retrieving the vector from memory:

        pand    xmm1, XMMWORD PTR .LC0[rip]
        movaps  XMMWORD PTR [rsp-56], xmm0
        movaps  XMMWORD PTR [rsp-40], xmm1
        movzx   eax, BYTE PTR [rsp-40]
        movzx   eax, BYTE PTR [rsp-56+rax]
        mov     BYTE PTR [rsp-24], al
        movzx   eax, BYTE PTR [rsp-39]
        movzx   eax, BYTE PTR [rsp-56+rax]
        mov     BYTE PTR [rsp-23], al
        movzx   eax, BYTE PTR [rsp-38]
        movzx   eax, BYTE PTR [rsp-56+rax]
        mov     BYTE PTR [rsp-22], al
        movzx   eax, BYTE PTR [rsp-37]
        movzx   eax, BYTE PTR [rsp-56+rax]
        mov     BYTE PTR [rsp-21], al
        movzx   eax, BYTE PTR [rsp-36]
        movzx   eax, BYTE PTR [rsp-56+rax]
        mov     BYTE PTR [rsp-20], al
        movzx   eax, BYTE PTR [rsp-35]
        movzx   eax, BYTE PTR [rsp-56+rax]
        mov     BYTE PTR [rsp-19], al
        movzx   eax, BYTE PTR [rsp-34]
        movzx   eax, BYTE PTR [rsp-56+rax]
        mov     BYTE PTR [rsp-18], al
        movzx   eax, BYTE PTR [rsp-33]
        movzx   eax, BYTE PTR [rsp-56+rax]
        mov     BYTE PTR [rsp-17], al
        movzx   eax, BYTE PTR [rsp-32]
        movzx   eax, BYTE PTR [rsp-56+rax]
        mov     BYTE PTR [rsp-16], al
        movzx   eax, BYTE PTR [rsp-31]
        movzx   eax, BYTE PTR [rsp-56+rax]
        mov     BYTE PTR [rsp-15], al
        movzx   eax, BYTE PTR [rsp-30]
        movzx   eax, BYTE PTR [rsp-56+rax]
        mov     BYTE PTR [rsp-14], al
        movzx   eax, BYTE PTR [rsp-29]
        movzx   eax, BYTE PTR [rsp-56+rax]
        mov     BYTE PTR [rsp-13], al
        movzx   eax, BYTE PTR [rsp-28]
        movzx   eax, BYTE PTR [rsp-56+rax]
        mov     BYTE PTR [rsp-12], al
        movzx   eax, BYTE PTR [rsp-27]
        movzx   eax, BYTE PTR [rsp-56+rax]
        mov     BYTE PTR [rsp-11], al
        movzx   eax, BYTE PTR [rsp-26]
        movzx   eax, BYTE PTR [rsp-56+rax]
        mov     BYTE PTR [rsp-10], al
        movzx   eax, BYTE PTR [rsp-25]
        movzx   eax, BYTE PTR [rsp-56+rax]
        mov     BYTE PTR [rsp-9], al
        movdqa  xmm0, XMMWORD PTR [rsp-24]

If you want to avoid the load-store forwarding stall, perhaps you can assemble
the two halves of the shuffled vector in GPRs (e.g. do 'movzx ecx, byte [...];
shl rax, 8; mov al, byte [...+rcx]'), then merge the two 64-bit GPRs into one
128-bit vector.
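For reference, a minimal C-level sketch of the operation the listing above
implements: spill both vectors, mask each index byte to 0..15 (the pand with
.LC0), do 16 scalar byte lookups, and reload the result as a vector (the
movdqa that pays the store-forwarding penalty). The function name is
illustrative, not from the testcase in this PR:

#include <stdint.h>
#include <emmintrin.h>

static __m128i
shuffle_bytes_scalar (__m128i data, __m128i mask)
{
  uint8_t d[16], m[16], r[16];
  _mm_storeu_si128 ((__m128i *) d, data);
  /* pand with .LC0: keep only the low 4 bits of each index byte.  */
  _mm_storeu_si128 ((__m128i *) m, _mm_and_si128 (mask, _mm_set1_epi8 (15)));
  for (int i = 0; i < 16; i++)
    r[i] = d[m[i]];   /* movzx index, movzx data byte, mov to result */
  /* Reload the assembled bytes as one vector; this is the load that
     hits the store-forwarding stall mentioned above.  */
  return _mm_loadu_si128 ((const __m128i *) r);
}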
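And a sketch of the GPR-assembly variant suggested in the last paragraph,
again with illustrative names: each 8-byte half is accumulated in a 64-bit
GPR (the shl rax, 8 / mov al, ... pattern), and the two halves are merged
into one 128-bit vector (e.g. movq + pinsrq, expressed here as
_mm_set_epi64x), so the result never round-trips through memory:

#include <stdint.h>
#include <emmintrin.h>

static __m128i
shuffle_via_gprs (const uint8_t data[16], const uint8_t idx[16])
{
  uint64_t lo = 0, hi = 0;
  /* Build each half most-significant byte first: shift the accumulator
     left by 8, then insert the next looked-up byte into its low byte.  */
  for (int i = 7; i >= 0; i--)
    lo = (lo << 8) | data[idx[i] & 15];
  for (int i = 15; i >= 8; i--)
    hi = (hi << 8) | data[idx[i] & 15];
  /* Merge the two 64-bit GPR halves into one 128-bit vector.  */
  return _mm_set_epi64x ((long long) hi, (long long) lo);
}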