https://gcc.gnu.org/bugzilla/show_bug.cgi?id=107837

            Bug ID: 107837
           Summary: Missed optimization: Using memcpy to load a struct
                    unnecessarily uses stack space
           Product: gcc
           Version: 13.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: chfast at gmail dot com
  Target Milestone: ---

I have a simple struct wrapping an array uint64_t[4]. When memcpy() is used to
load it from a buffer of bytes and some additional operations are then
performed, a temporary object is created on the stack.


// Plain aggregate holding four unsigned long words.
// NOTE(review): the report text describes this as uint64_t[4], i.e. it assumes
// a target where unsigned long is 64 bits wide.
struct uint256
{
    unsigned long v[4];
};

// Reproducer: reads sizeof(uint256) bytes from src, byte-swaps each word,
// reverses the word order, and writes the result to *o.
//
// o   - destination object, overwritten with the converted value.
// src - source bytes; sizeof(uint256) bytes are read from it.
void load_bad(uint256* o, const char* src) noexcept
{
    // Copy the raw bytes into a local object via memcpy; this is the load
    // pattern the report is about.
    uint256 x;
    __builtin_memcpy(&x, src, sizeof(x));
    // Word i of the result is the byte-swapped word (3 - i) of the input.
    uint256 y;
    y.v[0] = __builtin_bswap64(x.v[3]);
    y.v[1] = __builtin_bswap64(x.v[2]);
    y.v[2] = __builtin_bswap64(x.v[1]);
    y.v[3] = __builtin_bswap64(x.v[0]);
    *o = y;
}


load_bad(uint256*, char const*):
        # Load the 32 source bytes from [rsi] with two unaligned 16-byte loads.
        movdqu  xmm0, XMMWORD PTR [rsi]
        movdqu  xmm1, XMMWORD PTR [rsi+16]
        # Store xmm0 to the stack at [rsp-40] and reload it as two 64-bit
        # values; this memory round-trip is the temporary the report is about.
        movaps  XMMWORD PTR [rsp-40], xmm0
        mov     rdx, QWORD PTR [rsp-32]
        mov     rax, QWORD PTR [rsp-40]
        # Same store-and-reload round-trip for xmm1 at [rsp-24].
        movaps  XMMWORD PTR [rsp-24], xmm1
        mov     rsi, QWORD PTR [rsp-16]
        mov     rcx, QWORD PTR [rsp-24]
        # Byte-swap each 64-bit value and store it to the output at [rdi],
        # with the word order reversed.
        bswap   rdx
        bswap   rax
        mov     QWORD PTR [rdi+16], rdx
        bswap   rsi
        bswap   rcx
        mov     QWORD PTR [rdi], rsi
        mov     QWORD PTR [rdi+8], rcx
        mov     QWORD PTR [rdi+24], rax
        ret


The workaround is to use reinterpret_cast.

https://godbolt.org/z/WevYch8nv

Reply via email to