#include <emmintrin.h> typedef char const* __attribute__((aligned(16))) aligned_byte_buffer;
__m128i load_1( aligned_byte_buffer buf ) { return *((__m128i*)buf); } __m128i load_2( aligned_byte_buffer buf ) { __m128i m; __builtin_memcpy( &m, buf, sizeof( m ) ); return m; } gcc-4.2.1 produces suboptimal load_2 code: load_1: movdqa (%rdi), %xmm0 ret load_2: movq (%rdi), %rax movq %rax, -24(%rsp) movq 8(%rdi), %rax movq %rax, -16(%rsp) movdqa -24(%rsp), %xmm0 ret -- Summary: missed memcpy -> movdqa optimization. Product: gcc Version: 4.2.1 Status: UNCONFIRMED Severity: normal Priority: P3 Component: target AssignedTo: unassigned at gcc dot gnu dot org ReportedBy: pluto at agmk dot net GCC target triplet: x86_64-linux http://gcc.gnu.org/bugzilla/show_bug.cgi?id=32951