https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66866
--- Comment #2 from Richard Biener <rguenth at gcc dot gnu.org> ---
More reduced, fails at -O1:

extern "C" void abort (void);
typedef long long __m128i __attribute__ ((__vector_size__ (16), __may_alias__));
typedef short A __attribute__((__may_alias__));

__m128i __attribute__((noinline))
shuf(const __m128i v)
{
  __m128i r;
  reinterpret_cast<A *>(&r)[5] = reinterpret_cast<const A *>(&v)[4];
  return r;
}

int main()
{
  __attribute__((aligned(16))) short mem[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
  *reinterpret_cast<__m128i *>(mem) = shuf (*reinterpret_cast<__m128i *>(mem));
  if (mem[5] != 4)
    abort ();
  return 0;
}

_Z4shufDv2_x:
.LFB527:
        .cfi_startproc
        movaps  %xmm0, -24(%rsp)
        pxor    %xmm0, %xmm0
        pinsrw  $5, -24(%rsp), %xmm0
        ret