https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97428
--- Comment #4 from Richard Biener <rguenth at gcc dot gnu.org> ---
I have a fix that, with -mavx512f, generates just

.L3:
        vmovupd (%rcx,%rax), %zmm0
        vpermpd (%rsi,%rax), %zmm1, %zmm2
        vpermpd %zmm0, %zmm1, %zmm0
        vmovupd %zmm2, (%rdi,%rax,2)
        vmovupd %zmm0, 64(%rdi,%rax,2)
        addq    $64, %rax
        cmpq    %rax, %rdx
        jne     .L3

while the lack of cross-lane shuffles in AVX2 requires a longer sequence:

.L3:
        vmovupd (%rsi,%rax), %xmm5
        vmovupd 32(%rsi,%rax), %xmm6
        vinsertf128     $0x1, 16(%rsi,%rax), %ymm5, %ymm1
        vinsertf128     $0x1, 48(%rsi,%rax), %ymm6, %ymm3
        vmovupd (%rcx,%rax), %xmm7
        vmovupd 32(%rcx,%rax), %xmm5
        vinsertf128     $0x1, 16(%rcx,%rax), %ymm7, %ymm0
        vinsertf128     $0x1, 48(%rcx,%rax), %ymm5, %ymm2
        vunpcklpd       %ymm3, %ymm1, %ymm4
        vunpckhpd       %ymm3, %ymm1, %ymm1
        vpermpd $216, %ymm4, %ymm4
        vpermpd $216, %ymm1, %ymm1
        vmovupd %xmm4, (%rdi,%rax,2)
        vextractf128    $0x1, %ymm4, 16(%rdi,%rax,2)
        vmovupd %xmm1, 32(%rdi,%rax,2)
        vextractf128    $0x1, %ymm1, 48(%rdi,%rax,2)
        vunpcklpd       %ymm2, %ymm0, %ymm1
        vunpckhpd       %ymm2, %ymm0, %ymm0
        vpermpd $216, %ymm1, %ymm1
        vpermpd $216, %ymm0, %ymm0
        vmovupd %xmm1, 64(%rdi,%rax,2)
        vextractf128    $0x1, %ymm1, 80(%rdi,%rax,2)
        vextractf128    $0x1, %ymm0, 112(%rdi,%rax,2)
        vmovupd %xmm0, 96(%rdi,%rax,2)
        addq    $64, %rax
        cmpq    %rax, %rdx
        jne     .L3

which is not much better than what SSE provides:

.L3:
        movupd  (%rsi), %xmm3
        movupd  16(%rsi), %xmm7
        movq    %rax, %rdx
        subq    $-128, %rdi
        movupd  32(%rsi), %xmm2
        movupd  48(%rsi), %xmm6
        addq    $64, %rax
        addq    $64, %rsi
        movapd  %xmm3, %xmm8
        movupd  -64(%rax), %xmm1
        movupd  -48(%rax), %xmm5
        unpckhpd        %xmm7, %xmm3
        unpcklpd        %xmm7, %xmm8
        movupd  -32(%rax), %xmm0
        movupd  -16(%rax), %xmm4
        movups  %xmm3, -96(%rdi)
        movups  %xmm8, -128(%rdi)
        movapd  %xmm2, %xmm8
        unpckhpd        %xmm6, %xmm2
        movups  %xmm2, -80(%rdi)
        movapd  %xmm1, %xmm2
        unpcklpd        %xmm6, %xmm8
        unpckhpd        %xmm5, %xmm1
        unpcklpd        %xmm5, %xmm2
        movups  %xmm8, -112(%rdi)
        movups  %xmm2, -64(%rdi)
        movapd  %xmm0, %xmm2
        unpckhpd        %xmm4, %xmm0
        unpcklpd        %xmm4, %xmm2
        movups  %xmm1, -32(%rdi)
        movups  %xmm2, -48(%rdi)
        movups  %xmm0, -16(%rdi)
        cmpq    %rdx, %rcx
        jne     .L3
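For reference, a scalar loop of roughly the following shape (de-interleaving an
array of { re, im } pairs into blocked re[]/im[] groups) would be vectorized
into the kind of even/odd permute sequences shown above.  The type and function
names here are illustrative assumptions, not the testcase attached to this PR:

/* Illustrative sketch only: gather real and imaginary parts of four
   consecutive interleaved complex values into blocked arrays.  */
typedef struct { double re, im; } cplx;          /* interleaved pair      */
typedef struct { double re[4], im[4]; } cplx4;   /* blocked group of four */

void
deinterleave (cplx4 *restrict dst, const cplx *restrict src, int n)
{
  for (int i = 0; i < n; ++i)
    for (int k = 0; k < 4; ++k)
      {
        dst[i].re[k] = src[i * 4 + k].re;   /* even elements of the chunk */
        dst[i].im[k] = src[i * 4 + k].im;   /* odd elements of the chunk  */
      }
}

With AVX-512 each eight-element gather of the real or imaginary parts is a
single vpermpd with an index vector, while AVX2's vunpcklpd/vunpckhpd shuffle
only within 128-bit lanes and need the extra vpermpd $216 to restore the
cross-lane order.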