https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98856
--- Comment #19 from Richard Biener <rguenth at gcc dot gnu.org> ---
So to recover performance we need both: avoiding the latency on the vector
side and avoiding the spilling.  This variant is fast:

.L56:
        .cfi_restore_state
        vmovdqu (%rsi), %xmm4
        movq    8(%rsi), %rdx
        shrq    $63, %rdx
        imulq   $135, %rdx, %rdi
        movq    (%rsi), %rdx
        vmovq   %rdi, %xmm0
        vpsllq  $1, %xmm4, %xmm1
        shrq    $63, %rdx
        vmovq   %rdx, %xmm5
        vpunpcklqdq     %xmm5, %xmm0, %xmm0
        vpxor   %xmm1, %xmm0, %xmm0
        vmovdqu %xmm0, (%rax)
        jmp     .L53

compared to the original:

.L56:
        .cfi_restore_state
        vmovdqu (%rsi), %xmm4
        vmovdqa %xmm4, 16(%rsp)
        movq    24(%rsp), %rdx
        vmovdqa 16(%rsp), %xmm5
        shrq    $63, %rdx
        imulq   $135, %rdx, %rdi
        movq    16(%rsp), %rdx
        vmovq   %rdi, %xmm0
        vpsllq  $1, %xmm5, %xmm1
        shrq    $63, %rdx
        vpinsrq $1, %rdx, %xmm0, %xmm0
        vpxor   %xmm1, %xmm0, %xmm0
        vmovdqu %xmm0, (%rax)
        jmp     .L53
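
For reference, a rough C sketch of what the loop body appears to compute,
assuming the 128-bit value is kept as two 64-bit halves (the function and
variable names below are illustrative, not taken from the testcase): a
shift-left-by-one of a 128-bit value where the bit shifted out of the top is
folded back into the low half with the constant 135 (0x87), i.e. an XTS-style
GF(2^128) doubling.

    #include <stdint.h>

    /* Illustrative sketch only: the 128-bit input is shifted left by one
       bit; the bit shifted out of the high half is reduced back into the
       low half with the constant 135 (0x87).  */
    static void poly128_double(uint64_t out[2], const uint64_t in[2])
    {
      uint64_t lo = in[0];                /* low 64 bits  */
      uint64_t hi = in[1];                /* high 64 bits */
      uint64_t carry = hi >> 63;          /* bit shifted out of bit 127 */

      out[1] = (hi << 1) ^ (lo >> 63);    /* high half picks up the low MSB */
      out[0] = (lo << 1) ^ (carry * 135); /* low half picks up the reduction */
    }

The fast variant keeps this entirely in registers (scalar extracts straight
from memory plus one vpunpcklqdq), whereas the original round-trips the input
through a stack slot and re-inserts the scalar with vpinsrq.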