https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98856
--- Comment #23 from Richard Biener <rguenth at gcc dot gnu.org> ---
Created attachment 50300
  --> https://gcc.gnu.org/bugzilla/attachment.cgi?id=50300&action=edit
preprocessed source of the important Botan TU

This is the full preprocessed source of the TU. When compiled with
-Ofast -march=znver2, look for poly_double_n_le in the assembly. In the
prologue, the function jumps based on kernel size; size 16 is the
important one:

        cmpq    $16, %rdx
        je      .L54
        ...
.L54:
        .cfi_restore_state
        vmovdqu (%rsi), %xmm4
        vmovdqa %xmm4, 16(%rsp)
        movq    24(%rsp), %rdx
        vmovdqa 16(%rsp), %xmm5
        shrq    $63, %rdx
        imulq   $135, %rdx, %rcx
        movq    16(%rsp), %rdx
        vmovq   %rcx, %xmm0
        vpsllq  $1, %xmm5, %xmm1
        shrq    $63, %rdx
        vpinsrq $1, %rdx, %xmm0, %xmm0
        vpxor   %xmm1, %xmm0, %xmm0
        vmovdqu %xmm0, (%rdi)
        leaq    -16(%rbp), %rsp
        popq    %r12
        popq    %r13
        popq    %rbp
        .cfi_remember_state
        .cfi_def_cfa 7, 8
        ret