https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114987
--- Comment #3 from Colin Ian King <colin.king at intel dot com> ---

perf report from gcc-13 of stress_vecfp_float_add_16.avx of compute loop:

 57.93 │200:   vaddps   0xc0(%rsp),%ymm3,%ymm5
 11.11 │       vaddps   0xe0(%rsp),%ymm2,%ymm6
  0.02 │       vmovaps  %ymm5,0x60(%rsp)
  2.92 │       mov      0x60(%rsp),%rax
       │       mov      0x68(%rsp),%rdx
  0.37 │       vmovaps  %ymm6,0x40(%rsp)
       │       vmovaps  %ymm5,0x80(%rsp)
  6.30 │       vmovq    %rax,%xmm1
  4.11 │       mov      0x40(%rsp),%rax
       │       vmovdqa  0x90(%rsp),%xmm5
       │       vmovaps  %ymm6,0xa0(%rsp)
  3.27 │       vpinsrq  $0x1,%rdx,%xmm1,%xmm1
       │       mov      0x48(%rsp),%rdx
       │       vmovdqa  0xb0(%rsp),%xmm6
  3.22 │       vmovdqa  %xmm1,0xc0(%rsp)
  0.42 │       vmovq    %rax,%xmm0
       │       vmovdqa  %xmm5,0xd0(%rsp)
  6.80 │       vpinsrq  $0x1,%rdx,%xmm0,%xmm0
  3.52 │       vmovdqa  %xmm0,0xe0(%rsp)
       │       vmovdqa  %xmm6,0xf0(%rsp)
       │       sub      $0x1,%ecx
       │     ↑ jne      200

perf report from gcc-14 of stress_vecfp_float_add_16.avx of compute loop:

 65.79 │200:   vaddps   0xc0(%rsp),%ymm3,%ymm5
  3.26 │       vaddps   0xe0(%rsp),%ymm2,%ymm6
  0.00 │       vmovaps  %ymm5,0x60(%rsp)
  9.25 │       mov      0x60(%rsp),%rax
  0.00 │       mov      0x68(%rsp),%rdx
       │       vmovaps  %ymm6,0x40(%rsp)
       │       vmovaps  %ymm5,0x80(%rsp)
  6.49 │       vmovq    %rax,%xmm1
  0.00 │       mov      0x40(%rsp),%rax
  0.00 │       vmovaps  %ymm6,0xa0(%rsp)
  3.02 │       vpinsrq  $0x1,%rdx,%xmm1,%xmm1
       │       mov      0x48(%rsp),%rdx
  0.35 │       vmovdqa  %xmm1,0xc0(%rsp)
  0.68 │       vmovq    %rax,%xmm0
  0.00 │       vmovdqa  0x90(%rsp),%xmm1
  5.18 │       vpinsrq  $0x1,%rdx,%xmm0,%xmm0
  3.00 │       vmovdqa  %xmm0,0xe0(%rsp)
       │       vmovdqa  0xb0(%rsp),%xmm0
       │       vmovdqa  %xmm1,0xd0(%rsp)
       │       vmovdqa  %xmm0,0xf0(%rsp)
       │       sub      $0x1,%ecx
  2.94 │     ↑ jne      200