https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98172
--- Comment #8 from H.J. Lu <hjl.tools at gmail dot com> ---
-mtune=generic -mavx2 -mfma generates awful code:

[hjl@gnu-skx-1 tmp]$ cat y.c
#define DATA_ENTRIES 256

extern double *a, *x, *y, *z;

void work()
{
  int i;

  for (i = 0; i < DATA_ENTRIES; ++i)
    z[i] = a[i] * x[i] + y[i];
}
[hjl@gnu-skx-1 tmp]$ gcc -S -O3 y.c -mavx2 -mfma
[hjl@gnu-skx-1 tmp]$ cat y.s
        .file   "y.c"
        .text
        .p2align 4
        .globl  work
        .type   work, @function
work:
.LFB0:
        .cfi_startproc
        movq    z(%rip), %rdx
        movq    x(%rip), %rsi
        movq    a(%rip), %rcx
        movq    y(%rip), %rdi
        leaq    8(%rsi), %r8
        movq    %rdx, %rax
        subq    %r8, %rax
        leaq    8(%rcx), %r9
        cmpq    $16, %rax
        movq    %rdx, %rax
        seta    %r8b
        subq    %r9, %rax
        cmpq    $16, %rax
        seta    %al
        testb   %al, %r8b
        je      .L5
        leaq    8(%rdi), %r8
        movq    %rdx, %rax
        subq    %r8, %rax
        cmpq    $16, %rax
        jbe     .L5
        xorl    %eax, %eax
        .p2align 4,,10
        .p2align 3
.L3:
        vmovupd (%rcx,%rax), %xmm3
        vmovupd (%rsi,%rax), %xmm4
        vinsertf128 $0x1, 16(%rcx,%rax), %ymm3, %ymm0
        vinsertf128 $0x1, 16(%rsi,%rax), %ymm4, %ymm2
        vmovupd (%rdi,%rax), %xmm5
        vinsertf128 $0x1, 16(%rdi,%rax), %ymm5, %ymm1
        vfmadd132pd %ymm2, %ymm1, %ymm0
        vmovupd %xmm0, (%rdx,%rax)
        vextractf128 $0x1, %ymm0, 16(%rdx,%rax)
        addq    $32, %rax
        cmpq    $2048, %rax
        jne     .L3
        vzeroupper
        ret
.L5:
        xorl    %eax, %eax
        .p2align 4,,10
        .p2align 3
.L2:
        vmovsd  (%rcx,%rax), %xmm0
        vmovsd  (%rdi,%rax), %xmm6
        vfmadd132sd (%rsi,%rax), %xmm6, %xmm0
        vmovsd  %xmm0, (%rdx,%rax)
        addq    $8, %rax
        cmpq    $2048, %rax
        jne     .L2
        ret
        .cfi_endproc
.LFE0:
        .size   work, .-work
        .ident  "GCC: (GNU) 10.2.1 20210119 (Red Hat 10.2.1-10)"
        .section .note.GNU-stack,"",@progbits
[hjl@gnu-skx-1 tmp]$ gcc -S -O3 y.c -mavx2 -mfma -mtune=haswell
[hjl@gnu-skx-1 tmp]$ cat y.s
        .file   "y.c"
        .text
        .p2align 4
        .globl  work
        .type   work, @function
work:
.LFB0:
        .cfi_startproc
        movq    z(%rip), %rdx
        movq    x(%rip), %rsi
        movq    a(%rip), %rcx
        movq    y(%rip), %rdi
        leaq    8(%rsi), %r8
        movq    %rdx, %rax
        subq    %r8, %rax
        leaq    8(%rcx), %r9
        cmpq    $16, %rax
        movq    %rdx, %rax
        seta    %r8b
        subq    %r9, %rax
        cmpq    $16, %rax
        seta    %al
        testb   %al, %r8b
        je      .L5
        leaq    8(%rdi), %r8
        movq    %rdx, %rax
        subq    %r8, %rax
        cmpq    $16, %rax
        jbe     .L5
        xorl    %eax, %eax
        .p2align 4,,10
        .p2align 3
.L3:
        vmovupd (%rcx,%rax), %ymm0
        vmovupd (%rdi,%rax), %ymm1
        vfmadd132pd (%rsi,%rax), %ymm1, %ymm0
        vmovupd %ymm0, (%rdx,%rax)
        addq    $32, %rax
        cmpq    $2048, %rax
        jne     .L3
        vzeroupper
        ret
.L5:
        xorl    %eax, %eax
        .p2align 4,,10
        .p2align 3
.L2:
        vmovsd  (%rcx,%rax), %xmm0
        vmovsd  (%rdi,%rax), %xmm2
        vfmadd132sd (%rsi,%rax), %xmm2, %xmm0
        vmovsd  %xmm0, (%rdx,%rax)
        addq    $8, %rax
        cmpq    $2048, %rax
        jne     .L2
        ret
        .cfi_endproc
.LFE0:
        .size   work, .-work
        .ident  "GCC: (GNU) 10.2.1 20210119 (Red Hat 10.2.1-10)"
        .section .note.GNU-stack,"",@progbits
[hjl@gnu-skx-1 tmp]$