https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98172

--- Comment #8 from H.J. Lu <hjl.tools at gmail dot com> ---
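-mtune=generic (the tuning in effect for the first command below, which passes
no -mtune) with -mavx2 -mfma generates awful code: in the vectorized loop every
unaligned 256-bit load and store is split into two 128-bit halves
(vmovupd + vinsertf128 / vextractf128), whereas -mtune=haswell uses
full-width ymm loads and stores: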

[hjl@gnu-skx-1 tmp]$ cat y.c
/* Number of double elements processed by work(). */
#define DATA_ENTRIES 256
/* Input arrays a, x, y and output array z, reached through global pointers
   defined in another translation unit.  They carry no restrict qualifier,
   so nothing here tells the compiler whether the arrays overlap. */
extern double *a, *x, *y, *z;
/* Element-wise multiply-add over DATA_ENTRIES doubles:
   z[i] = a[i] * x[i] + y[i].  Takes no arguments and returns nothing;
   the only effect is the stores through z. */
void work()
{
        int i;

        /* Fixed trip count; every iteration reads one element from each of
           a, x and y and writes one element of z. */
        for (i = 0; i < DATA_ENTRIES; ++i)
               z[i] = a[i] * x[i] + y[i];
}

[hjl@gnu-skx-1 tmp]$ gcc -S -O3 y.c -mavx2 -mfma
[hjl@gnu-skx-1 tmp]$ cat y.s
        # Compiler output for work() from y.c, x86-64 AT&T syntax, produced by
        # "gcc -S -O3 y.c -mavx2 -mfma" (no explicit -mtune).
        # Register roles once the four pointer loads below have executed:
        #   %rdx = z (destination), %rsi = x, %rcx = a, %rdi = y
        #   %rax = scratch during the pointer checks, then the byte offset
        #          into all four arrays inside both loops
        # Clobbers: %rax, %rcx, %rdx, %rsi, %rdi, %r8, %r9, ymm0-ymm5, xmm6, flags.
        .file   "y.c"
        .text
        .p2align 4
        .globl  work
        .type   work, @function
work:
.LFB0:
        .cfi_startproc
        movq    z(%rip), %rdx           # rdx = z
        movq    x(%rip), %rsi           # rsi = x
        movq    a(%rip), %rcx           # rcx = a
        movq    y(%rip), %rdi           # rdi = y
        # Three unsigned pointer-distance tests: z - (p + 8) > 16 for p in
        # {x, a, y}.  The packed loop .L3 is entered only if all three hold;
        # otherwise control goes to the scalar loop .L5/.L2.
        # NOTE(review): presumably the vectorizer's runtime overlap check
        # between the stored array and each loaded array -- confirm.
        leaq    8(%rsi), %r8            # r8 = x + 8
        movq    %rdx, %rax
        subq    %r8, %rax               # rax = z - (x + 8)
        leaq    8(%rcx), %r9            # r9 = a + 8 (lea leaves flags alone)
        cmpq    $16, %rax
        movq    %rdx, %rax              # mov does not disturb the cmp result
        seta    %r8b                    # r8b = (z - (x + 8)) >u 16
        subq    %r9, %rax               # rax = z - (a + 8)
        cmpq    $16, %rax
        seta    %al                     # al = (z - (a + 8)) >u 16
        testb   %al, %r8b
        je      .L5                     # (al & r8b) == 0: a test failed, go scalar
        leaq    8(%rdi), %r8            # r8 = y + 8
        movq    %rdx, %rax
        subq    %r8, %rax               # rax = z - (y + 8)
        cmpq    $16, %rax
        jbe     .L5                     # distance <=u 16: go scalar
        xorl    %eax, %eax              # byte offset = 0
        # Align the loop head to 16 bytes when that costs at most 10 bytes of
        # padding, otherwise settle for 8-byte alignment.
        .p2align 4,,10
        .p2align 3
.L3:
        # Packed loop: four doubles (32 bytes) per iteration.  Each 256-bit
        # operand is assembled from two 128-bit memory accesses, and the
        # 256-bit result is written back as two 128-bit stores.
        vmovupd (%rcx,%rax), %xmm3      # low 128 bits of a[]
        vmovupd (%rsi,%rax), %xmm4      # low 128 bits of x[]
        vinsertf128     $0x1, 16(%rcx,%rax), %ymm3, %ymm0   # ymm0 = a[], high half from +16
        vinsertf128     $0x1, 16(%rsi,%rax), %ymm4, %ymm2   # ymm2 = x[], high half from +16
        vmovupd (%rdi,%rax), %xmm5      # low 128 bits of y[]
        vinsertf128     $0x1, 16(%rdi,%rax), %ymm5, %ymm1   # ymm1 = y[], high half from +16
        vfmadd132pd     %ymm2, %ymm1, %ymm0     # ymm0 = ymm0 * ymm2 + ymm1 = a*x + y
        vmovupd %xmm0, (%rdx,%rax)      # store low 128 bits to z[]
        vextractf128    $0x1, %ymm0, 16(%rdx,%rax)      # store high 128 bits to z[] + 16
        addq    $32, %rax               # advance by 4 doubles
        cmpq    $2048, %rax             # 2048 bytes = DATA_ENTRIES (256) * 8
        jne     .L3
        vzeroupper                      # zero the upper ymm halves before returning
        ret
.L5:
        xorl    %eax, %eax              # byte offset = 0
        .p2align 4,,10
        .p2align 3
.L2:
        # Scalar loop: one double (8 bytes) per iteration.
        vmovsd  (%rcx,%rax), %xmm0      # xmm0 = a[i]
        vmovsd  (%rdi,%rax), %xmm6      # xmm6 = y[i]
        vfmadd132sd     (%rsi,%rax), %xmm6, %xmm0       # xmm0 = xmm0 * x[i] + xmm6
        vmovsd  %xmm0, (%rdx,%rax)      # z[i] = result
        addq    $8, %rax                # advance by 1 double
        cmpq    $2048, %rax             # 2048 bytes = DATA_ENTRIES (256) * 8
        jne     .L2
        ret
        .cfi_endproc
.LFE0:
        .size   work, .-work
        .ident  "GCC: (GNU) 10.2.1 20210119 (Red Hat 10.2.1-10)"
        .section        .note.GNU-stack,"",@progbits
[hjl@gnu-skx-1 tmp]$ gcc -S -O3 y.c -mavx2 -mfma -mtune=haswell
[hjl@gnu-skx-1 tmp]$ cat y.s
        # Compiler output for work() from y.c, x86-64 AT&T syntax, produced by
        # "gcc -S -O3 y.c -mavx2 -mfma -mtune=haswell".
        # Register roles once the four pointer loads below have executed:
        #   %rdx = z (destination), %rsi = x, %rcx = a, %rdi = y
        #   %rax = scratch during the pointer checks, then the byte offset
        #          into all four arrays inside both loops
        # Clobbers: %rax, %rcx, %rdx, %rsi, %rdi, %r8, %r9, ymm0-ymm2, flags.
        .file   "y.c"
        .text
        .p2align 4
        .globl  work
        .type   work, @function
work:
.LFB0:
        .cfi_startproc
        movq    z(%rip), %rdx           # rdx = z
        movq    x(%rip), %rsi           # rsi = x
        movq    a(%rip), %rcx           # rcx = a
        movq    y(%rip), %rdi           # rdi = y
        # Three unsigned pointer-distance tests: z - (p + 8) > 16 for p in
        # {x, a, y}.  The packed loop .L3 is entered only if all three hold;
        # otherwise control goes to the scalar loop .L5/.L2.
        # NOTE(review): presumably the vectorizer's runtime overlap check
        # between the stored array and each loaded array -- confirm.
        leaq    8(%rsi), %r8            # r8 = x + 8
        movq    %rdx, %rax
        subq    %r8, %rax               # rax = z - (x + 8)
        leaq    8(%rcx), %r9            # r9 = a + 8 (lea leaves flags alone)
        cmpq    $16, %rax
        movq    %rdx, %rax              # mov does not disturb the cmp result
        seta    %r8b                    # r8b = (z - (x + 8)) >u 16
        subq    %r9, %rax               # rax = z - (a + 8)
        cmpq    $16, %rax
        seta    %al                     # al = (z - (a + 8)) >u 16
        testb   %al, %r8b
        je      .L5                     # (al & r8b) == 0: a test failed, go scalar
        leaq    8(%rdi), %r8            # r8 = y + 8
        movq    %rdx, %rax
        subq    %r8, %rax               # rax = z - (y + 8)
        cmpq    $16, %rax
        jbe     .L5                     # distance <=u 16: go scalar
        xorl    %eax, %eax              # byte offset = 0
        # Align the loop head to 16 bytes when that costs at most 10 bytes of
        # padding, otherwise settle for 8-byte alignment.
        .p2align 4,,10
        .p2align 3
.L3:
        # Packed loop: four doubles (32 bytes) per iteration, using one
        # full-width 256-bit access per array and x[] folded into the FMA
        # as a memory operand.
        vmovupd (%rcx,%rax), %ymm0      # ymm0 = 4 doubles of a[]
        vmovupd (%rdi,%rax), %ymm1      # ymm1 = 4 doubles of y[]
        vfmadd132pd     (%rsi,%rax), %ymm1, %ymm0       # ymm0 = ymm0 * x[] + ymm1
        vmovupd %ymm0, (%rdx,%rax)      # store 4 doubles to z[]
        addq    $32, %rax               # advance by 4 doubles
        cmpq    $2048, %rax             # 2048 bytes = DATA_ENTRIES (256) * 8
        jne     .L3
        vzeroupper                      # zero the upper ymm halves before returning
        ret
.L5:
        xorl    %eax, %eax              # byte offset = 0
        .p2align 4,,10
        .p2align 3
.L2:
        # Scalar loop: one double (8 bytes) per iteration.
        vmovsd  (%rcx,%rax), %xmm0      # xmm0 = a[i]
        vmovsd  (%rdi,%rax), %xmm2      # xmm2 = y[i]
        vfmadd132sd     (%rsi,%rax), %xmm2, %xmm0       # xmm0 = xmm0 * x[i] + xmm2
        vmovsd  %xmm0, (%rdx,%rax)      # z[i] = result
        addq    $8, %rax                # advance by 1 double
        cmpq    $2048, %rax             # 2048 bytes = DATA_ENTRIES (256) * 8
        jne     .L2
        ret
        .cfi_endproc
.LFE0:
        .size   work, .-work
        .ident  "GCC: (GNU) 10.2.1 20210119 (Red Hat 10.2.1-10)"
        .section        .note.GNU-stack,"",@progbits
[hjl@gnu-skx-1 tmp]$

Reply via email to