https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89176

H.J. Lu <hjl.tools at gmail dot com> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
                 CC|                            |crazylht at gmail dot com

--- Comment #2 from H.J. Lu <hjl.tools at gmail dot com> ---
(In reply to Richard Biener from comment #1)
> 
> The epilogue vectorization issue also needs investigation.

Epilogue vectorization doesn't seem to work:

[hjl@gnu-cfl-1 pr89176]$ cat x.i
/* Two input arrays and one output array, reached through global
   pointers that are only declared here (no definition is shown).  */
extern float *v1;
extern float *v2;
extern float *res;


/* Store the element-wise product v2[i] * v1[i] into res[i] for each
   i in [0, n).  The loop body never runs when n <= 0.  */
void
foo (int n)
{
  int i;

  for (i = 0; i < n; i++)
    res[i] = v2[i] * v1[i];
}
[hjl@gnu-cfl-1 pr89176]$ make x.s
/export/build/gnu/tools-build/gcc-debug/build-x86_64-linux/gcc/xgcc
-B/export/build/gnu/tools-build/gcc-debug/build-x86_64-linux/gcc/ -O3
-march=skylake  -S x.i
[hjl@gnu-cfl-1 pr89176]$ cat x.s
        .file   "x.i"
        .text
        .p2align 4
        .globl  foo
        .type   foo, @function
#-----------------------------------------------------------------------
# void foo (int n)  --  res[i] = v2[i] * v1[i] for i in [0, n)
# Compiler-generated x86-64 code, GAS AT&T syntax (op src, dst).
# In:     %edi = n
# Out:    none
# Writes: %rax, %rcx, %rdx, %rsi, %rdi, %r8, %r9, %ymm0, %ymm1, flags
# Shape:  entry checks
#           -> .L4   main loop, 8 floats (one 256-bit %ymm) per iteration
#           -> fully unrolled scalar remainder, up to 7 elements
#         .L3/.L6    scalar fallback loop, one float per iteration
#-----------------------------------------------------------------------
foo:
.LFB0:
        .cfi_startproc
        # Nothing to do when n <= 0; .L23 is a bare ret.
        testl   %edi, %edi
        jle     .L23
        # Load the three pointer values: rcx = v2, rdx = res, rsi = v1.
        movq    v2(%rip), %rcx
        movq    res(%rip), %rdx
        movq    v1(%rip), %rsi
        # Run-time overlap check, v2 against res:
        # r9b = ((v2 + 31 - res) > 62, unsigned), i.e. the byte distance
        # v2 - res lies outside [-31, 31], so a 32-byte access through v2
        # and one through res at the same offset do not overlap.
        leaq    31(%rcx), %r8
        subq    %rdx, %r8
        cmpq    $62, %r8
        # leaq does not modify flags, so the seta below still consumes
        # the result of the cmpq above.
        leaq    31(%rsi), %r8
        seta    %r9b
        # Same check for v1 against res; result in r8b.
        subq    %rdx, %r8
        cmpq    $62, %r8
        seta    %r8b
        # eax = n - 1 (used by the n <= 7 test and by the .L3 fallback).
        leal    -1(%rdi), %eax
        # Either overlap check failed -> scalar fallback.
        testb   %r8b, %r9b
        je      .L3
        # n - 1 <= 6, i.e. fewer than 8 elements -> scalar fallback.
        cmpl    $6, %eax
        jbe     .L3
        # r8 = (n / 8) * 32 = number of bytes handled by the main loop.
        movl    %edi, %r8d
        shrl    $3, %r8d
        salq    $5, %r8
        # rax = byte offset into all three arrays, starting at 0.
        xorl    %eax, %eax
        # Align the loop head to 16 bytes if that costs at most 10 bytes
        # of padding, otherwise to 8 bytes.
        .p2align 4,,10
        .p2align 3
.L4:
        # Main loop: res[off..off+31] = v2[off..off+31] * v1[off..off+31]
        # as 8 packed floats, using unaligned loads and stores.
        vmovups (%rcx,%rax), %ymm1
        vmulps  (%rsi,%rax), %ymm1, %ymm0
        vmovups %ymm0, (%rdx,%rax)
        addq    $32, %rax
        cmpq    %r8, %rax
        jne     .L4
        # eax = n & ~7 = index of the first element not yet processed.
        movl    %edi, %eax
        andl    $-8, %eax
        # n is a multiple of 8 -> no remainder, finish via .L22.
        testb   $7, %dil
        je      .L22
        # Remainder: n % 8 is in 1..7 here.  The elements are handled one
        # at a time with scalar vmovss/vmulss in straight-line code; no
        # narrower packed-vector step is used.
        # Remainder element +0 (always present on this path).
        movl    %eax, %r8d
        vmovss  (%rcx,%r8,4), %xmm0
        vmulss  (%rsi,%r8,4), %xmm0, %xmm0
        vmovss  %xmm0, (%rdx,%r8,4)
        # Remainder element +1; done if n <= that index.
        leal    1(%rax), %r8d
        cmpl    %r8d, %edi
        jle     .L22
        movslq  %r8d, %r8
        vmovss  (%rcx,%r8,4), %xmm0
        vmulss  (%rsi,%r8,4), %xmm0, %xmm0
        vmovss  %xmm0, (%rdx,%r8,4)
        # Remainder element +2.
        leal    2(%rax), %r8d
        cmpl    %r8d, %edi
        jle     .L22
        movslq  %r8d, %r8
        vmovss  (%rcx,%r8,4), %xmm0
        vmulss  (%rsi,%r8,4), %xmm0, %xmm0
        vmovss  %xmm0, (%rdx,%r8,4)
        # Remainder element +3.
        leal    3(%rax), %r8d
        cmpl    %r8d, %edi
        jle     .L22
        movslq  %r8d, %r8
        vmovss  (%rcx,%r8,4), %xmm0
        vmulss  (%rsi,%r8,4), %xmm0, %xmm0
        vmovss  %xmm0, (%rdx,%r8,4)
        # Remainder element +4.
        leal    4(%rax), %r8d
        cmpl    %r8d, %edi
        jle     .L22
        movslq  %r8d, %r8
        vmovss  (%rcx,%r8,4), %xmm0
        vmulss  (%rsi,%r8,4), %xmm0, %xmm0
        vmovss  %xmm0, (%rdx,%r8,4)
        # Remainder element +5.  The addl that forms index +6 for the
        # next step is scheduled in between the load and the multiply.
        leal    5(%rax), %r8d
        cmpl    %r8d, %edi
        jle     .L22
        movslq  %r8d, %r8
        vmovss  (%rcx,%r8,4), %xmm0
        addl    $6, %eax
        vmulss  (%rsi,%r8,4), %xmm0, %xmm0
        vmovss  %xmm0, (%rdx,%r8,4)
        # Remainder element +6: the last possible one, so no further
        # bound check follows it.  cltq sign-extends eax into rax.
        cmpl    %eax, %edi
        jle     .L22
        cltq
        vmovss  (%rcx,%rax,4), %xmm0
        vmulss  (%rsi,%rax,4), %xmm0, %xmm0
        vmovss  %xmm0, (%rdx,%rax,4)
        # Clear the upper halves of the ymm registers before returning.
        vzeroupper
        ret
        .p2align 4,,10
        .p2align 3
.L22:
        # Common exit for the vector path: vzeroupper, then fall through
        # to the plain return.
        vzeroupper
.L23:
        # Early exit for n <= 0 lands here directly, skipping vzeroupper.
        ret
        .p2align 4,,10
        .p2align 3
.L3:
        # Scalar fallback: rdi = n - 1 (zero-extended last index), rax = i = 0.
        movl    %eax, %edi
        xorl    %eax, %eax
        .p2align 4,,10
        .p2align 3
.L6:
        # res[i] = v2[i] * v1[i]; r8 keeps the pre-increment i so the loop
        # stops once the element just processed was index n - 1.
        vmovss  (%rcx,%rax,4), %xmm0
        movq    %rax, %r8
        vmulss  (%rsi,%rax,4), %xmm0, %xmm0
        vmovss  %xmm0, (%rdx,%rax,4)
        incq    %rax
        cmpq    %rdi, %r8
        jne     .L6
        # No vzeroupper here: this path is entered before any ymm
        # instruction and its loop uses only %xmm0.
        ret
        .cfi_endproc
.LFE0:
        .size   foo, .-foo
        .ident  "GCC: (GNU) 10.0.0 20190723 (experimental)"
        .section        .note.GNU-stack,"",@progbits
[hjl@gnu-cfl-1 pr89176]$

Reply via email to