https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89176
H.J. Lu <hjl.tools at gmail dot com> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
                 CC|                            |crazylht at gmail dot com

--- Comment #2 from H.J. Lu <hjl.tools at gmail dot com> ---
(In reply to Richard Biener from comment #1)
> > The epilogue vectorization issue also needs investigation.

Epilogue vectorization doesn't seem to work:

[hjl@gnu-cfl-1 pr89176]$ cat x.i
extern float *v1;
extern float *v2;
extern float *res;

void
foo (int n)
{
  int i;
  for (i = 0; i < n; i++)
    res[i] = v2[i] * v1[i];
}
[hjl@gnu-cfl-1 pr89176]$ make x.s
/export/build/gnu/tools-build/gcc-debug/build-x86_64-linux/gcc/xgcc -B/export/build/gnu/tools-build/gcc-debug/build-x86_64-linux/gcc/ -O3 -march=skylake -S x.i
[hjl@gnu-cfl-1 pr89176]$ cat x.s
	.file "x.i"
	.text
	.p2align 4
	.globl foo
	.type foo, @function
foo:
.LFB0:
	.cfi_startproc
	testl %edi, %edi
	jle .L23
	movq v2(%rip), %rcx
	movq res(%rip), %rdx
	movq v1(%rip), %rsi
	leaq 31(%rcx), %r8
	subq %rdx, %r8
	cmpq $62, %r8
	leaq 31(%rsi), %r8
	seta %r9b
	subq %rdx, %r8
	cmpq $62, %r8
	seta %r8b
	leal -1(%rdi), %eax
	testb %r8b, %r9b
	je .L3
	cmpl $6, %eax
	jbe .L3
	movl %edi, %r8d
	shrl $3, %r8d
	salq $5, %r8
	xorl %eax, %eax
	.p2align 4,,10
	.p2align 3
.L4:
	vmovups (%rcx,%rax), %ymm1
	vmulps (%rsi,%rax), %ymm1, %ymm0
	vmovups %ymm0, (%rdx,%rax)
	addq $32, %rax
	cmpq %r8, %rax
	jne .L4
	movl %edi, %eax
	andl $-8, %eax
	testb $7, %dil
	je .L22
	movl %eax, %r8d
	vmovss (%rcx,%r8,4), %xmm0
	vmulss (%rsi,%r8,4), %xmm0, %xmm0
	vmovss %xmm0, (%rdx,%r8,4)
	leal 1(%rax), %r8d
	cmpl %r8d, %edi
	jle .L22
	movslq %r8d, %r8
	vmovss (%rcx,%r8,4), %xmm0
	vmulss (%rsi,%r8,4), %xmm0, %xmm0
	vmovss %xmm0, (%rdx,%r8,4)
	leal 2(%rax), %r8d
	cmpl %r8d, %edi
	jle .L22
	movslq %r8d, %r8
	vmovss (%rcx,%r8,4), %xmm0
	vmulss (%rsi,%r8,4), %xmm0, %xmm0
	vmovss %xmm0, (%rdx,%r8,4)
	leal 3(%rax), %r8d
	cmpl %r8d, %edi
	jle .L22
	movslq %r8d, %r8
	vmovss (%rcx,%r8,4), %xmm0
	vmulss (%rsi,%r8,4), %xmm0, %xmm0
	vmovss %xmm0, (%rdx,%r8,4)
	leal 4(%rax), %r8d
	cmpl %r8d, %edi
	jle .L22
	movslq %r8d, %r8
	vmovss (%rcx,%r8,4), %xmm0
	vmulss (%rsi,%r8,4), %xmm0, %xmm0
	vmovss %xmm0, (%rdx,%r8,4)
	leal 5(%rax), %r8d
	cmpl %r8d, %edi
	jle .L22
	movslq %r8d, %r8
	vmovss (%rcx,%r8,4), %xmm0
	addl $6, %eax
	vmulss (%rsi,%r8,4), %xmm0, %xmm0
	vmovss %xmm0, (%rdx,%r8,4)
	cmpl %eax, %edi
	jle .L22
	cltq
	vmovss (%rcx,%rax,4), %xmm0
	vmulss (%rsi,%rax,4), %xmm0, %xmm0
	vmovss %xmm0, (%rdx,%rax,4)
	vzeroupper
	ret
	.p2align 4,,10
	.p2align 3
.L22:
	vzeroupper
.L23:
	ret
	.p2align 4,,10
	.p2align 3
.L3:
	movl %eax, %edi
	xorl %eax, %eax
	.p2align 4,,10
	.p2align 3
.L6:
	vmovss (%rcx,%rax,4), %xmm0
	movq %rax, %r8
	vmulss (%rsi,%rax,4), %xmm0, %xmm0
	vmovss %xmm0, (%rdx,%rax,4)
	incq %rax
	cmpq %rdi, %r8
	jne .L6
	ret
	.cfi_endproc
.LFE0:
	.size foo, .-foo
	.ident "GCC: (GNU) 10.0.0 20190723 (experimental)"
	.section .note.GNU-stack,"",@progbits
[hjl@gnu-cfl-1 pr89176]$