https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90878
--- Comment #1 from H.J. Lu <hjl.tools at gmail dot com> ---
If we make integer register stores more expensive, this testcase will
regress:
[hjl@gnu-cfl-1 unroll]$ cat x.i
/* Reproducer: fill diag[i] .. diag[k-1] with d.
   The store loop is what GCC -O3 -march=skylake vectorizes and unrolls
   (see the generated assembly below); a higher integer-store cost in the
   backend cost model reduces the unroll factor and regresses this case.  */
void
foo (long p2, long *diag, long d, long i)
{
long k;
/* Loop upper bound: k = 2*p2 when p2 < 3, otherwise p2 + 3.  */
k = p2 < 3 ? p2 + p2 : p2 + 3;
/* The hot store loop targeted by the vectorizer.  */
while (i < k)
diag[i++] = d;
}
[hjl@gnu-cfl-1 unroll]$ make
/export/build/gnu/tools-build/gcc-wip-debug/build-x86_64-linux/gcc/xgcc
-B/export/build/gnu/tools-build/gcc-wip-debug/build-x86_64-linux/gcc/ -O3
-march=skylake -S x.i
[hjl@gnu-cfl-1 unroll]$ cat x.s
.file "x.i"
.text
.p2align 4
.globl foo
.type foo, @function
# Compiler output (GCC, SysV AMD64 ABI): rdi = p2, rsi = diag, rdx = d, rcx = i.
foo:
.LFB0:
.cfi_startproc
# Compute k = p2 < 3 ? 2*p2 : p2 + 3 into r8 (branchless via cmov).
leaq (%rdi,%rdi), %rax          # rax = 2*p2
leaq 3(%rdi), %r8               # r8  = p2 + 3
cmpq $2, %rdi
cmovle %rax, %r8                # r8 = k
cmpq %rcx, %r8
jle .L10                        # k <= i: nothing to store
movq %rcx, %rax
notq %rax                       # rax = -i - 1
movq %r8, %r9
addq %r8, %rax                  # rax = k - i - 1 (trip count minus one)
subq %rcx, %r9                  # r9  = k - i (total iterations)
cmpq $3, %rax
jbe .L5                         # k - i <= 4: scalar stores only
# Vector path: broadcast d into ymm0 and store 4 longs (32 bytes) per pass.
movq %r9, %rdi
shrq $2, %rdi                   # rdi = (k - i) / 4 vector iterations
vmovq %rdx, %xmm1
leaq (%rsi,%rcx,8), %rax        # rax = &diag[i]
salq $5, %rdi                   # rdi = bytes covered (32 per iteration)
vpbroadcastq %xmm1, %ymm0       # ymm0 = { d, d, d, d }
addq %rax, %rdi                 # rdi = end pointer for the vector loop
.p2align 4,,10
.p2align 3
.L6:                            # vector fill loop
vmovdqu %ymm0, (%rax)
addq $32, %rax
cmpq %rdi, %rax
jne .L6
movq %r9, %rax
andq $-4, %rax                  # rax = iterations handled by the vector loop
addq %rax, %rcx                 # rcx = first index not yet stored
cmpq %rax, %r9
je .L9                          # no scalar remainder
vzeroupper                      # leave AVX state before the SSE-era epilogue
.L5:                            # scalar epilogue: up to 4 remaining stores
leaq 1(%rcx), %rax
movq %rdx, (%rsi,%rcx,8)        # diag[rcx] = d
cmpq %r8, %rax
jge .L10
leaq 2(%rcx), %rdi
movq %rdx, (%rsi,%rax,8)        # diag[rcx+1] = d
cmpq %rdi, %r8
jle .L10
addq $3, %rcx
movq %rdx, (%rsi,%rdi,8)        # diag[rcx+2] = d
cmpq %rcx, %r8
jle .L10
movq %rdx, (%rsi,%rcx,8)        # diag[rcx+3] = d
ret
.p2align 4,,10
.p2align 3
.L9:
vzeroupper                      # clear upper YMM state before returning
.L10:
ret
.cfi_endproc
.LFE0:
.size foo, .-foo
.ident "GCC: (GNU) 10.0.0 20190613 (experimental)"
.section .note.GNU-stack,"",@progbits
[hjl@gnu-cfl-1 unroll]$
since a higher integer register store cost will reduce the loop unroll count.