http://gcc.gnu.org/bugzilla/show_bug.cgi?id=60086
Alexander Monakov <amonakov at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
CC| |amonakov at gcc dot gnu.org
--- Comment #7 from Alexander Monakov <amonakov at gcc dot gnu.org> ---
(In reply to Jakub Jelinek from comment #1)
> alignment, but still the scheduler doesn't reorder the loads vs. the store,
> unless -O3 -mavx -fschedule-insns. The reason why the second scheduler
> doesn't reorder those is that RA allocates the same register
I think you usually want -fschedule-insns (pre-regalloc scheduling) or
-frename-registers rather than -fselective-scheduling2 when the goal is to
work around RA conservativeness. Unfortunately, stack accesses in the loop
prevent sched2 from using the additional freedom supplied by regrename for AVX
code in this case (when tuning is enabled). The stack accesses seem to be a
trunk regression judging by good code supplied in the opening comment.
(-O3 -mavx -fschedule-insns or -frename-registers, same modulo ymm* names,
%rbp-based accesses in the loop are pretty bad, but otherwise it's scheduled as
desired)
# AVX loop body (AT&T syntax). Each iteration handles one 32-byte ymm vector
# at byte offset %rax from each of two in-register bases (%r9, %r10) and two
# bases that are re-read from %rbp-relative slots on every iteration.
.L9:
# Fetch the first addend base from its %rbp-relative slot.
movq -136(%rbp), %rdx
# Both vector loads are issued before either store below.
vmovapd (%r9,%rax), %ymm1
# %rdi is the iteration counter tested at the bottom of the loop.
addq $1, %rdi
vmovapd (%r10,%rax), %ymm0
# ymm1 += packed doubles at (%rdx,%rax).
vaddpd (%rdx,%rax), %ymm1, %ymm1
# %rdx is reused: fetch the second addend base from its %rbp-relative slot.
movq -144(%rbp), %rdx
# ymm0 += packed doubles at (%rdx,%rax).
vaddpd (%rdx,%rax), %ymm0, %ymm0
# Write both sums back to the locations the first operands were loaded from.
vmovapd %ymm1, (%r9,%rax)
vmovapd %ymm0, (%r10,%rax)
# Advance the shared byte offset by one 32-byte vector.
addq $32, %rax
# Loop while the bound held at -152(%rbp) is above %rdi (unsigned compare).
cmpq %rdi, -152(%rbp)
ja .L9
(-O3 -fschedule-insns or -frename-registers, same modulo xmm* names, scheduled
as desired)
# SSE loop body (AT&T syntax). Each iteration handles one 16-byte xmm vector
# at byte offset %rax; all four base pointers (%r9, %r10, %r11, %rcx) and the
# loop bound (%r8) are in registers, so the loop has no %rbp-relative accesses.
.L7:
# Both vector loads are issued before either store below.
movapd (%r9,%rax), %xmm0
# %rdi is the iteration counter tested at the bottom of the loop.
addq $1, %rdi
movapd (%r10,%rax), %xmm2
# xmm0 += packed doubles at (%r11,%rax); xmm2 += packed doubles at (%rcx,%rax).
addpd (%r11,%rax), %xmm0
addpd (%rcx,%rax), %xmm2
# Write both sums back to the locations the first operands were loaded from.
movaps %xmm0, (%r9,%rax)
movaps %xmm2, (%r10,%rax)
# Advance the shared byte offset by one 16-byte vector.
addq $16, %rax
# Loop while %r8 is above %rdi (unsigned compare).
cmpq %rdi, %r8
ja .L7
(-mavx -O3 -mtune=corei7-avx -frename-registers, stack-based references prevent
good scheduling)
# AVX loop body (AT&T syntax) using four distinct vector registers
# (ymm0, ymm3 for the loads; ymm2, ymm4 for the sums). Two base pointers are
# still re-read from %rbp-relative slots on every iteration.
.L9:
# Fetch the first addend base from its %rbp-relative slot.
movq -136(%rbp), %rdx
# %rdi is the iteration counter tested at the bottom of the loop.
addq $1, %rdi
# Both register-based vector loads are issued before the first store.
vmovapd (%r9,%rax), %ymm0
vmovapd (%r10,%rax), %ymm3
# ymm2 = ymm0 + packed doubles at (%rdx,%rax).
vaddpd (%rdx,%rax), %ymm0, %ymm2
# %rdx is reused: fetch the second addend base from its %rbp-relative slot.
movq -144(%rbp), %rdx
# Note the ordering: this store is placed before the second vaddpd, so that
# instruction's memory operand is read after the store to (%r9,%rax).
vmovapd %ymm2, (%r9,%rax)
# ymm4 = ymm3 + packed doubles at (%rdx,%rax).
vaddpd (%rdx,%rax), %ymm3, %ymm4
vmovapd %ymm4, (%r10,%rax)
# Advance the shared byte offset by one 32-byte vector.
addq $32, %rax
# Loop while the bound held at -152(%rbp) is above %rdi (unsigned compare).
cmpq %rdi, -152(%rbp)
ja .L9
(-mavx -O3 -mtune=corei7-avx -fschedule-insns -fno-ivopts, no spilling in the
loop, scheduled as desired)
# AVX loop body (AT&T syntax) with no shared offset register: each of the six
# pointers (%rcx, %r10, %rdx, %rsi, %rdi, %r11) is advanced by 32 bytes per
# iteration. Accesses that come after their pointer's increment use a -32
# displacement to address the element for the current iteration.
.L9:
addq $32, %rcx
addq $32, %r10
# Load via %rdx before %rdx is advanced (hence no displacement).
vmovapd (%rdx), %ymm1
addq $32, %rsi
# Load via %rdi before %rdi is advanced (hence no displacement).
vmovapd (%rdi), %ymm0
addq $32, %r11
# %rax is the iteration counter tested at the bottom of the loop.
addq $1, %rax
addq $32, %rdx
# %rcx was already advanced above, so -32(%rcx) is this iteration's element.
vaddpd -32(%rcx), %ymm1, %ymm1
addq $32, %rdi
# %r10 was already advanced above, so -32(%r10) is this iteration's element.
vaddpd -32(%r10), %ymm0, %ymm0
# Both stores come after both loads and both adds; %rsi and %r11 were already
# advanced, hence the -32 displacements.
vmovapd %ymm1, -32(%rsi)
vmovapd %ymm0, -32(%r11)
# Loop while the bound held at -184(%rbp) is above %rax (unsigned compare).
cmpq %rax, -184(%rbp)
ja .L9