http://gcc.gnu.org/bugzilla/show_bug.cgi?id=59262
Bug ID: 59262
Summary: __attribute__ ((optimize())) broken (and corrupts
optimization of the whole compilation unit)
Product: gcc
Version: 4.9.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: tree-optimization
Assignee: unassigned at gcc dot gnu.org
Reporter: vincenzo.innocente at cern dot ch
Seen with the latest 4.9 trunk; also seen in 4.8.1.
Take the following test case:
cat attribute.cc
// Helper declared inline: returns x + y. Each loop below calls it once per element.
inline float sum(float x, float y) { return x+y;}
// Forward declarations that attach a per-function optimize attribute.
// Each declaration is compiled in only when its OPTn macro is defined
// (the command lines in this report pass -DOPT1, -DOPT2 or -DOPT3, one at a time);
// the function definitions further down carry no attribute themselves.
#ifdef OPT1
float foo1() __attribute__ ((optimize("O3", "fast-math")));
#endif
#ifdef OPT2
float foo2() __attribute__ ((optimize("fast-math")));
#endif
#ifdef OPT3
float foo3() __attribute__ ((optimize("O3")));
#endif
// Global input arrays, 1024 floats each, read by every loop below.
float x[1024], y[1024];
// Accumulates sum(x[i], y[i]) for i in [0, 1024) and returns the total.
// Picks up optimize("O3", "fast-math") from its earlier declaration when OPT1 is defined.
float foo1() {
float ret=0;
for (int i=0; i<1024; ++i)
ret += sum(x[i],y[i]);
return ret;
}
// Accumulates sum(x[i], y[i]) for i in [0, 1024) and returns the total.
// Picks up optimize("fast-math") from its earlier declaration when OPT2 is defined.
float foo2() {
float ret=0;
for (int i=0; i<1024; ++i)
ret += sum(x[i],y[i]);
return ret;
}
// Accumulates sum(x[i], y[i]) for i in [0, 1024) and returns the total.
// Picks up optimize("O3") from its earlier declaration when OPT3 is defined.
float foo3() {
float ret=0;
for (int i=0; i<1024; ++i)
ret += sum(x[i],y[i]);
return ret;
}
// Same loop as foo1/foo2/foo3, but no optimize attribute is ever declared for it
// (presumably the baseline for comparison — it is compiled with the command-line flags only).
float bar() {
float ret=0;
for (int i=0; i<1024; ++i)
ret += sum(x[i],y[i]);
return ret;
}
c++ -O2 -ftree-vectorize -S attribute.cc -march=corei7 -DOPT1 ; cat attribute.s
.file "attribute.cc"
# _Z3sumff = sum(float, float), emitted out of line as a weak symbol in its own
# COMDAT section. It is called from _Z4foo1v later in this dump.
# Adds %xmm1 into %xmm0 (scalar single precision) and returns.
.section .text._Z3sumff,"axG",@progbits,_Z3sumff,comdat
.p2align 4,,15
.weak _Z3sumff
.type _Z3sumff, @function
_Z3sumff:
.LFB0:
.cfi_startproc
addss %xmm1, %xmm0
ret
.cfi_endproc
.LFE0:
.size _Z3sumff, .-_Z3sumff
.text
# _Z4foo1v = foo1(); in this -DOPT1 build it is the function declared with
# optimize("O3", "fast-math").
# Scalar loop: one out-of-line call to _Z3sumff per element, with the running
# total kept in the stack slot 12(%rsp) and the byte offset into x/y in %rbx.
# The entry alignment below has a max-skip of -1, whereas the entry alignment
# of every other function in this dump uses 15.
.p2align 4,,-1
.globl _Z4foo1v
.type _Z4foo1v, @function
_Z4foo1v:
.LFB1:
.cfi_startproc
pushq %rbx
.cfi_def_cfa_offset 16
.cfi_offset 3, -16
# xmm3 = 0.0f, rbx = 0
pxor %xmm3, %xmm3
xorl %ebx, %ebx
subq $16, %rsp
.cfi_def_cfa_offset 32
# running total = 0.0f
movss %xmm3, 12(%rsp)
.p2align 4,,10
.p2align 3
.L3:
# xmm0 = x[i]; advance the offset by one float; xmm1 = y[i]
movss x(%rbx), %xmm0
addq $4, %rbx
movss y-4(%rbx), %xmm1
call _Z3sumff
# total += value returned by the call
addss 12(%rsp), %xmm0
movss %xmm0, 12(%rsp)
# loop until the offset reaches 4096 bytes (1024 floats)
cmpq $4096, %rbx
jne .L3
addq $16, %rsp
.cfi_def_cfa_offset 16
popq %rbx
.cfi_def_cfa_offset 8
ret
.cfi_endproc
.LFE1:
.size _Z4foo1v, .-_Z4foo1v
# _Z4foo2v = foo2(); no optimize attribute applies to it in this -DOPT1 build.
# Vectorized: each iteration handles 16 bytes (4 floats) with packed adds;
# %rax is the byte offset into x/y and %xmm0 holds four partial sums.
.p2align 4,,15
.globl _Z4foo2v
.type _Z4foo2v, @function
_Z4foo2v:
.LFB2:
.cfi_startproc
xorl %eax, %eax
pxor %xmm0, %xmm0
.p2align 4,,10
.p2align 3
.L8:
# xmm1 = x[i..i+3] + y[i..i+3]; xmm0 += xmm1
movaps x(%rax), %xmm1
addq $16, %rax
addps y-16(%rax), %xmm1
addps %xmm1, %xmm0
cmpq $4096, %rax
jne .L8
# two horizontal adds fold the four partial sums into one scalar,
# then the constant at .LC0 (0) is added before returning
haddps %xmm0, %xmm0
haddps %xmm0, %xmm0
addss .LC0(%rip), %xmm0
ret
.cfi_endproc
.LFE2:
.size _Z4foo2v, .-_Z4foo2v
# _Z4foo3v = foo3(); no optimize attribute applies to it in this -DOPT1 build.
# Vectorized: each iteration handles 16 bytes (4 floats) with packed adds;
# %rax is the byte offset into x/y and %xmm0 holds four partial sums.
.p2align 4,,15
.globl _Z4foo3v
.type _Z4foo3v, @function
_Z4foo3v:
.LFB3:
.cfi_startproc
xorl %eax, %eax
pxor %xmm0, %xmm0
.p2align 4,,10
.p2align 3
.L11:
# xmm1 = x[i..i+3] + y[i..i+3]; xmm0 += xmm1
movaps x(%rax), %xmm1
addq $16, %rax
addps y-16(%rax), %xmm1
addps %xmm1, %xmm0
cmpq $4096, %rax
jne .L11
# two horizontal adds fold the four partial sums into one scalar,
# then the constant at .LC0 (0) is added before returning
haddps %xmm0, %xmm0
haddps %xmm0, %xmm0
addss .LC0(%rip), %xmm0
ret
.cfi_endproc
.LFE3:
.size _Z4foo3v, .-_Z4foo3v
# _Z3barv = bar(); never given an optimize attribute in the source.
# Vectorized: each iteration handles 16 bytes (4 floats) with packed adds;
# %rax is the byte offset into x/y and %xmm0 holds four partial sums.
.p2align 4,,15
.globl _Z3barv
.type _Z3barv, @function
_Z3barv:
.LFB4:
.cfi_startproc
xorl %eax, %eax
pxor %xmm0, %xmm0
.p2align 4,,10
.p2align 3
.L14:
# xmm1 = x[i..i+3] + y[i..i+3]; xmm0 += xmm1
movaps x(%rax), %xmm1
addq $16, %rax
addps y-16(%rax), %xmm1
addps %xmm1, %xmm0
cmpq $4096, %rax
jne .L14
# two horizontal adds fold the four partial sums into one scalar,
# then the constant at .LC0 (0) is added before returning
haddps %xmm0, %xmm0
haddps %xmm0, %xmm0
addss .LC0(%rip), %xmm0
ret
.cfi_endproc
.LFE4:
.size _Z3barv, .-_Z3barv
# Storage for the two global arrays: 4096 zero bytes each in .bss, 32-byte aligned.
.globl y
.bss
.align 32
.type y, @object
.size y, 4096
y:
.zero 4096
.globl x
.align 32
.type x, @object
.size x, 4096
x:
.zero 4096
# 4-byte read-only constant 0: the operand of the addss .LC0(%rip) instructions
# in the vectorized functions of this dump.
.section .rodata.cst4,"aM",@progbits,4
.align 4
.LC0:
.long 0
# Compiler identification string. The quoted string had been wrapped across two
# lines (an unterminated string literal); it is rejoined here on a single line.
.ident "GCC: (GNU) 4.9.0 20131110 (experimental) [trunk revision 204623]"
.section .note.GNU-stack,"",@progbits
c++ -O2 -ftree-vectorize -S attribute.cc -march=corei7 -DOPT2 ; cat attribute.s
.file "attribute.cc"
.text
# _Z4foo1v = foo1(); no optimize attribute applies to it in this -DOPT2 build.
# Vectorized: each iteration handles 16 bytes (4 floats) with packed adds;
# %rax is the byte offset into x/y and %xmm0 holds four partial sums.
.p2align 4,,15
.globl _Z4foo1v
.type _Z4foo1v, @function
_Z4foo1v:
.LFB1:
.cfi_startproc
xorl %eax, %eax
pxor %xmm0, %xmm0
.p2align 4,,10
.p2align 3
.L2:
# xmm1 = x[i..i+3] + y[i..i+3]; xmm0 += xmm1
movaps x(%rax), %xmm1
addq $16, %rax
addps y-16(%rax), %xmm1
addps %xmm1, %xmm0
cmpq $4096, %rax
jne .L2
# two horizontal adds fold the four partial sums into one scalar,
# then the constant at .LC0 (0) is added before returning
haddps %xmm0, %xmm0
haddps %xmm0, %xmm0
addss .LC0(%rip), %xmm0
ret
.cfi_endproc
.LFE1:
.size _Z4foo1v, .-_Z4foo1v
# _Z4foo2v = foo2(); in this -DOPT2 build it is the function declared with
# optimize("fast-math").
# Not vectorized: one float per iteration using scalar addss. There is no call
# instruction, so the addition from sum() has been inlined.
# The entry alignment below has a max-skip of -1, whereas the entry alignment
# of every other function in this dump uses 15.
.p2align 4,,-1
.globl _Z4foo2v
.type _Z4foo2v, @function
_Z4foo2v:
.LFB2:
.cfi_startproc
xorl %eax, %eax
pxor %xmm0, %xmm0
.p2align 4,,10
.p2align 3
.L6:
# xmm1 = x[i] + y[i]; xmm0 += xmm1; offset advances by one float
movss x(%rax), %xmm1
addq $4, %rax
addss y-4(%rax), %xmm1
addss %xmm1, %xmm0
# loop until the offset reaches 4096 bytes (1024 floats)
cmpq $4096, %rax
jne .L6
ret
.cfi_endproc
.LFE2:
.size _Z4foo2v, .-_Z4foo2v
# _Z4foo3v = foo3(); no optimize attribute applies to it in this -DOPT2 build.
# Vectorized: each iteration handles 16 bytes (4 floats) with packed adds;
# %rax is the byte offset into x/y and %xmm0 holds four partial sums.
.p2align 4,,15
.globl _Z4foo3v
.type _Z4foo3v, @function
_Z4foo3v:
.LFB3:
.cfi_startproc
xorl %eax, %eax
pxor %xmm0, %xmm0
.p2align 4,,10
.p2align 3
.L9:
# xmm1 = x[i..i+3] + y[i..i+3]; xmm0 += xmm1
movaps x(%rax), %xmm1
addq $16, %rax
addps y-16(%rax), %xmm1
addps %xmm1, %xmm0
cmpq $4096, %rax
jne .L9
# two horizontal adds fold the four partial sums into one scalar,
# then the constant at .LC0 (0) is added before returning
haddps %xmm0, %xmm0
haddps %xmm0, %xmm0
addss .LC0(%rip), %xmm0
ret
.cfi_endproc
.LFE3:
.size _Z4foo3v, .-_Z4foo3v
# _Z3barv = bar(); never given an optimize attribute in the source.
# Vectorized: each iteration handles 16 bytes (4 floats) with packed adds;
# %rax is the byte offset into x/y and %xmm0 holds four partial sums.
.p2align 4,,15
.globl _Z3barv
.type _Z3barv, @function
_Z3barv:
.LFB4:
.cfi_startproc
xorl %eax, %eax
pxor %xmm0, %xmm0
.p2align 4,,10
.p2align 3
.L12:
# xmm1 = x[i..i+3] + y[i..i+3]; xmm0 += xmm1
movaps x(%rax), %xmm1
addq $16, %rax
addps y-16(%rax), %xmm1
addps %xmm1, %xmm0
cmpq $4096, %rax
jne .L12
# two horizontal adds fold the four partial sums into one scalar,
# then the constant at .LC0 (0) is added before returning
haddps %xmm0, %xmm0
haddps %xmm0, %xmm0
addss .LC0(%rip), %xmm0
ret
.cfi_endproc
.LFE4:
.size _Z3barv, .-_Z3barv
# Storage for the two global arrays: 4096 zero bytes each in .bss, 32-byte aligned.
.globl y
.bss
.align 32
.type y, @object
.size y, 4096
y:
.zero 4096
.globl x
.align 32
.type x, @object
.size x, 4096
x:
.zero 4096
# 4-byte read-only constant 0: the operand of the addss .LC0(%rip) instructions
# in the vectorized functions of this dump.
.section .rodata.cst4,"aM",@progbits,4
.align 4
.LC0:
.long 0
# Compiler identification string. The quoted string had been wrapped across two
# lines (an unterminated string literal); it is rejoined here on a single line.
.ident "GCC: (GNU) 4.9.0 20131110 (experimental) [trunk revision 204623]"
.section .note.GNU-stack,"",@progbits
[innocent@vinavx2 bugs48]$ c++ -O2 -ftree-vectorize -S attribute.cc -march=corei7 -DOPT3 ; cat attribute.s
.file "attribute.cc"
# _Z3sumff = sum(float, float), emitted out of line as a weak symbol in its own
# COMDAT section. It is called from _Z4foo3v later in this dump.
# Adds %xmm1 into %xmm0 (scalar single precision) and returns.
.section .text._Z3sumff,"axG",@progbits,_Z3sumff,comdat
.p2align 4,,15
.weak _Z3sumff
.type _Z3sumff, @function
_Z3sumff:
.LFB0:
.cfi_startproc
addss %xmm1, %xmm0
ret
.cfi_endproc
.LFE0:
.size _Z3sumff, .-_Z3sumff
.text
# _Z4foo1v = foo1(); no optimize attribute applies to it in this -DOPT3 build.
# Not vectorized: one float per iteration using scalar addss. There is no call
# instruction, so the addition from sum() has been inlined.
.p2align 4,,15
.globl _Z4foo1v
.type _Z4foo1v, @function
_Z4foo1v:
.LFB1:
.cfi_startproc
xorl %eax, %eax
pxor %xmm0, %xmm0
.p2align 4,,10
.p2align 3
.L3:
# xmm1 = y[i] + x[i]; xmm0 += xmm1; offset advances by one float
movss y(%rax), %xmm1
addq $4, %rax
addss x-4(%rax), %xmm1
addss %xmm1, %xmm0
# loop until the offset reaches 4096 bytes (1024 floats)
cmpq $4096, %rax
jne .L3
ret
.cfi_endproc
.LFE1:
.size _Z4foo1v, .-_Z4foo1v
# _Z4foo2v = foo2(); no optimize attribute applies to it in this -DOPT3 build.
# Not vectorized: one float per iteration using scalar addss. There is no call
# instruction, so the addition from sum() has been inlined.
.p2align 4,,15
.globl _Z4foo2v
.type _Z4foo2v, @function
_Z4foo2v:
.LFB2:
.cfi_startproc
xorl %eax, %eax
pxor %xmm0, %xmm0
.p2align 4,,10
.p2align 3
.L7:
# xmm1 = y[i] + x[i]; xmm0 += xmm1; offset advances by one float
movss y(%rax), %xmm1
addq $4, %rax
addss x-4(%rax), %xmm1
addss %xmm1, %xmm0
# loop until the offset reaches 4096 bytes (1024 floats)
cmpq $4096, %rax
jne .L7
ret
.cfi_endproc
.LFE2:
.size _Z4foo2v, .-_Z4foo2v
# _Z4foo3v = foo3(); in this -DOPT3 build it is the function declared with
# optimize("O3").
# Scalar loop: one out-of-line call to _Z3sumff per element, with the running
# total kept in the stack slot 12(%rsp) and the byte offset into x/y in %rbx.
# The entry alignment below has a max-skip of -1, whereas the entry alignment
# of every other function in this dump uses 15.
.p2align 4,,-1
.globl _Z4foo3v
.type _Z4foo3v, @function
_Z4foo3v:
.LFB3:
.cfi_startproc
pushq %rbx
.cfi_def_cfa_offset 16
.cfi_offset 3, -16
# xmm3 = 0.0f, rbx = 0
pxor %xmm3, %xmm3
xorl %ebx, %ebx
subq $16, %rsp
.cfi_def_cfa_offset 32
# running total = 0.0f
movss %xmm3, 12(%rsp)
.p2align 4,,10
.p2align 3
.L10:
# xmm0 = x[i]; advance the offset by one float; xmm1 = y[i]
movss x(%rbx), %xmm0
addq $4, %rbx
movss y-4(%rbx), %xmm1
call _Z3sumff
# total += value returned by the call
addss 12(%rsp), %xmm0
movss %xmm0, 12(%rsp)
# loop until the offset reaches 4096 bytes (1024 floats)
cmpq $4096, %rbx
jne .L10
addq $16, %rsp
.cfi_def_cfa_offset 16
popq %rbx
.cfi_def_cfa_offset 8
ret
.cfi_endproc
.LFE3:
.size _Z4foo3v, .-_Z4foo3v
# _Z3barv = bar(); never given an optimize attribute in the source.
# Not vectorized in this -DOPT3 build: one float per iteration using scalar
# addss. There is no call instruction, so the addition from sum() has been inlined.
.p2align 4,,15
.globl _Z3barv
.type _Z3barv, @function
_Z3barv:
.LFB4:
.cfi_startproc
xorl %eax, %eax
pxor %xmm0, %xmm0
.p2align 4,,10
.p2align 3
.L14:
# xmm1 = y[i] + x[i]; xmm0 += xmm1; offset advances by one float
movss y(%rax), %xmm1
addq $4, %rax
addss x-4(%rax), %xmm1
addss %xmm1, %xmm0
# loop until the offset reaches 4096 bytes (1024 floats)
cmpq $4096, %rax
jne .L14
ret
.cfi_endproc
.LFE4:
.size _Z3barv, .-_Z3barv
# Storage for the two global arrays: 4096 zero bytes each in .bss, 32-byte aligned.
# Unlike the other two dumps, no .LC0 constant follows here.
.globl y
.bss
.align 32
.type y, @object
.size y, 4096
y:
.zero 4096
.globl x
.align 32
.type x, @object
.size x, 4096
x:
.zero 4096
# Compiler identification string. The quoted string had been wrapped across two
# lines (an unterminated string literal); it is rejoined here on a single line.
.ident "GCC: (GNU) 4.9.0 20131110 (experimental) [trunk revision 204623]"
.section .note.GNU-stack,"",@progbits
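Notice how
float foo1() __attribute__ ((optimize("O3", "fast-math")));
causes foo2, foo3 and bar to be vectorized, while preventing the inlining of sum() in foo1 itself (which is then not vectorized either).
float foo2() __attribute__ ((optimize("fast-math")));
instead causes every function EXCEPT foo2 to be vectorized.
Finally, with
float foo3() __attribute__ ((optimize("O3")));
no function is vectorized at all, and sum() is not inlined into foo3 (see the -DOPT3 output above).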