On Sat, Oct 7, 2017 at 2:22 PM, Saldyrkine, Mikhail <mikhail.saldyrk...@gs.com> wrote: > The "uint64_t test_noasm(uint64_t idx)" function has the same loop, and that function is > optimized out.
There is a difference there: objects is limited to 1024 elements, and loading past the array bounds is undefined. Thanks, Andrew > I've changed the code to constrain the loop iterations, and the compiler: > - unrolled the loop > - did not eliminate the function as it does when asm is not used > It looks like the "infinite loop" is not the root cause. > > inline uint64_t test_asm_inside_loop(uint64_t idx) { > uint64_t result; > for( int i = 0; i < capacity; ++i ) > { > asm ("movq (%1,%2,8), %0" : "=r" (result) : "r" (objects), "r" (idx) > ); > if( result > 128 ) > return result; > ++idx; > } > return 0; > } > > Dump of assembler code for function _Z28compile_test_asm_inside_loopv: > 0x0000000000400b40 <+0>: xor %eax,%eax > 0x0000000000400b42 <+2>: mov $0x602080,%edx > 0x0000000000400b47 <+7>: mov (%rdx,%rax,8),%rcx > 0x0000000000400b4b <+11>: cmp $0x80,%rcx > 0x0000000000400b52 <+18>: ja 0x400c38 > <_Z28compile_test_asm_inside_loopv+248> > 0x0000000000400b58 <+24>: mov $0x1,%eax > 0x0000000000400b5d <+29>: mov (%rdx,%rax,8),%rsi > 0x0000000000400b61 <+33>: cmp $0x80,%rsi > 0x0000000000400b68 <+40>: ja 0x400c38 > <_Z28compile_test_asm_inside_loopv+248> > 0x0000000000400b6e <+46>: lea 0x1(%rax),%rdi > 0x0000000000400b72 <+50>: mov (%rdx,%rdi,8),%r8 > 0x0000000000400b76 <+54>: cmp $0x80,%r8 > 0x0000000000400b7d <+61>: ja 0x400c38 > <_Z28compile_test_asm_inside_loopv+248> > 0x0000000000400b83 <+67>: lea 0x2(%rax),%r9 > 0x0000000000400b87 <+71>: mov (%rdx,%r9,8),%r10 > 0x0000000000400b8b <+75>: cmp $0x80,%r10 > 0x0000000000400b92 <+82>: ja 0x400c38 > <_Z28compile_test_asm_inside_loopv+248> > 0x0000000000400b98 <+88>: lea 0x3(%rax),%r11 > 0x0000000000400b9c <+92>: mov (%rdx,%r11,8),%rcx > 0x0000000000400ba0 <+96>: cmp $0x80,%rcx > 0x0000000000400ba7 <+103>: ja 0x400c38 > <_Z28compile_test_asm_inside_loopv+248> > 0x0000000000400bad <+109>: lea 0x4(%rax),%rsi > 0x0000000000400bb1 <+113>: mov (%rdx,%rsi,8),%r8 > 0x0000000000400bb5 <+117>: cmp $0x80,%r8 > 0x0000000000400bbc <+124>: ja 0x400c38 > 
<_Z28compile_test_asm_inside_loopv+248> > 0x0000000000400bbe <+126>: lea 0x5(%rax),%r9 > 0x0000000000400bc2 <+130>: mov (%rdx,%r9,8),%r10 > 0x0000000000400bc6 <+134>: cmp $0x80,%r10 > 0x0000000000400bcd <+141>: ja 0x400c38 > <_Z28compile_test_asm_inside_loopv+248> > 0x0000000000400bcf <+143>: lea 0x6(%rax),%r11 > 0x0000000000400bd3 <+147>: mov (%rdx,%r11,8),%rcx > 0x0000000000400bd7 <+151>: cmp $0x80,%rcx > 0x0000000000400bde <+158>: ja 0x400c38 > <_Z28compile_test_asm_inside_loopv+248> > 0x0000000000400be0 <+160>: lea 0x7(%rax),%rsi > 0x0000000000400be4 <+164>: mov (%rdx,%rsi,8),%r8 > 0x0000000000400be8 <+168>: cmp $0x80,%r8 > 0x0000000000400bef <+175>: ja 0x400c38 > <_Z28compile_test_asm_inside_loopv+248> > 0x0000000000400bf1 <+177>: lea 0x8(%rax),%r9 > 0x0000000000400bf5 <+181>: mov (%rdx,%r9,8),%r10 > 0x0000000000400bf9 <+185>: cmp $0x80,%r10 > 0x0000000000400c00 <+192>: ja 0x400c38 > <_Z28compile_test_asm_inside_loopv+248> > 0x0000000000400c02 <+194>: add $0x9,%rax > 0x0000000000400c06 <+198>: mov (%rdx,%rax,8),%rax > 0x0000000000400c0a <+202>: cmp $0x80,%rax > 0x0000000000400c10 <+208>: ja 0x400c38 > <_Z28compile_test_asm_inside_loopv+248> > 0x0000000000400c12 <+210>: lea 0x9(%rdi),%r11 > 0x0000000000400c16 <+214>: mov (%rdx,%r11,8),%rcx > 0x0000000000400c1a <+218>: cmp $0x80,%rcx > 0x0000000000400c21 <+225>: ja 0x400c38 > <_Z28compile_test_asm_inside_loopv+248> > 0x0000000000400c23 <+227>: lea 0xa(%rdi),%rax > 0x0000000000400c27 <+231>: cmp $0x400,%rax > 0x0000000000400c2d <+237>: jne 0x400b5d > <_Z28compile_test_asm_inside_loopv+29> > 0x0000000000400c33 <+243>: repz retq > 0x0000000000400c35 <+245>: nopl (%rax) > 0x0000000000400c38 <+248>: repz retq > > -----Original Message----- > From: Andrew Pinski [mailto:pins...@gmail.com] > Sent: Saturday, October 07, 2017 3:04 PM > To: Saldyrkine, Mikhail [Sec Div] > Cc: gcc-bugs@gcc.gnu.org > Subject: Re: GCC does not optimize out functions without side effects with > asm statements inside loop even if return velue 
is ignored > > On Sat, Oct 7, 2017 at 8:39 AM, Saldyrkine, Mikhail > <mikhail.saldyrk...@gs.com> wrote: >> g++ (GCC) 6.3.1 20170216 (Red Hat 6.3.1-3) >> >> In the case below, compile_test_asm_inside_loop invokes test_asm_inside_loop >> and ignores the result. >> The call to test_asm_inside_loop is expected to be eliminated since the return >> value is not used and there is no side effect. >> The call elimination works fine without asm and without a loop. >> It does not work with asm inside a loop. > > Because the loop could be an infinite loop, and GCC does not know how > many times the inline-asm is going to be called or whether there are other > side effects. > > Let's look at the function: > inline uint64_t test_asm_inside_loop(uint64_t idx) { > while(true) > { > uint64_t result; > asm ("movq (%1,%2,8), %0" : "=r" (result) : "r" (objects), "r" (idx) > ); > if( result > 128 ) > return result; > ++idx; > } > } > > The loop is only broken out of when result is > 128. The result from the > inline-asm is used as the exit condition for the loop. 
> > Thanks, > Andrew > >> >> TEST CODE >> >> #include <iostream> >> #include <assert.h> >> >> using namespace std; >> constexpr static size_t capacity = 1024; >> uint64_t objects[capacity]; >> >> // THE FUNCTION IS ELIMINATED BY COMPILER IF OUTPUT IS NOT USED >> inline uint64_t test_noloop(uint64_t idx) { >> uint64_t result; >> asm ("movq (%1,%2,8), %0" : "=r" (result) : "r" (objects), "r" (idx) ); >> if( result > 128 ) >> return result; >> return 0; >> } >> >> // THE FUNCTION IS ELIMINATED BY COMPILER IF OUTPUT IS NOT USED >> inline uint64_t test_noasm(uint64_t idx) { >> while(true) >> { >> if( objects[idx] > 128 ) >> return objects[idx]; >> ++idx; >> } >> } >> >> // THE FUNCTION IS KEEPT EVEN WHEN IF RESULT IS NOT USED - ASM INSIDE LOOP >> CAUSING THE ISSUE >> inline uint64_t test_asm_inside_loop(uint64_t idx) { >> while(true) >> { >> uint64_t result; >> asm ("movq (%1,%2,8), %0" : "=r" (result) : "r" (objects), "r" (idx) >> ); >> if( result > 128 ) >> return result; >> ++idx; >> } >> } >> >> void init() { >> srand(time(nullptr)); >> for( size_t i = 0; i < capacity - 1; ++i ) >> objects[i] = random() % 256; >> objects[capacity-1] = 255; >> } >> >> // TETS THAT test_noasm AND test_asm_inside_loop PRODUCE SAME RESULT >> void sanity_test() { >> for( size_t i = 0; i < capacity; ++i ) { >> assert( test_noasm(i) == test_asm_inside_loop(i)); >> } >> } >> >> void compile_test_noasm() { >> test_noasm(0); >> } >> >> void compile_test_noloop() { >> test_noloop(0); >> } >> >> void compile_test_asm_inside_loop() { >> test_asm_inside_loop(0); >> } >> >> int main( int argc, char* argv[] ) { >> init(); >> sanity_test(); >> compile_test_noasm(); >> compile_test_noloop(); >> compile_test_asm_inside_loop(); >> } >> >> COMPILATION AND DISASSEMBLER RESULTS: >> >> /opt/rh/devtoolset-6//root/bin/g++ -O3 -funroll-loops >> loop_optimization.cpp; gdb -batch -ex "file a.out" -ex "disas >> compile_test_noasm" -ex "disas compile_test_noloop" -ex "disas >> compile_test_asm_inside_loop" >> 
Dump of assembler code for function _Z18compile_test_noasmv: >> 0x0000000000400970 <+0>: repz retq >> End of assembler dump. >> Dump of assembler code for function _Z19compile_test_noloopv: >> 0x0000000000400980 <+0>: repz retq >> End of assembler dump. >> Dump of assembler code for function _Z28compile_test_asm_inside_loopv: >> 0x0000000000400990 <+0>: xor %edx,%edx >> 0x0000000000400992 <+2>: mov $0x601080,%ecx >> 0x0000000000400997 <+7>: xor %eax,%eax >> 0x0000000000400999 <+9>: mov (%rcx,%rdx,8),%rsi >> 0x000000000040099d <+13>: cmp $0x80,%rsi >> 0x00000000004009a4 <+20>: ja 0x4009c1 >> <_Z28compile_test_asm_inside_loopv+49> >> 0x00000000004009a6 <+22>: nopw %cs:0x0(%rax,%rax,1) >> 0x00000000004009b0 <+32>: add $0x1,%rax >> 0x00000000004009b4 <+36>: mov (%rcx,%rax,8),%rdi >> 0x00000000004009b8 <+40>: cmp $0x80,%rdi >> 0x00000000004009bf <+47>: jbe 0x4009b0 >> <_Z28compile_test_asm_inside_loopv+32> >> 0x00000000004009c1 <+49>: repz retq >> End of assembler dump. >> >>