https://gcc.gnu.org/bugzilla/show_bug.cgi?id=93056
Bug ID: 93056 Summary: Poor codegen for heapsort in stephanov_vector benchmark Product: gcc Version: 10.0 Status: UNCONFIRMED Severity: normal Priority: P3 Component: tree-optimization Assignee: unassigned at gcc dot gnu.org Reporter: hubicka at gcc dot gnu.org Target Milestone: --- Created attachment 47543 --> https://gcc.gnu.org/bugzilla/attachment.cgi?id=47543&action=edit preprocessed benchmark with other tests disabled. heap sort test in stepanov_vector benchmark runs about 50% slower when built with GCC10 compared to clang8 (with -O3 -march=native, on bdver1 hardware) Clang profile is: 33.19% stepanov_vector stepanov_vector [.] benchmark::heapsort<std::reverse_iterator<std::reverse_iterator<__gnu_cxx::__normal_iterator<double*, std::vector<double, std 16.73% stepanov_vector stepanov_vector [.] benchmark::heapsort<reverse_iterator<reverse_iterator<__gnu_cxx::__normal_iterator<double*, std::vector<double, std::allocato 16.07% stepanov_vector stepanov_vector [.] benchmark::heapsort<__gnu_cxx::__normal_iterator<double*, std::vector<double, std::allocator<double> > > > 15.61% stepanov_vector stepanov_vector [.] benchmark::heapsort<double*> 15.61% stepanov_vector stepanov_vector [.] benchmark::heapsort<std::reverse_iterator<std::reverse_iterator<double*> > > while GCC profile is: 32.90% stepanov_vector stepanov_vector [.] benchmark::__sift_in<std::reverse_iterator<std::reverse_iterator<__gnu_cxx::__normal_iterator<double*, std::vector<double, st 16.25% stepanov_vector stepanov_vector [.] benchmark::__sift_in<std::reverse_iterator<std::reverse_iterator<double*> >, double> 16.01% stepanov_vector stepanov_vector [.] benchmark::__sift_in<reverse_iterator<reverse_iterator<__gnu_cxx::__normal_iterator<double*, std::vector<double, std::allocat 15.94% stepanov_vector stepanov_vector [.] benchmark::__sift_in<double*, double> 15.73% stepanov_vector stepanov_vector [.] benchmark::__sift_in<__gnu_cxx::__normal_iterator<double*, std::vector<double, std::allocator<double> > >, double> In the hottest function the internal iterator loop leads to worse code: Clang: │ xor %edx,%edx │ if ( *(begin+(i-1)) < *(begin+i)) 0.88 │180:┌─→vmovsd (%rax,%rsi,8),%xmm1 6.16 │ │ xor %edi,%edi 6.45 │ │ vucomi -0x8(%rax,%rsi,8),%xmm1 22.87 │ │ seta %dil 6.74 │ │ or %rsi,%rdi │ │ *(begin + free) = *(begin+(i-1)); 7.62 │ │ mov -0x8(%rax,%rdi,8),%rcx 14.08 │ │ mov %rcx,(%rax,%rdx,8) 0.59 │ │ lea -0x1(%rdi),%rdx │ │ for ( i = 2*(free+1); i < count; i += i) { 0.59 │ │ add %rdi,%rdi 2.05 │ │ mov %rdi,%rsi 2.05 │ │ cmp %r11,%rdi 0.29 │ └──jl 180 │ if (i == count) { GCC: │ nop 1.81 │18:┌─→mov %rax,%r10 │ │ if ( *(begin+(i-1)) < *(begin+i)) 1.21 │1b:│ lea -0x1(%rcx),%rax │ │_ZNK9__gnu_cxx17__normal_iteratorIPdSt6vectorIdSaIdEEEplEl(): 2.02 │ │ lea 0x0(,%rax,8),%r9 0.40 │ │ lea (%rsi,%r9,1),%r8 0.40 │ │ lea 0x8(%rsi,%r9,1),%r9 │ │_ZN9benchmark9__sift_inISt16reverse_iteratorIS1_IN9__gnu_cxx17__normal_iteratorIPdSt6vectorIdSaIdEEEEEEdEEvlT_lT0_(): 3.23 │ │ vmovsd (%r8),%xmm1 7.66 │ │ vmovsd (%r9),%xmm2 0.40 │ │ vcomis %xmm1,%xmm2 8.67 │ │↓ jbe 4d 4.64 │ │ vmovap %xmm2,%xmm1 │ │ i++; 18.55 │ │ mov %rcx,%rax │ │ mov %r9,%r8 1.01 │ │ inc %rcx │ │ for ( i = 2*(free+1); i < count; i += i) { 4.64 │4d:│ add %rcx,%rcx │ │ *(begin + free) = *(begin+(i-1)); 19.76 │ │ vmovsd %xmm1,(%rsi,%r10,8) │ │ for ( i = 2*(free+1); i < count; i += i) { 8.06 │ │ cmp %rcx,%rdi │ └──jg 18 │ free = i-1; │ } The code is: void __sift_in(ptrdiff_t count, RandomAccessIterator begin, ptrdiff_t free_in, T next) { ptrdiff_t i; ptrdiff_t free = free_in; // sift up the free node for ( i = 2*(free+1); i < count; i += i) { if ( *(begin+(i-1)) < *(begin+i)) i++; *(begin + free) = *(begin+(i-1)); free = i-1; } // special case in sift up if the last inner node has only 1 child if (i == count) { *(begin + free) = *(begin+(i-1)); free = i-1; } // sift down the new item next i = (free-1)/2; while( (free > free_in) && *(begin+i) < next) { *(begin + free) = *(begin+i); free = i; i = (free-1)/2; } *(begin + free) = next; }