https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98868

--- Comment #3 from Martin Liška <marxin at gcc dot gnu.org> ---
It's likely related to the alignment of a small loop:

# Overhead  Command  Shared Object         Symbol                              
# ........  .......  ....................  ....................................
#
    78.19%  a.out    a.out                 [.] matsim_
    17.00%  a.out    a.out                 [.] evlrnf_

Hot spot in matsim_ (with --show-total-period):

SLOW:

     8653282 :   4017cb: imul   $0x3243f6ad,%esi,%esi
             :            genuni():
             :            genuni = us231 * real (jsee)
   726254541 :   4017d1: vxorps %xmm0,%xmm0,%xmm0
             :            jsee = jsee * jmul + jadd
           0 :   4017d5: add    $0x1b0cb175,%esi
             :            genuni = us231 * real (jsee)
   105853662 :   4017db: vcvtsi2ss %esi,%xmm0,%xmm0
   273371557 :   4017df: vmulss %xmm1,%xmm0,%xmm0
             :            gentrs_():
             :            do icls = icls1, ncls
   454049783 :   4017e3: cmp    $0xffffffff,%edi
     2165881 :   4017e6: je     401970 <matsim_+0x470>
           0 :   4017ec: cmp    $0x1,%edi
     1081799 :   4017ef: jne    4017cb <matsim_+0x2cb>
     2155914 :   4017f1: mov    %r9,%rdx
     4307088 :   4017f4: mov    %r8d,%ecx
           0 :   4017f7: jmp    401811 <matsim_+0x311>
           0 :   4017f9: nopl   0x0(%rax)
  8624612913 :   401800: inc    %ecx
    42153493 :   401802: add    $0x400,%rdx
   484044717 :   401809: cmp    $0x101,%ecx
    38933067 :   40180f: je     4017cb <matsim_+0x2cb>

FAST:

    45442445 :   4017c9: imul   $0x3243f6ad,%edx,%edx
             :            genuni():
             :            genuni = us231 * real (jsee)
     1076892 :   4017cf: vxorps %xmm0,%xmm0,%xmm0
             :            jsee = jsee * jmul + jadd
     3245642 :   4017d3: add    $0x1b0cb175,%edx
             :            jsee = ibits(jsee, 0, 31)                   ! Replacement
     1083699 :   4017d9: and    $0x7fffffff,%edx
             :            genuni = us231 * real (jsee)
           0 :   4017df: vcvtsi2ss %edx,%xmm0,%xmm0
    76652291 :   4017e3: vmulss %xmm1,%xmm0,%xmm0
             :            gentrs_():
             :            do icls = icls1, ncls
   166631920 :   4017e7: cmp    $0xffffffff,%edi
     3251886 :   4017ea: je     401970 <matsim_+0x470>
           0 :   4017f0: cmp    $0x1,%edi
           0 :   4017f3: jne    4017c9 <matsim_+0x2c9>
           0 :   4017f5: mov    %r9,%rcx
           0 :   4017f8: mov    %r8d,%esi
     1083364 :   4017fb: jmp    401811 <matsim_+0x311>
           0 :   4017fd: nopl   (%rax)
  1099920836 :   401800: inc    %esi
   209587136 :   401802: add    $0x400,%rcx
   100391619 :   401809: cmp    $0x101,%esi
    69184337 :   40180f: je     4017c9 <matsim_+0x2c9>

For some reason, the hottest instruction ("inc") accounts for a ~10x smaller
number of cycles in the fast version.
That instruction takes 20% of the cycles in the slow version.

Reply via email to