https://gcc.gnu.org/bugzilla/show_bug.cgi?id=65701

--- Comment #7 from Jan Hubicka <hubicka at gcc dot gnu.org> ---
OK, and setting --param large-function-insns=1000 gets the performance back.
The key seems to be not inlining too much into main.  The hotspot changes
from:

  1.11 │3682:   mov    0x60(%rsp),%rdx
  9.32 │3687:┌─→vmovss (%rax,%r12,2),%xmm5
  1.44 │     │  vmovss (%rax),%xmm6
  4.46 │     │  inc    %rdi
  0.01 │     │  add    $0x10,%rcx
  1.17 │     │  vinser $0x10,(%rax,%r13,1),%xmm5,%xmm0
  1.92 │     │  vinser $0x10,(%rax,%r12,1),%xmm6,%xmm1
  0.28 │     │  add    %r14,%rax
  0.07 │     │  vmovlh %xmm0,%xmm1,%xmm0
  2.48 │     │  vfmadd %xmm3,-0x10(%rcx),%xmm0,%xmm3
  5.15 │     │  cmp    %rdi,%rdx
  0.01 │     └──ja     3687
  1.21 │        vhaddp %xmm3,%xmm3,%xmm3
 10.30 │        mov    0x58(%rsp),%rax
  0.03 │        mov    %r13,0x10(%rsp)
  0.00 │        add    %rax,%rsi
  1.18 │        vhaddp %xmm3,%xmm3,%xmm3
 10.80 │        vaddss %xmm3,%xmm4,%xmm4
  4.47 │        cmp    0x68(%rsp),%rax

(the slower variant) to:

  1.38 │        xor    %ecx,%ecx
  6.04 │17c0:┌─→vmovss (%rax,%r11,2),%xmm3
  0.18 │     │  mov    0x90(%rsp),%rsi
  1.43 │     │  inc    %rcx
  1.42 │     │  vmovss (%rax),%xmm5
  0.36 │     │  vmovss (%rdx,%rbx,2),%xmm6
  2.81 │     │  vmovss (%rdx),%xmm7
  0.90 │     │  vinser $0x10,(%rax,%rsi,1),%xmm3,%xmm2
  2.96 │     │  mov    0x88(%rsp),%rsi
  0.04 │     │  vinser $0x10,(%rax,%r11,1),%xmm5,%xmm4
  2.76 │     │  add    0x70(%rsp),%rax
  0.07 │     │  vinser $0x10,(%rdx,%rbx,1),%xmm7,%xmm3
  0.02 │     │  vmovlh %xmm2,%xmm4,%xmm4
  2.69 │     │  vinser $0x10,(%rdx,%rsi,1),%xmm6,%xmm2
  1.13 │     │  add    0x78(%rsp),%rdx
  0.04 │     │  vmovlh %xmm2,%xmm3,%xmm2
  0.01 │     │  vfmadd %xmm0,%xmm2,%xmm4,%xmm0
  2.74 │     │  cmp    %rcx,0x80(%rsp)
  0.07 │     └──ja     17c0
  1.39 │        vhaddp %xmm0,%xmm0,%xmm0
  4.45 │        mov    0x48(%rsp),%rsi
  1.42 │        vhaddp %xmm0,%xmm0,%xmm0
  7.96 │        vaddss %xmm0,%xmm1,%xmm1
  4.09 │        cmp    %r15,0x60(%rsp)
  0.01 │      ↓ je     18b1

(the faster variant; I do not know why it is faster)

Reply via email to