https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114987

--- Comment #3 from Colin Ian King <colin.king at intel dot com> ---
perf report for the compute loop of stress_vecfp_float_add_16.avx (gcc-13 build):

 57.93 │200:   vaddps       0xc0(%rsp),%ymm3,%ymm5                        
 11.11 │       vaddps       0xe0(%rsp),%ymm2,%ymm6                        
  0.02 │       vmovaps      %ymm5,0x60(%rsp)                              
  2.92 │       mov          0x60(%rsp),%rax                               
       │       mov          0x68(%rsp),%rdx                               
  0.37 │       vmovaps      %ymm6,0x40(%rsp)                              
       │       vmovaps      %ymm5,0x80(%rsp)                              
  6.30 │       vmovq        %rax,%xmm1                                    
  4.11 │       mov          0x40(%rsp),%rax                               
       │       vmovdqa      0x90(%rsp),%xmm5                              
       │       vmovaps      %ymm6,0xa0(%rsp)                              
  3.27 │       vpinsrq      $0x1,%rdx,%xmm1,%xmm1                         
       │       mov          0x48(%rsp),%rdx                               
       │       vmovdqa      0xb0(%rsp),%xmm6                              
  3.22 │       vmovdqa      %xmm1,0xc0(%rsp)                              
  0.42 │       vmovq        %rax,%xmm0                                    
       │       vmovdqa      %xmm5,0xd0(%rsp)                              
  6.80 │       vpinsrq      $0x1,%rdx,%xmm0,%xmm0                         
  3.52 │       vmovdqa      %xmm0,0xe0(%rsp)                              
       │       vmovdqa      %xmm6,0xf0(%rsp)                              
       │       sub          $0x1,%ecx                                     
       │     ↑ jne          200    

perf report for the compute loop of stress_vecfp_float_add_16.avx (gcc-14 build):

 65.79 │200:   vaddps       0xc0(%rsp),%ymm3,%ymm5                        
  3.26 │       vaddps       0xe0(%rsp),%ymm2,%ymm6                        
  0.00 │       vmovaps      %ymm5,0x60(%rsp)                              
  9.25 │       mov          0x60(%rsp),%rax                               
  0.00 │       mov          0x68(%rsp),%rdx                               
       │       vmovaps      %ymm6,0x40(%rsp)                              
       │       vmovaps      %ymm5,0x80(%rsp)                              
  6.49 │       vmovq        %rax,%xmm1                                    
  0.00 │       mov          0x40(%rsp),%rax                               
  0.00 │       vmovaps      %ymm6,0xa0(%rsp)                              
  3.02 │       vpinsrq      $0x1,%rdx,%xmm1,%xmm1                         
       │       mov          0x48(%rsp),%rdx                               
  0.35 │       vmovdqa      %xmm1,0xc0(%rsp)                              
  0.68 │       vmovq        %rax,%xmm0                                    
  0.00 │       vmovdqa      0x90(%rsp),%xmm1                              
  5.18 │       vpinsrq      $0x1,%rdx,%xmm0,%xmm0                         
  3.00 │       vmovdqa      %xmm0,0xe0(%rsp)                              
       │       vmovdqa      0xb0(%rsp),%xmm0                              
       │       vmovdqa      %xmm1,0xd0(%rsp)                              
       │       vmovdqa      %xmm0,0xf0(%rsp)                              
       │       sub          $0x1,%ecx                                     
  2.94 │     ↑ jne          200

Reply via email to