https://gcc.gnu.org/bugzilla/show_bug.cgi?id=116582

            Bug ID: 116582
           Summary: gather is a win in some cases on zen CPUs
           Product: gcc
           Version: 15.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: middle-end
          Assignee: unassigned at gcc dot gnu.org
          Reporter: hubicka at gcc dot gnu.org
  Target Milestone: ---

While the sparse multiply in parest and tsvc does not seem to work well with
gather, the following benchmark benefits from it:

/* Benchmark arrays: a is the accumulator, b is the table read through
   indices[].  The element type T comes from the command line (-DT=float in
   the runs quoted below).
   NOTE(review): M and indices[] are used by this testcase but their
   definitions are not shown in this snippet; the non-gather assembly loads
   indices with movslq at a 4-byte stride, so presumably `int indices[M];`
   -- confirm against the original attachment.  */
T a[M], b[M];
/* Kernel under measurement: for each of the first 1024*16 elements, add the
   element of b selected by indices[i] into a[i].  The indexed load
   b[indices[i]] is the access that is emitted either as scalar loads plus
   vinsertps or as a single vgatherdps in the listings below.
   noipa keeps the function out of interprocedural optimisation (no inlining
   into main), so the loop is compiled on its own.  */
__attribute__ ((noipa))
void
test ()
{
  /* Trip count is the constant 1024*16, independent of M.  */
  for (int i = 0; i < 1024* 16; i++)
    a[i] += b[indices[i]];
}
/* Benchmark driver: fill indices[] and then call test () 10000 times so the
   kernel dominates the run time reported by perf stat.  */
int
main()
{
  /* indices[i] = (i * 8) % M: consecutive entries are 8 elements apart in b,
     wrapping around at M.  */
  for (int i = 0 ; i < M; i++)
    indices[i] = (i * 8) % M;
  /* a and b are never explicitly initialised here; as globals they start
     zeroed, so only the memory access pattern matters.  */
  for (int i = 0 ; i < 10000; i++)
    test ();
  return 0;
}

jan@localhost:/tmp> g++ -DT=float  -march=native gather.c  -Ofast -mtune-ctrl=^use_gather_4parts -mtune-ctrl=^use_gather_8parts -mtune-ctrl=^use_gather -mtune=native -fdump-tree-all-details ; objdump -d a.out | grep gather ; perf stat ./a.out

 Performance counter stats for './a.out':

          3,499.60 msec task-clock:u                     #    1.000 CPUs utilized
                 0      context-switches:u               #    0.000 /sec
                 0      cpu-migrations:u                 #    0.000 /sec
               221      page-faults:u                    #   63.150 /sec
    14,526,193,995      cycles:u                         #    4.151 GHz
       467,072,127      stalled-cycles-frontend:u        #    3.22% frontend cycles idle
       577,324,069      instructions:u                   #    0.04  insn per cycle
                                                  #    0.81  stalled cycles per insn
        41,578,204      branches:u                       #   11.881 M/sec
            50,517      branch-misses:u                  #    0.12% of all branches

       3.500660600 seconds time elapsed

       3.497150000 seconds user
       0.003333000 seconds sys


jan@localhost:/tmp> g++ -DT=float  -march=native gather.c  -Ofast -mtune-ctrl=use_gather_4parts -mtune-ctrl=use_gather_8parts -mtune-ctrl=use_gather -mtune=native -fdump-tree-all-details ; objdump -d a.out | grep gather ; perf stat ./a.out
  401250:       c4 e2 65 92 04 8d 40    vgatherdps %ymm3,0x404040(,%ymm1,4),%ymm0

 Performance counter stats for './a.out':

          1,263.87 msec task-clock:u                     #    0.922 CPUs utilized
                 0      context-switches:u               #    0.000 /sec
                 0      cpu-migrations:u                 #    0.000 /sec
               222      page-faults:u                    #  175.651 /sec
     5,172,067,789      cycles:u                         #    4.092 GHz
        93,135,962      stalled-cycles-frontend:u        #    1.80% frontend cycles idle
       167,783,419      instructions:u                   #    0.03  insn per cycle
                                                  #    0.56  stalled cycles per insn
        21,097,560      branches:u                       #   16.693 M/sec
            24,253      branch-misses:u                  #    0.11% of all branches

       1.370533592 seconds time elapsed

       1.265143000 seconds user
       0.000000000 seconds sys


Non-gather loop is:
.L2:
        movslq  indices(%rax), %rcx
        movslq  indices+8(%rax), %rdi
        addq    $16, %rax
        movslq  indices-12(%rax), %rdx
        movslq  indices-4(%rax), %rsi
        vmovss  b(,%rdi,4), %xmm1
        vmovss  b(,%rcx,4), %xmm0
        vinsertps       $0x10, b(,%rsi,4), %xmm1, %xmm1
        vinsertps       $0x10, b(,%rdx,4), %xmm0, %xmm0
        vmovlhps        %xmm1, %xmm0, %xmm0
        vaddps  a-16(%rax), %xmm0, %xmm0
        vmovaps %xmm0, a-16(%rax)
        cmpq    $65536, %rax

while the gather loop is:

.L2:
        vmovdqa indices(%rax), %ymm1
        vmovaps %ymm2, %ymm3
        addq    $32, %rax
        vgatherdps      %ymm3, b(,%ymm1,4), %ymm0
        vaddps  a-32(%rax), %ymm0, %ymm0
        vmovaps %ymm0, a-32(%rax)
        cmpq    $65536, %rax
        jne     .L2

Reply via email to