[Bug middle-end/109849] suboptimal code for vector walking loop

hubicka at gcc dot gnu.org via Gcc-bugs Thu, 18 May 2023 02:36:06 -0700

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109849


--- Comment #8 from Jan Hubicka <hubicka at gcc dot gnu.org> ---
We can only SRA if the address is non-escaping.  Clang does not seem to need it
to optimize better:

jan@localhost:~> cat t.c
extern void q(int *);
__attribute__ ((noinline))
void
test()
{
        for (int a = 0; a < 1000;a++)
                if (!(a%100))
                        q(&a);
}
int
main()
{
        for (int a = 0; a < 1000000;a++)
                test ();
}
jan@localhost:~> cat t2.c
void q(int *a)
{
}
jan@localhost:~> gcc -O2 t.c t2.c ; perf stat ./a.out

 Performance counter stats for './a.out':

          2,916.73 msec task-clock:u                     #    0.999 CPUs utilized
                 0      context-switches:u               #    0.000 /sec
                 0      cpu-migrations:u                 #    0.000 /sec
                52      page-faults:u                    #   17.828 /sec
     8,344,719,833      cycles:u                         #    2.861 GHz
        13,561,375      stalled-cycles-frontend:u        #    0.16% frontend cycles idle
     5,128,112,757      stalled-cycles-backend:u         #   61.45% backend cycles idle
    10,050,172,242      instructions:u                   #    1.20  insn per cycle
                                                  #    0.51  stalled cycles per insn
     2,034,043,082      branches:u                       #  697.370 M/sec
        11,186,312      branch-misses:u                  #    0.55% of all branches

       2.918344737 seconds time elapsed

       2.917844000 seconds user
       0.000000000 seconds sys


jan@localhost:~> clang -O2 t.c t2.c ; perf stat ./a.out

 Performance counter stats for './a.out':

            664.40 msec task-clock:u                     #    0.999 CPUs utilized
                 0      context-switches:u               #    0.000 /sec
                 0      cpu-migrations:u                 #    0.000 /sec
                54      page-faults:u                    #   81.276 /sec
     2,318,095,848      cycles:u                         #    3.489 GHz
        10,417,694      stalled-cycles-frontend:u        #    0.45% frontend cycles idle
     1,057,731,301      stalled-cycles-backend:u         #   45.63% backend cycles idle
    10,062,172,840      instructions:u                   #    4.34  insn per cycle
                                                  #    0.11  stalled cycles per insn
     2,034,042,724      branches:u                       #    3.061 G/sec
        10,003,620      branch-misses:u                  #    0.49% of all branches

       0.665267996 seconds time elapsed

       0.665247000 seconds user
       0.000000000 seconds sys


We do:

        jmp     .L3
        .p2align 4,,10
        .p2align 3
.L2:
        movl    12(%rsp), %eax
        addl    $1, %eax
        movl    %eax, 12(%rsp)
        cmpl    $999, %eax
        jg      .L7
.L3:
        imull   $-1030792151, %eax, %eax
        addl    $85899344, %eax
        rorl    $2, %eax
        cmpl    $42949672, %eax
        ja      .L2
        leaq    12(%rsp), %rdi
        call    q
        jmp     .L2

Which has a stupid store-to-load dependency in the inner loop. Clang keeps the
store but optimizes away the load:

        jmp     .LBB0_1
        .p2align        4, 0x90
.LBB0_3:                                #   in Loop: Header=BB0_1 Depth=1
        leal    1(%rax), %ecx
        movl    %ecx, 12(%rsp)
        cmpl    $999, %eax                      # imm = 0x3E7
        movl    %ecx, %eax
        jge     .LBB0_4
.LBB0_1:                                # =>This Inner Loop Header: Depth=1
        imull   $-1030792151, %eax, %ecx        # imm = 0xC28F5C29
        addl    $85899344, %ecx                 # imm = 0x51EB850
        rorl    $2, %ecx
        cmpl    $42949672, %ecx                 # imm = 0x28F5C28
        ja      .LBB0_3
# %bb.2:                                #   in Loop: Header=BB0_1 Depth=1
        movq    %rbx, %rdi
        callq   q@PLT
        movl    12(%rsp), %eax
        jmp     .LBB0_3

I wonder what makes clang think it needs @PLT, though.
Why do we not consider the load as partially redundant with itself?

[Bug middle-end/109849] suboptimal code for vector walking loop

Reply via email to