On Thu, Mar 24, 2016 at 6:26 PM, Bin Cheng <bin.ch...@arm.com> wrote:
> Hi,
> Quite lot of time is used when IVOPT computes cost for <use, cand> pairs.  As 
> a matter of fact, some pairs are very similar to each other, and we can 
> abstract and compute cost only once for these pairs.  This is a patch doing 
> so, the idea is skipping cost computation for sub-uses in each group, of 
> course it may result in different assembly code for some complicated cases 
> because it estimates cost rather than doing real computation.  I did double 
> check one of such case that the change in generated assembly is not 
> degeneration.  For an IVOPT heavy program (spec2k/173), this patch reduces 
> IVOPT's compilation time by 7~8%, as well as the memory consumption on my 
> developing machine.
>
> Bootstrap & test on x86_64.
>
> For spec2k6 data on x86_64.  Maybe because I ran spec2k6 compiled with 
> patched GCC in unclean environment, some cases are regressed by small amount 
> (< %1).  I manually compared assembly code for several cases, including ones 
> with the largest regression (still within <1%).  I could confirm that 
> generated assembly code is exact the same as unpatched GCC, except for 
> function emit_library_call_value_1 in 403.gcc/calls.c.
>
> In this case, difference of IVOPT dumps is as below:
>
> $ diff -y trunk/calls.c.154t.ivopts patch/calls.c.154t.ivopts
>
>   <bb 44>:                                                        <bb 44>:
>   # val_21 = PHI <val_175(168), val_650(43)>                      # val_21 = 
> PHI <val_175(168), val_650(43)>
>   _811 = (void *) ivtmp.322_829;                                  _811 = 
> (void *) ivtmp.322_829;
>   MEM[base: _811, offset: -48B] = val_21;                     |   MEM[base: 
> _811, offset: -32B] = val_21;
>   _810 = (void *) ivtmp.322_829;                                  _810 = 
> (void *) ivtmp.322_829;
>   MEM[base: _810, offset: -40B] = mode_163;                   |   MEM[base: 
> _810, offset: -24B] = mode_163;
>   _182 = function_arg (&args_so_far, mode_163, 0B, 1);            _182 = 
> function_arg (&args_so_far, mode_163, 0B, 1);
>   _809 = (void *) ivtmp.322_829;                                  _809 = 
> (void *) ivtmp.322_829;
>   MEM[base: _809, offset: -32B] = _182;                       |   MEM[base: 
> _809, offset: -16B] = _182;
>   _807 = (void *) ivtmp.322_829;                                  _807 = 
> (void *) ivtmp.322_829;
>   MEM[base: _807, offset: -24B] = 0;                          |   MEM[base: 
> _807, offset: -8B] = 0;
>   _185 = (struct args_size *) ivtmp.322_829;                  |   _801 = 
> ivtmp.322_829 + 16;
>   _801 = ivtmp.322_829 + 18446744073709551600;                <
>   _800 = (struct args_size *) _801;                               _800 = 
> (struct args_size *) _801;
>   _186 = _800;                                                |   _185 = _800;
>                                                               >   _186 = 
> (struct args_size *) ivtmp.322_829;
>   _187 = _182 != 0B;                                              _187 = _182 
> != 0B;
>   _188 = (int) _187;                                              _188 = 
> (int) _187;
>   locate_and_pad_parm (mode_163, 0B, _188, 0B, &args_size, _1     
> locate_and_pad_parm (mode_163, 0B, _188, 0B, &args_size, _1
>   _802 = (void *) ivtmp.322_829;                                  _802 = 
> (void *) ivtmp.322_829;
>   _190 = MEM[base: _802, offset: 8B];                         |   _190 = 
> MEM[base: _802, offset: 24B];
>   if (_190 != 0B)                                                 if (_190 != 
> 0B)
>     goto <bb 45>;                                                   goto <bb 
> 45>;
>   else                                                            else
>     goto <bb 46>;                                                   goto <bb 
> 46>;
>
>   <bb 45>:                                                        <bb 45>:
>   fancy_abort ("calls.c", 3724, &__FUNCTION__);                   fancy_abort 
> ("calls.c", 3724, &__FUNCTION__);
>
> It's only an offset difference in IV.  And below is difference of generated 
> assembly:
> $ diff -y trunk/calls.S patch/calls.S
> .L489:                                                          .L489:
>         leaq    -80(%rbp), %rdi                                         leaq  
>   -80(%rbp), %rdi
>         xorl    %edx, %edx                                              xorl  
>   %edx, %edx
>         movl    $1, %ecx                                                movl  
>   $1, %ecx
>         movl    %r13d, %esi                                             movl  
>   %r13d, %esi
>         movq    %rax, -48(%r15)                               <
>         movl    %r13d, -40(%r15)                              <
>         call    function_arg                                  <
>         movl    $0, -24(%r15)                                 <
>         movq    %rax, -32(%r15)                                         movq  
>   %rax, -32(%r15)
>                                                               >         movl  
>   %r13d, -24(%r15)
>                                                               >         call  
>   function_arg
>         xorl    %edx, %edx                                              xorl  
>   %edx, %edx
>         pushq   %r12                                          |         movq  
>   %rax, -16(%r15)
>         testq   %rax, %rax                                              testq 
>   %rax, %rax
>         pushq   %r15                                          |         leaq  
>   16(%r15), %rax     <--I1
>         leaq    -16(%r15), %r9                                |         movl  
>   $0, -8(%r15)
>         leaq    -112(%rbp), %r8                                         leaq  
>   -112(%rbp), %r8
>                                                               >         pushq 
>   %r12
>         setne   %dl                                                     setne 
>   %dl
>         movl    %r13d, %edi                                   |         movq  
>   %r15, %r9          <--I2
>                                                               >         pushq 
>   %rax               <--I3
>         xorl    %ecx, %ecx                                              xorl  
>   %ecx, %ecx
>         xorl    %esi, %esi                                              xorl  
>   %esi, %esi
>                                                               >         movl  
>   %r13d, %edi
>         call    locate_and_pad_parm                                     call  
>   locate_and_pad_parm
>         cmpq    $0, 8(%r15)                                   |         cmpq  
>   $0, 24(%r15)
>         popq    %rax                                                    popq  
>   %rax
>         popq    %rdx                                                    popq  
>   %rdx
>         jne     .L602                                                   jne   
>   .L602
>
> There is one additional move instruction (I2) after patching, but I believe 
> it's a choice of RA.  If we switch %rax/%r9 in instructions I1/I2 as below:
>         ...
>         leaq    16(%r15), %r9
>         ...
>         movq    %r15, %rax
>         pushq   %r15
>
> Then I2 becomes redundant and can be removed.
>
> I will collect performance data on AArch64 to make sure there is no breakage 
> either.  So is it OK?

I think the patch is ok - note that I have a hard time following the
code, esp. the 'first' flag.

+      /* Add cost for sub uses in group.  */
+      do
+       {
+         /* Compute cost for the first sub_use with different offset to
+            the first one and use it afterwards, because the cost could
+            be very different if the offset is different.  */
+         if (first && use->addr_offset != sub_use->addr_offset)
+           {
+             first = false;
+             sub_cost = get_computation_cost (data, sub_use, cand, true,
+                                              NULL, &can_autoinc, NULL);
+             if (infinite_cost_p (sub_cost))
+               {
+                 cost = infinite_cost;
+                 break;
+               }
+           }
+
+         cost = add_costs (cost, sub_cost);
+         sub_use = sub_use->next;
+       }
+      while (sub_use);

we start this loop with first = true, so for each sub-use we compute
no new sub-cost until
use->addr_offset changes for the first time after which we will never
compute sub-cost
again.  So we call get_computation_cost at most twice for al sub-uses.

I suppose all sub-uses have equal ->addr_base.  Suppose sub-uses are then sorted
after ->addr_offset what keeps that list from having three different
addr_offset but
with "very different cost"?  There seems to be group_address_uses but
that suggests
the cost might be actually the same for all sub-uses.

So adding a little explaining before the loop over sub-uses would be
appreciated.

Thanks,
Richard.

> Thanks,
> bin
>
> 2016-03-23  Bin Cheng  <bin.ch...@arm.com>
>
>         * tree-ssa-loop-ivopts.c (struct comp_cost): New scrach field.
>         (no_cost, infinite_cost): Initialize the new field.
>         (get_computation_cost_at): Record setup cost.
>         (determine_use_iv_cost_address): Skip cost computation for sub
>         uses if we can estimate it without losing accuracy.
>

Reply via email to