Hi,

Quite a lot of time is spent when IVOPT computes the cost of <use, cand> pairs. As a matter of fact, some pairs are very similar to each other, so we can compute the cost only once for these pairs and reuse it. This patch does so; the idea is to skip cost computation for the sub-uses in each group. Of course, this may result in different assembly code for some complicated cases, because it estimates the cost rather than doing the real computation. I double-checked one such case and confirmed that the change in generated assembly is not a degeneration. For an IVOPT-heavy program (spec2k/173), this patch reduces IVOPT's compilation time by 7~8%, as well as the memory consumption, on my development machine.
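To make the estimation scheme concrete, here is a minimal sketch (illustrative only; the names est_cost, group_cost, head and diff_cost are made up for this example and do not exist in GCC):

  /* Toy model of the group-cost estimation: the group head's cost is
     computed fully; sub-uses reuse it minus the shared setup cost, and
     the cost is re-measured only once, for the first sub-use whose
     offset differs from the head's.  */
  struct est_cost { int cost; int setup; };

  static int
  group_cost (struct est_cost head, int *offsets, int n, int diff_cost)
  {
    int total = head.cost;
    /* Sub-uses share the candidate's setup computation, so charge
       them only the per-use part of the head's cost.  */
    int sub = head.cost - head.setup;
    for (int i = 1; i < n; i++)
      {
        /* From the first sub-use with a different offset onwards, use
           the cost measured for that sub-use (diff_cost) instead.  */
        if (offsets[i] != offsets[0])
          sub = diff_cost;
        total += sub;
      }
    return total;
  }

The patch below implements this on top of the existing comp_cost/iv_use structures, recording the setup cost in a new scratch field.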
Bootstrap & test on x86_64. As for spec2k6 data on x86_64: maybe because I ran spec2k6 compiled with the patched GCC in an unclean environment, some cases regressed by a small amount (< 1%). I manually compared assembly code for several cases, including the ones with the largest regression (still within 1%). I could confirm that the generated assembly code is exactly the same as with unpatched GCC, except for function emit_library_call_value_1 in 403.gcc/calls.c. In this case, the difference between the IVOPT dumps is as below:

$ diff -y trunk/calls.c.154t.ivopts patch/calls.c.154t.ivopts
  <bb 44>:                                                      <bb 44>:
  # val_21 = PHI <val_175(168), val_650(43)>                    # val_21 = PHI <val_175(168), val_650(43)>
  _811 = (void *) ivtmp.322_829;                                _811 = (void *) ivtmp.322_829;
  MEM[base: _811, offset: -48B] = val_21;                     | MEM[base: _811, offset: -32B] = val_21;
  _810 = (void *) ivtmp.322_829;                                _810 = (void *) ivtmp.322_829;
  MEM[base: _810, offset: -40B] = mode_163;                   | MEM[base: _810, offset: -24B] = mode_163;
  _182 = function_arg (&args_so_far, mode_163, 0B, 1);          _182 = function_arg (&args_so_far, mode_163, 0B, 1);
  _809 = (void *) ivtmp.322_829;                                _809 = (void *) ivtmp.322_829;
  MEM[base: _809, offset: -32B] = _182;                       | MEM[base: _809, offset: -16B] = _182;
  _807 = (void *) ivtmp.322_829;                                _807 = (void *) ivtmp.322_829;
  MEM[base: _807, offset: -24B] = 0;                          | MEM[base: _807, offset: -8B] = 0;
  _185 = (struct args_size *) ivtmp.322_829;                  | _801 = ivtmp.322_829 + 16;
  _801 = ivtmp.322_829 + 18446744073709551600;                <
  _800 = (struct args_size *) _801;                             _800 = (struct args_size *) _801;
  _186 = _800;                                                | _185 = _800;
                                                              > _186 = (struct args_size *) ivtmp.322_829;
  _187 = _182 != 0B;                                            _187 = _182 != 0B;
  _188 = (int) _187;                                            _188 = (int) _187;
  locate_and_pad_parm (mode_163, 0B, _188, 0B, &args_size, _1   locate_and_pad_parm (mode_163, 0B, _188, 0B, &args_size, _1
  _802 = (void *) ivtmp.322_829;                                _802 = (void *) ivtmp.322_829;
  _190 = MEM[base: _802, offset: 8B];                         | _190 = MEM[base: _802, offset: 24B];
  if (_190 != 0B)                                               if (_190 != 0B)
    goto <bb 45>;                                                 goto <bb 45>;
  else                                                          else
    goto <bb 46>;                                                 goto <bb 46>;

  <bb 45>:                                                      <bb 45>:
  fancy_abort ("calls.c", 3724, &__FUNCTION__);                 fancy_abort ("calls.c", 3724, &__FUNCTION__);

The only change is an offset difference in the IV. And below is the difference in generated assembly:

$ diff -y trunk/calls.S patch/calls.S
.L489:                                    .L489:
    leaq    -80(%rbp), %rdi                   leaq    -80(%rbp), %rdi
    xorl    %edx, %edx                        xorl    %edx, %edx
    movl    $1, %ecx                          movl    $1, %ecx
    movl    %r13d, %esi                       movl    %r13d, %esi
    movq    %rax, -48(%r15)               <
    movl    %r13d, -40(%r15)              <
    call    function_arg                  <
    movl    $0, -24(%r15)                 <
    movq    %rax, -32(%r15)                   movq    %rax, -32(%r15)
                                          >   movl    %r13d, -24(%r15)
                                          >   call    function_arg
    xorl    %edx, %edx                        xorl    %edx, %edx
    pushq   %r12                          |   movq    %rax, -16(%r15)
    testq   %rax, %rax                        testq   %rax, %rax
    pushq   %r15                          |   leaq    16(%r15), %rax      <--I1
    leaq    -16(%r15), %r9                |   movl    $0, -8(%r15)
    leaq    -112(%rbp), %r8                   leaq    -112(%rbp), %r8
                                          >   pushq   %r12
    setne   %dl                               setne   %dl
    movl    %r13d, %edi                   |   movq    %r15, %r9           <--I2
                                          >   pushq   %rax                <--I3
    xorl    %ecx, %ecx                        xorl    %ecx, %ecx
    xorl    %esi, %esi                        xorl    %esi, %esi
                                          >   movl    %r13d, %edi
    call    locate_and_pad_parm               call    locate_and_pad_parm
    cmpq    $0, 8(%r15)                   |   cmpq    $0, 24(%r15)
    popq    %rax                              popq    %rax
    popq    %rdx                              popq    %rdx
    jne     .L602                             jne     .L602

There is one additional move instruction (I2) after patching, but I believe that is a choice made by the register allocator. If we switch %rax/%r9 in instructions I1/I2 as below:

    ...
    leaq    16(%r15), %r9
    ...
    movq    %r15, %rax
    pushq   %r15

then I2 becomes redundant and can be removed. I will also collect performance data on AArch64 to make sure there is no breakage there either.

So is it OK?
Thanks,
bin

2016-03-23  Bin Cheng  <bin.ch...@arm.com>

	* tree-ssa-loop-ivopts.c (struct comp_cost): New scratch field.
	(no_cost, infinite_cost): Initialize the new field.
	(get_computation_cost_at): Record setup cost.
	(determine_use_iv_cost_address): Skip cost computation for sub
	uses if we can estimate it without losing accuracy.
diff --git a/gcc/tree-ssa-loop-ivopts.c b/gcc/tree-ssa-loop-ivopts.c
index 5302edf..fc7e3de 100644
--- a/gcc/tree-ssa-loop-ivopts.c
+++ b/gcc/tree-ssa-loop-ivopts.c
@@ -168,10 +168,11 @@ struct comp_cost
 			   the computation (in no concrete units --
 			   complexity field should be larger for more
 			   complex expressions and addressing modes).  */
+  int scratch;		/* Scratch used during cost computation.  */
 };
 
-static const comp_cost no_cost = {0, 0};
-static const comp_cost infinite_cost = {INFTY, INFTY};
+static const comp_cost no_cost = {0, 0, 0};
+static const comp_cost infinite_cost = {INFTY, INFTY, INFTY};
 
 /* The candidate - cost pair.  */
 struct cost_pair
@@ -4947,6 +4948,8 @@ get_computation_cost_at (struct ivopts_data *data,
       cost.cost += add_cost (data->speed, TYPE_MODE (ctype));
     }
 
+  /* Record setup cost in scratch field.  */
+  cost.scratch = cost.cost;
   /* Set of invariants depended on by sub use has already been computed
      for the first use in the group.  */
   if (use->sub_id)
@@ -5082,12 +5085,12 @@ determine_use_iv_cost_address (struct ivopts_data *data,
 			       struct iv_use *use, struct iv_cand *cand)
 {
   bitmap depends_on;
-  bool can_autoinc;
+  bool can_autoinc, first;
   int inv_expr_id = -1;
   struct iv_use *sub_use;
-  comp_cost sub_cost;
   comp_cost cost = get_computation_cost (data, use, cand, true, &depends_on,
 					 &can_autoinc, &inv_expr_id);
+  comp_cost sub_cost = cost;
 
   if (cand->ainc_use == use)
     {
@@ -5099,13 +5102,35 @@ determine_use_iv_cost_address (struct ivopts_data *data,
       else if (cand->pos == IP_AFTER_USE || cand->pos == IP_BEFORE_USE)
 	cost = infinite_cost;
     }
-  for (sub_use = use->next;
-       sub_use && !infinite_cost_p (cost);
-       sub_use = sub_use->next)
+
+  if (!infinite_cost_p (cost) && use->next)
     {
-      sub_cost = get_computation_cost (data, sub_use, cand, true, NULL,
-				       &can_autoinc, NULL);
-      cost = add_costs (cost, sub_cost);
+      first = true;
+      sub_use = use->next;
+      /* We don't want to add setup cost for sub-uses.  */
+      sub_cost.cost -= sub_cost.scratch;
+      /* Add cost for sub uses in group.  */
+      do
+	{
+	  /* Compute cost for the first sub_use with different offset to
+	     the first one and use it afterwards, because the cost could
+	     be very different if the offset is different.  */
+	  if (first && use->addr_offset != sub_use->addr_offset)
+	    {
+	      first = false;
+	      sub_cost = get_computation_cost (data, sub_use, cand, true,
+					       NULL, &can_autoinc, NULL);
+	      if (infinite_cost_p (sub_cost))
+		{
+		  cost = infinite_cost;
+		  break;
+		}
+	    }
+
+	  cost = add_costs (cost, sub_cost);
+	  sub_use = sub_use->next;
+	}
+      while (sub_use);
     }
 
   set_use_iv_cost (data, use, cand, cost, depends_on, NULL_TREE, ERROR_MARK,
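As a side note on the new scratch field: since get_computation_cost_at records it before the address/offset costs are added, the field holds the candidate setup portion of the total, so one would expect an invariant like the following hypothetical assertion (not part of the patch) to hold for finite costs:

  /* Hypothetical check, not in the patch: the recorded setup part
     should never exceed the final total cost.  */
  gcc_assert (cost.scratch <= cost.cost);

This is what makes "sub_cost.cost -= sub_cost.scratch" in determine_use_iv_cost_address a safe way to strip the shared setup cost from each sub-use's estimate.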