On Thu, Mar 24, 2016 at 6:26 PM, Bin Cheng <bin.ch...@arm.com> wrote: > Hi, > Quite lot of time is used when IVOPT computes cost for <use, cand> pairs. As > a matter of fact, some pairs are very similar to each other, and we can > abstract and compute cost only once for these pairs. This is a patch doing > so, the idea is skipping cost computation for sub-uses in each group, of > course it may result in different assembly code for some complicated cases > because it estimates cost rather than doing real computation. I did double > check one of such case that the change in generated assembly is not > degeneration. For an IVOPT heavy program (spec2k/173), this patch reduces > IVOPT's compilation time by 7~8%, as well as the memory consumption on my > developing machine. > > Bootstrap & test on x86_64. > > For spec2k6 data on x86_64. Maybe because I ran spec2k6 compiled with > patched GCC in unclean environment, some cases are regressed by small amount > (< %1). I manually compared assembly code for several cases, including ones > with the largest regression (still within <1%). I could confirm that > generated assembly code is exact the same as unpatched GCC, except for > function emit_library_call_value_1 in 403.gcc/calls.c. > > In this case, difference of IVOPT dumps is as below: > > $ diff -y trunk/calls.c.154t.ivopts patch/calls.c.154t.ivopts > > <bb 44>: <bb 44>: > # val_21 = PHI <val_175(168), val_650(43)> # val_21 = > PHI <val_175(168), val_650(43)> > _811 = (void *) ivtmp.322_829; _811 = > (void *) ivtmp.322_829; > MEM[base: _811, offset: -48B] = val_21; | MEM[base: > _811, offset: -32B] = val_21; > _810 = (void *) ivtmp.322_829; _810 = > (void *) ivtmp.322_829; > MEM[base: _810, offset: -40B] = mode_163; | MEM[base: > _810, offset: -24B] = mode_163; > _182 = function_arg (&args_so_far, mode_163, 0B, 1); _182 = > function_arg (&args_so_far, mode_163, 0B, 1); > _809 = (void *) ivtmp.322_829; _809 = > (void *) ivtmp.322_829; > MEM[base: _809, offset: -32B] = _182; | MEM[base: > _809, offset: -16B] = _182; > _807 = (void *) ivtmp.322_829; _807 = > (void *) ivtmp.322_829; > MEM[base: _807, offset: -24B] = 0; | MEM[base: > _807, offset: -8B] = 0; > _185 = (struct args_size *) ivtmp.322_829; | _801 = > ivtmp.322_829 + 16; > _801 = ivtmp.322_829 + 18446744073709551600; < > _800 = (struct args_size *) _801; _800 = > (struct args_size *) _801; > _186 = _800; | _185 = _800; > > _186 = > (struct args_size *) ivtmp.322_829; > _187 = _182 != 0B; _187 = _182 > != 0B; > _188 = (int) _187; _188 = > (int) _187; > locate_and_pad_parm (mode_163, 0B, _188, 0B, &args_size, _1 > locate_and_pad_parm (mode_163, 0B, _188, 0B, &args_size, _1 > _802 = (void *) ivtmp.322_829; _802 = > (void *) ivtmp.322_829; > _190 = MEM[base: _802, offset: 8B]; | _190 = > MEM[base: _802, offset: 24B]; > if (_190 != 0B) if (_190 != > 0B) > goto <bb 45>; goto <bb > 45>; > else else > goto <bb 46>; goto <bb > 46>; > > <bb 45>: <bb 45>: > fancy_abort ("calls.c", 3724, &__FUNCTION__); fancy_abort > ("calls.c", 3724, &__FUNCTION__); > > It's only an offset difference in IV. And below is difference of generated > assembly: > $ diff -y trunk/calls.S patch/calls.S > .L489: .L489: > leaq -80(%rbp), %rdi leaq > -80(%rbp), %rdi > xorl %edx, %edx xorl > %edx, %edx > movl $1, %ecx movl > $1, %ecx > movl %r13d, %esi movl > %r13d, %esi > movq %rax, -48(%r15) < > movl %r13d, -40(%r15) < > call function_arg < > movl $0, -24(%r15) < > movq %rax, -32(%r15) movq > %rax, -32(%r15) > > movl > %r13d, -24(%r15) > > call > function_arg > xorl %edx, %edx xorl > %edx, %edx > pushq %r12 | movq > %rax, -16(%r15) > testq %rax, %rax testq > %rax, %rax > pushq %r15 | leaq > 16(%r15), %rax <--I1 > leaq -16(%r15), %r9 | movl > $0, -8(%r15) > leaq -112(%rbp), %r8 leaq > -112(%rbp), %r8 > > pushq > %r12 > setne %dl setne > %dl > movl %r13d, %edi | movq > %r15, %r9 <--I2 > > pushq > %rax <--I3 > xorl %ecx, %ecx xorl > %ecx, %ecx > xorl %esi, %esi xorl > %esi, %esi > > movl > %r13d, %edi > call locate_and_pad_parm call > locate_and_pad_parm > cmpq $0, 8(%r15) | cmpq > $0, 24(%r15) > popq %rax popq > %rax > popq %rdx popq > %rdx > jne .L602 jne > .L602 > > There is one additional move instruction (I2) after patching, but I believe > it's a choice of RA. If we switch %rax/%r9 in instructions I1/I2 as below: > ... > leaq 16(%r15), %r9 > ... > movq %r15, %rax > pushq %r15 > > Then I2 becomes redundant and can be removed. > > I will collect performance data on AArch64 to make sure there is no breakage > either. So is it OK?
I think the patch is ok - note that I have a hard time following the code, esp. the 'first' flag. + /* Add cost for sub uses in group. */ + do + { + /* Compute cost for the first sub_use with different offset to + the first one and use it afterwards, because the cost could + be very different if the offset is different. */ + if (first && use->addr_offset != sub_use->addr_offset) + { + first = false; + sub_cost = get_computation_cost (data, sub_use, cand, true, + NULL, &can_autoinc, NULL); + if (infinite_cost_p (sub_cost)) + { + cost = infinite_cost; + break; + } + } + + cost = add_costs (cost, sub_cost); + sub_use = sub_use->next; + } + while (sub_use); we start this loop with first = true, so for each sub-use we compute no new sub-cost until use->addr_offset changes for the first time after which we will never compute sub-cost again. So we call get_computation_cost at most twice for al sub-uses. I suppose all sub-uses have equal ->addr_base. Suppose sub-uses are then sorted after ->addr_offset what keeps that list from having three different addr_offset but with "very different cost"? There seems to be group_address_uses but that suggests the cost might be actually the same for all sub-uses. So adding a little explaining before the loop over sub-uses would be appreciated. Thanks, Richard. > Thanks, > bin > > 2016-03-23 Bin Cheng <bin.ch...@arm.com> > > * tree-ssa-loop-ivopts.c (struct comp_cost): New scrach field. > (no_cost, infinite_cost): Initialize the new field. > (get_computation_cost_at): Record setup cost. > (determine_use_iv_cost_address): Skip cost computation for sub > uses if we can estimate it without losing accuracy. >