On Wed, Mar 30, 2016 at 9:09 AM, Richard Biener <richard.guent...@gmail.com> wrote: > On Thu, Mar 24, 2016 at 6:26 PM, Bin Cheng <bin.ch...@arm.com> wrote: >> Hi, >> Quite lot of time is used when IVOPT computes cost for <use, cand> pairs. >> As a matter of fact, some pairs are very similar to each other, and we can >> abstract and compute cost only once for these pairs. This is a patch doing >> so, the idea is skipping cost computation for sub-uses in each group, of >> course it may result in different assembly code for some complicated cases >> because it estimates cost rather than doing real computation. I did double >> check one of such case that the change in generated assembly is not >> degeneration. For an IVOPT heavy program (spec2k/173), this patch reduces >> IVOPT's compilation time by 7~8%, as well as the memory consumption on my >> developing machine. >> >> Bootstrap & test on x86_64. >> >> For spec2k6 data on x86_64. Maybe because I ran spec2k6 compiled with >> patched GCC in unclean environment, some cases are regressed by small amount >> (< %1). I manually compared assembly code for several cases, including ones >> with the largest regression (still within <1%). I could confirm that >> generated assembly code is exact the same as unpatched GCC, except for >> function emit_library_call_value_1 in 403.gcc/calls.c. >> >> In this case, difference of IVOPT dumps is as below: >> >> $ diff -y trunk/calls.c.154t.ivopts patch/calls.c.154t.ivopts >> >> <bb 44>: <bb 44>: >> # val_21 = PHI <val_175(168), val_650(43)> # val_21 = >> PHI <val_175(168), val_650(43)> >> _811 = (void *) ivtmp.322_829; _811 = >> (void *) ivtmp.322_829; >> MEM[base: _811, offset: -48B] = val_21; | MEM[base: >> _811, offset: -32B] = val_21; >> _810 = (void *) ivtmp.322_829; _810 = >> (void *) ivtmp.322_829; >> MEM[base: _810, offset: -40B] = mode_163; | MEM[base: >> _810, offset: -24B] = mode_163; >> _182 = function_arg (&args_so_far, mode_163, 0B, 1); _182 = >> function_arg (&args_so_far, mode_163, 0B, 1); >> _809 = (void *) ivtmp.322_829; _809 = >> (void *) ivtmp.322_829; >> MEM[base: _809, offset: -32B] = _182; | MEM[base: >> _809, offset: -16B] = _182; >> _807 = (void *) ivtmp.322_829; _807 = >> (void *) ivtmp.322_829; >> MEM[base: _807, offset: -24B] = 0; | MEM[base: >> _807, offset: -8B] = 0; >> _185 = (struct args_size *) ivtmp.322_829; | _801 = >> ivtmp.322_829 + 16; >> _801 = ivtmp.322_829 + 18446744073709551600; < >> _800 = (struct args_size *) _801; _800 = >> (struct args_size *) _801; >> _186 = _800; | _185 = >> _800; >> > _186 = >> (struct args_size *) ivtmp.322_829; >> _187 = _182 != 0B; _187 = >> _182 != 0B; >> _188 = (int) _187; _188 = >> (int) _187; >> locate_and_pad_parm (mode_163, 0B, _188, 0B, &args_size, _1 >> locate_and_pad_parm (mode_163, 0B, _188, 0B, &args_size, _1 >> _802 = (void *) ivtmp.322_829; _802 = >> (void *) ivtmp.322_829; >> _190 = MEM[base: _802, offset: 8B]; | _190 = >> MEM[base: _802, offset: 24B]; >> if (_190 != 0B) if (_190 >> != 0B) >> goto <bb 45>; goto <bb >> 45>; >> else else >> goto <bb 46>; goto <bb >> 46>; >> >> <bb 45>: <bb 45>: >> fancy_abort ("calls.c", 3724, &__FUNCTION__); >> fancy_abort ("calls.c", 3724, &__FUNCTION__); >> >> It's only an offset difference in IV. And below is difference of generated >> assembly: >> $ diff -y trunk/calls.S patch/calls.S >> .L489: .L489: >> leaq -80(%rbp), %rdi leaq >> -80(%rbp), %rdi >> xorl %edx, %edx xorl >> %edx, %edx >> movl $1, %ecx movl >> $1, %ecx >> movl %r13d, %esi movl >> %r13d, %esi >> movq %rax, -48(%r15) < >> movl %r13d, -40(%r15) < >> call function_arg < >> movl $0, -24(%r15) < >> movq %rax, -32(%r15) movq >> %rax, -32(%r15) >> > movl >> %r13d, -24(%r15) >> > call >> function_arg >> xorl %edx, %edx xorl >> %edx, %edx >> pushq %r12 | movq >> %rax, -16(%r15) >> testq %rax, %rax >> testq %rax, %rax >> pushq %r15 | leaq >> 16(%r15), %rax <--I1 >> leaq -16(%r15), %r9 | movl >> $0, -8(%r15) >> leaq -112(%rbp), %r8 leaq >> -112(%rbp), %r8 >> > >> pushq %r12 >> setne %dl >> setne %dl >> movl %r13d, %edi | movq >> %r15, %r9 <--I2 >> > >> pushq %rax <--I3 >> xorl %ecx, %ecx xorl >> %ecx, %ecx >> xorl %esi, %esi xorl >> %esi, %esi >> > movl >> %r13d, %edi >> call locate_and_pad_parm call >> locate_and_pad_parm >> cmpq $0, 8(%r15) | cmpq >> $0, 24(%r15) >> popq %rax popq >> %rax >> popq %rdx popq >> %rdx >> jne .L602 jne >> .L602 >> >> There is one additional move instruction (I2) after patching, but I believe >> it's a choice of RA. If we switch %rax/%r9 in instructions I1/I2 as below: >> ... >> leaq 16(%r15), %r9 >> ... >> movq %r15, %rax >> pushq %r15 >> >> Then I2 becomes redundant and can be removed. >> >> I will collect performance data on AArch64 to make sure there is no breakage >> either. So is it OK? > > I think the patch is ok - note that I have a hard time following the > code, esp. the 'first' flag. > > + /* Add cost for sub uses in group. */ > + do > + { > + /* Compute cost for the first sub_use with different offset to > + the first one and use it afterwards, because the cost could > + be very different if the offset is different. */ > + if (first && use->addr_offset != sub_use->addr_offset) > + { > + first = false; > + sub_cost = get_computation_cost (data, sub_use, cand, true, > + NULL, &can_autoinc, NULL); > + if (infinite_cost_p (sub_cost)) > + { > + cost = infinite_cost; > + break; > + } > + } > + > + cost = add_costs (cost, sub_cost); > + sub_use = sub_use->next; > + } > + while (sub_use); > > we start this loop with first = true, so for each sub-use we compute > no new sub-cost until > use->addr_offset changes for the first time after which we will never > compute sub-cost > again. So we call get_computation_cost at most twice for al sub-uses. > > I suppose all sub-uses have equal ->addr_base. Suppose sub-uses are then > sorted > after ->addr_offset what keeps that list from having three different > addr_offset but > with "very different cost"? There seems to be group_address_uses but > that suggests > the cost might be actually the same for all sub-uses. > > So adding a little explaining before the loop over sub-uses would be > appreciated.
Thanks for reviewing. Here is the patch with trivially revised comments. I also collected benchmark data for spec2k6 on AArch64, there is no surprise except for case 456.hmmer. I double checked generated assembly and can confirm it's not real. Will apply the patch later. Thanks, bin > > Thanks, > Richard. > >> Thanks, >> bin >> >> 2016-03-23 Bin Cheng <bin.ch...@arm.com> >> >> * tree-ssa-loop-ivopts.c (struct comp_cost): New scrach field. >> (no_cost, infinite_cost): Initialize the new field. >> (get_computation_cost_at): Record setup cost. >> (determine_use_iv_cost_address): Skip cost computation for sub >> uses if we can estimate it without losing accuracy. >>
diff --git a/gcc/tree-ssa-loop-ivopts.c b/gcc/tree-ssa-loop-ivopts.c index 5302edf..3c81bf7 100644 --- a/gcc/tree-ssa-loop-ivopts.c +++ b/gcc/tree-ssa-loop-ivopts.c @@ -168,10 +168,11 @@ struct comp_cost the computation (in no concrete units -- complexity field should be larger for more complex expressions and addressing modes). */ + int scratch; /* Scratch used during cost computation. */ }; -static const comp_cost no_cost = {0, 0}; -static const comp_cost infinite_cost = {INFTY, INFTY}; +static const comp_cost no_cost = {0, 0, 0}; +static const comp_cost infinite_cost = {INFTY, INFTY, INFTY}; /* The candidate - cost pair. */ struct cost_pair @@ -4947,6 +4948,8 @@ get_computation_cost_at (struct ivopts_data *data, cost.cost += add_cost (data->speed, TYPE_MODE (ctype)); } + /* Record setup cost in scrach field. */ + cost.scratch = cost.cost; /* Set of invariants depended on by sub use has already been computed for the first use in the group. */ if (use->sub_id) @@ -5082,12 +5085,12 @@ determine_use_iv_cost_address (struct ivopts_data *data, struct iv_use *use, struct iv_cand *cand) { bitmap depends_on; - bool can_autoinc; + bool can_autoinc, first; int inv_expr_id = -1; struct iv_use *sub_use; - comp_cost sub_cost; comp_cost cost = get_computation_cost (data, use, cand, true, &depends_on, &can_autoinc, &inv_expr_id); + comp_cost sub_cost = cost; if (cand->ainc_use == use) { @@ -5099,13 +5102,47 @@ determine_use_iv_cost_address (struct ivopts_data *data, else if (cand->pos == IP_AFTER_USE || cand->pos == IP_BEFORE_USE) cost = infinite_cost; } - for (sub_use = use->next; - sub_use && !infinite_cost_p (cost); - sub_use = sub_use->next) + + if (!infinite_cost_p (cost) && use->next) { - sub_cost = get_computation_cost (data, sub_use, cand, true, NULL, - &can_autoinc, NULL); - cost = add_costs (cost, sub_cost); + first = true; + sub_use = use->next; + /* We don't want to add setup cost for sub-uses. */ + sub_cost.cost -= sub_cost.scratch; + /* Add cost for sub uses in group. */ + do + { + /* Compute cost for the first sub use with different offset to + the main use and add it afterwards. Costs for these uses + could be quite different. Given below uses in a group: + use 0 : {base + A + offset_0, step} + use 0.1: {base + A + offset_0, step} + use 0.2: {base + A + offset_1, step} + use 0.3: {base + A + offset_2, step} + when we need to compute costs with candidate: + cand 1 : {base + B + offset_0, step} + + The first sub use with different offset is use 0.2, its cost + is larger than cost of use 0/0.1 because we need to compute: + A - B + offset_1 - offset_0 + rather than: + A - B. */ + if (first && use->addr_offset != sub_use->addr_offset) + { + first = false; + sub_cost = get_computation_cost (data, sub_use, cand, true, + NULL, &can_autoinc, NULL); + if (infinite_cost_p (sub_cost)) + { + cost = infinite_cost; + break; + } + } + + cost = add_costs (cost, sub_cost); + sub_use = sub_use->next; + } + while (sub_use); } set_use_iv_cost (data, use, cand, cost, depends_on, NULL_TREE, ERROR_MARK,