Hi Bin, > I see, it's similar to the auto-increment case where cost should be > recorded only once. So this is okay given 1) fine predicting > rtl-unroll is likely impossible here; 2) the patch has very limited > impact. > Really appreciate your time and patience!
I extended the previous version to address Richard S.'s comments on candidates with the same base/step but different offsets here: https://gcc.gnu.org/pipermail/gcc-patches/2020-June/547014.html. The previous version only allows the candidate derived from the group of interest, this updated patch extends it to those ones which have the same bases/steps and same/different offsets but in the acceptable range by considering unrolling. For one particular case like: for (i = 0; i < SIZE; i++) y[i] = a * x[i] + z[i]; we will mark reg_offset_p for IV candidates on x as below: - (unsigned long) (x_18(D) + 8) // only mark this before. - x_18(D) + 8 - (unsigned long) (x_18(D) + 24) - (unsigned long) ((vector(2) double *) (x_18(D) + 8) + 18446744073709551600) ... Do you mind to have a review again? Thanks in advance! Bootstrapped/regtested on powerpc64le-linux-gnu P8 and P9. SPEC2017 P9 performance run has no remarkable degradations/improvements. BR, Kewen ----- gcc/ChangeLog: * tree-ssa-loop-ivopts.c (struct iv_cand): New field reg_offset_p. (struct ivopts_data): New field consider_reg_offset_for_unroll_p. (mark_reg_offset_candidates): New function. (add_candidate_1): Set reg_offset_p to false for new candidate. (set_group_iv_cost): Scale up group cost with estimate_unroll_factor if consider_reg_offset_for_unroll_p. (determine_iv_cost): Increase step cost with estimate_unroll_factor if consider_reg_offset_for_unroll_p. (tree_ssa_iv_optimize_loop): Call estimate_unroll_factor, update consider_reg_offset_for_unroll_p.
diff --git a/gcc/tree-ssa-loop-ivopts.c b/gcc/tree-ssa-loop-ivopts.c index 1d2697ae1ba..5a19b53c8d5 100644 --- a/gcc/tree-ssa-loop-ivopts.c +++ b/gcc/tree-ssa-loop-ivopts.c @@ -473,6 +473,9 @@ struct iv_cand struct iv *orig_iv; /* The original iv if this cand is added from biv with smaller type. */ bool doloop_p; /* Whether this is a doloop candidate. */ + bool reg_offset_p; /* Whether this is available for an address type group + where its all uses are valid to adopt reg_offset + addressing mode even considering unrolling. */ }; /* Hashtable entry for common candidate derived from iv uses. */ @@ -653,6 +656,10 @@ struct ivopts_data /* Whether the loop has doloop comparison use. */ bool doloop_use_p; + + /* Whether need to consider register offset addressing mode for the loop with + upcoming unrolling by estimated unroll factor. */ + bool consider_reg_offset_for_unroll_p; }; /* An assignment of iv candidates to uses. */ @@ -2731,6 +2738,112 @@ split_address_groups (struct ivopts_data *data) } } +/* For each address type group, it finds the address-based IV candidates with + the same base and step, for those that are available to be used for the + whole group with reg_offset addressing mode by considering the address offset + difference and increased offset with unrolling factor estimation, mark them + as reg_offset_p. */ + +static void +mark_reg_offset_candidates (struct ivopts_data *data) +{ + class loop *loop = data->current_loop; + gcc_assert (data->current_loop->estimated_unroll > 1); + bool any_reg_offset_p = false; + + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "<Reg_offset_p Candidates>:\n"); + + auto valid_reg_offset_p + = [] (struct iv_use *use, poly_uint64 off, poly_uint64 max_inc) { + if (!addr_offset_valid_p (use, off)) + return false; + if (!addr_offset_valid_p (use, off + max_inc)) + return false; + return true; + }; + + for (unsigned i = 0; i < data->vgroups.length (); i++) + { + struct iv_group *group = data->vgroups[i]; + + if (address_p (group->type)) + { + struct iv_use *head_use = group->vuses[0]; + if (!tree_fits_poly_int64_p (head_use->iv->step)) + continue; + + poly_int64 step = tree_to_poly_int64 (head_use->iv->step); + /* Max extra offset to be added due to unrolling. */ + poly_int64 max_increase = (loop->estimated_unroll - 1) * step; + + tree use_base = head_use->addr_base; + STRIP_NOPS (use_base); + + struct iv_use *last_use = NULL; + unsigned group_size = group->vuses.length (); + gcc_assert (group_size >= 1); + if (maybe_ne (head_use->addr_offset, + group->vuses[group_size - 1]->addr_offset)) + last_use = group->vuses[group_size - 1]; + + unsigned j; + bitmap_iterator bi; + EXECUTE_IF_SET_IN_BITMAP (group->related_cands, 0, j, bi) + { + struct iv_cand *cand = data->vcands[j]; + + if (!cand->iv->base_object) + continue; + + if (cand->reg_offset_p) + continue; + + if (!operand_equal_p (head_use->iv->base_object, + cand->iv->base_object, 0)) + continue; + + if (!operand_equal_p (head_use->iv->step, cand->iv->step, 0)) + continue; + + poly_uint64 cand_offset = 0; + tree cand_base = strip_offset (cand->iv->base, &cand_offset); + STRIP_NOPS (cand_base); + if (!operand_equal_p (use_base, cand_base, 0)) + continue; + + /* Only need to check the first one and the last one in the group + since it's sorted. If both are valid, the other intermediate + ones should be in the acceptable range. */ + poly_uint64 head_off = head_use->addr_offset - cand_offset; + if (!valid_reg_offset_p (head_use, head_off, max_increase)) + continue; + + if (last_use) + { + poly_int64 last_off = last_use->addr_offset - cand_offset; + if (!valid_reg_offset_p (head_use, last_off, max_increase)) + continue; + } + + cand->reg_offset_p = true; + + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, " cand %u valid for group %u\n", j, i); + + if (!any_reg_offset_p) + any_reg_offset_p = true; + } + } + } + + if (!any_reg_offset_p) + data->consider_reg_offset_for_unroll_p = false; + + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "\n"); +} + /* Finds uses of the induction variables that are interesting. */ static void @@ -3147,6 +3260,7 @@ add_candidate_1 (struct ivopts_data *data, tree base, tree step, bool important, cand->important = important; cand->incremented_at = incremented_at; cand->doloop_p = doloop; + cand->reg_offset_p = false; data->vcands.safe_push (cand); if (!poly_int_tree_p (step)) @@ -3654,6 +3768,14 @@ set_group_iv_cost (struct ivopts_data *data, return; } + /* Since we priced more on non reg_offset IV cand step cost, we should scale + up the appropriate IV group costs. Simply consider USE_COMPARE at the + loop exit, FIXME if multiple exits supported or no loop exit comparisons + matter. */ + if (data->consider_reg_offset_for_unroll_p + && group->vuses[0]->type != USE_COMPARE) + cost *= (HOST_WIDE_INT) data->current_loop->estimated_unroll; + if (data->consider_all_candidates) { group->cost_map[cand->id].cand = cand; @@ -5718,6 +5840,9 @@ find_iv_candidates (struct ivopts_data *data) if (!data->consider_all_candidates) relate_compare_use_with_all_cands (data); + if (data->consider_reg_offset_for_unroll_p) + mark_reg_offset_candidates (data); + if (dump_file && (dump_flags & TDF_DETAILS)) { unsigned i; @@ -5890,6 +6015,10 @@ determine_iv_cost (struct ivopts_data *data, struct iv_cand *cand) cost_step = add_cost (data->speed, TYPE_MODE (TREE_TYPE (base))); cost = cost_step + adjust_setup_cost (data, cost_base.cost); + /* Consider additional step updates during unrolling. */ + if (data->consider_reg_offset_for_unroll_p && !cand->reg_offset_p) + cost += (data->current_loop->estimated_unroll - 1) * cost_step; + /* Prefer the original ivs unless we may gain something by replacing it. The reason is to make debugging simpler; so this is not relevant for artificial ivs created by other optimization passes. */ @@ -7976,6 +8105,7 @@ tree_ssa_iv_optimize_loop (struct ivopts_data *data, class loop *loop, data->current_loop = loop; data->loop_loc = find_loop_location (loop).get_location_t (); data->speed = optimize_loop_for_speed_p (loop); + data->consider_reg_offset_for_unroll_p = false; if (dump_file && (dump_flags & TDF_DETAILS)) { @@ -8008,6 +8138,16 @@ tree_ssa_iv_optimize_loop (struct ivopts_data *data, class loop *loop, if (!find_induction_variables (data)) goto finish; + if (param_iv_consider_reg_offset_for_unroll != 0 && exit) + { + tree_niter_desc *desc = niter_for_exit (data, exit); + estimate_unroll_factor (loop, desc); + data->consider_reg_offset_for_unroll_p = loop->estimated_unroll > 1; + if (dump_file && (dump_flags & TDF_DETAILS) + && data->consider_reg_offset_for_unroll_p) + fprintf (dump_file, "\nEstimated_unroll:%u\n", loop->estimated_unroll); + } + /* Finds interesting uses (item 1). */ find_interesting_uses (data); if (data->vgroups.length () > MAX_CONSIDERED_GROUPS)