Sethi-Ullman numbering is a heuristic that lets us see a little farther when it comes to making decisions based on register pressure, instead of only greedily trying to decrease register pressure based on a threshold. It provides a lower bound on the number of registers required to schedule a node and all of its children, which we can use to avoid scheduling nodes that would definitely require more registers than we can afford.
total instructions in shared programs: 7564666 -> 7564666 (0.00%) instructions in affected programs: 0 -> 0 helped: 0 HURT: 0 total cycles in shared programs: 51126212 -> 50217894 (-1.78%) cycles in affected programs: 8758434 -> 7850116 (-10.37%) helped: 1754 HURT: 1462 LOST: 36 GAINED: 72 Signed-off-by: Connor Abbott <cwabbo...@gmail.com> --- .../drivers/dri/i965/brw_schedule_instructions.cpp | 197 ++++++++++++++++++--- 1 file changed, 169 insertions(+), 28 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp index 85590df..af992a4 100644 --- a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp +++ b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp @@ -82,6 +82,12 @@ public: * its children, or just the issue_time if it's a leaf node. */ int delay; + + /* + * The Sethi-Ullman number for this node, which is a lower bound on the + * number of registers required to schedule it. + */ + int regs_required; }; void @@ -456,6 +462,7 @@ public: void run(cfg_t *cfg); void add_insts_from_block(bblock_t *block); void compute_delay(schedule_node *node); + void compute_regs_required(schedule_node *node); virtual void calculate_deps() = 0; virtual schedule_node *choose_instruction_to_schedule() = 0; @@ -472,6 +479,7 @@ public: virtual void setup_liveness(cfg_t *cfg) = 0; virtual void update_register_pressure(backend_instruction *inst) = 0; virtual int get_register_pressure_benefit(backend_instruction *inst) = 0; + virtual int dst_size(schedule_node *n) = 0; void schedule_instructions(bblock_t *block); @@ -553,7 +561,10 @@ public: void count_writes_remaining(backend_instruction *inst); void setup_liveness(cfg_t *cfg); void update_register_pressure(backend_instruction *inst); + void update_unblocked(backend_instruction *inst); + int get_register_write_benefit(fs_inst *inst); int get_register_pressure_benefit(backend_instruction *inst); + int dst_size(schedule_node *n); }; 
fs_instruction_scheduler::fs_instruction_scheduler(fs_visitor *v, @@ -676,17 +687,23 @@ fs_instruction_scheduler::update_register_pressure(backend_instruction *be) } int -fs_instruction_scheduler::get_register_pressure_benefit(backend_instruction *be) +fs_instruction_scheduler::get_register_write_benefit(fs_inst *inst) { - fs_inst *inst = (fs_inst *)be; - int benefit = 0; - if (inst->dst.file == GRF) { if (!BITSET_TEST(livein[block_idx], inst->dst.reg) && writes_remaining[inst->dst.reg] == 1) - benefit += v->alloc.sizes[inst->dst.reg]; + return v->alloc.sizes[inst->dst.reg]; } + return 0; +} + +int +fs_instruction_scheduler::get_register_pressure_benefit(backend_instruction *be) +{ + fs_inst *inst = (fs_inst *)be; + int benefit = get_register_write_benefit(inst); + for (int i = 0; i < inst->sources; i++) { if (is_src_duplicate(inst, i)) continue; @@ -725,6 +742,7 @@ public: void setup_liveness(cfg_t *cfg); void update_register_pressure(backend_instruction *inst); int get_register_pressure_benefit(backend_instruction *inst); + int dst_size(schedule_node *n); }; vec4_instruction_scheduler::vec4_instruction_scheduler(vec4_visitor *v, @@ -768,6 +786,7 @@ schedule_node::schedule_node(backend_instruction *inst, this->parent_count = 0; this->unblocked_time = 0; this->delay = 0; + this->regs_required = 0; /* We can't measure Gen6 timings directly but expect them to be much * closer to Gen7 than Gen4. 
@@ -818,6 +837,95 @@ instruction_scheduler::compute_delay(schedule_node *n) } } +int +fs_instruction_scheduler::dst_size(schedule_node *n) +{ + fs_inst *inst = (fs_inst *) n->inst; + if (inst->dst.file != GRF) + return 0; + + return v->alloc.sizes[inst->dst.reg]; +} + +int +vec4_instruction_scheduler::dst_size(schedule_node *n) +{ + return 0; +} + +static int compare(const void *data1, const void *data2) +{ + const schedule_node *n1 = *(schedule_node **)data1; + const schedule_node *n2 = *(schedule_node **)data2; + return n1->regs_required - n2->regs_required; +} + +/** + * Compute the Sethi-Ullman numbering for a node. + * + * The Sethi-Ullman numbering is a lower bound for the number of registers + * required to schedule a given node and all of its children in the DAG. Say + * that we've scheduled all of a node N's children optimally. We'll ignore any + * interactions between the children, and assume that each child can be + * scheduled independently and one after another; this is an approximation, + * only correct for expression trees without any barriers. We schedule the + * child that takes the least registers first (although in the final order, + * it'll be last because we're doing bottom-up scheduling), and order the rest + * by the number of registers they require. We have a list of children C_1, + * C_2, ... of node N that are in increasing order of registers required. + * Now, say that C_n+1 requires dst_size(C_n+1) more registers than C_n. The + * result of C_n+1 can be passed through C_n without increasing the number of + * registers required beyond what C_n+1 requires, so the number of registers + * required is the same as the number of registers that C_n+1 requires. + * Otherwise, the number of registers required will be regs_required(C_n) + + * dst_size(C_n+1). Furthermore, this number of registers required defines a + * new "high water mark" against which to compare the next instruction's + * registers required. 
Any later children have to pass their result through + * the high water mark, and doing so will increase the number of registers + * required, unless the new source C_m requires even more registers; then the + * high water mark resets to C_m. We don't need to store where the high water + * mark actually is, though; all we need to know is its current height (i.e. + * the current number of registers required) as we compare it to each child in + * turn. + */ +void +instruction_scheduler::compute_regs_required(schedule_node *n) +{ + int num_children = 0; + for (int i = 0; i < n->child_count; i++) { + /* Assume dependencies with a latency of 0 are fake and ignore them */ + if (n->child_latency[i] == 0) + continue; + + num_children++; + } + + if (num_children == 0) { + n->regs_required = dst_size(n); + return; + } + + schedule_node *children[num_children]; + int j = 0; + for (int i = 0; i < n->child_count; i++) { + if (n->child_latency[i] == 0) + continue; + + children[j++] = n->children[i]; + } + + /* sort the children in increasing order of registers required */ + qsort(children, num_children, sizeof(schedule_node *), compare); + + n->regs_required = children[0]->regs_required; + for (int i = 1; i < num_children; i++) { + n->regs_required = MAX2(n->regs_required + dst_size(children[i]), + children[i]->regs_required); + } + + n->regs_required = MAX2(n->regs_required, dst_size(n)); +} + /** * Add a dependency between two instruction nodes. * @@ -1391,27 +1499,56 @@ fs_instruction_scheduler::choose_instruction_to_schedule() schedule_node *chosen = NULL; if (post_reg_alloc || reg_pressure < pressure_threshold) { - int chosen_unblocked_time = 0, chosen_delay = 0; - - /* First, find the earliest instruction we can possibly schedule. Then, - * if there are multiple instructions that we can schedule at the same - * time, choose the thing with the longest critical path. 
The idea here - * is to try not to lengthen already larger critical path lengths, but - * if we can, we should first schedule instructions that we can fit in - * before the instruction with the longest critical path. There might be - * some cases where we still bump the critical path instruction, but - * this seems unlikely, given that most instructions have the same issue - * time and most latencies are a multiple of the issue time. - */ foreach_in_list(schedule_node, n, &instructions) { + if (!chosen) { + chosen = n; + continue; + } + + if (!post_reg_alloc) { + /* Penalize choices which require more registers than the + * threshold. Note that if a choice benefits from ending a live + * interval, we should take that into account; for example, + * imagine we have an add instruction that takes two inputs. The + * instruction requires two registers to schedule, but it ends the + * live range of the register it writes to, then it only requires + * one additional register beyond the current register pressure. + */ + int write_benefit = get_register_write_benefit((fs_inst *)n->inst); + int chosen_write_benefit = + get_register_write_benefit((fs_inst *)chosen->inst); + int excess = MAX2(0, reg_pressure + n->regs_required + - write_benefit - pressure_threshold); + int chosen_excess = MAX2(0, reg_pressure + chosen->regs_required + - chosen_write_benefit + - pressure_threshold); + if (excess < chosen_excess) { + chosen = n; + continue; + } else if (excess > chosen_excess) { + continue; + } + } + + /* First, find the earliest instruction we can possibly schedule. + * Then, if there are multiple instructions that we can schedule at + * the same time, choose the thing with the longest critical path. + * The idea here is to try not to lengthen already larger critical + * path lengths, but if we can, we should first schedule instructions + * that we can fit in before the instruction with the longest + * critical path. 
There might be some cases where we still bump the + * critical path instruction, but this seems unlikely, given that + * most instructions have the same issue time and most latencies are + * a multiple of the issue time. + */ int unblocked_time = MAX2(n->unblocked_time, time); - if (!chosen || - unblocked_time < chosen_unblocked_time || + int chosen_unblocked_time = MAX2(chosen->unblocked_time, time); + + if (unblocked_time < chosen_unblocked_time || (unblocked_time == chosen_unblocked_time && - n->delay > chosen_delay)) { + n->delay > chosen->delay)) { chosen = n; - chosen_unblocked_time = unblocked_time; - chosen_delay = n->delay; + continue; } } } else { @@ -1445,15 +1582,13 @@ fs_instruction_scheduler::choose_instruction_to_schedule() } /* For instructions with the same benefit, prefer the one with the - * highest delay to the end of the program. This is most likely to - * have its values able to be consumed first (such as for a large - * tree of lowered ubo loads, which appear reversed in the - * instruction stream with respect to when they can be consumed). + * lowest number of registers required. This will reduce the amount + * we go over the threshold. */ - if (n->delay < chosen->delay) { + if (n->regs_required < chosen->regs_required) { chosen = n; continue; - } else if (n->delay > chosen->delay) { + } else if (n->regs_required > chosen->regs_required) { continue; } @@ -1651,6 +1786,12 @@ instruction_scheduler::run(cfg_t *cfg) compute_delay(n); } + if (!post_reg_alloc) { + foreach_in_list(schedule_node, n, &instructions) { + compute_regs_required(n); + } + } + schedule_instructions(block); } -- 2.4.3 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev