On Fri, Oct 2, 2015 at 2:37 PM, Connor Abbott <cwabbo...@gmail.com> wrote: > Previously, we were using some heuristics to try and detect when a write > was about to begin a live range, or when a read was about to end a live > range. We never used the liveness analysis information used by the > register allocator, though, which meant that the scheduler's and the > allocator's ideas of when a live range began and ended were different. > Not only did this make our estimate of the register pressure benefit of > scheduling an instruction wrong in some cases, but it was preventing us > from knowing the actual register pressure when scheduling each > instruction, which we want to have in order to switch to register > pressure scheduling only when the register pressure is too high. > > This commit rewrites the register pressure tracking code to use the same > model as our register allocator currently uses. We use the results of > liveness analysis, as well as the compute_payload_ranges() function that > we split out in the last commit. This means that we compute live ranges > twice on each round through the register allocator, although we could > speed it up by only recomputing the ranges and not the live in/live out > sets after scheduling, since we only shuffle around instructions within > a single basic block when we schedule. > > Shader-db results on bdw: > > total instructions in shared programs: 7130187 -> 7129880 (-0.00%) > instructions in affected programs: 1744 -> 1437 (-17.60%) > helped: 1 > HURT: 1 > > total cycles in shared programs: 172535126 -> 172473226 (-0.04%) > cycles in affected programs: 11338636 -> 11276736 (-0.55%) > helped: 876 > HURT: 873 > > LOST: 8 > GAINED: 0 > Signed-off-by: Connor Abbott <cwabbo...@gmail.com> > --- > The results are a wash, but this is needed for a lot of the more > experimental things I want to do. I can drop this if there are any > objections. > > .../drivers/dri/i965/brw_schedule_instructions.cpp | 300 > +++++++++++++++++---- > 1 file changed, 244 insertions(+), 56 deletions(-) > > diff --git a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp > b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp > index 22a493f..6b8792b 100644 > --- a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp > +++ b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp > @@ -26,6 +26,7 @@ > */ > > #include "brw_fs.h" > +#include "brw_fs_live_variables.h" > #include "brw_vec4.h" > #include "brw_cfg.h" > #include "brw_shader.h" > @@ -400,22 +401,49 @@ schedule_node::set_latency_gen7(bool is_haswell) > class instruction_scheduler { > public: > instruction_scheduler(backend_shader *s, int grf_count, > + int hw_reg_count, int block_count, > instruction_scheduler_mode mode) > { > this->bs = s; > this->mem_ctx = ralloc_context(NULL); > this->grf_count = grf_count; > + this->hw_reg_count = hw_reg_count; > this->instructions.make_empty(); > this->instructions_to_schedule = 0; > this->post_reg_alloc = (mode == SCHEDULE_POST); > this->mode = mode; > this->time = 0; > if (!post_reg_alloc) { > - this->remaining_grf_uses = rzalloc_array(mem_ctx, int, grf_count); > - this->grf_active = rzalloc_array(mem_ctx, bool, grf_count); > + this->reg_pressure_in = rzalloc_array(mem_ctx, int, block_count); > + > + this->livein = ralloc_array(mem_ctx, BITSET_WORD *, block_count); > + for (int i = 0; i < block_count; i++) > + this->livein[i] = rzalloc_array(mem_ctx, BITSET_WORD, > + BITSET_WORDS(grf_count)); > + > + this->liveout = ralloc_array(mem_ctx, BITSET_WORD *, block_count); > + for (int i = 0; i < block_count; i++) > + this->liveout[i] = rzalloc_array(mem_ctx, BITSET_WORD, > + BITSET_WORDS(grf_count)); > + > + this->hw_liveout = ralloc_array(mem_ctx, BITSET_WORD *, > block_count); > + for (int i = 0; i < block_count; i++) > + this->hw_liveout[i] = rzalloc_array(mem_ctx, BITSET_WORD, > + BITSET_WORDS(hw_reg_count)); > + > + this->written = rzalloc_array(mem_ctx, bool, grf_count); > + > + this->reads_remaining = rzalloc_array(mem_ctx, int, grf_count); > + > + this->hw_reads_remaining = rzalloc_array(mem_ctx, int, > hw_reg_count); > } else { > - this->remaining_grf_uses = NULL; > - this->grf_active = NULL; > + this->reg_pressure_in = NULL; > + this->livein = NULL; > + this->liveout = NULL; > + this->hw_liveout = NULL; > + this->written = NULL; > + this->reads_remaining = NULL; > + this->hw_reads_remaining = NULL; > } > } > > @@ -442,7 +470,8 @@ public: > */ > virtual int issue_time(backend_instruction *inst) = 0; > > - virtual void count_remaining_grf_uses(backend_instruction *inst) = 0; > + virtual void count_reads_remaining(backend_instruction *inst) = 0; > + virtual void setup_liveness(cfg_t *cfg) = 0; > virtual void update_register_pressure(backend_instruction *inst) = 0; > virtual int get_register_pressure_benefit(backend_instruction *inst) = 0; > > @@ -453,33 +482,63 @@ public: > bool post_reg_alloc; > int instructions_to_schedule; > int grf_count; > + int hw_reg_count; > int time; > + int reg_pressure; > + int block_idx; > exec_list instructions; > backend_shader *bs; > > instruction_scheduler_mode mode; > > - /** > - * Number of instructions left to schedule that reference each vgrf. > - * > - * Used so that we can prefer scheduling instructions that will end the > - * live intervals of multiple variables, to reduce register pressure. > + /* > + * The register pressure at the beginning of each basic block. > */ > - int *remaining_grf_uses; > > - /** > - * Tracks whether each VGRF has had an instruction scheduled that uses it. > - * > - * This is used to estimate whether scheduling a new instruction will > - * increase register pressure. > + int *reg_pressure_in; > + > + /* > + * The virtual GRF's whose range overlaps the beginning of each basic > block. > */ > - bool *grf_active; > + > + BITSET_WORD **livein; > + > + /* > + * The virtual GRF's whose range overlaps the end of each basic block. > + */ > + > + BITSET_WORD **liveout; > + > + /* > + * The hardware GRF's whose range overlaps the end of each basic block. > + */ > + > + BITSET_WORD **hw_liveout; > + > + /* > + * Whether we've scheduled a write for this virtual GRF yet. > + */ > + > + bool *written; > + > + /* > + * How many reads we haven't scheduled for this virtual GRF yet. > + */ > + > + int *reads_remaining; > + > + /* > + * How many reads we haven't scheduled for this hardware GRF yet. > + */ > + > + int *hw_reads_remaining; > }; > > class fs_instruction_scheduler : public instruction_scheduler > { > public: > - fs_instruction_scheduler(fs_visitor *v, int grf_count, > + fs_instruction_scheduler(fs_visitor *v, int grf_count, int hw_reg_count, > + int block_count, > instruction_scheduler_mode mode); > void calculate_deps(); > bool is_compressed(fs_inst *inst); > @@ -487,35 +546,115 @@ public: > int issue_time(backend_instruction *inst); > fs_visitor *v; > > - void count_remaining_grf_uses(backend_instruction *inst); > + void count_reads_remaining(backend_instruction *inst); > + void setup_liveness(cfg_t *cfg); > void update_register_pressure(backend_instruction *inst); > int get_register_pressure_benefit(backend_instruction *inst); > }; > > fs_instruction_scheduler::fs_instruction_scheduler(fs_visitor *v, > - int grf_count, > + int grf_count, int > hw_reg_count, > + int block_count, > > instruction_scheduler_mode mode) > - : instruction_scheduler(v, grf_count, mode), > + : instruction_scheduler(v, grf_count, hw_reg_count, block_count, mode), > v(v) > { > } > > +static bool > +is_src_duplicate(fs_inst *inst, int src) > +{ > + for (int i = 0; i < src; i++) > + if (inst->src[i].equals(inst->src[src])) > + return true; > + > + return false; > +} > + > void > -fs_instruction_scheduler::count_remaining_grf_uses(backend_instruction *be) > +fs_instruction_scheduler::count_reads_remaining(backend_instruction *be) > { > fs_inst *inst = (fs_inst *)be; > > - if (!remaining_grf_uses) > + if (!reads_remaining) > return; > > - if (inst->dst.file == GRF) > - remaining_grf_uses[inst->dst.reg]++; > - > for (int i = 0; i < inst->sources; i++) { > - if (inst->src[i].file != GRF) > + if (is_src_duplicate(inst, i)) > + continue; > + > + if (inst->src[i].file == GRF) { > + reads_remaining[inst->src[i].reg]++; > + } else if (inst->src[i].file == HW_REG && > + inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) { > + if (inst->src[i].fixed_hw_reg.nr >= hw_reg_count) > + continue; > + > + hw_reads_remaining[inst->src[i].fixed_hw_reg.nr]++; > + > + if (v->devinfo->gen >= 6 && > + inst->opcode == FS_OPCODE_LINTERP && i == 0) { > + for (int j = 1; j < 4; j++) {
We should use regs_read() here > + hw_reads_remaining[inst->src[i].fixed_hw_reg.nr + j]++; > + } > + } > + } > + } > +} > + > +void > +fs_instruction_scheduler::setup_liveness(cfg_t *cfg) > +{ > + /* First, compute liveness on a per-GRF level using the in/out sets from > + * liveness calculation. > + */ > + for (int block = 0; block < cfg->num_blocks; block++) { > + for (int i = 0; i < v->live_intervals->num_vars; i++) { > + if (BITSET_TEST(v->live_intervals->block_data[block].livein, i)) { > + int vgrf = v->live_intervals->vgrf_from_var[i]; > + if (!BITSET_TEST(livein[block], vgrf)) { > + reg_pressure_in[block] += v->alloc.sizes[vgrf]; > + BITSET_SET(livein[block], vgrf); > + } > + } > + > + if (BITSET_TEST(v->live_intervals->block_data[block].liveout, i)) > + BITSET_SET(liveout[block], v->live_intervals->vgrf_from_var[i]); > + } > + } > + > + /* Now, extend the live in/live out sets for when a range crosses a block > + * boundary, which matches what our register allocator/interference code > + * does to account for force_writemask_all and incompatible exec_mask's. > + */ > + for (int block = 0; block < cfg->num_blocks - 1; block++) { > + for (int i = 0; i < grf_count; i++) { > + if (v->virtual_grf_start[i] <= cfg->blocks[block]->end_ip && > + v->virtual_grf_end[i] >= cfg->blocks[block + 1]->start_ip) { > + if (!BITSET_TEST(livein[block + 1], i)) { > + reg_pressure_in[block + 1] += v->alloc.sizes[i]; > + BITSET_SET(livein[block + 1], i); > + } > + > + BITSET_SET(liveout[block], i); > + } > + } > + } > + > + int payload_last_use_ip[hw_reg_count]; > + v->calculate_payload_ranges(hw_reg_count, payload_last_use_ip); > + > + for (int i = 0; i < hw_reg_count; i++) { > + if (payload_last_use_ip[i] == -1) > continue; > > - remaining_grf_uses[inst->src[i].reg]++; > + for (int block = 0; block < cfg->num_blocks; block++) { > + if (cfg->blocks[block]->start_ip <= payload_last_use_ip[i]) > + reg_pressure_in[block]++; > + > + if (cfg->blocks[block]->end_ip <= payload_last_use_ip[i]) > + BITSET_SET(hw_liveout[block], i); > + } > } > } > > @@ -524,18 +663,29 @@ > fs_instruction_scheduler::update_register_pressure(backend_instruction *be) > { > fs_inst *inst = (fs_inst *)be; > > - if (!remaining_grf_uses) > + if (!reads_remaining) > return; > > if (inst->dst.file == GRF) { > - remaining_grf_uses[inst->dst.reg]--; > - grf_active[inst->dst.reg] = true; > + written[inst->dst.reg] = true; > } > > for (int i = 0; i < inst->sources; i++) { > + if (is_src_duplicate(inst, i)) > + continue; > + > if (inst->src[i].file == GRF) { > - remaining_grf_uses[inst->src[i].reg]--; > - grf_active[inst->src[i].reg] = true; > + reads_remaining[inst->src[i].reg]--; > + } else if (inst->src[i].file == HW_REG && > + inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE > && > + inst->src[i].fixed_hw_reg.nr < hw_reg_count) { > + int regs_read = (v->devinfo->gen >= 6 && > + inst->opcode == FS_OPCODE_LINTERP && > + i == 0) ? 4 : 1; and here > + > + for (int off = 0; off < regs_read; off++) { > + hw_reads_remaining[inst->src[i].fixed_hw_reg.nr + off]--; > + } > } > } > } > @@ -547,20 +697,35 @@ > fs_instruction_scheduler::get_register_pressure_benefit(backend_instruction > *be) > int benefit = 0; > > if (inst->dst.file == GRF) { > - if (remaining_grf_uses[inst->dst.reg] == 1) > - benefit += v->alloc.sizes[inst->dst.reg]; > - if (!grf_active[inst->dst.reg]) > + if (!BITSET_TEST(livein[block_idx], inst->dst.reg) && > + !written[inst->dst.reg]) > benefit -= v->alloc.sizes[inst->dst.reg]; > } > > for (int i = 0; i < inst->sources; i++) { > - if (inst->src[i].file != GRF) > + if (is_src_duplicate(inst, i)) > continue; > > - if (remaining_grf_uses[inst->src[i].reg] == 1) > + if (inst->src[i].file == GRF && > + !BITSET_TEST(liveout[block_idx], inst->src[i].reg) && > + reads_remaining[inst->src[i].reg] == 1) > benefit += v->alloc.sizes[inst->src[i].reg]; > - if (!grf_active[inst->src[i].reg]) > - benefit -= v->alloc.sizes[inst->src[i].reg]; > + > + if (inst->src[i].file == HW_REG && > + inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE && > + inst->src[i].fixed_hw_reg.nr < hw_reg_count) { > + int regs_read = (v->devinfo->gen >= 6 && > + inst->opcode == FS_OPCODE_LINTERP && > + i == 0) ? 4 : 1; and here Other than that, it seems reasonable Reviewed-by: Jason Ekstrand <jason.ekstr...@intel.com> > + > + for (int off = 0; off < regs_read; off++) { > + int reg = inst->src[i].fixed_hw_reg.nr + off; > + if (!BITSET_TEST(hw_liveout[block_idx], reg) && > + hw_reads_remaining[reg] == 1) { > + benefit++; > + } > + } > + } > } > > return benefit; > @@ -575,20 +740,26 @@ public: > int issue_time(backend_instruction *inst); > vec4_visitor *v; > > - void count_remaining_grf_uses(backend_instruction *inst); > + void count_reads_remaining(backend_instruction *inst); > + void setup_liveness(cfg_t *cfg); > void update_register_pressure(backend_instruction *inst); > int get_register_pressure_benefit(backend_instruction *inst); > }; > > vec4_instruction_scheduler::vec4_instruction_scheduler(vec4_visitor *v, > int grf_count) > - : instruction_scheduler(v, grf_count, SCHEDULE_POST), > + : instruction_scheduler(v, grf_count, 0, 0, SCHEDULE_POST), > v(v) > { > } > > void > -vec4_instruction_scheduler::count_remaining_grf_uses(backend_instruction *be) > +vec4_instruction_scheduler::count_reads_remaining(backend_instruction *be) > +{ > +} > + > +void > +vec4_instruction_scheduler::setup_liveness(cfg_t *cfg) > { > } > > @@ -1387,6 +1558,9 @@ instruction_scheduler::schedule_instructions(bblock_t > *block) > const struct brw_device_info *devinfo = bs->devinfo; > backend_instruction *inst = block->end(); > time = 0; > + if (!post_reg_alloc) > + reg_pressure = reg_pressure_in[block->num]; > + block_idx = block->num; > > /* Remove non-DAG heads from the list. */ > foreach_in_list_safe(schedule_node, n, &instructions) { > @@ -1403,7 +1577,11 @@ instruction_scheduler::schedule_instructions(bblock_t > *block) > chosen->remove(); > inst->insert_before(block, chosen->inst); > instructions_to_schedule--; > - update_register_pressure(chosen->inst); > + > + if (!post_reg_alloc) { > + reg_pressure -= get_register_pressure_benefit(chosen->inst); > + update_register_pressure(chosen->inst); > + } > > /* If we expected a delay for scheduling, then bump the clock to > reflect > * that. In reality, the hardware will switch to another hyperthread > @@ -1421,6 +1599,8 @@ instruction_scheduler::schedule_instructions(bblock_t > *block) > if (debug) { > fprintf(stderr, "clock %4d, scheduled: ", time); > bs->dump_instruction(chosen->inst); > + if (!post_reg_alloc) > + fprintf(stderr, "(register pressure %d)\n", reg_pressure); > } > > /* Now that we've scheduled a new instruction, some of its > @@ -1490,25 +1670,30 @@ static unsigned get_cycle_count(cfg_t *cfg) > void > instruction_scheduler::run(cfg_t *cfg) > { > - if (debug) { > + if (debug && !post_reg_alloc) { > fprintf(stderr, "\nInstructions before scheduling (reg_alloc %d)\n", > post_reg_alloc); > - bs->dump_instructions(); > + bs->dump_instructions(); > } > > - /* Populate the remaining GRF uses array to improve the pre-regalloc > - * scheduling. > - */ > - if (remaining_grf_uses) { > - foreach_block_and_inst(block, backend_instruction, inst, cfg) { > - count_remaining_grf_uses(inst); > - } > - } > + if (!post_reg_alloc) > + setup_liveness(cfg); > > foreach_block(block, cfg) { > if (block->end_ip - block->start_ip <= 1) > continue; > > + if (reads_remaining) { > + memset(reads_remaining, 0, > + grf_count * sizeof(*reads_remaining)); > + memset(hw_reads_remaining, 0, > + hw_reg_count * sizeof(*hw_reads_remaining)); > + memset(written, 0, grf_count * sizeof(*written)); > + > + foreach_inst_in_block(fs_inst, inst, block) > + count_reads_remaining(inst); > + } > + > add_insts_from_block(block); > > calculate_deps(); > @@ -1520,7 +1705,7 @@ instruction_scheduler::run(cfg_t *cfg) > schedule_instructions(block); > } > > - if (debug) { > + if (debug && !post_reg_alloc) { > fprintf(stderr, "\nInstructions after scheduling (reg_alloc %d)\n", > post_reg_alloc); > bs->dump_instructions(); > @@ -1532,13 +1717,16 @@ instruction_scheduler::run(cfg_t *cfg) > void > fs_visitor::schedule_instructions(instruction_scheduler_mode mode) > { > + calculate_live_intervals(); > + > int grf_count; > if (mode == SCHEDULE_POST) > grf_count = grf_used; > else > grf_count = alloc.count; > > - fs_instruction_scheduler sched(this, grf_count, mode); > + fs_instruction_scheduler sched(this, grf_count, first_non_payload_grf, > + cfg->num_blocks, mode); > sched.run(cfg); > > if (unlikely(debug_enabled) && mode == SCHEDULE_POST) { > -- > 2.1.0 > > _______________________________________________ > mesa-dev mailing list > mesa-dev@lists.freedesktop.org > http://lists.freedesktop.org/mailman/listinfo/mesa-dev _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev