Certain platforms support the ability to sample from a texture, and write it out to the file RT - thus saving a costly send instructions (and before Topi's recent patch, excess MOVs),
On BSW, the performance data isn't quite what I was hoping for. Statistically, it is an improvement. Several microbenchmarks improve between 0-2%, and Egypt improves by similar amounts. No benchmarks regress with n=10. BDW and SKL also support this, testing would be welcome. The ministat with n=10 Egypt: 1.94611% +/- 1.35665% Egpyt Offscreen: 1.2429% +/- 0.834759% v2: Modify the algorithm. Instead of iterating in reverse through blocks and insts, since the last block/inst is the only thing which can benefit. Rebased on top of Ken's patching modifying is_last_send Cc: Kenneth Graunke <kenn...@whitecape.org> Cc: Jason Ekstrand <jason.ekstr...@intel.com> Signed-off-by: Ben Widawsky <b...@bwidawsk.net> --- src/mesa/drivers/dri/i965/brw_fs.cpp | 101 +++++++++++++++++++++ src/mesa/drivers/dri/i965/brw_fs.h | 3 + .../dri/i965/brw_fs_dead_code_eliminate.cpp | 2 +- src/mesa/drivers/dri/i965/brw_fs_generator.cpp | 12 +++ 4 files changed, 117 insertions(+), 1 deletion(-) diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index 78feee2..4b8fb4a 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -2452,6 +2452,106 @@ fs_visitor::opt_algebraic() return progress; } +/* Set up the sampler message to also have EOT set */ +void +fs_visitor::combine_send_tex(struct bblock_t *block, + fs_inst *tex_inst, + fs_inst *fb_write) +{ + assert((tex_inst->offset & (0xff << 24)) == 0); + assert(!tex_inst->eot); /* We can't get here twice */ + + fs_inst *old_load_payload = (fs_inst *) tex_inst->prev; + /* It's unlikely yet possible the load_payload is further above */ + if (old_load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD) { + perf_debug("Potential send collapse skipped: Where's LOAD_PAYLOAD?"); + return; + } + + tex_inst->offset |= fb_write->target << 24; + tex_inst->eot = true; + fb_write->remove(block); + + /* The texture instruction with EOT requires a header. If one is already + * present, there is nothing to do. + */ + if (tex_inst->header_present) + return; + + /* If a header is not present, we need to create a new LOAD_PAYLOAD command + * with the same sources and a space saved for the header. Using a new + * destination register not only makes sure we have enough space, but it will + * make sure the dead code eliminator kills the old command. + */ + tex_inst->mlen++; + tex_inst->header_present = true; + + fs_reg send_header = vgrf(old_load_payload->sources + 1); + fs_reg *new_sources = + ralloc_array(mem_ctx, fs_reg, old_load_payload->sources + 1); + + new_sources[0] = fs_reg(); + for (int i = 0; i < old_load_payload->sources; i++) + new_sources[i+1] = old_load_payload->src[i]; + + /* The LOAD_PAYLOAD helper seems like the obvious choice here. However, it + * requires a lot of information about the sources to appropriately figure + * out the number of registers needed to be used. Given this stage in our + * optimization, we may not have the appropriate GRFs required by + * LOAD_PAYLOAD at this point (copy propogation). Therefore, we need to + * manually emit the instruction. + * + * jekstrand: "The helper is too automagic for what you're doing." + */ + fs_inst *new_load_payload = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, + old_load_payload->exec_size, + send_header, + new_sources, + old_load_payload->sources + 1); + + new_load_payload->regs_written = old_load_payload->regs_written + 1; + + tex_inst->insert_before(block, new_load_payload); + tex_inst->src[0] = send_header; + +} + +/* Try to collapse two sends into one. + * + * The first send should be sampling from a texture, the second send should be + * writing out the returned information to the RT. + */ +bool +fs_visitor::opt_sampler_eot() +{ + brw_wm_prog_key *key = (brw_wm_prog_key*) this->key; + + if (brw->gen < 9 && !brw->is_cherryview) + return false; + + if (key->nr_color_regions != 1) + return false; + + fs_inst *fb_write = (fs_inst *) cfg->blocks[cfg->num_blocks - 1]->end(); + fs_inst *tex_inst = (fs_inst *) fb_write->prev; + + if (tex_inst->is_head_sentinel()) + return false; + + /* It will potentially be multiple iterations of optimizations before we get + * the condition we wish to optimize for. + */ + if (!tex_inst->is_tex()) + return false; + + assert(fb_write->eot); + assert(fb_write->opcode == FS_OPCODE_FB_WRITE); + + combine_send_tex(cfg->blocks[cfg->num_blocks - 1], tex_inst, fb_write); + + return true; +} + bool fs_visitor::opt_register_renaming() { @@ -3609,6 +3709,7 @@ fs_visitor::optimize() OPT(opt_peephole_predicated_break); OPT(opt_cmod_propagation); OPT(dead_code_eliminate); + OPT(opt_sampler_eot); OPT(opt_peephole_sel); OPT(dead_control_flow_eliminate, this); OPT(opt_register_renaming); diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h index b95e2c0..c9f5ab0 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.h +++ b/src/mesa/drivers/dri/i965/brw_fs.h @@ -459,6 +459,9 @@ public: bool compute_to_mrf(); bool dead_code_eliminate(); bool remove_duplicate_mrf_writes(); + + void combine_send_tex(bblock_t *block, fs_inst *tex_inst, fs_inst *fb_write); + bool opt_sampler_eot(); bool virtual_grf_interferes(int a, int b); void schedule_instructions(instruction_scheduler_mode mode); void insert_gen4_send_dependency_workarounds(); diff --git a/src/mesa/drivers/dri/i965/brw_fs_dead_code_eliminate.cpp b/src/mesa/drivers/dri/i965/brw_fs_dead_code_eliminate.cpp index d66808b..e5e52c8 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_dead_code_eliminate.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_dead_code_eliminate.cpp @@ -52,7 +52,7 @@ fs_visitor::dead_code_eliminate() sizeof(BITSET_WORD)); foreach_inst_in_block_reverse(fs_inst, inst, block) { - if (inst->dst.file == GRF && !inst->has_side_effects()) { + if (!inst->eot && inst->dst.file == GRF && !inst->has_side_effects()) { bool result_live = false; if (inst->regs_written == 1) { diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp index 8cd36f8..9f2ebaf 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp @@ -503,6 +503,7 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src int rlen = 4; uint32_t simd_mode; uint32_t return_format; + bool is_combined_send = inst->eot; switch (dst.type) { case BRW_REGISTER_TYPE_D: @@ -662,6 +663,11 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src dst = vec16(dst); } + if (is_combined_send) { + assert(brw->gen >= 9 || brw->is_cherryview); + rlen = 0; + } + assert(brw->gen < 7 || !inst->header_present || src.file == BRW_GENERAL_REGISTER_FILE); @@ -777,6 +783,12 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src * so has already done marking. */ } + + if (is_combined_send) { + brw_inst_set_eot(brw, brw_last_inst, true); + brw_set_dest(p, brw_last_inst, brw_null_reg()); + brw_inst_set_opcode(brw, brw_last_inst, BRW_OPCODE_SENDC); + } } -- 2.3.0 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev