On 02/15/2013 10:26 PM, Eric Anholt wrote:
This should fix the register allocation explosion on the GLES 3.0 test on gen6.
Except that it doesn't. You can check by running it with INTEL_DEVID_OVERRIDE=0x116.
A tiny change below makes it work though...
--- src/mesa/drivers/dri/i965/brw_fs.cpp | 63 +++++++++++++++++++++++++- src/mesa/drivers/dri/i965/brw_fs.h | 1 + src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 28 ++---------- 3 files changed, 65 insertions(+), 27 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index 35cdc6a..1f98b27 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -1710,8 +1710,6 @@ fs_visitor::setup_pull_constants() dst, index, offset); pull->ir = inst->ir; pull->annotation = inst->annotation; - pull->base_mrf = 14; - pull->mlen = 1; inst->insert_before(pull); @@ -2447,6 +2445,66 @@ fs_visitor::insert_gen4_send_dependency_workarounds() } } +/** + * Turns the generic expression-style uniform pull constant load instruction + * into a hardware-specific series of instructions for loading a pull + * constant. + * + * The expression style allows the CSE pass before this to optimize out + * repeated loads from the same offset, and gives the pre-register-allocation + * scheduling full flexibility, while the conversion to native instructions + * allows the post-register-allocation scheduler the best information + * possible. + */ +void +fs_visitor::lower_uniform_pull_constant_loads() +{ + foreach_list(node, &this->instructions) { + fs_inst *inst = (fs_inst *)node; + + if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD) + continue; + + if (intel->gen >= 7) { + fs_reg const_offset_reg = inst->src[1]; + assert(const_offset_reg.file == IMM && + const_offset_reg.type == BRW_REGISTER_TYPE_UD); + const_offset_reg.imm.u /= 16; + fs_reg payload = fs_reg(this, glsl_type::uint_type); + struct brw_reg g0 = retype(brw_vec8_grf(0, 0), + BRW_REGISTER_TYPE_UD); + + fs_inst *setup1 = MOV(payload, fs_reg(g0)); + setup1->force_writemask_all = true; + /* We don't need the second half of this vgrf to be filled with g1 + * in the 16-wide case, but if we use force_uncompressed then live + * variable analysis won't consider this a def! + */ + + fs_inst *setup2 = new(mem_ctx) fs_inst(FS_OPCODE_SET_GLOBAL_OFFSET, + payload, payload, + const_offset_reg); + + setup1->ir = inst->ir; + setup1->annotation = inst->annotation; + inst->insert_before(setup1); + setup2->ir = inst->ir; + setup2->annotation = inst->annotation; + inst->insert_before(setup2); + inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7; + inst->src[1] = payload; + } else { + /* Before register allocation, we didn't tell the scheduler about the + * MRF we use. We know it's safe to use this MRF because nothing + * else does except for register spill/unspill, which generates and + * uses its MRF within a single IR instruction. + */ + inst->base_mrf = 14; + inst->mlen = 1; + } + } +} + void fs_visitor::dump_instruction(fs_inst *inst) { @@ -2746,6 +2804,7 @@ fs_visitor::run() remove_dead_constants(); + lower_uniform_pull_constant_loads(); schedule_instructions(false);
You need to do lower_uniform_pull_constant_loads() *after* the first scheduling pass, or else you hit the same strongly-ordered scheduling dependency problem we had before.
With tha change, the reg. alloc explosion is fixed. (I haven't actually tested on a real SNB though.)
Assuming you fix that (and it passes piglit), this series gets a: NOTE: This is a candidate for the 9.1 branch. Reviewed-by: Kenneth Graunke <kenn...@whitecape.org> Also, nice work! I'm really glad to see CSE of UBO loads. _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev