Instead of the complicated and broken-by-design pile of heuristics we had before, we now have a straightforward lowering:
1) All header sources are copied directly using force_writemask_all and, since they are guaranteed to be a single register, there are no force_sechalf issues. 2) All non-header sources are copied using the exact same force_sechalf and saturate modifiers as the LOAD_PAYLOAD operation itself. 3) In order to accommodate older gens that need interleaved colors, lower_load_payload detects when the destination is a COMPR4 register and automatically interleaves the non-header sources. The lower_load_payload pass does the right thing here regardless of whether or not the hardware actually supports COMPR4. --- src/mesa/drivers/dri/i965/brw_fs.cpp | 154 ++++++++++++++------------- src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 3 - 2 files changed, 80 insertions(+), 77 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index f8e26c0..bc45a38 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -3201,93 +3201,99 @@ fs_visitor::lower_load_payload() { bool progress = false; - int vgrf_to_reg[alloc.count]; - int reg_count = 0; - for (unsigned i = 0; i < alloc.count; ++i) { - vgrf_to_reg[i] = reg_count; - reg_count += alloc.sizes[i]; - } - - struct { - bool written:1; /* Whether this register has ever been written */ - bool force_writemask_all:1; - bool force_sechalf:1; - } metadata[reg_count]; - memset(metadata, 0, sizeof(metadata)); - foreach_block_and_inst_safe (block, fs_inst, inst, cfg) { - if (inst->dst.file == GRF) { - const int dst_reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset; - bool force_sechalf = inst->force_sechalf && - !inst->force_writemask_all; - bool toggle_sechalf = inst->dst.width == 16 && - type_sz(inst->dst.type) == 4 && - !inst->force_writemask_all; - for (int i = 0; i < inst->regs_written; ++i) { - metadata[dst_reg + i].written = true; - metadata[dst_reg + i].force_sechalf = force_sechalf; - metadata[dst_reg + i].force_writemask_all = inst->force_writemask_all; - force_sechalf = (toggle_sechalf != force_sechalf); - } - } - if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) { assert(inst->dst.file == MRF || inst->dst.file == GRF); + fs_reg dst = inst->dst; - for (int i = 0; i < inst->sources; i++) { - dst.width = inst->src[i].effective_width; - dst.type = inst->src[i].type; - - if (inst->src[i].file == BAD_FILE) { - /* Do nothing but otherwise increment as normal */ - } else if (dst.file == MRF && - dst.width == 8 && - brw->has_compr4 && - i + 4 < inst->sources && - inst->src[i + 4].equals(horiz_offset(inst->src[i], 8))) { - fs_reg compr4_dst = dst; - compr4_dst.reg += BRW_MRF_COMPR4; - compr4_dst.width = 16; - fs_reg compr4_src = inst->src[i]; - compr4_src.width = 16; - fs_inst *mov = MOV(compr4_dst, compr4_src); + /* Get rid of COMPR4. We'll add it back in if we need it */ + if (dst.file == MRF && dst.reg & BRW_MRF_COMPR4) + dst.reg = dst.reg & ~BRW_MRF_COMPR4; + + dst.width = 8; + for (uint8_t i = 0; i < inst->header_size; i++) { + if (inst->src[i].file != BAD_FILE) { + fs_reg mov_dst = retype(dst, BRW_REGISTER_TYPE_UD); + fs_reg mov_src = retype(inst->src[i], BRW_REGISTER_TYPE_UD); + mov_src.width = 8; + fs_inst *mov = MOV(mov_dst, mov_src); mov->force_writemask_all = true; inst->insert_before(block, mov); - /* Mark i+4 as BAD_FILE so we don't emit a MOV for it */ - inst->src[i + 4].file = BAD_FILE; - } else { - fs_inst *mov = MOV(dst, inst->src[i]); - if (inst->src[i].file == GRF) { - int src_reg = vgrf_to_reg[inst->src[i].reg] + - inst->src[i].reg_offset; - mov->force_sechalf = metadata[src_reg].force_sechalf; - mov->force_writemask_all = metadata[src_reg].force_writemask_all; - } else { - /* We don't have any useful metadata for immediates or - * uniforms. Assume that any of the channels of the - * destination may be used. - */ - assert(inst->src[i].file == IMM || - inst->src[i].file == UNIFORM); - mov->force_writemask_all = true; - } + } + dst = offset(dst, 1); + } - if (dst.file == GRF) { - const int dst_reg = vgrf_to_reg[dst.reg] + dst.reg_offset; - const bool force_writemask = mov->force_writemask_all; - metadata[dst_reg].force_writemask_all = force_writemask; - metadata[dst_reg].force_sechalf = mov->force_sechalf; - if (dst.width * type_sz(dst.type) > 32) { - assert(!mov->force_sechalf); - metadata[dst_reg + 1].force_writemask_all = force_writemask; - metadata[dst_reg + 1].force_sechalf = !force_writemask; + dst.width = inst->exec_size; + if (inst->dst.file == MRF && (inst->dst.reg & BRW_MRF_COMPR4) && + inst->exec_size > 8) { + /* In this case, the payload portion of the LOAD_PAYLOAD isn't + * a straightforward copy. Instead, the result of the + * LOAD_PAYLOAD is treated as interlaced and the first four + * non-header sources are unpacked as: + * + * m + 0: r0 + * m + 1: g0 + * m + 2: b0 + * m + 3: a0 + * m + 4: r1 + * m + 5: g1 + * m + 6: b1 + * m + 7: a1 + * + * This is used for gen <= 5 fb writes. + */ + assert(inst->exec_size == 16); + assert(inst->header_size + 4 <= inst->sources); + for (uint8_t i = inst->header_size; i < inst->header_size + 4; i++) { + if (inst->src[i].file != BAD_FILE) { + if (brw->has_compr4) { + fs_reg compr4_dst = retype(dst, inst->src[i].type); + compr4_dst.reg |= BRW_MRF_COMPR4; + + fs_inst *mov = MOV(compr4_dst, inst->src[i]); + mov->force_writemask_all = inst->force_writemask_all; + mov->saturate = inst->saturate; + inst->insert_before(block, mov); + } else { + /* Platform doesn't have COMPR4. We have to fake it */ + fs_reg mov_dst = retype(dst, inst->src[i].type); + mov_dst.width = 8; + + fs_inst *mov = MOV(mov_dst, half(inst->src[i], 0)); + mov->force_writemask_all = inst->force_writemask_all; + mov->saturate = inst->saturate; + inst->insert_before(block, mov); + + mov = MOV(offset(mov_dst, 4), half(inst->src[i], 1)); + mov->force_writemask_all = inst->force_writemask_all; + mov->saturate = inst->saturate; + mov->force_sechalf = true; + inst->insert_before(block, mov); } } - inst->insert_before(block, mov); + dst.reg++; } + dst.reg += 4; + + /* The COMPR4 code took care of the first 4 sources. We'll let + * the regular path handle any remaining sources. Yes, we are + * modifying the instruction but we're about to delete it so + * this really doesn't hurt anything. + */ + inst->header_size += 4; + } + + for (uint8_t i = inst->header_size; i < inst->sources; i++) { + if (inst->src[i].file != BAD_FILE) { + fs_inst *mov = MOV(retype(dst, inst->src[i].type), + inst->src[i]); + mov->force_writemask_all = inst->force_writemask_all; + mov->saturate = inst->saturate; + inst->insert_before(block, mov); + } dst = offset(dst, 1); } diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp index b7eeb47..a0d8b798 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp @@ -3471,9 +3471,6 @@ fs_visitor::setup_color_payload(fs_reg *dst, fs_reg color, unsigned components, color.type, color.width); inst = emit(MOV(dst[len], offset(color, i))); inst->saturate = key->clamp_fragment_color; - } else if (color.width == 16) { - /* We need two BAD_FILE slots for a 16-wide color */ - len++; } len++; } -- 2.3.4 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev