On Mon, 2016-02-29 at 22:06 -0800, Matt Turner wrote: > Some shaders from Synmark contain this loop: > > for (float i = 0.02; i < 0.9; i += 0.11) > > and in its body it uses both i and (1.0 - i). All 16 immediates are > promoted to registers (they're used by 3-src MAD instructions). By > recognizing that we can load 8 of these into a single register and then > subtract that from 1.0 to produce the other 8 values, we can load the 16 > values in 9 instructions rather than 16: > > total instructions in shared programs: 7121101 -> 7120611 (-0.01%) > instructions in affected programs: 12915 -> 12425 (-3.79%) > helped: 70 > > More importantly, because of the false dependencies between the mov(1) > instructions, instruction scheduling is not able to schedule enough work > after texture operations. With this patch, the mov(1)s and the add(8) > are emitted together and avoid the scheduling problems: > > total cycles in shared programs: 65013218 -> 64895688 (-0.18%) > cycles in affected programs: 378700 -> 261170 (-31.04%) > helped: 70 > > Synmark's OglPSPom improves by 2.42872% +/- 0.158891% (n=120) on a > Haswell GT3e. > --- > I've had this in a branch for more than a year now. Might as well send it. > > .../drivers/dri/i965/brw_fs_combine_constants.cpp | 93 > ++++++++++++++++++++++ > 1 file changed, 93 insertions(+) > > diff --git a/src/mesa/drivers/dri/i965/brw_fs_combine_constants.cpp > b/src/mesa/drivers/dri/i965/brw_fs_combine_constants.cpp > index d7a1456..f272263 100644 > --- a/src/mesa/drivers/dri/i965/brw_fs_combine_constants.cpp > +++ b/src/mesa/drivers/dri/i965/brw_fs_combine_constants.cpp > @@ -133,6 +133,9 @@ struct imm { > */ > bool must_promote; > > + /** Whether this constant has been inserted via a different method. 
*/ > + bool inserted; > + > uint16_t first_use_ip; > uint16_t last_use_ip; > }; > @@ -168,6 +171,90 @@ new_imm(struct table *table, void *mem_ctx) > } > > /** > + * Recognizes when a group of 8 immediates can be loaded by subtracting other > + * constants we're already loading from 1.0. > + */ > +static void > +try_recognize_1_minus_x(fs_visitor *v, struct table *table) > +{ > + struct { > + int a_index, b_index; > + float a_val, b_val; > + > + bool valid; > + bool found; > + } info[table->len] = {0}; > + > + int c = 0; > + bblock_t *block = NULL; > + > + for (int i = 0; i < table->len; i++) { > + /* Skip some values that aren't useful to include this in optimization > */ > + if (table->imm[i].val == 0.0f || > + table->imm[i].val == 0.5f || > + table->imm[i].val == 1.0f) > + continue; > + > + for (int j = 0; j < table->len; j++) { > + if (table->imm[i].val == 1.0f - table->imm[j].val) { > + if (info[j].found) > + continue; > + > + if (block == NULL) > + block = table->imm[i].block; > + if (block != table->imm[i].block) > + continue; > + > + info[i].a_index = i; > + info[i].b_index = j; > + info[i].a_val = table->imm[i].val; > + info[i].b_val = table->imm[j].val; > + info[i].valid = true; > + > + info[i].found = true; > + info[j].found = true; > + c++; > + } > + } > + } > + > + if (c == 8) {
c could be any multiple of 8, and this optimization would still make sense for any group of 8 that we can make, right? Maybe make this a loop with c / 8 iterations?
> * > * We sort by basic block number, then last use IP, then first use IP (least > @@ -241,6 +328,7 @@ fs_visitor::opt_combine_constants() > imm->val = val; > imm->uses_by_coissue = could_coissue(devinfo, inst); > imm->must_promote = must_promote_imm(devinfo, inst); > + imm->inserted = false; > imm->first_use_ip = ip; > imm->last_use_ip = ip; > } > @@ -267,11 +355,16 @@ fs_visitor::opt_combine_constants() > if (cfg->num_blocks != 1) > qsort(table.imm, table.len, sizeof(struct imm), compare); > > + try_recognize_1_minus_x(this, &table); > + > /* Insert MOVs to load the constant values into GRFs. */ > fs_reg reg(VGRF, alloc.allocate(1)); > reg.stride = 0; > for (int i = 0; i < table.len; i++) { > struct imm *imm = &table.imm[i]; > + if (imm->inserted) > + continue; > + > /* Insert it either before the instruction that generated the immediate > * or after the last non-control flow instruction of the common > ancestor. > */ _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev