Jose Maria Casanova Crespo <jmcasan...@igalia.com> writes: > We use the information of the registers read/write patterns > to improve variable liveness analysis avoiding extending the > liveness range of a variable to the beginning of the block so > it always reaches the beginning of the shader. > > This optimization analyses inside each block if a partial write > defines completely the bytes used by a following instruction > in the block. So we are not in the case of the use of an undefined > value in the block. > > This avoids almost all the spilling that happens with 8bit/16bit > storage tests, without any compilation performance impact for shader-db > execution that is compensated by spilling reductions. > > At this moment we don't extend the logic to intra-block calculations > of livein/liveout to not hurt performance on the general case because of > not taking advance of BITWORD operations. > > The execution time for running dEQP-VK.*8bit_storage.* tests is reduced > from 7m27.966s to 13.015s. > > shader-bd on SKL shows improvements reducing spilling on > deus-ex-mankind-divided and dophin without increasing execution time. > > total instructions in shared programs: 14867218 -> 14863959 (-0.02%) > instructions in affected programs: 121570 -> 118311 (-2.68%) > helped: 38 > HURT: 0 > > total cycles in shared programs: 537923248 -> 537720965 (-0.04%) > cycles in affected programs: 63154229 -> 62951946 (-0.32%) > helped: 61 > HURT: 26 > > total loops in shared programs: 4828 -> 4828 (0.00%) > loops in affected programs: 0 -> 0 > helped: 0 > HURT: 0 > > total spills in shared programs: 7790 -> 7375 (-5.33%) > spills in affected programs: 2824 -> 2409 (-14.70%) > helped: 35 > HURT: 0 > > total fills in shared programs: 10557 -> 10024 (-5.05%) > fills in affected programs: 3752 -> 3219 (-14.21%) > helped: 38 > HURT: 0 > > v2: - Use functions dst_write_pattern and src_read_pattern > introduced in previous patch at v2. > - Avoid calculating read_pattern if defpartial is 0 > > Cc: Francisco Jerez <curroje...@riseup.net> > --- > src/intel/compiler/brw_fs_live_variables.cpp | 61 ++++++++++++-------- > src/intel/compiler/brw_fs_live_variables.h | 13 ++++- > 2 files changed, 47 insertions(+), 27 deletions(-) > > diff --git a/src/intel/compiler/brw_fs_live_variables.cpp > b/src/intel/compiler/brw_fs_live_variables.cpp > index 059f076fa51..d3559e3114f 100644 > --- a/src/intel/compiler/brw_fs_live_variables.cpp > +++ b/src/intel/compiler/brw_fs_live_variables.cpp > @@ -54,9 +54,9 @@ using namespace brw; > > void > fs_live_variables::setup_one_read(struct block_data *bd, fs_inst *inst, > - int ip, const fs_reg ®) > + int ip, int src, int reg_offset) > { > - int var = var_from_reg(reg); > + int var = var_from_reg(inst->src[src]) + reg_offset; > assert(var < num_vars); > > start[var] = MIN2(start[var], ip); > @@ -64,31 +64,48 @@ fs_live_variables::setup_one_read(struct block_data *bd, > fs_inst *inst, > > /* The use[] bitset marks when the block makes use of a variable (VGRF > * channel) without having completely defined that variable within the > - * block. > + * block. We take into account that a partial write could have defined > + * completely the read bytes in the block. > */ > - if (!BITSET_TEST(bd->def, var)) > - BITSET_SET(bd->use, var); > + if (!BITSET_TEST(bd->def, var)) { > + if (!bd->defpartial[var]) { > + BITSET_SET(bd->use, var);
Is there any measurable benefit from not executing the functionally equivalent else block in this case? > + } else { > + unsigned read_pattern = inst->src_read_pattern(src, reg_offset); > + if ((bd->defpartial[var] & read_pattern) != read_pattern) > + BITSET_SET(bd->use, var); > + } > + } > } > > void > fs_live_variables::setup_one_write(struct block_data *bd, fs_inst *inst, > - int ip, const fs_reg ®) > + int ip, int reg_offset) > { > - int var = var_from_reg(reg); > + int var = var_from_reg(inst->dst) + reg_offset; > assert(var < num_vars); > > start[var] = MIN2(start[var], ip); > end[var] = MAX2(end[var], ip); > > /* The def[] bitset marks when an initialization in a block completely > - * screens off previous updates of that variable (VGRF channel). > + * screens off previous updates of that variable (VGRF channel). If > + * we have a partial write now we store the write pattern so next > + * reads in the block can check if what they read was completelly screened "completely" > + * of by this partial write. "off" > */ > - if (inst->dst.file == VGRF) { > - if (!inst->is_partial_write() && !BITSET_TEST(bd->use, var)) > + assert(inst->dst.file == VGRF); > + if(!BITSET_TEST(bd->use, var)) { > + if (!inst->is_partial_write()) { > BITSET_SET(bd->def, var); > - > - BITSET_SET(bd->defout, var); > + bd->defpartial[var] = ~0u; Same question as above. Is there any measurable benefit from not executing the functionally equivalent else block here when the if condition evaluates to true? > + } else { > + bd->defpartial[var] |= inst->dst_write_pattern(reg_offset); > + if (bd->defpartial[var] == ~0u) > + BITSET_SET(bd->def, var); > + } > } > + BITSET_SET(bd->defout, var); > } > > /** > @@ -115,14 +132,9 @@ fs_live_variables::setup_def_use() > foreach_inst_in_block(fs_inst, inst, block) { > /* Set use[] for this instruction */ > for (unsigned int i = 0; i < inst->sources; i++) { > - fs_reg reg = inst->src[i]; > - > - if (reg.file != VGRF) > - continue; > - > - for (unsigned j = 0; j < regs_read(inst, i); j++) { > - setup_one_read(bd, inst, ip, reg); > - reg.offset += REG_SIZE; > + if (inst->src[i].file == VGRF) { > + for (unsigned j = 0; j < regs_read(inst, i); j++) > + setup_one_read(bd, inst, ip, i, j); > } > } > > @@ -130,11 +142,8 @@ fs_live_variables::setup_def_use() > > /* Set def[] for this instruction */ > if (inst->dst.file == VGRF) { > - fs_reg reg = inst->dst; > - for (unsigned j = 0; j < regs_written(inst); j++) { > - setup_one_write(bd, inst, ip, reg); > - reg.offset += REG_SIZE; > - } > + for (unsigned j = 0; j < regs_written(inst); j++) > + setup_one_write(bd, inst, ip, j); > } > > if (!inst->predicate && inst->exec_size >= 8) > @@ -281,6 +290,7 @@ fs_live_variables::fs_live_variables(fs_visitor *v, const > cfg_t *cfg) > bitset_words = BITSET_WORDS(num_vars); > for (int i = 0; i < cfg->num_blocks; i++) { > block_data[i].def = rzalloc_array(mem_ctx, BITSET_WORD, bitset_words); > + block_data[i].defpartial = rzalloc_array(mem_ctx, unsigned, num_vars); > block_data[i].use = rzalloc_array(mem_ctx, BITSET_WORD, bitset_words); > block_data[i].livein = rzalloc_array(mem_ctx, BITSET_WORD, > bitset_words); > block_data[i].liveout = rzalloc_array(mem_ctx, BITSET_WORD, > bitset_words); > @@ -319,6 +329,7 @@ fs_visitor::invalidate_live_intervals() > void > fs_visitor::calculate_live_intervals() > { > + Stray whitespace. > if (this->live_intervals) > return; > > diff --git a/src/intel/compiler/brw_fs_live_variables.h > b/src/intel/compiler/brw_fs_live_variables.h > index 9e95e443170..5842b24f3e5 100644 > --- a/src/intel/compiler/brw_fs_live_variables.h > +++ b/src/intel/compiler/brw_fs_live_variables.h > @@ -44,6 +44,15 @@ struct block_data { > */ > BITSET_WORD *def; > > + /** > + * Which bytes of a variable are defined before being used in the block. > + * Each bit of the 32-bit integer represents one of the register 32 bytes. > + * > + * As above, "defined" means unconditionally defined but at byte level. > + */ > + > + unsigned *defpartial; > + > /** > * Which variables are used before being defined in the block. > */ > @@ -115,9 +124,9 @@ public: > protected: > void setup_def_use(); > void setup_one_read(struct block_data *bd, fs_inst *inst, int ip, > - const fs_reg ®); > + int src, int reg_offset); > void setup_one_write(struct block_data *bd, fs_inst *inst, int ip, > - const fs_reg ®); > + int reg_offset); > void compute_live_variables(); > void compute_start_end(); > > -- > 2.17.1
signature.asc
Description: PGP signature
_______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev