On 2 December 2013 11:31, Francisco Jerez <curroje...@riseup.net> wrote:
> --- > src/mesa/drivers/dri/i965/brw_fs.cpp | 29 > +++++++++++++++++++--- > src/mesa/drivers/dri/i965/brw_fs.h | 3 +++ > .../drivers/dri/i965/brw_fs_copy_propagation.cpp | 7 +++++- > src/mesa/drivers/dri/i965/brw_fs_generator.cpp | 8 +++--- > .../drivers/dri/i965/brw_fs_live_variables.cpp | 2 +- > src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp | 4 +-- > 6 files changed, 43 insertions(+), 10 deletions(-) > > diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp > b/src/mesa/drivers/dri/i965/brw_fs.cpp > index 0caae2d..e4cee33 100644 > --- a/src/mesa/drivers/dri/i965/brw_fs.cpp > +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp > @@ -380,6 +380,7 @@ fs_reg::init() > { > memset(this, 0, sizeof(*this)); > this->smear = -1; > + stride = 1; > } > > /** Generic unset register constructor. */ > @@ -445,6 +446,7 @@ fs_reg::equals(const fs_reg &r) const > memcmp(&fixed_hw_reg, &r.fixed_hw_reg, > sizeof(fixed_hw_reg)) == 0 && > smear == r.smear && > + stride == r.stride && > imm.u == r.imm.u); > } > > @@ -456,6 +458,22 @@ fs_reg::retype(uint32_t type) > return result; > } > > +fs_reg & > +fs_reg::apply_stride(unsigned stride) > +{ > + assert((this->stride * stride) <= 4 && > + is_power_of_two(stride) && > + file != HW_REG && file != IMM); > This function relies on the (IMO surprising) fact that is_power_of_two(0) == true. Can we add a comment to clarify that we're taking advantage of this to allow a stride of 0? Otherwise a reader might reasonably assume that strides of 0 were not allowed. > + this->stride *= stride; > + return *this; > +} > + > +bool > +fs_reg::is_contiguous() const > +{ > + return stride == 1; > +} > + > bool > fs_reg::is_valid_3src() const > { > @@ -686,7 +704,7 @@ fs_inst::is_partial_write() > { > return ((this->predicate && this->opcode != BRW_OPCODE_SEL) || > this->force_uncompressed || > - this->force_sechalf); > + this->force_sechalf || !this->dst.is_contiguous()); > } > > int > @@ -2246,6 +2264,7 @@ fs_visitor::register_coalesce_2() > inst->src[0].negate || > inst->src[0].abs || > inst->src[0].smear != -1 || > + !inst->src[0].is_contiguous() || > inst->dst.file != GRF || > inst->dst.type != inst->src[0].type || > virtual_grf_sizes[inst->src[0].reg] != 1) { > @@ -2338,6 +2357,7 @@ fs_visitor::register_coalesce() > bool has_source_modifiers = (inst->src[0].abs || > inst->src[0].negate || > inst->src[0].smear != -1 || > + !inst->src[0].is_contiguous() || > inst->src[0].file == UNIFORM); > > /* Found a move of a GRF to a GRF. Let's see if we can coalesce > @@ -2422,7 +2442,9 @@ fs_visitor::register_coalesce() > } > new_src.negate ^= scan_inst->src[i].negate; > new_src.sechalf = scan_inst->src[i].sechalf; > - new_src.subreg_offset += scan_inst->src[i].subreg_offset; > + new_src.subreg_offset += > + scan_inst->src[i].subreg_offset * new_src.stride; > + new_src.stride *= scan_inst->src[i].stride; > scan_inst->src[i] = new_src; > } > } > @@ -2458,7 +2480,8 @@ fs_visitor::compute_to_mrf() > inst->dst.file != MRF || inst->src[0].file != GRF || > inst->dst.type != inst->src[0].type || > inst->src[0].abs || inst->src[0].negate || > - inst->src[0].smear != -1 || inst->src[0].subreg_offset) > + inst->src[0].smear != -1 || !inst->src[0].is_contiguous() || > + inst->src[0].subreg_offset) > continue; > > /* Work out which hardware MRF registers are written by this > diff --git a/src/mesa/drivers/dri/i965/brw_fs.h > b/src/mesa/drivers/dri/i965/brw_fs.h > index 93a393d..b0ce812 100644 > --- a/src/mesa/drivers/dri/i965/brw_fs.h > +++ b/src/mesa/drivers/dri/i965/brw_fs.h > @@ -77,13 +77,16 @@ public: > > bool equals(const fs_reg &r) const; > bool is_valid_3src() const; > + bool is_contiguous() const; > fs_reg retype(uint32_t type); > + fs_reg &apply_stride(unsigned stride); > > bool negate; > bool abs; > bool sechalf; > int smear; /* -1, or a channel of the reg to smear to all channels. */ > int subreg_offset; /**< Offset in bytes from the start of the > register. */ > + int stride; /**< Register region horizontal stride */ > > fs_reg *reladdr; > }; > diff --git a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp > b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp > index f3f44c6..2f2d6b6 100644 > --- a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp > +++ b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp > @@ -298,7 +298,11 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int > arg, acp_entry *entry) > bool has_source_modifiers = entry->src.abs || entry->src.negate; > > if ((has_source_modifiers || entry->src.file == UNIFORM || > - entry->src.smear != -1) && !can_do_source_mods(inst)) > + entry->src.smear != -1 || !entry->src.is_contiguous()) && > + !can_do_source_mods(inst)) > + return false; > + > + if (entry->src.stride * inst->src[arg].stride > 4) > return false; > > if (has_source_modifiers && entry->dst.type != inst->src[arg].type) > @@ -310,6 +314,7 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, > acp_entry *entry) > if (entry->src.smear != -1) > inst->src[arg].smear = entry->src.smear; > inst->src[arg].subreg_offset = entry->src.subreg_offset; > + inst->src[arg].stride *= entry->src.stride; > > if (!inst->src[arg].abs) { > inst->src[arg].abs = entry->src.abs; > diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp > b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp > index a0b6135..66321bd 100644 > --- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp > +++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp > @@ -988,11 +988,13 @@ brw_reg_from_fs_reg(fs_reg *reg) > switch (reg->file) { > case GRF: > case MRF: > - if (reg->smear == -1) { > - brw_reg = brw_vec8_reg(brw_file_from_reg(reg), reg->reg, 0); > + if (reg->stride == 0 || reg->smear >= 0) { > + brw_reg = brw_vec1_reg(brw_file_from_reg(reg), reg->reg, > reg->smear); > } else { > - brw_reg = brw_vec1_reg(brw_file_from_reg(reg), reg->reg, > reg->smear); > + brw_reg = brw_vec8_reg(brw_file_from_reg(reg), reg->reg, 0); > + brw_reg = stride(brw_reg, 8 * reg->stride, 8, reg->stride); > } > + > brw_reg = retype(brw_reg, reg->type); > if (reg->sechalf) > brw_reg = sechalf(brw_reg); > diff --git a/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp > b/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp > index 21b2618..1a2d398 100644 > --- a/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp > +++ b/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp > @@ -84,7 +84,7 @@ fs_live_variables::setup_one_read(bblock_t *block, > fs_inst *inst, > * would get stomped by the first decode as well. > */ > int end_ip = ip; > - if (v->dispatch_width == 16 && (reg.smear != -1 || > + if (v->dispatch_width == 16 && (reg.smear != -1 || reg.stride == 0 || > (v->pixel_x.reg == reg.reg || > v->pixel_y.reg == reg.reg))) { > end_ip++; > diff --git a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp > b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp > index 2d35909..3290004 100644 > --- a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp > +++ b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp > @@ -591,7 +591,7 @@ fs_visitor::choose_spill_reg(struct ra_graph *g) > * loading pull constants, so spilling them is unlikely to > reduce > * register pressure anyhow. > */ > - if (inst->src[i].smear >= 0) { > + if (inst->src[i].smear >= 0 || !inst->src[i].is_contiguous()) > { > no_spill[inst->src[i].reg] = true; > } > } > @@ -600,7 +600,7 @@ fs_visitor::choose_spill_reg(struct ra_graph *g) > if (inst->dst.file == GRF) { > spill_costs[inst->dst.reg] += inst->regs_written * loop_scale; > > - if (inst->dst.smear >= 0) { > + if (inst->dst.smear >= 0 || !inst->dst.is_contiguous()) { > no_spill[inst->dst.reg] = true; > } > } > In v2 of this patch, you add the following code to fs_visitor::try_copy_propagate(): + /* Bail if the result of composing both strides cannot be expressed + * as another stride. + */ + if (entry->src.stride != 1 && + (inst->src[arg].stride * + type_sz(inst->src[arg].type)) % type_sz(entry->src.type) != 0) return false; I don't understand. It seems like this is trying to make sure that (src_stride * src_type_sz) / entry_type_sz is an integer so we can later use the factor (src_type_sz / entry_type_sz) as a multiplicative factor to correct the stride without creating a fractional result. But I don't see us applying this correction factor anywhere. Is there some code missing?
_______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev