Samuel Iglesias Gonsálvez <sigles...@igalia.com> writes: > Generalize it to lower any unsupported narrower conversion. > > v2 (Curro): > - Add supports_type_conversion() > - Reuse existing intruction instead of cloning it. > - Generalize d2x to narrower and equal size conversions. > > v3 (Curro): > - Make supports_type_conversion() const and improve it. > - Use foreach_block_and_inst to process added instructions. > - Simplify code. > - Add assert and improve comments. > - Remove redundant mov. > - Remove useless comment. > - Remove saturate == false assert and add support for saturation > when fixing the conversion. > - Add get_exec_type() function. > > Signed-off-by: Samuel Iglesias Gonsálvez <sigles...@igalia.com> > --- > > This patch replaces the respective v3 one. > > src/intel/compiler/brw_fs.cpp | 11 +-- > src/intel/compiler/brw_fs_lower_d2x.cpp | 117 > +++++++++++++++++++++++--------- > 2 files changed, 91 insertions(+), 37 deletions(-) > > diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp > index 086b1a04855..8eb8789905c 100644 > --- a/src/intel/compiler/brw_fs.cpp > +++ b/src/intel/compiler/brw_fs.cpp > @@ -5694,11 +5694,6 @@ fs_visitor::optimize() > OPT(dead_code_eliminate); > } > > - if (OPT(lower_d2x)) { > - OPT(opt_copy_propagation); > - OPT(dead_code_eliminate); > - } > - > OPT(lower_simd_width); > > /* After SIMD lowering just in case we had to unroll the EOT send. */ > @@ -5745,6 +5740,12 @@ fs_visitor::optimize() > OPT(dead_code_eliminate); > } > > + if (OPT(lower_d2x)) { > + OPT(opt_copy_propagation); > + OPT(dead_code_eliminate); > + OPT(lower_simd_width); > + } > + > lower_uniform_pull_constant_loads(); > > validate(); > diff --git a/src/intel/compiler/brw_fs_lower_d2x.cpp > b/src/intel/compiler/brw_fs_lower_d2x.cpp > index a2db1154615..9bc78fa969b 100644 > --- a/src/intel/compiler/brw_fs_lower_d2x.cpp > +++ b/src/intel/compiler/brw_fs_lower_d2x.cpp > @@ -27,48 +27,101 @@ > > using namespace brw; > > +static unsigned
The return type is not really an integer. > +get_exec_type(const fs_inst *inst) > +{ > + unsigned exec_type = 0; This isn't either. You could initialize this to BRW_REGISTER_TYPE_B which isn't a valid execution type so you can drop the whole has_valid_source tracking. > + bool has_valid_source = false; > + > + for (int i = 0; i < inst->sources; i++) { > + if (inst->src[i].type != BAD_FILE) { > + if (!has_valid_source) { > + exec_type = inst->src[i].type; > + has_valid_source = true; > + } else { > + if (type_sz(inst->src[i].type) > type_sz(exec_type)) > + exec_type = inst->src[i].type; Note that this doesn't handle vector immediates correctly (which give you either an F or W execution type), byte nor unsigned source types. I suggest you add a get_exec_type(brw_reg_type) overload you'd call here to translate a source type into the matching exec type. To handle cases where you have mixed floating point and integer types it would probably be sensible to do something along the lines of: | const brw_reg_type t = get_exec_type(inst->src[i].type); | // ... | if (type_sz(t) == type_sz(exec_type) && is_floating_point(t)) | exec_type = t; > + } > + } > + } > + > + /* FIXME: if this assert fails, then the instruction has no valid sources > */ > + assert(has_valid_source); > + You could assert 'exec_type != BRW_REGISTER_TYPE_HF || inst->dst.type == BRW_REGISTER_TYPE_HF' so we remember to handle half-float conversions here when they are enabled in the back-end. > + return exec_type; With this in place the get_exec_type_size(inst) helper seems redundant since it should be equivalent to type_sz(get_exec_type(inst)). I think this should replace PATCH 3. > +} > + > +static bool > +supports_type_conversion(const fs_inst *inst) { > + switch(inst->opcode) { > + case BRW_OPCODE_MOV: > + case SHADER_OPCODE_MOV_INDIRECT: > + return true; > + case BRW_OPCODE_SEL: > + return inst->dst.type == get_exec_type(inst); > + default: > + /* FIXME: We assume the opcodes don't explicitly mentioned > + * before just work fine with arbitrary conversions. > + */ > + return true; > + } > +} > + > bool > fs_visitor::lower_d2x() > { > bool progress = false; > > - foreach_block_and_inst_safe(block, fs_inst, inst, cfg) { > - if (inst->opcode != BRW_OPCODE_MOV) > - continue; > - > - if (inst->dst.type != BRW_REGISTER_TYPE_F && > - inst->dst.type != BRW_REGISTER_TYPE_D && > - inst->dst.type != BRW_REGISTER_TYPE_UD) > - continue; > + foreach_block_and_inst(block, fs_inst, inst, cfg) { > + const fs_builder ibld(this, block, inst); > + fs_reg dst = inst->dst; > + bool saturate = inst->saturate; > > - if (inst->src[0].type != BRW_REGISTER_TYPE_DF && > - inst->src[0].type != BRW_REGISTER_TYPE_UQ && > - inst->src[0].type != BRW_REGISTER_TYPE_Q) > - continue; > + if (supports_type_conversion(inst)) { > + if (get_exec_type_size(inst) == 8 && type_sz(inst->dst.type) < 8) { > + /* From the Broadwell PRM, 3D Media GPGPU, "Double Precision > Float to > + * Single Precision Float": > + * > + * The upper Dword of every Qword will be written with > undefined > + * value when converting DF to F. > + * > + * So we need to allocate a temporary that's two registers, and > then do > + * a strided MOV to get the lower DWord of every Qword that has > the > + * result. > + */ > + fs_reg temp = ibld.vgrf(inst->src[0].type); This should probably be using the execution type. > + fs_reg strided_temp = subscript(temp, dst.type, 0); > > - assert(inst->dst.file == VGRF); > - assert(inst->saturate == false); > - fs_reg dst = inst->dst; > + assert(inst->size_written == > inst->dst.component_size(inst->exec_size)); > + inst->dst = strided_temp; > + inst->saturate = false; > + /* As it is an strided destination, we write n-times more being > n the > + * size ratio between source and destination types. Update > + * size_written accordingly. > + */ > + inst->size_written = inst->dst.component_size(inst->exec_size); > + ibld.at(block, inst->next).MOV(dst, strided_temp)->saturate = > saturate; > > - const fs_builder ibld(this, block, inst); > + progress = true; > + } > + } else { > + fs_reg temp0 = ibld.vgrf(inst->src[0].type); Same here. With these taken into account and the get_exec_type() helper moved into PATCH 3 this patch is: Reviewed-by: Francisco Jerez <curroje...@riseup.net> > > - /* From the Broadwell PRM, 3D Media GPGPU, "Double Precision Float to > - * Single Precision Float": > - * > - * The upper Dword of every Qword will be written with undefined > - * value when converting DF to F. > - * > - * So we need to allocate a temporary that's two registers, and then do > - * a strided MOV to get the lower DWord of every Qword that has the > - * result. > - */ > - fs_reg temp = ibld.vgrf(inst->src[0].type, 1); > - fs_reg strided_temp = subscript(temp, inst->dst.type, 0); > - ibld.MOV(strided_temp, inst->src[0]); > - ibld.MOV(dst, strided_temp); > + assert(inst->size_written == > inst->dst.component_size(inst->exec_size)); > + inst->dst = temp0; > + /* As it is an strided destination, we write n-times more being n > the > + * size ratio between source and destination types. Update > + * size_written accordingly. > + */ > + inst->size_written = inst->dst.component_size(inst->exec_size); > + inst->saturate = false; > + /* Now, do the conversion to original destination's type. In next > iteration, > + * we will lower it if it is a d2f conversion. > + */ > + ibld.at(block, inst->next).MOV(dst, temp0)->saturate = saturate; > > - inst->remove(block); > - progress = true; > + progress = true; > + } > } > > if (progress) > -- > 2.11.0
signature.asc
Description: PGP signature
_______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev