Samuel Iglesias Gonsálvez <sigles...@igalia.com> writes: > Generalize it to lower any unsupported narrower conversion. > > v2 (Curro): > - Add supports_type_conversion() > - Reuse existing intruction instead of cloning it. > - Generalize d2x to narrower and equal size conversions. > > Signed-off-by: Samuel Iglesias Gonsálvez <sigles...@igalia.com> > --- > src/intel/compiler/brw_fs.cpp | 11 ++-- > src/intel/compiler/brw_fs_lower_d2x.cpp | 97 > ++++++++++++++++++++++++--------- > 2 files changed, 77 insertions(+), 31 deletions(-) > > diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp > index 6da23b17107..8612a83805d 100644 > --- a/src/intel/compiler/brw_fs.cpp > +++ b/src/intel/compiler/brw_fs.cpp > @@ -5694,11 +5694,6 @@ fs_visitor::optimize() > OPT(dead_code_eliminate); > } > > - if (OPT(lower_d2x)) { > - OPT(opt_copy_propagation); > - OPT(dead_code_eliminate); > - } > - > OPT(lower_simd_width); > > /* After SIMD lowering just in case we had to unroll the EOT send. */ > @@ -5745,6 +5740,12 @@ fs_visitor::optimize() > OPT(dead_code_eliminate); > } > > + if (OPT(lower_d2x)) { > + OPT(opt_copy_propagation); > + OPT(dead_code_eliminate); > + OPT(lower_simd_width); > + } > + > lower_uniform_pull_constant_loads(); > > validate(); > diff --git a/src/intel/compiler/brw_fs_lower_d2x.cpp > b/src/intel/compiler/brw_fs_lower_d2x.cpp > index a2db1154615..fa25d313dcd 100644 > --- a/src/intel/compiler/brw_fs_lower_d2x.cpp > +++ b/src/intel/compiler/brw_fs_lower_d2x.cpp > @@ -27,47 +27,92 @@ > > using namespace brw; > > +static bool > +supports_type_conversion(fs_inst *inst) {
Pointer should be marked const. > + switch(inst->opcode) { > + case BRW_OPCODE_MOV: > + case SHADER_OPCODE_MOV_INDIRECT: > + return true; > + case BRW_OPCODE_SEL: > + return false; I suggest you return 'inst->dst.type == get_exec_type_size(inst)' here in order to simplify the logic below and restructure things in a way that allows you to make opcode-specific decisions here based on the actual execution and destination types. > + default: > + /* FIXME: We assume the opcodes don't explicitly mentioned > + * before just work fine with arbitrary conversions. > + */ > + return true; > + } > +} > + > bool > fs_visitor::lower_d2x() > { > bool progress = false; > > foreach_block_and_inst_safe(block, fs_inst, inst, cfg) { > - if (inst->opcode != BRW_OPCODE_MOV) > - continue; > + bool inst_support_conversion = supports_type_conversion(inst); > + bool supported_conversion = > + inst_support_conversion && > + (get_exec_type_size(inst) != 8 || > + type_sz(inst->dst.type) > 4 || > + type_sz(inst->dst.type) >= get_exec_type_size(inst)); > > - if (inst->dst.type != BRW_REGISTER_TYPE_F && > - inst->dst.type != BRW_REGISTER_TYPE_D && > - inst->dst.type != BRW_REGISTER_TYPE_UD) > + /* If the conversion is supported or there is no conversion then > + * do nothing. > + */ > + if (supported_conversion || > + (!inst_support_conversion && inst->dst.type == inst->src[0].type) > || You should be using the execution type here (and below where you allocate the temp0 and temp1 vgrfs) instead of assuming it matches the type of the first source. Also I'm not 100% certain that the combination of this conditional continue, the one below, the if/else branches below and the supported_conversion definitions above catch the exact set of cases where you need to apply lowering. This would be a lot easier to follow (most of the above goes away) if you did something along the lines of: | if (supports_type_conversion(inst)) { | if (get_exec_type_size(inst) == 8 && type_sz(inst->dst.type) < 8) { | // Apply strided move workaround... | progress = true; | } | } else { | // Apply general conversion workaround... | progress = true; | } > + inst->dst.file == BAD_FILE || inst->src[0].file == BAD_FILE) I don't think the 'inst->dst.file == BAD_FILE || inst->src[0].file == BAD_FILE' terms of this expression are correct or useful, the 'inst->src[0].file' term because there might be a destination region conversion regardless of what the file of the first source is, the 'inst->dst.file == BAD_FILE' term because you could just rely on DCE instead... > continue; > > - if (inst->src[0].type != BRW_REGISTER_TYPE_DF && > - inst->src[0].type != BRW_REGISTER_TYPE_UQ && > - inst->src[0].type != BRW_REGISTER_TYPE_Q) > - continue; > + /* This pass only supports conversion to narrower or equal size types. > */ > + if (get_exec_type_size(inst) < type_sz(inst->dst.type)) > + continue; > I think you need to apply this condition to the if branch below that deals with DF to F conversions *only*, the else branch you probably need to apply regardless of whether the destination type is larger or smaller than the execution type. > - assert(inst->dst.file == VGRF); > assert(inst->saturate == false); I don't see why it's safe to rely on saturate being false here, in any case it would be straightforward to handle it by setting the flag on the emitted MOV instruction and clearing it on the original instruction. > - fs_reg dst = inst->dst; > > const fs_builder ibld(this, block, inst); > + fs_reg dst = inst->dst; > > - /* From the Broadwell PRM, 3D Media GPGPU, "Double Precision Float to > - * Single Precision Float": > - * > - * The upper Dword of every Qword will be written with undefined > - * value when converting DF to F. > - * > - * So we need to allocate a temporary that's two registers, and then do > - * a strided MOV to get the lower DWord of every Qword that has the > - * result. > - */ > - fs_reg temp = ibld.vgrf(inst->src[0].type, 1); > - fs_reg strided_temp = subscript(temp, inst->dst.type, 0); > - ibld.MOV(strided_temp, inst->src[0]); > - ibld.MOV(dst, strided_temp); > + if (inst_support_conversion && !supported_conversion) { > + /* From the Broadwell PRM, 3D Media GPGPU, "Double Precision Float > to > + * Single Precision Float": > + * > + * The upper Dword of every Qword will be written with undefined > + * value when converting DF to F. > + * > + * So we need to allocate a temporary that's two registers, and > then do > + * a strided MOV to get the lower DWord of every Qword that has the > + * result. > + */ > + fs_reg temp = ibld.vgrf(inst->src[0].type, 1); The '1' argument is redundant in all fs_builder::vgrf() calls here and below. > + fs_reg strided_temp = subscript(temp, dst.type, 0); > + > + /* We clone the original instruction as we are going to modify it > + * and emit another one after it. > + */ Comment seems stale. > + inst->dst = strided_temp; > + /* As it is an strided destination, we write n-times more being n > the > + * size difference between source and destination types. Update I guess by difference you meant ratio here and in the copy-pasted comment below (the comments also have a few minor spelling mistakes but that's unlikely to cause as much confusion). > + * size_written with the new destination. > + */ > + inst->size_written = inst->dst.component_size(inst->exec_size); I think I'd add an assert(inst->size_written == inst->dst.component_size(inst->exec_size)) right before you assign 'inst->dst' to point at the temporary in order to make sure you aren't accidentally miscompiling instructions that write multiple components. > + ibld.at(block, inst->next).MOV(dst, strided_temp); > + } else { > + fs_reg temp0 = ibld.vgrf(inst->src[0].type, 1); > + fs_reg temp1 = ibld.vgrf(inst->src[0].type, 1); > + fs_reg strided_temp1 = subscript(temp1, dst.type, 0); > + > + inst->dst = temp0; > + /* As it is an strided destination, we write n-times more being n > the > + * size difference between source and destination types. Update > + * size_written with the new destination. > + */ > + inst->size_written = inst->dst.component_size(inst->exec_size); > > - inst->remove(block); > + /* Now, do the conversion to original destination's type. */ > + fs_inst *mov = ibld.at(block, inst->next).MOV(strided_temp1, temp0); > + ibld.at(block, mov->next).MOV(dst, strided_temp1); Isn't this MOV and the strided_temp1 handling redundant? I'd expect it to be handled automatically during the next loop iteration of this pass (though you may have to switch back to foreach_block_and_inst() instead of foreach_block_and_inst_safe()), which would also result in better code because the extra move would only be emitted if this is an actual DF->F conversion. > + } > progress = true; > } > > -- > 2.11.0 > > _______________________________________________ > mesa-dev mailing list > mesa-dev@lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/mesa-dev
signature.asc
Description: PGP signature
_______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev