On 09/09/2015 07:10 PM, Jason Ekstrand wrote: > > On Sep 8, 2015 23:27, "Eduardo Lima Mitev" <el...@igalia.com > <mailto:el...@igalia.com>> wrote: >> >> This pass will propagate the destination components of a vecN > instructions, >> as destination of the instructions that define its sources; if certain >> conditions are met. >> >> If all the components of the destination register in the vecN instruction >> can be propagated, the instruction is removed. Otherwise, a new, reduced >> vecN instruction is emitted with the channels that remained. >> >> This effectively coalesces registers and reduces indirection. >> >> By now, this pass will only propagate to ALU instructions, but it could >> be extended to include other instructions like load_input intrinsic. >> >> It also propagates to instructions within the same block as the vecN >> instruction. But it could be made to work cross-block in the future, >> though there are non-trivial issues with this like considering >> registers that are written in different branches of a conditional. >> More analysis is needed to correctly cover these cases. >> >> This pass works on a NIR shader in final form (after SSA), and is >> expected to run before nir_lower_vec_to_movs(). 
>> --- >> src/glsl/Makefile.sources | 1 + >> src/glsl/nir/nir.h | 1 + >> src/glsl/nir/nir_lower_vec_and_coalesce.c | 301 > ++++++++++++++++++++++++++++++ >> 3 files changed, 303 insertions(+) >> create mode 100644 src/glsl/nir/nir_lower_vec_and_coalesce.c >> >> diff --git a/src/glsl/Makefile.sources b/src/glsl/Makefile.sources >> index c422303..015f242 100644 >> --- a/src/glsl/Makefile.sources >> +++ b/src/glsl/Makefile.sources >> @@ -48,6 +48,7 @@ NIR_FILES = \ >> nir/nir_lower_vars_to_ssa.c \ >> nir/nir_lower_var_copies.c \ >> nir/nir_lower_vec_to_movs.c \ >> + nir/nir_lower_vec_and_coalesce.c \ >> nir/nir_metadata.c \ >> nir/nir_normalize_cubemap_coords.c \ >> nir/nir_opt_constant_folding.c \ >> diff --git a/src/glsl/nir/nir.h b/src/glsl/nir/nir.h >> index 3c375f3..6a89f1d 100644 >> --- a/src/glsl/nir/nir.h >> +++ b/src/glsl/nir/nir.h >> @@ -1786,6 +1786,7 @@ void nir_lower_vars_to_ssa(nir_shader *shader); >> void nir_remove_dead_variables(nir_shader *shader); >> >> void nir_lower_vec_to_movs(nir_shader *shader); >> +void nir_lower_vec_and_coalesce(nir_shader *shader); >> void nir_lower_alu_to_scalar(nir_shader *shader); >> void nir_lower_load_const_to_scalar(nir_shader *shader); >> >> diff --git a/src/glsl/nir/nir_lower_vec_and_coalesce.c > b/src/glsl/nir/nir_lower_vec_and_coalesce.c >> new file mode 100644 >> index 0000000..2b21ec1 >> --- /dev/null >> +++ b/src/glsl/nir/nir_lower_vec_and_coalesce.c >> @@ -0,0 +1,301 @@ >> +/* >> + * Copyright © 2015 Intel Corporation >> + * >> + * Permission is hereby granted, free of charge, to any person > obtaining a >> + * copy of this software and associated documentation files (the > "Software"), >> + * to deal in the Software without restriction, including without > limitation >> + * the rights to use, copy, modify, merge, publish, distribute, > sublicense, >> + * and/or sell copies of the Software, and to permit persons to whom the >> + * Software is furnished to do so, subject to the following conditions: >> + * >> + 
* The above copyright notice and this permission notice (including > the next >> + * paragraph) shall be included in all copies or substantial portions > of the >> + * Software. >> + * >> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, > EXPRESS OR >> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF > MERCHANTABILITY, >> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT > SHALL >> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES > OR OTHER >> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, > ARISING >> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR > OTHER DEALINGS >> + * IN THE SOFTWARE. >> + * >> + * Authors: >> + * Eduardo Lima Mitev (el...@igalia.com <mailto:el...@igalia.com>) >> + * >> + */ >> + >> +#include "nir.h" >> + >> +/* >> + * Implements a pass that lowers vecN instructions by propagating the >> + * components of their destinations, as the destination of the >> + * instructions that defines the sources of the vecN instruction. >> + * >> + * This effectively coalesces registers and reduces indirection. >> + * >> + * If all the components of the destination register in the vecN >> + * instruction can be propagated, the instruction is removed. Otherwise, >> + * a new, reduced vecN instruction is emitted with the channels that >> + * remained. >> + * >> + * By now, this pass will only propagate to ALU instructions, but it > could >> + * be extended to include load_const instructions or some intrinsics like >> + * load_input. >> + * >> + * This pass works on a NIR shader in final form (after SSA), and is >> + * expected to run before nir_lower_vec_to_movs(). >> + */ >> + >> +/** >> + * Clone an ALU instruction and override the destination with the one > given by >> + * new_dest. It copies sources from original ALU to the new one, > adjusting >> + * their swizzles. >> + * >> + * Returns the new ALU instruction. 
>> + */ >> +static nir_alu_instr * >> +clone_alu_instr_and_override_dest(nir_alu_instr *alu_instr, >> + nir_alu_dest *new_dest, unsigned index, >> + void *mem_ctx) >> +{ >> + nir_alu_instr *new_alu_instr = nir_alu_instr_create(mem_ctx, > alu_instr->op); >> + >> + for (unsigned i = 0; i < nir_op_infos[alu_instr->op].num_inputs; > i++) { >> + nir_alu_src_copy(&new_alu_instr->src[i], &alu_instr->src[i], > mem_ctx); >> + >> + /* Dot-product operations should not be swizzle'd */ >> + switch (alu_instr->op) { >> + case nir_op_fdot2: >> + case nir_op_fdot3: >> + case nir_op_fdot4: >> + continue; >> + default: >> + break; >> + } > > You should use nir_op_infos[alu_instr->op].output_size == 0 to determine > whether or not the instruction is per-channel. > > You also need to check that nir_op_infos[op].input_sizes[i] == 0 to make > sure that the components of the source actually map to components of > the destination. If output_size == 0 but input_sizes[i] != 0 for some > source, we should probably bail entirely. The one exception to this > that I can think of off-hand is bcsel where it has a single > one-component source and you are free to reswizzle the others. >
Good to know! I will definitely consider this in the future. >> + new_alu_instr->src[i].swizzle[index] = >> + alu_instr->src[i].swizzle[0]; >> + } >> + >> + nir_alu_dest_copy(&new_alu_instr->dest, new_dest, mem_ctx); >> + new_alu_instr->dest.write_mask = 1 << index; >> + >> + return new_alu_instr; >> +} >> + >> +/** >> + * Simply searches an entry in the list, and return true if found, false >> + * otherwise. >> + */ >> +static bool >> +register_already_tracked(const nir_register *reg, nir_register *list[4], >> + unsigned num_items) >> +{ >> + for (unsigned i = 0; i < num_items; i++) { >> + if (list[i] == reg) >> + return true; >> + } >> + return false; >> +} >> + >> +/** >> + * Tells whether an instruction comes before another one inside the same >> + * block. Return true if 'target' appears after 'other', false otherwise. >> + */ >> +static bool >> +instr_is_previous_in_block(nir_instr *target, nir_instr *other) >> +{ >> + nir_instr *prev = target; >> + while ((prev = nir_instr_prev(prev)) != NULL) { >> + if (prev == other) >> + return true; >> + } >> + return false; >> +} >> + >> +static bool >> +lower_vec_and_coalesce_block(nir_block *block, void *mem_ctx) >> +{ >> + nir_foreach_instr_safe(block, instr) { >> + if (instr->type != nir_instr_type_alu) >> + continue; >> + >> + nir_alu_instr *vec = nir_instr_as_alu(instr); >> + >> + switch (vec->op) { >> + case nir_op_vec2: >> + case nir_op_vec3: >> + case nir_op_vec4: >> + break; >> + default: >> + continue; /* The loop */ >> + } >> + >> + /* Since we insert multiple MOVs, we have to be non-SSA. 
*/ >> + assert(!vec->dest.dest.is_ssa); >> + >> + unsigned finished_write_mask = 0; >> + nir_register *tracked_registers[4] = {0}; >> + unsigned num_tracked_registers = 0; >> + >> + for (unsigned i = 0; i < 4; i++) { >> + if (!(vec->dest.write_mask & (1 << i))) >> + continue; >> + >> + if (vec->src[i].src.is_ssa) >> + continue; >> + >> + nir_register *reg = vec->src[i].src.reg.reg; >> + >> + /* By now, lets ignore the sources whose register is the > same as the >> + * destination register of the vecN instruction. >> + */ >> + if (reg == vec->dest.dest.reg.reg) >> + continue; >> + >> + nir_foreach_def_safe(reg, src) { >> + nir_instr *parent_instr = src->reg.parent_instr; >> + >> + /* By now, we enforce that the parent instruction must be > in the >> + * same block as the vecN instruction. This is because if > it is in >> + * other block, like a branch of a conditional, we need > to be sure >> + * that the register component is propagated in all branches. >> + * Otherwise we cannot reduce the current vecN instruction. >> + * @TODO: To extend this pass to work across blocks, more > analysis >> + * is needed. >> + */ >> + if (parent_instr->block != block) >> + continue; >> + >> + /* The instruction to propagate to must come before the vecN >> + * instruction inside the block. >> + */ >> + if (!instr_is_previous_in_block(instr, parent_instr)) >> + continue; >> + >> + /* We only coalesce registers written by ALU > instructions, by now. >> + * @TODO: consider other type of instructions, like > intrinsics, etc. >> + */ >> + if (parent_instr->type != nir_instr_type_alu) >> + continue; >> + >> + nir_alu_instr *parent_alu_instr = > nir_instr_as_alu(parent_instr); >> + nir_register *parent_dest_reg = > parent_alu_instr->dest.dest.reg.reg; >> + >> + /* We only override dest registers that are only used > once, and in >> + * this vecX instruction. >> + * @TODO: In the future we might consider registers used > more than >> + * once as sources of the same vecX instruction. 
>> + */ > > Yeah, we definitely want to handle that case and we want to handle it by > emitting a single instruction that does multiple channels. > I started implementing that as part of this patch but quickly realized it was not trivial and basically postponed it until I got some feedback for the basics. >> + if (list_length(&parent_dest_reg->uses) != 1) >> + continue; >> + >> + nir_alu_instr *new_alu_instr = >> + clone_alu_instr_and_override_dest(parent_alu_instr, > &vec->dest, >> + i, mem_ctx); > > This mess would be a lot easier if we were still in SSA form. I've been > thinking for a while that we should make lower_vec_to_movs work in SSA > form (it would replace SSA destinations with register destinations). > Then use partial SSA lowering like we do in the FS backend. That would > make this pass much easier to get correct. > I didn't know about the partial SSA pass in FS backend, I could have got some inspiration there. But I did try hard to implement this while in SSA form at the beginning, without much success because the combination of nir_from_ssa and lower_vec_to_movs would screw things after any SSA pass. > In general, I'm kind of thinking that this might be better done as an > extension to the lower_vec_to_movs pass or as a new version of it. You > would loop through the sources of a vecN and, if you can rewrite a > destination, you do and if you can't, you insert moves. We may also > want it to handle SSA while we're at it. Note: it will get run after > nir_from_ssa so it will have to handle partial SSA where some > destinations are SSA and some are registers. > Actually, the first version of this patch, I wrote it against lower_vec_to_movs, but then Matt suggested that likely it would be easier for the opt_register_coalesce pass to avoid lower_vec_to_movs and deal with vecN instructions in the backend. 
Then I rewrote this pass as a separate thing that would re-emit reduced vecN instructions, to essentially make it compatible with Matt's suggestion, thus future proof :). > Does this seem reasonable? > Absolutely. The first thing I noticed is that lowering stuff in NIR out of SSA form is not very elegant and also not very well supported (lack of utility APIs). I see you sent a series to the ML implementing some of the ideas in this patch. The shader-db results are impressive! I will use that as training material :) and follow the review process closely. This also means I stop working on this pass. Thanks a lot for the review! Eduardo >> + /* For imov and fmov operations, we set the original > swizzle from >> + * the vecN source, directly into the new instruction's only >> + * source. >> + */ >> + if (parent_alu_instr->op == nir_op_imov || >> + parent_alu_instr->op == nir_op_fmov) { >> + new_alu_instr->src[0].swizzle[0] = vec->src[i].swizzle[i]; >> + } >> + >> + finished_write_mask |= new_alu_instr->dest.write_mask; >> + >> + /* Remove the old instruction */ >> + nir_instr_remove(&parent_alu_instr->instr); >> + ralloc_free(parent_alu_instr); >> + >> + /* Track the intermediate register, to remove it later if > not used */ >> + if (!register_already_tracked(parent_dest_reg, > tracked_registers, >> + num_tracked_registers)) { >> + tracked_registers[num_tracked_registers] = > parent_dest_reg; >> + num_tracked_registers++; >> + } >> + >> + /* Insert the new instruction with the overwritten > destination */ >> + nir_instr_insert_before(&vec->instr, &new_alu_instr->instr); >> + } >> + } >> + >> + if (finished_write_mask == 0) >> + continue; >> + >> + nir_alu_instr *new_alu_instr = nir_alu_instr_create(mem_ctx, > nir_op_vec4); >> + nir_alu_dest_copy(&new_alu_instr->dest, &vec->dest, mem_ctx); >> + new_alu_instr->dest.write_mask = 0; >> + >> + unsigned c = 0; >> + for (unsigned i = 0; i < nir_op_infos[vec->op].num_inputs; i++) { >> + if (!(vec->dest.write_mask & (1 << i))) 
>> + continue; >> + >> + if (finished_write_mask & (1 << i)) >> + continue; >> + >> + nir_alu_src_copy(&new_alu_instr->src[c], &vec->src[i], mem_ctx); >> + new_alu_instr->src[c].swizzle[i] = vec->src[i].swizzle[c]; >> + >> + new_alu_instr->dest.write_mask |= (1 << i); >> + >> + c++; >> + } >> + >> + switch (c) { >> + case 0: >> + /* we coalesced all register components, so no need to emit any >> + * reduced instruction. >> + */ >> + ralloc_free(new_alu_instr); >> + break; >> + case 1: >> + new_alu_instr->op = nir_op_imov; >> + break; >> + case 2: >> + new_alu_instr->op = nir_op_vec2; >> + break; >> + case 3: >> + new_alu_instr->op = nir_op_vec3; >> + break; >> + default: >> + unreachable("Not reached"); >> + } >> + >> + if (c != 0) >> + nir_instr_insert_before(&vec->instr, &new_alu_instr->instr); >> + >> + /* Remove the original vecN instruction */ >> + nir_instr_remove(&vec->instr); >> + ralloc_free(vec); >> + >> + /* Remove the tracked registers that are no longer used */ >> + for (unsigned i = 0; i < num_tracked_registers; i++) { >> + if (list_length(&tracked_registers[i]->defs) == 0 && >> + list_length(&tracked_registers[i]->uses) == 0 && >> + list_length(&tracked_registers[i]->if_uses) == 0) { >> + nir_reg_remove(tracked_registers[i]); >> + } >> + } >> + } >> + >> + return true; >> +} >> + >> +static void >> +nir_lower_vec_and_coalesce_impl(nir_function_impl *impl) >> +{ >> + nir_foreach_block(impl, lower_vec_and_coalesce_block, > ralloc_parent(impl)); >> +} >> + >> +void >> +nir_lower_vec_and_coalesce(nir_shader *shader) >> +{ >> + nir_foreach_overload(shader, overload) { >> + if (overload->impl) >> + nir_lower_vec_and_coalesce_impl(overload->impl); >> + } >> +} >> -- >> 2.4.6 >> > _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev