On 09/03/2018 03:17 AM, andrey simiklit wrote: > Hi, > > I guess it is just a small mistake in description that: > 'z' was before: > z = y - x; > and became after: > z = x - y; > > I think you inadvertently misspelled in the description and you mean: > > > This pass attempts to detect code sequences like > > if (x < y) { > z = y - x; > ... > } > > and replace them with sequences like > > t = *y - x*; > if (t < 0) { > z = t; > ... > }
There is a mistake in the original comment, but this isn't correct either. (y - x) < 0 is not the same as x < y. :) The optimization does detect cases like this, but it would replace the comparison with a zero on the left (see the zero_on_left parameter to rewrite_compare_instruction): if (x < y) { z = y - x; ... } would become t = y - x; if (0 < t) { z = t; ... } > Because the subtraction is not a commutative operation) > Please correct me if I am incorrect. > > Regards, > Andrii. > > On Thu, Aug 30, 2018 at 8:36 AM Ian Romanick <i...@freedesktop.org > <mailto:i...@freedesktop.org>> wrote: > > From: Ian Romanick <ian.d.roman...@intel.com > <mailto:ian.d.roman...@intel.com>> > > This pass attempts to detect code sequences like > > if (x < y) { > z = y - x; > ... > } > > and replace them with sequences like > > t = x - y; > if (t < 0) { > z = t; > ... > } > > On architectures where the subtract can generate the flags used by the > if-statement, this saves an instruction. It's also possible that moving > an instruction out of the if-statement will allow > nir_opt_peephole_select to convert the whole thing to a bcsel. > > Currently only floating point compares and adds are supported. Adding > support for integer will be a challenge due to integer overflow. There > are a couple possible solutions, but they may not apply to all > architectures. > > v2: Fix a typo in the commit message and a couple typos in comments. > Fix possible NULL pointer deref from result of push_block(). Add > missing (-A + B) case. Suggested by Caio. 
> > Signed-off-by: Ian Romanick <ian.d.roman...@intel.com > <mailto:ian.d.roman...@intel.com>> > --- > src/compiler/Makefile.sources | 1 + > src/compiler/nir/meson.build | 1 + > src/compiler/nir/nir.h | 2 + > src/compiler/nir/nir_opt_comparison_pre.c | 360 > ++++++++++++++++++++++++++++++ > src/compiler/nir/nir_search_helpers.h | 29 +++ > 5 files changed, 393 insertions(+) > create mode 100644 src/compiler/nir/nir_opt_comparison_pre.c > > diff --git a/src/compiler/Makefile.sources > b/src/compiler/Makefile.sources > index d3b06564832..9fe8d5b8904 100644 > --- a/src/compiler/Makefile.sources > +++ b/src/compiler/Makefile.sources > @@ -267,6 +267,7 @@ NIR_FILES = \ > nir/nir_move_load_const.c \ > nir/nir_move_vec_src_uses_to_dest.c \ > nir/nir_normalize_cubemap_coords.c \ > + nir/nir_opt_comparison_pre.c \ > nir/nir_opt_conditional_discard.c \ > nir/nir_opt_constant_folding.c \ > nir/nir_opt_copy_prop_vars.c \ > diff --git a/src/compiler/nir/meson.build b/src/compiler/nir/meson.build > index 5438c17a8f8..2bcc854829e 100644 > --- a/src/compiler/nir/meson.build > +++ b/src/compiler/nir/meson.build > @@ -151,6 +151,7 @@ files_libnir = files( > 'nir_move_load_const.c', > 'nir_move_vec_src_uses_to_dest.c', > 'nir_normalize_cubemap_coords.c', > + 'nir_opt_comparison_pre.c', > 'nir_opt_conditional_discard.c', > 'nir_opt_constant_folding.c', > 'nir_opt_copy_prop_vars.c', > diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h > index ddbcb3c647e..c78387d0acf 100644 > --- a/src/compiler/nir/nir.h > +++ b/src/compiler/nir/nir.h > @@ -3000,6 +3000,8 @@ bool nir_convert_from_ssa(nir_shader *shader, > bool phi_webs_only); > bool nir_lower_phis_to_regs_block(nir_block *block); > bool nir_lower_ssa_defs_to_regs_block(nir_block *block); > > +bool nir_opt_comparison_pre(nir_shader *shader); > + > bool nir_opt_algebraic(nir_shader *shader); > bool nir_opt_algebraic_before_ffma(nir_shader *shader); > bool nir_opt_algebraic_late(nir_shader *shader); > diff --git 
a/src/compiler/nir/nir_opt_comparison_pre.c > b/src/compiler/nir/nir_opt_comparison_pre.c > new file mode 100644 > index 00000000000..b2827c21816 > --- /dev/null > +++ b/src/compiler/nir/nir_opt_comparison_pre.c > @@ -0,0 +1,360 @@ > +/* > + * Copyright © 2018 Intel Corporation > + * > + * Permission is hereby granted, free of charge, to any person > obtaining a > + * copy of this software and associated documentation files (the > "Software"), > + * to deal in the Software without restriction, including without > limitation > + * the rights to use, copy, modify, merge, publish, distribute, > sublicense, > + * and/or sell copies of the Software, and to permit persons to > whom the > + * Software is furnished to do so, subject to the following conditions: > + * > + * The above copyright notice and this permission notice (including > the next > + * paragraph) shall be included in all copies or substantial > portions of the > + * Software. > + * > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, > EXPRESS OR > + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF > MERCHANTABILITY, > + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO > EVENT SHALL > + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, > DAMAGES OR OTHER > + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, > ARISING > + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR > OTHER DEALINGS > + * IN THE SOFTWARE. > + */ > + > +#include "nir_instr_set.h" > +#include "nir_search_helpers.h" > +#include "nir_builder.h" > +#include "util/u_vector.h" > + > +/* Partial redundancy elimination of compares > + * > + * Seaches for comparisons of the form 'a cmp b' that dominate > arithmetic > + * instructions like 'b - a'. The comparison is replaced by the > arithmetic > + * instruction, and the result is compared with zero. 
For example, > + * > + * vec1 32 ssa_111 = flt 0.37, ssa_110.w > + * if ssa_111 { > + * block block_1: > + * vec1 32 ssa_112 = fadd ssa_110.w, -0.37 > + * ... > + * > + * becomes > + * > + * vec1 32 ssa_111 = fadd ssa_110.w, -0.37 > + * vec1 32 ssa_112 = flt 0.0, ssa_111 > + * if ssa_112 { > + * block block_1: > + * ... > + */ > + > +struct block_queue { > + struct exec_list blocks; > + struct exec_list reusable_blocks; > +}; > + > +struct block_instructions { > + struct exec_node node; > + struct u_vector instructions; > +}; > + > +static void > +block_queue_init(struct block_queue *bq) > +{ > + exec_list_make_empty(&bq->blocks); > + exec_list_make_empty(&bq->reusable_blocks); > +} > + > +static void > +block_queue_finish(struct block_queue *bq) > +{ > + struct block_instructions *n; > + > + while ((n = (struct block_instructions *) > exec_list_pop_head(&bq->blocks)) != NULL) { > + u_vector_finish(&n->instructions); > + free(n); > + } > + > + while ((n = (struct block_instructions *) > exec_list_pop_head(&bq->reusable_blocks)) != NULL) { > + free(n); > + } > +} > + > +static struct block_instructions * > +push_block(struct block_queue *bq) > +{ > + struct block_instructions *bi = > + (struct block_instructions *) > exec_list_pop_head(&bq->reusable_blocks); > + > + if (bi == NULL) { > + bi = calloc(1, sizeof(struct block_instructions)); > + > + if (bi == NULL) > + return NULL; > + } > + > + if (!u_vector_init(&bi->instructions, > + sizeof(struct nir_alu_instr *), > + 8 * sizeof(struct nir_alu_instr *))) > + return NULL; > + > + exec_list_push_tail(&bq->blocks, &bi->node); > + > + return bi; > +} > + > +static void > +pop_block(struct block_queue *bq, struct block_instructions *bi) > +{ > + u_vector_finish(&bi->instructions); > + exec_node_remove(&bi->node); > + exec_list_push_head(&bq->reusable_blocks, &bi->node); > +} > + > +static void > +add_instruction_for_block(struct block_instructions *bi, > + struct nir_alu_instr *alu) > +{ > + struct nir_alu_instr **data = 
> + u_vector_add(&bi->instructions); > + > + *data = alu; > +} > + > +static void > +rewrite_compare_instruction(nir_builder *bld, nir_alu_instr *orig_cmp, > + nir_alu_instr *orig_add, bool zero_on_left) > +{ > + void *const mem_ctx = ralloc_parent(orig_cmp); > + > + bld->cursor = nir_before_instr(&orig_cmp->instr); > + > + /* This is somewhat tricky. The compare instruction may be > something like > + * (fcmp, a, b) while the add instruction is something like > (fadd, fneg(a), > + * b). This is problematic because the SSA value for the > fneg(a) may not > + * exist yet at the compare instruction. > + * > + * We fabricate the operands of the new add. This is done using > + * information provided by zero_on_left. If zero_on_left is > true, we know > + * the resulting compare instruction is (fcmp, 0.0, (fadd, x, > y)). If the > + * original compare instruction was (fcmp, a, b), x = b and y = > -a. If > + * zero_on_left is false, the resulting compare instruction is > (fcmp, > + * (fadd, x, y), 0.0) and x = a and y = -b. > + */ > + nir_ssa_def *const a = nir_ssa_for_alu_src(bld, orig_cmp, 0); > + nir_ssa_def *const b = nir_ssa_for_alu_src(bld, orig_cmp, 1); > + > + nir_ssa_def *const fadd = zero_on_left > + ? nir_fadd(bld, b, nir_fneg(bld, a)) > + : nir_fadd(bld, a, nir_fneg(bld, b)); > + > + nir_ssa_def *const zero = > + nir_imm_floatN_t(bld, 0.0, orig_add->dest.dest.ssa.bit_size); > + > + nir_ssa_def *const cmp = zero_on_left > + ? nir_build_alu(bld, orig_cmp->op, zero, fadd, NULL, NULL) > + : nir_build_alu(bld, orig_cmp->op, fadd, zero, NULL, NULL); > + > + /* Generating extra moves of the results is the easy way to make > sure the > + * writemasks match the original instructions. Later > optimization passes > + * will clean these up. This is similar to nir_replace_instr (in > + * nir_search.c). 
> + */ > + nir_alu_instr *mov_add = nir_alu_instr_create(mem_ctx, nir_op_imov); > + mov_add->dest.write_mask = orig_add->dest.write_mask; > + nir_ssa_dest_init(&mov_add->instr, &mov_add->dest.dest, > + orig_add->dest.dest.ssa.num_components, > + orig_add->dest.dest.ssa.bit_size, NULL); > + mov_add->src[0].src = nir_src_for_ssa(fadd); > + > + nir_builder_instr_insert(bld, &mov_add->instr); > + > + nir_alu_instr *mov_cmp = nir_alu_instr_create(mem_ctx, nir_op_imov); > + mov_cmp->dest.write_mask = orig_cmp->dest.write_mask; > + nir_ssa_dest_init(&mov_cmp->instr, &mov_cmp->dest.dest, > + orig_cmp->dest.dest.ssa.num_components, > + orig_cmp->dest.dest.ssa.bit_size, NULL); > + mov_cmp->src[0].src = nir_src_for_ssa(cmp); > + > + nir_builder_instr_insert(bld, &mov_cmp->instr); > + > + nir_ssa_def_rewrite_uses(&orig_cmp->dest.dest.ssa, > + nir_src_for_ssa(&mov_cmp->dest.dest.ssa)); > + nir_ssa_def_rewrite_uses(&orig_add->dest.dest.ssa, > + nir_src_for_ssa(&mov_add->dest.dest.ssa)); > + > + /* We know these have no more uses because we just rewrote them > all, so we > + * can remove them. > + */ > + nir_instr_remove(&orig_cmp->instr); > + nir_instr_remove(&orig_add->instr); > +} > + > +static bool > +comparison_pre_block(nir_block *block, struct block_queue *bq, > nir_builder *bld) > +{ > + bool progress = false; > + > + struct block_instructions *bi = push_block(bq); > + if (bi == NULL) > + return false; > + > + nir_foreach_instr_safe(instr, block) { > + if (instr->type != nir_instr_type_alu) > + continue; > + > + struct nir_alu_instr *const alu = nir_instr_as_alu(instr); > + > + if (alu->dest.dest.ssa.num_components != 1) > + continue; > + > + if (alu->dest.saturate) > + continue; > + > + static const uint8_t swizzle[4] = { 0, 0, 0, 0 }; > + > + switch (alu->op) { > + case nir_op_fadd: { > + /* If the instruction is fadd, check it against comparison > + * instructions that dominate it. 
> + */ > + struct block_instructions *b = > + (struct block_instructions *) > exec_list_get_head_raw(&bq->blocks); > + > + while (b->node.next != NULL) { > + nir_alu_instr **a; > + bool rewrote_compare = false; > + > + u_vector_foreach(a, &b->instructions) { > + nir_alu_instr *const cmp = *a; > + > + if (cmp == NULL) > + continue; > + > + /* The operands of both instructions are, with some > liberty, > + * commutative. Check all three permutations. The > third > + * permutation is a negation of the original > operation, so it > + * is handled separately. > + */ > + if ((nir_alu_srcs_equal(cmp, alu, 0, 0) && > + nir_alu_srcs_negative_equal(cmp, alu, 1, 1)) || > + (nir_alu_srcs_equal(cmp, alu, 0, 1) && > + nir_alu_srcs_negative_equal(cmp, alu, 1, 0))) { > + /* These are the cases where (A cmp B) matches > either (A + > + * -B) or (-B + A) > + * > + * A cmp B <=> A + -B cmp 0 > + */ > + rewrite_compare_instruction(bld, cmp, alu, false); > + > + *a = NULL; > + rewrote_compare = true; > + break; > + } else if ((nir_alu_srcs_equal(cmp, alu, 1, 0) && > + nir_alu_srcs_negative_equal(cmp, alu, 0, > 1)) || > + (nir_alu_srcs_equal(cmp, alu, 1, 1) && > + nir_alu_srcs_negative_equal(cmp, alu, 0, > 0))) { > + /* This is the case where (A cmp B) matches (B + > -A) or (-A > + * + B). > + * > + * A cmp B <=> 0 cmp B + -A > + */ > + rewrite_compare_instruction(bld, cmp, alu, true); > + > + *a = NULL; > + rewrote_compare = true; > + break; > + } > + } > + > + /* Bail after a compare in the most dominating block is > found. > + * This is necessary because 'alu' has been removed > from the > + * instruction stream. Should there be a matching > compare in > + * another block, calling rewrite_compare_instruction > again will > + * try to operate on a node that is not in the list as > if it were > + * in the list. > + * > + * FINISHME: There may be opportunity for additional > optimization > + * here. I discovered this problem due to a shader in > Guacamelee. 
> + * It may be possible to rewrite the matching compares > that are > + * encountered later to reuse the result from the > compare that was > + * first rewritten. It's also possible that this is > just taken > + * care of by calling the optimization pass repeatedly. > + */ > + if (rewrote_compare) { > + progress = true; > + break; > + } > + > + b = (struct block_instructions *) b->node.next; > + } > + > + break; > + } > + > + case nir_op_flt: > + case nir_op_fge: > + case nir_op_fne: > + case nir_op_feq: > + /* If the instruction is a comparison that is used by an > if-statement > + * or a bcsel and neither operand is immediate value 0, > add it to the > + * set. > + */ > + if (is_used_by_if(alu) && > + is_not_const_zero(alu, 0, 1, swizzle) && > + is_not_const_zero(alu, 1, 1, swizzle)) > + add_instruction_for_block(bi, alu); > + > + break; > + > + default: > + break; > + } > + } > + > + for (unsigned i = 0; i < block->num_dom_children; i++) { > + nir_block *child = block->dom_children[i]; > + > + if (comparison_pre_block(child, bq, bld)) > + progress = true; > + } > + > + pop_block(bq, bi); > + > + return progress; > +} > + > +static bool > +nir_opt_comparison_pre_impl(nir_function_impl *impl) > +{ > + struct block_queue bq; > + nir_builder bld; > + > + block_queue_init(&bq); > + nir_builder_init(&bld, impl); > + > + nir_metadata_require(impl, nir_metadata_dominance); > + > + const bool progress = > + comparison_pre_block(nir_start_block(impl), &bq, &bld); > + > + block_queue_finish(&bq); > + > + if (progress) > + nir_metadata_preserve(impl, nir_metadata_block_index | > + nir_metadata_dominance); > + > + return progress; > +} > + > +bool > +nir_opt_comparison_pre(nir_shader *shader) > +{ > + bool progress = false; > + > + nir_foreach_function(function, shader) { > + if (function->impl) > + progress |= nir_opt_comparison_pre_impl(function->impl); > + } > + > + return progress; > +} > diff --git a/src/compiler/nir/nir_search_helpers.h > 
b/src/compiler/nir/nir_search_helpers.h > index 8bc6d723c34..85af2f12f9e 100644 > --- a/src/compiler/nir/nir_search_helpers.h > +++ b/src/compiler/nir/nir_search_helpers.h > @@ -109,6 +109,29 @@ is_zero_to_one(nir_alu_instr *instr, unsigned > src, unsigned num_components, > return true; > } > > +static inline bool > +is_not_const_zero(nir_alu_instr *instr, unsigned src, unsigned > num_components, > + const uint8_t *swizzle) > +{ > + nir_const_value *val = nir_src_as_const_value(instr->src[src].src); > + > + if (!val) > + return true; > + > + for (unsigned i = 0; i < num_components; i++) { > + switch (nir_op_infos[instr->op].input_types[src]) { > + case nir_type_float: > + if (val->f32[swizzle[i]] == 0.0f) > + return false; > + break; > + default: > + return false; > + } > + } > + > + return true; > +} > + > static inline bool > is_not_const(nir_alu_instr *instr, unsigned src, UNUSED unsigned > num_components, > UNUSED const uint8_t *swizzle) > @@ -159,6 +182,12 @@ is_used_once(nir_alu_instr *instr) > return true; > } > > +static inline bool > +is_used_by_if(nir_alu_instr *instr) > +{ > + return !list_empty(&instr->dest.dest.ssa.if_uses); > +} > + > static inline bool > is_not_used_by_if(nir_alu_instr *instr) > { > -- > 2.14.4 > > _______________________________________________ > mesa-dev mailing list > mesa-dev@lists.freedesktop.org <mailto:mesa-dev@lists.freedesktop.org> > https://lists.freedesktop.org/mailman/listinfo/mesa-dev > _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev