On 09/03/2018 03:17 AM, andrey simiklit wrote: > Hi, > > I guess it is just a small mistake in description that: > 'z' was before: > z = y - x; > and became after: > z = x - y; > > I think you inadvertently misspelled in the description and you mean: > > > This pass attempts to detect code sequences like > > if (x < y) { > z = y - x; > ... > } > > and replace them with sequences like > > t = *y - x*; > if (t < 0) { > z = t; > ... > }
There is a mistake in the original comment, but this isn't correct either. (y - x) < 0 is not the same as x < y. :) The optimization does detect cases like this, but it would replace the comparison with a zero on the left (see the zero_on_left parameter to rewrite_compare_instruction): if (x < y) { z = y - x; ... } would become t = y - x; if (0 < t) { z = t; ... } > Because the subtraction is not a commutative operation) > Please correct me if I am incorrect. > > Regards, > Andrii. > > On Thu, Aug 30, 2018 at 8:36 AM Ian Romanick <i...@freedesktop.org > <mailto:i...@freedesktop.org>> wrote: > > From: Ian Romanick <ian.d.roman...@intel.com > <mailto:ian.d.roman...@intel.com>> > > This pass attempts to detect code sequences like > > if (x < y) { > z = y - x; > ... > } > > and replace them with sequences like > > t = x - y; > if (t < 0) { > z = t; > ... > } > > On architectures where the subtract can generate the flags used by the > if-statement, this saves an instruction. It's also possible that moving > an instruction out of the if-statement will allow > nir_opt_peephole_select to convert the whole thing to a bcsel. > > Currently only floating point compares and adds are supported. Adding > support for integer will be a challenge due to integer overflow. There > are a couple possible solutions, but they may not apply to all > architectures. > > v2: Fix a typo in the commit message and a couple typos in comments. > Fix possible NULL pointer deref from result of push_block(). Add > missing (-A + B) case. Suggested by Caio. 
> > Signed-off-by: Ian Romanick <ian.d.roman...@intel.com > <mailto:ian.d.roman...@intel.com>> > --- > src/compiler/Makefile.sources | 1 + > src/compiler/nir/meson.build | 1 + > src/compiler/nir/nir.h | 2 + > src/compiler/nir/nir_opt_comparison_pre.c | 360 > ++++++++++++++++++++++++++++++ > src/compiler/nir/nir_search_helpers.h | 29 +++ > 5 files changed, 393 insertions(+) > create mode 100644 src/compiler/nir/nir_opt_comparison_pre.c > > diff --git a/src/compiler/Makefile.sources > b/src/compiler/Makefile.sources > index d3b06564832..9fe8d5b8904 100644 > --- a/src/compiler/Makefile.sources > +++ b/src/compiler/Makefile.sources > @@ -267,6 +267,7 @@ NIR_FILES = \ > nir/nir_move_load_const.c \ > nir/nir_move_vec_src_uses_to_dest.c \ > nir/nir_normalize_cubemap_coords.c \ > + nir/nir_opt_comparison_pre.c \ > nir/nir_opt_conditional_discard.c \ > nir/nir_opt_constant_folding.c \ > nir/nir_opt_copy_prop_vars.c \ > diff --git a/src/compiler/nir/meson.build b/src/compiler/nir/meson.build > index 5438c17a8f8..2bcc854829e 100644 > --- a/src/compiler/nir/meson.build > +++ b/src/compiler/nir/meson.build > @@ -151,6 +151,7 @@ files_libnir = files( > 'nir_move_load_const.c', > 'nir_move_vec_src_uses_to_dest.c', > 'nir_normalize_cubemap_coords.c', > + 'nir_opt_comparison_pre.c', > 'nir_opt_conditional_discard.c', > 'nir_opt_constant_folding.c', > 'nir_opt_copy_prop_vars.c', > diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h > index ddbcb3c647e..c78387d0acf 100644 > --- a/src/compiler/nir/nir.h > +++ b/src/compiler/nir/nir.h > @@ -3000,6 +3000,8 @@ bool nir_convert_from_ssa(nir_shader *shader, > bool phi_webs_only); > bool nir_lower_phis_to_regs_block(nir_block *block); > bool nir_lower_ssa_defs_to_regs_block(nir_block *block); > > +bool nir_opt_comparison_pre(nir_shader *shader); > + > bool nir_opt_algebraic(nir_shader *shader); > bool nir_opt_algebraic_before_ffma(nir_shader *shader); > bool nir_opt_algebraic_late(nir_shader *shader); > diff --git 
a/src/compiler/nir/nir_opt_comparison_pre.c > b/src/compiler/nir/nir_opt_comparison_pre.c > new file mode 100644 > index 00000000000..b2827c21816 > --- /dev/null > +++ b/src/compiler/nir/nir_opt_comparison_pre.c > @@ -0,0 +1,360 @@ > +/* > + * Copyright © 2018 Intel Corporation > + * > + * Permission is hereby granted, free of charge, to any person > obtaining a > + * copy of this software and associated documentation files (the > "Software"), > + * to deal in the Software without restriction, including without > limitation > + * the rights to use, copy, modify, merge, publish, distribute, > sublicense, > + * and/or sell copies of the Software, and to permit persons to > whom the > + * Software is furnished to do so, subject to the following conditions: > + * > + * The above copyright notice and this permission notice (including > the next > + * paragraph) shall be included in all copies or substantial > portions of the > + * Software. > + * > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, > EXPRESS OR > + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF > MERCHANTABILITY, > + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO > EVENT SHALL > + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, > DAMAGES OR OTHER > + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, > ARISING > + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR > OTHER DEALINGS > + * IN THE SOFTWARE. > + */ > + > +#include "nir_instr_set.h" > +#include "nir_search_helpers.h" > +#include "nir_builder.h" > +#include "util/u_vector.h" > + > +/* Partial redundancy elimination of compares > + * > + * Seaches for comparisons of the form 'a cmp b' that dominate > arithmetic > + * instructions like 'b - a'. The comparison is replaced by the > arithmetic > + * instruction, and the result is compared with zero. 
For example, > + * > + * vec1 32 ssa_111 = flt 0.37, ssa_110.w > + * if ssa_111 { > + * block block_1: > + * vec1 32 ssa_112 = fadd ssa_110.w, -0.37 > + * ... > + * > + * becomes > + * > + * vec1 32 ssa_111 = fadd ssa_110.w, -0.37 > + * vec1 32 ssa_112 = flt 0.0, ssa_111 > + * if ssa_112 { > + * block block_1: > + * ... > + */ > + > +struct block_queue { > + struct exec_list blocks; > + struct exec_list reusable_blocks; > +}; > + > +struct block_instructions { > + struct exec_node node; > + struct u_vector instructions; > +}; > + > +static void > +block_queue_init(struct block_queue *bq) > +{ > + exec_list_make_empty(&bq->blocks); > + exec_list_make_empty(&bq->reusable_blocks); > +} > + > +static void > +block_queue_finish(struct block_queue *bq) > +{ > + struct block_instructions *n; > + > + while ((n = (struct block_instructions *) > exec_list_pop_head(&bq->blocks)) != NULL) { > + u_vector_finish(&n->instructions); > + free(n); > + } > + > + while ((n = (struct block_instructions *) > exec_list_pop_head(&bq->reusable_blocks)) != NULL) { > + free(n); > + } > +} > + > +static struct block_instructions * > +push_block(struct block_queue *bq) > +{ > + struct block_instructions *bi = > + (struct block_instructions *) > exec_list_pop_head(&bq->reusable_blocks); > + > + if (bi == NULL) { > + bi = calloc(1, sizeof(struct block_instructions)); > + > + if (bi == NULL) > + return NULL; > + } > + > + if (!u_vector_init(&bi->instructions, > + sizeof(struct nir_alu_instr *), > + 8 * sizeof(struct nir_alu_instr *))) > + return NULL; > + > + exec_list_push_tail(&bq->blocks, &bi->node); > + > + return bi; > +} > + > +static void > +pop_block(struct block_queue *bq, struct block_instructions *bi) > +{ > + u_vector_finish(&bi->instructions); > + exec_node_remove(&bi->node); > + exec_list_push_head(&bq->reusable_blocks, &bi->node); > +} > + > +static void > +add_instruction_for_block(struct block_instructions *bi, > + struct nir_alu_instr *alu) > +{ > + struct nir_alu_instr **data = 
> + u_vector_add(&bi->instructions); > + > + *data = alu; > +} > + > +static void > +rewrite_compare_instruction(nir_builder *bld, nir_alu_instr *orig_cmp, > + nir_alu_instr *orig_add, bool zero_on_left) > +{ > + void *const mem_ctx = ralloc_parent(orig_cmp); > + > + bld->cursor = nir_before_instr(&orig_cmp->instr); > + > + /* This is somewhat tricky. The compare instruction may be > something like > + * (fcmp, a, b) while the add instruction is something like > (fadd, fneg(a), > + * b). This is problematic because the SSA value for the > fneg(a) may not > + * exist yet at the compare instruction. > + * > + * We fabricate the operands of the new add. This is done using > + * information provided by zero_on_left. If zero_on_left is > true, we know > + * the resulting compare instruction is (fcmp, 0.0, (fadd, x, > y)). If the > + * original compare instruction was (fcmp, a, b), x = b and y = > -a. If > + * zero_on_left is false, the resulting compare instruction is > (fcmp, > + * (fadd, x, y), 0.0) and x = a and y = -b. > + */ > + nir_ssa_def *const a = nir_ssa_for_alu_src(bld, orig_cmp, 0); > + nir_ssa_def *const b = nir_ssa_for_alu_src(bld, orig_cmp, 1); > + > + nir_ssa_def *const fadd = zero_on_left > + ? nir_fadd(bld, b, nir_fneg(bld, a)) > + : nir_fadd(bld, a, nir_fneg(bld, b)); > + > + nir_ssa_def *const zero = > + nir_imm_floatN_t(bld, 0.0, orig_add->dest.dest.ssa.bit_size); > + > + nir_ssa_def *const cmp = zero_on_left > + ? nir_build_alu(bld, orig_cmp->op, zero, fadd, NULL, NULL) > + : nir_build_alu(bld, orig_cmp->op, fadd, zero, NULL, NULL); > + > + /* Generating extra moves of the results is the easy way to make > sure the > + * writemasks match the original instructions. Later > optimization passes > + * will clean these up. This is similar to nir_replace_instr (in > + * nir_search.c). 
> + */ > + nir_alu_instr *mov_add = nir_alu_instr_create(mem_ctx, nir_op_imov); > + mov_add->dest.write_mask = orig_add->dest.write_mask; > + nir_ssa_dest_init(&mov_add->instr, &mov_add->dest.dest, > + orig_add->dest.dest.ssa.num_components, > + orig_add->dest.dest.ssa.bit_size, NULL); > + mov_add->src[0].src = nir_src_for_ssa(fadd); > + > + nir_builder_instr_insert(bld, &mov_add->instr); > + > + nir_alu_instr *mov_cmp = nir_alu_instr_create(mem_ctx, nir_op_imov); > + mov_cmp->dest.write_mask = orig_cmp->dest.write_mask; > + nir_ssa_dest_init(&mov_cmp->instr, &mov_cmp->dest.dest, > + orig_cmp->dest.dest.ssa.num_components, > + orig_cmp->dest.dest.ssa.bit_size, NULL); > + mov_cmp->src[0].src = nir_src_for_ssa(cmp); > + > + nir_builder_instr_insert(bld, &mov_cmp->instr); > + > + nir_ssa_def_rewrite_uses(&orig_cmp->dest.dest.ssa, > + nir_src_for_ssa(&mov_cmp->dest.dest.ssa)); > + nir_ssa_def_rewrite_uses(&orig_add->dest.dest.ssa, > + nir_src_for_ssa(&mov_add->dest.dest.ssa)); > + > + /* We know these have no more uses because we just rewrote them > all, so we > + * can remove them. > + */ > + nir_instr_remove(&orig_cmp->instr); > + nir_instr_remove(&orig_add->instr); > +} > + > +static bool > +comparison_pre_block(nir_block *block, struct block_queue *bq, > nir_builder *bld) > +{ > + bool progress = false; > + > + struct block_instructions *bi = push_block(bq); > + if (bi == NULL) > + return false; > + > + nir_foreach_instr_safe(instr, block) { > + if (instr->type != nir_instr_type_alu) > + continue; > + > + struct nir_alu_instr *const alu = nir_instr_as_alu(instr); > + > + if (alu->dest.dest.ssa.num_components != 1) > + continue; > + > + if (alu->dest.saturate) > + continue; > + > + static const uint8_t swizzle[4] = { 0, 0, 0, 0 }; > + > + switch (alu->op) { > + case nir_op_fadd: { > + /* If the instruction is fadd, check it against comparison > + * instructions that dominate it. 
> + */ > + struct block_instructions *b = > + (struct block_instructions *) > exec_list_get_head_raw(&bq->blocks); > + > + while (b->node.next != NULL) { > + nir_alu_instr **a; > + bool rewrote_compare = false; > + > + u_vector_foreach(a, &b->instructions) { > + nir_alu_instr *const cmp = *a; > + > + if (cmp == NULL) > + continue; > + > + /* The operands of both instructions are, with some > liberty, > + * commutative. Check all three permutations. The > third > + * permutation is a negation of the original > operation, so it > + * is handled separately. > + */ > + if ((nir_alu_srcs_equal(cmp, alu, 0, 0) && > + nir_alu_srcs_negative_equal(cmp, alu, 1, 1)) || > + (nir_alu_srcs_equal(cmp, alu, 0, 1) && > + nir_alu_srcs_negative_equal(cmp, alu, 1, 0))) { > + /* These are the cases where (A cmp B) matches > either (A + > + * -B) or (-B + A) > + * > + * A cmp B <=> A + -B cmp 0 > + */ > + rewrite_compare_instruction(bld, cmp, alu, false); > + > + *a = NULL; > + rewrote_compare = true; > + break; > + } else if ((nir_alu_srcs_equal(cmp, alu, 1, 0) && > + nir_alu_srcs_negative_equal(cmp, alu, 0, > 1)) || > + (nir_alu_srcs_equal(cmp, alu, 1, 1) && > + nir_alu_srcs_negative_equal(cmp, alu, 0, > 0))) { > + /* This is the case where (A cmp B) matches (B + > -A) or (-A > + * + B). > + * > + * A cmp B <=> 0 cmp B + -A > + */ > + rewrite_compare_instruction(bld, cmp, alu, true); > + > + *a = NULL; > + rewrote_compare = true; > + break; > + } > + } > + > + /* Bail after a compare in the most dominating block is > found. > + * This is necessary because 'alu' has been removed > from the > + * instruction stream. Should there be a matching > compare in > + * another block, calling rewrite_compare_instruction > again will > + * try to operate on a node that is not in the list as > if it were > + * in the list. > + * > + * FINISHME: There may be opportunity for additional > optimization > + * here. I discovered this problem due to a shader in > Guacamelee. 
> + * It may be possible to rewrite the matching compares > that are > + * encountered later to reuse the result from the > compare that was > + * first rewritten. It's also possible that this is > just taken > + * care of by calling the optimization pass repeatedly. > + */ > + if (rewrote_compare) { > + progress = true; > + break; > + } > + > + b = (struct block_instructions *) b->node.next; > + } > + > + break; > + } > + > + case nir_op_flt: > + case nir_op_fge: > + case nir_op_fne: > + case nir_op_feq: > + /* If the instruction is a comparison that is used by an > if-statement > + * or a bcsel and neither operand is immediate value 0, > add it to the > + * set. > + */ > + if (is_used_by_if(alu) && > + is_not_const_zero(alu, 0, 1, swizzle) && > + is_not_const_zero(alu, 1, 1, swizzle)) > + add_instruction_for_block(bi, alu); > + > + break; > + > + default: > + break; > + } > + } > + > + for (unsigned i = 0; i < block->num_dom_children; i++) { > + nir_block *child = block->dom_children[i]; > + > + if (comparison_pre_block(child, bq, bld)) > + progress = true; > + } > + > + pop_block(bq, bi); > + > + return progress; > +} > + > +static bool > +nir_opt_comparison_pre_impl(nir_function_impl *impl) > +{ > + struct block_queue bq; > + nir_builder bld; > + > + block_queue_init(&bq); > + nir_builder_init(&bld, impl); > + > + nir_metadata_require(impl, nir_metadata_dominance); > + > + const bool progress = > + comparison_pre_block(nir_start_block(impl), &bq, &bld); > + > + block_queue_finish(&bq); > + > + if (progress) > + nir_metadata_preserve(impl, nir_metadata_block_index | > + nir_metadata_dominance); > + > + return progress; > +} > + > +bool > +nir_opt_comparison_pre(nir_shader *shader) > +{ > + bool progress = false; > + > + nir_foreach_function(function, shader) { > + if (function->impl) > + progress |= nir_opt_comparison_pre_impl(function->impl); > + } > + > + return progress; > +} > diff --git a/src/compiler/nir/nir_search_helpers.h > 
b/src/compiler/nir/nir_search_helpers.h > index 8bc6d723c34..85af2f12f9e 100644 > --- a/src/compiler/nir/nir_search_helpers.h > +++ b/src/compiler/nir/nir_search_helpers.h > @@ -109,6 +109,29 @@ is_zero_to_one(nir_alu_instr *instr, unsigned > src, unsigned num_components, > return true; > } > > +static inline bool > +is_not_const_zero(nir_alu_instr *instr, unsigned src, unsigned > num_components, > + const uint8_t *swizzle) > +{ > + nir_const_value *val = nir_src_as_const_value(instr->src[src].src); > + > + if (!val) > + return true; > + > + for (unsigned i = 0; i < num_components; i++) { > + switch (nir_op_infos[instr->op].input_types[src]) { > + case nir_type_float: > + if (val->f32[swizzle[i]] == 0.0f) > + return false; > + break; > + default: > + return false; > + } > + } > + > + return true; > +} > + > static inline bool > is_not_const(nir_alu_instr *instr, unsigned src, UNUSED unsigned > num_components, > UNUSED const uint8_t *swizzle) > @@ -159,6 +182,12 @@ is_used_once(nir_alu_instr *instr) > return true; > } > > +static inline bool > +is_used_by_if(nir_alu_instr *instr) > +{ > + return !list_empty(&instr->dest.dest.ssa.if_uses); > +} > + > static inline bool > is_not_used_by_if(nir_alu_instr *instr) > { > -- > 2.14.4 > > _______________________________________________ > mesa-dev mailing list > mesa-dev@lists.freedesktop.org <mailto:mesa-dev@lists.freedesktop.org> > https://lists.freedesktop.org/mailman/listinfo/mesa-dev > _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev