Signed-off-by: Thomas Helland <thomashellan...@gmail.com> --- src/glsl/Makefile.sources | 1 + src/glsl/nir/nir.h | 2 + src/glsl/nir/nir_opt_value_range.c | 1330 ++++++++++++++++++++++++++++++++++++ 3 files changed, 1333 insertions(+) create mode 100644 src/glsl/nir/nir_opt_value_range.c
diff --git a/src/glsl/Makefile.sources b/src/glsl/Makefile.sources index b938f1e..720ff70 100644 --- a/src/glsl/Makefile.sources +++ b/src/glsl/Makefile.sources @@ -56,6 +56,7 @@ NIR_FILES = \ nir/nir_opt_peephole_ffma.c \ nir/nir_opt_peephole_select.c \ nir/nir_opt_remove_phis.c \ + nir/nir_opt_value_range.c \ nir/nir_print.c \ nir/nir_remove_dead_variables.c \ nir/nir_search.c \ diff --git a/src/glsl/nir/nir.h b/src/glsl/nir/nir.h index 6efbfbd..44dd015 100644 --- a/src/glsl/nir/nir.h +++ b/src/glsl/nir/nir.h @@ -1693,6 +1693,8 @@ bool nir_opt_peephole_ffma(nir_shader *shader); bool nir_opt_remove_phis(nir_shader *shader); +bool nir_opt_value_range(nir_shader *shader); + void nir_sweep(nir_shader *shader); #ifdef __cplusplus diff --git a/src/glsl/nir/nir_opt_value_range.c b/src/glsl/nir/nir_opt_value_range.c new file mode 100644 index 0000000..1e6ff0e --- /dev/null +++ b/src/glsl/nir/nir_opt_value_range.c @@ -0,0 +1,1330 @@ +/* + * Copyright © 2014 Thomas Helland + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * + */ + +#include "nir.h" +#include "nir_ssa_def_worklist.h" +#include "nir_block_worklist.h" +#include "nir_constant_expressions.h" + +/* This pass implements an extension of + * "Constant Propagation with Conditional Branches" by Wegman and Zadeck + * that also handles value ranges. This is useful as a lot of shaders have + * min/max expressions that can be eliminated, or conditionals that we can + * prove to be false or true due to previously applied restrictions on range. + * Value range propagation is a superset of tracking constant values, + * and due to that this pass eliminates the need for a separate constant + * propagation pass. This pass is optimistic, meaning we assume all variables + * are constant (or have restricted range) and disprove it. + * A pessimistic algorithm would assume all values where undeterminable, + * and then propagate expressions we know to be constant through the program. + * An optimistic algorithm gets better results than a pessimistic, with the + * downside being that it can not be aborted midway through the pass as the + * results gathered may be wrong (based on wrong assumptions). + * + * The lattice types are: + * undefined: Variable may be constant or range-restricted (not yet processed) + * constant: Value is determined to be constant + * range: Value is determined to be range-restricted + * overdefined: We cannot guarantee the value is constant or range-restricted + * + * We extend the lattice so that constant entries are changed to inclusive + * ranges for each vector component. The join rules are: + * + * undefined join undefined = undefined + * undefined join overdefined = overdefined + * overdefined join overdefined = overdefined + * [low, high] join overdefined = overdefined + * [low, high] join undefined = [low, high] + * [low1, high1] join [low2, high2] = [min(low1, low2), max(high1, high2)] + * + * These rules are general pessimistic rules. There may situations where we + * can still determine parts of the range of the variable, even though it + * has an overdefined input (max, min, sat, abs). This is also true for + * boolean operations like AND and OR. These can be determined even if + * we know only one of the operators. + * + * We don't preserve the range perfectly for a variable, as we combine + * two ranges for a variable into a range of + * [min(low1, low2), max(high1, high2)] + * Preserving the non-continuous range information would greatly complicate + * the pass, and is therefore not implemented. + * + * There is one interesting situation that is hard to deal with: + * When we find that something is dead code, but it does not become a + * constant value. Examples are things like min(sin(x), y) where y > 2. + * We know sin(x) is dead code, but the result is not a constant but instead + * an ssa-def with a range. We mark this in the range-state so that we can + * eliminate it after the pass is done. This means that the pass should be + * rerun if we resolve one of these, as we will then have simplified + * the program, and new ranges may be resolved. + * + * When the pass is done all "undefined" values should be determined as + * either const, range, or overdefined. (Except for in blocks that are + * marked as unreachable) + */ + +/* An idea for doing simultaneous rewriting and analysis can be + * to use the dynamic array Jason created. (Assuming we always start of + * at the highest ssa-index when we make new defs). This allows us to set + * new defs as we go, and will make dealing with inserting movs in if's and + * inserting constants for constant defs a bit simpler. One issue with this + * is that since the pass is optimistic there will be no guarantee that the + * information is correct until the pass has terminated. + */ + +typedef enum { + undefined, + range, + constant, + overdefined +} lattice_type; + +typedef struct { + /* Is this entry float, unsigned or something else? */ + nir_alu_type range_type; + + nir_const_value low; + nir_const_value high; + + /* What type of lattice is this */ + lattice_type type; + + /* Whether we can remove the expression itself and replace it with one + * of its operands. Intended to be used for things like min(a, b) + * where a < 4 and b > 5. We know that the expression will choose a, + * but it is not constant so we cannot mark it as such. + */ + bool can_be_predetermined; + + nir_ssa_def *ssa_def; + boolean in_loop; +} lattice_entry; + +#define IS_FLOAT_CONSTANT(const_value, operator, operand, num_components) \ + ((num_components == 4) ? \ + const_value.f[0] operator operand && \ + const_value.f[1] operator operand && \ + const_value.f[2] operator operand && \ + const_value.f[3] operator operand : \ + ((num_components == 3) ? \ + const_value.f[0] operator operand && \ + const_value.f[1] operator operand && \ + const_value.f[2] operator operand : \ + ((num_components == 2) ? \ + const_value.f[0] operator operand && \ + const_value.f[1] operator operand : \ + ((num_components == 1) ? \ + const_value.f[0] operator operand : \ + false)))) + +#define IS_INT_CONSTANT(const_value, operator, operand, num_components) \ + ((num_components == 4) ? \ + const_value.i[0] operator operand && \ + const_value.i[1] operator operand && \ + const_value.i[2] operator operand && \ + const_value.i[3] operator operand : \ + ((num_components == 3) ? \ + const_value.i[0] operator operand && \ + const_value.i[1] operator operand && \ + const_value.i[2] operator operand : \ + ((num_components == 2) ? \ + const_value.i[0] operator operand && \ + const_value.i[1] operator operand : \ + ((num_components == 1) ? \ + const_value.i[0] operator operand : \ + false)))) + +#define IS_UNSIGNED_CONSTANT(const_value, operator, operand, num_components) \ + ((num_components == 4) ? \ + const_value.u[0] operator operand && \ + const_value.u[1] operator operand && \ + const_value.u[2] operator operand && \ + const_value.u[3] operator operand : \ + ((num_components == 3) ? \ + const_value.u[0] operator operand && \ + const_value.u[1] operator operand && \ + const_value.u[2] operator operand : \ + ((num_components == 2) ? \ + const_value.u[0] operator operand && \ + const_value.u[1] operator operand : \ + ((num_components == 1) ? \ + const_value.u[0] operator operand : \ + false)))) + +typedef struct { + nir_shader *shader; + + /* An array of lattice_antries for all the ssa_defs */ + lattice_entry *entries; + + /* Corresponds to SSA Work in the original paper */ + nir_ssa_def_worklist *ssa_worklist; + + /* Work list of blocks, corresponding to the papers Flow work list */ + nir_block_worklist *block_worklist; + + /* Keep track of which blocks are reachable */ + BITSET_WORD *reachable_blocks; + + nir_function_impl *impl; + void *mem_ctx; +} value_range_state; + + +static lattice_entry * +get_lattice_entry(nir_ssa_def *value, value_range_state *state) +{ + lattice_entry *entry = &state->entries[value->index]; + return entry; +} + +/* Returns true if this is a change in status of the entry. This simplifies + * checking if users of this entry should be added to the worklist. + */ +static bool +set_as_overdefined(lattice_entry *entry, nir_alu_type type) +{ + if (entry->type == overdefined) + return false; + + entry->type = overdefined; + + /* XXX: This may not be useful. Might just say that if the variable + * is undefined then we don't care about the value. + * However, it allows us to propagate the upper and lower range onto + * other ranges without checking if it is undefined and then find that + * that range is -inf - inf and set as overdefined. + */ + for (unsigned i = 0; i < entry->ssa_def->num_components; i++) { + switch (type) { + case nir_type_float: + entry->low.f[i] = -INFINITY; + entry->high.f[i] = INFINITY; + break; + case nir_type_int: + entry->low.i[i] = INT32_MIN; + entry->high.i[i] = INT32_MAX; + break; + case nir_type_bool: + case nir_type_unsigned: + entry->low.u[i] = 0; + entry->high.u[i] = UINT32_MAX; + break; + case nir_type_invalid: + break; + } + } + + return true; +} + +static inline void +set_range_float_value(lattice_entry *entry, float value, boolean low) +{ + for (unsigned i = 0; i < entry->ssa_def->num_components; i++) { + if (low) { + entry->low.f[i] = value; + } else { + entry->high.f[i] = value; + } + } +} + +static inline void +set_range_float_constant(lattice_entry *entry, float value) +{ + set_range_float_value(entry, value, true); + set_range_float_value(entry, value, false); + entry->type = constant; +} + +static inline void +set_range_int_value(lattice_entry *entry, int32_t value, boolean low) +{ + for (unsigned i = 0; i < entry->ssa_def->num_components; i++) { + if (low) { + entry->low.i[i] = value; + } else { + entry->high.i[i] = value; + } + } +} + +static inline void +set_range_int_constant(lattice_entry *entry, int32_t value) +{ + set_range_int_value(entry, value, true); + set_range_int_value(entry, value, false); + entry->type = constant; +} + +static inline void +set_range_unsigned_value(lattice_entry *entry, uint32_t value, boolean low) +{ + for (unsigned i = 0; i < entry->ssa_def->num_components; i++) { + if (low) { + entry->low.u[i] = value; + } else { + entry->high.u[i] = value; + } + } +} + +static inline void +set_range_unsigned_constant(lattice_entry *entry, uint32_t value) +{ + set_range_unsigned_value(entry, value, true); + set_range_unsigned_value(entry, value, false); + entry->type = constant; +} + +static nir_const_value +get_type_max(nir_alu_type type, unsigned num_components) +{ + nir_const_value value; + for (unsigned i = 0; i < num_components; i++) { + switch (type) { + case nir_type_float: + value.f[i] = INFINITY; + break; + case nir_type_int: + value.i[i] = INT32_MAX; + break; + case nir_type_bool: + case nir_type_unsigned: + value.u[i] = UINT32_MAX; + break; + case nir_type_invalid: + break; + } + } + return value; +} + +static nir_const_value +get_type_min(nir_alu_type type, unsigned num_components) +{ + nir_const_value value; + for (unsigned i = 0; i < num_components; i++) { + switch (type) { + case nir_type_float: + value.f[i] = -INFINITY; + break; + case nir_type_int: + value.i[i] = INT32_MIN; + break; + case nir_type_bool: + case nir_type_unsigned: + value.u[i] = 0; + break; + case nir_type_invalid: + break; + } + } + + return value; +} + +static bool +initialize_entry(nir_ssa_def *def, void *state) +{ + lattice_entry *entry = get_lattice_entry(def, state); + + entry->ssa_def = def; + entry->can_be_predetermined = false; + + if (def->parent_instr->type == nir_instr_type_load_const) { + nir_load_const_instr *instr = nir_instr_as_load_const(def->parent_instr); + entry->type = constant; + entry->low = instr->value; + entry->high = instr->value; + entry->range_type = nir_type_invalid; + return true; + } + + if (def->parent_instr->type == nir_instr_type_alu) { + nir_alu_instr *instr = nir_instr_as_alu(def->parent_instr); + + /* I'm not sure if this is ideal. It allows us to push the inherent + * range of an instruction into the pass before running it. This + * means that we can do this also for loops, which will be harder + * if we do it all in the evaluate_ssa_def function. It also means + * that we will know a lot of range information at the get-go, which + * may be a benefit + */ + switch(instr->op) { + case nir_op_fsat: + set_range_float_value(entry, 0.0f, true); + set_range_float_value(entry, 1.0f, false); + entry->type = range; + entry->range_type = nir_type_float; + return true; + case nir_op_fsin: + case nir_op_fcos: + case nir_op_fsign: + set_range_float_value(entry, -1.0f, true); + set_range_float_value(entry, 1.0f, false); + entry->type = range; + entry->range_type = nir_type_float; + return true; + case nir_op_fabs: + case nir_op_fexp2: + set_range_float_value(entry, 0.0f, true); + set_range_float_value(entry, INFINITY, false); + entry->type = range; + entry->range_type = nir_type_float; + return true; + case nir_op_iabs: + set_range_int_value(entry, 0, true); + set_range_int_value(entry, INT32_MAX, false); + entry->type = range; + entry->range_type = nir_type_int; + return true; + case nir_op_isign: + set_range_int_value(entry, -1, true); + set_range_int_value(entry, 1, false); + entry->type = range; + entry->range_type = nir_type_int; + return true; + default: + break; + } + + /* We are now done special-casing for operations with a range + * associated with them. If it's in a loop we can not do better. + * (Well, we can with loop invariants, but LICM will move those out) + */ + if (entry->in_loop) { + set_as_overdefined(entry, nir_op_infos[instr->op].output_type); + return true; + } + + entry->type = undefined; + entry->low = get_type_min(entry->range_type, def->num_components); + entry->high = get_type_max(entry->range_type, def->num_components); + entry->range_type = nir_op_infos[instr->op].output_type; + return false; + } + + if (def->parent_instr->type == nir_instr_type_phi) { + entry->type = undefined; + entry->range_type = nir_type_invalid; // XXX: Are phi's also typeless? Should check this up closely + return false; + } + + entry->type = overdefined; + entry->range_type = nir_type_invalid; + return false; +} + +static bool +initialize_block(nir_block *block, void *state) { + nir_foreach_instr(block, instr) { + nir_foreach_ssa_def(instr, initialize_entry, block); + } + return true; +} + +static bool +mark_ssa_def_as_in_loop(nir_ssa_def *def, void *state) +{ + lattice_entry *entry = get_lattice_entry(def, state); + entry->in_loop = true; + return true; +} + +static bool +initialize_block_as_in_loop(nir_block *block, void *state) +{ + nir_foreach_instr(block, instr) { + nir_foreach_ssa_def(instr, mark_ssa_def_as_in_loop, state); + nir_foreach_ssa_def(instr, initialize_entry, block); + } + return true; +} + +static bool +is_type_max(nir_const_value value, nir_alu_type type, + unsigned num_components) +{ + for (unsigned i = 0; i < num_components; i++) { + switch (type) { + case nir_type_float: + if (value.f[i] != INFINITY) + return false; + break; + + case nir_type_int: + if (value.i[i] != INT32_MAX) + return false; + break; + + case nir_type_bool: + case nir_type_unsigned: + if (value.u[i] != UINT32_MAX) + return false; + break; + + case nir_type_invalid: + break; + } + } + + return true; +} + +static bool +is_type_min(nir_const_value value, nir_alu_type type, + unsigned num_components) +{ + for (unsigned i = 0; i < num_components; i++) { + switch (type) { + case nir_type_float: + if (value.f[i] != -INFINITY) + return false; + break; + + case nir_type_int: + if (value.i[i] != INT32_MIN) + return false; + break; + + case nir_type_bool: + case nir_type_unsigned: + if (value.u[i] != 0) + return false; + break; + + case nir_type_invalid: + break; + } + } + + return true; +} + +static nir_const_value +per_component_max(nir_const_value src0, nir_const_value src1, + nir_alu_type type, unsigned num_components) +{ + nir_const_value value; + for (unsigned i = 0; i < num_components; i++) { + switch (type) { + case nir_type_float: + value.f[i] = MAX2(src0.f[i], src1.f[i]); + break; + case nir_type_int: + value.i[i] = MAX2(src0.i[i], src1.i[i]); + break; + case nir_type_bool: + case nir_type_unsigned: + value.u[i] = MAX2(src0.u[i], src1.u[i]); + break; + case nir_type_invalid: + break; + } + } + + return value; +} + +static nir_const_value +per_component_min(nir_const_value src0, nir_const_value src1, + nir_alu_type type, unsigned num_components) +{ + nir_const_value value; + for (unsigned i = 0; i < num_components; i++) { + switch (type) { + case nir_type_float: + value.f[i] = MIN2(src0.f[i], src1.f[i]); + break; + case nir_type_int: + value.i[i] = MIN2(src0.i[i], src1.i[i]); + break; + case nir_type_bool: + case nir_type_unsigned: + value.u[i] = MIN2(src0.u[i], src1.u[i]); + break; + case nir_type_invalid: + break; + } + } + + return value; +} + +static bool +component_is_true(lattice_entry *entry, unsigned component) +{ + /* XXX: This check may be removed if the function is + * never used for anything but is_entry_true. + * It already checks if the entry s overdefined. + */ + if (entry->type == overdefined) + return false; + + switch (entry->range_type) { + case nir_type_int: + return entry->low.i[component] > 0 || + entry->high.i[component] < 0; + case nir_type_unsigned: + case nir_type_bool: + return entry->low.u[component] > 0; + case nir_type_float: + return entry->low.f[component] > 0.0f || + entry->high.f[component] < 0.0f; + case nir_type_invalid: + break; + } + + return false; +} + +static bool +is_entry_true(lattice_entry *entry) +{ + bool is_true = true; + + if (entry->type == overdefined) + return false; + + for (int i = 0; i < entry->ssa_def->num_components; i++) + is_true = is_true && component_is_true(entry, i); + + return is_true; +} + +static bool +is_entry_overdefined(lattice_entry *entry) +{ + if (entry->type == overdefined) + return true; + + /* This checks high and low to find out if the range is indeeed + * maximum and mininum of the type, and therefore in fact is overdefined. + * This can happen in a very trivial case like phi(a, b) + * where a = abs(x) and b = neg(abs(y)) and we don't know the range + * of either x or y. + */ + if (is_type_max(entry->high, entry->range_type, + entry->ssa_def->num_components) && + is_type_min(entry->low, entry->range_type, + entry->ssa_def->num_components)) + return true; + + return false; +} + +static bool +component_is_false(lattice_entry *entry, unsigned component) +{ + /* XXX: This check may be removed if the function is + * never used for anything but is_entry_false. + * It already checks if the entry s overdefined. + */ + if (entry->type == overdefined) + return false; + + switch (entry->range_type) { + case nir_type_int: + return entry->low.i[component] == 0; + case nir_type_unsigned: + case nir_type_bool: + return entry->low.u[component] == 0; + case nir_type_float: + return entry->low.f[component] == 0.0f; + case nir_type_invalid: + break; + } + + return false; +} + +static bool +is_entry_false(lattice_entry *entry) +{ + bool is_false = true; + + if (entry->type == overdefined) + return false; + + for (int i = 0; i < entry->ssa_def->num_components; i++) + is_false = is_false && component_is_false(entry, i); + + return is_false; +} + +static bool +is_lattice_entry_constant(lattice_entry *entry) +{ + if (entry->type == constant) + return true; + + for (unsigned i = 0; i < entry->ssa_def->num_components; i++) { + if (entry->low.u[i] != entry->high.u[i]) + return false; + } + + entry->type = constant; + return true; +} + +static void +mark_block_reachable(nir_block *block, value_range_state *state) +{ + BITSET_SET(state->reachable_blocks, block->index); +} + +static bool +is_block_reachable(nir_block *block, value_range_state *state) +{ + return BITSET_TEST(state->reachable_blocks, block->index); +} + +static void +evaluate_alu_instr(nir_alu_instr *alu, value_range_state *state) +{ + lattice_entry *entry = get_lattice_entry(&alu->dest.dest.ssa, state); + lattice_entry *src[4]; + boolean all_constant = true; + + for (unsigned i = 0; i < nir_op_infos[alu->op].num_inputs; i++) { + src[i] = get_lattice_entry(alu->src[i].src.ssa, state); + all_constant = all_constant && is_lattice_entry_constant(src[i]); + } + + if (all_constant) { + nir_const_value const_src[4]; + + for (unsigned i = 0; i < nir_op_infos[alu->op].num_inputs; i++) + const_src[i] = src[i]->low; + + nir_const_value dest = + nir_eval_const_opcode(alu->op, + alu->dest.dest.ssa.num_components, + const_src); + + entry->type = constant; + entry->low = dest; + entry->high = dest; + entry->range_type = undefined; + return; + } + + switch(alu->op) { + case nir_op_fabs: + set_range_float_value(entry, 0.0f, true); + entry->type = range; + break; + + case nir_op_fsat: + if (IS_FLOAT_CONSTANT(src[0]->low, <, 0.0f, 1 /* XXX */)) { + set_range_float_constant(entry, 0.0f); + break; + } + + if (IS_FLOAT_CONSTANT(src[0]->low, >, 1.0f, 1 /* XXX */)) { + set_range_float_constant(entry, 1.0f); + break; + } + + set_range_float_value(entry, 0.0f, true); + set_range_float_value(entry, 1.0f, false); + entry->type = range; + break; + + case nir_op_fsign: + if (IS_FLOAT_CONSTANT(src[0]->low, <, 0.0f, 1 /* XXX */)) { + set_range_float_constant(entry, -1.0f); + break; + } + if (IS_FLOAT_CONSTANT(src[0]->low, >, 0.0f, 1 /* XXX */)) { + set_range_float_constant(entry, 1.0f); + break; + } + break; + + case nir_op_fneg: + entry->high = src[0]->low; + entry->low = src[0]->high; + entry->type = src[0]->type; + entry->range_type = src[0]->range_type; + break; + + case nir_op_fmov: + case nir_op_imov: + entry->high = src[0]->high; + entry->low = src[0]->low; + entry->type = src[0]->type; + entry->range_type = src[0]->range_type; + break; + + /* This may be a no-issue? */ + case nir_op_vec4: + entry->high.f[3] = src[0]->high.f[3]; + entry->low.f[3] = src[0]->low.f[3]; + /* Fallthrough */ + case nir_op_vec3: + entry->high.f[2] = src[0]->high.f[2]; + entry->low.f[2] = src[0]->low.f[2]; + /* Fallthrough */ + case nir_op_vec2: + entry->high.f[1] = src[0]->high.f[1]; + entry->low.f[1] = src[0]->low.f[1]; + entry->high.f[0] = src[0]->high.f[0]; + entry->low.f[0] = src[0]->low.f[0]; + entry->type = src[0]->type; + entry->range_type = src[0]->range_type; + break; + + case nir_op_ffma: + case nir_op_flog2: + case nir_op_flrp: + case nir_op_fpow: + case nir_op_frcp: + case nir_op_fround_even: + case nir_op_frsq: + + case nir_op_fxor: + case nir_op_fnot: + case nir_op_for: + case nir_op_fand: + + case nir_op_feq: + case nir_op_fge: + case nir_op_flt: + case nir_op_fne: + + case nir_op_fsub: + case nir_op_fadd: + case nir_op_fdiv: + case nir_op_fmul: + + case nir_op_fcos: + case nir_op_fsin: + + case nir_op_fcsel: + case nir_op_fmax: + case nir_op_fmin: + + case nir_op_iabs: + case nir_op_ineg: + + case nir_op_isign: + + case nir_op_iadd: + case nir_op_isub: + + case nir_op_idiv: + case nir_op_imul: + case nir_op_imul_high: + + case nir_op_ilt: + case nir_op_ieq: + case nir_op_ine: + case nir_op_ige: + + case nir_op_imax: + case nir_op_imin: + + case nir_op_inot: + case nir_op_ior: + case nir_op_ixor: + case nir_op_iand: + + case nir_op_ishl: + case nir_op_ishr: + case nir_op_ifind_msb: + + case nir_op_seq: + case nir_op_sge: + case nir_op_slt: + case nir_op_sne: + + case nir_op_udiv: + case nir_op_uge: + case nir_op_ult: + case nir_op_umax: + case nir_op_umin: + + default: + break; + } + + /* I've been trying to solve this in some kind of automagicall way + * but there are so many special cases that implementing all of them + * "the boring way" will probably be best as we can possibly + * do something "smart" for most of the opcodes. + */ +} + +static void +evaluate_phi_instr(nir_phi_instr *phi, value_range_state *state) +{ + lattice_entry *entry = get_lattice_entry(&phi->dest.ssa, state); + bool first_range = true; + + nir_const_value low; + nir_const_value high; + + lattice_entry *src_entry; + nir_foreach_phi_src(phi, src) { + + src_entry = get_lattice_entry(src->src.ssa, state); + + /* If the block the source is in is not reachable we should not + * add it to the total phi value as it may never be executed. + * If it will it will eventually be marked executable, + * the ssa-defs in the block, along with the phi's, will be processed, + * and therefore this phi will be revisited, and so will be + * resolved correctly. + */ + if (!is_block_reachable(src->pred, state)) + continue; + + /* If one of the sources is overdefined then we can't compute a + * a valid range, and so we should mark it as overdefined + */ + if (is_entry_overdefined(src_entry)) { + set_as_overdefined(entry, nir_type_invalid); + return; + } + + if (src_entry->type == range || src_entry->type == constant) { + if (first_range) { + first_range = false; + + for (unsigned i = 0; i < entry->ssa_def->num_components; i++) { + low.u[i] = src_entry->low.u[i]; + high.u[i] = src_entry->high.u[i]; + } + + } else { + low = per_component_min(low, src_entry->low, entry->range_type, + entry->ssa_def->num_components); + high = per_component_max(high, src_entry->high, entry->range_type, + entry->ssa_def->num_components); + } + } + } + return; +} + +static bool +evaluate_ssa_def(nir_ssa_def *def, value_range_state *state) +{ + lattice_entry *entry = get_lattice_entry(def, state); + lattice_type old_type = entry->type; + nir_const_value old_high; + nir_const_value old_low; + + /* If it is already overdefined then that can not change. + * XXX: This is only true until we implement things like max, min, + * or, and, etc. Those are special, and can therefore change status + * "upwards" in the rule-hierarchy. This can be an issue, as it can + * possibly cause issues with the pass never terminating? + * This needs to be researched and debugged further. + */ + if (entry->type == overdefined) + return false; + + for (unsigned i = 0; i < 4; i++) { + old_high.f[i] = entry->high.f[i]; + old_low.f[i] = entry->low.f[i]; + } + + switch (entry->ssa_def->parent_instr->type) { + case nir_instr_type_load_const: + /* We should have already marked the load_consts as + * constant so there is no use evaluating it. + */ + return false; + + case nir_instr_type_alu: { + nir_alu_instr *alu = nir_instr_as_alu(entry->ssa_def->parent_instr); + + /* The entry can not be in a loop, as we skip those since we do not + * yet support finding a range for those defs. + */ + assert(!entry->in_loop); + + evaluate_alu_instr(alu, state); + break; + } + + case nir_instr_type_phi: { + nir_phi_instr *phi = nir_instr_as_phi(entry->ssa_def->parent_instr); + + evaluate_phi_instr(phi, state); + entry->range_type = nir_type_invalid; // XXX: Are phi's also typeless? Should check this up closely + break; + } + + default: + return set_as_overdefined(entry, nir_type_invalid); + } + + /* Now we check if the information for the instruction has changed. + * If it has then we return true, so that we can evaluate the users. + */ + if (entry->type != old_type) + return true; + + for (unsigned i = 0; i < 4; i++) { + if (old_high.f[i] != entry->high.f[i] || + old_low.f[i] != entry->low.f[i]) + return true; + } + + return false; +} + +/* Coordinates finding the uses of the ssa_def corresponding to the entry + * and sticking them in the ssa_worklist. + * Should be run on every lattice_entry that we change the information of. + */ +static void +coordinate_uses(lattice_entry *entry, value_range_state *state) +{ + nir_foreach_use(entry->ssa_def, src) { + nir_instr *user = src->ssa->parent_instr; + + /* No point in checking the use if it is not yet found reachable */ + if (!is_block_reachable(user->block, state)) + continue; + + nir_ssa_def *def = nir_instr_get_dest_ssa_def(user); + + /* If it is overdefined we want to push it to head of the list. + * That way we will propagate those faster, avoiding visiting + * ssa-defs with overdefined sources multiple times. */ + if (is_entry_overdefined(entry)) { + nir_ssa_def_worklist_push_head(state->ssa_worklist, def); + } else { + nir_ssa_def_worklist_push_tail(state->ssa_worklist, def); + } + } + + nir_foreach_if_use(entry->ssa_def, src) { + /* If the condition was used for an if then we should do something + * about the if to "push our range" into the then and else branch + * by inserting a copy in each of the blocks where we apply the + * range implied by the if-statement. + * + * XXX: + * We should make sure we add one, the other, or both branches to + * the block worklist, as is implied by the if-statement. + * Here is probably the right place to do that as there is no + * guarantee that the conditional statement will have been processed + * before the "get_following_if" in the block-pass is run, and so we + * may end up not adding a branch that we should've added. + * This may give us some headache as the "find out what the result + * of this divergence is" may not have been resolved before we end + * up adding both paths to the list. However, that may not be an + * issue as the if will be resolved as constant if that's the case, + * and the pass will eventually be repeated without those blocks + * due to the dead control flow optimization. + */ + nir_if *if_statement = src.parent_if; + + nir_cf_node *then_node = nir_if_first_then_node(if_statement); + nir_cf_node *else_node = nir_if_first_else_node(if_statement); + + nir_block *then_block = nir_cf_node_as_block(then_node); + nir_block *else_block = nir_cf_node_as_block(else_node); + + if (is_entry_true(entry)) { + nir_block_worklist_push_tail(state.block_worklist, then_block); + continue; + } + + if (is_entry_false(entry)) { + nir_block_worklist_push_tail(state.block_worklist, else_block); + continue; + } + + if (is_entry_overdefined(entry)) { + nir_block_worklist_push_tail(state.block_worklist, then_block); + nir_block_worklist_push_tail(state.block_worklist, else_block); + continue; + } + } +} + +/* On the first run of a block we want to always check all the uses + * of the instructions that we process. + */ +static void +evaluate_block(nir_block *block, value_range_state *state) +{ + nir_foreach_instr_safe(block, instr) { + nir_ssa_def *def = nir_instr_get_dest_ssa_def(instr); + lattice_entry *entry = get_lattice_entry(def, state); + + /* If the entry stems from a loop then we don't yet support processing + * it, so we skip those and go straight to finding the users. + * This because it's the first time the def is being checked. + */ + if (!entry->in_loop) + evaluate_ssa_def(def, state); + + coordinate_uses(get_lattice_entry(def, state), state); + } +} + +static bool +nir_opt_value_range_impl(nir_function_impl *impl, nir_shader *shader) +{ + /* We might want to run a pass to insert "pi-nodes" into the + * ssa-tree before running the pass. This is essentially just + * a mov x2 = x1 that we use to have something to "store" the + * range implied by things like if's. + * This will also lead to a need of inserting more phi-nodes, + * as one gets variables that diverge and then converge. + * + * x1 = ....; [-unlimited, unlimited] + * if (x1 < 7) + * x2 = x1; [-unlimited, 7] + * | + * | + * else + * x3 = x1; [7, unlimited] + * | + * | + * x4 = phi(x2, x3); + */ + + bool progress = false; + + value_range_state state; + lattice_entry *entries = ralloc_array(state.mem_ctx, lattice_entry, + impl->ssa_alloc); + + state.impl = impl; + state.entries = entries; + state.shader = shader; + nir_block_worklist_init(state.block_worklist, impl->num_blocks, + state.mem_ctx); + nir_ssa_def_worklist_init(state.ssa_worklist, impl->ssa_alloc, + state.mem_ctx); + state.reachable_blocks = rzalloc_array(state.mem_ctx, BITSET_WORD, + BITSET_WORDS(state.impl->ssa_alloc)); + + /* Initialize all lattice entries. We want to mark them as in a loop + * if they are, to simplify checking for this later on. */ + foreach_list_typed_safe(nir_cf_node, node, node, &impl->body) { + switch (node->type) { + case nir_cf_node_block: + initialize_block(nir_cf_node_as_block(node), &state); + break; + case nir_cf_node_if: + nir_foreach_block_in_cf_node(node, initialize_block, &state); + break; + case nir_cf_node_loop: + nir_foreach_block_in_cf_node(node, initialize_block_as_in_loop, &state); + break; + case nir_cf_node_function: + /* XXX: Well, we don't want these, and currently we inline the world. + * Should probably just bail with a lot of noise if we hit this. + */ + break; + } + } + + nir_block_worklist_push_head(state.block_worklist, impl->start_block); + + /* Process the work lists until they are empty */ + while (state.block_worklist->count > 0 || + state.ssa_worklist->count > 0) { + + /* Process the instruction work list + * This doesn't do things exactly like in the paper. + * Instead of storing the instructions that have changed and processing + * each user we are instead adding to the list all the users of + * instructions that have changed. In practice there is no difference, + * apart from dealing with uses is moved out to a separate function. + */ + while (state.ssa_worklist->count > 0) { + + /* All instructions in the list are here because + * we got new information about the range of an operand. + * + * XXX: + * If the instruction is overdefined we don't need to process + * it as it has reached the "lowest status", and therefore + * there should be no way it can be elevated again. + * Exceptions to this rule are things like "&&", "||", "min" or max". + */ + nir_ssa_def *def = nir_ssa_def_worklist_pop_head(state.ssa_worklist); + + /* If the def is in a loop we don't want to do anything. + * (The instruction is initialized as best we can.) + * When the block it's in is added to the worklist the entry + * will get processed, and so we will evaluate its users. + */ + if (get_lattice_entry(def, &state)->in_loop) + continue; + + /* Evaluate the ssa_def. If it has changed then add users to list */ + if (evaluate_ssa_def(def, &state)) + coordinate_uses(get_lattice_entry(def, &state), &state); + } + + /* Process the basic block work list */ + while (state.block_worklist->count > 0) { + nir_block *block = nir_block_worklist_pop_head(state.block_worklist); + + /* Since we have our "coordinate_uses" function that also + * handles phi nodes we can skip the block if it is already set + * as reachable, as the phi's will get automagically added to the + * ssa-def-worklist as they are users of the defs. + */ + if (is_block_reachable(block, &state)) + continue; + + /* Block has been determined to be reachable, mark it */ + mark_block_reachable(block, &state); + + /* XXX: + * We don't yet handle loops. They are initialized to the best + * of our knowledge in a small pass at the start. + * Handling loops here is not necessary as we bail on all "in-loop" + * ssa-defs, but it's just plain dumb to loop over all defs in a + * block when we know we will bail on each and every one of them. + * This is also an issue further down in this section. + * A possibility is to add a "is-in-loop" bitset for blocks. + */ + + /* Evaluate all phi's and expressions of the block. */ + evaluate_block(block, &state); + + /* If the block has only one successor then add it to the worklist */ + if ((block->successors[0] != NULL) && + (block->successors[1] == NULL)) { + nir_block_worklist_push_tail(state.block_worklist, + block->successors[0]); + continue; + } + + /* If the above failed we have ended up in a block that is either + * the last cf_node, or it is an endless loop. The case with + * the block being the last node is easy enough to test for, + * but how we're gonna deal with an endless loop? + */ + if (nir_cf_node_is_last(&block->cf_node)) { + /* This is the last node. This probably doesn't mean that + * the pass is done with its job of analyzing. + */ + } + } + } + + /* We can now traverse over blocks and delete those that + * are still marked as unreachable. If we delete a basic block + * we need to first rewrite the phi's that use the results from + * the BB. + * + * This may however not be without issues. + * The following is an excerpt from LLVM SCCP: + * + * "ResolvedUndefsIn - While solving the dataflow for a function, we assume + * that branches on undef values cannot reach any of their successors. + * However, this is not a safe assumption. After we solve dataflow, this + * method should be use to handle this. If this returns true, the solver + * should be rerun. + * + * This method handles this by finding an unresolved branch and marking + * one of the edges from the block as being feasible, even though the + * condition doesn't say it would be. This allows SCCP to find the rest + * of the CFG and only slightly pessimizes the analysis results + * (by marking one, potentially infeasible, edge feasible). This cannot + * usefully modify the constraints on the condition of the branch, + * as that would impact other users of the value. + * + * This scan also checks for values that use undefs, whose results are + * actually defined. For example, 'zext i8 undef to i32' should produce + * all zeros conservatively, as "(zext i8 X -> i32) & 0xFF00" must always + * return zero, even if X isn't defined. " + * + * For now we want to leave the blocks in place, as when the + * conditional for the block that is unreachable is set as a constant + * Connor's pass for removing dead control flow will come along + * and clean up the blocks that can not be reached. + */ + + /* Check for all values that are proved to be constant, + * and replace them with their constant instruction counterpart. */ + for (unsigned i = 0; i < state.impl->ssa_alloc; i++) { + lattice_entry *entry = &(state.entries[i]); + + /* If it's a constant thats not a load_const then make a load_const + * instruction and replace the uses of the old instr with that. + */ + if (is_lattice_entry_constant(entry) && + entry->ssa_def->parent_instr->type != nir_instr_type_load_const) { + + nir_load_const_instr *instr = + nir_load_const_instr_create(state.shader, + entry->ssa_def->num_components); + + nir_instr_insert_before(entry->ssa_def->parent_instr, + &instr->instr); + + nir_src src = nir_src_for_ssa(&(instr->def)); + nir_ssa_def_rewrite_uses(entry->ssa_def, src, + state.mem_ctx); + + nir_instr_remove(entry->ssa_def->parent_instr); + progress = true; + } + + if (entry->can_be_predetermined) { + /* We have found that this entry can be predetermined. + * However it is not constant. This calls for a bit more + * difficult solving of the expression. + * Things like min/max with ranges that do not intersect + * may end up here. Also things that can be determined due to sat, + * and things that are known to be useless. + * + * A list of functions to try out might be the simplest idea here. + * Basically a checklist of things that we can remove if we are lucky + * with the range, and work our way through that. + */ + } + } + return progress; +} + +bool +nir_opt_value_range(nir_shader *shader) +{ + bool progress = false; + nir_foreach_overload(shader, overload) { + if (overload->impl && nir_opt_value_range_impl(overload->impl, shader)) + progress = true; + } + + return progress; +} -- 2.4.4 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev