From: Iago Toral Quiroga <ito...@igalia.com>

For now, this pass can handle SSBO load combines within the same block.

This is useful, for example, to make code such as this:
buffer SSBO {
   mat4 sm4;
};

uniform mat4 um4;

void main() {
   sm4 *= um4;
}

go from 16 SSBO loads down to only 4.

v2: (elima) Updated to rebase against recent trunk, removed separation of
    intrinsics into direct/indirect. Prepared code to support other
    intrinsic groups (shared-vars and images) in the future. Made some
    functions inline.

v3: (elima) Implemented the instruction cache as a list instead of
    (ab)using nir_instr_set.
---
 src/compiler/Makefile.sources           |   1 +
 src/compiler/nir/nir.h                  |   2 +
 src/compiler/nir/nir_opt_load_combine.c | 447 ++++++++++++++++++++++++++++++++
 3 files changed, 450 insertions(+)
 create mode 100644 src/compiler/nir/nir_opt_load_combine.c

diff --git a/src/compiler/Makefile.sources b/src/compiler/Makefile.sources
index 6f09abf..3f9773c 100644
--- a/src/compiler/Makefile.sources
+++ b/src/compiler/Makefile.sources
@@ -216,6 +216,7 @@ NIR_FILES = \
 	nir/nir_opt_dead_cf.c \
 	nir/nir_opt_gcm.c \
 	nir/nir_opt_global_to_local.c \
+	nir/nir_opt_load_combine.c \
 	nir/nir_opt_peephole_select.c \
 	nir/nir_opt_remove_phis.c \
 	nir/nir_opt_undef.c \
diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index 8067b41..1d96966 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -2330,6 +2330,8 @@ bool nir_opt_dead_cf(nir_shader *shader);
 
 void nir_opt_gcm(nir_shader *shader);
 
+bool nir_opt_load_combine(nir_shader *shader);
+
 bool nir_opt_peephole_select(nir_shader *shader);
 
 bool nir_opt_remove_phis(nir_shader *shader);
diff --git a/src/compiler/nir/nir_opt_load_combine.c b/src/compiler/nir/nir_opt_load_combine.c
new file mode 100644
index 0000000..25354c8
--- /dev/null
+++ b/src/compiler/nir/nir_opt_load_combine.c
@@ -0,0 +1,447 @@
+/*
+ * Copyright © 2016 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/**
+ * Implements a load-combine pass for load/store instructions. Similar to a
+ * CSE pass, but it needs to consider invalidation of cached loads by stores
+ * or memory barriers. It only works on local blocks for now.
+ */
+
+#include "nir.h"
+
+/*
+ * SSBO stores won't invalidate image loads, for example, so we want to
+ * classify load/store operations in groups and only invalidate / reuse
+ * intrinsics within the same group.
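+ *
+ * For instance, a nir_intrinsic_store_ssbo only needs to invalidate cached
+ * SSBO loads (INTRINSIC_GROUP_SSBO), while a nir_intrinsic_memory_barrier
+ * affects every group (INTRINSIC_GROUP_ALL).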
+ */
+enum intrinsic_groups {
+   INTRINSIC_GROUP_NONE = 0,
+   INTRINSIC_GROUP_ALL,
+   INTRINSIC_GROUP_SSBO
+};
+
+struct cache_node {
+   struct list_head list;
+   nir_instr *instr;
+};
+
+/* SSBO load/store */
+static bool
+is_atomic_ssbo(nir_intrinsic_instr *intrinsic)
+{
+   switch (intrinsic->intrinsic) {
+   case nir_intrinsic_ssbo_atomic_add:
+   case nir_intrinsic_ssbo_atomic_imin:
+   case nir_intrinsic_ssbo_atomic_umin:
+   case nir_intrinsic_ssbo_atomic_imax:
+   case nir_intrinsic_ssbo_atomic_umax:
+   case nir_intrinsic_ssbo_atomic_and:
+   case nir_intrinsic_ssbo_atomic_or:
+   case nir_intrinsic_ssbo_atomic_xor:
+   case nir_intrinsic_ssbo_atomic_exchange:
+   case nir_intrinsic_ssbo_atomic_comp_swap:
+      return true;
+   default:
+      return false;
+   }
+}
+
+static inline bool
+is_store_ssbo(nir_intrinsic_instr *intrinsic)
+{
+   return intrinsic->intrinsic == nir_intrinsic_store_ssbo ||
+          is_atomic_ssbo(intrinsic);
+}
+
+static inline bool
+is_load_ssbo(nir_intrinsic_instr *intrinsic)
+{
+   return intrinsic->intrinsic == nir_intrinsic_load_ssbo;
+}
+
+static inline bool
+is_memory_barrier_buffer(nir_intrinsic_instr *intrinsic)
+{
+   return intrinsic->intrinsic == nir_intrinsic_memory_barrier_buffer;
+}
+
+/*
+ * General load/store functions: we'll add more groups to this as needed.
+ * For now we only support SSBOs.
+ */
+static inline bool
+is_store(nir_intrinsic_instr *intrinsic)
+{
+   return is_store_ssbo(intrinsic);
+}
+
+static inline bool
+is_load(nir_intrinsic_instr *intrinsic)
+{
+   return is_load_ssbo(intrinsic);
+}
+
+static inline bool
+is_atomic(nir_intrinsic_instr *intrinsic)
+{
+   return is_atomic_ssbo(intrinsic);
+}
+
+static inline bool
+is_memory_barrier(nir_intrinsic_instr *intrinsic)
+{
+   return intrinsic->intrinsic == nir_intrinsic_memory_barrier ||
+          is_memory_barrier_buffer(intrinsic);
+}
+
+static unsigned
+intrinsic_group(nir_intrinsic_instr *intrinsic)
+{
+   if (intrinsic->intrinsic == nir_intrinsic_memory_barrier)
+      return INTRINSIC_GROUP_ALL;
+   else if (is_load_ssbo(intrinsic) || is_store_ssbo(intrinsic) ||
+            is_memory_barrier_buffer(intrinsic))
+      return INTRINSIC_GROUP_SSBO;
+   else
+      return INTRINSIC_GROUP_NONE;
+}
+
+static bool
+intrinsic_group_match(nir_intrinsic_instr *intrinsic1,
+                      nir_intrinsic_instr *intrinsic2)
+{
+   int group1 = intrinsic_group(intrinsic1);
+   int group2 = intrinsic_group(intrinsic2);
+
+   return group1 == INTRINSIC_GROUP_ALL || group2 == INTRINSIC_GROUP_ALL ||
+          group1 == group2;
+}
+
+static void
+cache_add(struct cache_node *cache, nir_instr *instr)
+{
+   struct cache_node *node = ralloc(NULL, struct cache_node);
+   node->instr = instr;
+   list_addtail(&node->list, &cache->list);
+}
+
+static void
+cache_clear(struct cache_node *cache)
+{
+   /* Use the _safe variant because we free each node as we walk the list. */
+   list_for_each_entry_safe(struct cache_node, item, &cache->list, list) {
+      ralloc_free(item);
+   }
+
+   list_inithead(&cache->list);
+}
+
+/**
+ * Returns true if a nir_src is direct, defined here as an SSA value
+ * whose parent instruction is a load_const.
+ */
+static bool
+nir_src_is_direct(nir_src *src)
+{
+   if (!src->is_ssa)
+      return false;
+
+   nir_instr *parent_instr = src->ssa->parent_instr;
+   if (parent_instr->type != nir_instr_type_load_const)
+      return false;
+
+   return true;
+}
+
+/**
+ * Gets the block and offset of a load/store instruction.
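+ * For nir_intrinsic_load_ssbo and the SSBO atomics the block is src[0] and
+ * the offset is src[1]; for nir_intrinsic_store_ssbo they are src[1] and
+ * src[2] (src[0] is the value to write).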
+ *
+ * @instr: the intrinsic load/store operation
+ * @block: the output block
+ * @offset: the output offset
+ */
+static void
+get_load_store_address(nir_intrinsic_instr *instr,
+                       nir_src **block,
+                       nir_src **offset)
+{
+   int block_index = -1;
+   int offset_index = -1;
+
+   assert(block && offset);
+
+   switch (instr->intrinsic) {
+   /* SSBO */
+   case nir_intrinsic_store_ssbo:
+      block_index = 1;
+      offset_index = 2;
+      break;
+
+   case nir_intrinsic_load_ssbo:
+   case nir_intrinsic_ssbo_atomic_add:
+   case nir_intrinsic_ssbo_atomic_imin:
+   case nir_intrinsic_ssbo_atomic_umin:
+   case nir_intrinsic_ssbo_atomic_imax:
+   case nir_intrinsic_ssbo_atomic_umax:
+   case nir_intrinsic_ssbo_atomic_and:
+   case nir_intrinsic_ssbo_atomic_or:
+   case nir_intrinsic_ssbo_atomic_xor:
+   case nir_intrinsic_ssbo_atomic_exchange:
+   case nir_intrinsic_ssbo_atomic_comp_swap:
+      block_index = 0;
+      offset_index = 1;
+      break;
+
+   default:
+      assert(!"not implemented");
+   }
+
+   assert(block_index >= 0 && offset_index >= 0);
+
+   *block = &instr->src[block_index];
+   *offset = &instr->src[offset_index];
+}
+
+/**
+ * Determines whether two intrinsic instructions conflict with each other,
+ * meaning that a) they access the same memory area, or b) a non-conflict
+ * cannot be determined (because at least one access is indirect).
+ *
+ * @full_match serves as an output flag to signal that the conflict occurred
+ * because both instructions access the exact same memory region. This is
+ * used in the pass to know that two instructions are safe to combine.
+ *
+ * Returns true upon conflict, false otherwise.
+ */
+static bool
+detect_memory_access_conflict(nir_intrinsic_instr *instr1,
+                              nir_intrinsic_instr *instr2,
+                              bool *full_match)
+{
+   nir_src *instr1_block = NULL;
+   nir_src *instr1_offset = NULL;
+   nir_src *instr2_block = NULL;
+   nir_src *instr2_offset = NULL;
+   bool blocks_match = false;
+   bool offsets_match = false;
+
+   if (full_match)
+      *full_match = false;
+
+   /* If the intrinsic groups don't match, there can't be any conflict. */
+   if (!intrinsic_group_match(instr1, instr2))
+      return false;
+
+   get_load_store_address(instr1, &instr1_block, &instr1_offset);
+   get_load_store_address(instr2, &instr2_block, &instr2_offset);
+
+   /* There is a conflict if the blocks (or the offsets) of the two
+    * instructions are not both direct or both indirect. Otherwise, there
+    * is a conflict if both the blocks and the offsets match.
+    */
+
+   /* For SSBOs the block is an SSA value, but it can still be direct
+    * if it is defined by a load_const instruction.
+    */
+   if (nir_src_is_direct(instr1_block) != nir_src_is_direct(instr2_block))
+      return true;
+
+   blocks_match = nir_srcs_equal(*instr1_block, *instr2_block);
+
+   /* For SSBOs the offset is an SSA value, but it can still be direct
+    * if it is defined by a load_const instruction.
+    */
+   if (nir_src_is_direct(instr1_offset) != nir_src_is_direct(instr2_offset))
+      return true;
+
+   offsets_match = nir_srcs_equal(*instr1_offset, *instr2_offset);
+
+   /* Finally, if both the blocks and the offsets match, it is a conflict. */
+   if (offsets_match && blocks_match) {
+      if (full_match)
+         *full_match = true;
+
+      return true;
+   }
+
+   return false;
+}
+
+/**
+ * Traverses the set of cached load/store intrinsics and invalidates all
+ * that conflict with @store.
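+ * This is called whenever a store or atomic is processed, so that later
+ * loads are not combined with loads that were cached before a potentially
+ * aliasing write.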
+ */
+static void
+cache_invalidate_for_store(struct cache_node *cache,
+                           nir_intrinsic_instr *store)
+{
+   assert(is_store(store));
+
+   list_for_each_entry_safe(struct cache_node, item, &cache->list, list) {
+      nir_instr *instr = item->instr;
+      assert(instr->type == nir_instr_type_intrinsic);
+
+      nir_intrinsic_instr *cached = nir_instr_as_intrinsic(instr);
+
+      if (detect_memory_access_conflict(store, cached, NULL)) {
+         /* Remove the cached instruction from the list. */
+         list_del(&item->list);
+         ralloc_free(item);
+      }
+   }
+}
+
+/**
+ * Traverses the set of cached load/store intrinsics and tries to
+ * rewrite the given load instruction with a previous compatible load.
+ */
+static bool
+rewrite_load_with_load(struct cache_node *cache,
+                       nir_intrinsic_instr *load)
+{
+   assert(is_load(load));
+
+   list_for_each_entry(struct cache_node, item, &cache->list, list) {
+      nir_instr *instr = item->instr;
+      assert(instr->type == nir_instr_type_intrinsic);
+
+      nir_intrinsic_instr *prev_load = nir_instr_as_intrinsic(instr);
+      if (!is_load(prev_load))
+         continue;
+
+      /* Both intrinsics must access the same memory area (block, offset,
+       * etc).
+       *
+       * Here we reuse detect_memory_access_conflict(), which meets this
+       * purpose semantically, except that we need to know whether the
+       * conflict happened because the blocks and offsets match.
+       */
+      bool blocks_and_offsets_match = false;
+      if (!detect_memory_access_conflict(load, prev_load,
+                                         &blocks_and_offsets_match)) {
+         continue;
+      }
+
+      if (blocks_and_offsets_match) {
+         /* Rewrite all uses of the new load with the cached load. */
+         nir_ssa_def *def = &load->dest.ssa;
+         nir_ssa_def *new_def = &prev_load->dest.ssa;
+         nir_ssa_def_rewrite_uses(def, nir_src_for_ssa(new_def));
+
+         return true;
+      }
+   }
+
+   cache_add(cache, &load->instr);
+
+   return false;
+}
+
+/**
+ * Traverses the set of cached load/store intrinsics and removes those
+ * whose intrinsic group matches @group.
+ */
+static void
+cache_invalidate_for_group(struct cache_node *cache, unsigned group)
+{
+   list_for_each_entry_safe(struct cache_node, item, &cache->list, list) {
+      nir_instr *instr = item->instr;
+      assert(instr->type == nir_instr_type_intrinsic);
+
+      nir_intrinsic_instr *cached = nir_instr_as_intrinsic(instr);
+
+      if (group == INTRINSIC_GROUP_ALL || intrinsic_group(cached) == group) {
+         list_del(&item->list);
+         ralloc_free(item);
+      }
+   }
+}
+
+static bool
+load_combine_block(nir_block *block)
+{
+   bool progress = false;
+
+   /* This pass only works on local blocks for now, so we create and destroy
+    * the instruction cache with each block.
+    */
+   struct cache_node cache = {0};
+   list_inithead(&cache.list);
+
+   nir_foreach_instr_safe(block, instr) {
+      if (instr->type != nir_instr_type_intrinsic)
+         continue;
+
+      nir_intrinsic_instr *intrinsic = nir_instr_as_intrinsic(instr);
+      if (is_load(intrinsic)) {
+         /* Try to rewrite with a previous load. */
+         if (rewrite_load_with_load(&cache, intrinsic)) {
+            nir_instr_remove(instr);
+            progress = true;
+         }
+      } else if (is_store(intrinsic)) {
+         /* Invalidate conflicting loads/stores. */
+         cache_invalidate_for_store(&cache, intrinsic);
+      } else if (is_memory_barrier(intrinsic)) {
+         /* If we see a memory barrier we have to invalidate all cached
+          * load/store operations from the same intrinsic group.
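+          * For example, memory_barrier_buffer only drops the cached SSBO
+          * entries, while a full memory_barrier drops every cached entry.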
+          */
+         cache_invalidate_for_group(&cache, intrinsic_group(intrinsic));
+      }
+   }
+
+   cache_clear(&cache);
+
+   for (unsigned i = 0; i < block->num_dom_children; i++) {
+      nir_block *child = block->dom_children[i];
+      progress |= load_combine_block(child);
+   }
+
+   return progress;
+}
+
+static bool
+nir_opt_load_combine_impl(nir_function_impl *impl)
+{
+   nir_metadata_require(impl, nir_metadata_dominance);
+
+   bool progress = load_combine_block(nir_start_block(impl));
+
+   if (progress)
+      nir_metadata_preserve(impl, nir_metadata_block_index |
+                                  nir_metadata_dominance);
+
+   return progress;
+}
+
+bool
+nir_opt_load_combine(nir_shader *shader)
+{
+   bool progress = false;
+
+   nir_foreach_function(shader, function) {
+      if (function->impl)
+         progress |= nir_opt_load_combine_impl(function->impl);
+   }
+
+   return progress;
+}
-- 
2.7.0

_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev