On 14/11/2018 00:23, Jason Ekstrand wrote: > We have a bunch of code to do this in the back-end compiler but it's > fairly specific to typed surface messages and the way we emit them. > This breaks it out into NIR where it's easier to do things a bit more > generally. It also means we can easily share the code between the
vec4 Reviewed-by: Samuel Iglesias Gonsálvez <sigles...@igalia.com> > and FS back-ends if we wish. > --- > src/intel/Makefile.sources | 1 + > src/intel/compiler/brw_fs_nir.cpp | 381 ++++-------------- > src/intel/compiler/brw_nir.c | 2 + > src/intel/compiler/brw_nir.h | 2 + > .../brw_nir_lower_mem_access_bit_sizes.c | 313 ++++++++++++++ > src/intel/compiler/brw_vec4_nir.cpp | 126 +----- > src/intel/compiler/meson.build | 1 + > 7 files changed, 421 insertions(+), 405 deletions(-) > create mode 100644 src/intel/compiler/brw_nir_lower_mem_access_bit_sizes.c > > diff --git a/src/intel/Makefile.sources b/src/intel/Makefile.sources > index 4da887f7ed2..5e7d32293b7 100644 > --- a/src/intel/Makefile.sources > +++ b/src/intel/Makefile.sources > @@ -85,6 +85,7 @@ COMPILER_FILES = \ > compiler/brw_nir_attribute_workarounds.c \ > compiler/brw_nir_lower_cs_intrinsics.c \ > compiler/brw_nir_lower_image_load_store.c \ > + compiler/brw_nir_lower_mem_access_bit_sizes.c \ > compiler/brw_nir_opt_peephole_ffma.c \ > compiler/brw_nir_tcs_workarounds.c \ > compiler/brw_packed_float.c \ > diff --git a/src/intel/compiler/brw_fs_nir.cpp > b/src/intel/compiler/brw_fs_nir.cpp > index 2b36171136e..84d0c6be6c3 100644 > --- a/src/intel/compiler/brw_fs_nir.cpp > +++ b/src/intel/compiler/brw_fs_nir.cpp > @@ -26,6 +26,7 @@ > #include "brw_fs_surface_builder.h" > #include "brw_nir.h" > #include "util/u_math.h" > +#include "util/bitscan.h" > > using namespace brw; > using namespace brw::surface_access; > @@ -2250,107 +2251,6 @@ fs_visitor::get_indirect_offset(nir_intrinsic_instr > *instr) > return get_nir_src(*offset_src); > } > > -static void > -do_untyped_vector_read(const fs_builder &bld, > - const fs_reg dest, > - const fs_reg surf_index, > - const fs_reg offset_reg, > - unsigned num_components) > -{ > - if (type_sz(dest.type) <= 2) { > - assert(dest.stride == 1); > - boolean is_const_offset = offset_reg.file == BRW_IMMEDIATE_VALUE; > - > - if (is_const_offset) { > - uint32_t start = offset_reg.ud & ~3; > - uint32_t end = offset_reg.ud + num_components * type_sz(dest.type); > - end = ALIGN(end, 4); > - assert (end - start <= 16); > - > - /* At this point we have 16-bit component/s that have constant > - * offset aligned to 4-bytes that can be read with untyped_reads. > - * untyped_read message requires 32-bit aligned offsets. > - */ > - unsigned first_component = (offset_reg.ud & 3) / type_sz(dest.type); > - unsigned num_components_32bit = (end - start) / 4; > - > - fs_reg read_result = > - emit_untyped_read(bld, surf_index, brw_imm_ud(start), > - 1 /* dims */, > - num_components_32bit, > - BRW_PREDICATE_NONE); > - shuffle_from_32bit_read(bld, dest, read_result, first_component, > - num_components); > - } else { > - fs_reg read_offset = bld.vgrf(BRW_REGISTER_TYPE_UD); > - for (unsigned i = 0; i < num_components; i++) { > - if (i == 0) { > - bld.MOV(read_offset, offset_reg); > - } else { > - bld.ADD(read_offset, offset_reg, > - brw_imm_ud(i * type_sz(dest.type))); > - } > - /* Non constant offsets are not guaranteed to be aligned 32-bits > - * so they are read using one byte_scattered_read message > - * for each component. 
> - */ > - fs_reg read_result = > - emit_byte_scattered_read(bld, surf_index, read_offset, > - 1 /* dims */, 1, > - type_sz(dest.type) * 8 /* bit_size > */, > - BRW_PREDICATE_NONE); > - bld.MOV(offset(dest, bld, i), > - subscript (read_result, dest.type, 0)); > - } > - } > - } else if (type_sz(dest.type) == 4) { > - fs_reg read_result = emit_untyped_read(bld, surf_index, offset_reg, > - 1 /* dims */, > - num_components, > - BRW_PREDICATE_NONE); > - read_result.type = dest.type; > - for (unsigned i = 0; i < num_components; i++) > - bld.MOV(offset(dest, bld, i), offset(read_result, bld, i)); > - } else if (type_sz(dest.type) == 8) { > - /* Reading a dvec, so we need to: > - * > - * 1. Multiply num_components by 2, to account for the fact that we > - * need to read 64-bit components. > - * 2. Shuffle the result of the load to form valid 64-bit elements > - * 3. Emit a second load (for components z/w) if needed. > - */ > - fs_reg read_offset = bld.vgrf(BRW_REGISTER_TYPE_UD); > - bld.MOV(read_offset, offset_reg); > - > - int iters = num_components <= 2 ? 1 : 2; > - > - /* Load the dvec, the first iteration loads components x/y, the second > - * iteration, if needed, loads components z/w > - */ > - for (int it = 0; it < iters; it++) { > - /* Compute number of components to read in this iteration */ > - int iter_components = MIN2(2, num_components); > - num_components -= iter_components; > - > - /* Read. Since this message reads 32-bit components, we need to > - * read twice as many components. > - */ > - fs_reg read_result = emit_untyped_read(bld, surf_index, read_offset, > - 1 /* dims */, > - iter_components * 2, > - BRW_PREDICATE_NONE); > - > - /* Shuffle the 32-bit load result into valid 64-bit data */ > - shuffle_from_32bit_read(bld, offset(dest, bld, it * 2), > - read_result, 0, iter_components); > - > - bld.ADD(read_offset, read_offset, brw_imm_ud(16)); > - } > - } else { > - unreachable("Unsupported type"); > - } > -} > - > void > fs_visitor::nir_emit_vs_intrinsic(const fs_builder &bld, > nir_intrinsic_instr *instr) > @@ -3572,93 +3472,64 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder > &bld, > > case nir_intrinsic_load_shared: { > assert(devinfo->gen >= 7); > + assert(stage == MESA_SHADER_COMPUTE); > > - fs_reg surf_index = brw_imm_ud(GEN7_BTI_SLM); > + const unsigned bit_size = nir_dest_bit_size(instr->dest); > + fs_reg offset_reg = retype(get_nir_src(instr->src[0]), > + BRW_REGISTER_TYPE_UD); > > - /* Get the offset to read from */ > - fs_reg offset_reg; > - if (nir_src_is_const(instr->src[0])) { > - offset_reg = brw_imm_ud(instr->const_index[0] + > - nir_src_as_uint(instr->src[0])); > - } else { > - offset_reg = vgrf(glsl_type::uint_type); > - bld.ADD(offset_reg, > - retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_UD), > - brw_imm_ud(instr->const_index[0])); > - } > + /* Make dest unsigned because that's what the temporary will be */ > + dest.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD); > > /* Read the vector */ > - do_untyped_vector_read(bld, dest, surf_index, offset_reg, > - instr->num_components); > + if (nir_intrinsic_align(instr) >= 4) { > + assert(nir_dest_bit_size(instr->dest) == 32); > + fs_reg read_result = emit_untyped_read(bld, > brw_imm_ud(GEN7_BTI_SLM), > + offset_reg, 1 /* dims */, > + instr->num_components, > + BRW_PREDICATE_NONE); > + for (unsigned i = 0; i < instr->num_components; i++) > + bld.MOV(offset(dest, bld, i), offset(read_result, bld, i)); > + } else { > + assert(nir_dest_bit_size(instr->dest) <= 32); > + 
assert(nir_dest_num_components(instr->dest) == 1); > + fs_reg read_result = > + emit_byte_scattered_read(bld, brw_imm_ud(GEN7_BTI_SLM), > offset_reg, > + 1 /* dims */, 1, bit_size, > + BRW_PREDICATE_NONE); > + bld.MOV(dest, read_result); > + } > break; > } > > case nir_intrinsic_store_shared: { > assert(devinfo->gen >= 7); > + assert(stage == MESA_SHADER_COMPUTE); > > - /* Block index */ > - fs_reg surf_index = brw_imm_ud(GEN7_BTI_SLM); > - > - /* Value */ > + const unsigned bit_size = nir_src_bit_size(instr->src[0]); > fs_reg val_reg = get_nir_src(instr->src[0]); > + fs_reg offset_reg = retype(get_nir_src(instr->src[1]), > + BRW_REGISTER_TYPE_UD); > > - /* Writemask */ > - unsigned writemask = instr->const_index[1]; > - > - /* get_nir_src() retypes to integer. Be wary of 64-bit types though > - * since the untyped writes below operate in units of 32-bits, which > - * means that we need to write twice as many components each time. > - * Also, we have to suffle 64-bit data to be in the appropriate layout > - * expected by our 32-bit write messages. > - */ > - unsigned type_size = 4; > - if (nir_src_bit_size(instr->src[0]) == 64) { > - type_size = 8; > - val_reg = shuffle_for_32bit_write(bld, val_reg, 0, > - instr->num_components); > - } > - > - unsigned type_slots = type_size / 4; > - > - /* Combine groups of consecutive enabled channels in one write > - * message. We use ffs to find the first enabled channel and then ffs > on > - * the bit-inverse, down-shifted writemask to determine the length of > - * the block of enabled bits. > - */ > - while (writemask) { > - unsigned first_component = ffs(writemask) - 1; > - unsigned length = ffs(~(writemask >> first_component)) - 1; > - > - /* We can't write more than 2 64-bit components at once. Limit the > - * length of the write to what we can do and let the next iteration > - * handle the rest > - */ > - if (type_size > 4) > - length = MIN2(2, length); > - > - fs_reg offset_reg; > - if (nir_src_is_const(instr->src[1])) { > - offset_reg = brw_imm_ud(instr->const_index[0] + > - nir_src_as_uint(instr->src[1]) + > - type_size * first_component); > - } else { > - offset_reg = vgrf(glsl_type::uint_type); > - bld.ADD(offset_reg, > - retype(get_nir_src(instr->src[1]), BRW_REGISTER_TYPE_UD), > - brw_imm_ud(instr->const_index[0] + type_size * > first_component)); > - } > + val_reg.type = brw_reg_type_from_bit_size(bit_size, > BRW_REGISTER_TYPE_UD); > > - emit_untyped_write(bld, surf_index, offset_reg, > - offset(val_reg, bld, first_component * > type_slots), > - 1 /* dims */, length * type_slots, > + assert(nir_intrinsic_write_mask(instr) == > + (1 << instr->num_components) - 1); > + if (nir_intrinsic_align(instr) >= 4) { > + assert(nir_src_bit_size(instr->src[0]) == 32); > + assert(nir_src_num_components(instr->src[0]) <= 4); > + emit_untyped_write(bld, brw_imm_ud(GEN7_BTI_SLM), offset_reg, > val_reg, > + 1 /* dims */, instr->num_components, > BRW_PREDICATE_NONE); > - > - /* Clear the bits in the writemask that we just wrote, then try > - * again to see if more channels are left. 
> - */ > - writemask &= (15 << (first_component + length)); > + } else { > + assert(nir_src_bit_size(instr->src[0]) <= 32); > + assert(nir_src_num_components(instr->src[0]) == 1); > + fs_reg write_src = bld.vgrf(BRW_REGISTER_TYPE_UD); > + bld.MOV(write_src, val_reg); > + emit_byte_scattered_write(bld, brw_imm_ud(GEN7_BTI_SLM), offset_reg, > + write_src, 1 /* dims */, bit_size, > + BRW_PREDICATE_NONE); > } > - > break; > } > > @@ -4155,13 +4026,32 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, > nir_intrinsic_instr *instr > case nir_intrinsic_load_ssbo: { > assert(devinfo->gen >= 7); > > + const unsigned bit_size = nir_dest_bit_size(instr->dest); > fs_reg surf_index = get_nir_ssbo_intrinsic_index(bld, instr); > - fs_reg offset_reg = get_nir_src_imm(instr->src[1]); > + fs_reg offset_reg = retype(get_nir_src(instr->src[1]), > + BRW_REGISTER_TYPE_UD); > > - /* Read the vector */ > - do_untyped_vector_read(bld, dest, surf_index, offset_reg, > - instr->num_components); > + /* Make dest unsigned because that's what the temporary will be */ > + dest.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD); > > + /* Read the vector */ > + if (nir_intrinsic_align(instr) >= 4) { > + assert(nir_dest_bit_size(instr->dest) == 32); > + fs_reg read_result = emit_untyped_read(bld, surf_index, offset_reg, > + 1 /* dims */, > + instr->num_components, > + BRW_PREDICATE_NONE); > + for (unsigned i = 0; i < instr->num_components; i++) > + bld.MOV(offset(dest, bld, i), offset(read_result, bld, i)); > + } else { > + assert(nir_dest_bit_size(instr->dest) <= 32); > + assert(nir_dest_num_components(instr->dest) == 1); > + fs_reg read_result = > + emit_byte_scattered_read(bld, surf_index, offset_reg, > + 1 /* dims */, 1, bit_size, > + BRW_PREDICATE_NONE); > + bld.MOV(dest, read_result); > + } > break; > } > > @@ -4171,125 +4061,30 @@ fs_visitor::nir_emit_intrinsic(const fs_builder > &bld, nir_intrinsic_instr *instr > if (stage == MESA_SHADER_FRAGMENT) > brw_wm_prog_data(prog_data)->has_side_effects = true; > > - fs_reg surf_index = get_nir_ssbo_intrinsic_index(bld, instr); > - > - /* Value */ > + const unsigned bit_size = nir_src_bit_size(instr->src[0]); > fs_reg val_reg = get_nir_src(instr->src[0]); > + fs_reg surf_index = get_nir_ssbo_intrinsic_index(bld, instr); > + fs_reg offset_reg = retype(get_nir_src(instr->src[2]), > + BRW_REGISTER_TYPE_UD); > > - /* Writemask */ > - unsigned writemask = instr->const_index[0]; > - > - /* get_nir_src() retypes to integer. Be wary of 64-bit types though > - * since the untyped writes below operate in units of 32-bits, which > - * means that we need to write twice as many components each time. > - * Also, we have to suffle 64-bit data to be in the appropriate layout > - * expected by our 32-bit write messages. > - */ > - unsigned bit_size = nir_src_bit_size(instr->src[0]); > - unsigned type_size = bit_size / 8; > - > - /* Combine groups of consecutive enabled channels in one write > - * message. We use ffs to find the first enabled channel and then ffs > on > - * the bit-inverse, down-shifted writemask to determine the > num_components > - * of the block of enabled bits. > - */ > - while (writemask) { > - unsigned first_component = ffs(writemask) - 1; > - unsigned num_components = ffs(~(writemask >> first_component)) - 1; > - fs_reg write_src = offset(val_reg, bld, first_component); > - > - if (type_size > 4) { > - /* We can't write more than 2 64-bit components at once. 
Limit > - * the num_components of the write to what we can do and let the > next > - * iteration handle the rest. > - */ > - num_components = MIN2(2, num_components); > - write_src = shuffle_for_32bit_write(bld, write_src, 0, > - num_components); > - } else if (type_size < 4) { > - /* For 16-bit types we pack two consecutive values into a 32-bit > - * word and use an untyped write message. For single values or > not > - * 32-bit-aligned we need to use byte-scattered writes because > - * untyped writes works with 32-bit components with 32-bit > - * alignment. byte_scattered_write messages only support one > - * 16-bit component at a time. As VK_KHR_relaxed_block_layout > - * could be enabled we can not guarantee that not constant > offsets > - * to be 32-bit aligned for 16-bit types. For example an array, > of > - * 16-bit vec3 with array element stride of 6. > - * > - * In the case of 32-bit aligned constant offsets if there is > - * a 3-components vector we submit one untyped-write message > - * of 32-bit (first two components), and one byte-scattered > - * write message (the last component). > - */ > - > - if (!nir_src_is_const(instr->src[2]) || > - ((nir_src_as_uint(instr->src[2]) + > - type_size * first_component) % 4)) { > - /* If we use a .yz writemask we also need to emit 2 > - * byte-scattered write messages because of y-component not > - * being aligned to 32-bit. > - */ > - num_components = 1; > - } else if (num_components * type_size > 4 && > - (num_components * type_size % 4)) { > - /* If the pending components size is not a multiple of 4 bytes > - * we left the not aligned components for following emits of > - * length == 1 with byte_scattered_write. > - */ > - num_components -= (num_components * type_size % 4) / > type_size; > - } else if (num_components * type_size < 4) { > - num_components = 1; > - } > - /* For num_components == 1 we are also shuffling the component > - * because byte scattered writes of 16-bit need values to be > dword > - * aligned. Shuffling only one component would be the same as > - * striding it. > - */ > - write_src = shuffle_for_32bit_write(bld, write_src, 0, > - num_components); > - } > - > - fs_reg offset_reg; > - > - if (nir_src_is_const(instr->src[2])) { > - offset_reg = brw_imm_ud(nir_src_as_uint(instr->src[2]) + > - type_size * first_component); > - } else { > - offset_reg = vgrf(glsl_type::uint_type); > - bld.ADD(offset_reg, > - retype(get_nir_src(instr->src[2]), BRW_REGISTER_TYPE_UD), > - brw_imm_ud(type_size * first_component)); > - } > - > - if (type_size < 4 && num_components == 1) { > - /* Untyped Surface messages have a fixed 32-bit size, so we need > - * to rely on byte scattered in order to write 16-bit elements. > - * The byte_scattered_write message needs that every written > 16-bit > - * type to be aligned 32-bits (stride=2). > - */ > - emit_byte_scattered_write(bld, surf_index, offset_reg, > - write_src, > - 1 /* dims */, > - bit_size, > - BRW_PREDICATE_NONE); > - } else { > - assert(num_components * type_size <= 16); > - assert((num_components * type_size) % 4 == 0); > - assert(offset_reg.file != BRW_IMMEDIATE_VALUE || > - offset_reg.ud % 4 == 0); > - unsigned num_slots = (num_components * type_size) / 4; > - > - emit_untyped_write(bld, surf_index, offset_reg, > - write_src, > - 1 /* dims */, num_slots, > - BRW_PREDICATE_NONE); > - } > + val_reg.type = brw_reg_type_from_bit_size(bit_size, > BRW_REGISTER_TYPE_UD); > > - /* Clear the bits in the writemask that we just wrote, then try > - * again to see if more channels are left. 
> - */ > - writemask &= (15 << (first_component + num_components)); > + assert(nir_intrinsic_write_mask(instr) == > + (1 << instr->num_components) - 1); > + if (nir_intrinsic_align(instr) >= 4) { > + assert(nir_src_bit_size(instr->src[0]) == 32); > + assert(nir_src_num_components(instr->src[0]) <= 4); > + emit_untyped_write(bld, surf_index, offset_reg, val_reg, > + 1 /* dims */, instr->num_components, > + BRW_PREDICATE_NONE); > + } else { > + assert(nir_src_bit_size(instr->src[0]) <= 32); > + assert(nir_src_num_components(instr->src[0]) == 1); > + fs_reg write_src = bld.vgrf(BRW_REGISTER_TYPE_UD); > + bld.MOV(write_src, val_reg); > + emit_byte_scattered_write(bld, surf_index, offset_reg, > + write_src, 1 /* dims */, bit_size, > + BRW_PREDICATE_NONE); > } > break; > } > diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c > index 26a5ea04605..f5afeec9946 100644 > --- a/src/intel/compiler/brw_nir.c > +++ b/src/intel/compiler/brw_nir.c > @@ -714,6 +714,8 @@ brw_preprocess_nir(const struct brw_compiler *compiler, > nir_shader *nir) > brw_nir_no_indirect_mask(compiler, nir->info.stage); > OPT(nir_lower_indirect_derefs, indirect_mask); > > + OPT(brw_nir_lower_mem_access_bit_sizes); > + > /* Get rid of split copies */ > nir = brw_nir_optimize(nir, compiler, is_scalar, false); > > diff --git a/src/intel/compiler/brw_nir.h b/src/intel/compiler/brw_nir.h > index 2ff8c72b94f..bc81950d47e 100644 > --- a/src/intel/compiler/brw_nir.h > +++ b/src/intel/compiler/brw_nir.h > @@ -119,6 +119,8 @@ bool brw_nir_lower_image_load_store(nir_shader *nir, > void brw_nir_rewrite_image_intrinsic(nir_intrinsic_instr *intrin, > nir_ssa_def *index); > > +bool brw_nir_lower_mem_access_bit_sizes(nir_shader *shader); > + > nir_shader *brw_postprocess_nir(nir_shader *nir, > const struct brw_compiler *compiler, > bool is_scalar); > diff --git a/src/intel/compiler/brw_nir_lower_mem_access_bit_sizes.c > b/src/intel/compiler/brw_nir_lower_mem_access_bit_sizes.c > new file mode 100644 > index 00000000000..a3320521f49 > --- /dev/null > +++ b/src/intel/compiler/brw_nir_lower_mem_access_bit_sizes.c > @@ -0,0 +1,313 @@ > +/* > + * Copyright © 2018 Intel Corporation > + * > + * Permission is hereby granted, free of charge, to any person obtaining a > + * copy of this software and associated documentation files (the "Software"), > + * to deal in the Software without restriction, including without limitation > + * the rights to use, copy, modify, merge, publish, distribute, sublicense, > + * and/or sell copies of the Software, and to permit persons to whom the > + * Software is furnished to do so, subject to the following conditions: > + * > + * The above copyright notice and this permission notice (including the next > + * paragraph) shall be included in all copies or substantial portions of the > + * Software. > + * > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR > + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, > + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL > + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER > + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING > + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER > DEALINGS > + * IN THE SOFTWARE. 
> + */ > + > +#include "brw_nir.h" > +#include "compiler/nir/nir_builder.h" > +#include "util/u_math.h" > +#include "util/bitscan.h" > + > +static nir_ssa_def * > +dup_mem_intrinsic(nir_builder *b, nir_intrinsic_instr *intrin, > + nir_ssa_def *store_src, int offset, > + unsigned num_components, unsigned bit_size, > + unsigned align) > +{ > + const nir_intrinsic_info *info = &nir_intrinsic_infos[intrin->intrinsic]; > + > + nir_intrinsic_instr *dup = > + nir_intrinsic_instr_create(b->shader, intrin->intrinsic); > + > + nir_src *intrin_offset_src = nir_get_io_offset_src(intrin); > + for (unsigned i = 0; i < info->num_srcs; i++) { > + assert(intrin->src[i].is_ssa); > + if (i == 0 && store_src) { > + assert(!info->has_dest); > + assert(&intrin->src[i] != intrin_offset_src); > + dup->src[i] = nir_src_for_ssa(store_src); > + } else if (&intrin->src[i] == intrin_offset_src) { > + dup->src[i] = nir_src_for_ssa(nir_iadd_imm(b, intrin->src[i].ssa, > + offset)); > + } else { > + dup->src[i] = nir_src_for_ssa(intrin->src[i].ssa); > + } > + } > + > + dup->num_components = num_components; > + > + for (unsigned i = 0; i < info->num_indices; i++) > + dup->const_index[i] = intrin->const_index[i]; > + > + nir_intrinsic_set_align(dup, align, 0); > + > + if (info->has_dest) { > + assert(intrin->dest.is_ssa); > + nir_ssa_dest_init(&dup->instr, &dup->dest, > + num_components, bit_size, > + intrin->dest.ssa.name); > + } else { > + nir_intrinsic_set_write_mask(dup, (1 << num_components) - 1); > + } > + > + nir_builder_instr_insert(b, &dup->instr); > + > + return info->has_dest ? &dup->dest.ssa : NULL; > +} > + > +static bool > +lower_mem_load_bit_size(nir_builder *b, nir_intrinsic_instr *intrin) > +{ > + assert(intrin->dest.is_ssa); > + if (intrin->dest.ssa.bit_size == 32) > + return false; > + > + const unsigned bit_size = intrin->dest.ssa.bit_size; > + const unsigned num_components = intrin->dest.ssa.num_components; > + const unsigned bytes_read = num_components * (bit_size / 8); > + const unsigned align = nir_intrinsic_align(intrin); > + > + nir_ssa_def *result[4] = { NULL, }; > + > + nir_src *offset_src = nir_get_io_offset_src(intrin); > + if (bit_size < 32 && nir_src_is_const(*offset_src)) { > + /* The offset is constant so we can use a 32-bit load and just shift it > + * around as needed. > + */ > + const int load_offset = nir_src_as_uint(*offset_src) % 4; > + assert(load_offset % (bit_size / 8) == 0); > + const unsigned load_comps32 = DIV_ROUND_UP(bytes_read + load_offset, > 4); > + /* A 16-bit vec4 is a 32-bit vec2. We add an extra component in case > + * we offset into a component with load_offset. 
> + */ > + assert(load_comps32 <= 3); > + > + nir_ssa_def *load = dup_mem_intrinsic(b, intrin, NULL, -load_offset, > + load_comps32, 32, 4); > + nir_ssa_def *unpacked[3]; > + for (unsigned i = 0; i < load_comps32; i++) > + unpacked[i] = nir_unpack_bits(b, nir_channel(b, load, i), bit_size); > + > + assert(load_offset % (bit_size / 8) == 0); > + const unsigned divisor = 32 / bit_size; > + > + for (unsigned i = 0; i < num_components; i++) { > + unsigned load_i = i + load_offset / (bit_size / 8); > + result[i] = nir_channel(b, unpacked[load_i / divisor], > + load_i % divisor); > + } > + } else { > + /* Otherwise, we have to break it into smaller loads */ > + unsigned res_idx = 0; > + int load_offset = 0; > + while (load_offset < bytes_read) { > + const unsigned bytes_left = bytes_read - load_offset; > + unsigned load_bit_size, load_comps; > + if (align < 4) { > + load_comps = 1; > + /* Choose a byte, word, or dword */ > + load_bit_size = util_next_power_of_two(MIN2(bytes_left, 4)) * 8; > + } else { > + assert(load_offset % 4 == 0); > + load_bit_size = 32; > + load_comps = DIV_ROUND_UP(MIN2(bytes_left, 16), 4); > + } > + > + nir_ssa_def *load = dup_mem_intrinsic(b, intrin, NULL, load_offset, > + load_comps, load_bit_size, > + align); > + > + nir_ssa_def *unpacked = nir_bitcast_vector(b, load, bit_size); > + for (unsigned i = 0; i < unpacked->num_components; i++) { > + if (res_idx < num_components) > + result[res_idx++] = nir_channel(b, unpacked, i); > + } > + > + load_offset += load_comps * (load_bit_size / 8); > + } > + } > + > + nir_ssa_def *vec_result = nir_vec(b, result, num_components); > + nir_ssa_def_rewrite_uses(&intrin->dest.ssa, > + nir_src_for_ssa(vec_result)); > + nir_instr_remove(&intrin->instr); > + > + return true; > +} > + > +static bool > +lower_mem_store_bit_size(nir_builder *b, nir_intrinsic_instr *intrin) > +{ > + assert(intrin->src[0].is_ssa); > + nir_ssa_def *value = intrin->src[0].ssa; > + > + assert(intrin->num_components == value->num_components); > + const unsigned bit_size = value->bit_size; > + const unsigned num_components = intrin->num_components; > + const unsigned bytes_written = num_components * (bit_size / 8); > + const unsigned align_mul = nir_intrinsic_align_mul(intrin); > + const unsigned align_offset = nir_intrinsic_align_offset(intrin); > + const unsigned align = nir_intrinsic_align(intrin); > + > + nir_component_mask_t writemask = nir_intrinsic_write_mask(intrin); > + assert(writemask < (1 << num_components)); > + > + if ((value->bit_size <= 32 && num_components == 1) || > + (value->bit_size == 32 && writemask == (1 << num_components) - 1)) > + return false; > + > + nir_src *offset_src = nir_get_io_offset_src(intrin); > + const bool offset_is_const = nir_src_is_const(*offset_src); > + const unsigned const_offset = > + offset_is_const ? 
nir_src_as_uint(*offset_src) : 0; > + > + assert(num_components * (bit_size / 8) <= 32); > + uint32_t byte_mask = 0; > + for (unsigned i = 0; i < num_components; i++) { > + if (writemask & (1 << i)) > + byte_mask |= ((1 << (bit_size / 8)) - 1) << i * (bit_size / 8); > + } > + > + while (byte_mask) { > + const int start = ffs(byte_mask) - 1; > + assert(start % (bit_size / 8) == 0); > + > + int end; > + for (end = start + 1; end < bytes_written; end++) { > + if (!(byte_mask & (1 << end))) > + break; > + } > + /* The size of the current contiguous chunk in bytes */ > + const unsigned chunk_bytes = end - start; > + > + const bool is_dword_aligned = > + (align_mul >= 4 && (align_offset + start) % 4 == 0) || > + (offset_is_const && (start + const_offset) % 4 == 0); > + > + unsigned store_comps, store_bit_size, store_align; > + if (chunk_bytes >= 4 && is_dword_aligned) { > + store_align = MAX2(align, 4); > + store_bit_size = 32; > + store_comps = MIN2(chunk_bytes, 16) / 4; > + } else { > + store_align = align; > + store_comps = 1; > + store_bit_size = MIN2(chunk_bytes, 4) * 8; > + /* The bit size must be a power of two */ > + if (store_bit_size == 24) > + store_bit_size = 16; > + } > + > + const unsigned store_bytes = store_comps * (store_bit_size / 8); > + assert(store_bytes % (bit_size / 8) == 0); > + const unsigned store_first_src_comp = start / (bit_size / 8); > + const unsigned store_src_comps = store_bytes / (bit_size / 8); > + assert(store_first_src_comp + store_src_comps <= num_components); > + > + unsigned src_swiz[4]; > + for (unsigned i = 0; i < store_src_comps; i++) > + src_swiz[i] = store_first_src_comp + i; > + nir_ssa_def *store_value = > + nir_swizzle(b, value, src_swiz, store_src_comps, false); > + nir_ssa_def *packed = nir_bitcast_vector(b, store_value, > store_bit_size); > + > + dup_mem_intrinsic(b, intrin, packed, start, > + store_comps, store_bit_size, store_align); > + > + byte_mask &= ~(((1u << store_bytes) - 1) << start); > + } > + > + nir_instr_remove(&intrin->instr); > + > + return true; > +} > + > +static bool > +lower_mem_access_bit_sizes_impl(nir_function_impl *impl) > +{ > + bool progress = false; > + > + nir_builder b; > + nir_builder_init(&b, impl); > + > + nir_foreach_block(block, impl) { > + nir_foreach_instr_safe(instr, block) { > + if (instr->type != nir_instr_type_intrinsic) > + continue; > + > + b.cursor = nir_after_instr(instr); > + > + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); > + switch (intrin->intrinsic) { > + case nir_intrinsic_load_ssbo: > + case nir_intrinsic_load_shared: > + if (lower_mem_load_bit_size(&b, intrin)) > + progress = true; > + break; > + > + case nir_intrinsic_store_ssbo: > + case nir_intrinsic_store_shared: > + if (lower_mem_store_bit_size(&b, intrin)) > + progress = true; > + break; > + > + default: > + break; > + } > + } > + } > + > + if (progress) { > + nir_metadata_preserve(impl, nir_metadata_block_index | > + nir_metadata_dominance); > + } > + > + return progress; > +} > + > +/** > + * This pass loads arbitrary SSBO and shared memory load/store operations to > + * intrinsics which are natively handleable by GEN hardware. In particular, > + * we have two general types of memory load/store messages: > + * > + * - Untyped surface read/write: These can load/store between one and four > + * dword components to/from a dword-aligned offset. > + * > + * - Byte scattered read/write: These can load/store a single byte, word, > or > + * dword scalar to/from an unaligned byte offset. 
> + * > + * Neither type of message can do a write-masked store. This pass converts > + * all nir load/store intrinsics into a series of either 8 or 32-bit > + * load/store intrinsics with a number of components that we can directly > + * handle in hardware and with a trivial write-mask. > + */ > +bool > +brw_nir_lower_mem_access_bit_sizes(nir_shader *shader) > +{ > + bool progress = false; > + > + nir_foreach_function(func, shader) { > + if (func->impl && lower_mem_access_bit_sizes_impl(func->impl)) > + progress = true; > + } > + > + return progress; > +} > diff --git a/src/intel/compiler/brw_vec4_nir.cpp > b/src/intel/compiler/brw_vec4_nir.cpp > index 564be7e5eee..26ca2ddd8dc 100644 > --- a/src/intel/compiler/brw_vec4_nir.cpp > +++ b/src/intel/compiler/brw_vec4_nir.cpp > @@ -500,6 +500,11 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr > *instr) > case nir_intrinsic_store_ssbo: { > assert(devinfo->gen >= 7); > > + /* brw_nir_lower_mem_access_bit_sizes takes care of this */ > + assert(nir_src_bit_size(instr->src[0]) == 32); > + assert(nir_intrinsic_write_mask(instr) == > + (1 << instr->num_components) - 1); > + > src_reg surf_index = get_nir_ssbo_intrinsic_index(instr); > src_reg offset_reg = retype(get_nir_src_imm(instr->src[2]), > BRW_REGISTER_TYPE_UD); > @@ -507,9 +512,6 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr > *instr) > /* Value */ > src_reg val_reg = get_nir_src(instr->src[0], BRW_REGISTER_TYPE_F, 4); > > - /* Writemask */ > - unsigned write_mask = instr->const_index[0]; > - > /* IvyBridge does not have a native SIMD4x2 untyped write message so > untyped > * writes will use SIMD8 mode. In order to hide this and keep symmetry > across > * typed and untyped messages and across hardware platforms, the > @@ -551,92 +553,18 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr > *instr) > const vec4_builder bld = vec4_builder(this).at_end() > .annotate(current_annotation, base_ir); > > - unsigned type_slots = nir_src_bit_size(instr->src[0]) / 32; > - if (type_slots == 2) { > - dst_reg tmp = dst_reg(this, glsl_type::dvec4_type); > - shuffle_64bit_data(tmp, retype(val_reg, tmp.type), true); > - val_reg = src_reg(retype(tmp, BRW_REGISTER_TYPE_F)); > - } > - > - uint8_t swizzle[4] = { 0, 0, 0, 0}; > - int num_channels = 0; > - unsigned skipped_channels = 0; > - int num_components = instr->num_components; > - for (int i = 0; i < num_components; i++) { > - /* Read components Z/W of a dvec from the appropriate place. We will > - * also have to adjust the swizzle (we do that with the '% 4' below) > - */ > - if (i == 2 && type_slots == 2) > - val_reg = byte_offset(val_reg, REG_SIZE); > - > - /* Check if this channel needs to be written. If so, record the > - * channel we need to take the data from in the swizzle array > - */ > - int component_mask = 1 << i; > - int write_test = write_mask & component_mask; > - if (write_test) { > - /* If we are writing doubles we have to write 2 channels worth of > - * of data (64 bits) for each double component. > - */ > - swizzle[num_channels++] = (i * type_slots) % 4; > - if (type_slots == 2) > - swizzle[num_channels++] = (i * type_slots + 1) % 4; > - } > - > - /* If we don't have to write this channel it means we have a gap in > the > - * vector, so write the channels we accumulated until now, if any. 
> Do > - * the same if this was the last component in the vector, if we have > - * enough channels for a full vec4 write or if we have processed > - * components XY of a dvec (since components ZW are not in the same > - * SIMD register) > - */ > - if (!write_test || i == num_components - 1 || num_channels == 4 || > - (i == 1 && type_slots == 2)) { > - if (num_channels > 0) { > - /* We have channels to write, so update the offset we need to > - * write at to skip the channels we skipped, if any. > - */ > - if (skipped_channels > 0) { > - if (offset_reg.file == IMM) { > - offset_reg.ud += 4 * skipped_channels; > - } else { > - emit(ADD(dst_reg(offset_reg), offset_reg, > - brw_imm_ud(4 * skipped_channels))); > - } > - } > - > - /* Swizzle the data register so we take the data from the > channels > - * we need to write and send the write message. This will > write > - * num_channels consecutive dwords starting at offset. > - */ > - val_reg.swizzle = > - BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], > swizzle[3]); > - emit_untyped_write(bld, surf_index, offset_reg, val_reg, > - 1 /* dims */, num_channels /* size */, > - BRW_PREDICATE_NONE); > - > - /* If we have to do a second write we will have to update the > - * offset so that we jump over the channels we have just > written > - * now. > - */ > - skipped_channels = num_channels; > - > - /* Restart the count for the next write message */ > - num_channels = 0; > - } > - > - /* If we didn't write the channel, increase skipped count */ > - if (!write_test) > - skipped_channels += type_slots; > - } > - } > - > + emit_untyped_write(bld, surf_index, offset_reg, val_reg, > + 1 /* dims */, instr->num_components /* size */, > + BRW_PREDICATE_NONE); > break; > } > > case nir_intrinsic_load_ssbo: { > assert(devinfo->gen >= 7); > > + /* brw_nir_lower_mem_access_bit_sizes takes care of this */ > + assert(nir_dest_bit_size(instr->dest) == 32); > + > src_reg surf_index = get_nir_ssbo_intrinsic_index(instr); > src_reg offset_reg = retype(get_nir_src_imm(instr->src[1]), > BRW_REGISTER_TYPE_UD); > @@ -645,36 +573,10 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr > *instr) > const vec4_builder bld = vec4_builder(this).at_end() > .annotate(current_annotation, base_ir); > > - src_reg read_result; > + src_reg read_result = emit_untyped_read(bld, surf_index, offset_reg, > + 1 /* dims */, 4 /* size*/, > + BRW_PREDICATE_NONE); > dst_reg dest = get_nir_dest(instr->dest); > - if (type_sz(dest.type) < 8) { > - read_result = emit_untyped_read(bld, surf_index, offset_reg, > - 1 /* dims */, 4 /* size*/, > - BRW_PREDICATE_NONE); > - } else { > - src_reg shuffled = src_reg(this, glsl_type::dvec4_type); > - > - src_reg temp; > - temp = emit_untyped_read(bld, surf_index, offset_reg, > - 1 /* dims */, 4 /* size*/, > - BRW_PREDICATE_NONE); > - emit(MOV(dst_reg(retype(shuffled, temp.type)), temp)); > - > - if (offset_reg.file == IMM) > - offset_reg.ud += 16; > - else > - emit(ADD(dst_reg(offset_reg), offset_reg, brw_imm_ud(16))); > - > - temp = emit_untyped_read(bld, surf_index, offset_reg, > - 1 /* dims */, 4 /* size*/, > - BRW_PREDICATE_NONE); > - emit(MOV(dst_reg(retype(byte_offset(shuffled, REG_SIZE), > temp.type)), > - temp)); > - > - read_result = src_reg(this, glsl_type::dvec4_type); > - shuffle_64bit_data(dst_reg(read_result), shuffled, false); > - } > - > read_result.type = dest.type; > read_result.swizzle = brw_swizzle_for_size(instr->num_components); > emit(MOV(dest, read_result)); > diff --git a/src/intel/compiler/meson.build 
b/src/intel/compiler/meson.build > index 3cdeb6214a8..953e8dcc971 100644 > --- a/src/intel/compiler/meson.build > +++ b/src/intel/compiler/meson.build > @@ -78,6 +78,7 @@ libintel_compiler_files = files( > 'brw_nir_attribute_workarounds.c', > 'brw_nir_lower_cs_intrinsics.c', > 'brw_nir_lower_image_load_store.c', > + 'brw_nir_lower_mem_access_bit_sizes.c', > 'brw_nir_opt_peephole_ffma.c', > 'brw_nir_tcs_workarounds.c', > 'brw_packed_float.c', >
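For readers following the new pass, here is a rough, self-contained C sketch (not part of the patch) of the chunk-splitting arithmetic in lower_mem_store_bit_size(). It only reproduces the byte-mask bookkeeping: the NIR builder plumbing, the align_mul/align_offset handling and the constant-offset check are dropped, and it assumes the store's base offset is dword-aligned. Helper names such as split_store() are made up for the example.

/* Standalone sketch of the chunk-splitting arithmetic used by
 * lower_mem_store_bit_size().  Given a partial write mask, it prints the
 * sequence of stores the pass would emit.  This mirrors the byte-mask loop
 * from the patch but drops the NIR plumbing and assumes a dword-aligned
 * base offset; split_store() is a placeholder name, not real driver API.
 */
#include <stdio.h>
#include <strings.h>   /* ffs() */

#define MIN2(a, b) ((a) < (b) ? (a) : (b))

static void
split_store(unsigned bit_size, unsigned num_components, unsigned writemask)
{
   const unsigned bytes_per_comp = bit_size / 8;
   const unsigned bytes_written = num_components * bytes_per_comp;

   /* Expand the per-component write mask into a per-byte mask. */
   unsigned byte_mask = 0;
   for (unsigned i = 0; i < num_components; i++) {
      if (writemask & (1u << i))
         byte_mask |= ((1u << bytes_per_comp) - 1) << (i * bytes_per_comp);
   }

   while (byte_mask) {
      const int start = ffs(byte_mask) - 1;

      /* Find the end of this contiguous run of enabled bytes. */
      int end;
      for (end = start + 1; end < (int)bytes_written; end++) {
         if (!(byte_mask & (1u << end)))
            break;
      }
      const unsigned chunk_bytes = end - start;

      unsigned store_comps, store_bit_size;
      if (chunk_bytes >= 4 && start % 4 == 0) {
         /* Dword-aligned chunk: one untyped (32-bit) surface write. */
         store_bit_size = 32;
         store_comps = MIN2(chunk_bytes, 16) / 4;
      } else {
         /* Otherwise fall back to a single byte-scattered write. */
         store_comps = 1;
         store_bit_size = MIN2(chunk_bytes, 4) * 8;
         if (store_bit_size == 24)   /* must be a power of two */
            store_bit_size = 16;
      }

      const unsigned store_bytes = store_comps * (store_bit_size / 8);
      printf("store %u x %u-bit at byte offset %d\n",
             store_comps, store_bit_size, start);

      byte_mask &= ~(((1u << store_bytes) - 1) << start);
   }
}

int
main(void)
{
   /* A 16-bit vec4 store with a .xz write mask (0b0101): expect two
    * 16-bit byte-scattered writes at byte offsets 0 and 4. */
   split_store(16, 4, 0x5);
   return 0;
}

Running it shows how a partial write mask becomes a series of stores that the untyped and byte-scattered messages can actually do, which is what lets the back-ends assert a trivial write mask after this patch.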
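On the load side, the interesting case is the non-constant-offset branch of lower_mem_load_bit_size(), which walks the read in pieces sized by the known alignment. The sketch below is again standalone C with placeholder names (split_load(), next_pow2() standing in for util_next_power_of_two()); it only shows that sizing loop, not the unpack/bitcast of the results.

/* Companion sketch for lower_mem_load_bit_size(): how an N-byte read is
 * broken into untyped (dword) or byte-scattered loads depending on the
 * known alignment.  Pure arithmetic; printf stands in for the
 * dup_mem_intrinsic() call in the real pass.
 */
#include <stdio.h>

#define MIN2(a, b) ((a) < (b) ? (a) : (b))
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

static unsigned
next_pow2(unsigned x)   /* stand-in for util_next_power_of_two() */
{
   unsigned p = 1;
   while (p < x)
      p <<= 1;
   return p;
}

static void
split_load(unsigned bytes_read, unsigned align)
{
   int load_offset = 0;
   while (load_offset < (int)bytes_read) {
      const unsigned bytes_left = bytes_read - load_offset;
      unsigned load_bit_size, load_comps;

      if (align < 4) {
         /* Unaligned: one byte, word, or dword scattered read at a time.
          * This may read past bytes_read; the extra channels are simply
          * ignored when the result is unpacked. */
         load_comps = 1;
         load_bit_size = next_pow2(MIN2(bytes_left, 4)) * 8;
      } else {
         /* Dword-aligned: read up to a vec4 of dwords per message. */
         load_bit_size = 32;
         load_comps = DIV_ROUND_UP(MIN2(bytes_left, 16), 4);
      }

      printf("load %u x %u-bit at byte offset %d\n",
             load_comps, load_bit_size, load_offset);
      load_offset += load_comps * (load_bit_size / 8);
   }
}

int
main(void)
{
   /* A 16-bit vec3 (6 bytes) with only 2-byte alignment known:
    * expect a dword scattered read, then a word scattered read. */
   split_load(6, 2);
   return 0;
}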