On Fri, Feb 23, 2018 at 1:26 AM, Jose Maria Casanova Crespo < jmcasan...@igalia.com> wrote:
> 16-bit load_ubo/ssbo operations that call do_untyped_read_vector doesn't > guarantee that offsets are multiple of 4-bytes as required by untyped_read > message. This happens for example on 16-bit scalar arrays and in the case > of f16vec3 when then VK_KHR_relaxed_block_layoud is enabled. > > Vectors reads when we have non-constant offsets are implemented with > multiple byte_scattered_read messages that not require 32-bit aligned > offsets. > The same applies for constant offsets not aligned to 32-bits. > > Untyped_read_surface is used message when there is a constant offset > 32-bit aligned and we have more than 1 component to read. > --- > src/intel/compiler/brw_fs_nir.cpp | 60 ++++++++++++++++++++++++++++-- > --------- > 1 file changed, 44 insertions(+), 16 deletions(-) > > diff --git a/src/intel/compiler/brw_fs_nir.cpp > b/src/intel/compiler/brw_fs_nir.cpp > index 8efec34cc9..45b8e8b637 100644 > --- a/src/intel/compiler/brw_fs_nir.cpp > +++ b/src/intel/compiler/brw_fs_nir.cpp > @@ -2305,27 +2305,55 @@ do_untyped_vector_read(const fs_builder &bld, > if (type_sz(dest.type) <= 2) { > assert(dest.stride == 1); > > - if (num_components > 1) { > - /* Pairs of 16-bit components can be read with untyped read, for > 16-bit > - * vec3 4th component is ignored. > + unsigned pending_components = num_components; > + unsigned first_component = 0; > + boolean is_const_offset = offset_reg.file == BRW_IMMEDIATE_VALUE; > + fs_reg read_offset; > + if (is_const_offset) > + read_offset = offset_reg; > + else { > + read_offset = bld.vgrf(BRW_REGISTER_TYPE_UD); > + bld.MOV(read_offset, offset_reg); > + } > + while (pending_components > 0 && > + (pending_components == 1 || > + !is_const_offset || > + (offset_reg.ud + first_component * 2) % 4)) { > I think this would be easier to read as while (pending_components > 0) { if (!is_const_offset || (offset_reg.ud + first_component * type_sz(dst.type)) % 4) { /* Do a single-component load with a byte message */ first_compoent++; pending_components--; } else { /* Do a load with an untyped read */ pending_components = 0; break; } } Or, alternatively, if (is_const_offset) { uint32_t start = offset_reg.ud & 3; uint32_t end = offset_reg + num_components * type_sz(dst.type); end = ALIGN(end, 4); assert(end - start <= 16); /* Do a load with an untyped read and then throw away unneeded data at the beginning and the end */ } else { /* Do the load using a loop and byte scattered reads */ } This second one would require a small extension to shuffle_32bit_load_result_to_16bit_data to handle throwing away data at the start. > + /* Non constant offsets, 16-bit scalars and constant offsets not > + * aligned 32-bits are read using one byte_scattered_read message > + * for eache component untyped_read requires 32-bit aligned > offsets. > */ > fs_reg read_result = > - emit_untyped_read(bld, surf_index, offset_reg, > - 1 /* dims */, DIV_ROUND_UP(num_components, > 2), > - BRW_PREDICATE_NONE); > - shuffle_32bit_load_result_to_16bit_data(bld, > - retype(dest, BRW_REGISTER_TYPE_W), > - retype(read_result, BRW_REGISTER_TYPE_D), > - num_components); > - } else { > - assert(num_components == 1); > - /* scalar 16-bit are read using one byte_scattered_read message > */ > - fs_reg read_result = > - emit_byte_scattered_read(bld, surf_index, offset_reg, > + emit_byte_scattered_read(bld, surf_index, read_offset, > 1 /* dims */, 1, > type_sz(dest.type) * 8 /* bit_size > */, > BRW_PREDICATE_NONE); > - bld.MOV(dest, subscript(read_result, dest.type, 0)); > + shuffle_32bit_load_result_to_16bit_data(bld, > + retype(offset(dest, bld, first_component), > BRW_REGISTER_TYPE_W), > + retype(read_result, BRW_REGISTER_TYPE_D), > + 1); > + pending_components--; > + first_component ++; > + if (is_const_offset) > + read_offset.ud += 2; > + else > + bld.ADD(read_offset, offset_reg, brw_imm_ud(2 * > first_component)); > + } > + assert(pending_components != 1); > + if (pending_components > 1) { > + assert (is_const_offset && > + (offset_reg.ud + first_component * 2) % 4 == 0); > + /* At this point we have multiple 16-bit components that have > constant > + * offset multiple of 4-bytes that can be read with > untyped_reads. > + */ > + fs_reg read_result = > + emit_untyped_read(bld, surf_index, read_offset, > + 1 /* dims */, DIV_ROUND_UP(pending_components, > 2), > + BRW_PREDICATE_NONE); > + shuffle_32bit_load_result_to_16bit_data(bld, > + retype(offset(dest,bld,first_component), > BRW_REGISTER_TYPE_W), > + retype(read_result, BRW_REGISTER_TYPE_D), > + pending_components); > } > } else if (type_sz(dest.type) == 4) { > fs_reg read_result = emit_untyped_read(bld, surf_index, offset_reg, > -- > 2.14.3 > > _______________________________________________ > mesa-dev mailing list > mesa-dev@lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/mesa-dev >
_______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev