We also pack the maximum number of 8/16-bit components into a single byte_scattered_write message.
Comments have been rewritten to adapt them to the 8-bit case. --- src/intel/compiler/brw_fs_nir.cpp | 66 ++++++++++++++++++------------- 1 file changed, 38 insertions(+), 28 deletions(-) diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index a1f946708ed..7259acb862e 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -4263,6 +4263,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr fs_reg write_src = offset(val_reg, bld, first_component); nir_const_value *const_offset = nir_src_as_const_value(instr->src[2]); + bool use_scattered_write = false; if (type_size > 4) { /* We can't write more than 2 64-bit components at once. Limit @@ -4273,29 +4274,38 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr write_src = shuffle_for_32bit_write(bld, write_src, 0, num_components); } else if (type_size < 4) { - /* For 16-bit types we pack two consecutive values into a 32-bit - * word and use an untyped write message. For single values or not - * 32-bit-aligned we need to use byte-scattered writes because - * untyped writes works with 32-bit components with 32-bit - * alignment. byte_scattered_write messages only support one - * 16-bit component at a time. As VK_KHR_relaxed_block_layout - * could be enabled we can not guarantee that not constant offsets - * to be 32-bit aligned for 16-bit types. For example an array, of - * 16-bit vec3 with array element stride of 6. + /* For 8/16-bit types we pack consecutive values into a 32-bit + * type and use an untyped write message. When size is not + * multiple of 4-bytes or offset is not 32-bit-aligned we need to + * use byte-scattered writes because they didn't require 32-bit + * components or 32-bit offset alignment. We can pack multiple + * 8/16-bit components on one 8/16/32-bit component used by the + * byte_scattered_write message. 
+ * + * As VK_KHR_relaxed_block_layout could be requested and it is + * core in VK 1.1 we can not guarantee not constant offsets to be + * 32-bit aligned for 8/16-bit types. For example a 16-bit vec3 + * begin with at offset 2 in a structure. * * In the case of 32-bit aligned constant offsets if there is - * a 3-components vector we submit one untyped-write message + * a 16-bit vec3 we submit one untyped-write message * of 32-bit (first two components), and one byte-scattered * write message (the last component). */ - - if ( !const_offset || ((const_offset->u32[0] + - type_size * first_component) % 4)) { - /* If we use a .yz writemask we also need to emit 2 - * byte-scattered write messages because of y-component not - * being aligned to 32-bit. + if (!const_offset || ((const_offset->u32[0] + + type_size * first_component) % 4) || + num_components * type_size < 4) { + /* If we don't have a constant offset or a constant offset + * not 32-bit aligned or we are reading less than 32-bits then + * we use byte_scattered_write with the maximum number of + * components we can pack exactly into one 8/16/32-bit component. + * So for a int8 vec3 we have to split into two one 16-bit and + * another 8-bit writtings. */ - num_components = 1; + use_scattered_write = true; + num_components = MIN2(4 / type_size, num_components); + if (num_components == 3) + num_components = 2; } else if (num_components * type_size > 4 && (num_components * type_size % 4)) { /* If the pending components size is not a multiple of 4 bytes @@ -4303,13 +4313,10 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr * length == 1 with byte_scattered_write. */ num_components -= (num_components * type_size % 4) / type_size; - } else if (num_components * type_size < 4) { - num_components = 1; } /* For num_components == 1 we are also shuffling the component - * because byte scattered writes of 16-bit need values to be dword - * aligned. 
Shuffling only one component would be the same as - * striding it. + * because byte scattered writes of 8/16-bit need values 32-bit + * aligned. */ write_src = shuffle_for_32bit_write(bld, write_src, 0, num_components); @@ -4327,16 +4334,19 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr brw_imm_ud(type_size * first_component)); } - if (type_size < 4 && num_components == 1) { - /* Untyped Surface messages have a fixed 32-bit size, so we need - * to rely on byte scattered in order to write 16-bit elements. - * The byte_scattered_write message needs that every written 16-bit - * type to be aligned 32-bits (stride=2). + if (use_scattered_write) { + assert(num_components * bit_size <= 32); + assert(util_is_power_of_two_nonzero(num_components * bit_size)); + /* Untyped Surface messages have a fixed 32-bit size, and are + * limited to use 32-bit offsets, so we use on byte + * scattered_writes in order to write 8/16-bit elements. We pack + * pack multiple 8/16 bits components in one single message, using + * the suitable bitsize. It is limited to 8/16/32-bits. */ emit_byte_scattered_write(bld, surf_index, offset_reg, write_src, 1 /* dims */, 1, - bit_size, + bit_size * num_components, BRW_PREDICATE_NONE); } else { assert(num_components * type_size <= 16); -- 2.17.1 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev