load_ubo uses 32-bit loads, as uniform surfaces have a 32-bit surface format defined. So when reading 16-bit components with the sampler we need to unshuffle the two 16-bit components packed into each 32-bit component.
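
To illustrate the packing (a minimal host-side C sketch with made-up
values; not driver code):

   #include <stdint.h>
   #include <stdio.h>

   int main(void)
   {
      /* Two consecutive 16-bit components as they sit packed in one
       * 32-bit component of the uniform surface. */
      uint16_t c0 = 0x3c00, c1 = 0x4000;    /* e.g. 1.0h and 2.0h */
      uint32_t packed = (uint32_t)c0 | ((uint32_t)c1 << 16);

      /* Unshuffle: recover each 16-bit component from its half of the
       * 32-bit word read back by the sampler. */
      uint16_t lo = packed & 0xffff;
      uint16_t hi = packed >> 16;

      printf("lo=0x%04x hi=0x%04x\n", lo, hi);
      return 0;
   }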
Using the sampler avoids the byte_scattered_read message, which needs one message for each component and is expected to be slower. On SKL+ we take advantage of a hardware feature that automatically derives a channel mask from the rlen value, so we only use half of the registers, without needing a header in the payload.
---
 src/intel/compiler/brw_fs.cpp           | 31 +++++++++++++++++++++++++++----
 src/intel/compiler/brw_fs_generator.cpp | 10 ++++++++--
 2 files changed, 35 insertions(+), 6 deletions(-)

diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 1ca4d416b2..9c543496ba 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -184,9 +184,17 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld,
     * a double this means we are only loading 2 elements worth of data.
     * We also want to use a 32-bit data type for the dst of the load operation
     * so other parts of the driver don't get confused about the size of the
-    * result.
+    * result. In the case of 16-bit data we only need half of the 32-bit
+    * components on SKL+, as we take advantage of the message return size
+    * to define an xy channel mask.
     */
-   fs_reg vec4_result = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
+   fs_reg vec4_result;
+   if (type_sz(dst.type) == 2 && (devinfo->gen >= 9)) {
+      vec4_result = bld.vgrf(BRW_REGISTER_TYPE_F, 2);
+      vec4_result = retype(vec4_result, BRW_REGISTER_TYPE_HF);
+   } else {
+      vec4_result = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
+   }
    fs_inst *inst = bld.emit(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL,
                             vec4_result, surf_index, vec4_offset);
    inst->size_written = 4 * vec4_result.component_size(inst->exec_size);
@@ -197,8 +205,23 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld,
    }
 
    vec4_result.type = dst.type;
-   bld.MOV(dst, offset(vec4_result, bld,
-                       (const_offset & 0xf) / type_sz(vec4_result.type)));
+
+   if (type_sz(dst.type) == 2) {
+      /* 16-bit types need to be unshuffled, as each pair of 16-bit
+       * components is packed into a 32-bit component because we use a
+       * 32-bit format in the uniform surface read by the sampler.
+       * TODO: On BDW+ mark when a uniform has a 16-bit type so we could
+       * set up a 16-bit surface format and use the 16-bit return format
+       * at the sampler.
+       */
+      vec4_result.stride = 2;
+      bld.MOV(dst, byte_offset(offset(vec4_result, bld,
+                                      (const_offset & 0x7) / 4),
+                               (const_offset & 0x7) / 2 % 2 * 2));
+   } else {
+      bld.MOV(dst, offset(vec4_result, bld,
+                          (const_offset & 0xf) / type_sz(vec4_result.type)));
+   }
 }
 
 /**
diff --git a/src/intel/compiler/brw_fs_generator.cpp b/src/intel/compiler/brw_fs_generator.cpp
index a3861cd68e..00a4e29147 100644
--- a/src/intel/compiler/brw_fs_generator.cpp
+++ b/src/intel/compiler/brw_fs_generator.cpp
@@ -1381,12 +1381,18 @@ fs_generator::generate_varying_pull_constant_load_gen7(fs_inst *inst,
    uint32_t simd_mode, rlen, mlen;
    if (inst->exec_size == 16) {
       mlen = 2;
-      rlen = 8;
+      if (type_sz(dst.type) == 2 && (devinfo->gen >= 9))
+         rlen = 4;
+      else
+         rlen = 8;
       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
    } else {
       assert(inst->exec_size == 8);
       mlen = 1;
-      rlen = 4;
+      if (type_sz(dst.type) == 2 && (devinfo->gen >= 9))
+         rlen = 2;
+      else
+         rlen = 4;
       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
    }
 
-- 
2.14.3
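
For anyone checking the offset arithmetic in the 16-bit path of the first
hunk: (const_offset & 0x7) / 4 selects which of the two fetched 32-bit
components holds the value, and (const_offset & 0x7) / 2 % 2 * 2 is the
byte offset of the 16-bit half inside that component. A standalone C
sketch (a host-side check written for this note, not driver code) walking
the four possible 16-bit slots:

   #include <assert.h>
   #include <stdio.h>

   int main(void)
   {
      /* const_offset is the byte offset of a 16-bit uniform component
       * within the 8 bytes (two dwords) fetched per channel. */
      for (unsigned const_offset = 0; const_offset < 8; const_offset += 2) {
         unsigned dword = (const_offset & 0x7) / 4;         /* 32-bit slot */
         unsigned byte  = (const_offset & 0x7) / 2 % 2 * 2; /* half inside */
         printf("const_offset=%u -> dword %u, byte %u\n",
                const_offset, dword, byte);
         /* The pair (dword, byte) addresses the original 16 bits again. */
         assert(dword * 4 + byte == (const_offset & 0x7));
      }
      return 0;
   }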