On Thu, Oct 12, 2017 at 08:38:19PM +0200, Jose Maria Casanova Crespo wrote:
> Enables support for 16-bit types in the load_input and
> store_output intrinsics intra-stages.
>
> The approach was based on re-using the 32-bit URB reads
> and writes between stages, shuffling pairs of 16-bit values into
> 32-bit values at the store_output intrinsics and un-shuffling the
> values at the load_input intrinsics.
>
> shuffle_32bit_load_result_to_16bit_data and
> shuffle_16bit_data_for_32bit_write are implemented in a similar
> way to the analogous functions for handling 64-bit types.
> ---
>  src/intel/compiler/brw_fs.h       |  11 ++++
>  src/intel/compiler/brw_fs_nir.cpp | 119
>  +++++++++++++++++++++++++++++++++++++-
>  2 files changed, 129 insertions(+), 1 deletion(-)
>
> diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h
> index b9476e69ed..90ada3ef4b 100644
> --- a/src/intel/compiler/brw_fs.h
> +++ b/src/intel/compiler/brw_fs.h
> @@ -498,6 +498,17 @@ void shuffle_64bit_data_for_32bit_write(const
> brw::fs_builder &bld,
>                                          const fs_reg &dst,
>                                          const fs_reg &src,
>                                          uint32_t components);
> +
> +void shuffle_32bit_load_result_to_16bit_data(const brw::fs_builder &bld,
> +                                             const fs_reg &dst,
> +                                             const fs_reg &src,
> +                                             uint32_t components);
> +
> +void shuffle_16bit_data_for_32bit_write(const brw::fs_builder &bld,
> +                                        const fs_reg &dst,
> +                                        const fs_reg &src,
> +                                        uint32_t components);
> +
>  fs_reg setup_imm_df(const brw::fs_builder &bld,
>                      double v);
>
> diff --git a/src/intel/compiler/brw_fs_nir.cpp
> b/src/intel/compiler/brw_fs_nir.cpp
> index 83ff0607a7..9c694a1c53 100644
> --- a/src/intel/compiler/brw_fs_nir.cpp
> +++ b/src/intel/compiler/brw_fs_nir.cpp
> @@ -2124,12 +2124,17 @@ fs_visitor::emit_gs_input_load(const fs_reg &dst,
>        first_component = first_component / 2;
>     }
>
> +   if (type_sz(dst.type) == 2) {
> +      num_components = DIV_ROUND_UP(num_components, 2);
> +      tmp_dst = bld.vgrf(BRW_REGISTER_TYPE_F, num_components);
> +   }
> +
>     for (unsigned iter = 0; iter < num_iterations; iter++) {
>        if (offset_const) {
>           /* Constant indexing - use global offset. */
>           if (first_component != 0) {
>              unsigned read_components = num_components + first_component;
> -            fs_reg tmp = bld.vgrf(dst.type, read_components);
> +            fs_reg tmp = bld.vgrf(tmp_dst.type, read_components);
>              inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, icp_handle);
>              inst->size_written = read_components *
>                 tmp.component_size(inst->exec_size);
> @@ -2179,6 +2184,11 @@ fs_visitor::emit_gs_input_load(const fs_reg &dst,
>         bld.MOV(offset(dst, bld, iter * 2 + c), offset(tmp_dst, bld, c));
>      }
>
> +   if (type_sz(dst.type) == 2) {
> +      shuffle_32bit_load_result_to_16bit_data(
> +         bld, dst, retype(tmp_dst, BRW_REGISTER_TYPE_F),
> orig_num_components);
We don't need retype() here, do we? Looking at the block added a little
earlier where "tmp_dst" gets created.

> +   }
> +
>     if (num_iterations > 1) {
>        num_components = orig_num_components - 2;
>        if(offset_const) {
> @@ -2484,6 +2494,11 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder
> &bld,
>           dst = tmp;
>        }
>
> +      if (type_sz(dst.type) == 2) {
> +         num_components = DIV_ROUND_UP(num_components, 2);
> +         dst = bld.vgrf(BRW_REGISTER_TYPE_F, num_components);
> +      }
> +
>        for (unsigned iter = 0; iter < num_iterations; iter++) {
>           if (indirect_offset.file == BAD_FILE) {
>              /* Constant indexing - use global offset. */
> @@ -2539,6 +2554,11 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder
> &bld,
>              }
>           }
>
> +         if (type_sz(orig_dst.type) == 2) {
> +            shuffle_32bit_load_result_to_16bit_data(
> +               bld, orig_dst, dst, instr->num_components);
> +         }
> +
>          /* Copy the temporary to the destination to deal with writemasking.
>           *
>           * Also attempt to deal with gl_PointSize being in the .w component.
> @@ -2629,6 +2649,8 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder
> &bld,
>        fs_reg value = get_nir_src(instr->src[0]);
>        bool is_64bit = (instr->src[0].is_ssa ?
>           instr->src[0].ssa->bit_size : instr->src[0].reg.reg->bit_size) ==
> 64;
> +      bool is_16bit = (instr->src[0].is_ssa ?
> +         instr->src[0].ssa->bit_size : instr->src[0].reg.reg->bit_size) ==
> 16;
>        fs_reg indirect_offset = get_indirect_offset(instr);
>        unsigned imm_offset = instr->const_index[0];
>        unsigned swiz = BRW_SWIZZLE_XYZW;
> @@ -2659,6 +2681,10 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder
> &bld,
>              num_iterations = 2;
>              iter_components = 2;
>           }
> +      } else {
> +         if (is_16bit) {
> +            iter_components = DIV_ROUND_UP(num_components, 2);
> +         }
>        }
>
>       /* 64-bit data needs to me shuffled before we can write it to the URB.
> @@ -2711,6 +2737,12 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder
> &bld,
>              continue;
>
>           if (!is_64bit) {
> +            if (is_16bit) {
> +               shuffle_16bit_data_for_32bit_write(bld,
> +                  retype(offset(value,bld,BRW_GET_SWZ(swiz, i)),
> BRW_REGISTER_TYPE_F),
> +                  retype(offset(value,bld,BRW_GET_SWZ(swiz, i)),
> BRW_REGISTER_TYPE_HF),
> +                  2);
> +            }
>              srcs[header_regs + i + first_component] =
>                 offset(value, bld, BRW_GET_SWZ(swiz, i));
>           } else {
> @@ -2862,6 +2894,11 @@ fs_visitor::nir_emit_tes_intrinsic(const fs_builder
> &bld,
>           dest = tmp;
>        }
>
> +      if (type_sz(dest.type) == 2) {
> +         num_components = DIV_ROUND_UP(num_components, 2);
> +         dest = bld.vgrf(BRW_REGISTER_TYPE_F, num_components);
> +      }
> +
>        for (unsigned iter = 0; iter < num_iterations; iter++) {
>           const fs_reg srcs[] = {
>              retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD),
> @@ -2904,6 +2941,11 @@ fs_visitor::nir_emit_tes_intrinsic(const fs_builder
> &bld,
>           }
>        }
>
> +      if (type_sz(dest.type) == 2) {
> +         shuffle_32bit_load_result_to_16bit_data(
> +            bld, orig_dest, retype(dest, BRW_REGISTER_TYPE_F),
> num_components);

Here, too, the retype() looks unnecessary.
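If I read the earlier hunk right, "dest" is already allocated as
BRW_REGISTER_TYPE_F there, so I'd expect plain

        shuffle_32bit_load_result_to_16bit_data(bld, orig_dest, dest,
                                                num_components);

to be enough (untested).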
> +      }
> +
>       /* If we are loading double data and we need a second read
> message
>        * adjust the offset
>        */
> @@ -3257,6 +3299,13 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder
> &bld,
>           num_components *= 2;
>        }
>
> +      fs_reg orig_dest = dest;
> +      if (nir_dest_bit_size(instr->dest) == 16) {
> +         type = BRW_REGISTER_TYPE_F;
> +         num_components = DIV_ROUND_UP(num_components, 2);
> +         dest = bld.vgrf(type, num_components);
> +      }
> +
>        for (unsigned int i = 0; i < num_components; i++) {
>           struct brw_reg interp = interp_reg(base, component + i);
>           interp = suboffset(interp, 3);
> @@ -3270,6 +3319,12 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder
> &bld,
>                                                  retype(dest, type),
>                                                  instr->num_components);
>        }
> +      if (nir_dest_bit_size(instr->dest) == 16) {
> +         shuffle_32bit_load_result_to_16bit_data(bld,
> +                                                 orig_dest,
> +                                                 retype(dest, type),

Same here, "dest" gets created with "type".

> +                                                 instr->num_components);
> +      }
>        break;
>     }
>
> @@ -4182,6 +4237,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld,
> nir_intrinsic_instr *instr
>        unsigned first_component = nir_intrinsic_component(instr);
>        unsigned bit_size = instr->src[0].is_ssa ?
>           instr->src[0].ssa->bit_size : instr->src[0].reg.reg->bit_size;
> +

Leftover?

>        if (bit_size == 64) {
>           fs_reg tmp =
>              fs_reg(VGRF, alloc.allocate(2 * num_components),
> @@ -4192,6 +4248,16 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld,
> nir_intrinsic_instr *instr
>           num_components *= 2;
>        }
>
> +      if (bit_size == 16) {
> +         fs_reg tmp =
> +            fs_reg(VGRF, alloc.allocate(DIV_ROUND_UP(num_components, 2)),
> +                   BRW_REGISTER_TYPE_F);

Earlier you used the builder; perhaps here also:

        fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F,
                              DIV_ROUND_UP(num_components, 2));

> +         shuffle_16bit_data_for_32bit_write(
> +            bld, tmp, retype(src, BRW_REGISTER_TYPE_HF), num_components);

Isn't "src" already a 16-bit type? It is taken from "instr->src[0]", the
same place "bit_size" comes from.

> +         src = retype(tmp, src.type);
> +         num_components = DIV_ROUND_UP(num_components, 2);
> +      }
> +
>        for (unsigned j = 0; j < num_components; j++) {
>           bld.MOV(offset(new_dest, bld, j + first_component),
>                   offset(src, bld, j));
> @@ -4815,6 +4881,33 @@ shuffle_32bit_load_result_to_64bit_data(const
> fs_builder &bld,
>     }
>  }
>
> +void
> +shuffle_32bit_load_result_to_16bit_data(const fs_builder &bld,
> +                                        const fs_reg &dst,
> +                                        const fs_reg &src,
> +                                        uint32_t components)
> +{
> +   assert(type_sz(src.type) == 4);
> +   assert(type_sz(dst.type) == 2);
> +
> +   fs_reg tmp = retype(bld.vgrf(src.type), dst.type);
> +
> +   for (unsigned i = 0; i < components; i++) {
> +      const fs_reg component_i = subscript(offset(src, bld, i / 2),
> dst.type, i % 2);
> +
> +      bld.MOV(offset(tmp, bld, i % 2), component_i);
> +
> +      if (i % 2) {
> +         bld.MOV(offset(dst, bld, i -1), offset(tmp, bld, 0));
> +         bld.MOV(offset(dst, bld, i), offset(tmp, bld, 1));
> +      }
> +   }
> +   if (components % 2) {
> +      bld.MOV(offset(dst, bld, components - 1), tmp);
> +   }
> +}
> +
> +
>  /**
>   * This helper does the inverse operation of
>   * SHUFFLE_32BIT_LOAD_RESULT_TO_64BIT_DATA.
> @@ -4849,6 +4942,30 @@ shuffle_64bit_data_for_32bit_write(const fs_builder
> &bld,
>     }
>  }
>
> +void
> +shuffle_16bit_data_for_32bit_write(const fs_builder &bld,
> +                                   const fs_reg &dst,
> +                                   const fs_reg &src,
> +                                   uint32_t components)
> +{
> +   assert(type_sz(src.type) == 2);
> +   assert(type_sz(dst.type) == 4);
> +
> +   fs_reg tmp = bld.vgrf(dst.type);
> +
> +   for (unsigned i = 0; i < components; i++) {
> +      const fs_reg component_i = offset(src, bld, i);
> +      bld.MOV(subscript(tmp, src.type, i % 2), component_i);
> +      if (i % 2) {
> +         bld.MOV(offset(dst, bld, i / 2), tmp);
> +      }
> +      if (components % 2) {
> +         bld.MOV(offset(dst, bld, components / 2), tmp);
> +      }
> +   }
> +}
> +
> +
>  fs_reg
>  setup_imm_df(const fs_builder &bld, double v)
>  {
> --
> 2.13.6
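One general note, mostly to check my own understanding of the layout the
two new helpers produce: each pair of 16-bit components ends up sharing
one 32-bit component of the URB message, component 0 in the low half, and
a trailing odd component still occupies a full 32-bit slot. A rough
standalone sketch of that packing (plain C++, illustrative only; this is
not the fs_builder code, and the function names here are made up):

   #include <cassert>
   #include <cstdint>
   #include <vector>

   // Pack N 16-bit components into (N + 1) / 2 32-bit dwords, two
   // components per dword, low half first. This is my reading of the
   // data movement shuffle_16bit_data_for_32bit_write emits as MOVs.
   static std::vector<uint32_t>
   pack_16bit_for_32bit_write(const std::vector<uint16_t> &src)
   {
      std::vector<uint32_t> dst((src.size() + 1) / 2, 0);
      for (size_t i = 0; i < src.size(); i++)
         dst[i / 2] |= uint32_t(src[i]) << (16 * (i % 2));
      return dst;
   }

   // The inverse, i.e. what shuffle_32bit_load_result_to_16bit_data
   // appears to do with the 32-bit URB read results.
   static std::vector<uint16_t>
   unpack_32bit_load_to_16bit(const std::vector<uint32_t> &src,
                              size_t components)
   {
      std::vector<uint16_t> dst(components);
      for (size_t i = 0; i < components; i++)
         dst[i] = uint16_t(src[i / 2] >> (16 * (i % 2)));
      return dst;
   }

   int main()
   {
      // A vec3 of half-floats (1.0h, 2.0h, 3.0h) takes two dwords; the
      // high half of the second dword is unused.
      const std::vector<uint16_t> hf = { 0x3c00, 0x4000, 0x4200 };
      const std::vector<uint32_t> packed = pack_16bit_for_32bit_write(hf);
      assert(packed.size() == 2);
      assert(unpack_32bit_load_to_16bit(packed, 3) == hf);
      return 0;
   }

If that matches the intent, it also explains the
DIV_ROUND_UP(num_components, 2) register sizing used throughout the patch.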