From: Jose Maria Casanova Crespo <jmcasan...@igalia.com> From Skylake PRM, Volume 07 3D Media GPGPU, Section Register Region Restrictions, SubSection 5. Special Cases for Word Operations (page 781):
"There is a relaxed alignment rule for word destinations. When the destination type is word (UW, W, HF), destination data types can be aligned to either the lowest word or the second lowest word of the execution channel. This means the destination data words can be either all in the even word locations or all in the odd word locations. // Example: add (8) r10.0<2>:hf r11.0<8;8,1>:f r12.0<8;8,1>:hf add (8) r10.1<2>:hf r11.0<8;8,1>:f r12.0<8;8,1>:hf // Note: The destination offset may be .0 or .1 although the destination subregister // is required to be aligned to execution datatype." In practice, that means that for alu operations like float to half float conversions, we need to align the 16-bit to 32-bit (so setting the stride to 2). As a first approach, we are setting the stride to 2 in general, even if it is not an alu operation, as simplifies handling with 16-bit data types. We can revisit this in the future. Signed-off-by: Jose Maria Casanova Crespo <jmcasan...@igalia.com> Signed-off-by: Alejandro Piñeiro <apinhe...@igalia.com> --- src/intel/compiler/brw_fs_nir.cpp | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index e4eba1401f8..c469baf42a1 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -585,6 +585,21 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr) result.type = brw_type_for_nir_type(devinfo, (nir_alu_type)(nir_op_infos[instr->op].output_type | nir_dest_bit_size(instr->dest.dest))); + /* According to PRM, Volume 07 3D Media GPGPU, Section Register Region + * Restrictions, SubSection 5. Special Cases for Word Operations, GEN >= 8 + * can only handle writting on 16-bit (UW,HF,W) registers on the lower + * word or in the higher word for each double word. It needs to be the + * same offset for for all 16-bit elements of the register. So as + * consequence all destinations of 16-bit registers should use a stride + * equal to 2. + * + * We also need alignment to 32-bit for conversions operations + * form 32-bit to 16-bit. So this is also useful for this operations + * that are the only alu operation supported right now. + */ + + if (nir_dest_bit_size(instr->dest.dest) == 16) + result.stride = 2; fs_reg op[4]; for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) { @@ -594,6 +609,9 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr) nir_src_bit_size(instr->src[i].src))); op[i].abs = instr->src[i].abs; op[i].negate = instr->src[i].negate; + /* Previous comment still aplies here for source elements */ + if (nir_src_bit_size(instr->src[i].src) == 16) + op[i].stride = 2; } /* We get a bunch of mov's out of the from_ssa pass and they may still @@ -2083,6 +2101,9 @@ fs_visitor::emit_gs_input_load(const fs_reg &dst, first_component = first_component / 2; } + if (type_sz(tmp_dst.type) == 2) + tmp_dst.stride = 2; + for (unsigned iter = 0; iter < num_iterations; iter++) { if (offset_const) { /* Constant indexing - use global offset. */ @@ -2414,6 +2435,9 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld, dst = tmp; } + if (type_sz(dst.type) == 2) + dst.stride = 2; + for (unsigned iter = 0; iter < num_iterations; iter++) { if (indirect_offset.file == BAD_FILE) { /* Constant indexing - use global offset. */ @@ -2725,6 +2749,9 @@ fs_visitor::nir_emit_tes_intrinsic(const fs_builder &bld, first_component = first_component / 2; } + if (type_sz(dest.type) == 2) + dest.stride = 2; + fs_inst *inst; if (indirect_offset.file == BAD_FILE) { /* Arbitrarily only push up to 32 vec4 slots worth of data, @@ -2735,7 +2762,7 @@ fs_visitor::nir_emit_tes_intrinsic(const fs_builder &bld, fs_reg src = fs_reg(ATTR, imm_offset / 2, dest.type); for (int i = 0; i < instr->num_components; i++) { unsigned comp = 16 / type_sz(dest.type) * (imm_offset % 2) + - i + first_component; + i * dest.stride + first_component; bld.MOV(offset(dest, bld, i), component(src, comp)); } tes_prog_data->base.urb_read_length = @@ -3182,6 +3209,9 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld, num_components *= 2; } + if (type_sz(dest.type) == 2) + dest.stride = 2; + for (unsigned int i = 0; i < num_components; i++) { struct brw_reg interp = interp_reg(base, component + i); interp = suboffset(interp, 3); @@ -3576,6 +3606,9 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr if (nir_intrinsic_infos[instr->intrinsic].has_dest) dest = get_nir_dest(instr->dest); + if (instr->dest.is_ssa && nir_dest_bit_size(instr->dest) == 16) + dest.stride = 2; + switch (instr->intrinsic) { case nir_intrinsic_atomic_counter_inc: case nir_intrinsic_atomic_counter_dec: -- 2.11.0 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev