Based on my (fairly weak) understanding of vstrides, this patch is Reviewed-by: Ian Romanick <ian.d.roman...@intel.com>
On 10/11/2016 02:01 AM, Iago Toral Quiroga wrote: > These need to be emitted as align1 MOV's, since they need to have a > stride of 2 on the float register (whether src or dest) so that data > from another thread doesn't cross the middle of a SIMD8 register. > > v2 (Iago): > - The float-to-double needs to align 32-bit data to 64-bit before doing the > conversion. This was doable in align16 when we tried to use an execsize > of 4, but with an execsize of 8 we would need another align1 opcode to do > that (since we need data to cross the middle of a SIMD register). Just > making the opcode handle this internally seems more practical that adding > another opcode just for this purpose and having the caller know about this > before converting. > - The double-to-float conversion produces 32-bit elements aligned to 64-bit > so we make the opcode re-pack the result to 32-bit and fit in one register, > as expected by SIMD4x2 operation. This still requires that callers reserve > two registers for the float data destination because we need to produce > 64-bit aligned data first, and repack it later on the same destination > register, but it saves the need for a re-pack opcode only to achieve this > making the operation complete in a single opcode. Hopefully that is worth > the weirdness of the double register allocation... > > Signed-off-by: Connor Abbott <connor.w.abb...@intel.com> > Signed-off-by: Iago Toral Quiroga <ito...@igalia.com> > --- > src/mesa/drivers/dri/i965/brw_defines.h | 2 ++ > src/mesa/drivers/dri/i965/brw_shader.cpp | 4 +++ > src/mesa/drivers/dri/i965/brw_vec4.cpp | 8 +++++ > src/mesa/drivers/dri/i965/brw_vec4_generator.cpp | 44 > ++++++++++++++++++++++++ > 4 files changed, 58 insertions(+) > > diff --git a/src/mesa/drivers/dri/i965/brw_defines.h > b/src/mesa/drivers/dri/i965/brw_defines.h > index c4e0f27..79b96a4 100644 > --- a/src/mesa/drivers/dri/i965/brw_defines.h > +++ b/src/mesa/drivers/dri/i965/brw_defines.h > @@ -1098,6 +1098,8 @@ enum opcode { > VEC4_OPCODE_MOV_BYTES, > VEC4_OPCODE_PACK_BYTES, > VEC4_OPCODE_UNPACK_UNIFORM, > + VEC4_OPCODE_DOUBLE_TO_FLOAT, > + VEC4_OPCODE_FLOAT_TO_DOUBLE, > > FS_OPCODE_DDX_COARSE, > FS_OPCODE_DDX_FINE, > diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp > b/src/mesa/drivers/dri/i965/brw_shader.cpp > index ed81563..b063f77 100644 > --- a/src/mesa/drivers/dri/i965/brw_shader.cpp > +++ b/src/mesa/drivers/dri/i965/brw_shader.cpp > @@ -317,6 +317,10 @@ brw_instruction_name(const struct gen_device_info > *devinfo, enum opcode op) > return "pack_bytes"; > case VEC4_OPCODE_UNPACK_UNIFORM: > return "unpack_uniform"; > + case VEC4_OPCODE_DOUBLE_TO_FLOAT: > + return "double_to_float"; > + case VEC4_OPCODE_FLOAT_TO_DOUBLE: > + return "float_to_double"; > > case FS_OPCODE_DDX_COARSE: > return "ddx_coarse"; > diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp > b/src/mesa/drivers/dri/i965/brw_vec4.cpp > index c29cfb5..32c04b2 100644 > --- a/src/mesa/drivers/dri/i965/brw_vec4.cpp > +++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp > @@ -253,6 +253,8 @@ vec4_instruction::can_do_writemask(const struct > gen_device_info *devinfo) > { > switch (opcode) { > case SHADER_OPCODE_GEN4_SCRATCH_READ: > + case VEC4_OPCODE_DOUBLE_TO_FLOAT: > + case VEC4_OPCODE_FLOAT_TO_DOUBLE: > case VS_OPCODE_PULL_CONSTANT_LOAD: > case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7: > case VS_OPCODE_SET_SIMD4X2_HEADER_GEN9: > @@ -505,6 +507,12 @@ vec4_visitor::opt_reduce_swizzle() > case BRW_OPCODE_DP2: > swizzle = brw_swizzle_for_size(2); > break; > + > + case VEC4_OPCODE_FLOAT_TO_DOUBLE: > + case VEC4_OPCODE_DOUBLE_TO_FLOAT: > + swizzle = brw_swizzle_for_size(4); > + break; > + > default: > swizzle = brw_swizzle_for_mask(inst->dst.writemask); > break; > diff --git a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp > b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp > index 163cf9d..6f4c438 100644 > --- a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp > +++ b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp > @@ -1896,6 +1896,50 @@ generate_code(struct brw_codegen *p, > break; > } > > + case VEC4_OPCODE_DOUBLE_TO_FLOAT: { > + assert(src[0].type == BRW_REGISTER_TYPE_DF); > + assert(dst.type == BRW_REGISTER_TYPE_F); > + > + brw_set_default_access_mode(p, BRW_ALIGN_1); > + > + dst.hstride = BRW_HORIZONTAL_STRIDE_2; > + dst.width = BRW_WIDTH_4; > + src[0].vstride = BRW_VERTICAL_STRIDE_4; > + src[0].width = BRW_WIDTH_4; > + brw_MOV(p, dst, src[0]); > + > + struct brw_reg dst_as_src = dst; > + dst.hstride = BRW_HORIZONTAL_STRIDE_1; > + dst.width = BRW_WIDTH_8; > + brw_MOV(p, dst, dst_as_src); > + > + brw_set_default_access_mode(p, BRW_ALIGN_16); > + break; > + } > + > + case VEC4_OPCODE_FLOAT_TO_DOUBLE: { > + assert(src[0].type == BRW_REGISTER_TYPE_F); > + assert(dst.type == BRW_REGISTER_TYPE_DF); > + > + brw_set_default_access_mode(p, BRW_ALIGN_1); > + > + struct brw_reg tmp = retype(dst, src[0].type); > + tmp.hstride = BRW_HORIZONTAL_STRIDE_2; > + tmp.width = BRW_WIDTH_4; > + src[0].vstride = BRW_VERTICAL_STRIDE_4; > + src[0].hstride = BRW_HORIZONTAL_STRIDE_1; > + src[0].width = BRW_WIDTH_4; > + brw_MOV(p, tmp, src[0]); > + > + tmp.vstride = BRW_VERTICAL_STRIDE_8; > + tmp.hstride = BRW_HORIZONTAL_STRIDE_2; > + tmp.width = BRW_WIDTH_4; > + brw_MOV(p, dst, tmp); > + > + brw_set_default_access_mode(p, BRW_ALIGN_16); > + break; > + } > + > case VEC4_OPCODE_PACK_BYTES: { > /* Is effectively: > * > _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev