Signed-off-by: Topi Pohjolainen <topi.pohjolai...@intel.com> --- src/mesa/drivers/dri/i965/brw_defines.h | 1 + src/mesa/drivers/dri/i965/brw_fs.h | 5 ++ src/mesa/drivers/dri/i965/brw_fs_generator.cpp | 96 ++++++++++++++++++++++++++ 3 files changed, 102 insertions(+)
diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h index 4a173db..88097b7 100644 --- a/src/mesa/drivers/dri/i965/brw_defines.h +++ b/src/mesa/drivers/dri/i965/brw_defines.h @@ -919,6 +919,7 @@ enum opcode { FS_OPCODE_SET_SAMPLE_ID, FS_OPCODE_SET_SIMD4X2_OFFSET, FS_OPCODE_PACK_HALF_2x16_SPLIT, + FS_OPCODE_PACK_DOUBLE_2x32, FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X, FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y, FS_OPCODE_PLACEHOLDER_HALT, diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h index 8c11c32..66173fe 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.h +++ b/src/mesa/drivers/dri/i965/brw_fs.h @@ -759,6 +759,11 @@ private: struct brw_reg dst, struct brw_reg src); + void generate_pack_double_2x32(fs_inst *inst, + struct brw_reg dst, + struct brw_reg hi, + struct brw_reg lo); + void generate_shader_time_add(fs_inst *inst, struct brw_reg payload, struct brw_reg offset, diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp index 21c9660..2b20f7c 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp @@ -1413,6 +1413,98 @@ fs_generator::generate_pack_half_2x16_split(fs_inst *inst, } void +fs_generator::generate_pack_double_2x32(fs_inst *inst, + struct brw_reg dst, + struct brw_reg hi, + struct brw_reg lo) +{ + assert(brw->gen >= 7); + assert(dst.type == BRW_REGISTER_TYPE_DF); + assert(hi.type == BRW_REGISTER_TYPE_UD); + assert(lo.type == BRW_REGISTER_TYPE_UD); + + /** + * Double precision floats take 64 bits per channel, meaning that two registers + * are needed to hold 8 elements. The values are constructed in two steps: + * first high 32-bits are copied and then the low 32. The destination is + * treated as having unsigned type but a horizontal stride telling that two + * consecutive channels are 64-bits apart. 
Both high bits and low bits + * require two moves each - hardware allows sources to span multiple + * physical registers but not the destination. Hence four moves in total are + * required. + * + * TODO: If "hi" and "lo" are both uniforms and in consecutive slots then + * on HSW and newer one could simply omit the copy. The pair of + * 32-bit slots could be treated as double precision scalar instead. + * On IVB the copy is still needed but could be done with two + * instructions each moving hi-lo-pairs. + */ + dst.type = BRW_REGISTER_TYPE_UD; + dst.width = BRW_WIDTH_4; + dst.hstride = BRW_HORIZONTAL_STRIDE_2; + dst.vstride = BRW_VERTICAL_STRIDE_8; + + if (!brw_is_scalar(hi)) { + assert(hi.hstride == BRW_HORIZONTAL_STRIDE_1); + assert(hi.vstride == BRW_VERTICAL_STRIDE_8); + hi.width = BRW_WIDTH_4; + hi.vstride = BRW_VERTICAL_STRIDE_4; + } + if (!brw_is_scalar(lo)) { + assert(lo.hstride == BRW_HORIZONTAL_STRIDE_1); + assert(lo.vstride == BRW_VERTICAL_STRIDE_8); + lo.width = BRW_WIDTH_4; + lo.vstride = BRW_VERTICAL_STRIDE_4; + } + + struct brw_reg dst_2nd_half = dst; + ++dst_2nd_half.nr; + + /* In terms of SIMD8: + * +--+--+--+--+--+--+--+--+ +--+--+--+--+--+--+--+--+ + * dst.reg |H0| |H1| |H2| |H3| | hi.reg |H0|H1|H2|H3|H4|H5|H6|H7| + * +--+--+--+--+--+--+--+--+ +--+--+--+--+--+--+--+--+ + * dst.reg+1 | | | | | | | | | lo.reg |L0|L1|L2|L3|L4|L5|L6|L7| + * +--+--+--+--+--+--+--+--+ +--+--+--+--+--+--+--+--+ + */ + brw_MOV(p, dst, hi); + + /* +--+--+--+--+--+--+--+--+ + * dst.reg |H0| |H1| |H2| |H3| | + * +--+--+--+--+--+--+--+--+ + * dst.reg+1 |H4| |H5| |H6| |H7| | + * +--+--+--+--+--+--+--+--+ + */ + if (!brw_is_scalar(hi)) { + assert(hi.subnr == 0); + hi.subnr = 4 * 4; + } + brw_MOV(p, dst_2nd_half, hi); + + /* +--+--+--+--+--+--+--+--+ + * dst.reg |H0|L0|H1|L1|H2|L2|H3|L3| + * +--+--+--+--+--+--+--+--+ + * dst.reg+1 |H4| |H5| |H6| |H7| | + * +--+--+--+--+--+--+--+--+ + */ + dst.subnr += 4; + brw_MOV(p, dst, lo); + + /* +--+--+--+--+--+--+--+--+ + * dst.reg 
|H0|L0|H1|L1|H2|L2|H3|L3| + * +--+--+--+--+--+--+--+--+ + * dst.reg+1 |H4|L4|H5|L5|H6|L6|H7|L7| + * +--+--+--+--+--+--+--+--+ + */ + if (!brw_is_scalar(lo)) { + assert(lo.subnr == 0); + lo.subnr = 4 * 4; + } + dst_2nd_half.subnr += 4; + brw_MOV(p, dst_2nd_half, lo); +} + +void fs_generator::generate_unpack_half_2x16_split(fs_inst *inst, struct brw_reg dst, struct brw_reg src) @@ -1932,6 +2024,10 @@ fs_generator::generate_code(const cfg_t *cfg) generate_pack_half_2x16_split(inst, dst, src[0], src[1]); break; + case FS_OPCODE_PACK_DOUBLE_2x32: + generate_pack_double_2x32(inst, dst, src[0], src[1]); + break; + case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X: case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y: generate_unpack_half_2x16_split(inst, dst, src[0]); -- 1.8.3.1 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev