Signed-off-by: Chad Versace <chad.vers...@linux.intel.com> --- src/mesa/drivers/dri/i965/brw_vec4.h | 3 + src/mesa/drivers/dri/i965/brw_vec4_emit.cpp | 8 ++ src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp | 155 +++++++++++++++++++++++++ 3 files changed, 166 insertions(+)
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h index e65b92c..43d0454 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4.h +++ b/src/mesa/drivers/dri/i965/brw_vec4.h @@ -431,6 +431,9 @@ public: void emit_math(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1); src_reg fix_math_operand(src_reg src); + void emit_pack_half_2x16(dst_reg dst, src_reg src0); + void emit_unpack_half_2x16(dst_reg dst, src_reg src0); + void swizzle_result(ir_texture *ir, src_reg orig_val, int sampler); void emit_ndc_computation(); diff --git a/src/mesa/drivers/dri/i965/brw_vec4_emit.cpp b/src/mesa/drivers/dri/i965/brw_vec4_emit.cpp index 747edc2..e395ada 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_emit.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_emit.cpp @@ -808,6 +808,14 @@ vec4_generator::generate_code(exec_list *instructions) brw_DP2(p, dst, src[0], src[1]); break; + case BRW_OPCODE_F32TO16: + brw_F32TO16(p, dst, src[0]); + break; + + case BRW_OPCODE_F16TO32: + brw_F16TO32(p, dst, src[0]); + break; + case BRW_OPCODE_IF: if (inst->src[0].file != BAD_FILE) { /* The instruction has an embedded compare (only allowed on gen6) */ diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp index ebf8990..b5f1aae 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp @@ -348,6 +348,143 @@ vec4_visitor::emit_math(enum opcode opcode, } void +vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0) +{ + if (intel->gen < 7) + assert(!"ir_unop_pack_half_2x16 should be lowered"); + + /* uint dst; */ + assert(dst.type == BRW_REGISTER_TYPE_UD); + + /* vec2 src0; */ + assert(src0.type == BRW_REGISTER_TYPE_F); + + /* uvec2 tmp; + * + * The PRM lists the destination type of f32to16 as W. However, I've + * experimentally confirmed on gen7 that it must be a 32-bit size, such as + * UD, in align16 mode. 
+ */ + dst_reg tmp_dst(this, glsl_type::uvec2_type); + src_reg tmp_src(tmp_dst); + + /* tmp.xy = f32to16(src0); */ + tmp_dst.writemask = WRITEMASK_XY; + emit(new(mem_ctx) vec4_instruction(this, BRW_OPCODE_F32TO16, + tmp_dst, src0)); + + /* The result's high 16 bits are in the low 16 bits of the temporary + * register's Y channel. The result's low 16 bits are in the low 16 bits + * of the X channel. + * + * In experiments on gen7 I've found that, in the temporary register, + * the high 16 bits of the X and Y channels are zeros. This is critical + * for the SHL and OR instructions below to work as expected. + */ + + /* dst = tmp.y << 16; */ + tmp_src.swizzle = SWIZZLE_Y; + emit(new(mem_ctx) vec4_instruction(this, BRW_OPCODE_SHL, + dst, tmp_src, src_reg(16u))); + /* dst |= tmp.x; */ + tmp_src.swizzle = SWIZZLE_X; + emit(new(mem_ctx) vec4_instruction(this, BRW_OPCODE_OR, + dst, src_reg(dst), tmp_src)); + + + /* Idea for reducing the above number of registers and instructions + * ---------------------------------------------------------------- + * + * It should be possible to remove the temporary register and replace the + * SHL and OR instructions above with a single MOV instruction in + * align1 mode that uses clever register region addressing. (It is + * impossible to specify the necessary register regions in align16 mode). + * Unfortunately, it is difficult to emit an align1 instruction here. + * + * In particular, I want to do this: + * + * # Give dst the form: + * # + * # w z y x w z y x + * # |0|0|0x0000hhhh|0x0000llll|0|0|0x0000hhhh|0x0000llll| + * # + * f32to16(8) dst<1>.xy:UD src<4;4,1>:F {align16} + * + * # Transform dst into the form of packHalf2x16's output. + * # + * # w z y x w z y x + * # |0|0|0x00000000|0xhhhhllll|0|0|0x00000000|0xhhhhllll| + * # + * # Use width=2 in order to move the Y channel's high 16 bits + * # into the low 16 bits, thus clearing the Y channel to zero. 
+ * # + * mov(4) dst.1<1>:UW dst.2<8;2,1>:UW {align1} + */ +} + +void +vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0) +{ + if (intel->gen < 7) + assert(!"ir_unop_unpack_half_2x16 should be lowered"); + + /* vec2 dst; */ + assert(dst.type == BRW_REGISTER_TYPE_F); + + /* uint src0; */ + assert(src0.type == BRW_REGISTER_TYPE_UD); + + /* uvec2 tmp; */ + dst_reg tmp_dst(this, glsl_type::uvec2_type); + src_reg tmp_src(tmp_dst); + + /* tmp.x = src0 & 0xffffu; */ + tmp_dst.writemask = WRITEMASK_X; + emit(new(mem_ctx) vec4_instruction(this, BRW_OPCODE_AND, + tmp_dst, src0, src_reg(0xffffu))); + + /* tmp.y = src0 >> 16u; */ + tmp_dst.writemask = WRITEMASK_Y; + emit(new(mem_ctx) vec4_instruction(this, BRW_OPCODE_SHR, + tmp_dst, src0, src_reg(16u))); + + /* dst = f16to32(tmp); */ + dst.writemask = WRITEMASK_XY; + emit(new(mem_ctx) vec4_instruction(this, BRW_OPCODE_F16TO32, + dst, tmp_src)); + + /* Idea for reducing the above number of registers and instructions + * ---------------------------------------------------------------- + * + * It should be possible to remove the temporary register and replace the + * SHR and AND instructions above with a single MOV instruction in + * align1 mode that uses clever register region addressing. (It is + * impossible to specify the necessary register regions in align16 mode). + * Unfortunately, it is difficult to emit an align1 instruction here. + * + * In particular, I want to do this: + * + * # Now, src has the form of unpackHalf2x16's input: + * # + * # w z y x w z y x + * # |0|0|0x00000000|0xhhhhllll|0|0|0x00000000|0xhhhhllll| + * + * # Transform src into a form consumable by f16to32: + * # + * # w z y x w z y x + * # |0|0|0x0000hhhh|0x0000llll|0|0|0x0000hhhh|0x0000llll| + * # + * # Use dst as the scratch register. 
+ * # + * mov(2) dst.2<1>:UW dst.1<8;1,1>:UW {align1} + * + * # Give dst the form of unpackHalf2x16's output: + * # + * f16to32(4) dst<1>.xy:F src<4;4,1>:UD {align16} + */ +} + +void vec4_visitor::visit_instructions(const exec_list *list) { foreach_list(node, list) { @@ -1469,6 +1606,24 @@ vec4_visitor::visit(ir_expression *ir) case ir_quadop_vector: assert(!"not reached: should be handled by lower_quadop_vector"); break; + + case ir_unop_pack_half_2x16: + emit_pack_half_2x16(result_dst, op[0]); + break; + case ir_unop_unpack_half_2x16: + emit_unpack_half_2x16(result_dst, op[0]); + break; + case ir_unop_pack_snorm_2x16: + case ir_unop_pack_unorm_2x16: + case ir_unop_unpack_snorm_2x16: + case ir_unop_unpack_unorm_2x16: + assert(!"not reached: should be handled by lower_packing_builtins"); + break; + case ir_unop_unpack_half_2x16_split_x: + case ir_unop_unpack_half_2x16_split_y: + case ir_binop_pack_half_2x16_split: + assert(!"not reached: should not occur in vertex shader"); + break; } } -- 1.8.1.1 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev