Two shaders that appear in Unigine benchmarks (Heaven and Valley) unpack three bytes from an integer and convert each into a float:

    float((val >> 16u) & 0xffu)
    float((val >> 8u) & 0xffu)
    float((val >> 0u) & 0xffu)

Instead of shifting, masking, and type converting like this:

    shr(8) g15<1>UD g25<8,8,1>UD 0x00000010UD
    and(8) g16<1>UD g15<8,8,1>UD 0x000000ffUD
    mov(8) g17<1>F g16<8,8,1>UD

    shr(8) g18<1>UD g25<8,8,1>UD 0x00000008UD
    and(8) g19<1>UD g18<8,8,1>UD 0x000000ffUD
    mov(8) g20<1>F g19<8,8,1>UD

    and(8) g21<1>UD g25<8,8,1>UD 0x000000ffUD
    mov(8) g22<1>F g21<8,8,1>UD

i965 can simply extract a byte and convert to float in a single instruction:

    mov(8) g17<1>F g25.2<16,4,4>UB
    mov(8) g20<1>F g25.1<16,4,4>UB
    mov(8) g22<1>F g25.0<16,4,4>UB

Decreases the number of instructions and cycles in the two programs by:

    #1706: 3728 -> 3363 instructions (-9.79%), 9594 -> 9180 cycles (-4.32%)
    #1721: 4027 -> 3662 instructions (-9.06%), 10264 -> 9572 cycles (-6.74%)
---
This is dependent on Connor's outstanding i965 scheduling patches and requires some benchmark data.

Presumably we'll need a lower_byte_extract flag, or maybe we get lucky and both vc4 and freedreno are capable of handling this opcode like i965.

I should probably split the nir and i965 pieces into separate patches.

TODO: test the i965/vec4 code.

src/glsl/nir/nir_opcodes.py | 4 ++ src/glsl/nir/nir_opt_algebraic.py | 6 +++ src/mesa/drivers/dri/i965/brw_defines.h | 6 +++ src/mesa/drivers/dri/i965/brw_fs.h | 2 + src/mesa/drivers/dri/i965/brw_fs_cse.cpp | 1 + src/mesa/drivers/dri/i965/brw_fs_generator.cpp | 6 +++ src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 47 ++++++++++++++++++++++++ src/mesa/drivers/dri/i965/brw_shader.cpp | 2 + src/mesa/drivers/dri/i965/brw_vec4_cse.cpp | 1 + src/mesa/drivers/dri/i965/brw_vec4_generator.cpp | 8 ++++ src/mesa/drivers/dri/i965/brw_vec4_nir.cpp | 6 +++ 11 files changed, 89 insertions(+) diff --git a/src/glsl/nir/nir_opcodes.py b/src/glsl/nir/nir_opcodes.py index 37d3dfc..1d85ec9 100644 --- a/src/glsl/nir/nir_opcodes.py +++ b/src/glsl/nir/nir_opcodes.py @@ -540,6 +540,10 @@ dst.x = src0.x; dst.y = src1.x; """) +# Byte extraction +binop_convert("extract_byte", tuint, tuint, "", "(unsigned)src0 >> (src1 * 8) & 0xff") + + def triop(name, ty, const_expr): opcode(name, 0, ty, [0, 0, 0], [ty, ty, ty], "", const_expr) def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr): diff --git a/src/glsl/nir/nir_opt_algebraic.py b/src/glsl/nir/nir_opt_algebraic.py index 6aa8b1f..a236ac6 100644 --- a/src/glsl/nir/nir_opt_algebraic.py +++ b/src/glsl/nir/nir_opt_algebraic.py @@ -202,6 +202,12 @@ optimizations = [ (('f2i', ('ftrunc', a)), ('f2i', a)), (('f2u', ('ftrunc', a)), ('f2u', a)), + # Byte extraction + (('iand', 0xff, ('ushr', a, 24)), ('extract_byte', a, 3)), + (('iand', 0xff, ('ushr', a, 16)), ('extract_byte', a, 2)), + (('iand', 0xff, ('ushr', a, 8)), ('extract_byte', a, 1)), + (('iand', 0xff, a), ('extract_byte', a, 0)), + # Subtracts (('fsub', a, ('fsub', 0.0, b)), ('fadd', a, b)), (('isub', a, ('isub', 0, b)), ('iadd', a, b)), diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h index ade3ede..dc9069d 100644 --- a/src/mesa/drivers/dri/i965/brw_defines.h +++ b/src/mesa/drivers/dri/i965/brw_defines.h @@ -1079,6 +1079,12 @@ 
enum opcode { */ SHADER_OPCODE_BROADCAST, + /** + * Pick the byte from its first source register given by the index + * specified as second source. + */ + SHADER_OPCODE_EXTRACT_BYTE, + VEC4_OPCODE_MOV_BYTES, VEC4_OPCODE_PACK_BYTES, VEC4_OPCODE_UNPACK_UNIFORM, diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h index 5729fdf..718ebf3 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.h +++ b/src/mesa/drivers/dri/i965/brw_fs.h @@ -263,6 +263,8 @@ public: void emit_percomp(const brw::fs_builder &bld, const fs_inst &inst, unsigned wr_mask); + bool optimize_extract_byte_to_float(nir_alu_instr *instr, + const fs_reg &result); bool optimize_frontfacing_ternary(nir_alu_instr *instr, const fs_reg &result); diff --git a/src/mesa/drivers/dri/i965/brw_fs_cse.cpp b/src/mesa/drivers/dri/i965/brw_fs_cse.cpp index 3b65a38..c496036 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_cse.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_cse.cpp @@ -78,6 +78,7 @@ is_expression(const fs_visitor *v, const fs_inst *const inst) case FS_OPCODE_LINTERP: case SHADER_OPCODE_FIND_LIVE_CHANNEL: case SHADER_OPCODE_BROADCAST: + case SHADER_OPCODE_EXTRACT_BYTE: case SHADER_OPCODE_MOV_INDIRECT: return true; case SHADER_OPCODE_RCP: diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp index c25da07..b090824 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp @@ -2193,6 +2193,12 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width) brw_broadcast(p, dst, src[0], src[1]); break; + case SHADER_OPCODE_EXTRACT_BYTE: + brw_MOV(p, dst, stride(suboffset(retype(src[0], BRW_REGISTER_TYPE_UB), + src[1].ud), + 16, 4, 4)); + break; + case FS_OPCODE_SET_SAMPLE_ID: generate_set_sample_id(inst, dst, src[0], src[1]); break; diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp index 15bd98f..d6e414c 100644 --- 
a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp @@ -453,6 +453,43 @@ fs_visitor::nir_emit_instr(nir_instr *instr) } } +/** + * Recognizes a parent instruction of nir_op_extract_byte and changes the type + * to match instr. + * + * Used as a peephole by i2f and u2f. nir_op_extract_byte returns an unsigned, + * but i965 can extract a byte and do type conversion in a single instruction. + */ +bool +fs_visitor::optimize_extract_byte_to_float(nir_alu_instr *instr, + const fs_reg &result) +{ + if (!instr->src[0].src.is_ssa || + !instr->src[0].src.ssa->parent_instr) + return false; + + if (instr->src[0].src.ssa->parent_instr->type != nir_instr_type_alu) + return false; + + nir_alu_instr *src0 = + nir_instr_as_alu(instr->src[0].src.ssa->parent_instr); + + if (src0->op != nir_op_extract_byte) + return false; + + nir_const_value *byte = nir_src_as_const_value(src0->src[1].src); + assert(byte != NULL && byte->u[0] <= 3); + + fs_reg op0 = get_nir_src(src0->src[0].src); + op0.type = brw_type_for_nir_type(nir_op_infos[src0->op].input_types[0]); + op0 = offset(op0, bld, src0->src[0].swizzle[0]); + + set_saturate(instr->dest.saturate, + bld.emit(SHADER_OPCODE_EXTRACT_BYTE, + result, op0, brw_imm_ud(byte->u[0]))); + return true; +} + bool fs_visitor::optimize_frontfacing_ternary(nir_alu_instr *instr, const fs_reg &result) @@ -624,6 +661,9 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr) switch (instr->op) { case nir_op_i2f: case nir_op_u2f: + if (optimize_extract_byte_to_float(instr, result)) + return; + inst = bld.MOV(result, op[0]); inst->saturate = instr->dest.saturate; break; @@ -1036,6 +1076,13 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr) inst->predicate = BRW_PREDICATE_NORMAL; break; + case nir_op_extract_byte: { + nir_const_value *byte = nir_src_as_const_value(instr->src[1].src); + bld.emit(SHADER_OPCODE_EXTRACT_BYTE, + result, op[0], brw_imm_ud(byte->u[0])); + break; + } + default: 
unreachable("unhandled instruction"); } diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp index 5a6752b..b0f0e5a 100644 --- a/src/mesa/drivers/dri/i965/brw_shader.cpp +++ b/src/mesa/drivers/dri/i965/brw_shader.cpp @@ -423,6 +423,8 @@ brw_instruction_name(enum opcode op) case SHADER_OPCODE_BROADCAST: return "broadcast"; + case SHADER_OPCODE_EXTRACT_BYTE: + return "extract_byte"; case VEC4_OPCODE_MOV_BYTES: return "mov_bytes"; case VEC4_OPCODE_PACK_BYTES: diff --git a/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp b/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp index 85cbf24..c344de7 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp @@ -75,6 +75,7 @@ is_expression(const vec4_instruction *const inst) case VEC4_OPCODE_UNPACK_UNIFORM: case SHADER_OPCODE_FIND_LIVE_CHANNEL: case SHADER_OPCODE_BROADCAST: + case SHADER_OPCODE_EXTRACT_BYTE: return true; case SHADER_OPCODE_RCP: case SHADER_OPCODE_RSQ: diff --git a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp index acf9286..9bd4ae2 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp @@ -1463,6 +1463,14 @@ generate_code(struct brw_codegen *p, brw_broadcast(p, dst, src[0], src[1]); break; + case SHADER_OPCODE_EXTRACT_BYTE: + brw_set_default_access_mode(p, BRW_ALIGN_1); + brw_MOV(p, dst, stride(suboffset(retype(src[0], BRW_REGISTER_TYPE_UB), + src[1].ud), + 16, 4, 4)); + brw_set_default_access_mode(p, BRW_ALIGN_16); + break; + case VS_OPCODE_UNPACK_FLAGS_SIMD4X2: generate_unpack_flags(p, dst); break; diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp index 4aed60e..622bd8c 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp @@ -1473,6 +1473,12 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr) break; } + case 
nir_op_extract_byte: { + nir_const_value *byte = nir_src_as_const_value(instr->src[1].src); + emit(SHADER_OPCODE_EXTRACT_BYTE, dst, op[0], brw_imm_ud(byte->u[0])); + break; + } + case nir_op_fabs: case nir_op_iabs: case nir_op_fneg: -- 2.4.9 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev