If mid3 was called with two constants, the resulting IR was two maxes and three mins, when one max and one min would have sufficed. Make mid3() produce an ir_expression with ir_triop_mid3 (a new ir_expression operation) and lower it in a lower_instructions pass to the required number of mins and maxes.
Tested on i965/Haswell. Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=76861 Signed-off-by: Petri Latvala <petri.latv...@intel.com> --- For the record, tested this with the following shader: #extension GL_AMD_shader_trinary_minmax : require uniform float zero; uniform float one; uniform float middle; float test_all_constants() { return mid3(0.0, 1.0, 0.5); } float test_two_constants() { return mid3(0.5, one, 0.0); } float test_one_constant() { return mid3(one, zero, 0.5); } float test_no_constants() { return mid3(middle, one, zero); } void main() { float r = test_all_constants(); float g = test_two_constants(); float b = test_one_constant(); float a = test_no_constants(); gl_FragColor = vec4(r, g, b, a); } total instructions in shared programs: 61 -> 57 (-6.56%) instructions in affected programs: 56 -> 52 (-7.14%) Existing piglit tests didn't stress the two-constants case at all so no results from there. Other than all tests passing, naturally. src/glsl/builtin_functions.cpp | 2 +- src/glsl/ir.cpp | 2 + src/glsl/ir.h | 9 +- src/glsl/ir_constant_expression.cpp | 22 ++++ src/glsl/ir_optimization.h | 1 + src/glsl/ir_validate.cpp | 6 ++ src/glsl/lower_instructions.cpp | 112 +++++++++++++++++++++ .../dri/i965/brw_fs_channel_expressions.cpp | 1 + src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 6 ++ src/mesa/drivers/dri/i965/brw_shader.cpp | 3 +- src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp | 3 + src/mesa/main/macros.h | 3 + src/mesa/program/ir_to_mesa.cpp | 5 + src/mesa/state_tracker/st_glsl_to_tgsi.cpp | 2 + 14 files changed, 174 insertions(+), 3 deletions(-) diff --git a/src/glsl/builtin_functions.cpp b/src/glsl/builtin_functions.cpp index 3991f9d..12bbfe0 100644 --- a/src/glsl/builtin_functions.cpp +++ b/src/glsl/builtin_functions.cpp @@ -4260,7 +4260,7 @@ builtin_builder::_mid3(const glsl_type *type) ir_variable *z = in_var(type, "z"); MAKE_SIG(type, shader_trinary_minmax, 3, x, y, z); - ir_expression *mid3 = max2(min2(x, y), max2(min2(x, z), min2(y, z))); + 
ir_expression *mid3 = expr(ir_triop_mid3, x, y, z); body.emit(ret(mid3)); return sig; diff --git a/src/glsl/ir.cpp b/src/glsl/ir.cpp index 1a18b47..bc585d6 100644 --- a/src/glsl/ir.cpp +++ b/src/glsl/ir.cpp @@ -436,6 +436,7 @@ ir_expression::ir_expression(int op, ir_rvalue *op0, ir_rvalue *op1, case ir_triop_lrp: case ir_triop_bitfield_extract: case ir_triop_vector_insert: + case ir_triop_mid3: this->type = op0->type; break; @@ -566,6 +567,7 @@ static const char *const operator_strs[] = { "bfi", "bitfield_extract", "vector_insert", + "mid3", "bitfield_insert", "vector", }; diff --git a/src/glsl/ir.h b/src/glsl/ir.h index 6c7c60a..399a4ce 100644 --- a/src/glsl/ir.h +++ b/src/glsl/ir.h @@ -1390,9 +1390,16 @@ enum ir_expression_operation { ir_triop_vector_insert, /** + * \name Yield the per-component median of three values, part of AMD_shader_trinary_minmax. + */ + /*@{*/ + ir_triop_mid3, + /*@}*/ + + /** * A sentinel marking the last of the ternary operations. */ - ir_last_triop = ir_triop_vector_insert, + ir_last_triop = ir_triop_mid3, ir_quadop_bitfield_insert, diff --git a/src/glsl/ir_constant_expression.cpp b/src/glsl/ir_constant_expression.cpp index 8afe8f7..1d4c0e5 100644 --- a/src/glsl/ir_constant_expression.cpp +++ b/src/glsl/ir_constant_expression.cpp @@ -1575,6 +1575,28 @@ ir_expression::constant_expression_value(struct hash_table *variable_context) break; } + case ir_triop_mid3: { + assert(op[0]->type == op[1]->type); + assert(op[0]->type == op[2]->type); + + for (unsigned c = 0; c < components; c++) { + switch (op[0]->type->base_type) { + case GLSL_TYPE_UINT: + data.u[c] = MID3(op[0]->value.u[c], op[1]->value.u[c], op[2]->value.u[c]); + break; + case GLSL_TYPE_INT: + data.i[c] = MID3(op[0]->value.i[c], op[1]->value.i[c], op[2]->value.i[c]); + break; + case GLSL_TYPE_FLOAT: + data.f[c] = MID3(op[0]->value.f[c], op[1]->value.f[c], op[2]->value.f[c]); + break; + default: + assert(0); + } + } + break; + } + case ir_quadop_bitfield_insert: { int offset = 
op[2]->value.i[0]; int bits = op[3]->value.i[0]; diff --git a/src/glsl/ir_optimization.h b/src/glsl/ir_optimization.h index 40bb613..bea5ba0 100644 --- a/src/glsl/ir_optimization.h +++ b/src/glsl/ir_optimization.h @@ -38,6 +38,7 @@ #define INT_DIV_TO_MUL_RCP 0x40 #define BITFIELD_INSERT_TO_BFM_BFI 0x80 #define LDEXP_TO_ARITH 0x100 +#define MID3_TO_MIN_MAX 0x200 /** * \see class lower_packing_builtins_visitor diff --git a/src/glsl/ir_validate.cpp b/src/glsl/ir_validate.cpp index 71defc8..67c711b 100644 --- a/src/glsl/ir_validate.cpp +++ b/src/glsl/ir_validate.cpp @@ -553,6 +553,12 @@ ir_validate::visit_leave(ir_expression *ir) assert(ir->type == ir->operands[0]->type); break; + case ir_triop_mid3: + assert(ir->operands[0]->type == ir->type); + assert(ir->operands[1]->type == ir->type); + assert(ir->operands[2]->type == ir->type); + break; + case ir_quadop_bitfield_insert: assert(ir->operands[0]->type == ir->type); assert(ir->operands[1]->type == ir->type); diff --git a/src/glsl/lower_instructions.cpp b/src/glsl/lower_instructions.cpp index 49316d0..f42e217 100644 --- a/src/glsl/lower_instructions.cpp +++ b/src/glsl/lower_instructions.cpp @@ -39,6 +39,7 @@ * - MOD_TO_FRACT * - LDEXP_TO_ARITH * - BITFIELD_INSERT_TO_BFM_BFI + * - MID3_TO_MIN_MAX * * SUB_TO_ADD_NEG: * --------------- @@ -94,6 +95,11 @@ * Many GPUs implement the bitfieldInsert() built-in from ARB_gpu_shader_5 * with a pair of instructions. * + * MID3_TO_MIN_MAX: + * ---------------- + * Many GPUs don't have a native mid3 instruction. For these GPUs, convert + * ir_triop_mid3(x, y, z) to max(min(x, y), max(min(x, z), min(y, z))). 
+ * */ #include "main/core.h" /* for M_LOG2E */ @@ -127,6 +133,7 @@ private: void log_to_log2(ir_expression *); void bitfield_insert_to_bfm_bfi(ir_expression *); void ldexp_to_arith(ir_expression *); + void mid3_to_min_max(ir_expression *); }; } /* anonymous namespace */ @@ -436,6 +443,106 @@ lower_instructions_visitor::ldexp_to_arith(ir_expression *ir) this->progress = true; } +void +lower_instructions_visitor::mid3_to_min_max(ir_expression *ir) +{ + /* Translates + * mid3 x y z + * into + * max(min(x, y), max(min(x, z), min(y, z))) + * + * If two of the operands are constants, instead translate to + * clamp(x, y, z) + * or rather, + * min(max(x, y), z) + * + * where y and z contain the lower and higher vector components of the + * constants, respectively. + * + * If all three operands are constants, the former translation is done, and + * constant folding optimization will handle it. + */ + + assert(ir->operation == ir_triop_mid3); + assert(ir->get_num_operands() == 3); + + ir_rvalue *nonconst = NULL; + ir_constant *constants[3] = { 0 }; + unsigned num_constants = 0; + + for (unsigned i = 0; i < 3; ++i) { + if (ir_constant *c = ir->operands[i]->constant_expression_value()) { + constants[num_constants++] = c; + } else { + nonconst = ir->operands[i]; + } + } + + if (num_constants == 2) { + ir_constant_data data[2]; + + memset(&data, 0, sizeof(data)); + + assert(nonconst != NULL); + assert(constants[0]->type == constants[1]->type); + assert(constants[0]->type == ir->type); + + for (unsigned i = 0; i < constants[0]->type->components(); ++i) { + switch (constants[0]->type->base_type) { + case GLSL_TYPE_UINT: + data[0].u[i] = MIN2(constants[0]->value.u[i], constants[1]->value.u[i]); + data[1].u[i] = MAX2(constants[0]->value.u[i], constants[1]->value.u[i]); + break; + case GLSL_TYPE_INT: + data[0].i[i] = MIN2(constants[0]->value.i[i], constants[1]->value.i[i]); + data[1].i[i] = MAX2(constants[0]->value.i[i], constants[1]->value.i[i]); + break; + case GLSL_TYPE_FLOAT: + 
data[0].f[i] = MIN2(constants[0]->value.f[i], constants[1]->value.f[i]); + data[1].f[i] = MAX2(constants[0]->value.f[i], constants[1]->value.f[i]); + break; + default: + /* unreachable */ + assert(0); + } + } + + /* c1 is the lower valued constant, c2 is the higher */ + ir_constant *c1 = new(ir) ir_constant(ir->type, &data[0]); + ir_constant *c2 = new(ir) ir_constant(ir->type, &data[1]); + + ir_expression *exprmax = new(ir) ir_expression(ir_binop_max, nonconst, c1); + ir->operation = ir_binop_min; + ir->operands[0] = exprmax; + ir->operands[1] = c2; + + this->progress = true; + return; + } + + ir_rvalue *x = ir->operands[0]; + ir_rvalue *y = ir->operands[1]; + ir_rvalue *z = ir->operands[2]; + + ir_rvalue *x2 = x->clone(ir, NULL); + ir_rvalue *y2 = y->clone(ir, NULL); + ir_rvalue *z2 = z->clone(ir, NULL); + + ir_expression *firstmin = new(ir) ir_expression(ir_binop_min, x, y); + ir_expression *secondmin = new(ir) ir_expression(ir_binop_min, x2, z); + ir_expression *thirdmin = new(ir) ir_expression(ir_binop_min, y2, z2); + + ir_expression *secondmax = new(ir) ir_expression(ir_binop_max, secondmin, thirdmin); + + ir->operation = ir_binop_max; + ir->operands[0] = firstmin; + ir->operands[1] = secondmax; + + this->progress = true; + return; +} + + ir_visitor_status lower_instructions_visitor::visit_leave(ir_expression *ir) { @@ -482,6 +589,11 @@ lower_instructions_visitor::visit_leave(ir_expression *ir) ldexp_to_arith(ir); break; + case ir_triop_mid3: + if (lowering(MID3_TO_MIN_MAX)) + mid3_to_min_max(ir); + break; + default: return visit_continue; } diff --git a/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp b/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp index ae5bc56..db33b68 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp @@ -410,6 +410,7 @@ ir_channel_expressions_visitor::visit_leave(ir_assignment *ir) case ir_binop_ldexp: case ir_binop_vector_extract: case 
ir_triop_vector_insert: + case ir_triop_mid3: case ir_quadop_bitfield_insert: case ir_quadop_vector: assert(!"should have been lowered"); diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp index 2aa3acd..7488c8e 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp @@ -808,7 +808,13 @@ fs_visitor::visit(ir_expression *ir) inst = emit(BRW_OPCODE_SEL, this->result, op[1], op[2]); inst->predicate = BRW_PREDICATE_NORMAL; break; + + case ir_triop_mid3: + assert(!"not reached: should be handled by " + "lower_instructions::mid3_to_min_max"); + break; } + } void diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp index 6e74803..604fdb7 100644 --- a/src/mesa/drivers/dri/i965/brw_shader.cpp +++ b/src/mesa/drivers/dri/i965/brw_shader.cpp @@ -154,7 +154,8 @@ brw_link_shader(struct gl_context *ctx, struct gl_shader_program *shProg) EXP_TO_EXP2 | LOG_TO_LOG2 | bitfield_insert | - LDEXP_TO_ARITH); + LDEXP_TO_ARITH | + MID3_TO_MIN_MAX); /* Pre-gen6 HW can only nest if-statements 16 deep. Beyond this, * if-statements need to be flattened. diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp index 8fa0aee..0aa8975 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp @@ -1721,6 +1721,9 @@ vec4_visitor::visit(ir_expression *ir) case ir_binop_ldexp: assert(!"not reached: should be handled by ldexp_to_arith()"); break; + case ir_triop_mid3: + assert(!"not reached: should be handled by mid3_to_min_max()"); + break; } } diff --git a/src/mesa/main/macros.h b/src/mesa/main/macros.h index 5228c3a..12ad287 100644 --- a/src/mesa/main/macros.h +++ b/src/mesa/main/macros.h @@ -678,6 +678,9 @@ INTERP_4F(GLfloat t, GLfloat dst[4], const GLfloat out[4], const GLfloat in[4]) #define MIN3( A, B, C ) ((A) < (B) ? 
MIN2(A, C) : MIN2(B, C)) #define MAX3( A, B, C ) ((A) > (B) ? MAX2(A, C) : MAX2(B, C)) +/** Median of three values: */ +#define MID3( A, B, C ) ((A) < (B) ? CLAMP(C, A, B) : CLAMP(C, B, A)) + static inline unsigned minify(unsigned value, unsigned levels) { diff --git a/src/mesa/program/ir_to_mesa.cpp b/src/mesa/program/ir_to_mesa.cpp index 59cf123..88e2073 100644 --- a/src/mesa/program/ir_to_mesa.cpp +++ b/src/mesa/program/ir_to_mesa.cpp @@ -1450,6 +1450,10 @@ ir_to_mesa_visitor::visit(ir_expression *ir) emit(ir, OPCODE_LRP, result_dst, op[2], op[1], op[0]); break; + case ir_triop_mid3: + assert(!"not reached: should be handled by mid3_to_min_max"); + break; + case ir_binop_vector_extract: case ir_binop_bfm: case ir_triop_fma: @@ -3002,6 +3006,7 @@ _mesa_ir_link_shader(struct gl_context *ctx, struct gl_shader_program *prog) do_mat_op_to_vec(ir); lower_instructions(ir, (MOD_TO_FRACT | DIV_TO_MUL_RCP | EXP_TO_EXP2 | LOG_TO_LOG2 | INT_DIV_TO_MUL_RCP + | MID3_TO_MIN_MAX | ((options->EmitNoPow) ? POW_TO_EXP2 : 0))); progress = do_lower_jumps(ir, true, true, options->EmitNoMainReturn, options->EmitNoCont, options->EmitNoLoops) || progress; diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp index d1c3856..4a61b86 100644 --- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp +++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp @@ -2001,6 +2001,7 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir) case ir_binop_ldexp: case ir_binop_carry: case ir_binop_borrow: + case ir_triop_mid3: /* This operation is not supported, or should have already been handled. */ assert(!"Invalid ir opcode in glsl_to_tgsi_visitor::visit()"); @@ -5396,6 +5397,7 @@ st_link_shader(struct gl_context *ctx, struct gl_shader_program *prog) EXP_TO_EXP2 | LOG_TO_LOG2 | LDEXP_TO_ARITH | + MID3_TO_MIN_MAX | (options->EmitNoPow ? POW_TO_EXP2 : 0) | (!ctx->Const.NativeIntegers ? 
INT_DIV_TO_MUL_RCP : 0)); -- 1.9.0 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev