From: Ian Romanick <ian.d.roman...@intel.com> On platforms that do not natively generate 0u and ~0u for Boolean results, b2f expressions that look like
f = b2f(expr cmp 0) will generate better code by pretending the expression is f = ir_triop_csel(0.0, 1.0, expr cmp 0) This is because the last instruction of "expr" can generate the condition code for the "cmp 0". This avoids having to do the "-(b & 1)" trick to generate 0u or ~0u for the Boolean result. This means code like mov(16) g16<1>F 1F mul.ge.f0(16) null g6<8,8,1>F g14<8,8,1>F (+f0) sel(16) m6<1>F g16<8,8,1>F 0F will be generated instead of mul(16) g2<1>F g12<8,8,1>F g4<8,8,1>F cmp.ge.f0(16) g2<1>D g4<8,8,1>F 0F and(16) g4<1>D g2<8,8,1>D 1D and(16) m6<1>D -g4<8,8,1>D 0x3f800000UD When the comparison is either == 0.0 or != 0.0 it would seem that using the knowledge that the true (or false) case already results in zero would allow better code generation by possibly avoiding a load-immediate instruction. Some experimentation showed this to not be the case. Shader-db results: GM45 (0x2A42): total instructions in shared programs: 3542437 -> 3542267 (-0.00%) instructions in affected programs: 32947 -> 32777 (-0.52%) helped: 118 HURT: 0 GAINED: 0 LOST: 0 Iron Lake (0x0046): total instructions in shared programs: 4864785 -> 4864611 (-0.00%) instructions in affected programs: 33094 -> 32920 (-0.53%) helped: 122 HURT: 0 GAINED: 0 LOST: 0 No change on other platforms. 
Signed-off-by: Ian Romanick <ian.d.roman...@intel.com> --- src/mesa/drivers/dri/i965/brw_fs.h | 1 + src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 74 ++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+) diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h index 84e0b9e..cae55f4 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.h +++ b/src/mesa/drivers/dri/i965/brw_fs.h @@ -533,6 +533,7 @@ public: const fs_reg &a); void emit_minmax(enum brw_conditional_mod conditionalmod, const fs_reg &dst, const fs_reg &src0, const fs_reg &src1); + bool try_emit_b2f_of_comparison(ir_expression *ir); bool try_emit_saturate(ir_expression *ir); bool try_emit_line(ir_expression *ir); bool try_emit_mad(ir_expression *ir); diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp index f5d7383..2f74716 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp @@ -483,6 +483,75 @@ fs_visitor::try_emit_mad(ir_expression *ir) return true; } +bool +fs_visitor::try_emit_b2f_of_comparison(ir_expression *ir) +{ + /* On platforms that do not natively generate 0u and ~0u for Boolean + * results, b2f expressions that look like + * + * f = b2f(expr cmp 0) + * + * will generate better code by pretending the expression is + * + * f = ir_triop_csel(0.0, 1.0, expr cmp 0) + * + * This is because the last instruction of "expr" can generate the + * condition code for the "cmp 0". This avoids having to do the "-(b & 1)" + * trick to generate 0u or ~0u for the Boolean result. 
This means code like + * + * mov(16) g16<1>F 1F + * mul.ge.f0(16) null g6<8,8,1>F g14<8,8,1>F + * (+f0) sel(16) m6<1>F g16<8,8,1>F 0F + * + * will be generated instead of + * + * mul(16) g2<1>F g12<8,8,1>F g4<8,8,1>F + * cmp.ge.f0(16) g2<1>D g4<8,8,1>F 0F + * and(16) g4<1>D g2<8,8,1>D 1D + * and(16) m6<1>D -g4<8,8,1>D 0x3f800000UD + * + * When the comparison is either == 0.0 or != 0.0 it would seem that using + * the knowledge that the true (or false) case already results in zero + * would allow better code generation by possibly avoiding a load-immediate + * instruction. Some experimentation showed this to not be the case. + */ + ir_expression *const cmp = ir->operands[0]->as_expression(); + if (cmp == NULL || cmp->get_num_operands() != 2) + return false; + + unsigned i; + for (i = 0; i < 2; i++) { + ir_constant *c = cmp->operands[i]->as_constant(); + if (c == NULL) + continue; + + /* Both operands cannot be constants, and the constant has to be zero + * for the optimization to work. Therefore, if we got a constant and + * the constant is not zero, we fail. 
+ */ + if (!c->is_zero()) + return false; + + ir_expression *expr = cmp->operands[i ^ 1]->as_expression(); + if (expr != NULL) + break; + } + + if (i == 2) + return false; + + emit_bool_to_cond_code(cmp); + + fs_reg temp = vgrf(ir->type); + emit(MOV(temp, fs_reg(1.0f))); + + this->result = vgrf(ir->type); + fs_inst *inst = emit(SEL(this->result, temp, fs_reg(0.0f))); + inst->predicate = BRW_PREDICATE_NORMAL; + + return true; +} + static int pack_pixel_offset(float x) { @@ -647,6 +716,11 @@ fs_visitor::visit(ir_expression *ir) inst->predicate = BRW_PREDICATE_NORMAL; return; + case ir_unop_b2f: + if (brw->gen <= 5 && try_emit_b2f_of_comparison(ir)) + return; + break; + case ir_unop_interpolate_at_centroid: case ir_binop_interpolate_at_offset: case ir_binop_interpolate_at_sample: -- 2.1.0 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev