On Tue, Nov 27, 2018 at 12:55 PM Ian Romanick <i...@freedesktop.org> wrote:
> On 11/22/2018 10:46 AM, Jason Ekstrand wrote:
> > The vec4 path has only been compile-tested as there's no easy way to
> > generate a vec4 shader with an unordered equality.
> > ---
> >  src/intel/compiler/brw_compiler.c   |  3 +++
> >  src/intel/compiler/brw_fs_nir.cpp   | 20 +++++++++++++-------
> >  src/intel/compiler/brw_vec4_nir.cpp | 22 ++++++++++++++++++++++
> >  3 files changed, 38 insertions(+), 7 deletions(-)
> >
> > diff --git a/src/intel/compiler/brw_compiler.c b/src/intel/compiler/brw_compiler.c
> > index fe632c5badc..f9e8fa09a34 100644
> > --- a/src/intel/compiler/brw_compiler.c
> > +++ b/src/intel/compiler/brw_compiler.c
> > @@ -42,6 +42,9 @@
> >     .lower_fdiv = true,                                                    \
> >     .lower_flrp64 = true,                                                  \
> >     .lower_ldexp = true,                                                   \
> > +   .lower_fltu = true,                                                    \
> > +   .lower_fgeu = true,                                                    \
>
> Can't we use cmpn.l and cmpn.ge for these?  On some platforms it has to
> be split into two SIMD8 instructions, but that seems better than the
> lowering... or maybe just use those on BDW+?

I didn't know about CMPN!  I guess that would do the trick, though I
suppose we'd have a pile of back-end work to do before we could optimize
it.  I think the back-end usually gets rid of the inot by flipping the
cmod, but maybe not.

> > +   .lower_fne_to_fequ = true,                                             \
> >     .lower_cs_local_id_from_index = true,                                  \
> >     .lower_device_index_to_zero = true,                                    \
> >     .native_integers = true,                                               \
> >
> > diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
> > index a62d521bb5d..eba3611e447 100644
> > --- a/src/intel/compiler/brw_fs_nir.cpp
> > +++ b/src/intel/compiler/brw_fs_nir.cpp
> > @@ -1049,6 +1049,8 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
> >     case nir_op_flt:
> >     case nir_op_fge:
> >     case nir_op_feq:
> > +   case nir_op_fne:
> > +   case nir_op_fequ:
> >     case nir_op_fneu: {
> >        fs_reg dest = result;
> >
> > @@ -1056,26 +1058,30 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
> >        if (bit_size != 32)
> >           dest = bld.vgrf(op[0].type, 1);
> >
> > -      brw_conditional_mod cond;
> >        switch (instr->op) {
> >        case nir_op_flt:
> > -         cond = BRW_CONDITIONAL_L;
> > +         bld.CMP(dest, op[0], op[1], BRW_CONDITIONAL_L);
> >           break;
> >        case nir_op_fge:
> > -         cond = BRW_CONDITIONAL_GE;
> > +         bld.CMP(dest, op[0], op[1], BRW_CONDITIONAL_GE);
> >           break;
> >        case nir_op_feq:
> > -         cond = BRW_CONDITIONAL_Z;
> > +         bld.CMP(dest, op[0], op[1], BRW_CONDITIONAL_Z);
> > +         break;
> > +      case nir_op_fequ:
> > +         bld.CMP(dest, op[0], op[0], BRW_CONDITIONAL_NZ);
> > +         set_predicate_inv(BRW_PREDICATE_NORMAL, true, /* inverse */
> > +                           bld.CMP(dest, op[1], op[1], BRW_CONDITIONAL_NZ));
> > +         set_predicate_inv(BRW_PREDICATE_NORMAL, true, /* inverse */
> > +                           bld.CMP(dest, op[0], op[1], BRW_CONDITIONAL_Z));
>
> OpFUnordEqual is (isnan(a) || isnan(b) || a == b), and that's literally
> what this code does.  It seems like this could be implemented (perhaps
> as a lowering?) as (OpFUnordGreaterThanEqual(a, b) &&
> OpFUnordGreaterThanEqual(b, a)) via cmpn.ge.  Using this same technique,
> that would be 2 instructions (on BDW+) instead of 3.  HSW has to do cmpn
> as two SIMD8 instructions, so this may be better there.  Dunno.

That's a good idea.  Unfortunately, CMPN doesn't modify Z or NZ.  We
could also lower fequ(x, y) to inot(fne(x, y)) and implement fne(x, y)
as (x < y) || (x > y), which wouldn't require CMPN.
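Something like this, maybe (completely untested sketch, just to show the
shape; it assumes the usual generated nir_builder opcode wrappers like
nir_flt/nir_ior/nir_inot, and lower_fequ is a made-up helper name):

#include "nir_builder.h"

/* Sketch of the lowering: fequ(x, y) -> inot(fne(x, y)), with
 * fne(x, y) implemented as (x < y) || (x > y).  An ordered LT is
 * false whenever either operand is NaN, so the complement of the OR
 * is exactly isnan(x) || isnan(y) || x == y, and no CMPN is needed.
 */
static nir_ssa_def *
lower_fequ(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   /* Ordered not-equal: both operands are numbers and they differ. */
   nir_ssa_def *fne = nir_ior(b, nir_flt(b, x, y), nir_flt(b, y, x));

   /* Unordered equal is the complement. */
   return nir_inot(b, fne);
}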
--Jason

> >           break;
> >        case nir_op_fneu:
> > -         cond = BRW_CONDITIONAL_NZ;
> > +         bld.CMP(dest, op[0], op[1], BRW_CONDITIONAL_NZ);
> >           break;
> >        default:
> >           unreachable("bad opcode");
> >        }
> >
> > -      bld.CMP(dest, op[0], op[1], cond);
> > -
> >        if (bit_size > 32) {
> >           bld.MOV(result, subscript(dest, BRW_REGISTER_TYPE_UD, 0));
> >        } else if(bit_size < 32) {
> >
> > diff --git a/src/intel/compiler/brw_vec4_nir.cpp b/src/intel/compiler/brw_vec4_nir.cpp
> > index f7f46f5034c..32559e1aade 100644
> > --- a/src/intel/compiler/brw_vec4_nir.cpp
> > +++ b/src/intel/compiler/brw_vec4_nir.cpp
> > @@ -1366,6 +1366,28 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
> >        break;
> >     }
> >
> > +   case nir_op_fequ: {
> > +      dst_reg cmp_res = dst;
> > +      if (nir_src_bit_size(instr->src[0].src) == 64)
> > +         cmp_res = dst_reg(this, glsl_type::dvec4_type);
> > +
> > +      vec4_instruction *inst;
> > +      inst = emit(CMP(cmp_res, op[0], op[0], BRW_CONDITIONAL_NZ));
> > +      inst = emit(CMP(cmp_res, op[1], op[1], BRW_CONDITIONAL_NZ));
> > +      inst->predicate = BRW_PREDICATE_NORMAL;
> > +      inst->predicate_inverse = true;
> > +      inst = emit(CMP(cmp_res, op[0], op[1], BRW_CONDITIONAL_Z));
> > +      inst->predicate = BRW_PREDICATE_NORMAL;
> > +      inst->predicate_inverse = true;
> > +
> > +      if (nir_src_bit_size(instr->src[0].src) == 64) {
> > +         dst_reg cmp_res32 = dst_reg(this, glsl_type::bvec4_type);
> > +         emit(VEC4_OPCODE_PICK_LOW_32BIT, cmp_res32, src_reg(cmp_res));
> > +         emit(MOV(dst, src_reg(cmp_res32)));
> > +      }
> > +      break;
> > +   }
> > +
> >     case nir_op_ball_iequal2:
> >     case nir_op_ball_iequal3:
> >     case nir_op_ball_iequal4:
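P.S. For anyone reading along, here's a scalar model of the predicated
CMP chain both back-ends emit for fequ (plain C, untested, purely to
spell out the semantics; fequ here is just an illustrative name):

#include <math.h>
#include <stdbool.h>

/* Each inverse-predicated CMP only writes channels where the flag is
 * still clear, so the chain behaves like a logical OR of the three
 * tests. */
static bool
fequ(float a, float b)
{
   bool res = isnan(a);    /* CMP a, a with cmod NZ                */
   if (!res)               /* inverse-predicated                   */
      res = isnan(b);      /* CMP b, b with cmod NZ                */
   if (!res)               /* inverse-predicated                   */
      res = (a == b);      /* CMP a, b with cmod Z                 */
   return res;             /* isnan(a) || isnan(b) || a == b       */
}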