Samuel Iglesias Gonsálvez <sigles...@igalia.com> writes: > From: "Juan A. Suarez Romero" <jasua...@igalia.com> > > When splitting a CMP/MOV instruction with NULL dest, DF sources, and > conditional modifier; we can't use directly the flag registers, as they will > have the wrong results in IVB/BYT after the scalarization. > > Rather, we need to store the result in a temporary register, and then use > that register to set proper the flags values. > > If a MOV has a null destination register and a conditional modifier, it > can be replaced with a CMP against zero with the same conditional > modifier. By doing this replacement, we can do the SIMD lowering > without any problem. > > v2: > - Fix typo (Matt) > > Signed-off-by: Samuel Iglesias Gonsálvez <sigles...@igalia.com> > Signed-off-by: Juan A. Suarez Romero <jasua...@igalia.com > --- > src/mesa/drivers/dri/i965/brw_vec4.cpp | 80 > +++++++++++++++++++++++++++++++--- > 1 file changed, 74 insertions(+), 6 deletions(-) > > diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp > b/src/mesa/drivers/dri/i965/brw_vec4.cpp > index adcde085305..819674e8cb9 100644 > --- a/src/mesa/drivers/dri/i965/brw_vec4.cpp > +++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp > @@ -2177,6 +2177,46 @@ vec4_visitor::lower_simd_width() > * value of the instruction's dst. > */ > bool needs_temp = dst_src_regions_overlap(inst); > + > + /* When splitting instructions with conditional modifiers and NULL > + * dest we can't rely directly on the flags to store the result. > Rather, > + * we need first to enqueue the result in a temporary register, and > then > + * move those values into flags. > + */ > + bool inst_df_dst_null = > + inst->dst.is_null() && get_exec_type_size(inst) == 8 && > + inst->conditional_mod != BRW_CONDITIONAL_NONE; > + > + if (inst_df_dst_null) { > + /* If there are other DF instructions with NULL destination, > + * we need to verify if we can use the temporary register or > + * if we need an extra lowering step. > + */ > + assert(inst->opcode == BRW_OPCODE_MOV || > + inst->opcode == BRW_OPCODE_CMP); > + > + /* Replace MOV.XX with null destination with the equivalent CMP.XX > + * with null destination, so we can lower it as explained before. > + */ > + if (inst->opcode == BRW_OPCODE_MOV) { > + vec4_instruction *cmp = > + new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst_null_df(), > + inst->src[0], > + setup_imm_df(0.0, block, inst)); > + cmp->conditional_mod = inst->conditional_mod; > + cmp->exec_size = inst->exec_size; > + cmp->group = inst->group; > + cmp->size_written = inst->size_written; > + inst->insert_before(block, cmp); > + inst->remove(block); > + inst = cmp; > + } > + } > + dst_reg inst_dst; > + if (inst_df_dst_null) > + inst_dst = > + retype(dst_reg(VGRF, alloc.allocate(1)), BRW_REGISTER_TYPE_F); > + > for (unsigned n = 0; n < inst->exec_size / lowered_width; n++) { > unsigned channel_offset = lowered_width * n; > > @@ -2199,7 +2239,7 @@ vec4_visitor::lower_simd_width() > bool d2f_pass = (inst->opcode == VEC4_OPCODE_FROM_DOUBLE && n > 0); > /* Compute split dst region */ > dst_reg dst; > - if (needs_temp || d2f_pass) { > + if (needs_temp || d2f_pass || inst_df_dst_null) { > unsigned num_regs = DIV_ROUND_UP(size_written, REG_SIZE); > dst = retype(dst_reg(VGRF, alloc.allocate(num_regs)), > inst->dst.type); > @@ -2229,24 +2269,52 @@ vec4_visitor::lower_simd_width() > > inst->insert_before(block, linst); > > + dst_reg d2f_dst; > + if (inst_df_dst_null) { > + unsigned num_regs = DIV_ROUND_UP(lowered_width, > type_sz(BRW_REGISTER_TYPE_F)); > + d2f_dst = retype(dst_reg(VGRF, alloc.allocate(num_regs)), > BRW_REGISTER_TYPE_F); > + vec4_instruction *d2f = new(mem_ctx) > vec4_instruction(VEC4_OPCODE_FROM_DOUBLE, d2f_dst, src_reg(dst)); > + d2f->group = channel_offset; > + d2f->exec_size = lowered_width; > + d2f->size_written = lowered_width * type_sz(d2f_dst.type); > + d2f->predicate = inst->predicate; > + inst->insert_before(block, d2f); > + } > + > /* If we used a temporary to store the result of the split > * instruction, copy the result to the original destination > */ > - if (needs_temp || d2f_pass) { > + if (needs_temp || d2f_pass || inst_df_dst_null) { > vec4_instruction *mov; > - if (d2f_pass) > + if (d2f_pass) { > mov = MOV(horiz_offset(inst->dst, n * > type_sz(inst->dst.type)), src_reg(dst)); > - else > + mov->size_written = size_written; > + } else if (inst_df_dst_null) { > + mov = MOV(horiz_offset(inst_dst, n * 4), src_reg(d2f_dst)); > + mov->size_written = lowered_width * type_sz(inst_dst.type); > + } else { > mov = MOV(offset(inst->dst, lowered_width, n), src_reg(dst)); > + mov->size_written = size_written; > + } > mov->exec_size = lowered_width; > mov->group = channel_offset; > - mov->size_written = size_written; > mov->predicate = inst->predicate; > inst->insert_before(block, mov); > } > } > > - inst->remove(block); > + /* For CMP.XX instruction, we need to set the flags correctly. We do > + * this by comparing the register that has the results against 0.0, > + * getting the values in the flags. > + */ > + if (inst_df_dst_null) { > + inst->dst.type = BRW_REGISTER_TYPE_F; > + inst->src[0] = src_reg(inst_dst); > + inst->src[1] = brw_imm_f(0.0f); > + inst->conditional_mod = BRW_CONDITIONAL_NZ; > + } else { > + inst->remove(block); > + } > progress = true; > } >
I have no idea why this is useful (Are the nib/quarter control bits being set correctly for the DF instructions with null register? What is exactly preventing you from doing SIMD lowering on those instructions? What are the symptoms?), why the code assumes that the null register destination will have an influence on the conditional mod result written to the flag register, or why the lowering pass is the place to do this. > -- > 2.11.0 > > _______________________________________________ > mesa-dev mailing list > mesa-dev@lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/mesa-dev
signature.asc
Description: PGP signature
_______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev