Jason Ekstrand <ja...@jlekstrand.net> writes: > On g4x through Sandy Bridge, src1 (the coordinates) of the PLN > instruction is required to be an even register number. When it's odd > (which can happen with SIMD32), we have to emit a LINE+MAC combination > instead. Unfortunately, we can't just fall through to the gen4 case > because the input registers are still set up for PLN which lays out the > four src1 registers differently in SIMD16 than LINE. > > v2 (Jason Ekstrand): > - Take advantage of both accumulators and emit LINE LINE MAC MAC > - Unify the gen4 and gen4x-6 cases using a loop
I don't think the commit message is giving fair credit to the origin of these ideas. v2 of this patch you sent shortly after I pointed you at the corresponding change in my branch is almost identical algorithmically to the original, even though I could believe that you wrote v1 independently. > --- > src/intel/compiler/brw_fs_generator.cpp | 55 > +++++++++++++++++++++++++-------- > src/intel/compiler/brw_shader.cpp | 3 +- > 2 files changed, 44 insertions(+), 14 deletions(-) > > diff --git a/src/intel/compiler/brw_fs_generator.cpp > b/src/intel/compiler/brw_fs_generator.cpp > index 548a208..7943914 100644 > --- a/src/intel/compiler/brw_fs_generator.cpp > +++ b/src/intel/compiler/brw_fs_generator.cpp > @@ -760,28 +760,57 @@ fs_generator::generate_linterp(fs_inst *inst, > brw_pop_insn_state(p); > > return true; > - } else if (devinfo->has_pln) { > + } else if (devinfo->has_pln && > + (devinfo->gen >= 7 || (delta_x.nr & 1) == 0)) { > + brw_PLN(p, dst, interp, delta_x); > + > + return false; > + } else { > /* From the Sandy Bridge PRM Vol. 4, Pt. 2, Section 8.3.53, "Plane": > * > * "[DevSNB]:<src1> must be even register aligned. > * > * This restriction is lifted on Ivy Bridge. > + * > + * This means that we need to split PLN into LINE+MAC on-the-fly. > + * Unfortunately, the inputs are laid out for PLN and not LIN+MAC so > + * we have to split into SIMD8 pieces. For gen4 (!has_pln), the > + * coordinate registers are laid out differently so we leave it as a > + * SIMD16 instruction. > */ > - assert(devinfo->gen >= 7 || (delta_x.nr & 1) == 0); > - brw_PLN(p, dst, interp, delta_x); > - > - return false; > - } else { > - i[0] = brw_LINE(p, brw_null_reg(), interp, delta_x); > - i[1] = brw_MAC(p, dst, suboffset(interp, 1), delta_y); > + assert(inst->exec_size == 8 || inst->exec_size == 16); > + assert(inst->group % 16 == 0); > > - brw_inst_set_cond_modifier(p->devinfo, i[1], inst->conditional_mod); > + const unsigned lower_size = devinfo->has_pln ? 8 : inst->exec_size; > + brw_set_default_exec_size(p, cvt(lower_size) - 1); > > - /* brw_set_default_saturate() is called before emitting instructions, > so > - * the saturate bit is set in each instruction, so we need to unset it > on > - * the first instruction. > + /* Thanks to two accumulators, we can emit all the LINEs and then all > + * the MACs. This improves parallelism a bit. > */ > - brw_inst_set_saturate(p->devinfo, i[0], false); > + for (unsigned g = 0; g < lower_size / 8; g++) { > + brw_inst *line = brw_LINE(p, brw_null_reg(), interp, > + offset(delta_x, g * 2)); > + brw_inst_set_group(devinfo, line, inst->group + g * lower_size); > + > + /* LINE writes the accumulator automatically on gen4-5. On Sandy > + * Bridge and later, we have to explicitly enable it. > + */ > + if (devinfo->gen >= 6) > + brw_inst_set_acc_wr_control(p->devinfo, line, true); > + > + /* brw_set_default_saturate() is called before emitting > + * instructions, so the saturate bit is set in each instruction, > + * so we need to unset it on the LINE instructions. > + */ > + brw_inst_set_saturate(p->devinfo, line, false); > + } > + > + for (unsigned g = 0; g < lower_size / 8; g++) { > + brw_inst *mac = brw_MAC(p, offset(dst, g), suboffset(interp, 1), > + offset(delta_x, g * 2 + 1)); > + brw_inst_set_group(devinfo, mac, inst->group + g * lower_size); > + brw_inst_set_cond_modifier(p->devinfo, mac, inst->conditional_mod); > + } > > return true; > } > diff --git a/src/intel/compiler/brw_shader.cpp > b/src/intel/compiler/brw_shader.cpp > index dfd2c5c..8610c30 100644 > --- a/src/intel/compiler/brw_shader.cpp > +++ b/src/intel/compiler/brw_shader.cpp > @@ -985,7 +985,8 @@ backend_instruction::writes_accumulator_implicitly(const > struct gen_device_info > (devinfo->gen < 6 && > ((opcode >= BRW_OPCODE_ADD && opcode < BRW_OPCODE_NOP) || > (opcode >= FS_OPCODE_DDX_COARSE && opcode <= > FS_OPCODE_LINTERP))) || > - (opcode == FS_OPCODE_LINTERP && !devinfo->has_pln); > + (opcode == FS_OPCODE_LINTERP && > + (!devinfo->has_pln || devinfo->gen <= 6)); > } > > bool > -- > 2.5.0.400.gff86faf > > _______________________________________________ > mesa-dev mailing list > mesa-dev@lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/mesa-dev
signature.asc
Description: PGP signature
_______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev