On Tue, Apr 14, 2015 at 4:15 PM, Matt Turner <matts...@gmail.com> wrote: > The X and Y values come interleaved in g1 (.4-.11 inclusive), so we can > calculate them together with a single add(32) instruction on some > platforms like Broadwell and newer or in SIMD8 elsewhere. > > Note that I also moved the PIXEL_X/PIXEL_Y virtual opcodes from before > LINTERP to after it. That's because the writes_accumulator_implicitly() > function in backend_instruction tests for <= LINTERP for determining > whether the instruction indeed writes the accumulator implicitly. The > old FS_OPCODE_PIXEL_X/Y emitted ADD instructions, which did, but the new > opcodes just emit MOVs, which don't. It doesn't matter, since we don't > use these opcodes on Gen4/5 anymore, but in the case that we do... > > On Broadwell: > total instructions in shared programs: 7192355 -> 7186224 (-0.09%) > instructions in affected programs: 1190700 -> 1184569 (-0.51%) > helped: 6131 > > On Haswell: > total instructions in shared programs: 6155979 -> 6152800 (-0.05%) > instructions in affected programs: 652362 -> 649183 (-0.49%) > helped: 3179 > --- > src/mesa/drivers/dri/i965/brw_defines.h | 2 + > src/mesa/drivers/dri/i965/brw_fs_generator.cpp | 10 ++++ > src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 71 > ++++++++++++++++++-------- > 3 files changed, 63 insertions(+), 20 deletions(-) > > diff --git a/src/mesa/drivers/dri/i965/brw_defines.h > b/src/mesa/drivers/dri/i965/brw_defines.h > index 1afa34e..f99da12 100644 > --- a/src/mesa/drivers/dri/i965/brw_defines.h > +++ b/src/mesa/drivers/dri/i965/brw_defines.h > @@ -925,6 +925,8 @@ enum opcode { > FS_OPCODE_DDY_FINE, > FS_OPCODE_CINTERP, > FS_OPCODE_LINTERP, > + FS_OPCODE_PIXEL_X, > + FS_OPCODE_PIXEL_Y, > FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, > FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7, > FS_OPCODE_VARYING_PULL_CONSTANT_LOAD, > diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp > b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp > index dba3286..7bc97a6 100644 > --- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp > +++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp > @@ -1909,6 +1909,16 @@ fs_generator::generate_code(const cfg_t *cfg, int > dispatch_width) > case FS_OPCODE_LINTERP: > generate_linterp(inst, dst, src); > break; > + case FS_OPCODE_PIXEL_X: > + assert(src[0].type == BRW_REGISTER_TYPE_UW); > + src[0].subnr = 0 * type_sz(src[0].type); > + brw_MOV(p, dst, stride(src[0], 8, 4, 1));
Why do we need the special opcode? Can we not do that stride with what we already have in the FS compiler? > + break; > + case FS_OPCODE_PIXEL_Y: > + assert(src[0].type == BRW_REGISTER_TYPE_UW); > + src[0].subnr = 4 * type_sz(src[0].type); > + brw_MOV(p, dst, stride(src[0], 8, 4, 1)); > + break; > case SHADER_OPCODE_TEX: > case FS_OPCODE_TXB: > case SHADER_OPCODE_TXD: > diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp > b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp > index 0c2511a..1479333 100644 > --- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp > +++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp > @@ -3469,27 +3469,58 @@ fs_visitor::emit_interpolation_setup_gen6() > { > struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW); > > - /* If the pixel centers end up used, the setup is the same as for gen4. */ > this->current_annotation = "compute pixel centers"; > - fs_reg int_pixel_x = vgrf(glsl_type::uint_type); > - fs_reg int_pixel_y = vgrf(glsl_type::uint_type); > - int_pixel_x.type = BRW_REGISTER_TYPE_UW; > - int_pixel_y.type = BRW_REGISTER_TYPE_UW; > - emit(ADD(int_pixel_x, > - fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)), > - fs_reg(brw_imm_v(0x10101010)))); > - emit(ADD(int_pixel_y, > - fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)), > - fs_reg(brw_imm_v(0x11001100)))); > - > - /* As of gen6, we can no longer mix float and int sources. We have > - * to turn the integer pixel centers into floats for their actual > - * use. > - */ > - this->pixel_x = vgrf(glsl_type::float_type); > - this->pixel_y = vgrf(glsl_type::float_type); > - emit(MOV(this->pixel_x, int_pixel_x)); > - emit(MOV(this->pixel_y, int_pixel_y)); > + if (brw->gen >= 8 || dispatch_width == 8) { > + /* The "Register Region Restrictions" page says for BDW (and newer, > + * presumably): > + * > + * "When destination spans two registers, the source may be one or > + * two registers. The destination elements must be evenly split > + * between the two registers." > + * > + * Thus we can do a single add(16) in SIMD8 or an add(32) in SIMD16 to > + * compute our pixel centers. > + */ > + fs_reg int_pixel_xy(GRF, alloc.allocate(dispatch_width / 8), > + BRW_REGISTER_TYPE_UW, dispatch_width * 2); > + emit(ADD(int_pixel_xy, > + fs_reg(stride(suboffset(g1_uw, 4), 1, 4, 0)), > + fs_reg(brw_imm_v(0x11001010)))) > + ->force_writemask_all = true; > + > + this->pixel_x = vgrf(glsl_type::float_type); > + this->pixel_y = vgrf(glsl_type::float_type); > + emit(FS_OPCODE_PIXEL_X, this->pixel_x, int_pixel_xy); > + emit(FS_OPCODE_PIXEL_Y, this->pixel_y, int_pixel_xy); > + } else { > + /* The "Register Region Restrictions" page says for SNB, IVB, HSW: > + * > + * "When destination spans two registers, the source MUST span two > + * registers." > + * > + * Since the GRF source of the ADD will only read a single register, we > + * must do two separate ADDs in SIMD16. > + */ > + fs_reg int_pixel_x = vgrf(glsl_type::uint_type); > + fs_reg int_pixel_y = vgrf(glsl_type::uint_type); > + int_pixel_x.type = BRW_REGISTER_TYPE_UW; > + int_pixel_y.type = BRW_REGISTER_TYPE_UW; > + emit(ADD(int_pixel_x, > + fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)), > + fs_reg(brw_imm_v(0x10101010)))); > + emit(ADD(int_pixel_y, > + fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)), > + fs_reg(brw_imm_v(0x11001100)))); > + > + /* As of gen6, we can no longer mix float and int sources. We have > + * to turn the integer pixel centers into floats for their actual > + * use. > + */ > + this->pixel_x = vgrf(glsl_type::float_type); > + this->pixel_y = vgrf(glsl_type::float_type); > + emit(MOV(this->pixel_x, int_pixel_x)); > + emit(MOV(this->pixel_y, int_pixel_y)); > + } > > this->current_annotation = "compute pos.w"; > this->pixel_w = fs_reg(brw_vec8_grf(payload.source_w_reg, 0)); > -- > 2.0.5 > > _______________________________________________ > mesa-dev mailing list > mesa-dev@lists.freedesktop.org > http://lists.freedesktop.org/mailman/listinfo/mesa-dev _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev