On 24 August 2012 03:06, Kenneth Graunke <kenn...@whitecape.org> wrote:
> When assigning uniform locations, the linker assigns each sampler > uniform a sequential numerical ID. gl_shader_program::SamplerUnits maps > these sampler variable IDs to the actual texture units they reference > (specified via glUniform1i). > > Previously, we encoded this mapping in the SEND instruction encoding: > the "sampler" was the texture unit number, and the binding table index > was SURF_INDEX_TEXTURE(the texture unit number). This unfortunately > meant that whenever the application changed the value of a sampler > uniform, we had to recompile the shader to change the SEND instructions. > > This was horrible for the game Cogs, which repeatedly switches between > using texture unit 0 and 1. It also made fragment shader precompiles > useless: we'd do the precompile at glLinkShader() time, before the > application called glUniform1i to set the sampler values. As soon as > it did that, we'd have to recompile, wasting time and space in the > program cache. > > This patch encodes the SamplerUnits indirection in the binding table, > sampler state, and sampler default color tables. Instead of baking the > texture unit number into the shader, we bake in the sampler variable ID > assigned by the linker. Since those never change, we don't need to > recompile programs on uniform changes. > > This does mean that the tables now depend on the linked shader program > being used for rendering, rather than simply representing all available > texture units. This could cause an increase in state emission. > > Another plus is that the sampler state and sampler default color tables > are now compact: we only emit as many entries as there are sampler > uniforms, with no holes in the table since the new sampler IDs are > sequential. Previously we had to emit a full 16 entries every time, > since the tables tracked the state of all active texture units. > > Signed-off-by: Kenneth Graunke <kenn...@whitecape.org> > --- > src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 2 +- > src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp | 2 +- > src/mesa/drivers/dri/i965/brw_wm_sampler_state.c | 27 ++++++++----- > src/mesa/drivers/dri/i965/brw_wm_surface_state.c | 51 > ++++++++++++++++++------ > src/mesa/drivers/dri/i965/gen7_sampler_state.c | 27 ++++++++----- > 5 files changed, 74 insertions(+), 35 deletions(-) > > The big patch that actually changes how we do things. > > diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp > b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp > index 601f83c..e3d3a98 100644 > --- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp > +++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp > @@ -1345,7 +1345,7 @@ fs_visitor::visit(ir_texture *ir) > if (ir->offset != NULL && ir->op != ir_txf) > inst->texture_offset = > brw_texture_offset(ir->offset->as_constant()); > > - inst->sampler = texunit; > + inst->sampler = sampler; > > if (ir->shadow_comparitor) > inst->shadow_compare = true; > diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp > b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp > index c22ba60..629ecb0 100644 > --- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp > +++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp > @@ -1897,7 +1897,7 @@ vec4_visitor::visit(ir_texture *ir) > inst->header_present = ir->offset || intel->gen < 5; > inst->base_mrf = 2; > inst->mlen = inst->header_present + 1; /* always at least one */ > - inst->sampler = texunit; > + inst->sampler = sampler; > inst->dst = dst_reg(this, ir->type); > inst->shadow_compare = ir->shadow_comparitor != NULL; > > diff --git a/src/mesa/drivers/dri/i965/brw_wm_sampler_state.c > b/src/mesa/drivers/dri/i965/brw_wm_sampler_state.c > index 10deb1d..610ef34 100644 > --- a/src/mesa/drivers/dri/i965/brw_wm_sampler_state.c > +++ b/src/mesa/drivers/dri/i965/brw_wm_sampler_state.c > @@ -334,13 +334,14 @@ brw_upload_samplers(struct brw_context *brw) > { > struct gl_context *ctx = &brw->intel.ctx; > struct brw_sampler_state *samplers; > - int i; > > - brw->sampler.count = 0; > - for (i = 0; i < BRW_MAX_TEX_UNIT; i++) { > - if (ctx->Texture.Unit[i]._ReallyEnabled) > - brw->sampler.count = i + 1; > - } > + /* BRW_NEW_VERTEX_PROGRAM and BRW_NEW_FRAGMENT_PROGRAM */ > + struct gl_program *vs = (struct gl_program *) brw->vertex_program; > + struct gl_program *fs = (struct gl_program *) brw->fragment_program; > + > + GLbitfield SamplersUsed = vs->SamplersUsed | fs->SamplersUsed; > + > + brw->sampler.count = _mesa_bitcount(SamplersUsed); > > if (brw->sampler.count == 0) > return; > @@ -350,9 +351,13 @@ brw_upload_samplers(struct brw_context *brw) > 32, &brw->sampler.offset); > memset(samplers, 0, brw->sampler.count * sizeof(*samplers)); > > - for (i = 0; i < brw->sampler.count; i++) { > - if (ctx->Texture.Unit[i]._ReallyEnabled) > - brw_update_sampler_state(brw, i, i, &samplers[i]); > + for (unsigned s = 0; s < brw->sampler.count; s++) { > + if (SamplersUsed & (1 << s)) { > + const unsigned unit = (fs->SamplersUsed & (1 << s)) ? > + fs->SamplerUnits[s] : vs->SamplerUnits[s]; > Presumably the correctness of this code relies on the assumption that if a given sampler is used by both the VS and the FS, then fs->SamplerUnits[s] == vs->SamplerUnits[s]. Would you mind adding a comment explaining why that is the case? It's not obvious to me without digging around the linker code. (An assertion to this effect might be a good idea too, since we're relying on the behaviour of code way off in glsl land). > + if (ctx->Texture.Unit[unit]._ReallyEnabled) > + brw_update_sampler_state(brw, unit, s, &samplers[s]); > + } > } > > brw->state.dirty.cache |= CACHE_NEW_SAMPLER; > @@ -361,7 +366,9 @@ brw_upload_samplers(struct brw_context *brw) > const struct brw_tracked_state brw_samplers = { > .dirty = { > .mesa = _NEW_TEXTURE, > - .brw = BRW_NEW_BATCH, > + .brw = BRW_NEW_BATCH | > + BRW_NEW_VERTEX_PROGRAM | > + BRW_NEW_FRAGMENT_PROGRAM, > .cache = 0 > }, > .emit = brw_upload_samplers, > diff --git a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c > b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c > index 0c87b84..eefa427 100644 > --- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c > +++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c > @@ -1238,22 +1238,45 @@ const struct brw_tracked_state > gen6_renderbuffer_surfaces = { > static void > brw_update_texture_surfaces(struct brw_context *brw) > { > - struct gl_context *ctx = &brw->intel.ctx; > + struct intel_context *intel = &brw->intel; > + struct gl_context *ctx = &intel->ctx; > + > + /* BRW_NEW_VERTEX_PROGRAM and BRW_NEW_FRAGMENT_PROGRAM: > + * Unfortunately, we're stuck using the gl_program structs until the > + * ARB_fragment_program front-end gets converted to GLSL IR. These > + * have the downside that SamplerUnits is split and only contains the > + * mappings for samplers active in that stage. > + */ > A duplicate of this comment would be helpful in brw_upload_samplers() and gen7_upload_samplers() too. > + struct gl_program *vs = (struct gl_program *) brw->vertex_program; > + struct gl_program *fs = (struct gl_program *) brw->fragment_program; > > - for (unsigned i = 0; i < BRW_MAX_TEX_UNIT; i++) { > - const struct gl_texture_unit *texUnit = &ctx->Texture.Unit[i]; > - const GLuint surf = SURF_INDEX_TEXTURE(i); > + unsigned num_samplers = _mesa_bitcount(vs->SamplersUsed | > fs->SamplersUsed); > > - /* _NEW_TEXTURE */ > - if (texUnit->_ReallyEnabled) { > - brw->intel.vtbl.update_texture_surface(ctx, i, > brw->wm.surf_offset, surf); > - } else { > - brw->wm.surf_offset[surf] = 0; > + for (unsigned s = 0; s < num_samplers; s++) { > + brw->vs.surf_offset[SURF_INDEX_VS_TEXTURE(s)] = 0; > + brw->wm.surf_offset[SURF_INDEX_TEXTURE(s)] = 0; > + > + if (vs->SamplersUsed & (1 << s)) { > + const unsigned unit = vs->SamplerUnits[s]; > + > + /* _NEW_TEXTURE */ > + if (ctx->Texture.Unit[unit]._ReallyEnabled) { > + intel->vtbl.update_texture_surface(ctx, unit, > + brw->vs.surf_offset, > + SURF_INDEX_VS_TEXTURE(s)); > + } > } > > - /* For now, just mirror the texture setup to the VS slots. */ > - brw->vs.surf_offset[SURF_INDEX_VS_TEXTURE(i)] = > - brw->wm.surf_offset[surf]; > + if (fs->SamplersUsed & (1 << s)) { > + const unsigned unit = fs->SamplerUnits[s]; > + > + /* _NEW_TEXTURE */ > + if (ctx->Texture.Unit[unit]._ReallyEnabled) { > + intel->vtbl.update_texture_surface(ctx, unit, > + brw->wm.surf_offset, > + SURF_INDEX_TEXTURE(s)); > + } > + } > } > > brw->state.dirty.brw |= BRW_NEW_SURFACES; > @@ -1262,7 +1285,9 @@ brw_update_texture_surfaces(struct brw_context *brw) > const struct brw_tracked_state brw_texture_surfaces = { > .dirty = { > .mesa = _NEW_TEXTURE, > - .brw = BRW_NEW_BATCH, > + .brw = BRW_NEW_BATCH | > + BRW_NEW_VERTEX_PROGRAM | > + BRW_NEW_FRAGMENT_PROGRAM, > .cache = 0 > }, > .emit = brw_update_texture_surfaces, > diff --git a/src/mesa/drivers/dri/i965/gen7_sampler_state.c > b/src/mesa/drivers/dri/i965/gen7_sampler_state.c > index 8969119..379a9c5 100644 > --- a/src/mesa/drivers/dri/i965/gen7_sampler_state.c > +++ b/src/mesa/drivers/dri/i965/gen7_sampler_state.c > @@ -188,13 +188,14 @@ gen7_upload_samplers(struct brw_context *brw) > { > struct gl_context *ctx = &brw->intel.ctx; > struct gen7_sampler_state *samplers; > - int i; > > - brw->sampler.count = 0; > - for (i = 0; i < BRW_MAX_TEX_UNIT; i++) { > - if (ctx->Texture.Unit[i]._ReallyEnabled) > - brw->sampler.count = i + 1; > - } > + /* BRW_NEW_VERTEX_PROGRAM and BRW_NEW_FRAGMENT_PROGRAM */ > + struct gl_program *vs = (struct gl_program *) brw->vertex_program; > + struct gl_program *fs = (struct gl_program *) brw->fragment_program; > + > + GLbitfield SamplersUsed = vs->SamplersUsed | fs->SamplersUsed; > + > + brw->sampler.count = _mesa_bitcount(SamplersUsed); > > if (brw->sampler.count == 0) > return; > @@ -204,9 +205,13 @@ gen7_upload_samplers(struct brw_context *brw) > 32, &brw->sampler.offset); > memset(samplers, 0, brw->sampler.count * sizeof(*samplers)); > > - for (i = 0; i < brw->sampler.count; i++) { > - if (ctx->Texture.Unit[i]._ReallyEnabled) > - gen7_update_sampler_state(brw, i, i, &samplers[i]); > + for (unsigned s = 0; s < brw->sampler.count; s++) { > + if (SamplersUsed & (1 << s)) { > + const unsigned unit = (fs->SamplersUsed & (1 << s)) ? > + fs->SamplerUnits[s] : vs->SamplerUnits[s]; > + if (ctx->Texture.Unit[unit]._ReallyEnabled) > + gen7_update_sampler_state(brw, unit, s, &samplers[s]); > + } > } > > brw->state.dirty.cache |= CACHE_NEW_SAMPLER; > @@ -215,7 +220,9 @@ gen7_upload_samplers(struct brw_context *brw) > const struct brw_tracked_state gen7_samplers = { > .dirty = { > .mesa = _NEW_TEXTURE, > - .brw = BRW_NEW_BATCH, > + .brw = BRW_NEW_BATCH | > + BRW_NEW_VERTEX_PROGRAM | > + BRW_NEW_FRAGMENT_PROGRAM, > .cache = 0 > }, > .emit = gen7_upload_samplers, > -- > 1.7.11.4 > > _______________________________________________ > mesa-dev mailing list > mesa-dev@lists.freedesktop.org > http://lists.freedesktop.org/mailman/listinfo/mesa-dev > I admit to being pretty ignorant about this corner of the code, so for this patch let's just say Acked-by: Paul Berry <stereotype...@gmail.com>
_______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev