Reviewed-by: Ian Romanick <ian.d.roman...@intel.com> On 09/26/2016 11:23 AM, Kenneth Graunke wrote: > In core profile, we support up to 16 viewports. However, in the > majority of cases, only 1 of them is actually used - we only need > the others if the last shader stage prior to the rasterizer writes > gl_ViewportIndex. > > Processing all 16 viewports adds additional CPU overhead, which hurts > CPU-intensive workloads such as Glamor. This meant that switching to > core profile actually penalized Glamor to an extent, which is > unfortunate. > > This patch tracks the number of relevant viewports, switching between > 1 and ctx->Const.MaxViewports if gl_ViewportIndex is written. A new > BRW_NEW_VIEWPORT_COUNT flag tracks this. This could mean re-emitting > viewport state when switching, but hopefully this is offset by doing > 1/16th of the work in the common case. The new flag is also lighter > weight than BRW_NEW_VUE_MAP_GEOM_OUT, which we were using in one case. > > According to Eric Anholt, this reduces the CPU overhead of scissor and > viewport state changes in Glamor from 2.5% or so to .8% or so. 
> > Cc: Eric Anholt <e...@anholt.net> > Signed-off-by: Kenneth Graunke <kenn...@whitecape.org> > --- > src/mesa/drivers/dri/i965/brw_cc.c | 10 +++++++--- > src/mesa/drivers/dri/i965/brw_context.c | 1 + > src/mesa/drivers/dri/i965/brw_context.h | 9 +++++++++ > src/mesa/drivers/dri/i965/brw_gs_state.c | 6 ++++-- > src/mesa/drivers/dri/i965/brw_state_upload.c | 11 +++++++++++ > src/mesa/drivers/dri/i965/gen6_clip_state.c | 16 +++++++--------- > src/mesa/drivers/dri/i965/gen6_scissor_state.c | 10 +++++++--- > src/mesa/drivers/dri/i965/gen6_viewport_state.c | 22 +++++++++++++++------- > src/mesa/drivers/dri/i965/gen7_viewport_state.c | 10 +++++++--- > src/mesa/drivers/dri/i965/gen8_viewport_state.c | 10 +++++++--- > 10 files changed, 75 insertions(+), 30 deletions(-) > > diff --git a/src/mesa/drivers/dri/i965/brw_cc.c > b/src/mesa/drivers/dri/i965/brw_cc.c > index 5c58b44..b11d7c8 100644 > --- a/src/mesa/drivers/dri/i965/brw_cc.c > +++ b/src/mesa/drivers/dri/i965/brw_cc.c > @@ -44,12 +44,15 @@ brw_upload_cc_vp(struct brw_context *brw) > struct gl_context *ctx = &brw->ctx; > struct brw_cc_viewport *ccv; > > + /* BRW_NEW_VIEWPORT_COUNT */ > + const unsigned viewport_count = brw->clip.viewport_count; > + > ccv = brw_state_batch(brw, AUB_TRACE_CC_VP_STATE, > - sizeof(*ccv) * ctx->Const.MaxViewports, 32, > + sizeof(*ccv) * viewport_count, 32, > &brw->cc.vp_offset); > > /* _NEW_TRANSFORM */ > - for (unsigned i = 0; i < ctx->Const.MaxViewports; i++) { > + for (unsigned i = 0; i < viewport_count; i++) { > if (ctx->Transform.DepthClamp) { > /* _NEW_VIEWPORT */ > ccv[i].min_depth = MIN2(ctx->ViewportArray[i].Near, > @@ -77,7 +80,8 @@ const struct brw_tracked_state brw_cc_vp = { > .mesa = _NEW_TRANSFORM | > _NEW_VIEWPORT, > .brw = BRW_NEW_BATCH | > - BRW_NEW_BLORP, > + BRW_NEW_BLORP | > + BRW_NEW_VIEWPORT_COUNT, > }, > .emit = brw_upload_cc_vp > }; > diff --git a/src/mesa/drivers/dri/i965/brw_context.c > b/src/mesa/drivers/dri/i965/brw_context.c > index 6efad78..b0eec16 100644 > 
--- a/src/mesa/drivers/dri/i965/brw_context.c > +++ b/src/mesa/drivers/dri/i965/brw_context.c > @@ -1085,6 +1085,7 @@ brwCreateContext(gl_api api, > brw->prim_restart.enable_cut_index = false; > brw->gs.enabled = false; > brw->sf.viewport_transform_enable = true; > + brw->clip.viewport_count = 1; > > brw->predicate.state = BRW_PREDICATE_STATE_RENDER; > > diff --git a/src/mesa/drivers/dri/i965/brw_context.h > b/src/mesa/drivers/dri/i965/brw_context.h > index 00f0adc..b27fe51 100644 > --- a/src/mesa/drivers/dri/i965/brw_context.h > +++ b/src/mesa/drivers/dri/i965/brw_context.h > @@ -226,6 +226,7 @@ enum brw_state_id { > BRW_STATE_URB_SIZE, > BRW_STATE_CC_STATE, > BRW_STATE_BLORP, > + BRW_STATE_VIEWPORT_COUNT, > BRW_NUM_STATE_BITS > }; > > @@ -294,6 +295,7 @@ enum brw_state_id { > #define BRW_NEW_PROGRAM_CACHE (1ull << BRW_STATE_PROGRAM_CACHE) > #define BRW_NEW_STATE_BASE_ADDRESS (1ull << > BRW_STATE_STATE_BASE_ADDRESS) > #define BRW_NEW_VUE_MAP_GEOM_OUT (1ull << BRW_STATE_VUE_MAP_GEOM_OUT) > +#define BRW_NEW_VIEWPORT_COUNT (1ull << BRW_STATE_VIEWPORT_COUNT) > #define BRW_NEW_TRANSFORM_FEEDBACK (1ull << > BRW_STATE_TRANSFORM_FEEDBACK) > #define BRW_NEW_RASTERIZER_DISCARD (1ull << > BRW_STATE_RASTERIZER_DISCARD) > #define BRW_NEW_STATS_WM (1ull << BRW_STATE_STATS_WM) > @@ -1160,6 +1162,13 @@ struct brw_context > * instead of vp_bo. > */ > uint32_t vp_offset; > + > + /** > + * The number of viewports to use. If gl_ViewportIndex is written, > + * we can have up to ctx->Const.MaxViewports viewports. If not, > + * the viewport index is always 0, so we can only emit one. 
> + */ > + uint8_t viewport_count; > } clip; > > > diff --git a/src/mesa/drivers/dri/i965/brw_gs_state.c > b/src/mesa/drivers/dri/i965/brw_gs_state.c > index 1757201..8e3bf1e 100644 > --- a/src/mesa/drivers/dri/i965/brw_gs_state.c > +++ b/src/mesa/drivers/dri/i965/brw_gs_state.c > @@ -83,7 +83,8 @@ brw_upload_gs_unit(struct brw_context *brw) > if (unlikely(INTEL_DEBUG & DEBUG_STATS)) > gs->thread4.stats_enable = 1; > > - gs->gs6.max_vp_index = brw->ctx.Const.MaxViewports - 1; > + /* BRW_NEW_VIEWPORT_COUNT */ > + gs->gs6.max_vp_index = brw->clip.viewport_count - 1; > > brw->ctx.NewDriverState |= BRW_NEW_GEN4_UNIT_STATE; > } > @@ -96,7 +97,8 @@ const struct brw_tracked_state brw_gs_unit = { > BRW_NEW_CURBE_OFFSETS | > BRW_NEW_FF_GS_PROG_DATA | > BRW_NEW_PROGRAM_CACHE | > - BRW_NEW_URB_FENCE, > + BRW_NEW_URB_FENCE | > + BRW_NEW_VIEWPORT_COUNT, > }, > .emit = brw_upload_gs_unit, > }; > diff --git a/src/mesa/drivers/dri/i965/brw_state_upload.c > b/src/mesa/drivers/dri/i965/brw_state_upload.c > index 60f3be6..4f74e23 100644 > --- a/src/mesa/drivers/dri/i965/brw_state_upload.c > +++ b/src/mesa/drivers/dri/i965/brw_state_upload.c > @@ -655,6 +655,7 @@ static struct dirty_bit_map brw_bits[] = { > DEFINE_BIT(BRW_NEW_URB_SIZE), > DEFINE_BIT(BRW_NEW_CC_STATE), > DEFINE_BIT(BRW_NEW_BLORP), > + DEFINE_BIT(BRW_NEW_VIEWPORT_COUNT), > {0, 0, 0} > }; > > @@ -710,6 +711,8 @@ static inline void > brw_upload_programs(struct brw_context *brw, > enum brw_pipeline pipeline) > { > + struct gl_context *ctx = &brw->ctx; > + > if (pipeline == BRW_RENDER_PIPELINE) { > brw_upload_vs_prog(brw); > brw_upload_tess_programs(brw); > @@ -736,6 +739,14 @@ brw_upload_programs(struct brw_context *brw, > old_separate != brw->vue_map_geom_out.separate) > brw->ctx.NewDriverState |= BRW_NEW_VUE_MAP_GEOM_OUT; > > + if ((old_slots ^ brw->vue_map_geom_out.slots_valid) & > + VARYING_BIT_VIEWPORT) { > + ctx->NewDriverState |= BRW_NEW_VIEWPORT_COUNT; > + brw->clip.viewport_count = > + 
(brw->vue_map_geom_out.slots_valid & VARYING_BIT_VIEWPORT) ? > + ctx->Const.MaxViewports : 1; > + } > + > if (brw->gen < 6) { > brw_setup_vue_interpolation(brw); > brw_upload_clip_prog(brw); > diff --git a/src/mesa/drivers/dri/i965/gen6_clip_state.c > b/src/mesa/drivers/dri/i965/gen6_clip_state.c > index 7dc9740..9c33e67 100644 > --- a/src/mesa/drivers/dri/i965/gen6_clip_state.c > +++ b/src/mesa/drivers/dri/i965/gen6_clip_state.c > @@ -157,6 +157,9 @@ upload_clip_state(struct brw_context *brw) > > dw2 |= GEN6_CLIP_GB_TEST; > > + /* BRW_NEW_VIEWPORT_COUNT */ > + const unsigned viewport_count = brw->clip.viewport_count; > + > /* We need to disable guardband clipping if the guardband (which we always > * program to the maximum screen-space bounding box of 8K x 8K) will be > * smaller than the viewport. > @@ -180,7 +183,7 @@ upload_clip_state(struct brw_context *brw) > * "objects must have a screenspace bounding box not exceeding 8K in the X > * or Y direction" restriction. Instead, they're clipped. > */ > - for (unsigned i = 0; i < ctx->Const.MaxViewports; i++) { > + for (unsigned i = 0; i < viewport_count; i++) { > if (ctx->ViewportArray[i].Width > 8192 || > ctx->ViewportArray[i].Height > 8192) { > dw2 &= ~GEN6_CLIP_GB_TEST; > @@ -203,7 +206,7 @@ upload_clip_state(struct brw_context *brw) > const float fb_width = (float)_mesa_geometric_width(fb); > const float fb_height = (float)_mesa_geometric_height(fb); > > - for (unsigned i = 0; i < ctx->Const.MaxViewports; i++) { > + for (unsigned i = 0; i < viewport_count; i++) { > if (ctx->ViewportArray[i].X != 0 || > ctx->ViewportArray[i].Y != 0 || > ctx->ViewportArray[i].Width != fb_width || > @@ -236,11 +239,6 @@ upload_clip_state(struct brw_context *brw) > if (!brw_is_drawing_points(brw) && !brw_is_drawing_lines(brw)) > dw2 |= GEN6_CLIP_XY_TEST; > > - /* BRW_NEW_VUE_MAP_GEOM_OUT */ > - const int max_vp_index = > - (brw->vue_map_geom_out.slots_valid & VARYING_BIT_VIEWPORT) != 0 ? 
> - ctx->Const.MaxViewports : 1; > - > BEGIN_BATCH(4); > OUT_BATCH(_3DSTATE_CLIP << 16 | (4 - 2)); > OUT_BATCH(dw1); > @@ -250,7 +248,7 @@ upload_clip_state(struct brw_context *brw) > OUT_BATCH(U_FIXED(0.125, 3) << GEN6_CLIP_MIN_POINT_WIDTH_SHIFT | > U_FIXED(255.875, 3) << GEN6_CLIP_MAX_POINT_WIDTH_SHIFT | > (_mesa_geometric_layers(fb) > 0 ? 0 : > GEN6_CLIP_FORCE_ZERO_RTAINDEX) | > - ((max_vp_index - 1) & GEN6_CLIP_MAX_VP_INDEX_MASK)); > + ((viewport_count - 1) & GEN6_CLIP_MAX_VP_INDEX_MASK)); > ADVANCE_BATCH(); > } > > @@ -268,7 +266,7 @@ const struct brw_tracked_state gen6_clip_state = { > BRW_NEW_PRIMITIVE | > BRW_NEW_RASTERIZER_DISCARD | > BRW_NEW_TES_PROG_DATA | > - BRW_NEW_VUE_MAP_GEOM_OUT, > + BRW_NEW_VIEWPORT_COUNT, > }, > .emit = upload_clip_state, > }; > diff --git a/src/mesa/drivers/dri/i965/gen6_scissor_state.c > b/src/mesa/drivers/dri/i965/gen6_scissor_state.c > index b03ac73..860445a 100644 > --- a/src/mesa/drivers/dri/i965/gen6_scissor_state.c > +++ b/src/mesa/drivers/dri/i965/gen6_scissor_state.c > @@ -42,8 +42,11 @@ gen6_upload_scissor_state(struct brw_context *brw) > const unsigned int fb_width= _mesa_geometric_width(ctx->DrawBuffer); > const unsigned int fb_height = _mesa_geometric_height(ctx->DrawBuffer); > > + /* BRW_NEW_VIEWPORT_COUNT */ > + const unsigned viewport_count = brw->clip.viewport_count; > + > scissor = brw_state_batch(brw, AUB_TRACE_SCISSOR_STATE, > - sizeof(*scissor) * ctx->Const.MaxViewports, 32, > + sizeof(*scissor) * viewport_count, 32, > &scissor_state_offset); > > /* _NEW_SCISSOR | _NEW_BUFFERS | _NEW_VIEWPORT */ > @@ -55,7 +58,7 @@ gen6_upload_scissor_state(struct brw_context *brw) > * Note that the hardware's coordinates are inclusive, while Mesa's min is > * inclusive but max is exclusive. 
> */ > - for (unsigned i = 0; i < ctx->Const.MaxViewports; i++) { > + for (unsigned i = 0; i < viewport_count; i++) { > int bbox[4]; > > bbox[0] = MAX2(ctx->ViewportArray[i].X, 0); > @@ -102,7 +105,8 @@ const struct brw_tracked_state gen6_scissor_state = { > _NEW_SCISSOR | > _NEW_VIEWPORT, > .brw = BRW_NEW_BATCH | > - BRW_NEW_BLORP, > + BRW_NEW_BLORP | > + BRW_NEW_VIEWPORT_COUNT, > }, > .emit = gen6_upload_scissor_state, > }; > diff --git a/src/mesa/drivers/dri/i965/gen6_viewport_state.c > b/src/mesa/drivers/dri/i965/gen6_viewport_state.c > index eacffb9..ad1e72d 100644 > --- a/src/mesa/drivers/dri/i965/gen6_viewport_state.c > +++ b/src/mesa/drivers/dri/i965/gen6_viewport_state.c > @@ -42,10 +42,13 @@ gen6_upload_clip_vp(struct brw_context *brw) > struct gl_context *ctx = &brw->ctx; > struct brw_clipper_viewport *vp; > > + /* BRW_NEW_VIEWPORT_COUNT */ > + const unsigned viewport_count = brw->clip.viewport_count; > + > vp = brw_state_batch(brw, AUB_TRACE_CLIP_VP_STATE, > - sizeof(*vp) * ctx->Const.MaxViewports, 32, > &brw->clip.vp_offset); > + sizeof(*vp) * viewport_count, 32, > &brw->clip.vp_offset); > > - for (unsigned i = 0; i < ctx->Const.MaxViewports; i++) { > + for (unsigned i = 0; i < viewport_count; i++) { > /* According to the "Vertex X,Y Clamping and Quantization" section of > the > * Strips and Fans documentation, objects must not have a screen-space > * extents of over 8192 pixels, or they may be mis-rasterized. 
The > maximum > @@ -74,7 +77,8 @@ const struct brw_tracked_state gen6_clip_vp = { > .dirty = { > .mesa = _NEW_VIEWPORT, > .brw = BRW_NEW_BATCH | > - BRW_NEW_BLORP, > + BRW_NEW_BLORP | > + BRW_NEW_VIEWPORT_COUNT, > }, > .emit = gen6_upload_clip_vp, > }; > @@ -87,10 +91,13 @@ gen6_upload_sf_vp(struct brw_context *brw) > GLfloat y_scale, y_bias; > const bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer); > > + /* BRW_NEW_VIEWPORT_COUNT */ > + const unsigned viewport_count = brw->clip.viewport_count; > + > sfv = brw_state_batch(brw, AUB_TRACE_SF_VP_STATE, > - sizeof(*sfv) * ctx->Const.MaxViewports, > + sizeof(*sfv) * viewport_count, > 32, &brw->sf.vp_offset); > - memset(sfv, 0, sizeof(*sfv) * ctx->Const.MaxViewports); > + memset(sfv, 0, sizeof(*sfv) * viewport_count); > > /* _NEW_BUFFERS */ > if (render_to_fbo) { > @@ -101,7 +108,7 @@ gen6_upload_sf_vp(struct brw_context *brw) > y_bias = (float)_mesa_geometric_height(ctx->DrawBuffer); > } > > - for (unsigned i = 0; i < ctx->Const.MaxViewports; i++) { > + for (unsigned i = 0; i < viewport_count; i++) { > float scale[3], translate[3]; > > /* _NEW_VIEWPORT */ > @@ -123,7 +130,8 @@ const struct brw_tracked_state gen6_sf_vp = { > .mesa = _NEW_BUFFERS | > _NEW_VIEWPORT, > .brw = BRW_NEW_BATCH | > - BRW_NEW_BLORP, > + BRW_NEW_BLORP | > + BRW_NEW_VIEWPORT_COUNT, > }, > .emit = gen6_upload_sf_vp, > }; > diff --git a/src/mesa/drivers/dri/i965/gen7_viewport_state.c > b/src/mesa/drivers/dri/i965/gen7_viewport_state.c > index 34f93af..c447331 100644 > --- a/src/mesa/drivers/dri/i965/gen7_viewport_state.c > +++ b/src/mesa/drivers/dri/i965/gen7_viewport_state.c > @@ -37,8 +37,11 @@ gen7_upload_sf_clip_viewport(struct brw_context *brw) > const bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer); > struct gen7_sf_clip_viewport *vp; > > + /* BRW_NEW_VIEWPORT_COUNT */ > + const unsigned viewport_count = brw->clip.viewport_count; > + > vp = brw_state_batch(brw, AUB_TRACE_SF_VP_STATE, > - sizeof(*vp) * ctx->Const.MaxViewports, 64, 
> + sizeof(*vp) * viewport_count, 64, > &brw->sf.vp_offset); > /* Also assign to clip.vp_offset in case something uses it. */ > brw->clip.vp_offset = brw->sf.vp_offset; > @@ -52,7 +55,7 @@ gen7_upload_sf_clip_viewport(struct brw_context *brw) > y_bias = (float)_mesa_geometric_height(ctx->DrawBuffer); > } > > - for (unsigned i = 0; i < ctx->Const.MaxViewports; i++) { > + for (unsigned i = 0; i < viewport_count; i++) { > float scale[3], translate[3]; > _mesa_get_viewport_xform(ctx, i, scale, translate); > > @@ -97,7 +100,8 @@ const struct brw_tracked_state gen7_sf_clip_viewport = { > .mesa = _NEW_BUFFERS | > _NEW_VIEWPORT, > .brw = BRW_NEW_BATCH | > - BRW_NEW_BLORP, > + BRW_NEW_BLORP | > + BRW_NEW_VIEWPORT_COUNT, > }, > .emit = gen7_upload_sf_clip_viewport, > }; > diff --git a/src/mesa/drivers/dri/i965/gen8_viewport_state.c > b/src/mesa/drivers/dri/i965/gen8_viewport_state.c > index acaee1a..84000e3 100644 > --- a/src/mesa/drivers/dri/i965/gen8_viewport_state.c > +++ b/src/mesa/drivers/dri/i965/gen8_viewport_state.c > @@ -37,8 +37,11 @@ gen8_upload_sf_clip_viewport(struct brw_context *brw) > const float fb_height = (float)_mesa_geometric_height(ctx->DrawBuffer); > const bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer); > > + /* BRW_NEW_VIEWPORT_COUNT */ > + const unsigned viewport_count = brw->clip.viewport_count; > + > float *vp = brw_state_batch(brw, AUB_TRACE_SF_VP_STATE, > - 16 * 4 * ctx->Const.MaxViewports, > + 16 * 4 * viewport_count, > 64, &brw->sf.vp_offset); > /* Also assign to clip.vp_offset in case something uses it. 
*/ > brw->clip.vp_offset = brw->sf.vp_offset; > @@ -52,7 +55,7 @@ gen8_upload_sf_clip_viewport(struct brw_context *brw) > y_bias = fb_height; > } > > - for (unsigned i = 0; i < ctx->Const.MaxViewports; i++) { > + for (unsigned i = 0; i < viewport_count; i++) { > float scale[3], translate[3]; > _mesa_get_viewport_xform(ctx, i, scale, translate); > > @@ -136,7 +139,8 @@ const struct brw_tracked_state gen8_sf_clip_viewport = { > .mesa = _NEW_BUFFERS | > _NEW_VIEWPORT, > .brw = BRW_NEW_BATCH | > - BRW_NEW_BLORP, > + BRW_NEW_BLORP | > + BRW_NEW_VIEWPORT_COUNT, > }, > .emit = gen8_upload_sf_clip_viewport, > }; >
_______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev