Kenneth Graunke <kenn...@whitecape.org> writes: > From: Chris Forbes <chr...@ijw.co.nz> > > v2: (by Ken, incorporating feedback from Matt Turner): > - Rewrite the push constant allocation code to be clearer. > - Only apply the minimum VS entries workaround on Gen 8. > > v3: (by Ken) > - Fix a bug in v2 where we failed to allocate the full push constant > space when the number of enabled stages didn't divide the available > push constant space evenly. (Any left over space is now allocated > to the PS, as it was in v1.) > - Fix an off-by-one error in v2's number of enabled stages calculation. > - Use DIV_ROUND_UP for nicer formatting. > - Line wrapping fixes. > > Signed-off-by: Chris Forbes <chr...@ijw.co.nz> > Signed-off-by: Kenneth Graunke <kenn...@whitecape.org> > --- > src/mesa/drivers/dri/i965/brw_context.h | 21 +++- > src/mesa/drivers/dri/i965/gen7_blorp.cpp | 8 ++ > src/mesa/drivers/dri/i965/gen7_urb.c | 177 > +++++++++++++++++++++++++------ > 3 files changed, 170 insertions(+), 36 deletions(-) > > diff --git a/src/mesa/drivers/dri/i965/brw_context.h > b/src/mesa/drivers/dri/i965/brw_context.h > index 1cc4c7b..69bc04c 100644 > --- a/src/mesa/drivers/dri/i965/brw_context.h > +++ b/src/mesa/drivers/dri/i965/brw_context.h > @@ -1008,6 +1008,8 @@ struct brw_context > struct { > GLuint vsize; /* vertex size plus header in urb registers */ > GLuint gsize; /* GS output size in urb registers */ > + GLuint hsize; /* Tessellation control output size in urb > registers */ > + GLuint dsize; /* Tessellation evaluation output size in > urb registers */ > GLuint csize; /* constant buffer size in urb registers */ > GLuint sfsize; /* setup data size in urb registers */ > > @@ -1020,12 +1022,16 @@ struct brw_context > GLuint max_gs_entries; /* Maximum number of GS entries */ > > GLuint nr_vs_entries; > + GLuint nr_hs_entries; > + GLuint nr_ds_entries; > GLuint nr_gs_entries; > GLuint nr_clip_entries; > GLuint nr_sf_entries; > GLuint nr_cs_entries; > > GLuint vs_start; > + GLuint hs_start; > + GLuint ds_start; > GLuint gs_start; > GLuint clip_start; > GLuint sf_start; > @@ -1042,6 +1048,11 @@ struct brw_context > * URB space for the GS. > */ > bool gs_present; > + > + /* True if the most recently sent _3DSTATE_URB message allocated > + * URB space for the HS and DS. > + */ > + bool tess_present; > } urb; > > > @@ -1648,12 +1659,18 @@ void gen8_emit_3dstate_sample_pattern(struct > brw_context *brw); > /* gen7_urb.c */ > void > gen7_emit_push_constant_state(struct brw_context *brw, unsigned vs_size, > + unsigned hs_size, unsigned ds_size, > unsigned gs_size, unsigned fs_size); > > void > gen7_emit_urb_state(struct brw_context *brw, > - unsigned nr_vs_entries, unsigned vs_size, > - unsigned vs_start, unsigned nr_gs_entries, > + unsigned nr_vs_entries, > + unsigned vs_size, unsigned vs_start, > + unsigned nr_hs_entries, > + unsigned hs_size, unsigned hs_start, > + unsigned nr_ds_entries, > + unsigned ds_size, unsigned ds_start, > + unsigned nr_gs_entries, > unsigned gs_size, unsigned gs_start); > > > diff --git a/src/mesa/drivers/dri/i965/gen7_blorp.cpp > b/src/mesa/drivers/dri/i965/gen7_blorp.cpp > index e87b9d1..89b73ca 100644 > --- a/src/mesa/drivers/dri/i965/gen7_blorp.cpp > +++ b/src/mesa/drivers/dri/i965/gen7_blorp.cpp > @@ -50,6 +50,8 @@ gen7_blorp_emit_urb_config(struct brw_context *brw) > unsigned urb_size = (brw->is_haswell && brw->gt == 3) ? 
32 : 16; > gen7_emit_push_constant_state(brw, > urb_size / 2 /* vs_size */, > + 0 /* hs_size */, > + 0 /* ds_size */, > 0 /* gs_size */, > urb_size / 2 /* fs_size */); > > @@ -60,6 +62,12 @@ gen7_blorp_emit_urb_config(struct brw_context *brw) > 32 /* num_vs_entries */, > 2 /* vs_size */, > 2 /* vs_start */, > + 0 /* num_hs_entries */, > + 1 /* hs_size */, > + 2 /* hs_start */, > + 0 /* num_ds_entries */, > + 1 /* ds_size */, > + 2 /* ds_start */, > 0 /* num_gs_entries */, > 1 /* gs_size */, > 2 /* gs_start */); > diff --git a/src/mesa/drivers/dri/i965/gen7_urb.c > b/src/mesa/drivers/dri/i965/gen7_urb.c > index 421512b..f2b6ec3 100644 > --- a/src/mesa/drivers/dri/i965/gen7_urb.c > +++ b/src/mesa/drivers/dri/i965/gen7_urb.c > @@ -34,7 +34,7 @@ > * __________-__________ _________________-_________________ > * / \ / \ > * +-------------------------------------------------------------+ > - * | VS/FS/GS Push | VS/GS URB | > + * | VS/HS/DS/GS/FS Push | VS/HS/DS/GS URB | > * | Constants | Entries | > * +-------------------------------------------------------------+ > * > @@ -60,27 +60,32 @@ > static void > gen7_allocate_push_constants(struct brw_context *brw) > { > + /* BRW_NEW_GEOMETRY_PROGRAM */ > + bool gs_present = brw->geometry_program; > + > + /* BRW_NEW_TESS_CTRL_PROGRAM, BRW_NEW_TESS_EVAL_PROGRAM */ > + bool tess_present = brw->tess_eval_program; > + > unsigned avail_size = 16; > unsigned multiplier = > (brw->gen >= 8 || (brw->is_haswell && brw->gt == 3)) ? 2 : 1; > > - /* BRW_NEW_GEOMETRY_PROGRAM */ > - bool gs_present = brw->geometry_program; > + int stages = 2 + gs_present + 2 * tess_present; > > - unsigned vs_size, gs_size; > - if (gs_present) { > - vs_size = avail_size / 3; > - avail_size -= vs_size; > - gs_size = avail_size / 2; > - avail_size -= gs_size; > - } else { > - vs_size = avail_size / 2; > - avail_size -= vs_size; > - gs_size = 0; > - } > - unsigned fs_size = avail_size; > + /* Divide up the available space equally between stages. Because we > + * round down (using floor division), there may be some left over > + * space. We allocate that to the pixel shader stage. > + */ > + unsigned size_per_stage = avail_size / stages; > + > + unsigned vs_size = size_per_stage; > + unsigned hs_size = tess_present ? size_per_stage : 0; > + unsigned ds_size = tess_present ? size_per_stage : 0; > + unsigned gs_size = gs_present ? size_per_stage : 0; > + unsigned fs_size = avail_size - size_per_stage * (stages - 1);
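To sanity check the new split, here is the same arithmetic as a standalone sketch (this is not the driver code; the 16-unit budget mirrors the hunk quoted above, and the helper is made up purely for illustration):

/* Sketch of the new push constant split -- illustration only, not Mesa code.
 * avail_size is in the same units as the patch (16, which the multiplier
 * later scales to 16 kB or 32 kB of actual space).  Floor division gives
 * every enabled stage an equal share; the remainder goes to the FS.
 */
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

static void split_push_constants(unsigned avail_size,
                                 bool gs_present, bool tess_present)
{
   int stages = 2 + gs_present + 2 * tess_present;   /* VS and FS always on */
   unsigned size_per_stage = avail_size / stages;

   unsigned vs_size = size_per_stage;
   unsigned hs_size = tess_present ? size_per_stage : 0;
   unsigned ds_size = tess_present ? size_per_stage : 0;
   unsigned gs_size = gs_present ? size_per_stage : 0;
   unsigned fs_size = avail_size - size_per_stage * (stages - 1);

   /* Nothing is wasted: the FS absorbs whatever the floor division left over. */
   assert(vs_size + hs_size + ds_size + gs_size + fs_size == avail_size);
   printf("VS=%u HS=%u DS=%u GS=%u FS=%u\n",
          vs_size, hs_size, ds_size, gs_size, fs_size);
}

int main(void)
{
   split_push_constants(16, true, true);    /* all five stages: 3/3/3/3/4 */
   split_push_constants(16, true, false);   /* VS/GS/FS:        5/5/6     */
   split_push_constants(16, false, false);  /* VS/FS only:      8/8       */
   return 0;
}

With all five stages enabled the 16 units come out 3/3/3/3/4, so the slack from the floor division does end up with the PS, exactly as the v3 notes describe.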
This is a lot nicer. On the whole, the patch looks good: a straightforward extension of our current heuristics. Reviewed-by: Kristian Høgsberg <k...@bitplanet.net> > gen7_emit_push_constant_state(brw, multiplier * vs_size, > + multiplier * hs_size, multiplier * ds_size, > multiplier * gs_size, multiplier * fs_size); > > /* From p115 of the Ivy Bridge PRM (3.2.1.4 > 3DSTATE_PUSH_CONSTANT_ALLOC_VS): > @@ -99,15 +104,24 @@ gen7_allocate_push_constants(struct brw_context *brw) > > void > gen7_emit_push_constant_state(struct brw_context *brw, unsigned vs_size, > + unsigned hs_size, unsigned ds_size, > unsigned gs_size, unsigned fs_size) > { > unsigned offset = 0; > > - BEGIN_BATCH(6); > + BEGIN_BATCH(10); > OUT_BATCH(_3DSTATE_PUSH_CONSTANT_ALLOC_VS << 16 | (2 - 2)); > OUT_BATCH(vs_size | offset << GEN7_PUSH_CONSTANT_BUFFER_OFFSET_SHIFT); > offset += vs_size; > > + OUT_BATCH(_3DSTATE_PUSH_CONSTANT_ALLOC_HS << 16 | (2 - 2)); > + OUT_BATCH(hs_size | offset << GEN7_PUSH_CONSTANT_BUFFER_OFFSET_SHIFT); > + offset += hs_size; > + > + OUT_BATCH(_3DSTATE_PUSH_CONSTANT_ALLOC_DS << 16 | (2 - 2)); > + OUT_BATCH(ds_size | offset << GEN7_PUSH_CONSTANT_BUFFER_OFFSET_SHIFT); > + offset += ds_size; > + > OUT_BATCH(_3DSTATE_PUSH_CONSTANT_ALLOC_GS << 16 | (2 - 2)); > OUT_BATCH(gs_size | offset << GEN7_PUSH_CONSTANT_BUFFER_OFFSET_SHIFT); > offset += gs_size; > @@ -130,7 +144,10 @@ gen7_emit_push_constant_state(struct brw_context *brw, > unsigned vs_size, > const struct brw_tracked_state gen7_push_constant_space = { > .dirty = { > .mesa = 0, > - .brw = BRW_NEW_CONTEXT | BRW_NEW_GEOMETRY_PROGRAM, > + .brw = BRW_NEW_CONTEXT | > + BRW_NEW_GEOMETRY_PROGRAM | > + BRW_NEW_TESS_CTRL_PROGRAM | > + BRW_NEW_TESS_EVAL_PROGRAM, > }, > .emit = gen7_allocate_push_constants, > }; > @@ -138,6 +155,7 @@ const struct brw_tracked_state gen7_push_constant_space = > { > static void > gen7_upload_urb(struct brw_context *brw) > { > + const struct brw_device_info *devinfo = brw->intelScreen->devinfo; > const int push_size_kB = > (brw->gen >= 8 || (brw->is_haswell && brw->gt == 3)) ? 32 : 16; > > @@ -149,6 +167,16 @@ gen7_upload_urb(struct brw_context *brw) > unsigned gs_size = gs_present ? brw->gs.prog_data->base.urb_entry_size : > 1; > unsigned gs_entry_size_bytes = gs_size * 64; > > + /* BRW_NEW_TESS_CTRL_PROGRAM, BRW_NEW_TESS_EVAL_PROGRAM */ > + const bool tess_present = brw->tess_eval_program; > + assert(!tess_present || brw->tess_ctrl_program); > + /* BRW_NEW_TCS_PROG_DATA */ > + unsigned hs_size = tess_present ? brw->tcs.prog_data->base.urb_entry_size > : 1; > + unsigned hs_entry_size_bytes = hs_size * 64; > + /* BRW_NEW_TES_PROG_DATA */ > + unsigned ds_size = tess_present ? brw->tes.prog_data->base.urb_entry_size > : 1; > + unsigned ds_entry_size_bytes = ds_size * 64; > + > /* If we're just switching between programs with the same URB > requirements, > * skip the rest of the logic.
> */ > @@ -156,21 +184,29 @@ gen7_upload_urb(struct brw_context *brw) > !(brw->ctx.NewDriverState & BRW_NEW_URB_SIZE) && > brw->urb.vsize == vs_size && > brw->urb.gs_present == gs_present && > - brw->urb.gsize == gs_size) { > + brw->urb.gsize == gs_size && > + brw->urb.tess_present == tess_present && > + brw->urb.hsize == hs_size && > + brw->urb.dsize == ds_size) { > return; > } > brw->urb.vsize = vs_size; > brw->urb.gs_present = gs_present; > brw->urb.gsize = gs_size; > + brw->urb.tess_present = tess_present; > + brw->urb.hsize = hs_size; > + brw->urb.dsize = ds_size; > > /* From p35 of the Ivy Bridge PRM (section 1.7.1: 3DSTATE_URB_GS): > * > * VS Number of URB Entries must be divisible by 8 if the VS URB Entry > * Allocation Size is less than 9 512-bit URB entries. > * > - * Similar text exists for GS. > + * Similar text exists for HS, DS and GS. > */ > unsigned vs_granularity = (vs_size < 9) ? 8 : 1; > + unsigned hs_granularity = (hs_size < 9) ? 8 : 1; > + unsigned ds_granularity = (ds_size < 9) ? 8 : 1; > unsigned gs_granularity = (gs_size < 9) ? 8 : 1; > > /* URB allocations must be done in 8k chunks. */ > @@ -191,10 +227,17 @@ gen7_upload_urb(struct brw_context *brw) > * additional space it could actually make use of). > */ > > - /* VS has a lower limit on the number of URB entries */ > + /* VS has a lower limit on the number of URB entries. > + * > + * From the Broadwell PRM, 3DSTATE_URB_VS instruction: > + * "When tessellation is enabled, the VS Number of URB Entries must be > + * greater than or equal to 192." > + */ > + unsigned vs_min_entries = > + tess_present && brw->gen == 8 ? 192 : brw->urb.min_vs_entries; > + > unsigned vs_chunks = > - DIV_ROUND_UP(brw->urb.min_vs_entries * vs_entry_size_bytes, > - chunk_size_bytes); > + DIV_ROUND_UP(vs_min_entries * vs_entry_size_bytes, chunk_size_bytes); > unsigned vs_wants = > DIV_ROUND_UP(brw->urb.max_vs_entries * vs_entry_size_bytes, > chunk_size_bytes) - vs_chunks; > @@ -216,14 +259,36 @@ gen7_upload_urb(struct brw_context *brw) > chunk_size_bytes) - gs_chunks; > } > > + unsigned hs_chunks = 0; > + unsigned hs_wants = 0; > + unsigned ds_chunks = 0; > + unsigned ds_wants = 0; > + > + if (tess_present) { > + hs_chunks = > + DIV_ROUND_UP(hs_granularity * hs_entry_size_bytes, > + chunk_size_bytes); > + hs_wants = > + DIV_ROUND_UP(devinfo->urb.max_hs_entries * hs_entry_size_bytes, > + chunk_size_bytes) - hs_chunks; > + > + ds_chunks = > + DIV_ROUND_UP(devinfo->urb.min_ds_entries * ds_entry_size_bytes, > + chunk_size_bytes); > + ds_wants = > + DIV_ROUND_UP(brw->urb.max_ds_entries * ds_entry_size_bytes, > + chunk_size_bytes) - ds_chunks; > + } > + > /* There should always be enough URB space to satisfy the minimum > * requirements of each stage. > */ > - unsigned total_needs = push_constant_chunks + vs_chunks + gs_chunks; > + unsigned total_needs = push_constant_chunks + > + vs_chunks + hs_chunks + ds_chunks + gs_chunks; > assert(total_needs <= urb_chunks); > > /* Mete out remaining space (if any) in proportion to "wants". 
*/ > - unsigned total_wants = vs_wants + gs_wants; > + unsigned total_wants = vs_wants + hs_wants + ds_wants + gs_wants; > unsigned remaining_space = urb_chunks - total_needs; > if (remaining_space > total_wants) > remaining_space = total_wants; > @@ -232,61 +297,100 @@ gen7_upload_urb(struct brw_context *brw) > roundf(vs_wants * (((float) remaining_space) / total_wants)); > vs_chunks += vs_additional; > remaining_space -= vs_additional; > + total_wants -= vs_wants; > + > + unsigned hs_additional = (unsigned) > + round(hs_wants * (((double) remaining_space) / total_wants)); > + hs_chunks += hs_additional; > + remaining_space -= hs_additional; > + total_wants -= hs_wants; > + > + unsigned ds_additional = (unsigned) > + round(ds_wants * (((double) remaining_space) / total_wants)); > + ds_chunks += ds_additional; > + remaining_space -= ds_additional; > + total_wants -= ds_wants; > + > gs_chunks += remaining_space; > } > > /* Sanity check that we haven't over-allocated. */ > - assert(push_constant_chunks + vs_chunks + gs_chunks <= urb_chunks); > + assert(push_constant_chunks + > + vs_chunks + hs_chunks + ds_chunks + gs_chunks <= urb_chunks); > > /* Finally, compute the number of entries that can fit in the space > * allocated to each stage. > */ > unsigned nr_vs_entries = vs_chunks * chunk_size_bytes / > vs_entry_size_bytes; > + unsigned nr_hs_entries = hs_chunks * chunk_size_bytes / > hs_entry_size_bytes; > + unsigned nr_ds_entries = ds_chunks * chunk_size_bytes / > ds_entry_size_bytes; > unsigned nr_gs_entries = gs_chunks * chunk_size_bytes / > gs_entry_size_bytes; > > /* Since we rounded up when computing *_wants, this may be slightly more > * than the maximum allowed amount, so correct for that. > */ > nr_vs_entries = MIN2(nr_vs_entries, brw->urb.max_vs_entries); > + nr_hs_entries = MIN2(nr_hs_entries, brw->urb.max_hs_entries); > + nr_ds_entries = MIN2(nr_ds_entries, brw->urb.max_ds_entries); > nr_gs_entries = MIN2(nr_gs_entries, brw->urb.max_gs_entries); > > /* Ensure that we program a multiple of the granularity. */ > nr_vs_entries = ROUND_DOWN_TO(nr_vs_entries, vs_granularity); > + nr_hs_entries = ROUND_DOWN_TO(nr_hs_entries, hs_granularity); > + nr_ds_entries = ROUND_DOWN_TO(nr_ds_entries, ds_granularity); > nr_gs_entries = ROUND_DOWN_TO(nr_gs_entries, gs_granularity); > > /* Finally, sanity check to make sure we have at least the minimum number > * of entries needed for each stage. > */ > - assert(nr_vs_entries >= brw->urb.min_vs_entries); > + assert(nr_vs_entries >= vs_min_entries); > if (gs_present) > assert(nr_gs_entries >= 2); > + if (tess_present) { > + assert(nr_hs_entries >= 1); > + assert(nr_ds_entries >= devinfo->urb.min_ds_entries); > + } > > /* Gen7 doesn't actually use brw->urb.nr_{vs,gs}_entries, but it seems > * better to put reasonable data in there rather than leave them > * uninitialized. 
> */ > brw->urb.nr_vs_entries = nr_vs_entries; > + brw->urb.nr_hs_entries = nr_hs_entries; > + brw->urb.nr_ds_entries = nr_ds_entries; > brw->urb.nr_gs_entries = nr_gs_entries; > > /* Lay out the URB in the following order: > * - push constants > * - VS > + * - HS > + * - DS > * - GS > */ > brw->urb.vs_start = push_constant_chunks; > - brw->urb.gs_start = push_constant_chunks + vs_chunks; > + brw->urb.hs_start = push_constant_chunks + vs_chunks; > + brw->urb.ds_start = push_constant_chunks + vs_chunks + hs_chunks; > + brw->urb.gs_start = push_constant_chunks + vs_chunks + hs_chunks + > + ds_chunks; > > if (brw->gen == 7 && !brw->is_haswell && !brw->is_baytrail) > gen7_emit_vs_workaround_flush(brw); > gen7_emit_urb_state(brw, > brw->urb.nr_vs_entries, vs_size, brw->urb.vs_start, > + brw->urb.nr_hs_entries, hs_size, brw->urb.hs_start, > + brw->urb.nr_ds_entries, ds_size, brw->urb.ds_start, > brw->urb.nr_gs_entries, gs_size, brw->urb.gs_start); > } > > void > gen7_emit_urb_state(struct brw_context *brw, > - unsigned nr_vs_entries, unsigned vs_size, > - unsigned vs_start, unsigned nr_gs_entries, > + unsigned nr_vs_entries, > + unsigned vs_size, unsigned vs_start, > + unsigned nr_hs_entries, > + unsigned hs_size, unsigned hs_start, > + unsigned nr_ds_entries, > + unsigned ds_size, unsigned ds_start, > + unsigned nr_gs_entries, > unsigned gs_size, unsigned gs_start) > { > BEGIN_BATCH(8); > @@ -300,14 +404,15 @@ gen7_emit_urb_state(struct brw_context *brw, > ((gs_size - 1) << GEN7_URB_ENTRY_SIZE_SHIFT) | > (gs_start << GEN7_URB_STARTING_ADDRESS_SHIFT)); > > - /* Allocate the HS and DS zero space - we don't use them. */ > OUT_BATCH(_3DSTATE_URB_HS << 16 | (2 - 2)); > - OUT_BATCH((0 << GEN7_URB_ENTRY_SIZE_SHIFT) | > - (vs_start << GEN7_URB_STARTING_ADDRESS_SHIFT)); > + OUT_BATCH(nr_hs_entries | > + ((hs_size - 1) << GEN7_URB_ENTRY_SIZE_SHIFT) | > + (hs_start << GEN7_URB_STARTING_ADDRESS_SHIFT)); > > OUT_BATCH(_3DSTATE_URB_DS << 16 | (2 - 2)); > - OUT_BATCH((0 << GEN7_URB_ENTRY_SIZE_SHIFT) | > - (vs_start << GEN7_URB_STARTING_ADDRESS_SHIFT)); > + OUT_BATCH(nr_ds_entries | > + ((ds_size - 1) << GEN7_URB_ENTRY_SIZE_SHIFT) | > + (ds_start << GEN7_URB_STARTING_ADDRESS_SHIFT)); > ADVANCE_BATCH(); > } > > @@ -317,7 +422,11 @@ const struct brw_tracked_state gen7_urb = { > .brw = BRW_NEW_CONTEXT | > BRW_NEW_URB_SIZE | > BRW_NEW_GEOMETRY_PROGRAM | > + BRW_NEW_TESS_CTRL_PROGRAM | > + BRW_NEW_TESS_EVAL_PROGRAM | > BRW_NEW_GS_PROG_DATA | > + BRW_NEW_TCS_PROG_DATA | > + BRW_NEW_TES_PROG_DATA | > BRW_NEW_VS_PROG_DATA, > }, > .emit = gen7_upload_urb, > -- > 2.6.3
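One more worked example, since the URB sizing is where the subtle arithmetic lives. The needs/wants logic first reserves each stage's minimum number of 8 kB chunks, then hands the spare chunks out in proportion to how many more each stage could still use, with the GS absorbing any rounding slack. Here is a standalone sketch of that distribution (not the driver code; the helper and the chunk counts in main() are invented for illustration):

/* Sketch of the URB "needs + wants" chunk split -- illustration only.
 * Stage order matches the driver: VS, HS, DS, GS; the last stage soaks
 * up whatever the proportional rounding leaves behind.
 */
#include <math.h>
#include <stdio.h>

#define NUM_STAGES 4   /* VS, HS, DS, GS */

static void split_urb_chunks(unsigned urb_chunks, unsigned push_chunks,
                             const unsigned needs[NUM_STAGES],
                             const unsigned wants[NUM_STAGES],
                             unsigned chunks[NUM_STAGES])
{
   unsigned total_needs = push_chunks;
   unsigned total_wants = 0;
   for (int i = 0; i < NUM_STAGES; i++) {
      chunks[i] = needs[i];            /* every stage gets its minimum */
      total_needs += needs[i];
      total_wants += wants[i];
   }

   unsigned remaining = urb_chunks - total_needs;   /* assume the minimums fit */
   if (remaining > total_wants)
      remaining = total_wants;

   if (remaining) {
      /* Share the leftovers proportionally; GS takes the final remainder. */
      for (int i = 0; i < NUM_STAGES - 1 && total_wants; i++) {
         unsigned extra = (unsigned)
            round(wants[i] * ((double) remaining / total_wants));
         chunks[i] += extra;
         remaining -= extra;
         total_wants -= wants[i];
      }
      chunks[NUM_STAGES - 1] += remaining;
   }
}

int main(void)
{
   /* Invented numbers: 32 URB chunks total, 4 reserved for push constants. */
   const unsigned needs[NUM_STAGES] = { 6, 2, 2, 2 };
   const unsigned wants[NUM_STAGES] = { 8, 4, 4, 6 };
   unsigned chunks[NUM_STAGES];

   split_urb_chunks(32, 4, needs, wants, chunks);
   printf("VS=%u HS=%u DS=%u GS=%u chunks\n",
          chunks[0], chunks[1], chunks[2], chunks[3]);
   return 0;
}

With those made-up numbers the 16 spare chunks land as 6/3/3/4 on top of the minimums (12/5/5/6 chunks in total), everything adds back up to 28 + 4 push constant chunks = 32, and nothing is lost to rounding -- which is the property the gs_chunks += remaining_space line in the patch guarantees.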