On 28 August 2013 21:00, Kenneth Graunke <kenn...@whitecape.org> wrote:
> On 08/26/2013 03:12 PM, Paul Berry wrote: > >> Previously, we gave all of the URB space (other than the small amount >> that is used for push constants) to the vertex shader. However, when >> a geometry shader is active, we need to divide it up between the >> vertex and geometry shaders. >> >> The size of the URB entries for the vertex and geometry shaders can >> vary dramatically from one shader to the next. So it doesn't make >> sense to simply split the available space in two. In particular: >> >> - On Ivy Bridge GT1, this would not leave enough space for the worst >> case geometry shader, which requires 64k of URB space. >> >> - Due to hardware-imposed limits on the maximum number of URB entries, >> sometimes a given shader stage will only be capable of using a small >> amount of URB space. When this happens, it may make sense to >> allocate substantially less than half of the available space to that >> stage. >> >> Our algorithm for dividing space between the two stages is to first >> compute (a) the minimum amount of URB space that each stage needs in >> order to function properly, and (b) the amount of additional URB space >> that each stage "wants" (i.e. that it would be capable of making use >> of). If the total amount of space available is not enough to satisfy >> needs + wants, then each stage's "wants" amount is scaled back by the >> same factor in order to fit. >> >> When only a vertex shader is active, this algorithm produces >> equivalent results to the old algorithm (if the vertex shader stage >> can make use of all the available URB space, we assign all the space >> to it; if it can't, we let it use as much as it can). >> >> In the future, when we need to support tessellation control and >> tessellation evaluation pipeline stages, it should be straightforward >> to expand this algorithm to cover them. >> >> v2: Use "unsigned" rather than "GLuint". 
>> --- >> src/mesa/drivers/dri/i965/brw_context.h | 6 +- >> src/mesa/drivers/dri/i965/gen7_blorp.cpp | 16 ++-- >> src/mesa/drivers/dri/i965/gen7_urb.c | 155 >> +++++++++++++++++++++++++------ >> 3 files changed, 142 insertions(+), 35 deletions(-) >> >> diff --git a/src/mesa/drivers/dri/i965/brw_context.h >> b/src/mesa/drivers/dri/i965/brw_context.h >> index be5175f..77f2a6b 100644 >> --- a/src/mesa/drivers/dri/i965/brw_context.h >> +++ b/src/mesa/drivers/dri/i965/brw_context.h >> @@ -1511,8 +1511,10 @@ void >> gen7_allocate_push_constants(struct brw_context *brw); >> >> void >> -gen7_emit_urb_state(struct brw_context *brw, GLuint nr_vs_entries, >> - GLuint vs_size, GLuint vs_start); >> +gen7_emit_urb_state(struct brw_context *brw, >> + unsigned nr_vs_entries, unsigned vs_size, >> + unsigned vs_start, unsigned nr_gs_entries, >> + unsigned gs_size, unsigned gs_start); >> >> >> >> diff --git a/src/mesa/drivers/dri/i965/gen7_blorp.cpp >> b/src/mesa/drivers/dri/i965/gen7_blorp.cpp >> index a387836..6c798b1 100644 >> --- a/src/mesa/drivers/dri/i965/gen7_blorp.cpp >> +++ b/src/mesa/drivers/dri/i965/gen7_blorp.cpp >> @@ -51,14 +51,16 @@ static void >> gen7_blorp_emit_urb_config(struct brw_context *brw, >> const brw_blorp_params *params) >> { >> - /* The minimum valid value is 32. See 3DSTATE_URB_VS, >> - * Dword 1.15:0 "VS Number of URB Entries". >> + /* The minimum valid number of VS entries is 32. See 3DSTATE_URB_VS, >> Dword >> + * 1.15:0 "VS Number of URB Entries". 
>> */ >> - int num_vs_entries = 32; >> - int vs_size = 2; >> - int vs_start = 2; /* skip over push constants */ >> - >> - gen7_emit_urb_state(brw, num_vs_entries, vs_size, vs_start); >> + gen7_emit_urb_state(brw, >> + 32 /* num_vs_entries */, >> + 2 /* vs_size */, >> + 2 /* vs_start */, >> + 0 /* num_gs_entries */, >> + 1 /* gs_size */, >> + 2 /* gs_start */); >> } >> >> >> diff --git a/src/mesa/drivers/dri/i965/gen7_urb.c >> b/src/mesa/drivers/dri/i965/gen7_urb.c >> index 927af37..2d10cc12 100644 >> --- a/src/mesa/drivers/dri/i965/gen7_urb.c >> +++ b/src/mesa/drivers/dri/i965/gen7_urb.c >> @@ -74,34 +74,136 @@ gen7_upload_urb(struct brw_context *brw) >> { >> const int push_size_kB = brw->is_haswell && brw->gt == 3 ? 32 : 16; >> >> - /* Total space for entries is URB size - 16kB for push constants */ >> - int handle_region_size = (brw->urb.size - push_size_kB) * 1024; /* >> bytes */ >> - >> /* CACHE_NEW_VS_PROG */ >> unsigned vs_size = MAX2(brw->vs.prog_data->base.urb_entry_size, >> 1); >> - >> - int nr_vs_entries = handle_region_size / (vs_size * 64); >> - if (nr_vs_entries > brw->urb.max_vs_entries) >> - nr_vs_entries = brw->urb.max_vs_entries; >> - >> - /* According to volume 2a, nr_vs_entries must be a multiple of 8. */ >> - brw->urb.nr_vs_entries = ROUND_DOWN_TO(nr_vs_entries, 8); >> - >> - /* URB Starting Addresses are specified in multiples of 8kB. */ >> - brw->urb.vs_start = push_size_kB / 8; /* skip over push constants */ >> - >> - assert(brw->urb.nr_vs_entries % 8 == 0); >> - assert(brw->urb.nr_gs_entries % 8 == 0); >> - /* GS requirement */ >> - assert(!brw->ff_gs.prog_active); >> + unsigned vs_entry_size_bytes = vs_size * 64; >> + /* BRW_NEW_GEOMETRY_PROGRAM, CACHE_NEW_GS_PROG */ >> + bool gs_present = brw->geometry_program; >> + unsigned gs_size = gs_present ? 
brw->gs.prog_data->base.urb_entry_size >> : 1; >> + unsigned gs_entry_size_bytes = gs_size * 64; >> + >> + /* From p35 of the Ivy Bridge PRM (section 1.7.1: 3DSTATE_URB_GS): >> + * >> + * VS Number of URB Entries must be divisible by 8 if the VS URB >> Entry >> + * Allocation Size is less than 9 512-bit URB entries. >> + * >> + * Similar text exists for GS. >> + */ >> + unsigned vs_granularity = (vs_size < 9) ? 8 : 1; >> + unsigned gs_granularity = (gs_size < 9) ? 8 : 1; >> + >> + /* URB allocations must be done in 8k chunks. */ >> + unsigned chunk_size_bytes = 8192; >> + >> + /* Determine the size of the URB in chunks. >> + */ >> + unsigned urb_chunks = brw->urb.size * 1024 / chunk_size_bytes; >> + >> + /* Reserve space for push constants */ >> + unsigned push_constant_bytes = 1024 * push_size_kB; >> + unsigned push_constant_chunks = >> + push_constant_bytes / chunk_size_bytes; >> + >> + /* Initially, assign each stage the minimum amount of URB space it >> needs, >> + * and make a note of how much additional space it "wants" (the >> amount of >> + * additional space it could actually make use of). >> + */ >> + >> + /* VS always requires at least 32 URB entries */ >> + unsigned vs_chunks = >> + ALIGN(32 * vs_entry_size_bytes, chunk_size_bytes) / >> chunk_size_bytes; >> + unsigned vs_wants = >> + ALIGN(brw->urb.max_vs_entries * vs_entry_size_bytes, >> + chunk_size_bytes) / chunk_size_bytes - vs_chunks; >> + >> + unsigned gs_chunks = 0; >> + unsigned gs_wants = 0; >> + if (gs_present) { >> + /* There are two constraints on the minimum amount of URB space we >> can >> + * allocate: >> + * >> + * (1) We need room for at least 2 URB entries, since we always >> operate >> + * the GS in DUAL_OBJECT mode. >> + * >> + * (2) We can't allocate less than nr_gs_entries_granularity. 
>> + */ >> + gs_chunks = ALIGN(MAX2(gs_granularity, 2) * gs_entry_size_bytes, >> + chunk_size_bytes) / chunk_size_bytes; >> + gs_wants = >> + ALIGN(brw->urb.max_gs_entries * gs_entry_size_bytes, >> + chunk_size_bytes) / chunk_size_bytes - gs_chunks; >> + } >> + >> + /* There should always be enough URB space to satisfy the minimum >> + * requirements of each stage. >> + */ >> + unsigned total_needs = push_constant_chunks + vs_chunks + gs_chunks; >> + assert(total_needs <= urb_chunks); >> + >> + /* Mete out remaining space (if any) in proportion to "wants". */ >> + unsigned total_wants = vs_wants + gs_wants; >> + unsigned remaining_space = urb_chunks - total_needs; >> + if (remaining_space > total_wants) >> + remaining_space = total_wants; >> + if (remaining_space > 0) { >> + unsigned vs_additional = (unsigned) >> + round(vs_wants * (((double) remaining_space) / total_wants)); >> + vs_chunks += vs_additional; >> + remaining_space -= vs_additional; >> + gs_chunks += remaining_space; >> + } >> + >> + /* Sanity check that we haven't over-allocated. */ >> + assert(push_constant_chunks + vs_chunks + gs_chunks <= urb_chunks); >> + >> + /* Finally, compute the number of entries that can fit in the space >> + * allocated to each stage. >> + */ >> + unsigned nr_vs_entries = vs_chunks * chunk_size_bytes / >> vs_entry_size_bytes; >> + unsigned nr_gs_entries = gs_chunks * chunk_size_bytes / >> gs_entry_size_bytes; >> + >> + /* Since we rounded up when computing *_wants, this may be slightly >> more >> + * than the maximum allowed amount, so correct for that. >> + */ >> + nr_vs_entries = MIN2(nr_vs_entries, brw->urb.max_vs_entries); >> + nr_gs_entries = MIN2(nr_gs_entries, brw->urb.max_gs_entries); >> + >> + /* Ensure that we program a multiple of the granularity. 
*/ >> + nr_vs_entries = ROUND_DOWN_TO(nr_vs_entries, vs_granularity); >> + nr_gs_entries = ROUND_DOWN_TO(nr_gs_entries, gs_granularity); >> + >> + /* Finally, sanity check to make sure we have at least the minimum >> number >> + * of entries needed for each stage. >> + */ >> + assert(nr_vs_entries >= 32); >> + if (gs_present) >> + assert(nr_gs_entries >= 2); >> + >> + /* And store the values we computed in brw so that they can be used by >> + * other state atoms. >> + */ >> > > Actually, no other atoms ever use these. The only reason these fields > exist is for Gen4, where the VS_STATE, CLIP_STATE, etc. packets actually > specified the URB configuration for that stage. > > I just set them because they were there, and putting data in them seemed > better than leaving them uninitialized. > Ok, I've changed the comment to say: /* Gen7 doesn't actually use brw->urb.nr_{vs,gs}_entries, but it seems * better to put reasonable data in there rather than leave them * uninitialized. */ > > However, I can't see any reason any other atom would /ever/ care, so I'm > somewhat tempted to keep these as locals. Would be a tiny bit more > efficient. > I could go either way on this. We can always do it in a follow-up patch if this state atom winds up looming large in profiling runs. > > This patch seems pretty reasonable. It's rather complicated, but...I > don't honestly know that I could come up with anything better. 
> > > + brw->urb.nr_vs_entries = nr_vs_entries; >> + brw->urb.nr_gs_entries = nr_gs_entries; >> + >> + /* Lay out the URB in the following order: >> + * - push constants >> + * - VS >> + * - GS >> + */ >> + brw->urb.vs_start = push_constant_chunks; >> + brw->urb.gs_start = push_constant_chunks + vs_chunks; >> >> gen7_emit_vs_workaround_flush(brw); >> - gen7_emit_urb_state(brw, brw->urb.nr_vs_entries, vs_size, >> brw->urb.vs_start); >> + gen7_emit_urb_state(brw, >> + brw->urb.nr_vs_entries, vs_size, >> brw->urb.vs_start, >> + brw->urb.nr_gs_entries, gs_size, >> brw->urb.gs_start); >> } >> >> void >> -gen7_emit_urb_state(struct brw_context *brw, GLuint nr_vs_entries, >> - GLuint vs_size, GLuint vs_start) >> +gen7_emit_urb_state(struct brw_context *brw, >> + unsigned nr_vs_entries, unsigned vs_size, >> + unsigned vs_start, unsigned nr_gs_entries, >> + unsigned gs_size, unsigned gs_start) >> { >> BEGIN_BATCH(8); >> OUT_BATCH(_3DSTATE_URB_VS << 16 | (2 - 2)); >> @@ -109,11 +211,12 @@ gen7_emit_urb_state(struct brw_context *brw, GLuint >> nr_vs_entries, >> ((vs_size - 1) << GEN7_URB_ENTRY_SIZE_SHIFT) | >> (vs_start << GEN7_URB_STARTING_ADDRESS_SHIFT)); >> >> - /* Allocate the GS, HS, and DS zero space - we don't use them. */ >> OUT_BATCH(_3DSTATE_URB_GS << 16 | (2 - 2)); >> - OUT_BATCH((0 << GEN7_URB_ENTRY_SIZE_SHIFT) | >> - (vs_start << GEN7_URB_STARTING_ADDRESS_SHIFT)); >> + OUT_BATCH(nr_gs_entries | >> + ((gs_size - 1) << GEN7_URB_ENTRY_SIZE_SHIFT) | >> + (gs_start << GEN7_URB_STARTING_ADDRESS_SHIFT)); >> >> + /* Allocate the HS and DS zero space - we don't use them. 
*/ >> OUT_BATCH(_3DSTATE_URB_HS << 16 | (2 - 2)); >> OUT_BATCH((0 << GEN7_URB_ENTRY_SIZE_SHIFT) | >> (vs_start << GEN7_URB_STARTING_ADDRESS_SHIFT)); >> @@ -127,8 +230,8 @@ gen7_emit_urb_state(struct brw_context *brw, GLuint >> nr_vs_entries, >> const struct brw_tracked_state gen7_urb = { >> .dirty = { >> .mesa = 0, >> - .brw = BRW_NEW_CONTEXT, >> - .cache = (CACHE_NEW_VS_PROG | CACHE_NEW_FF_GS_PROG), >> + .brw = BRW_NEW_CONTEXT | BRW_NEW_GEOMETRY_PROGRAM, >> + .cache = (CACHE_NEW_VS_PROG | CACHE_NEW_GS_PROG), >> }, >> .emit = gen7_upload_urb, >> }; >> >> >
_______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev