On 28 August 2013 17:58, Kenneth Graunke <kenn...@whitecape.org> wrote:
> On 08/26/2013 03:12 PM, Paul Berry wrote: > >> Previously, we would always use the same push constant allocation >> regardless of what shader programs were being run: the available push >> constant space was split into 2 equal size partitions, one for the >> vertex shader, and one for the fragment shader. >> >> Now that we are adding geometry shader support, we need to do >> something smarter. This patch adjusts things so that when a geometry >> shader is in use, we split the available push constant space into 3 >> nearly-equal size partitions instead of 2. >> >> Since the push constant allocation is now affected by GL state, it can >> no longer be set up by brw_upload_initial_gpu_state()**; instead it must >> be set up by a state atom. >> --- >> src/mesa/drivers/dri/i965/brw_**context.h | 3 +- >> src/mesa/drivers/dri/i965/brw_**defines.h | 1 + >> src/mesa/drivers/dri/i965/brw_**state.h | 4 +- >> src/mesa/drivers/dri/i965/brw_**state_upload.c | 5 +- >> src/mesa/drivers/dri/i965/**gen7_blorp.cpp | 6 ++ >> src/mesa/drivers/dri/i965/**gen7_urb.c | 101 >> +++++++++++++++++++++++---- >> 6 files changed, 98 insertions(+), 22 deletions(-) >> >> diff --git a/src/mesa/drivers/dri/i965/**brw_context.h >> b/src/mesa/drivers/dri/i965/**brw_context.h >> index 77f2a6b..95f9bb2 100644 >> --- a/src/mesa/drivers/dri/i965/**brw_context.h >> +++ b/src/mesa/drivers/dri/i965/**brw_context.h >> @@ -1508,7 +1508,8 @@ gen6_get_sample_position(**struct gl_context *ctx, >> >> /* gen7_urb.c */ >> void >> -gen7_allocate_push_constants(**struct brw_context *brw); >> +gen7_emit_push_constant_**state(struct brw_context *brw, unsigned >> vs_size, >> + unsigned gs_size, unsigned fs_size); >> >> void >> gen7_emit_urb_state(struct brw_context *brw, >> diff --git a/src/mesa/drivers/dri/i965/**brw_defines.h >> b/src/mesa/drivers/dri/i965/**brw_defines.h >> index 832ff55..8d9a824 100644 >> --- a/src/mesa/drivers/dri/i965/**brw_defines.h >> +++ b/src/mesa/drivers/dri/i965/**brw_defines.h >> @@ -1284,6 +1284,7 @@ enum brw_message_target { >> # define GEN7_URB_STARTING_ADDRESS_**SHIFT 25 >> >> #define _3DSTATE_PUSH_CONSTANT_ALLOC_**VS 0x7912 /* GEN7+ */ >> +#define _3DSTATE_PUSH_CONSTANT_ALLOC_**GS 0x7915 /* GEN7+ */ >> #define _3DSTATE_PUSH_CONSTANT_ALLOC_**PS 0x7916 /* GEN7+ */ >> # define GEN7_PUSH_CONSTANT_BUFFER_**OFFSET_SHIFT 16 >> >> diff --git a/src/mesa/drivers/dri/i965/**brw_state.h >> b/src/mesa/drivers/dri/i965/**brw_state.h >> index 85f82fe..4814639 100644 >> --- a/src/mesa/drivers/dri/i965/**brw_state.h >> +++ b/src/mesa/drivers/dri/i965/**brw_state.h >> @@ -112,6 +112,7 @@ extern const struct brw_tracked_state >> gen7_cc_viewport_state_**pointer; >> extern const struct brw_tracked_state gen7_clip_state; >> extern const struct brw_tracked_state gen7_disable_stages; >> extern const struct brw_tracked_state gen7_ps_state; >> +extern const struct brw_tracked_state gen7_push_constant_space; >> extern const struct brw_tracked_state gen7_sbe_state; >> extern const struct brw_tracked_state gen7_sf_clip_viewport; >> extern const struct brw_tracked_state gen7_sf_state; >> @@ -220,9 +221,6 @@ uint32_t >> get_attr_override(const struct brw_vue_map *vue_map, int >> urb_entry_read_offset, >> int fs_attr, bool two_side_color, uint32_t >> *max_source_attr); >> >> -/* gen7_urb.c */ >> -void gen7_allocate_push_constants(**struct brw_context *brw); >> - >> #ifdef __cplusplus >> } >> #endif >> diff --git a/src/mesa/drivers/dri/i965/**brw_state_upload.c >> b/src/mesa/drivers/dri/i965/**brw_state_upload.c >> index b883002..9638c69 100644 >> --- a/src/mesa/drivers/dri/i965/**brw_state_upload.c >> +++ b/src/mesa/drivers/dri/i965/**brw_state_upload.c >> @@ -188,6 +188,7 @@ static const struct brw_tracked_state *gen7_atoms[] = >> &gen7_cc_viewport_state_**pointer, /* must do after brw_cc_vp */ >> &gen7_sf_clip_viewport, >> >> + &gen7_push_constant_space, >> &gen7_urb, >> &gen6_blend_state, /* must do before cc unit */ >> &gen6_color_calc_state, /* must do before cc unit */ >> @@ -251,10 +252,6 @@ brw_upload_initial_gpu_state(**struct brw_context >> *brw) >> return; >> >> brw_upload_invariant_state(**brw); >> - >> - if (brw->gen >= 7) { >> - gen7_allocate_push_constants(**brw); >> - } >> } >> >> void brw_init_state( struct brw_context *brw ) >> diff --git a/src/mesa/drivers/dri/i965/**gen7_blorp.cpp >> b/src/mesa/drivers/dri/i965/**gen7_blorp.cpp >> index 6c798b1..9df3d92 100644 >> --- a/src/mesa/drivers/dri/i965/**gen7_blorp.cpp >> +++ b/src/mesa/drivers/dri/i965/**gen7_blorp.cpp >> @@ -51,6 +51,12 @@ static void >> gen7_blorp_emit_urb_config(**struct brw_context *brw, >> const brw_blorp_params *params) >> { >> + unsigned urb_size = (brw->is_haswell && brw->gt == 3) ? 32 : 16; >> + gen7_emit_push_constant_state(**brw, >> + urb_size / 2 /* vs_size */, >> + 0 /* gs_size */, >> + urb_size / 2 /* fs_size */); >> + >> /* The minimum valid number of VS entries is 32. See 3DSTATE_URB_VS, >> Dword >> * 1.15:0 "VS Number of URB Entries". >> */ >> diff --git a/src/mesa/drivers/dri/i965/**gen7_urb.c >> b/src/mesa/drivers/dri/i965/**gen7_urb.c >> index 2d10cc12..4dc8f6e 100644 >> --- a/src/mesa/drivers/dri/i965/**gen7_urb.c >> +++ b/src/mesa/drivers/dri/i965/**gen7_urb.c >> @@ -30,12 +30,12 @@ >> /** >> * The following diagram shows how we partition the URB: >> * >> - * 8kB 8kB Rest of the URB space >> - * ____-____ ____-____ _________________-____________**_____ >> - * / \ / \ / \ >> + * 16kB or 32kB Rest of the URB space >> + * __________-__________ _________________-____________**_____ >> + * / \ / \ >> * +-----------------------------**------------------------------**--+ >> - * | VS Push | FS Push | VS | >> - * | Constants | Constants | Handles | >> + * | VS/FS/GS Push | VS/GS URB | >> + * | Constants | Entries | >> * +-----------------------------**------------------------------**--+ >> * >> * Notably, push constants must be stored at the beginning of the URB >> @@ -43,8 +43,12 @@ >> * GT1/GT2 have a maximum constant buffer size of 16kB, while Haswell >> GT3 >> * doubles this (32kB). >> * >> - * Currently we split the constant buffer space evenly between VS and FS. >> - * This is probably not ideal, but simple. >> + * Ivybridge and Haswell GT1/GT2 allow push constants to be located (and >> + * sized) in increments of 1kB. Haswell GT3 requires them to be located >> and >> + * sized in increments of 2kB. >> + * >> + * Currently we split the constant buffer space evenly among whatever >> stages >> + * are active. This is probably not ideal, but simple. >> * >> * Ivybridge GT1 and Haswell GT1 have 128kB of URB space. >> * Ivybridge GT2 and Haswell GT2 have 256kB of URB space. >> @@ -53,22 +57,91 @@ >> * See "Volume 2a: 3D Pipeline," section 1.8, "Volume 1b: >> Configurations", >> * and the documentation for 3DSTATE_PUSH_CONSTANT_ALLOC_**xS. >> */ >> -void >> +static void >> gen7_allocate_push_constants(**struct brw_context *brw) >> { >> - unsigned size = 8; >> - if (brw->is_haswell && brw->gt == 3) >> - size = 16; >> + unsigned avail_size = 16; >> + unsigned multiplier = (brw->is_haswell && brw->gt == 3) ? 2 : 1; >> + >> + /* BRW_NEW_GEOMETRY_PROGRAM */ >> + bool gs_present = brw->geometry_program; >> + >> + unsigned vs_size, gs_size; >> + if (gs_present) { >> + vs_size = avail_size / 3; >> + avail_size -= vs_size; >> + gs_size = avail_size / 2; >> + avail_size -= gs_size; >> + } else { >> + vs_size = avail_size / 2; >> + avail_size -= vs_size; >> + gs_size = 0; >> + } >> + unsigned fs_size = avail_size; >> + >> + gen7_emit_push_constant_state(**brw, multiplier * vs_size, >> + multiplier * gs_size, multiplier * >> fs_size); >> +} >> + >> +void >> +gen7_emit_push_constant_**state(struct brw_context *brw, unsigned >> vs_size, >> + unsigned gs_size, unsigned fs_size) >> +{ >> + unsigned offset = 0; >> >> - BEGIN_BATCH(4); >> + BEGIN_BATCH(6); >> OUT_BATCH(_3DSTATE_PUSH_**CONSTANT_ALLOC_VS << 16 | (2 - 2)); >> - OUT_BATCH(size); >> + OUT_BATCH(vs_size | offset << GEN7_PUSH_CONSTANT_BUFFER_** >> OFFSET_SHIFT); >> + offset += vs_size; >> + >> + OUT_BATCH(_3DSTATE_PUSH_**CONSTANT_ALLOC_GS << 16 | (2 - 2)); >> + OUT_BATCH(gs_size | offset << GEN7_PUSH_CONSTANT_BUFFER_** >> OFFSET_SHIFT); >> + offset += gs_size; >> >> OUT_BATCH(_3DSTATE_PUSH_**CONSTANT_ALLOC_PS << 16 | (2 - 2)); >> - OUT_BATCH(size | size << GEN7_PUSH_CONSTANT_BUFFER_**OFFSET_SHIFT); >> + OUT_BATCH(offset | fs_size << GEN7_PUSH_CONSTANT_BUFFER_** >> OFFSET_SHIFT); >> ADVANCE_BATCH(); >> + >> + /* From p292 of the Ivy Bridge PRM (11.2.4 >> 3DSTATE_PUSH_CONSTANT_ALLOC_**PS): >> + * >> + * A PIPE_CONTOL command with the CS Stall bit set must be >> programmed >> + * in the ring after this instruction. >> + * >> + * No such restriction exists for Haswell. >> + */ >> + if (!brw->is_haswell) { >> + BEGIN_BATCH(4); >> + OUT_BATCH(_3DSTATE_PIPE_**CONTROL | (4 - 2)); >> + /* From p61 of the Ivy Bridge PRM (1.10.4 PIPE_CONTROL Command: >> DW1[20] >> + * CS Stall): >> + * >> + * One of the following must also be set: >> + * - Render Target Cache Flush Enable ([12] of DW1) >> + * - Depth Cache Flush Enable ([0] of DW1) >> + * - Stall at Pixel Scoreboard ([1] of DW1) >> + * - Depth Stall ([13] of DW1) >> + * - Post-Sync Operation ([13] of DW1) >> + * >> + * We choose to do a Post-Sync Operation (Write Immediate Data), >> since >> + * it seems like it will incur the least additional performance >> penalty. >> + */ >> + OUT_BATCH(PIPE_CONTROL_CS_**STALL | PIPE_CONTROL_WRITE_IMMEDIATE); >> + OUT_RELOC(brw->batch.**workaround_bo, >> + I915_GEM_DOMAIN_INSTRUCTION, >> I915_GEM_DOMAIN_INSTRUCTION, 0); >> + OUT_BATCH(0); >> + ADVANCE_BATCH(); >> + } >> > > This is really a bug fix, since it's implementing a hardware workaround on > a packet we've always been emitting. I'd really like to see it in a > separate patch, for easier bisectability. > Sure, no problem. It turns out this was really easy to split out to its own patch. > > Interestingly, this text doesn't appear in the latest documentation, and I > don't see it in the workaround database. It is definitely there in the > PRM, though. I'm not sure what to make of it. I see it in the current bspec, at the top of 3DSTATE_PUSH_CONSTANT_ALLOC_PS, in the "Description" section. > > > } >> >> +const struct brw_tracked_state gen7_push_constant_space = { >> + .dirty = { >> + .mesa = 0, >> + .brw = BRW_NEW_CONTEXT | BRW_NEW_GEOMETRY_PROGRAM, >> > > It's worth noting that this will cause a full pipeline stall whenever the > current geometry shader changes (since 3DSTATE_PUSH_CONSTANT_ALLOC_* are > non-pipelined). We could avoid this when switching between two different > geometry shaders. But I suspect it's premature to care about that. Not > sure it'll matter much anyway. > > > + .cache = 0, >> + }, >> + .emit = gen7_allocate_push_constants, >> +}; >> + >> static void >> gen7_upload_urb(struct brw_context *brw) >> { >> >> > Other than pulling the PIPE_CONTROL into a separate patch, this looks fine > to me. >
_______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev