On 12/05/2011 09:40 AM, Paul Berry wrote: > In Gen6, transform feedback is accomplished by having the geometry > shader send vertex data to the data port using "Streamed Vertex Buffer > Write" messages, while simultaneously passing vertices through to the > rest of the graphics pipeline (if rendering is enabled). > > This patch adds a geometry shader program that simply passes vertices > through to the rest of the graphics pipeline. The rest of transform > feedback functionality will be added in future patches. > > To make the new geometry shader easier to test, I've added an > environment variable "INTEL_FORCE_GS". If this environment variable > is enabled, then the pass-through geometry shader will always be used, > regardless of whether transform feedback is in effect. > > On my Sandy Bridge laptop, I'm able to enable INTEL_FORCE_GS with no > Piglit regressions.
Looks good, Paul. One minor nit in the middle, and then a question at the very end... > --- > src/mesa/drivers/dri/i965/brw_defines.h | 3 + > src/mesa/drivers/dri/i965/brw_eu.h | 5 ++ > src/mesa/drivers/dri/i965/brw_gs.c | 105 ++++++++++++++++++++-------- > src/mesa/drivers/dri/i965/brw_gs.h | 2 + > src/mesa/drivers/dri/i965/brw_gs_emit.c | 94 ++++++++++++++++++++++++++ > src/mesa/drivers/dri/i965/gen6_gs_state.c | 46 ++++++++----- > 6 files changed, 209 insertions(+), 46 deletions(-) > > diff --git a/src/mesa/drivers/dri/i965/brw_defines.h > b/src/mesa/drivers/dri/i965/brw_defines.h > index 2990c90..ee7ec87 100644 > --- a/src/mesa/drivers/dri/i965/brw_defines.h > +++ b/src/mesa/drivers/dri/i965/brw_defines.h > @@ -1075,6 +1075,9 @@ enum brw_message_target { > # define GEN6_GS_SVBI_POSTINCREMENT_VALUE_MASK INTEL_MASK(25, > 16) > # define GEN6_GS_ENABLE (1 << 15) > > +# define BRW_GS_EDGE_INDICATOR_0 (1 << 8) > +# define BRW_GS_EDGE_INDICATOR_1 (1 << 9) > + > #define _3DSTATE_HS 0x781B /* GEN7+ */ > #define _3DSTATE_TE 0x781C /* GEN7+ */ > #define _3DSTATE_DS 0x781D /* GEN7+ */ > diff --git a/src/mesa/drivers/dri/i965/brw_eu.h > b/src/mesa/drivers/dri/i965/brw_eu.h > index dcb1fc9..596be02 100644 > --- a/src/mesa/drivers/dri/i965/brw_eu.h > +++ b/src/mesa/drivers/dri/i965/brw_eu.h > @@ -650,6 +650,11 @@ static INLINE struct brw_reg get_element_ud( struct > brw_reg reg, GLuint elt ) > return vec1(suboffset(retype(reg, BRW_REGISTER_TYPE_UD), elt)); > } > > +static INLINE struct brw_reg get_element_d( struct brw_reg reg, GLuint elt ) > +{ > + return vec1(suboffset(retype(reg, BRW_REGISTER_TYPE_D), elt)); > +} > + > > static INLINE struct brw_reg brw_swizzle( struct brw_reg reg, > GLuint x, > diff --git a/src/mesa/drivers/dri/i965/brw_gs.c > b/src/mesa/drivers/dri/i965/brw_gs.c > index 804ea84..2e729aa 100644 > --- a/src/mesa/drivers/dri/i965/brw_gs.c > +++ b/src/mesa/drivers/dri/i965/brw_gs.c > @@ -53,12 +53,6 @@ static void compile_gs_prog( struct brw_context *brw, > 
void *mem_ctx; > GLuint program_size; > > - /* Gen6: VF has already converted into polygon, and LINELOOP is > - * converted to LINESTRIP at the beginning of the 3D pipeline. > - */ > - if (intel->gen >= 6) > - return; > - > memset(&c, 0, sizeof(c)); > > c.key = *key; > @@ -76,24 +70,60 @@ static void compile_gs_prog( struct brw_context *brw, > */ > brw_set_mask_control(&c.func, BRW_MASK_DISABLE); > > - > - /* Note that primitives which don't require a GS program have > - * already been weeded out by this stage: > - */ > - > - switch (key->primitive) { > - case _3DPRIM_QUADLIST: > - brw_gs_quads( &c, key ); > - break; > - case _3DPRIM_QUADSTRIP: > - brw_gs_quad_strip( &c, key ); > - break; > - case _3DPRIM_LINELOOP: > - brw_gs_lines( &c ); > - break; > - default: > - ralloc_free(mem_ctx); > - return; > + if (intel->gen >= 6) { > + unsigned num_verts; > + bool check_edge_flag; > + /* On Sandybridge, we use the GS for implementing transform feedback > + * (called "Stream Out" in the PRM). > + */ > + switch (key->primitive) { > + case _3DPRIM_POINTLIST: > + num_verts = 1; > + check_edge_flag = false; > + break; > + case _3DPRIM_LINELIST: > + case _3DPRIM_LINESTRIP: > + case _3DPRIM_LINELOOP: > + num_verts = 2; > + check_edge_flag = false; > + break; > + case _3DPRIM_TRILIST: > + case _3DPRIM_TRIFAN: > + case _3DPRIM_TRISTRIP: > + case _3DPRIM_RECTLIST: > + num_verts = 3; > + check_edge_flag = false; > + break; > + case _3DPRIM_QUADLIST: > + case _3DPRIM_QUADSTRIP: > + case _3DPRIM_POLYGON: > + num_verts = 3; > + check_edge_flag = true; > + break; > + default: > + assert(!"Unexpected primitive type in Gen6 SOL program."); > + return; > + } > + gen6_sol_program(&c, key, num_verts, check_edge_flag); > + } else { > + /* On Gen4-5, we use the GS to decompose certain types of primitives. > + * Note that primitives which don't require a GS program have already > + * been weeded out by now. 
> + */ > + switch (key->primitive) { > + case _3DPRIM_QUADLIST: > + brw_gs_quads( &c, key ); > + break; > + case _3DPRIM_QUADSTRIP: > + brw_gs_quad_strip( &c, key ); > + break; > + case _3DPRIM_LINELOOP: > + brw_gs_lines( &c ); > + break; > + default: > + ralloc_free(mem_ctx); > + return; > + } > } > > /* get the program > @@ -147,11 +177,25 @@ static void populate_key( struct brw_context *brw, > key->pv_first = true; > } > > - key->need_gs_prog = (intel->gen >= 6) > - ? 0 > - : (brw->primitive == _3DPRIM_QUADLIST || > - brw->primitive == _3DPRIM_QUADSTRIP || > - brw->primitive == _3DPRIM_LINELOOP); > + if (intel->gen == 6) { > + /* On Gen6, GS is used for transform feedback. */ > + key->need_gs_prog = ctx->TransformFeedback.CurrentObject->Active; > + } else if (intel->gen >= 7) { > + /* On Gen7 and later, we don't use GS (yet). */ > + key->need_gs_prog = false; Could you please put these in order? 6, 7+, 4-5 is just asking for OCD issues. :) I'd probably move the >= 7 check to the top. > + } else { > + /* Pre-gen6, GS is used to transform QUADLIST, QUADSTRIP, and LINELOOP > + * into simpler primitives. > + */ > + key->need_gs_prog = (brw->primitive == _3DPRIM_QUADLIST || > + brw->primitive == _3DPRIM_QUADSTRIP || > + brw->primitive == _3DPRIM_LINELOOP); > + } > + /* For testing, the environment variable INTEL_FORCE_GS can be used to > + * force a GS program to be used, even if it's not necessary. > + */ > + if (getenv("INTEL_FORCE_GS")) > + key->need_gs_prog = true; > } > > /* Calculate interpolants for triangle and line rasterization. 
> @@ -182,7 +226,8 @@ brw_upload_gs_prog(struct brw_context *brw) > const struct brw_tracked_state brw_gs_prog = { > .dirty = { > .mesa = (_NEW_LIGHT | > - _NEW_TRANSFORM), > + _NEW_TRANSFORM | > + _NEW_TRANSFORM_FEEDBACK), > .brw = BRW_NEW_PRIMITIVE, > .cache = CACHE_NEW_VS_PROG > }, > diff --git a/src/mesa/drivers/dri/i965/brw_gs.h > b/src/mesa/drivers/dri/i965/brw_gs.h > index d71609f..bade3f6 100644 > --- a/src/mesa/drivers/dri/i965/brw_gs.h > +++ b/src/mesa/drivers/dri/i965/brw_gs.h > @@ -68,5 +68,7 @@ struct brw_gs_compile { > void brw_gs_quads( struct brw_gs_compile *c, struct brw_gs_prog_key *key ); > void brw_gs_quad_strip( struct brw_gs_compile *c, struct brw_gs_prog_key > *key ); > void brw_gs_lines( struct brw_gs_compile *c ); > +void gen6_sol_program(struct brw_gs_compile *c, struct brw_gs_prog_key *key, > + unsigned num_verts, bool check_edge_flag); > > #endif > diff --git a/src/mesa/drivers/dri/i965/brw_gs_emit.c > b/src/mesa/drivers/dri/i965/brw_gs_emit.c > index 3d332c4..a6e9f50 100644 > --- a/src/mesa/drivers/dri/i965/brw_gs_emit.c > +++ b/src/mesa/drivers/dri/i965/brw_gs_emit.c > @@ -101,6 +101,37 @@ static void brw_gs_overwrite_header_dw2(struct > brw_gs_compile *c, > } > > /** > + * Overwrite DWORD 2 of c->reg.header with the primitive type from c->reg.R0. > + * > + * When the thread is spawned, GRF 0 contains the primitive type in bits 4:0 > + * of DWORD 2. URB_WRITE messages need the primitive type in bits 6:2 of > + * DWORD 2. So this function extracts the primitive type field, bitshifts it > + * appropriately, and stores it in c->reg.header. 
> + */ > +static void brw_gs_overwrite_header_dw2_from_r0(struct brw_gs_compile *c) > +{ > + struct brw_compile *p = &c->func; > + brw_AND(p, get_element_ud(c->reg.header, 2), get_element_ud(c->reg.R0, 2), > + brw_imm_ud(0x1f)); > + brw_SHL(p, get_element_ud(c->reg.header, 2), > + get_element_ud(c->reg.header, 2), brw_imm_ud(2)); > +} > + > +/** > + * Apply an additive offset to DWORD 2 of c->reg.header. > + * > + * This is used to set/unset the "PrimStart" and "PrimEnd" flags > appropriately > + * for each vertex. > + */ > +static void brw_gs_offset_header_dw2(struct brw_gs_compile *c, int offset) > +{ > + struct brw_compile *p = &c->func; > + brw_ADD(p, get_element_d(c->reg.header, 2), get_element_d(c->reg.header, > 2), > + brw_imm_d(offset)); > +} > + > + > +/** > * Emit a vertex using the URB_WRITE message. Use the contents of > * c->reg.header for the message header, and the registers starting at \c > vert > * for the vertex data. > @@ -269,3 +300,66 @@ void brw_gs_lines( struct brw_gs_compile *c ) > | URB_WRITE_M02_PRIM_END)); > brw_gs_emit_vue(c, c->reg.vertex[1], 1); > } > + > +/** > + * Generate the geometry shader program used on Gen6 to perform stream output > + * (transform feedback). 
> + */ > +void > +gen6_sol_program(struct brw_gs_compile *c, struct brw_gs_prog_key *key, > + unsigned num_verts, bool check_edge_flags) > +{ > + struct brw_compile *p = &c->func; > + > + brw_gs_alloc_regs(c, num_verts); > + brw_gs_initialize_header(c); > + > + brw_gs_ff_sync(c, 1); > + > + brw_gs_overwrite_header_dw2_from_r0(c); > + switch (num_verts) { > + case 1: > + brw_gs_offset_header_dw2(c, (URB_WRITE_M02_PRIM_START > + | URB_WRITE_M02_PRIM_END)); > + brw_gs_emit_vue(c, c->reg.vertex[0], true); > + break; > + case 2: > + brw_gs_offset_header_dw2(c, URB_WRITE_M02_PRIM_START); > + brw_gs_emit_vue(c, c->reg.vertex[0], false); > + brw_gs_offset_header_dw2(c, (URB_WRITE_M02_PRIM_END > + - URB_WRITE_M02_PRIM_START)); > + brw_gs_emit_vue(c, c->reg.vertex[1], true); > + break; > + case 3: > + if (check_edge_flags) { > + /* Only emit vertices 0 and 1 if this is the first triangle of the > + * polygon. Otherwise they are redundant. > + */ > + brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ); > + brw_AND(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UD), > + get_element_ud(c->reg.R0, 2), > + brw_imm_ud(BRW_GS_EDGE_INDICATOR_0)); > + brw_IF(p, BRW_EXECUTE_1); > + } > + brw_gs_offset_header_dw2(c, URB_WRITE_M02_PRIM_START); > + brw_gs_emit_vue(c, c->reg.vertex[0], false); > + brw_gs_offset_header_dw2(c, -URB_WRITE_M02_PRIM_START); > + brw_gs_emit_vue(c, c->reg.vertex[1], false); > + if (check_edge_flags) { > + brw_ENDIF(p); > + /* Only emit vertex 2 in PRIM_END mode if this is the last triangle > + * of the polygon. Otherwise leave the primitive incomplete because > + * there are more polygon vertices coming. 
> + */ > + brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ); > + brw_AND(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UD), > + get_element_ud(c->reg.R0, 2), > + brw_imm_ud(BRW_GS_EDGE_INDICATOR_1)); > + brw_set_predicate_control(p, BRW_PREDICATE_NORMAL); > + } > + brw_gs_offset_header_dw2(c, URB_WRITE_M02_PRIM_END); > + brw_set_predicate_control(p, BRW_PREDICATE_NONE); > + brw_gs_emit_vue(c, c->reg.vertex[2], true); > + break; > + } > +} > diff --git a/src/mesa/drivers/dri/i965/gen6_gs_state.c > b/src/mesa/drivers/dri/i965/gen6_gs_state.c > index d29f029..b041140 100644 > --- a/src/mesa/drivers/dri/i965/gen6_gs_state.c > +++ b/src/mesa/drivers/dri/i965/gen6_gs_state.c > @@ -44,22 +44,36 @@ upload_gs_state(struct brw_context *brw) > OUT_BATCH(0); > ADVANCE_BATCH(); > > - // GS should never be used on Gen6. Disable it. > - assert(!brw->gs.prog_active); > - BEGIN_BATCH(7); > - OUT_BATCH(_3DSTATE_GS << 16 | (7 - 2)); > - OUT_BATCH(0); /* prog_bo */ > - OUT_BATCH((0 << GEN6_GS_SAMPLER_COUNT_SHIFT) | > - (0 << GEN6_GS_BINDING_TABLE_ENTRY_COUNT_SHIFT)); > - OUT_BATCH(0); /* scratch space base offset */ > - OUT_BATCH((1 << GEN6_GS_DISPATCH_START_GRF_SHIFT) | > - (0 << GEN6_GS_URB_READ_LENGTH_SHIFT) | > - (0 << GEN6_GS_URB_ENTRY_READ_OFFSET_SHIFT)); > - OUT_BATCH((0 << GEN6_GS_MAX_THREADS_SHIFT) | > - GEN6_GS_STATISTICS_ENABLE | > - GEN6_GS_RENDERING_ENABLE); > - OUT_BATCH(0); > - ADVANCE_BATCH(); > + if (brw->gs.prog_active) { > + BEGIN_BATCH(7); > + OUT_BATCH(_3DSTATE_GS << 16 | (7 - 2)); > + OUT_BATCH(brw->gs.prog_offset); > + OUT_BATCH(GEN6_GS_SPF_MODE | GEN6_GS_VECTOR_MASK_ENABLE); > + OUT_BATCH(0); /* no scratch space */ > + OUT_BATCH((1 << GEN6_GS_DISPATCH_START_GRF_SHIFT) | > + (brw->gs.prog_data->urb_read_length << > GEN6_GS_URB_READ_LENGTH_SHIFT)); > + OUT_BATCH(((brw->max_gs_threads - 1) << GEN6_GS_MAX_THREADS_SHIFT) | > + GEN6_GS_STATISTICS_ENABLE | > + GEN6_GS_SO_STATISTICS_ENABLE | > + 0); //GEN6_GS_RENDERING_ENABLE); I'm rather surprised this works. 
I thought you needed the GEN6_GS_RENDERING_ENABLE bit set in order to draw anything at all. The commented-out code looks like it came from a half-baked patch of mine, so I'm guessing it's unintentional. Still, do you have any idea why it would work? > + OUT_BATCH(GEN6_GS_ENABLE); > + ADVANCE_BATCH(); > + } else { > + BEGIN_BATCH(7); > + OUT_BATCH(_3DSTATE_GS << 16 | (7 - 2)); > + OUT_BATCH(0); /* prog_bo */ > + OUT_BATCH((0 << GEN6_GS_SAMPLER_COUNT_SHIFT) | > + (0 << GEN6_GS_BINDING_TABLE_ENTRY_COUNT_SHIFT)); > + OUT_BATCH(0); /* scratch space base offset */ > + OUT_BATCH((1 << GEN6_GS_DISPATCH_START_GRF_SHIFT) | > + (0 << GEN6_GS_URB_READ_LENGTH_SHIFT) | > + (0 << GEN6_GS_URB_ENTRY_READ_OFFSET_SHIFT)); > + OUT_BATCH((0 << GEN6_GS_MAX_THREADS_SHIFT) | > + GEN6_GS_STATISTICS_ENABLE | > + GEN6_GS_RENDERING_ENABLE); > + OUT_BATCH(0); > + ADVANCE_BATCH(); > + } > } > > const struct brw_tracked_state gen6_gs_state = { _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev