On Monday, April 24, 2017 3:19:19 PM PDT Rafael Antognolli wrote: > Emit 3DSTATE_SOL on Gen7+ using brw_batch_emit helper, that uses pack > structs from genxml. > > v2: > - Add helpers to assign struct brw_address (Kristian) > > Signed-off-by: Rafael Antognolli <rafael.antogno...@intel.com> > --- > src/mesa/drivers/dri/i965/Makefile.sources | 1 +- > src/mesa/drivers/dri/i965/brw_state.h | 6 +- > src/mesa/drivers/dri/i965/gen7_sol_state.c | 307 +---------------- > src/mesa/drivers/dri/i965/gen8_sol_state.c | 95 +----- > src/mesa/drivers/dri/i965/genX_state_upload.c | 358 ++++++++++++++++++- > 5 files changed, 355 insertions(+), 412 deletions(-) > delete mode 100644 src/mesa/drivers/dri/i965/gen8_sol_state.c > > diff --git a/src/mesa/drivers/dri/i965/Makefile.sources > b/src/mesa/drivers/dri/i965/Makefile.sources > index 47680a7..bfcf57c 100644 > --- a/src/mesa/drivers/dri/i965/Makefile.sources > +++ b/src/mesa/drivers/dri/i965/Makefile.sources > @@ -111,7 +111,6 @@ i965_FILES = \ > gen8_hs_state.c \ > gen8_multisample_state.c \ > gen8_ps_state.c \ > - gen8_sol_state.c \ > gen8_surface_state.c \ > gen8_viewport_state.c \ > gen8_vs_state.c \ > diff --git a/src/mesa/drivers/dri/i965/brw_state.h > b/src/mesa/drivers/dri/i965/brw_state.h > index 3df975a..94f758b 100644 > --- a/src/mesa/drivers/dri/i965/brw_state.h > +++ b/src/mesa/drivers/dri/i965/brw_state.h > @@ -135,7 +135,6 @@ extern const struct brw_tracked_state gen7_l3_state; > extern const struct brw_tracked_state gen7_ps_state; > extern const struct brw_tracked_state gen7_push_constant_space; > extern const struct brw_tracked_state gen7_sf_clip_viewport; > -extern const struct brw_tracked_state gen7_sol_state; > extern const struct brw_tracked_state gen7_te_state; > extern const struct brw_tracked_state gen7_tes_push_constants; > extern const struct brw_tracked_state gen7_urb; > @@ -299,11 +298,6 @@ void gen8_upload_ps_state(struct brw_context *brw, > void gen8_upload_ps_extra(struct brw_context *brw, > const 
struct brw_wm_prog_data *prog_data); > > -/* gen7_sol_state.c */ > -void gen7_upload_3dstate_so_decl_list(struct brw_context *brw, > - const struct brw_vue_map *vue_map); > -void gen8_upload_3dstate_so_buffers(struct brw_context *brw); > - > /* gen8_surface_state.c */ > > void gen8_init_vtable_surface_functions(struct brw_context *brw); > diff --git a/src/mesa/drivers/dri/i965/gen7_sol_state.c > b/src/mesa/drivers/dri/i965/gen7_sol_state.c > index f1bd19c..f54b370 100644 > --- a/src/mesa/drivers/dri/i965/gen7_sol_state.c > +++ b/src/mesa/drivers/dri/i965/gen7_sol_state.c > @@ -35,313 +35,6 @@ > #include "intel_buffer_objects.h" > #include "main/transformfeedback.h" > > -static void > -upload_3dstate_so_buffers(struct brw_context *brw) > -{ > - struct gl_context *ctx = &brw->ctx; > - /* BRW_NEW_TRANSFORM_FEEDBACK */ > - struct gl_transform_feedback_object *xfb_obj = > - ctx->TransformFeedback.CurrentObject; > - const struct gl_transform_feedback_info *linked_xfb_info = > - xfb_obj->program->sh.LinkedTransformFeedback; > - int i; > - > - /* Set up the up to 4 output buffers. These are the ranges defined in the > - * gl_transform_feedback_object. > - */ > - for (i = 0; i < 4; i++) { > - struct intel_buffer_object *bufferobj = > - intel_buffer_object(xfb_obj->Buffers[i]); > - struct brw_bo *bo; > - uint32_t start, end; > - uint32_t stride; > - > - if (!xfb_obj->Buffers[i]) { > - /* The pitch of 0 in this command indicates that the buffer is > - * unbound and won't be written to. 
> - */ > - BEGIN_BATCH(4); > - OUT_BATCH(_3DSTATE_SO_BUFFER << 16 | (4 - 2)); > - OUT_BATCH((i << SO_BUFFER_INDEX_SHIFT)); > - OUT_BATCH(0); > - OUT_BATCH(0); > - ADVANCE_BATCH(); > - > - continue; > - } > - > - stride = linked_xfb_info->Buffers[i].Stride * 4; > - > - start = xfb_obj->Offset[i]; > - assert(start % 4 == 0); > - end = ALIGN(start + xfb_obj->Size[i], 4); > - bo = intel_bufferobj_buffer(brw, bufferobj, start, end - start); > - assert(end <= bo->size); > - > - BEGIN_BATCH(4); > - OUT_BATCH(_3DSTATE_SO_BUFFER << 16 | (4 - 2)); > - OUT_BATCH((i << SO_BUFFER_INDEX_SHIFT) | stride); > - OUT_RELOC(bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, start); > - OUT_RELOC(bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, end); > - ADVANCE_BATCH(); > - } > -} > - > -/** > - * Outputs the 3DSTATE_SO_DECL_LIST command. > - * > - * The data output is a series of 64-bit entries containing a SO_DECL per > - * stream. We only have one stream of rendering coming out of the GS unit, > so > - * we only emit stream 0 (low 16 bits) SO_DECLs. > - */ > -void > -gen7_upload_3dstate_so_decl_list(struct brw_context *brw, > - const struct brw_vue_map *vue_map) > -{ > - struct gl_context *ctx = &brw->ctx; > - /* BRW_NEW_TRANSFORM_FEEDBACK */ > - struct gl_transform_feedback_object *xfb_obj = > - ctx->TransformFeedback.CurrentObject; > - const struct gl_transform_feedback_info *linked_xfb_info = > - xfb_obj->program->sh.LinkedTransformFeedback; > - uint16_t so_decl[MAX_VERTEX_STREAMS][128]; > - int buffer_mask[MAX_VERTEX_STREAMS] = {0, 0, 0, 0}; > - int next_offset[BRW_MAX_SOL_BUFFERS] = {0, 0, 0, 0}; > - int decls[MAX_VERTEX_STREAMS] = {0, 0, 0, 0}; > - int max_decls = 0; > - STATIC_ASSERT(ARRAY_SIZE(so_decl[0]) >= MAX_PROGRAM_OUTPUTS); > - > - memset(so_decl, 0, sizeof(so_decl)); > - > - /* Construct the list of SO_DECLs to be emitted. The formatting of the > - * command is feels strange -- each dword pair contains a SO_DECL per > stream. 
> - */ > - for (unsigned i = 0; i < linked_xfb_info->NumOutputs; i++) { > - int buffer = linked_xfb_info->Outputs[i].OutputBuffer; > - uint16_t decl = 0; > - int varying = linked_xfb_info->Outputs[i].OutputRegister; > - const unsigned components = linked_xfb_info->Outputs[i].NumComponents; > - unsigned component_mask = (1 << components) - 1; > - unsigned stream_id = linked_xfb_info->Outputs[i].StreamId; > - unsigned decl_buffer_slot = buffer << SO_DECL_OUTPUT_BUFFER_SLOT_SHIFT; > - assert(stream_id < MAX_VERTEX_STREAMS); > - > - /* gl_PointSize is stored in VARYING_SLOT_PSIZ.w > - * gl_Layer is stored in VARYING_SLOT_PSIZ.y > - * gl_ViewportIndex is stored in VARYING_SLOT_PSIZ.z > - */ > - if (varying == VARYING_SLOT_PSIZ) { > - assert(components == 1); > - component_mask <<= 3; > - } else if (varying == VARYING_SLOT_LAYER) { > - assert(components == 1); > - component_mask <<= 1; > - } else if (varying == VARYING_SLOT_VIEWPORT) { > - assert(components == 1); > - component_mask <<= 2; > - } else { > - component_mask <<= linked_xfb_info->Outputs[i].ComponentOffset; > - } > - > - buffer_mask[stream_id] |= 1 << buffer; > - > - decl |= decl_buffer_slot; > - if (varying == VARYING_SLOT_LAYER || varying == VARYING_SLOT_VIEWPORT) > { > - decl |= vue_map->varying_to_slot[VARYING_SLOT_PSIZ] << > - SO_DECL_REGISTER_INDEX_SHIFT; > - } else { > - assert(vue_map->varying_to_slot[varying] >= 0); > - decl |= vue_map->varying_to_slot[varying] << > - SO_DECL_REGISTER_INDEX_SHIFT; > - } > - decl |= component_mask << SO_DECL_COMPONENT_MASK_SHIFT; > - > - /* Mesa doesn't store entries for gl_SkipComponents in the Outputs[] > - * array. Instead, it simply increments DstOffset for the following > - * input by the number of components that should be skipped. > - * > - * Our hardware is unusual in that it requires us to program SO_DECLs > - * for fake "hole" components, rather than simply taking the offset > - * for each real varying. 
Each hole can have size 1, 2, 3, or 4; we > - * program as many size = 4 holes as we can, then a final hole to > - * accommodate the final 1, 2, or 3 remaining. > - */ > - int skip_components = > - linked_xfb_info->Outputs[i].DstOffset - next_offset[buffer]; > - > - next_offset[buffer] += skip_components; > - > - while (skip_components >= 4) { > - so_decl[stream_id][decls[stream_id]++] = > - SO_DECL_HOLE_FLAG | 0xf | decl_buffer_slot; > - skip_components -= 4; > - } > - if (skip_components > 0) > - so_decl[stream_id][decls[stream_id]++] = > - SO_DECL_HOLE_FLAG | ((1 << skip_components) - 1) | > - decl_buffer_slot; > - > - assert(linked_xfb_info->Outputs[i].DstOffset == next_offset[buffer]); > - > - next_offset[buffer] += components; > - > - so_decl[stream_id][decls[stream_id]++] = decl; > - > - if (decls[stream_id] > max_decls) > - max_decls = decls[stream_id]; > - } > - > - BEGIN_BATCH(max_decls * 2 + 3); > - OUT_BATCH(_3DSTATE_SO_DECL_LIST << 16 | (max_decls * 2 + 1)); > - > - OUT_BATCH((buffer_mask[0] << SO_STREAM_TO_BUFFER_SELECTS_0_SHIFT) | > - (buffer_mask[1] << SO_STREAM_TO_BUFFER_SELECTS_1_SHIFT) | > - (buffer_mask[2] << SO_STREAM_TO_BUFFER_SELECTS_2_SHIFT) | > - (buffer_mask[3] << SO_STREAM_TO_BUFFER_SELECTS_3_SHIFT)); > - > - OUT_BATCH((decls[0] << SO_NUM_ENTRIES_0_SHIFT) | > - (decls[1] << SO_NUM_ENTRIES_1_SHIFT) | > - (decls[2] << SO_NUM_ENTRIES_2_SHIFT) | > - (decls[3] << SO_NUM_ENTRIES_3_SHIFT)); > - > - for (int i = 0; i < max_decls; i++) { > - /* Stream 1 | Stream 0 */ > - OUT_BATCH(((uint32_t) so_decl[1][i]) << 16 | so_decl[0][i]); > - /* Stream 3 | Stream 2 */ > - OUT_BATCH(((uint32_t) so_decl[3][i]) << 16 | so_decl[2][i]); > - } > - > - ADVANCE_BATCH(); > -} > - > -static bool > -query_active(struct gl_query_object *q) > -{ > - return q && q->Active; > -} > - > -static void > -upload_3dstate_streamout(struct brw_context *brw, bool active, > - const struct brw_vue_map *vue_map) > -{ > - struct gl_context *ctx = &brw->ctx; > - /* 
BRW_NEW_TRANSFORM_FEEDBACK */ > - struct gl_transform_feedback_object *xfb_obj = > - ctx->TransformFeedback.CurrentObject; > - uint32_t dw1 = 0, dw2 = 0, dw3 = 0, dw4 = 0; > - int i; > - > - if (active) { > - const struct gl_transform_feedback_info *linked_xfb_info = > - xfb_obj->program->sh.LinkedTransformFeedback; > - int urb_entry_read_offset = 0; > - int urb_entry_read_length = (vue_map->num_slots + 1) / 2 - > - urb_entry_read_offset; > - > - dw1 |= SO_FUNCTION_ENABLE; > - dw1 |= SO_STATISTICS_ENABLE; > - > - /* BRW_NEW_RASTERIZER_DISCARD */ > - if (ctx->RasterDiscard) { > - if (!query_active(ctx->Query.PrimitivesGenerated[0])) { > - dw1 |= SO_RENDERING_DISABLE; > - } else { > - perf_debug("Rasterizer discard with a GL_PRIMITIVES_GENERATED " > - "query active relies on the clipper."); > - } > - } > - > - /* _NEW_LIGHT */ > - if (ctx->Light.ProvokingVertex != GL_FIRST_VERTEX_CONVENTION) > - dw1 |= SO_REORDER_TRAILING; > - > - if (brw->gen < 8) { > - for (i = 0; i < 4; i++) { > - if (xfb_obj->Buffers[i]) { > - dw1 |= SO_BUFFER_ENABLE(i); > - } > - } > - } > - > - /* We always read the whole vertex. This could be reduced at some > - * point by reading less and offsetting the register index in the > - * SO_DECLs. > - */ > - dw2 |= SET_FIELD(urb_entry_read_offset, > SO_STREAM_0_VERTEX_READ_OFFSET); > - dw2 |= SET_FIELD(urb_entry_read_length - 1, > SO_STREAM_0_VERTEX_READ_LENGTH); > - > - dw2 |= SET_FIELD(urb_entry_read_offset, > SO_STREAM_1_VERTEX_READ_OFFSET); > - dw2 |= SET_FIELD(urb_entry_read_length - 1, > SO_STREAM_1_VERTEX_READ_LENGTH); > - > - dw2 |= SET_FIELD(urb_entry_read_offset, > SO_STREAM_2_VERTEX_READ_OFFSET); > - dw2 |= SET_FIELD(urb_entry_read_length - 1, > SO_STREAM_2_VERTEX_READ_LENGTH); > - > - dw2 |= SET_FIELD(urb_entry_read_offset, > SO_STREAM_3_VERTEX_READ_OFFSET); > - dw2 |= SET_FIELD(urb_entry_read_length - 1, > SO_STREAM_3_VERTEX_READ_LENGTH); > - > - if (brw->gen >= 8) { > - /* Set buffer pitches; 0 means unbound. 
*/ > - if (xfb_obj->Buffers[0]) > - dw3 |= linked_xfb_info->Buffers[0].Stride * 4; > - if (xfb_obj->Buffers[1]) > - dw3 |= (linked_xfb_info->Buffers[1].Stride * 4) << 16; > - if (xfb_obj->Buffers[2]) > - dw4 |= linked_xfb_info->Buffers[2].Stride * 4; > - if (xfb_obj->Buffers[3]) > - dw4 |= (linked_xfb_info->Buffers[3].Stride * 4) << 16; > - } > - } > - > - const int dwords = brw->gen >= 8 ? 5 : 3; > - > - BEGIN_BATCH(dwords); > - OUT_BATCH(_3DSTATE_STREAMOUT << 16 | (dwords - 2)); > - OUT_BATCH(dw1); > - OUT_BATCH(dw2); > - if (dwords > 3) { > - OUT_BATCH(dw3); > - OUT_BATCH(dw4); > - } > - ADVANCE_BATCH(); > -} > - > -static void > -upload_sol_state(struct brw_context *brw) > -{ > - struct gl_context *ctx = &brw->ctx; > - /* BRW_NEW_TRANSFORM_FEEDBACK */ > - bool active = _mesa_is_xfb_active_and_unpaused(ctx); > - > - if (active) { > - if (brw->gen >= 8) > - gen8_upload_3dstate_so_buffers(brw); > - else > - upload_3dstate_so_buffers(brw); > - > - /* BRW_NEW_VUE_MAP_GEOM_OUT */ > - gen7_upload_3dstate_so_decl_list(brw, &brw->vue_map_geom_out); > - } > - > - /* Finally, set up the SOL stage. This command must always follow > updates to > - * the nonpipelined SOL state (3DSTATE_SO_BUFFER, 3DSTATE_SO_DECL_LIST) or > - * MMIO register updates (current performed by the kernel at each batch > - * emit). 
> - */ > - upload_3dstate_streamout(brw, active, &brw->vue_map_geom_out); > -} > - > -const struct brw_tracked_state gen7_sol_state = { > - .dirty = { > - .mesa = _NEW_LIGHT, > - .brw = BRW_NEW_BATCH | > - BRW_NEW_BLORP | > - BRW_NEW_RASTERIZER_DISCARD | > - BRW_NEW_VUE_MAP_GEOM_OUT | > - BRW_NEW_TRANSFORM_FEEDBACK, > - }, > - .emit = upload_sol_state, > -}; > - > void > gen7_begin_transform_feedback(struct gl_context *ctx, GLenum mode, > struct gl_transform_feedback_object *obj) > diff --git a/src/mesa/drivers/dri/i965/gen8_sol_state.c > b/src/mesa/drivers/dri/i965/gen8_sol_state.c > deleted file mode 100644 > index 6866539..0000000 > --- a/src/mesa/drivers/dri/i965/gen8_sol_state.c > +++ /dev/null > @@ -1,95 +0,0 @@ > -/* > - * Copyright © 2012 Intel Corporation > - * > - * Permission is hereby granted, free of charge, to any person obtaining a > - * copy of this software and associated documentation files (the "Software"), > - * to deal in the Software without restriction, including without limitation > - * the rights to use, copy, modify, merge, publish, distribute, sublicense, > - * and/or sell copies of the Software, and to permit persons to whom the > - * Software is furnished to do so, subject to the following conditions: > - * > - * The above copyright notice and this permission notice (including the next > - * paragraph) shall be included in all copies or substantial portions of the > - * Software. > - * > - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR > - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, > - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL > - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER > - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING > - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER > DEALINGS > - * IN THE SOFTWARE. 
> - */ > - > -/** > - * @file gen8_sol_state.c > - * > - * Controls the stream output logic (SOL) stage of the gen8 hardware, which > is > - * used to implement GL_EXT_transform_feedback. > - */ > - > -#include "brw_context.h" > -#include "brw_state.h" > -#include "brw_defines.h" > -#include "intel_batchbuffer.h" > -#include "intel_buffer_objects.h" > -#include "main/transformfeedback.h" > - > -void > -gen8_upload_3dstate_so_buffers(struct brw_context *brw) > -{ > - struct gl_context *ctx = &brw->ctx; > - /* BRW_NEW_TRANSFORM_FEEDBACK */ > - struct gl_transform_feedback_object *xfb_obj = > - ctx->TransformFeedback.CurrentObject; > - struct brw_transform_feedback_object *brw_obj = > - (struct brw_transform_feedback_object *) xfb_obj; > - uint32_t mocs_wb = brw->gen >= 9 ? SKL_MOCS_WB : BDW_MOCS_WB; > - > - /* Set up the up to 4 output buffers. These are the ranges defined in the > - * gl_transform_feedback_object. > - */ > - for (int i = 0; i < 4; i++) { > - struct intel_buffer_object *bufferobj = > - intel_buffer_object(xfb_obj->Buffers[i]); > - > - if (!bufferobj) { > - BEGIN_BATCH(8); > - OUT_BATCH(_3DSTATE_SO_BUFFER << 16 | (8 - 2)); > - OUT_BATCH((i << SO_BUFFER_INDEX_SHIFT)); > - OUT_BATCH(0); > - OUT_BATCH(0); > - OUT_BATCH(0); > - OUT_BATCH(0); > - OUT_BATCH(0); > - OUT_BATCH(0); > - ADVANCE_BATCH(); > - continue; > - } > - > - uint32_t start = xfb_obj->Offset[i]; > - assert(start % 4 == 0); > - uint32_t end = ALIGN(start + xfb_obj->Size[i], 4); > - struct brw_bo *bo = > - intel_bufferobj_buffer(brw, bufferobj, start, end - start); > - assert(end <= bo->size); > - > - BEGIN_BATCH(8); > - OUT_BATCH(_3DSTATE_SO_BUFFER << 16 | (8 - 2)); > - OUT_BATCH(GEN8_SO_BUFFER_ENABLE | (i << SO_BUFFER_INDEX_SHIFT) | > - GEN8_SO_BUFFER_OFFSET_WRITE_ENABLE | > - GEN8_SO_BUFFER_OFFSET_ADDRESS_ENABLE | > - (mocs_wb << 22)); > - OUT_RELOC64(bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, start); > - OUT_BATCH(xfb_obj->Size[i] / 4 - 1); > - OUT_RELOC64(brw_obj->offset_bo, > 
- I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION, > - i * sizeof(uint32_t)); > - if (brw_obj->zero_offsets) > - OUT_BATCH(0); /* Zero out the offset and write that to offset_bo */ > - else > - OUT_BATCH(0xFFFFFFFF); /* Use offset_bo as the "Stream Offset." */ > - ADVANCE_BATCH(); > - } > - brw_obj->zero_offsets = false; > -} > diff --git a/src/mesa/drivers/dri/i965/genX_state_upload.c > b/src/mesa/drivers/dri/i965/genX_state_upload.c > index ec85ec1..cb3c2db 100644 > --- a/src/mesa/drivers/dri/i965/genX_state_upload.c > +++ b/src/mesa/drivers/dri/i965/genX_state_upload.c > @@ -31,11 +31,13 @@ > #include "brw_util.h" > > #include "intel_batchbuffer.h" > +#include "intel_buffer_objects.h" > #include "intel_fbo.h" > > #include "main/fbobject.h" > #include "main/framebuffer.h" > #include "main/stencil.h" > +#include "main/transformfeedback.h" > > UNUSED static void * > emit_dwords(struct brw_context *brw, unsigned n) > @@ -80,6 +82,28 @@ __gen_combine_address(struct brw_context *brw, void > *location, > } > } > > +static inline struct brw_address > +render_bo(struct brw_bo *bo, uint32_t offset) > +{ > + return (struct brw_address) { > + .bo = bo, > + .offset = offset, > + .read_domains = I915_GEM_DOMAIN_RENDER, > + .write_domain = I915_GEM_DOMAIN_RENDER, > + }; > +} > + > +static inline struct brw_address > +instruction_bo(struct brw_bo *bo, uint32_t offset) > +{ > + return (struct brw_address) { > + .bo = bo, > + .offset = offset, > + .read_domains = I915_GEM_DOMAIN_INSTRUCTION, > + .write_domain = I915_GEM_DOMAIN_INSTRUCTION, > + }; > +} > + > #include "genxml/genX_pack.h" > > #define _brw_cmd_length(cmd) cmd ## _length > @@ -94,11 +118,12 @@ __gen_combine_address(struct brw_context *brw, void > *location, > _brw_cmd_pack(cmd)(brw, (void *)_dst, &name), \ > _dst = NULL) > > -#define brw_batch_emitn(brw, cmd, n) ({ \ > +#define brw_batch_emitn(brw, cmd, n, ...) 
({ \ > uint32_t *_dw = emit_dwords(brw, n); \ > struct cmd template = { \ > _brw_cmd_header(cmd), \ > .DWordLength = n - _brw_cmd_length_bias(cmd), \ > + __VA_ARGS__ \ > }; \ > _brw_cmd_pack(cmd)(brw, _dw, &template); \ > _dw + 1; /* Array starts at dw[1] */ \ > @@ -112,6 +137,9 @@ __gen_combine_address(struct brw_context *brw, void > *location, > _brw_cmd_pack(cmd)(brw, (void *)_dst, &name), \ > _dst = NULL) > > +#define SKL_MOCS_WB (2 << 1) > +#define BDW_MOCS_WB 0x78 > +
Please don't duplicate these - it's likely to trip us up in the future. Let's just move them to brw_state.h or brw_context.h if we have to. > #if GEN_GEN >= 6 > /** > * Determine the appropriate attribute override value to store into the > @@ -878,6 +906,330 @@ static const struct brw_tracked_state genX(sbe_state) = > { > }, > .emit = genX(upload_sbe), > }; > + > +/* ---------------------------------------------------------------------- */ > + > +/** > + * Outputs the 3DSTATE_SO_DECL_LIST command. > + * > + * The data output is a series of 64-bit entries containing a SO_DECL per > + * stream. We only have one stream of rendering coming out of the GS unit, > so > + * we only emit stream 0 (low 16 bits) SO_DECLs. > + */ > +static void > +genX(upload_3dstate_so_decl_list)(struct brw_context *brw, > + const struct brw_vue_map *vue_map) > +{ > + struct gl_context *ctx = &brw->ctx; > + /* BRW_NEW_TRANSFORM_FEEDBACK */ > + struct gl_transform_feedback_object *xfb_obj = > + ctx->TransformFeedback.CurrentObject; > + const struct gl_transform_feedback_info *linked_xfb_info = > + xfb_obj->program->sh.LinkedTransformFeedback; > + struct GENX(SO_DECL) so_decl[MAX_VERTEX_STREAMS][128]; > + int buffer_mask[MAX_VERTEX_STREAMS] = {0, 0, 0, 0}; > + int next_offset[MAX_VERTEX_STREAMS] = {0, 0, 0, 0}; > + int decls[MAX_VERTEX_STREAMS] = {0, 0, 0, 0}; > + int max_decls = 0; > + STATIC_ASSERT(ARRAY_SIZE(so_decl[0]) >= MAX_PROGRAM_OUTPUTS); > + > + memset(so_decl, 0, sizeof(so_decl)); > + > + /* Construct the list of SO_DECLs to be emitted. The formatting of the > + * command feels strange -- each dword pair contains a SO_DECL per stream. 
> + */ > + for (unsigned i = 0; i < linked_xfb_info->NumOutputs; i++) { > + int buffer = linked_xfb_info->Outputs[i].OutputBuffer; > + struct GENX(SO_DECL) decl = {0}; > + int varying = linked_xfb_info->Outputs[i].OutputRegister; > + const unsigned components = linked_xfb_info->Outputs[i].NumComponents; > + unsigned component_mask = (1 << components) - 1; > + unsigned stream_id = linked_xfb_info->Outputs[i].StreamId; > + unsigned decl_buffer_slot = buffer; > + assert(stream_id < MAX_VERTEX_STREAMS); > + > + /* gl_PointSize is stored in VARYING_SLOT_PSIZ.w > + * gl_Layer is stored in VARYING_SLOT_PSIZ.y > + * gl_ViewportIndex is stored in VARYING_SLOT_PSIZ.z > + */ > + if (varying == VARYING_SLOT_PSIZ) { > + assert(components == 1); > + component_mask <<= 3; > + } else if (varying == VARYING_SLOT_LAYER) { > + assert(components == 1); > + component_mask <<= 1; > + } else if (varying == VARYING_SLOT_VIEWPORT) { > + assert(components == 1); > + component_mask <<= 2; > + } else { > + component_mask <<= linked_xfb_info->Outputs[i].ComponentOffset; > + } > + > + buffer_mask[stream_id] |= 1 << buffer; > + > + decl.OutputBufferSlot = decl_buffer_slot; > + if (varying == VARYING_SLOT_LAYER || varying == VARYING_SLOT_VIEWPORT) > { > + decl.RegisterIndex = vue_map->varying_to_slot[VARYING_SLOT_PSIZ]; > + } else { > + assert(vue_map->varying_to_slot[varying] >= 0); > + decl.RegisterIndex = vue_map->varying_to_slot[varying]; > + } > + decl.ComponentMask = component_mask; > + > + /* Mesa doesn't store entries for gl_SkipComponents in the Outputs[] > + * array. Instead, it simply increments DstOffset for the following > + * input by the number of components that should be skipped. > + * > + * Our hardware is unusual in that it requires us to program SO_DECLs > + * for fake "hole" components, rather than simply taking the offset > + * for each real varying. 
Each hole can have size 1, 2, 3, or 4; we > + * program as many size = 4 holes as we can, then a final hole to > + * accommodate the final 1, 2, or 3 remaining. > + */ > + int skip_components = > + linked_xfb_info->Outputs[i].DstOffset - next_offset[buffer]; > + > + next_offset[buffer] += skip_components; > + > + while (skip_components >= 4) { > + struct GENX(SO_DECL) *d = &so_decl[stream_id][decls[stream_id]++]; > + d->HoleFlag = 1; > + d->OutputBufferSlot = decl_buffer_slot; > + d->ComponentMask = 0xf; > + skip_components -= 4; > + } > + > + if (skip_components > 0) { > + struct GENX(SO_DECL) *d = &so_decl[stream_id][decls[stream_id]++]; > + d->HoleFlag = 1; > + d->OutputBufferSlot = decl_buffer_slot; > + d->ComponentMask = (1 << skip_components) - 1; > + } > + > + assert(linked_xfb_info->Outputs[i].DstOffset == next_offset[buffer]); > + > + next_offset[buffer] += components; > + > + so_decl[stream_id][decls[stream_id]++] = decl; > + > + if (decls[stream_id] > max_decls) > + max_decls = decls[stream_id]; > + } > + > + uint32_t *dw; > + dw = brw_batch_emitn(brw, GENX(3DSTATE_SO_DECL_LIST), 3 + 2 * max_decls, > + .StreamtoBufferSelects0 = buffer_mask[0], > + .StreamtoBufferSelects1 = buffer_mask[1], > + .StreamtoBufferSelects2 = buffer_mask[2], > + .StreamtoBufferSelects3 = buffer_mask[3], > + .NumEntries0 = decls[0], > + .NumEntries1 = decls[1], > + .NumEntries2 = decls[2], > + .NumEntries3 = decls[3]); > + > + for (int i = 0; i < max_decls; i++) { > + GENX(SO_DECL_ENTRY_pack)( > + brw, dw + 2 + i * 2, > + &(struct GENX(SO_DECL_ENTRY)) { > + .Stream0Decl = so_decl[0][i], > + .Stream1Decl = so_decl[1][i], > + .Stream2Decl = so_decl[2][i], > + .Stream3Decl = so_decl[3][i], > + }); > + } > +} FWIW, I'm planning on cleaning up some of this code in the future. Your port looks good, but this code has always been a mess. 
> + > +static void > +genX(upload_3dstate_so_buffers)(struct brw_context *brw) > +{ > + struct gl_context *ctx = &brw->ctx; > + /* BRW_NEW_TRANSFORM_FEEDBACK */ > + struct gl_transform_feedback_object *xfb_obj = > + ctx->TransformFeedback.CurrentObject; > +#if GEN_GEN < 8 > + const struct gl_transform_feedback_info *linked_xfb_info = > + xfb_obj->program->sh.LinkedTransformFeedback; > +#else > + struct brw_transform_feedback_object *brw_obj = > + (struct brw_transform_feedback_object *) xfb_obj; > + /* Copy these values from brw_defines.h so we don't have to include the > whole > + * file. > + */ Please drop this comment. > + uint32_t mocs_wb = brw->gen >= 9 ? SKL_MOCS_WB : BDW_MOCS_WB; > +#endif > + > + /* Set up the up to 4 output buffers. These are the ranges defined in the > + * gl_transform_feedback_object. > + */ > + for (int i = 0; i < 4; i++) { > + struct intel_buffer_object *bufferobj = > + intel_buffer_object(xfb_obj->Buffers[i]); > + > + if (!bufferobj) { > + brw_batch_emit(brw, GENX(3DSTATE_SO_BUFFER), sob) { > + sob.SOBufferIndex = i; > + } > + continue; > + } > + > + uint32_t start = xfb_obj->Offset[i]; > + assert(start % 4 == 0); > + uint32_t end = ALIGN(start + xfb_obj->Size[i], 4); > + struct brw_bo *bo = > + intel_bufferobj_buffer(brw, bufferobj, start, end - start); > + assert(end <= bo->size); > + > + brw_batch_emit(brw, GENX(3DSTATE_SO_BUFFER), sob) { > + sob.SOBufferIndex = i; > + > + sob.SurfaceBaseAddress = render_bo(bo, start); > +#if GEN_GEN < 8 > + sob.SurfacePitch = linked_xfb_info->Buffers[i].Stride * 4; > + sob.SurfaceEndAddress = render_bo(bo, end); > +#else > + sob.SOBufferEnable = true; > + sob.StreamOffsetWriteEnable = true; > + sob.StreamOutputBufferOffsetAddressEnable = true; > + sob.MOCS = mocs_wb; > + > + sob.SurfaceSize = xfb_obj->Size[i] / 4; > + if (sob.SurfaceSize > 0) > + sob.SurfaceSize -= 1; This looks like a bug fix. 
How about: sob.SurfaceSize = MAX2(xfb_obj->Size[i] / 4, 1) - 1; > + sob.StreamOutputBufferOffsetAddress = > + instruction_bo(brw_obj->offset_bo, i * sizeof(uint32_t)); > + > + if (brw_obj->zero_offsets) > + /* Zero out the offset and write that to offset_bo */ > + sob.StreamOffset = 0; > + else > + /* Use offset_bo as the "Stream Offset." */ > + sob.StreamOffset = 0xFFFFFFFF; Please use curly braces here. > +#endif > + } > + } > + > +#if GEN_GEN >= 8 > + brw_obj->zero_offsets = false; > +#endif > +} > + > +static bool > +genX(query_active)(struct gl_query_object *q) Is it possible to make this a plain static inline function instead of wrapping the name in genX()? If not, no worries. > +{ > + return q && q->Active; > +} > + > +static void > +genX(upload_3dstate_streamout)(struct brw_context *brw, bool active, > + const struct brw_vue_map *vue_map) > +{ > + struct gl_context *ctx = &brw->ctx; > + /* BRW_NEW_TRANSFORM_FEEDBACK */ > + struct gl_transform_feedback_object *xfb_obj = > + ctx->TransformFeedback.CurrentObject; > + > + brw_batch_emit(brw, GENX(3DSTATE_STREAMOUT), sos) { > + if (active) { > +#if GEN_GEN >= 8 > + const struct gl_transform_feedback_info *linked_xfb_info = > + xfb_obj->program->sh.LinkedTransformFeedback; > +#endif Please move this variable declaration down to its use below, so we can combine the two #if GEN_GEN >= 8 blocks.
> + int urb_entry_read_offset = 0; > + int urb_entry_read_length = (vue_map->num_slots + 1) / 2 - > + urb_entry_read_offset; > + > + sos.SOFunctionEnable = true; > + sos.SOStatisticsEnable = true; > + > + /* BRW_NEW_RASTERIZER_DISCARD */ > + if (ctx->RasterDiscard) { > + if (!genX(query_active)(ctx->Query.PrimitivesGenerated[0])) { > + sos.RenderingDisable = true; > + } else { > + perf_debug("Rasterizer discard with a GL_PRIMITIVES_GENERATED > " > + "query active relies on the clipper."); > + } > + } > + > + /* _NEW_LIGHT */ > + if (ctx->Light.ProvokingVertex != GL_FIRST_VERTEX_CONVENTION) > + sos.ReorderMode = TRAILING; > + > +#if GEN_GEN < 8 > + if (brw->gen < 8) { > + if (xfb_obj->Buffers[0]) > + sos.SOBufferEnable0 = true; > + if (xfb_obj->Buffers[1]) > + sos.SOBufferEnable1 = true; > + if (xfb_obj->Buffers[2]) > + sos.SOBufferEnable2 = true; > + if (xfb_obj->Buffers[3]) > + sos.SOBufferEnable3 = true; Shorter: sos.SOBufferEnable0 = xfb_obj->Buffers[0] != NULL; > + } > +#endif > + > + /* We always read the whole vertex. This could be reduced at some > + * point by reading less and offsetting the register index in the > + * SO_DECLs. > + */ > + sos.Stream0VertexReadOffset = urb_entry_read_offset; > + sos.Stream0VertexReadLength = urb_entry_read_length - 1; > + sos.Stream1VertexReadOffset = urb_entry_read_offset; > + sos.Stream1VertexReadLength = urb_entry_read_length - 1; > + sos.Stream2VertexReadOffset = urb_entry_read_offset; > + sos.Stream2VertexReadLength = urb_entry_read_length - 1; > + sos.Stream3VertexReadOffset = urb_entry_read_offset; > + sos.Stream3VertexReadLength = urb_entry_read_length - 1; Can we move these up and combine the enable/pitch blocks, i.e. #if GEN_GEN < 8 ...enables... #else ...pitches... #endif With those changes, Reviewed-by: Kenneth Graunke <kenn...@whitecape.org> > + > +#if GEN_GEN >= 8 > + /* Set buffer pitches; 0 means unbound. 
*/ > + if (xfb_obj->Buffers[0]) > + sos.Buffer0SurfacePitch = linked_xfb_info->Buffers[0].Stride * 4; > + if (xfb_obj->Buffers[1]) > + sos.Buffer1SurfacePitch = linked_xfb_info->Buffers[1].Stride * 4; > + if (xfb_obj->Buffers[2]) > + sos.Buffer2SurfacePitch = linked_xfb_info->Buffers[2].Stride * 4; > + if (xfb_obj->Buffers[3]) > + sos.Buffer3SurfacePitch = linked_xfb_info->Buffers[3].Stride * 4; > +#endif > + } > + } > +} > + > +static void > +genX(upload_sol)(struct brw_context *brw) > +{ > + struct gl_context *ctx = &brw->ctx; > + /* BRW_NEW_TRANSFORM_FEEDBACK */ > + bool active = _mesa_is_xfb_active_and_unpaused(ctx); > + > + if (active) { > + genX(upload_3dstate_so_buffers)(brw); > + > + /* BRW_NEW_VUE_MAP_GEOM_OUT */ > + genX(upload_3dstate_so_decl_list)(brw, &brw->vue_map_geom_out); > + } > + > + /* Finally, set up the SOL stage. This command must always follow > updates to > + * the nonpipelined SOL state (3DSTATE_SO_BUFFER, 3DSTATE_SO_DECL_LIST) or > + * MMIO register updates (current performed by the kernel at each batch > + * emit). > + */ > + genX(upload_3dstate_streamout)(brw, active, &brw->vue_map_geom_out); > +} > + > +static const struct brw_tracked_state genX(sol_state) = { > + .dirty = { > + .mesa = _NEW_LIGHT, > + .brw = BRW_NEW_BATCH | > + BRW_NEW_BLORP | > + BRW_NEW_RASTERIZER_DISCARD | > + BRW_NEW_VUE_MAP_GEOM_OUT | > + BRW_NEW_TRANSFORM_FEEDBACK, > + }, > + .emit = genX(upload_sol), > +}; > + > #endif > > /* ---------------------------------------------------------------------- */ > @@ -1197,7 +1549,7 @@ genX(init_atoms)(struct brw_context *brw) > &gen7_te_state, > &gen7_ds_state, > &gen7_gs_state, > - &gen7_sol_state, > + &genX(sol_state), > &genX(clip_state), > &genX(sbe_state), > &genX(sf_state), > @@ -1284,7 +1636,7 @@ genX(init_atoms)(struct brw_context *brw) > &gen7_te_state, > &gen8_ds_state, > &gen8_gs_state, > - &gen7_sol_state, > + &genX(sol_state), > &genX(clip_state), > &genX(raster_state), > &genX(sbe_state), >
signature.asc
Description: This is a digitally signed message part.
_______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev