Re: [Mesa-dev] [PATCH v03 35/38] i965: Port gen4+ emit vertices code to genxml.

Pohjolainen, Topi Thu, 04 May 2017 01:19:45 -0700

On Mon, May 01, 2017 at 06:43:23PM -0700, Rafael Antognolli wrote:
> Some code that was placed in brw_draw_upload.c and exported to be used
> by gen8+ was also moved to genX_state_upload, and the respective symbols
> are not exported anymore.
> 
> v2:
>    - Remove code from brw_draw_upload too
>    - Emit vertices for gen4-5 too.
>    - Use helper to setup brw_address (Kristian)
>    - Use macros for MOCS values.
>    - Do not use #ifndef NDEBUG on code that is actually used (Ken)
> v3:
>    - Style and code cleanup (Ken)
>    - Keep some of the common code inside brw_draw_upload.c (Ken)
> 
> Signed-off-by: Rafael Antognolli <rafael.antogno...@intel.com>


There are some formatting nits further down, but compared to the original I
couldn't spot anything actually missing. All in all, it looks cleaner than before :)

Reviewed-by: Topi Pohjolainen <topi.pohjolai...@intel.com>

> ---
>  src/mesa/drivers/dri/i965/brw_draw_upload.c   | 454 +---------------
>  src/mesa/drivers/dri/i965/brw_state.h         |   2 +-
>  src/mesa/drivers/dri/i965/gen8_draw_upload.c  | 330 +-----------
>  src/mesa/drivers/dri/i965/genX_state_upload.c | 560 ++++++++++++++++++-
>  4 files changed, 556 insertions(+), 790 deletions(-)
> 
> diff --git a/src/mesa/drivers/dri/i965/brw_draw_upload.c b/src/mesa/drivers/dri/i965/brw_draw_upload.c
> index 7846293..8b30151 100644
> --- a/src/mesa/drivers/dri/i965/brw_draw_upload.c
> +++ b/src/mesa/drivers/dri/i965/brw_draw_upload.c
> @@ -242,86 +242,6 @@ double_types(struct brw_context *brw,
>             : double_types_float[size]);
>  }
>  
> -static bool
> -is_passthru_format(uint32_t format)
> -{
> -   switch (format) {
> -   case ISL_FORMAT_R64_PASSTHRU:
> -   case ISL_FORMAT_R64G64_PASSTHRU:
> -   case ISL_FORMAT_R64G64B64_PASSTHRU:
> -   case ISL_FORMAT_R64G64B64A64_PASSTHRU:
> -      return true;
> -   default:
> -      return false;
> -   }
> -}
> -
> -static int
> -uploads_needed(uint32_t format)
> -{
> -   if (!is_passthru_format(format))
> -      return 1;
> -
> -   switch (format) {
> -   case ISL_FORMAT_R64_PASSTHRU:
> -   case ISL_FORMAT_R64G64_PASSTHRU:
> -      return 1;
> -   case ISL_FORMAT_R64G64B64_PASSTHRU:
> -   case ISL_FORMAT_R64G64B64A64_PASSTHRU:
> -      return 2;
> -   default:
> -      unreachable("not reached");
> -   }
> -}
> -
> -/*
> - * Returns the number of componentes associated with a format that is used on
> - * a 64 to 32 format split. See downsize_format()
> - */
> -static int
> -upload_format_size(uint32_t upload_format)
> -{
> -   switch (upload_format) {
> -   case ISL_FORMAT_R32G32_FLOAT:
> -      return 2;
> -   case ISL_FORMAT_R32G32B32A32_FLOAT:
> -      return 4;
> -   default:
> -      unreachable("not reached");
> -   }
> -}
> -
> -/*
> - * Returns the format that we are finally going to use when upload a vertex
> - * element. It will only change if we are using *64*PASSTHRU formats, as for
> - * gen < 8 they need to be splitted on two *32*FLOAT formats.
> - *
> - * @upload points in which upload we are. Valid values are [0,1]
> - */
> -static uint32_t
> -downsize_format_if_needed(uint32_t format,
> -                          int upload)
> -{
> -   assert(upload == 0 || upload == 1);
> -
> -   if (!is_passthru_format(format))
> -      return format;
> -
> -   switch (format) {
> -   case ISL_FORMAT_R64_PASSTHRU:
> -      return ISL_FORMAT_R32G32_FLOAT;
> -   case ISL_FORMAT_R64G64_PASSTHRU:
> -      return ISL_FORMAT_R32G32B32A32_FLOAT;
> -   case ISL_FORMAT_R64G64B64_PASSTHRU:
> -      return !upload ? ISL_FORMAT_R32G32B32A32_FLOAT
> -                     : ISL_FORMAT_R32G32_FLOAT;
> -   case ISL_FORMAT_R64G64B64A64_PASSTHRU:
> -      return ISL_FORMAT_R32G32B32A32_FLOAT;
> -   default:
> -      unreachable("not reached");
> -   }
> -}
> -
>  /**
>   * Given vertex array type/size/format/normalized info, return
>   * the appopriate hardware surface type.
> @@ -786,380 +706,6 @@ brw_prepare_shader_draw_parameters(struct brw_context 
> *brw)
>     }
>  }
>  
> -/**
> - * Emit a VERTEX_BUFFER_STATE entry (part of 3DSTATE_VERTEX_BUFFERS).
> - */
> -uint32_t *
> -brw_emit_vertex_buffer_state(struct brw_context *brw,
> -                             unsigned buffer_nr,
> -                             struct brw_bo *bo,
> -                             unsigned start_offset,
> -                             unsigned end_offset,
> -                             unsigned stride,
> -                             unsigned step_rate,
> -                             uint32_t *__map)
> -{
> -   struct gl_context *ctx = &brw->ctx;
> -   uint32_t dw0;
> -
> -   if (brw->gen >= 8) {
> -      dw0 = buffer_nr << GEN6_VB0_INDEX_SHIFT;
> -   } else if (brw->gen >= 6) {
> -      dw0 = (buffer_nr << GEN6_VB0_INDEX_SHIFT) |
> -            (step_rate ? GEN6_VB0_ACCESS_INSTANCEDATA
> -                       : GEN6_VB0_ACCESS_VERTEXDATA);
> -   } else {
> -      dw0 = (buffer_nr << BRW_VB0_INDEX_SHIFT) |
> -            (step_rate ? BRW_VB0_ACCESS_INSTANCEDATA
> -                       : BRW_VB0_ACCESS_VERTEXDATA);
> -   }
> -
> -   if (brw->gen >= 7)
> -      dw0 |= GEN7_VB0_ADDRESS_MODIFYENABLE;
> -
> -   switch (brw->gen) {
> -   case 7:
> -      dw0 |= GEN7_MOCS_L3 << 16;
> -      break;
> -   case 8:
> -      dw0 |= BDW_MOCS_WB << 16;
> -      break;
> -   case 9:
> -      dw0 |= SKL_MOCS_WB << 16;
> -      break;
> -   }
> -
> -   WARN_ONCE(stride >= (brw->gen >= 5 ? 2048 : 2047),
> -             "VBO stride %d too large, bad rendering may occur\n",
> -             stride);
> -   OUT_BATCH(dw0 | (stride << BRW_VB0_PITCH_SHIFT));
> -   if (brw->gen >= 8) {
> -      OUT_RELOC64(bo, I915_GEM_DOMAIN_VERTEX, 0, start_offset);
> -      /* From the BSpec: 3D Pipeline Stages - 3D Pipeline Geometry -
> -       *                 Vertex Fetch (VF) Stage - State
> -       *
> -       * Instead of "VBState.StartingBufferAddress + VBState.MaxIndex x
> -       * VBState.BufferPitch", the address of the byte immediately beyond the
> -       * last valid byte of the buffer is determined by
> -       * "VBState.StartingBufferAddress + VBState.BufferSize".
> -       */
> -      OUT_BATCH(end_offset - start_offset);
> -   } else if (brw->gen >= 5) {
> -      OUT_RELOC(bo, I915_GEM_DOMAIN_VERTEX, 0, start_offset);
> -      /* From the BSpec: 3D Pipeline Stages - 3D Pipeline Geometry -
> -       *                 Vertex Fetch (VF) Stage - State
> -       *
> -       *  Instead of "VBState.StartingBufferAddress + VBState.MaxIndex x
> -       *  VBState.BufferPitch", the address of the byte immediately beyond 
> the
> -       *  last valid byte of the buffer is determined by
> -       *  "VBState.EndAddress + 1".
> -       */
> -      OUT_RELOC(bo, I915_GEM_DOMAIN_VERTEX, 0, end_offset - 1);
> -      OUT_BATCH(step_rate);
> -   } else {
> -      OUT_RELOC(bo, I915_GEM_DOMAIN_VERTEX, 0, start_offset);
> -      OUT_BATCH(0);
> -      OUT_BATCH(step_rate);
> -   }
> -
> -   return __map;
> -}
> -
> -static void
> -brw_emit_vertices(struct brw_context *brw)
> -{
> -   GLuint i;
> -
> -   brw_prepare_vertices(brw);
> -   brw_prepare_shader_draw_parameters(brw);
> -
> -   brw_emit_query_begin(brw);
> -
> -   const struct brw_vs_prog_data *vs_prog_data =
> -      brw_vs_prog_data(brw->vs.base.prog_data);
> -
> -   unsigned nr_elements = brw->vb.nr_enabled;
> -   if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid ||
> -       vs_prog_data->uses_basevertex || vs_prog_data->uses_baseinstance)
> -      ++nr_elements;
> -   if (vs_prog_data->uses_drawid)
> -      nr_elements++;
> -
> -   /* If any of the formats of vb.enabled needs more that one upload, we need
> -    * to add it to nr_elements */
> -   unsigned extra_uploads = 0;
> -   for (unsigned i = 0; i < brw->vb.nr_enabled; i++) {
> -      struct brw_vertex_element *input = brw->vb.enabled[i];
> -      uint32_t format = brw_get_vertex_surface_type(brw, input->glarray);
> -
> -      if (uploads_needed(format) > 1)
> -         extra_uploads++;
> -   }
> -   nr_elements += extra_uploads;
> -
> -   /* If the VS doesn't read any inputs (calculating vertex position from
> -    * a state variable for some reason, for example), emit a single pad
> -    * VERTEX_ELEMENT struct and bail.
> -    *
> -    * The stale VB state stays in place, but they don't do anything unless
> -    * a VE loads from them.
> -    */
> -   if (nr_elements == 0) {
> -      BEGIN_BATCH(3);
> -      OUT_BATCH((_3DSTATE_VERTEX_ELEMENTS << 16) | 1);
> -      if (brw->gen >= 6) {
> -      OUT_BATCH((0 << GEN6_VE0_INDEX_SHIFT) |
> -                GEN6_VE0_VALID |
> -                (ISL_FORMAT_R32G32B32A32_FLOAT << BRW_VE0_FORMAT_SHIFT) |
> -                (0 << BRW_VE0_SRC_OFFSET_SHIFT));
> -      } else {
> -      OUT_BATCH((0 << BRW_VE0_INDEX_SHIFT) |
> -                BRW_VE0_VALID |
> -                (ISL_FORMAT_R32G32B32A32_FLOAT << BRW_VE0_FORMAT_SHIFT) |
> -                (0 << BRW_VE0_SRC_OFFSET_SHIFT));
> -      }
> -      OUT_BATCH((BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_0_SHIFT) |
> -             (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_1_SHIFT) |
> -             (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_2_SHIFT) |
> -             (BRW_VE1_COMPONENT_STORE_1_FLT << BRW_VE1_COMPONENT_3_SHIFT));
> -      ADVANCE_BATCH();
> -      return;
> -   }
> -
> -   /* Now emit VB and VEP state packets.
> -    */
> -
> -   const bool uses_draw_params =
> -      vs_prog_data->uses_basevertex ||
> -      vs_prog_data->uses_baseinstance;
> -   const unsigned nr_buffers = brw->vb.nr_buffers +
> -      uses_draw_params + vs_prog_data->uses_drawid;
> -
> -   if (nr_buffers) {
> -      if (brw->gen >= 6) {
> -      assert(nr_buffers <= 33);
> -      } else {
> -      assert(nr_buffers <= 17);
> -      }
> -
> -      BEGIN_BATCH(1 + 4 * nr_buffers);
> -      OUT_BATCH((_3DSTATE_VERTEX_BUFFERS << 16) | (4 * nr_buffers - 1));
> -      for (i = 0; i < brw->vb.nr_buffers; i++) {
> -      struct brw_vertex_buffer *buffer = &brw->vb.buffers[i];
> -         /* Prior to Haswell and Bay Trail we have to use 4-component formats
> -          * to fake 3-component ones.  In particular, we do this for
> -          * half-float and 8 and 16-bit integer formats.  This means that the
> -          * vertex element may poke over the end of the buffer by 2 bytes.
> -          */
> -         unsigned padding =
> -            (brw->gen <= 7 && !brw->is_baytrail && !brw->is_haswell) * 2;
> -         EMIT_VERTEX_BUFFER_STATE(brw, i, buffer->bo, buffer->offset,
> -                                  buffer->offset + buffer->size + padding,
> -                                  buffer->stride, buffer->step_rate);
> -
> -      }
> -
> -      if (uses_draw_params) {
> -         EMIT_VERTEX_BUFFER_STATE(brw, brw->vb.nr_buffers,
> -                                  brw->draw.draw_params_bo,
> -                                  brw->draw.draw_params_offset,
> -                                  brw->draw.draw_params_bo->size,
> -                                  0,  /* stride */
> -                                  0); /* step rate */
> -      }
> -
> -      if (vs_prog_data->uses_drawid) {
> -         EMIT_VERTEX_BUFFER_STATE(brw, brw->vb.nr_buffers + 1,
> -                                  brw->draw.draw_id_bo,
> -                                  brw->draw.draw_id_offset,
> -                                  brw->draw.draw_id_bo->size,
> -                                  0,  /* stride */
> -                                  0); /* step rate */
> -      }
> -
> -      ADVANCE_BATCH();
> -   }
> -
> -   /* The hardware allows one more VERTEX_ELEMENTS than VERTEX_BUFFERS, 
> presumably
> -    * for VertexID/InstanceID.
> -    */
> -   if (brw->gen >= 6) {
> -      assert(nr_elements <= 34);
> -   } else {
> -      assert(nr_elements <= 18);
> -   }
> -
> -   struct brw_vertex_element *gen6_edgeflag_input = NULL;
> -
> -   BEGIN_BATCH(1 + nr_elements * 2);
> -   OUT_BATCH((_3DSTATE_VERTEX_ELEMENTS << 16) | (2 * nr_elements - 1));
> -   for (i = 0; i < brw->vb.nr_enabled; i++) {
> -      struct brw_vertex_element *input = brw->vb.enabled[i];
> -      uint32_t format = brw_get_vertex_surface_type(brw, input->glarray);
> -      uint32_t comp0 = BRW_VE1_COMPONENT_STORE_SRC;
> -      uint32_t comp1 = BRW_VE1_COMPONENT_STORE_SRC;
> -      uint32_t comp2 = BRW_VE1_COMPONENT_STORE_SRC;
> -      uint32_t comp3 = BRW_VE1_COMPONENT_STORE_SRC;
> -      unsigned num_uploads = 1;
> -      unsigned c;
> -
> -      num_uploads = uploads_needed(format);
> -
> -      if (input == &brw->vb.inputs[VERT_ATTRIB_EDGEFLAG]) {
> -         /* Gen6+ passes edgeflag as sideband along with the vertex, instead
> -          * of in the VUE.  We have to upload it sideband as the last vertex
> -          * element according to the B-Spec.
> -          */
> -         if (brw->gen >= 6) {
> -            gen6_edgeflag_input = input;
> -            continue;
> -         }
> -      }
> -
> -      for (c = 0; c < num_uploads; c++) {
> -         uint32_t upload_format = downsize_format_if_needed(format, c);
> -         /* If we need more that one upload, the offset stride would be 128
> -          * bits (16 bytes), as for previous uploads we are using the full
> -          * entry. */
> -         unsigned int offset = input->offset + c * 16;
> -         int size = input->glarray->Size;
> -
> -         if (is_passthru_format(format))
> -            size = upload_format_size(upload_format);
> -
> -         switch (size) {
> -         case 0: comp0 = BRW_VE1_COMPONENT_STORE_0;
> -         case 1: comp1 = BRW_VE1_COMPONENT_STORE_0;
> -         case 2: comp2 = BRW_VE1_COMPONENT_STORE_0;
> -         case 3: comp3 = input->glarray->Integer
> -                         ? BRW_VE1_COMPONENT_STORE_1_INT
> -                         : BRW_VE1_COMPONENT_STORE_1_FLT;
> -            break;
> -         }
> -
> -         if (brw->gen >= 6) {
> -            OUT_BATCH((input->buffer << GEN6_VE0_INDEX_SHIFT) |
> -                      GEN6_VE0_VALID |
> -                      (upload_format << BRW_VE0_FORMAT_SHIFT) |
> -                      (offset << BRW_VE0_SRC_OFFSET_SHIFT));
> -         } else {
> -            OUT_BATCH((input->buffer << BRW_VE0_INDEX_SHIFT) |
> -                      BRW_VE0_VALID |
> -                      (upload_format << BRW_VE0_FORMAT_SHIFT) |
> -                      (offset << BRW_VE0_SRC_OFFSET_SHIFT));
> -         }
> -
> -         if (brw->gen >= 5)
> -            OUT_BATCH((comp0 << BRW_VE1_COMPONENT_0_SHIFT) |
> -                      (comp1 << BRW_VE1_COMPONENT_1_SHIFT) |
> -                      (comp2 << BRW_VE1_COMPONENT_2_SHIFT) |
> -                      (comp3 << BRW_VE1_COMPONENT_3_SHIFT));
> -         else
> -            OUT_BATCH((comp0 << BRW_VE1_COMPONENT_0_SHIFT) |
> -                      (comp1 << BRW_VE1_COMPONENT_1_SHIFT) |
> -                      (comp2 << BRW_VE1_COMPONENT_2_SHIFT) |
> -                      (comp3 << BRW_VE1_COMPONENT_3_SHIFT) |
> -                      ((i * 4) << BRW_VE1_DST_OFFSET_SHIFT));
> -      }
> -   }
> -
> -   if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid ||
> -       vs_prog_data->uses_basevertex || vs_prog_data->uses_baseinstance) {
> -      uint32_t dw0 = 0, dw1 = 0;
> -      uint32_t comp0 = BRW_VE1_COMPONENT_STORE_0;
> -      uint32_t comp1 = BRW_VE1_COMPONENT_STORE_0;
> -      uint32_t comp2 = BRW_VE1_COMPONENT_STORE_0;
> -      uint32_t comp3 = BRW_VE1_COMPONENT_STORE_0;
> -
> -      if (vs_prog_data->uses_basevertex)
> -         comp0 = BRW_VE1_COMPONENT_STORE_SRC;
> -
> -      if (vs_prog_data->uses_baseinstance)
> -         comp1 = BRW_VE1_COMPONENT_STORE_SRC;
> -
> -      if (vs_prog_data->uses_vertexid)
> -         comp2 = BRW_VE1_COMPONENT_STORE_VID;
> -
> -      if (vs_prog_data->uses_instanceid)
> -         comp3 = BRW_VE1_COMPONENT_STORE_IID;
> -
> -      dw1 = (comp0 << BRW_VE1_COMPONENT_0_SHIFT) |
> -            (comp1 << BRW_VE1_COMPONENT_1_SHIFT) |
> -            (comp2 << BRW_VE1_COMPONENT_2_SHIFT) |
> -            (comp3 << BRW_VE1_COMPONENT_3_SHIFT);
> -
> -      if (brw->gen >= 6) {
> -         dw0 |= GEN6_VE0_VALID |
> -                brw->vb.nr_buffers << GEN6_VE0_INDEX_SHIFT |
> -                ISL_FORMAT_R32G32_UINT << BRW_VE0_FORMAT_SHIFT;
> -      } else {
> -         dw0 |= BRW_VE0_VALID |
> -                brw->vb.nr_buffers << BRW_VE0_INDEX_SHIFT |
> -                ISL_FORMAT_R32G32_UINT << BRW_VE0_FORMAT_SHIFT;
> -      dw1 |= (i * 4) << BRW_VE1_DST_OFFSET_SHIFT;
> -      }
> -
> -      /* Note that for gl_VertexID, gl_InstanceID, and gl_PrimitiveID values,
> -       * the format is ignored and the value is always int.
> -       */
> -
> -      OUT_BATCH(dw0);
> -      OUT_BATCH(dw1);
> -   }
> -
> -   if (vs_prog_data->uses_drawid) {
> -      uint32_t dw0 = 0, dw1 = 0;
> -
> -      dw1 = (BRW_VE1_COMPONENT_STORE_SRC << BRW_VE1_COMPONENT_0_SHIFT) |
> -            (BRW_VE1_COMPONENT_STORE_0   << BRW_VE1_COMPONENT_1_SHIFT) |
> -            (BRW_VE1_COMPONENT_STORE_0   << BRW_VE1_COMPONENT_2_SHIFT) |
> -            (BRW_VE1_COMPONENT_STORE_0   << BRW_VE1_COMPONENT_3_SHIFT);
> -
> -      if (brw->gen >= 6) {
> -         dw0 |= GEN6_VE0_VALID |
> -                ((brw->vb.nr_buffers + 1) << GEN6_VE0_INDEX_SHIFT) |
> -                (ISL_FORMAT_R32_UINT << BRW_VE0_FORMAT_SHIFT);
> -      } else {
> -         dw0 |= BRW_VE0_VALID |
> -                ((brw->vb.nr_buffers + 1) << BRW_VE0_INDEX_SHIFT) |
> -                (ISL_FORMAT_R32_UINT << BRW_VE0_FORMAT_SHIFT);
> -
> -      dw1 |= (i * 4) << BRW_VE1_DST_OFFSET_SHIFT;
> -      }
> -
> -      OUT_BATCH(dw0);
> -      OUT_BATCH(dw1);
> -   }
> -
> -   if (brw->gen >= 6 && gen6_edgeflag_input) {
> -      uint32_t format =
> -         brw_get_vertex_surface_type(brw, gen6_edgeflag_input->glarray);
> -
> -      OUT_BATCH((gen6_edgeflag_input->buffer << GEN6_VE0_INDEX_SHIFT) |
> -                GEN6_VE0_VALID |
> -                GEN6_VE0_EDGE_FLAG_ENABLE |
> -                (format << BRW_VE0_FORMAT_SHIFT) |
> -                (gen6_edgeflag_input->offset << BRW_VE0_SRC_OFFSET_SHIFT));
> -      OUT_BATCH((BRW_VE1_COMPONENT_STORE_SRC << BRW_VE1_COMPONENT_0_SHIFT) |
> -                (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_1_SHIFT) |
> -                (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_2_SHIFT) |
> -                (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_3_SHIFT));
> -   }
> -
> -   ADVANCE_BATCH();
> -}
> -
> -const struct brw_tracked_state brw_vertices = {
> -   .dirty = {
> -      .mesa = _NEW_POLYGON,
> -      .brw = BRW_NEW_BATCH |
> -             BRW_NEW_BLORP |
> -             BRW_NEW_VERTICES |
> -             BRW_NEW_VS_PROG_DATA,
> -   },
> -   .emit = brw_emit_vertices,
> -};
> -
>  static void
>  brw_upload_indices(struct brw_context *brw)
>  {
> diff --git a/src/mesa/drivers/dri/i965/brw_state.h b/src/mesa/drivers/dri/i965/brw_state.h
> index 084f97f..acb7334 100644
> --- a/src/mesa/drivers/dri/i965/brw_state.h
> +++ b/src/mesa/drivers/dri/i965/brw_state.h
> @@ -103,7 +103,6 @@ extern const struct brw_tracked_state brw_psp_urb_cbs;
>  
>  extern const struct brw_tracked_state brw_drawing_rect;
>  extern const struct brw_tracked_state brw_indices;
> -extern const struct brw_tracked_state brw_vertices;
>  extern const struct brw_tracked_state brw_index_buffer;
>  extern const struct brw_tracked_state brw_cs_state;
>  extern const struct brw_tracked_state gen7_cs_push_constants;
> @@ -125,7 +124,6 @@ extern const struct brw_tracked_state haswell_cut_index;
>  extern const struct brw_tracked_state gen8_index_buffer;
>  extern const struct brw_tracked_state gen8_multisample_state;
>  extern const struct brw_tracked_state gen8_pma_fix;
> -extern const struct brw_tracked_state gen8_vertices;
>  extern const struct brw_tracked_state gen8_vf_topology;
>  extern const struct brw_tracked_state brw_cs_work_groups_surface;
>  
> diff --git a/src/mesa/drivers/dri/i965/gen8_draw_upload.c b/src/mesa/drivers/dri/i965/gen8_draw_upload.c
> index e81cca9..8db160b 100644
> --- a/src/mesa/drivers/dri/i965/gen8_draw_upload.c
> +++ b/src/mesa/drivers/dri/i965/gen8_draw_upload.c
> @@ -34,336 +34,6 @@
>  #include "intel_batchbuffer.h"
>  #include "intel_buffer_objects.h"
>  
> -#ifndef NDEBUG
> -static bool
> -is_passthru_format(uint32_t format)
> -{
> -   switch (format) {
> -   case ISL_FORMAT_R64_PASSTHRU:
> -   case ISL_FORMAT_R64G64_PASSTHRU:
> -   case ISL_FORMAT_R64G64B64_PASSTHRU:
> -   case ISL_FORMAT_R64G64B64A64_PASSTHRU:
> -      return true;
> -   default:
> -      return false;
> -   }
> -}
> -#endif
> -
> -static void
> -gen8_emit_vertices(struct brw_context *brw)
> -{
> -   struct gl_context *ctx = &brw->ctx;
> -   bool uses_edge_flag;
> -
> -   brw_prepare_vertices(brw);
> -   brw_prepare_shader_draw_parameters(brw);
> -
> -   uses_edge_flag = (ctx->Polygon.FrontMode != GL_FILL ||
> -                     ctx->Polygon.BackMode != GL_FILL);
> -
> -   const struct brw_vs_prog_data *vs_prog_data =
> -      brw_vs_prog_data(brw->vs.base.prog_data);
> -
> -   if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid) {
> -      unsigned vue = brw->vb.nr_enabled;
> -
> -      /* The element for the edge flags must always be last, so we have to
> -       * insert the SGVS before it in that case.
> -       */
> -      if (uses_edge_flag) {
> -         assert(vue > 0);
> -         vue--;
> -      }
> -
> -      WARN_ONCE(vue >= 33,
> -                "Trying to insert VID/IID past 33rd vertex element, "
> -                "need to reorder the vertex attrbutes.");
> -
> -      unsigned dw1 = 0;
> -      if (vs_prog_data->uses_vertexid) {
> -         dw1 |= GEN8_SGVS_ENABLE_VERTEX_ID |
> -                (2 << GEN8_SGVS_VERTEX_ID_COMPONENT_SHIFT) |  /* .z channel 
> */
> -                (vue << GEN8_SGVS_VERTEX_ID_ELEMENT_OFFSET_SHIFT);
> -      }
> -
> -      if (vs_prog_data->uses_instanceid) {
> -         dw1 |= GEN8_SGVS_ENABLE_INSTANCE_ID |
> -                (3 << GEN8_SGVS_INSTANCE_ID_COMPONENT_SHIFT) | /* .w channel 
> */
> -                (vue << GEN8_SGVS_INSTANCE_ID_ELEMENT_OFFSET_SHIFT);
> -      }
> -
> -      BEGIN_BATCH(2);
> -      OUT_BATCH(_3DSTATE_VF_SGVS << 16 | (2 - 2));
> -      OUT_BATCH(dw1);
> -      ADVANCE_BATCH();
> -
> -      BEGIN_BATCH(3);
> -      OUT_BATCH(_3DSTATE_VF_INSTANCING << 16 | (3 - 2));
> -      OUT_BATCH(vue | GEN8_VF_INSTANCING_ENABLE);
> -      OUT_BATCH(0);
> -      ADVANCE_BATCH();
> -   } else {
> -      BEGIN_BATCH(2);
> -      OUT_BATCH(_3DSTATE_VF_SGVS << 16 | (2 - 2));
> -      OUT_BATCH(0);
> -      ADVANCE_BATCH();
> -   }
> -
> -   /* Normally we don't need an element for the SGVS attribute because the
> -    * 3DSTATE_VF_SGVS instruction lets you store the generated attribute in 
> an
> -    * element that is past the list in 3DSTATE_VERTEX_ELEMENTS. However if
> -    * we're using draw parameters then we need an element for the those
> -    * values.  Additionally if there is an edge flag element then the SGVS
> -    * can't be inserted past that so we need a dummy element to ensure that
> -    * the edge flag is the last one.
> -    */
> -   const bool needs_sgvs_element = (vs_prog_data->uses_basevertex ||
> -                                    vs_prog_data->uses_baseinstance ||
> -                                    ((vs_prog_data->uses_instanceid ||
> -                                      vs_prog_data->uses_vertexid) &&
> -                                     uses_edge_flag));
> -   const unsigned nr_elements =
> -      brw->vb.nr_enabled + needs_sgvs_element + vs_prog_data->uses_drawid;
> -
> -   /* If the VS doesn't read any inputs (calculating vertex position from
> -    * a state variable for some reason, for example), emit a single pad
> -    * VERTEX_ELEMENT struct and bail.
> -    *
> -    * The stale VB state stays in place, but they don't do anything unless
> -    * a VE loads from them.
> -    */
> -   if (nr_elements == 0) {
> -      BEGIN_BATCH(3);
> -      OUT_BATCH((_3DSTATE_VERTEX_ELEMENTS << 16) | (3 - 2));
> -      OUT_BATCH((0 << GEN6_VE0_INDEX_SHIFT) |
> -                GEN6_VE0_VALID |
> -                (ISL_FORMAT_R32G32B32A32_FLOAT << BRW_VE0_FORMAT_SHIFT) |
> -                (0 << BRW_VE0_SRC_OFFSET_SHIFT));
> -      OUT_BATCH((BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_0_SHIFT) |
> -                (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_1_SHIFT) |
> -                (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_2_SHIFT) |
> -                (BRW_VE1_COMPONENT_STORE_1_FLT << 
> BRW_VE1_COMPONENT_3_SHIFT));
> -      ADVANCE_BATCH();
> -      return;
> -   }
> -
> -   /* Now emit 3DSTATE_VERTEX_BUFFERS and 3DSTATE_VERTEX_ELEMENTS packets. */
> -   const bool uses_draw_params =
> -      vs_prog_data->uses_basevertex ||
> -      vs_prog_data->uses_baseinstance;
> -   const unsigned nr_buffers = brw->vb.nr_buffers +
> -      uses_draw_params + vs_prog_data->uses_drawid;
> -
> -   if (nr_buffers) {
> -      assert(nr_buffers <= 33);
> -
> -      BEGIN_BATCH(1 + 4 * nr_buffers);
> -      OUT_BATCH((_3DSTATE_VERTEX_BUFFERS << 16) | (4 * nr_buffers - 1));
> -      for (unsigned i = 0; i < brw->vb.nr_buffers; i++) {
> -         const struct brw_vertex_buffer *buffer = &brw->vb.buffers[i];
> -         EMIT_VERTEX_BUFFER_STATE(brw, i, buffer->bo,
> -                                  buffer->offset,
> -                                  buffer->offset + buffer->size,
> -                                  buffer->stride, 0 /* unused */);
> -      }
> -
> -      if (uses_draw_params) {
> -         EMIT_VERTEX_BUFFER_STATE(brw, brw->vb.nr_buffers,
> -                                  brw->draw.draw_params_bo,
> -                                  brw->draw.draw_params_offset,
> -                                  brw->draw.draw_params_bo->size,
> -                                  0 /* stride */,
> -                                  0 /* unused */);
> -      }
> -
> -      if (vs_prog_data->uses_drawid) {
> -         EMIT_VERTEX_BUFFER_STATE(brw, brw->vb.nr_buffers + 1,
> -                                  brw->draw.draw_id_bo,
> -                                  brw->draw.draw_id_offset,
> -                                  brw->draw.draw_id_bo->size,
> -                                  0 /* stride */,
> -                                  0 /* unused */);
> -      }
> -      ADVANCE_BATCH();
> -   }
> -
> -   /* The hardware allows one more VERTEX_ELEMENTS than VERTEX_BUFFERS,
> -    * presumably for VertexID/InstanceID.
> -    */
> -   assert(nr_elements <= 34);
> -
> -   struct brw_vertex_element *gen6_edgeflag_input = NULL;
> -
> -   BEGIN_BATCH(1 + nr_elements * 2);
> -   OUT_BATCH((_3DSTATE_VERTEX_ELEMENTS << 16) | (2 * nr_elements - 1));
> -   for (unsigned i = 0; i < brw->vb.nr_enabled; i++) {
> -      struct brw_vertex_element *input = brw->vb.enabled[i];
> -      uint32_t format = brw_get_vertex_surface_type(brw, input->glarray);
> -      uint32_t comp0 = BRW_VE1_COMPONENT_STORE_SRC;
> -      uint32_t comp1 = BRW_VE1_COMPONENT_STORE_SRC;
> -      uint32_t comp2 = BRW_VE1_COMPONENT_STORE_SRC;
> -      uint32_t comp3 = BRW_VE1_COMPONENT_STORE_SRC;
> -
> -      /* From the BDW PRM, Volume 2d, page 588 (VERTEX_ELEMENT_STATE):
> -       * "Any SourceElementFormat of *64*_PASSTHRU cannot be used with an
> -       * element which has edge flag enabled."
> -       */
> -      assert(!(is_passthru_format(format) && uses_edge_flag));
> -
> -      /* The gen4 driver expects edgeflag to come in as a float, and passes
> -       * that float on to the tests in the clipper.  Mesa's current vertex
> -       * attribute value for EdgeFlag is stored as a float, which works out.
> -       * glEdgeFlagPointer, on the other hand, gives us an unnormalized
> -       * integer ubyte.  Just rewrite that to convert to a float.
> -       */
> -      if (input == &brw->vb.inputs[VERT_ATTRIB_EDGEFLAG]) {
> -         /* Gen6+ passes edgeflag as sideband along with the vertex, instead
> -          * of in the VUE.  We have to upload it sideband as the last vertex
> -          * element according to the B-Spec.
> -          */
> -         gen6_edgeflag_input = input;
> -         continue;
> -      }
> -
> -      switch (input->glarray->Size) {
> -      case 0: comp0 = BRW_VE1_COMPONENT_STORE_0;
> -      case 1: comp1 = BRW_VE1_COMPONENT_STORE_0;
> -      case 2: comp2 = BRW_VE1_COMPONENT_STORE_0;
> -      case 3:
> -         if (input->glarray->Doubles) {
> -            comp3 = BRW_VE1_COMPONENT_STORE_0;
> -         } else if (input->glarray->Integer) {
> -            comp3 = BRW_VE1_COMPONENT_STORE_1_INT;
> -         } else {
> -            comp3 = BRW_VE1_COMPONENT_STORE_1_FLT;
> -         }
> -
> -         break;
> -      }
> -
> -      /* From the BDW PRM, Volume 2d, page 586 (VERTEX_ELEMENT_STATE):
> -       *
> -       *     "When SourceElementFormat is set to one of the *64*_PASSTHRU
> -       *     formats, 64-bit components are stored in the URB without any
> -       *     conversion. In this case, vertex elements must be written as 128
> -       *     or 256 bits, with VFCOMP_STORE_0 being used to pad the output
> -       *     as required. E.g., if R64_PASSTHRU is used to copy a 64-bit Red
> -       *     component into the URB, Component 1 must be specified as
> -       *     VFCOMP_STORE_0 (with Components 2,3 set to VFCOMP_NOSTORE)
> -       *     in order to output a 128-bit vertex element, or Components 1-3 
> must
> -       *     be specified as VFCOMP_STORE_0 in order to output a 256-bit 
> vertex
> -       *     element. Likewise, use of R64G64B64_PASSTHRU requires Component 
> 3
> -       *     to be specified as VFCOMP_STORE_0 in order to output a 256-bit 
> vertex
> -       *     element."
> -       */
> -      if (input->glarray->Doubles && !input->is_dual_slot) {
> -         /* Store vertex elements which correspond to double and dvec2 vertex
> -          * shader inputs as 128-bit vertex elements, instead of 256-bits.
> -          */
> -         comp2 = BRW_VE1_COMPONENT_NOSTORE;
> -         comp3 = BRW_VE1_COMPONENT_NOSTORE;
> -      }
> -
> -      OUT_BATCH((input->buffer << GEN6_VE0_INDEX_SHIFT) |
> -                GEN6_VE0_VALID |
> -                (format << BRW_VE0_FORMAT_SHIFT) |
> -                (input->offset << BRW_VE0_SRC_OFFSET_SHIFT));
> -
> -      OUT_BATCH((comp0 << BRW_VE1_COMPONENT_0_SHIFT) |
> -                (comp1 << BRW_VE1_COMPONENT_1_SHIFT) |
> -                (comp2 << BRW_VE1_COMPONENT_2_SHIFT) |
> -                (comp3 << BRW_VE1_COMPONENT_3_SHIFT));
> -   }
> -
> -   if (needs_sgvs_element) {
> -      if (vs_prog_data->uses_basevertex ||
> -          vs_prog_data->uses_baseinstance) {
> -         OUT_BATCH(GEN6_VE0_VALID |
> -                   brw->vb.nr_buffers << GEN6_VE0_INDEX_SHIFT |
> -                   ISL_FORMAT_R32G32_UINT << BRW_VE0_FORMAT_SHIFT);
> -         OUT_BATCH((BRW_VE1_COMPONENT_STORE_SRC << 
> BRW_VE1_COMPONENT_0_SHIFT) |
> -                   (BRW_VE1_COMPONENT_STORE_SRC << 
> BRW_VE1_COMPONENT_1_SHIFT) |
> -                   (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_2_SHIFT) |
> -                   (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_3_SHIFT));
> -      } else {
> -         OUT_BATCH(GEN6_VE0_VALID);
> -         OUT_BATCH((BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_0_SHIFT) |
> -                   (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_1_SHIFT) |
> -                   (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_2_SHIFT) |
> -                   (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_3_SHIFT));
> -      }
> -   }
> -
> -   if (vs_prog_data->uses_drawid) {
> -      OUT_BATCH(GEN6_VE0_VALID |
> -                ((brw->vb.nr_buffers + 1) << GEN6_VE0_INDEX_SHIFT) |
> -                (ISL_FORMAT_R32_UINT << BRW_VE0_FORMAT_SHIFT));
> -      OUT_BATCH((BRW_VE1_COMPONENT_STORE_SRC << BRW_VE1_COMPONENT_0_SHIFT) |
> -                   (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_1_SHIFT) |
> -                   (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_2_SHIFT) |
> -                   (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_3_SHIFT));
> -   }
> -
> -   if (gen6_edgeflag_input) {
> -      uint32_t format =
> -         brw_get_vertex_surface_type(brw, gen6_edgeflag_input->glarray);
> -
> -      OUT_BATCH((gen6_edgeflag_input->buffer << GEN6_VE0_INDEX_SHIFT) |
> -                GEN6_VE0_VALID |
> -                GEN6_VE0_EDGE_FLAG_ENABLE |
> -                (format << BRW_VE0_FORMAT_SHIFT) |
> -                (gen6_edgeflag_input->offset << BRW_VE0_SRC_OFFSET_SHIFT));
> -      OUT_BATCH((BRW_VE1_COMPONENT_STORE_SRC << BRW_VE1_COMPONENT_0_SHIFT) |
> -                (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_1_SHIFT) |
> -                (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_2_SHIFT) |
> -                (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_3_SHIFT));
> -   }
> -   ADVANCE_BATCH();
> -
> -   for (unsigned i = 0, j = 0; i < brw->vb.nr_enabled; i++) {
> -      const struct brw_vertex_element *input = brw->vb.enabled[i];
> -      const struct brw_vertex_buffer *buffer = 
> &brw->vb.buffers[input->buffer];
> -      unsigned element_index;
> -
> -      /* The edge flag element is reordered to be the last one in the code
> -       * above so we need to compensate for that in the element indices used
> -       * below.
> -       */
> -      if (input == gen6_edgeflag_input)
> -         element_index = nr_elements - 1;
> -      else
> -         element_index = j++;
> -
> -      BEGIN_BATCH(3);
> -      OUT_BATCH(_3DSTATE_VF_INSTANCING << 16 | (3 - 2));
> -      OUT_BATCH(element_index |
> -                (buffer->step_rate ? GEN8_VF_INSTANCING_ENABLE : 0));
> -      OUT_BATCH(buffer->step_rate);
> -      ADVANCE_BATCH();
> -   }
> -
> -   if (vs_prog_data->uses_drawid) {
> -      const unsigned element = brw->vb.nr_enabled + needs_sgvs_element;
> -      BEGIN_BATCH(3);
> -      OUT_BATCH(_3DSTATE_VF_INSTANCING << 16 | (3 - 2));
> -      OUT_BATCH(element);
> -      OUT_BATCH(0);
> -      ADVANCE_BATCH();
> -   }
> -}
> -
> -const struct brw_tracked_state gen8_vertices = {
> -   .dirty = {
> -      .mesa = _NEW_POLYGON,
> -      .brw = BRW_NEW_BATCH |
> -             BRW_NEW_BLORP |
> -             BRW_NEW_VERTICES |
> -             BRW_NEW_VS_PROG_DATA,
> -   },
> -   .emit = gen8_emit_vertices,
> -};
> -
>  static void
>  gen8_emit_index_buffer(struct brw_context *brw)
>  {
> diff --git a/src/mesa/drivers/dri/i965/genX_state_upload.c 
> b/src/mesa/drivers/dri/i965/genX_state_upload.c
> index 3e6ffbd..df05b51 100644
> --- a/src/mesa/drivers/dri/i965/genX_state_upload.c
> +++ b/src/mesa/drivers/dri/i965/genX_state_upload.c
> @@ -26,10 +26,16 @@
>  #include "common/gen_device_info.h"
>  #include "genxml/gen_macros.h"
>  
> +#include "main/bufferobj.h"
> +#include "main/context.h"
> +#include "main/enums.h"
> +#include "main/macros.h"
> +
>  #include "brw_context.h"
>  #if GEN_GEN == 6
>  #include "brw_defines.h"
>  #endif
> +#include "brw_draw.h"
>  #include "brw_state.h"
>  #include "brw_wm.h"
>  #include "brw_util.h"
> @@ -125,6 +131,17 @@ instruction_bo(struct brw_bo *bo, uint32_t offset)
>     };
>  }
>  
> +static inline struct brw_address
> +vertex_bo(struct brw_bo *bo, uint32_t offset)
> +{
> +   return (struct brw_address) {
> +            .bo = bo,
> +            .offset = offset,
> +            .read_domains = I915_GEM_DOMAIN_VERTEX,
> +            .write_domain = 0,
> +   };
> +}
> +
>  #include "genxml/genX_pack.h"
>  
>  #define _brw_cmd_length(cmd) cmd ## _length
> @@ -158,6 +175,541 @@ instruction_bo(struct brw_bo *bo, uint32_t offset)
>          _brw_cmd_pack(cmd)(brw, (void *)_dst, &name),              \
>          _dst = NULL)
>  
> +static uint32_t *
> +genX(emit_vertex_buffer_state)(struct brw_context *brw,
> +                               uint32_t *dw,
> +                               unsigned buffer_nr,
> +                               struct brw_bo *bo,
> +                               unsigned start_offset,
> +                               unsigned end_offset,
> +                               unsigned stride,
> +                               unsigned step_rate)
> +{
> +   struct GENX(VERTEX_BUFFER_STATE) buf_state = {
> +      .VertexBufferIndex = buffer_nr,
> +      .BufferPitch = stride,
> +      .BufferStartingAddress = vertex_bo(bo, start_offset),
> +#if GEN_GEN >= 8
> +      .BufferSize = end_offset - start_offset,
> +#endif
> +
> +#if GEN_GEN >= 7
> +      .AddressModifyEnable = true,
> +#endif
> +
> +#if GEN_GEN < 8
> +      .BufferAccessType = step_rate ? INSTANCEDATA : VERTEXDATA,
> +      .InstanceDataStepRate = step_rate,
> +#if GEN_GEN >= 5
> +      .EndAddress = vertex_bo(bo, end_offset - 1),
> +#endif
> +#endif
> +
> +#if GEN_GEN == 9
> +      .VertexBufferMOCS = SKL_MOCS_WB,
> +#elif GEN_GEN == 8
> +      .VertexBufferMOCS = BDW_MOCS_WB,
> +#elif GEN_GEN == 7
> +      .VertexBufferMOCS = GEN7_MOCS_L3,
> +#endif
> +   };
> +
> +   GENX(VERTEX_BUFFER_STATE_pack)(brw, dw, &buf_state);
> +   return dw + GENX(VERTEX_BUFFER_STATE_length);
> +}
> +
> +UNUSED static bool
> +is_passthru_format(uint32_t format)
> +{
> +   switch (format) {
> +   case ISL_FORMAT_R64_PASSTHRU:
> +   case ISL_FORMAT_R64G64_PASSTHRU:
> +   case ISL_FORMAT_R64G64B64_PASSTHRU:
> +   case ISL_FORMAT_R64G64B64A64_PASSTHRU:
> +      return true;
> +   default:
> +      return false;
> +   }
> +}
> +
> +UNUSED static int
> +genX(uploads_needed)(uint32_t format)

As you mark this with UNUSED, does this need to be genX()-wrapped?

> +{
> +   if (!is_passthru_format(format))
> +      return 1;
> +
> +   switch (format) {
> +   case ISL_FORMAT_R64_PASSTHRU:
> +   case ISL_FORMAT_R64G64_PASSTHRU:
> +      return 1;
> +   case ISL_FORMAT_R64G64B64_PASSTHRU:
> +   case ISL_FORMAT_R64G64B64A64_PASSTHRU:
> +      return 2;
> +   default:
> +      unreachable("not reached");
> +   }
> +}
> +
> +/*
> + * Returns the format that we are finally going to use when upload a vertex
> + * element. It will only change if we are using *64*PASSTHRU formats, as for
> + * gen < 8 they need to be splitted on two *32*FLOAT formats.
> + *
> + * @upload points in which upload we are. Valid values are [0,1]
> + */
> +static uint32_t
> +downsize_format_if_needed(uint32_t format,
> +                          int upload)
> +{
> +   assert(upload == 0 || upload == 1);
> +
> +   if (!is_passthru_format(format))
> +      return format;
> +
> +   switch (format) {
> +   case ISL_FORMAT_R64_PASSTHRU:
> +      return ISL_FORMAT_R32G32_FLOAT;
> +   case ISL_FORMAT_R64G64_PASSTHRU:
> +      return ISL_FORMAT_R32G32B32A32_FLOAT;
> +   case ISL_FORMAT_R64G64B64_PASSTHRU:
> +      return !upload ? ISL_FORMAT_R32G32B32A32_FLOAT
> +                     : ISL_FORMAT_R32G32_FLOAT;
> +   case ISL_FORMAT_R64G64B64A64_PASSTHRU:
> +      return ISL_FORMAT_R32G32B32A32_FLOAT;
> +   default:
> +      unreachable("not reached");
> +   }
> +}
> +
> +/*
> + * Returns the number of componentes associated with a format that is used on
> + * a 64 to 32 format split. See downsize_format()
> + */
> +static int
> +upload_format_size(uint32_t upload_format)
> +{
> +   switch (upload_format) {
> +   case ISL_FORMAT_R32G32_FLOAT:
> +      return 2;
> +   case ISL_FORMAT_R32G32B32A32_FLOAT:
> +      return 4;
> +   default:
> +      unreachable("not reached");
> +   }
> +}
> +
> +static void
> +genX(emit_vertices)(struct brw_context *brw)
> +{
> +   uint32_t *dw;
> +
> +   brw_prepare_vertices(brw);
> +   brw_prepare_shader_draw_parameters(brw);
> +
> +#if GEN_GEN < 6
> +   brw_emit_query_begin(brw);
> +#endif
> +
> +   const struct brw_vs_prog_data *vs_prog_data =
> +      brw_vs_prog_data(brw->vs.base.prog_data);
> +
> +#if GEN_GEN >= 8
> +   struct gl_context *ctx = &brw->ctx;
> +   bool uses_edge_flag = (ctx->Polygon.FrontMode != GL_FILL ||

Could be const.

> +                          ctx->Polygon.BackMode != GL_FILL);
> +
> +   if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid) {
> +      unsigned vue = brw->vb.nr_enabled;
> +
> +      /* The element for the edge flags must always be last, so we have to
> +       * insert the SGVS before it in that case.
> +       */
> +      if (uses_edge_flag) {
> +         assert(vue > 0);
> +         vue--;
> +      }
> +
> +      WARN_ONCE(vue >= 33,
> +                "Trying to insert VID/IID past 33rd vertex element, "
> +                "need to reorder the vertex attrbutes.");
> +
> +      brw_batch_emit(brw, GENX(3DSTATE_VF_SGVS), vfs) {
> +         if (vs_prog_data->uses_vertexid) {
> +            vfs.VertexIDEnable = true;
> +            vfs.VertexIDComponentNumber = 2;
> +            vfs.VertexIDElementOffset = vue;
> +         }
> +
> +         if (vs_prog_data->uses_instanceid) {
> +            vfs.InstanceIDEnable = true;
> +            vfs.InstanceIDComponentNumber = 3;
> +            vfs.InstanceIDElementOffset = vue;
> +         }
> +      }
> +
> +      brw_batch_emit(brw, GENX(3DSTATE_VF_INSTANCING), vfi) {
> +         vfi.InstancingEnable = true;
> +         vfi.VertexElementIndex = vue;
> +      }
> +   } else {
> +      brw_batch_emit(brw, GENX(3DSTATE_VF_SGVS), vfs);
> +   }
> +
> +   /* Normally we don't need an element for the SGVS attribute because the
> +    * 3DSTATE_VF_SGVS instruction lets you store the generated attribute in 
> an
> +    * element that is past the list in 3DSTATE_VERTEX_ELEMENTS. However if
> +    * we're using draw parameters then we need an element for the those
> +    * values.  Additionally if there is an edge flag element then the SGVS
> +    * can't be inserted past that so we need a dummy element to ensure that
> +    * the edge flag is the last one.
> +    */
> +   const bool needs_sgvs_element = (vs_prog_data->uses_basevertex ||
> +                                    vs_prog_data->uses_baseinstance ||
> +                                    ((vs_prog_data->uses_instanceid ||
> +                                      vs_prog_data->uses_vertexid)
> +                                     && uses_edge_flag));
> +#else
> +   const bool needs_sgvs_element = (vs_prog_data->uses_basevertex ||
> +                                    vs_prog_data->uses_baseinstance ||
> +                                    vs_prog_data->uses_instanceid ||
> +                                    vs_prog_data->uses_vertexid);
> +#endif
> +   unsigned nr_elements =
> +      brw->vb.nr_enabled + needs_sgvs_element + vs_prog_data->uses_drawid;
> +
> +#if GEN_GEN < 8
> +   /* If any of the formats of vb.enabled needs more that one upload, we need
> +    * to add it to nr_elements
> +    */
> +   for (unsigned i = 0; i < brw->vb.nr_enabled; i++) {
> +      struct brw_vertex_element *input = brw->vb.enabled[i];
> +      uint32_t format = brw_get_vertex_surface_type(brw, input->glarray);
> +
> +      if (genX(uploads_needed(format)) > 1)
> +         nr_elements++;
> +   }
> +#endif
> +
> +   /* If the VS doesn't read any inputs (calculating vertex position from
> +    * a state variable for some reason, for example), emit a single pad
> +    * VERTEX_ELEMENT struct and bail.
> +    *
> +    * The stale VB state stays in place, but they don't do anything unless
> +    * a VE loads from them.
> +    */
> +   if (nr_elements == 0) {
> +      dw = brw_batch_emitn(brw, GENX(3DSTATE_VERTEX_ELEMENTS), 1 + 
> GENX(VERTEX_ELEMENT_STATE_length));

Wrap overflowing "1 + GENX(..." to next line.

> +      struct GENX(VERTEX_ELEMENT_STATE) elem = {
> +         .Valid = true,
> +         .SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT,
> +         .Component0Control = VFCOMP_STORE_0,
> +         .Component1Control = VFCOMP_STORE_0,
> +         .Component2Control = VFCOMP_STORE_0,
> +         .Component3Control = VFCOMP_STORE_1_FP,
> +      };
> +      GENX(VERTEX_ELEMENT_STATE_pack)(brw, dw, &elem);
> +      return;
> +   }
> +
> +   /* Now emit 3DSTATE_VERTEX_BUFFERS and 3DSTATE_VERTEX_ELEMENTS packets. */
> +   const bool uses_draw_params =
> +      vs_prog_data->uses_basevertex ||
> +      vs_prog_data->uses_baseinstance;
> +   const unsigned nr_buffers = brw->vb.nr_buffers +
> +      uses_draw_params + vs_prog_data->uses_drawid;
> +
> +   if (nr_buffers) {
> +#if GEN_GEN >= 6
> +      assert(nr_buffers <= 33);
> +#else
> +      assert(nr_buffers <= 17);
> +#endif

Either drop these five lines or the one below :)

> +      assert(nr_buffers <= (GEN_GEN >= 6 ? 33 : 17));
> +
> +      dw = brw_batch_emitn(brw, GENX(3DSTATE_VERTEX_BUFFERS),
> +                           1 + GENX(VERTEX_BUFFER_STATE_length) * 
> nr_buffers);
> +
> +      for (unsigned i = 0; i < brw->vb.nr_buffers; i++) {
> +         const struct brw_vertex_buffer *buffer = &brw->vb.buffers[i];
> +         /* Prior to Haswell and Bay Trail we have to use 4-component formats
> +          * to fake 3-component ones.  In particular, we do this for
> +          * half-float and 8 and 16-bit integer formats.  This means that the
> +          * vertex element may poke over the end of the buffer by 2 bytes.
> +          */
> +         unsigned padding =
> +            (GEN_GEN <= 7 && !brw->is_baytrail && !brw->is_haswell) * 2;

Could be const. And if we added:

            const unsigned end = buffer->offset + buffer->size + padding;

> +         dw = genX(emit_vertex_buffer_state)(brw, dw, i, buffer->bo,
> +                                             buffer->offset,
> +                                             buffer->offset + buffer->size + 
> padding,

we could use it here and avoid overflowing the line.

> +                                             buffer->stride,
> +                                             buffer->step_rate);
> +      }
> +
> +      if (uses_draw_params) {
> +         dw = genX(emit_vertex_buffer_state)(brw, dw, brw->vb.nr_buffers,
> +                                             brw->draw.draw_params_bo,
> +                                             brw->draw.draw_params_offset,
> +                                             brw->draw.draw_params_bo->size,
> +                                             0 /* stride */,
> +                                             0 /* step rate */);
> +      }
> +
> +      if (vs_prog_data->uses_drawid) {
> +         dw = genX(emit_vertex_buffer_state)(brw, dw, brw->vb.nr_buffers + 1,
> +                                             brw->draw.draw_id_bo,
> +                                             brw->draw.draw_id_offset,
> +                                             brw->draw.draw_id_bo->size,
> +                                             0 /* stride */,
> +                                             0 /* step rate */);
> +      }
> +   }
> +
> +   /* The hardware allows one more VERTEX_ELEMENTS than VERTEX_BUFFERS,
> +    * presumably for VertexID/InstanceID.
> +    */
> +#if GEN_GEN >= 6
> +   assert(nr_elements <= 34);
> +   struct brw_vertex_element *gen6_edgeflag_input = NULL;

Could be const; the contents are only read.

> +#else
> +   assert(nr_elements <= 18);
> +#endif
> +
> +   dw = brw_batch_emitn(brw, GENX(3DSTATE_VERTEX_ELEMENTS),
> +                        1 + GENX(VERTEX_ELEMENT_STATE_length) * nr_elements);
> +   unsigned i;
> +   for (i = 0; i < brw->vb.nr_enabled; i++) {
> +      struct brw_vertex_element *input = brw->vb.enabled[i];

Could be const.

> +      uint32_t format = brw_get_vertex_surface_type(brw, input->glarray);
> +      uint32_t comp0 = VFCOMP_STORE_SRC;
> +      uint32_t comp1 = VFCOMP_STORE_SRC;
> +      uint32_t comp2 = VFCOMP_STORE_SRC;
> +      uint32_t comp3 = VFCOMP_STORE_SRC;
> +      unsigned num_uploads = 1;

Would this be a little simpler (dropping the update below):

         const unsigned num_uploads =
            GEN_GEN < 8 ? genX(uploads_needed(format)) : 1;

> +
> +#if GEN_GEN >= 8
> +      /* From the BDW PRM, Volume 2d, page 588 (VERTEX_ELEMENT_STATE):
> +       * "Any SourceElementFormat of *64*_PASSTHRU cannot be used with an
> +       * element which has edge flag enabled."
> +       */
> +      assert(!(is_passthru_format(format) && uses_edge_flag));
> +#endif
> +
> +      /* The gen4 driver expects edgeflag to come in as a float, and passes
> +       * that float on to the tests in the clipper.  Mesa's current vertex
> +       * attribute value for EdgeFlag is stored as a float, which works out.
> +       * glEdgeFlagPointer, on the other hand, gives us an unnormalized
> +       * integer ubyte.  Just rewrite that to convert to a float.
> +       *
> +       * Gen6+ passes edgeflag as sideband along with the vertex, instead
> +       * of in the VUE.  We have to upload it sideband as the last vertex
> +       * element according to the B-Spec.
> +       */
> +#if GEN_GEN >= 6
> +      if (input == &brw->vb.inputs[VERT_ATTRIB_EDGEFLAG]) {
> +         gen6_edgeflag_input = input;
> +         continue;
> +      }
> +#endif
> +
> +#if GEN_GEN < 8
> +      num_uploads = genX(uploads_needed(format));
> +#endif
> +
> +      for (unsigned c = 0; c < num_uploads; c++) {
> +         uint32_t upload_format = GEN_GEN >= 8 ? format :

Could be const.

> +            downsize_format_if_needed(format, c);
> +         /* If we need more that one upload, the offset stride would be 128
> +          * bits (16 bytes), as for previous uploads we are using the full
> +          * entry. */
> +         unsigned int offset = input->offset + c * 16;

Could be const and simply "unsigned".

> +         int size = input->glarray->Size;
> +
> +         if (GEN_GEN < 8 && is_passthru_format(format))
> +            size = upload_format_size(upload_format);

Matter of taste, but this could also be:

            const int size = (GEN_GEN < 8 && is_passthru_format(format)) ?
               upload_format_size(upload_format) : input->glarray->Size;

> +
> +         switch (size) {
> +            case 0: comp0 = VFCOMP_STORE_0;
> +            case 1: comp1 = VFCOMP_STORE_0;
> +            case 2: comp2 = VFCOMP_STORE_0;
> +            case 3:
> +               if (GEN_GEN >= 8 && input->glarray->Doubles) {
> +                  comp3 = VFCOMP_STORE_0;
> +               } else if (input->glarray->Integer) {
> +                  comp3 = VFCOMP_STORE_1_INT;
> +               } else {
> +                  comp3 = VFCOMP_STORE_1_FP;
> +               }
> +
> +               break;
> +         }
> +
> +#if GEN_GEN >= 8
> +         /* From the BDW PRM, Volume 2d, page 586 (VERTEX_ELEMENT_STATE):
> +          *
> +          *     "When SourceElementFormat is set to one of the *64*_PASSTHRU
> +          *     formats, 64-bit components are stored in the URB without any
> +          *     conversion. In this case, vertex elements must be written as 
> 128
> +          *     or 256 bits, with VFCOMP_STORE_0 being used to pad the 
> output as
> +          *     required. E.g., if R64_PASSTHRU is used to copy a 64-bit Red
> +          *     component into the URB, Component 1 must be specified as
> +          *     VFCOMP_STORE_0 (with Components 2,3 set to VFCOMP_NOSTORE) in
> +          *     order to output a 128-bit vertex element, or Components 1-3 
> must
> +          *     be specified as VFCOMP_STORE_0 in order to output a 256-bit 
> vertex
> +          *     element. Likewise, use of R64G64B64_PASSTHRU requires 
> Component 3
> +          *     to be specified as VFCOMP_STORE_0 in order to output a 
> 256-bit
> +          *     vertex element."
> +          */
> +         if (input->glarray->Doubles && !input->is_dual_slot) {
> +            /* Store vertex elements which correspond to double and dvec2 
> vertex
> +             * shader inputs as 128-bit vertex elements, instead of 256-bits.
> +             */
> +            comp2 = VFCOMP_NOSTORE;
> +            comp3 = VFCOMP_NOSTORE;
> +         }
> +#endif
> +
> +         struct GENX(VERTEX_ELEMENT_STATE) elem_state = {
> +            .VertexBufferIndex = input->buffer,
> +            .Valid = true,
> +            .SourceElementFormat = upload_format,
> +            .SourceElementOffset = offset,
> +            .Component0Control = comp0,
> +            .Component1Control = comp1,
> +            .Component2Control = comp2,
> +            .Component3Control = comp3,
> +#if GEN_GEN < 5
> +            .DestinationElementOffset = i * 4,
> +#endif
> +         };
> +
> +         GENX(VERTEX_ELEMENT_STATE_pack)(brw, dw, &elem_state);
> +         dw += GENX(VERTEX_ELEMENT_STATE_length);
> +      }
> +   }
> +
> +   if (needs_sgvs_element) {
> +      struct GENX(VERTEX_ELEMENT_STATE) elem_state = {
> +         .Valid = true,
> +         .Component0Control = VFCOMP_STORE_0,
> +         .Component1Control = VFCOMP_STORE_0,
> +         .Component2Control = VFCOMP_STORE_0,
> +         .Component3Control = VFCOMP_STORE_0,
> +#if GEN_GEN < 5
> +         .DestinationElementOffset = i * 4,

This is how the original had it as well. I'm just wondering whether we should use this instead:

            .DestinationElementOffset = brw->vb.nr_buffers * 4,

At this point i == brw->vb.nr_buffers always holds, right?

> +#endif
> +      };
> +
> +#if GEN_GEN >= 8
> +      if (vs_prog_data->uses_basevertex ||
> +          vs_prog_data->uses_baseinstance) {
> +         elem_state.VertexBufferIndex = brw->vb.nr_buffers;
> +         elem_state.SourceElementFormat = ISL_FORMAT_R32G32_UINT;
> +         elem_state.Component0Control = VFCOMP_STORE_SRC;
> +         elem_state.Component1Control = VFCOMP_STORE_SRC;
> +      }
> +#else
> +      elem_state.VertexBufferIndex = brw->vb.nr_buffers;
> +      elem_state.SourceElementFormat = ISL_FORMAT_R32G32_UINT;
> +      if (vs_prog_data->uses_basevertex)
> +         elem_state.Component0Control = VFCOMP_STORE_SRC;
> +
> +      if (vs_prog_data->uses_baseinstance)
> +         elem_state.Component1Control = VFCOMP_STORE_SRC;
> +
> +      if (vs_prog_data->uses_vertexid)
> +         elem_state.Component2Control = VFCOMP_STORE_VID;
> +
> +      if (vs_prog_data->uses_instanceid)
> +         elem_state.Component3Control = VFCOMP_STORE_IID;
> +#endif
> +
> +      GENX(VERTEX_ELEMENT_STATE_pack)(brw, dw, &elem_state);
> +      dw += GENX(VERTEX_ELEMENT_STATE_length);
> +   }
> +
> +   if (vs_prog_data->uses_drawid) {
> +      struct GENX(VERTEX_ELEMENT_STATE) elem_state = {
> +         .Valid = true,
> +         .VertexBufferIndex = brw->vb.nr_buffers + 1,
> +         .SourceElementFormat = ISL_FORMAT_R32_UINT,
> +         .Component0Control = VFCOMP_STORE_SRC,
> +         .Component1Control = VFCOMP_STORE_0,
> +         .Component2Control = VFCOMP_STORE_0,
> +         .Component3Control = VFCOMP_STORE_0,
> +#if GEN_GEN < 5
> +         .DestinationElementOffset = i * 4,

Same comment as further up.

> +#endif
> +      };
> +
> +      GENX(VERTEX_ELEMENT_STATE_pack)(brw, dw, &elem_state);
> +      dw += GENX(VERTEX_ELEMENT_STATE_length);
> +   }
> +
> +#if GEN_GEN >= 6
> +   if (gen6_edgeflag_input) {
> +      uint32_t format =

Could be const.

> +         brw_get_vertex_surface_type(brw, gen6_edgeflag_input->glarray);
> +
> +      struct GENX(VERTEX_ELEMENT_STATE) elem_state = {
> +         .Valid = true,
> +         .VertexBufferIndex = gen6_edgeflag_input->buffer,
> +         .EdgeFlagEnable = true,
> +         .SourceElementFormat = format,
> +         .SourceElementOffset = gen6_edgeflag_input->offset,
> +         .Component0Control = VFCOMP_STORE_SRC,
> +         .Component1Control = VFCOMP_STORE_0,
> +         .Component2Control = VFCOMP_STORE_0,
> +         .Component3Control = VFCOMP_STORE_0,
> +      };
> +
> +      GENX(VERTEX_ELEMENT_STATE_pack)(brw, dw, &elem_state);
> +      dw += GENX(VERTEX_ELEMENT_STATE_length);
> +   }
> +#endif
> +
> +#if GEN_GEN >= 8
> +   for (unsigned i = 0, j = 0; i < brw->vb.nr_enabled; i++) {
> +      const struct brw_vertex_element *input = brw->vb.enabled[i];
> +      const struct brw_vertex_buffer *buffer = 
> &brw->vb.buffers[input->buffer];
> +      unsigned element_index;
> +
> +      /* The edge flag element is reordered to be the last one in the code
> +       * above so we need to compensate for that in the element indices used
> +       * below.
> +       */
> +      if (input == gen6_edgeflag_input)
> +         element_index = nr_elements - 1;
> +      else
> +         element_index = j++;
> +
> +      brw_batch_emit(brw, GENX(3DSTATE_VF_INSTANCING), vfi) {
> +         vfi.VertexElementIndex = element_index;
> +         vfi.InstancingEnable = buffer->step_rate != 0;
> +         vfi.InstanceDataStepRate = buffer->step_rate;
> +      }
> +   }
> +
> +   if (vs_prog_data->uses_drawid) {
> +      const unsigned element = brw->vb.nr_enabled + needs_sgvs_element;
> +
> +      brw_batch_emit(brw, GENX(3DSTATE_VF_INSTANCING), vfi) {
> +         vfi.VertexElementIndex = element;
> +      }
> +   }
> +#endif
> +}
> +
> +static const struct brw_tracked_state genX(vertices) = {
> +   .dirty = {
> +      .mesa = _NEW_POLYGON,
> +      .brw = BRW_NEW_BATCH |
> +             BRW_NEW_BLORP |
> +             BRW_NEW_VERTICES |
> +             BRW_NEW_VS_PROG_DATA,
> +   },
> +   .emit = genX(emit_vertices),
> +};
> +
>  #if GEN_GEN >= 6
>  /**
>   * Determine the appropriate attribute override value to store into the
> @@ -3004,7 +3556,7 @@ genX(init_atoms)(struct brw_context *brw)
>        &brw_drawing_rect,
>        &brw_indices, /* must come before brw_vertices */
>        &brw_index_buffer,
> -      &brw_vertices,
> +      &genX(vertices),
>  
>        &brw_constant_buffer
>     };
> @@ -3071,7 +3623,7 @@ genX(init_atoms)(struct brw_context *brw)
>  
>        &brw_indices, /* must come before brw_vertices */
>        &brw_index_buffer,
> -      &brw_vertices,
> +      &genX(vertices),
>     };
>  #elif GEN_GEN == 7
>     static const struct brw_tracked_state *render_atoms[] =
> @@ -3159,7 +3711,7 @@ genX(init_atoms)(struct brw_context *brw)
>  
>        &brw_indices, /* must come before brw_vertices */
>        &brw_index_buffer,
> -      &brw_vertices,
> +      &genX(vertices),
>  
>        &haswell_cut_index,
>     };
> @@ -3252,7 +3804,7 @@ genX(init_atoms)(struct brw_context *brw)
>  
>        &brw_indices,
>        &gen8_index_buffer,
> -      &gen8_vertices,
> +      &genX(vertices),
>  
>        &haswell_cut_index,
>        &gen8_pma_fix,
> -- 
> git-series 0.9.1
> _______________________________________________
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] [PATCH v03 35/38] i965: Port gen4+ emit vertices code to genxml.

Reply via email to