Re: [Mesa-dev] [PATCH 1/2] i965: Massively simplify the intel_upload implementation.

Kenneth Graunke Tue, 25 Mar 2014 14:33:06 -0700

On 03/25/2014 11:27 AM, Eric Anholt wrote:
> The implementation kept a page-sized area for uploading data, and
> uploaded chunks from that to a 64kb-sized streamed buffer.  This wasted
> cache footprint (and extra state tracking to do so) when we want to just
> write our data into the buffer immediately.
> 
> Instead, build it around an interface like brw_state_batch() that just
> gets you a pointer to BO memory to upload your stuff immediately.
> 
> Improves OpenArena on HSW by 1.62209% +/- 0.355299% (n=61) and on BYT by
> 1.7916% +/- 0.415743% (n=31).
> 
> v2: Rebase on Mesa master, drop old prototypes.  Re-do performance
>     comparison on a kernel that doesn't punish CPU efficiency
>     improvements.
> ---
>  src/mesa/drivers/dri/i965/brw_context.h          |   5 +-
>  src/mesa/drivers/dri/i965/brw_draw_upload.c      |  10 +-
>  src/mesa/drivers/dri/i965/intel_buffer_objects.h |  21 +--
>  src/mesa/drivers/dri/i965/intel_upload.c         | 167 
> +++++++++--------------
>  4 files changed, 77 insertions(+), 126 deletions(-)
> 
> diff --git a/src/mesa/drivers/dri/i965/brw_context.h 
> b/src/mesa/drivers/dri/i965/brw_context.h
> index 04af5d0..e119682 100644
> --- a/src/mesa/drivers/dri/i965/brw_context.h
> +++ b/src/mesa/drivers/dri/i965/brw_context.h
> @@ -1041,10 +1041,7 @@ struct brw_context
>  
>     struct {
>        drm_intel_bo *bo;
> -      GLuint offset;
> -      uint32_t buffer_len;
> -      uint32_t buffer_offset;
> -      char buffer[4096];
> +      uint32_t next_offset;
>     } upload;
>  
>     /**
> diff --git a/src/mesa/drivers/dri/i965/brw_draw_upload.c 
> b/src/mesa/drivers/dri/i965/brw_draw_upload.c
> index e261163..a579025 100644
> --- a/src/mesa/drivers/dri/i965/brw_draw_upload.c
> +++ b/src/mesa/drivers/dri/i965/brw_draw_upload.c
> @@ -381,21 +381,17 @@ copy_array_to_vbo_array(struct brw_context *brw,
>     const unsigned char *src = element->glarray->Ptr + min * src_stride;
>     int count = max - min + 1;
>     GLuint size = count * dst_stride;
> +   uint8_t *dst = intel_upload_space(brw, size, dst_stride,
> +                                     &buffer->bo, &buffer->offset);
>


It sure seems like these references will exist across batchbuffers,
meaning we may hold on to the upload buffer containing our vertex data
until we do copy_array_to_vbo_array on it again.  That seems
unfortunate.  (Or, maybe I'm wrong?)

However, I'm pretty sure the old code had that same problem, so your new
code is still a huge improvement.

Patch 1 is:
Reviewed-by: Kenneth Graunke <kenn...@whitecape.org>

>     if (dst_stride == src_stride) {
> -      intel_upload_data(brw, src, size, dst_stride,
> -                     &buffer->bo, &buffer->offset);
> +      memcpy(dst, src, size);
>     } else {
> -      char * const map = intel_upload_map(brw, size, dst_stride);
> -      char *dst = map;
> -
>        while (count--) {
>        memcpy(dst, src, dst_stride);
>        src += src_stride;
>        dst += dst_stride;
>        }
> -      intel_upload_unmap(brw, map, size, dst_stride,
> -                      &buffer->bo, &buffer->offset);
>     }
>     buffer->stride = dst_stride;
>  }
> diff --git a/src/mesa/drivers/dri/i965/intel_buffer_objects.h 
> b/src/mesa/drivers/dri/i965/intel_buffer_objects.h
> index b27d25f..5eaf9dc 100644
> --- a/src/mesa/drivers/dri/i965/intel_buffer_objects.h
> +++ b/src/mesa/drivers/dri/i965/intel_buffer_objects.h
> @@ -90,16 +90,17 @@ drm_intel_bo *intel_bufferobj_buffer(struct brw_context 
> *brw,
>                                       uint32_t size);
>  
>  void intel_upload_data(struct brw_context *brw,
> -                    const void *ptr, GLuint size, GLuint align,
> -                    drm_intel_bo **return_bo,
> -                    GLuint *return_offset);
> -
> -void *intel_upload_map(struct brw_context *brw,
> -                    GLuint size, GLuint align);
> -void intel_upload_unmap(struct brw_context *brw,
> -                     const void *ptr, GLuint size, GLuint align,
> -                     drm_intel_bo **return_bo,
> -                     GLuint *return_offset);
> +                       const void *data,
> +                       uint32_t size,
> +                       uint32_t alignment,
> +                       drm_intel_bo **out_bo,
> +                       uint32_t *out_offset);
> +
> +void *intel_upload_space(struct brw_context *brw,
> +                         uint32_t size,
> +                         uint32_t alignment,
> +                         drm_intel_bo **out_bo,
> +                         uint32_t *out_offset);
>  
>  void intel_upload_finish(struct brw_context *brw);
>  
> diff --git a/src/mesa/drivers/dri/i965/intel_upload.c 
> b/src/mesa/drivers/dri/i965/intel_upload.c
> index ec3109b..bb3f615 100644
> --- a/src/mesa/drivers/dri/i965/intel_upload.c
> +++ b/src/mesa/drivers/dri/i965/intel_upload.c
> @@ -57,127 +57,84 @@ intel_upload_finish(struct brw_context *brw)
>     if (!brw->upload.bo)
>        return;
>  
> -   if (brw->upload.buffer_len) {
> -      drm_intel_bo_subdata(brw->upload.bo,
> -                           brw->upload.buffer_offset,
> -                           brw->upload.buffer_len,
> -                           brw->upload.buffer);
> -      brw->upload.buffer_len = 0;
> -   }
> -
> +   drm_intel_bo_unmap(brw->upload.bo);
>     drm_intel_bo_unreference(brw->upload.bo);
>     brw->upload.bo = NULL;
> +   brw->upload.next_offset = 0;
>  }
>  
> -static void
> -wrap_buffers(struct brw_context *brw, GLuint size)
> -{
> -   intel_upload_finish(brw);
> -
> -   if (size < INTEL_UPLOAD_SIZE)
> -      size = INTEL_UPLOAD_SIZE;
> -
> -   brw->upload.bo = drm_intel_bo_alloc(brw->bufmgr, "upload", size, 0);
> -   brw->upload.offset = 0;
> -}
> -
> -void
> -intel_upload_data(struct brw_context *brw,
> -                  const void *ptr, GLuint size, GLuint align,
> -                  drm_intel_bo **return_bo,
> -                  GLuint *return_offset)
> -{
> -   GLuint base, delta;
> -
> -   base = ALIGN_NPOT(brw->upload.offset, align);
> -   if (brw->upload.bo == NULL || base + size > brw->upload.bo->size) {
> -      wrap_buffers(brw, size);
> -      base = 0;
> -   }
> -
> -   drm_intel_bo_reference(brw->upload.bo);
> -   *return_bo = brw->upload.bo;
> -   *return_offset = base;
> -
> -   delta = base - brw->upload.offset;
> -   if (brw->upload.buffer_len &&
> -       brw->upload.buffer_len + delta + size > sizeof(brw->upload.buffer)) {
> -      drm_intel_bo_subdata(brw->upload.bo,
> -                           brw->upload.buffer_offset,
> -                           brw->upload.buffer_len,
> -                           brw->upload.buffer);
> -      brw->upload.buffer_len = 0;
> -   }
> -
> -   if (size < sizeof(brw->upload.buffer)) {
> -      if (brw->upload.buffer_len == 0)
> -         brw->upload.buffer_offset = base;
> -      else
> -         brw->upload.buffer_len += delta;
> -
> -      memcpy(brw->upload.buffer + brw->upload.buffer_len, ptr, size);
> -      brw->upload.buffer_len += size;
> -   } else {
> -      drm_intel_bo_subdata(brw->upload.bo, base, size, ptr);
> -   }
> -
> -   brw->upload.offset = base + size;
> -}
> -
> +/**
> + * Interface for getting memory for uploading streamed data to the GPU
> + *
> + * In most cases, streamed data (for GPU state structures, for example) is
> + * uploaded through brw_state_batch(), since that interface allows 
> relocations
> + * from the streamed space returned to other BOs.  However, that interface 
> has
> + * the restriction that the amount of space allocated has to be "small" (see
> + * estimated_max_prim_size in brw_draw.c).
> + *
> + * This interface, on the other hand, is able to handle arbitrary sized
> + * allocation requests, though it will batch small allocations into the same
> + * BO for efficiency and reduced memory footprint.
> + *
> + * \note The returned pointer is valid only until intel_upload_finish(), 
> which
> + * will happen at batch flush or the next
> + * intel_upload_space()/intel_upload_data().
> + *
> + * \param out_bo Pointer to a BO, which must point to a valid BO or NULL on
> + * entry, and will have a reference to the new BO containing the state on
> + * return.
> + *
> + * \param out_offset Offset within the buffer object that the data will land.
> + */
>  void *
> -intel_upload_map(struct brw_context *brw, GLuint size, GLuint align)
> +intel_upload_space(struct brw_context *brw,
> +                   uint32_t size,
> +                   uint32_t alignment,
> +                   drm_intel_bo **out_bo,
> +                   uint32_t *out_offset)
>  {
> -   GLuint base, delta;
> -   char *ptr;
> +   uint32_t offset;
>  
> -   base = ALIGN_NPOT(brw->upload.offset, align);
> -   if (brw->upload.bo == NULL || base + size > brw->upload.bo->size) {
> -      wrap_buffers(brw, size);
> -      base = 0;
> +   offset = ALIGN_NPOT(brw->upload.next_offset, alignment);
> +   if (brw->upload.bo && offset + size > brw->upload.bo->size) {
> +      intel_upload_finish(brw);
> +      offset = 0;
>     }
>  
> -   delta = base - brw->upload.offset;
> -   if (brw->upload.buffer_len &&
> -       brw->upload.buffer_len + delta + size > sizeof(brw->upload.buffer)) {
> -      drm_intel_bo_subdata(brw->upload.bo,
> -                           brw->upload.buffer_offset,
> -                           brw->upload.buffer_len,
> -                           brw->upload.buffer);
> -      brw->upload.buffer_len = 0;
> +   if (!brw->upload.bo) {
> +      brw->upload.bo = drm_intel_bo_alloc(brw->bufmgr, "streamed data",
> +                                          MAX2(INTEL_UPLOAD_SIZE, size), 
> 4096);
> +      if (brw->has_llc)
> +         drm_intel_bo_map(brw->upload.bo, true);
> +      else
> +         drm_intel_gem_bo_map_gtt(brw->upload.bo);
>     }
>  
> -   if (size <= sizeof(brw->upload.buffer)) {
> -      if (brw->upload.buffer_len == 0)
> -         brw->upload.buffer_offset = base;
> -      else
> -         brw->upload.buffer_len += delta;
> +   brw->upload.next_offset = offset + size;
>  
> -      ptr = brw->upload.buffer + brw->upload.buffer_len;
> -      brw->upload.buffer_len += size;
> -   } else {
> -      ptr = malloc(size);
> +   *out_offset = offset;
> +   if (*out_bo != brw->upload.bo) {
> +      drm_intel_bo_unreference(*out_bo);
> +      *out_bo = brw->upload.bo;
> +      drm_intel_bo_reference(brw->upload.bo);
>     }
>  
> -   return ptr;
> +   return brw->upload.bo->virtual + offset;
>  }
>  
> +/**
> + * Handy interface to upload some data to temporary GPU memory quickly.
> + *
> + * References to this memory should not be retained across batch flushes.
> + */
>  void
> -intel_upload_unmap(struct brw_context *brw,
> -                   const void *ptr, GLuint size, GLuint align,
> -                   drm_intel_bo **return_bo,
> -                   GLuint *return_offset)
> +intel_upload_data(struct brw_context *brw,
> +                  const void *data,
> +                  uint32_t size,
> +                  uint32_t alignment,
> +                  drm_intel_bo **out_bo,
> +                  uint32_t *out_offset)
>  {
> -   GLuint base;
> -
> -   base = ALIGN_NPOT(brw->upload.offset, align);
> -   if (size > sizeof(brw->upload.buffer)) {
> -      drm_intel_bo_subdata(brw->upload.bo, base, size, ptr);
> -      free((void*)ptr);
> -   }
> -
> -   drm_intel_bo_reference(brw->upload.bo);
> -   *return_bo = brw->upload.bo;
> -   *return_offset = base;
> -
> -   brw->upload.offset = base + size;
> +   void *dst = intel_upload_space(brw, size, alignment, out_bo, out_offset);
> +   memcpy(dst, data, size);
>  }
>

signature.asc
Description: OpenPGP digital signature

_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] [PATCH 1/2] i965: Massively simplify the intel_upload implementation.

Reply via email to