On 03/25/2014 11:27 AM, Eric Anholt wrote: > The implementation kept a page-sized area for uploading data, and > uploaded chunks from that to a 64kb-sized streamed buffer. This wasted > cache footprint (and extra state tracking to do so) when we want to just > write our data into the buffer immediately. > > Instead, build it around an interface like brw_state_batch() that just > gets you a pointer to BO memory to upload your stuff immediately. > > Improves OpenArena on HSW by 1.62209% +/- 0.355299% (n=61) and on BYT by > 1.7916% +/- 0.415743% (n=31). > > v2: Rebase on Mesa master, drop old prototypes. Re-do performance > comparison on a kernel that doesn't punish CPU efficiency > improvements. > --- > src/mesa/drivers/dri/i965/brw_context.h | 5 +- > src/mesa/drivers/dri/i965/brw_draw_upload.c | 10 +- > src/mesa/drivers/dri/i965/intel_buffer_objects.h | 21 +-- > src/mesa/drivers/dri/i965/intel_upload.c | 167 > +++++++++-------------- > 4 files changed, 77 insertions(+), 126 deletions(-) > > diff --git a/src/mesa/drivers/dri/i965/brw_context.h > b/src/mesa/drivers/dri/i965/brw_context.h > index 04af5d0..e119682 100644 > --- a/src/mesa/drivers/dri/i965/brw_context.h > +++ b/src/mesa/drivers/dri/i965/brw_context.h > @@ -1041,10 +1041,7 @@ struct brw_context > > struct { > drm_intel_bo *bo; > - GLuint offset; > - uint32_t buffer_len; > - uint32_t buffer_offset; > - char buffer[4096]; > + uint32_t next_offset; > } upload; > > /** > diff --git a/src/mesa/drivers/dri/i965/brw_draw_upload.c > b/src/mesa/drivers/dri/i965/brw_draw_upload.c > index e261163..a579025 100644 > --- a/src/mesa/drivers/dri/i965/brw_draw_upload.c > +++ b/src/mesa/drivers/dri/i965/brw_draw_upload.c > @@ -381,21 +381,17 @@ copy_array_to_vbo_array(struct brw_context *brw, > const unsigned char *src = element->glarray->Ptr + min * src_stride; > int count = max - min + 1; > GLuint size = count * dst_stride; > + uint8_t *dst = intel_upload_space(brw, size, dst_stride, > + &buffer->bo, &buffer->offset); >
It sure seems like these references will exist across batchbuffers, meaning we may hold on to the upload buffer containing our vertex data until we do copy_array_to_vbo_array on it again. That seems unfortunate. (Or, maybe I'm wrong?) However, I'm pretty sure the old code had that same problem, so your new code is still a huge improvement. Patch 1 is: Reviewed-by: Kenneth Graunke <kenn...@whitecape.org> > if (dst_stride == src_stride) { > - intel_upload_data(brw, src, size, dst_stride, > - &buffer->bo, &buffer->offset); > + memcpy(dst, src, size); > } else { > - char * const map = intel_upload_map(brw, size, dst_stride); > - char *dst = map; > - > while (count--) { > memcpy(dst, src, dst_stride); > src += src_stride; > dst += dst_stride; > } > - intel_upload_unmap(brw, map, size, dst_stride, > - &buffer->bo, &buffer->offset); > } > buffer->stride = dst_stride; > } > diff --git a/src/mesa/drivers/dri/i965/intel_buffer_objects.h > b/src/mesa/drivers/dri/i965/intel_buffer_objects.h > index b27d25f..5eaf9dc 100644 > --- a/src/mesa/drivers/dri/i965/intel_buffer_objects.h > +++ b/src/mesa/drivers/dri/i965/intel_buffer_objects.h > @@ -90,16 +90,17 @@ drm_intel_bo *intel_bufferobj_buffer(struct brw_context > *brw, > uint32_t size); > > void intel_upload_data(struct brw_context *brw, > - const void *ptr, GLuint size, GLuint align, > - drm_intel_bo **return_bo, > - GLuint *return_offset); > - > -void *intel_upload_map(struct brw_context *brw, > - GLuint size, GLuint align); > -void intel_upload_unmap(struct brw_context *brw, > - const void *ptr, GLuint size, GLuint align, > - drm_intel_bo **return_bo, > - GLuint *return_offset); > + const void *data, > + uint32_t size, > + uint32_t alignment, > + drm_intel_bo **out_bo, > + uint32_t *out_offset); > + > +void *intel_upload_space(struct brw_context *brw, > + uint32_t size, > + uint32_t alignment, > + drm_intel_bo **out_bo, > + uint32_t *out_offset); > > void intel_upload_finish(struct brw_context *brw); > > diff --git a/src/mesa/drivers/dri/i965/intel_upload.c > b/src/mesa/drivers/dri/i965/intel_upload.c > index ec3109b..bb3f615 100644 > --- a/src/mesa/drivers/dri/i965/intel_upload.c > +++ b/src/mesa/drivers/dri/i965/intel_upload.c > @@ -57,127 +57,84 @@ intel_upload_finish(struct brw_context *brw) > if (!brw->upload.bo) > return; > > - if (brw->upload.buffer_len) { > - drm_intel_bo_subdata(brw->upload.bo, > - brw->upload.buffer_offset, > - brw->upload.buffer_len, > - brw->upload.buffer); > - brw->upload.buffer_len = 0; > - } > - > + drm_intel_bo_unmap(brw->upload.bo); > drm_intel_bo_unreference(brw->upload.bo); > brw->upload.bo = NULL; > + brw->upload.next_offset = 0; > } > > -static void > -wrap_buffers(struct brw_context *brw, GLuint size) > -{ > - intel_upload_finish(brw); > - > - if (size < INTEL_UPLOAD_SIZE) > - size = INTEL_UPLOAD_SIZE; > - > - brw->upload.bo = drm_intel_bo_alloc(brw->bufmgr, "upload", size, 0); > - brw->upload.offset = 0; > -} > - > -void > -intel_upload_data(struct brw_context *brw, > - const void *ptr, GLuint size, GLuint align, > - drm_intel_bo **return_bo, > - GLuint *return_offset) > -{ > - GLuint base, delta; > - > - base = ALIGN_NPOT(brw->upload.offset, align); > - if (brw->upload.bo == NULL || base + size > brw->upload.bo->size) { > - wrap_buffers(brw, size); > - base = 0; > - } > - > - drm_intel_bo_reference(brw->upload.bo); > - *return_bo = brw->upload.bo; > - *return_offset = base; > - > - delta = base - brw->upload.offset; > - if (brw->upload.buffer_len && > - brw->upload.buffer_len + delta + size > sizeof(brw->upload.buffer)) { > - drm_intel_bo_subdata(brw->upload.bo, > - brw->upload.buffer_offset, > - brw->upload.buffer_len, > - brw->upload.buffer); > - brw->upload.buffer_len = 0; > - } > - > - if (size < sizeof(brw->upload.buffer)) { > - if (brw->upload.buffer_len == 0) > - brw->upload.buffer_offset = base; > - else > - brw->upload.buffer_len += delta; > - > - memcpy(brw->upload.buffer + brw->upload.buffer_len, ptr, size); > - brw->upload.buffer_len += size; > - } else { > - drm_intel_bo_subdata(brw->upload.bo, base, size, ptr); > - } > - > - brw->upload.offset = base + size; > -} > - > +/** > + * Interface for getting memory for uploading streamed data to the GPU > + * > + * In most cases, streamed data (for GPU state structures, for example) is > + * uploaded through brw_state_batch(), since that interface allows > relocations > + * from the streamed space returned to other BOs. However, that interface > has > + * the restriction that the amount of space allocated has to be "small" (see > + * estimated_max_prim_size in brw_draw.c). > + * > + * This interface, on the other hand, is able to handle arbitrary sized > + * allocation requests, though it will batch small allocations into the same > + * BO for efficiency and reduced memory footprint. > + * > + * \note The returned pointer is valid only until intel_upload_finish(), > which > + * will happen at batch flush or the next > + * intel_upload_space()/intel_upload_data(). > + * > + * \param out_bo Pointer to a BO, which must point to a valid BO or NULL on > + * entry, and will have a reference to the new BO containing the state on > + * return. > + * > + * \param out_offset Offset within the buffer object that the data will land. > + */ > void * > -intel_upload_map(struct brw_context *brw, GLuint size, GLuint align) > +intel_upload_space(struct brw_context *brw, > + uint32_t size, > + uint32_t alignment, > + drm_intel_bo **out_bo, > + uint32_t *out_offset) > { > - GLuint base, delta; > - char *ptr; > + uint32_t offset; > > - base = ALIGN_NPOT(brw->upload.offset, align); > - if (brw->upload.bo == NULL || base + size > brw->upload.bo->size) { > - wrap_buffers(brw, size); > - base = 0; > + offset = ALIGN_NPOT(brw->upload.next_offset, alignment); > + if (brw->upload.bo && offset + size > brw->upload.bo->size) { > + intel_upload_finish(brw); > + offset = 0; > } > > - delta = base - brw->upload.offset; > - if (brw->upload.buffer_len && > - brw->upload.buffer_len + delta + size > sizeof(brw->upload.buffer)) { > - drm_intel_bo_subdata(brw->upload.bo, > - brw->upload.buffer_offset, > - brw->upload.buffer_len, > - brw->upload.buffer); > - brw->upload.buffer_len = 0; > + if (!brw->upload.bo) { > + brw->upload.bo = drm_intel_bo_alloc(brw->bufmgr, "streamed data", > + MAX2(INTEL_UPLOAD_SIZE, size), > 4096); > + if (brw->has_llc) > + drm_intel_bo_map(brw->upload.bo, true); > + else > + drm_intel_gem_bo_map_gtt(brw->upload.bo); > } > > - if (size <= sizeof(brw->upload.buffer)) { > - if (brw->upload.buffer_len == 0) > - brw->upload.buffer_offset = base; > - else > - brw->upload.buffer_len += delta; > + brw->upload.next_offset = offset + size; > > - ptr = brw->upload.buffer + brw->upload.buffer_len; > - brw->upload.buffer_len += size; > - } else { > - ptr = malloc(size); > + *out_offset = offset; > + if (*out_bo != brw->upload.bo) { > + drm_intel_bo_unreference(*out_bo); > + *out_bo = brw->upload.bo; > + drm_intel_bo_reference(brw->upload.bo); > } > > - return ptr; > + return brw->upload.bo->virtual + offset; > } > > +/** > + * Handy interface to upload some data to temporary GPU memory quickly. > + * > + * References to this memory should not be retained across batch flushes. > + */ > void > -intel_upload_unmap(struct brw_context *brw, > - const void *ptr, GLuint size, GLuint align, > - drm_intel_bo **return_bo, > - GLuint *return_offset) > +intel_upload_data(struct brw_context *brw, > + const void *data, > + uint32_t size, > + uint32_t alignment, > + drm_intel_bo **out_bo, > + uint32_t *out_offset) > { > - GLuint base; > - > - base = ALIGN_NPOT(brw->upload.offset, align); > - if (size > sizeof(brw->upload.buffer)) { > - drm_intel_bo_subdata(brw->upload.bo, base, size, ptr); > - free((void*)ptr); > - } > - > - drm_intel_bo_reference(brw->upload.bo); > - *return_bo = brw->upload.bo; > - *return_offset = base; > - > - brw->upload.offset = base + size; > + void *dst = intel_upload_space(brw, size, alignment, out_bo, out_offset); > + memcpy(dst, data, size); > } >
signature.asc
Description: OpenPGP digital signature
_______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev