The primary benefit of this is that we get format conversion for "free", along with detiling and cache flushing (most relevant for !llc). Using the GPU does impose a bandwidth cost that is presumably better used for rendering, hence we limit the use to readback into client memory (not PBO) where we would need to stall on the GPU anyway. (Uploads remain direct/staged to avoid the synchronisation cost.) And we only use the GPU path if a direct read into client memory from video memory is unavailable.
The ultimate user of this is Xorg/glamor! On byt, bsw, bxt (and presumably, though not measured, ilk), x11perf -shmget500 is improved 15-fold. Conversely, the overhead of executing and waiting upon an additional blorp batch is shown by x11perf -shmget10 being reduced by a factor of 2. I think it is fair to presume that large copies will dominate (and that the overhead of a single batch is something that we can iteratively reduce, for the benefit of all). llc machines continue to use direct access where there are no format changes (which one hopes is the typical use case). Cc: Jason Ekstrand <jason.ekstr...@intel.com> Cc: Topi Pohjolainen <topi.pohjolai...@intel.com> Cc: Kenneth Graunke <kenn...@whitecape.org> --- src/mesa/drivers/dri/i965/brw_blorp.c | 34 ++++++++++++++++++++++------ src/mesa/drivers/dri/i965/intel_pixel_read.c | 21 ++++++++--------- src/mesa/drivers/dri/i965/intel_tex_image.c | 27 +++++++++++----------- 3 files changed, 50 insertions(+), 32 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_blorp.c b/src/mesa/drivers/dri/i965/brw_blorp.c index ed4f9870f2..c2a549b204 100644 --- a/src/mesa/drivers/dri/i965/brw_blorp.c +++ b/src/mesa/drivers/dri/i965/brw_blorp.c @@ -36,6 +36,7 @@ #include "brw_defines.h" #include "brw_meta_util.h" #include "brw_state.h" +#include "intel_batchbuffer.h" #include "intel_buffer_objects.h" #include "intel_fbo.h" #include "common/gen_debug.h" @@ -806,13 +807,10 @@ blorp_get_client_bo(struct brw_context *brw, *offset_out = offset; return bo; - } else { + } else if (read_only) { /* Someone should have already checked that there is data to upload. */ assert(pixels); - /* Creating a temp buffer currently only works for upload */ - assert(read_only); - /* This is not a user-provided PBO. Instead, pixels is a pointer to CPU * data which we need to copy into a BO. 
*/ @@ -832,6 +830,23 @@ blorp_get_client_bo(struct brw_context *brw, *offset_out = 0; return bo; + } else if (brw->screen->kernel_features & KERNEL_ALLOWS_USERPTR) { + void *addr = (void *)pixels + first_pixel; + void *first_page = (void *)((GLintptr)addr & -4096); + void *last_page = (void *)(ALIGN((GLintptr)(pixels + last_pixel), 4096)); + + struct brw_bo *bo = + brw_bo_alloc_userptr(brw->bufmgr, "tex_subimage_userptr", + first_page, last_page - first_page, 0); + if (bo == NULL) { + perf_debug("intel_texsubimage: userptr mapping failed\n"); + return NULL; + } + + *offset_out = addr - first_page; + return bo; + } else { + return NULL; } } @@ -974,6 +989,10 @@ brw_blorp_upload_miptree(struct brw_context *brw, result = true; err: + if (src_bo->userptr) { + intel_batchbuffer_flush(brw); + brw_bo_wait_rendering(src_bo); + } brw_bo_unreference(src_bo); return result; @@ -1019,9 +1038,6 @@ brw_blorp_download_miptree(struct brw_context *brw, break; } - /* This pass only works for PBOs */ - assert(_mesa_is_bufferobj(packing->BufferObj)); - uint32_t dst_offset, dst_row_stride, dst_image_stride; struct brw_bo *dst_bo = blorp_get_client_bo(brw, width, height, depth, @@ -1117,6 +1133,10 @@ brw_blorp_download_miptree(struct brw_context *brw, brw_emit_mi_flush(brw); err: + if (dst_bo->userptr) { + intel_batchbuffer_flush(brw); + brw_bo_wait_rendering(dst_bo); + } brw_bo_unreference(dst_bo); return result; diff --git a/src/mesa/drivers/dri/i965/intel_pixel_read.c b/src/mesa/drivers/dri/i965/intel_pixel_read.c index 4528d6d265..699ce73b0f 100644 --- a/src/mesa/drivers/dri/i965/intel_pixel_read.c +++ b/src/mesa/drivers/dri/i965/intel_pixel_read.c @@ -259,8 +259,6 @@ intelReadPixels(struct gl_context * ctx, GLenum format, GLenum type, const struct gl_pixelstore_attrib *pack, GLvoid * pixels) { - bool ok; - struct brw_context *brw = brw_context(ctx); bool dirty; @@ -273,18 +271,19 @@ intelReadPixels(struct gl_context * ctx, intel_prepare_render(brw); brw->front_buffer_dirty = dirty; 
- if (_mesa_is_bufferobj(pack->BufferObj)) { - if (intel_readpixels_blorp(ctx, x, y, width, height, - format, type, pixels, pack)) - return; - - perf_debug("%s: fallback to CPU mapping in PBO case\n", __func__); + if (!_mesa_is_bufferobj(pack->BufferObj) && + intel_readpixels_tiled_memcpy(ctx, x, y, width, height, + format, type, pixels, pack)) { + return; } - ok = intel_readpixels_tiled_memcpy(ctx, x, y, width, height, - format, type, pixels, pack); - if(ok) + if (intel_readpixels_blorp(ctx, x, y, width, height, + format, type, pixels, pack)) { return; + } + + perf_debug("%s: fallback to CPU mapping for %s\n", __func__, + _mesa_is_bufferobj(pack->BufferObj) ? "PBO" : "memory"); /* Update Mesa state before calling _mesa_readpixels(). * XXX this may not be needed since ReadPixels no longer uses the diff --git a/src/mesa/drivers/dri/i965/intel_tex_image.c b/src/mesa/drivers/dri/i965/intel_tex_image.c index 5396e0a43b..74d10f29f1 100644 --- a/src/mesa/drivers/dri/i965/intel_tex_image.c +++ b/src/mesa/drivers/dri/i965/intel_tex_image.c @@ -739,26 +739,25 @@ intel_get_tex_sub_image(struct gl_context *ctx, struct gl_texture_image *texImage) { struct brw_context *brw = brw_context(ctx); - bool ok; DBG("%s\n", __func__); - if (_mesa_is_bufferobj(ctx->Pack.BufferObj)) { - if (intel_gettexsubimage_blorp(brw, texImage, - xoffset, yoffset, zoffset, - width, height, depth, format, type, - pixels, &ctx->Pack)) - return; - - perf_debug("%s: fallback to CPU mapping in PBO case\n", __func__); + if (!_mesa_is_bufferobj(ctx->Pack.BufferObj) && + intel_gettexsubimage_tiled_memcpy(ctx, texImage, xoffset, yoffset, + width, height, + format, type, pixels, &ctx->Pack)) { + return; } - ok = intel_gettexsubimage_tiled_memcpy(ctx, texImage, xoffset, yoffset, - width, height, - format, type, pixels, &ctx->Pack); - - if(ok) + if (intel_gettexsubimage_blorp(brw, texImage, + xoffset, yoffset, zoffset, + width, height, depth, format, type, + pixels, &ctx->Pack)) { return; + } + + perf_debug("%s: 
fallback to CPU mapping for %s\n", __func__, + _mesa_is_bufferobj(ctx->Pack.BufferObj) ? "PBO" : "memory"); _mesa_meta_GetTexSubImage(ctx, xoffset, yoffset, zoffset, width, height, depth, -- 2.15.0.rc0 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev