From: Marek Olšák <marek.ol...@amd.com>

---
 src/gallium/drivers/radeonsi/si_buffer.c | 56 ++++++++++++++++++++++--
 src/gallium/drivers/radeonsi/si_dma_cs.c | 19 ++++----
 src/gallium/drivers/radeonsi/si_gfx_cs.c | 42 +++++++++++++++---
 src/gallium/drivers/radeonsi/si_pipe.c   | 23 ++++++----
 src/gallium/drivers/radeonsi/si_pipe.h   | 17 +++++++
 5 files changed, 131 insertions(+), 26 deletions(-)
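Note for reviewers (not part of the commit message): the patch records each
explicit const_uploader flush as a (src, dst, offsets, size) entry in a small
growable array on the context, merges consecutive flushes of the same
destination, and replays all entries through dma_copy on the SDMA ring when
the GFX IB is flushed. The sketch below is a minimal, standalone illustration
of that record/merge/replay bookkeeping only; the names (struct upload,
record_upload, replay_uploads) are invented stand-ins and not driver code.

/* Standalone sketch of the record/merge/replay bookkeeping this patch adds.
 * All names are simplified stand-ins for the si_* equivalents. */
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

struct upload {
	int dst_id;          /* stand-in for the destination si_resource */
	int src_id;          /* stand-in for the staging si_resource */
	unsigned dst_offset;
	unsigned src_offset;
	unsigned size;
};

static struct upload *uploads;
static unsigned num_uploads, max_uploads;

/* Mirrors si_buffer_do_flush_region: extend a previously recorded range
 * for the same destination if one exists, otherwise append a new entry. */
static void record_upload(int dst_id, int src_id, unsigned dst_offset,
                          unsigned src_offset, unsigned size)
{
	/* The last upload for this buffer is likely at the end of the list. */
	for (int i = (int)num_uploads - 1; i >= 0; i--) {
		if (uploads[i].dst_id != dst_id)
			continue;
		assert(dst_offset > uploads[i].dst_offset);
		uploads[i].size = dst_offset + size - uploads[i].dst_offset;
		return;
	}

	/* Enlarge the array if it's full (as the patch does, in steps of 4). */
	if (num_uploads == max_uploads) {
		max_uploads += 4;
		uploads = realloc(uploads, max_uploads * sizeof(uploads[0]));
	}
	uploads[num_uploads++] =
		(struct upload){dst_id, src_id, dst_offset, src_offset, size};
}

/* Mirrors the loop in si_flush_gfx_cs: replay every recorded copy
 * (the driver issues ctx->dma_copy here), then reset the list. */
static void replay_uploads(void)
{
	for (unsigned i = 0; i < num_uploads; i++)
		printf("SDMA copy: src %d+%u -> dst %d+%u, %u bytes\n",
		       uploads[i].src_id, uploads[i].src_offset,
		       uploads[i].dst_id, uploads[i].dst_offset,
		       uploads[i].size);
	num_uploads = 0;
}

int main(void)
{
	record_upload(1, 100, 0, 0, 256);    /* first flush of buffer 1 */
	record_upload(1, 100, 256, 256, 64); /* a later flush extends it */
	replay_uploads();                    /* one merged 320-byte copy */
	return 0;
}

Merging consecutive flush ranges keeps the list short, so the replay loop in
si_flush_gfx_cs typically issues a single SDMA copy per destination buffer.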
diff --git a/src/gallium/drivers/radeonsi/si_buffer.c b/src/gallium/drivers/radeonsi/si_buffer.c
index c01118ce96a..3f8db7cf4f0 100644
--- a/src/gallium/drivers/radeonsi/si_buffer.c
+++ b/src/gallium/drivers/radeonsi/si_buffer.c
@@ -433,21 +433,29 @@ static void *si_buffer_transfer_map(struct pipe_context *ctx,
 		if (si_invalidate_buffer(sctx, buf)) {
 			/* At this point, the buffer is always idle. */
 			usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
 		} else {
 			/* Fall back to a temporary buffer. */
 			usage |= PIPE_TRANSFER_DISCARD_RANGE;
 		}
 	}
 
-	if ((usage & PIPE_TRANSFER_DISCARD_RANGE) &&
+	if (usage & PIPE_TRANSFER_FLUSH_EXPLICIT &&
+	    buf->b.b.flags & SI_RESOURCE_FLAG_UPLOAD_FLUSH_EXPLICIT_VIA_SDMA) {
+		usage &= ~(PIPE_TRANSFER_UNSYNCHRONIZED |
+			   PIPE_TRANSFER_PERSISTENT);
+		usage |= PIPE_TRANSFER_DISCARD_RANGE;
+		force_discard_range = true;
+	}
+
+	if (usage & PIPE_TRANSFER_DISCARD_RANGE &&
 	    ((!(usage & (PIPE_TRANSFER_UNSYNCHRONIZED |
 			 PIPE_TRANSFER_PERSISTENT))) ||
 	     (buf->flags & RADEON_FLAG_SPARSE))) {
 		assert(usage & PIPE_TRANSFER_WRITE);
 
 		/* Check if mapping this buffer would cause waiting for the GPU.
 		 */
 		if (buf->flags & RADEON_FLAG_SPARSE ||
 		    force_discard_range ||
 		    si_rings_is_buffer_referenced(sctx, buf->buf, RADEON_USAGE_READWRITE) ||
@@ -514,32 +522,72 @@ static void *si_buffer_transfer_map(struct pipe_context *ctx,
 	data += box->x;
 
 	return si_buffer_get_transfer(ctx, resource, usage, box, ptransfer,
 				      data, NULL, 0);
 }
 
 static void si_buffer_do_flush_region(struct pipe_context *ctx,
 				      struct pipe_transfer *transfer,
 				      const struct pipe_box *box)
 {
+	struct si_context *sctx = (struct si_context*)ctx;
 	struct si_transfer *stransfer = (struct si_transfer*)transfer;
 	struct si_resource *buf = si_resource(transfer->resource);
 
 	if (stransfer->staging) {
 		unsigned src_offset = stransfer->offset +
 				      transfer->box.x % SI_MAP_BUFFER_ALIGNMENT +
 				      (box->x - transfer->box.x);
 
+		if (buf->b.b.flags & SI_RESOURCE_FLAG_UPLOAD_FLUSH_EXPLICIT_VIA_SDMA) {
+			/* This should be true for all uploaders. */
+			assert(transfer->box.x == 0);
+
+			/* Find a previous upload and extend its range. The last
+			 * upload is likely to be at the end of the list.
+			 */
+			for (int i = sctx->num_sdma_uploads - 1; i >= 0; i--) {
+				struct si_sdma_upload *up = &sctx->sdma_uploads[i];
+
+				if (up->dst != buf)
+					continue;
+
+				assert(up->src == stransfer->staging);
+				assert(box->x > up->dst_offset);
+				up->size = box->x + box->width - up->dst_offset;
+				return;
+			}
+
+			/* Enlarge the array if it's full. */
+			if (sctx->num_sdma_uploads == sctx->max_sdma_uploads) {
+				unsigned size;
+
+				sctx->max_sdma_uploads += 4;
+				size = sctx->max_sdma_uploads * sizeof(sctx->sdma_uploads[0]);
+				sctx->sdma_uploads = realloc(sctx->sdma_uploads, size);
+			}
+
+			/* Add a new upload. */
+			struct si_sdma_upload *up =
+				&sctx->sdma_uploads[sctx->num_sdma_uploads++];
+			up->dst = up->src = NULL;
+			si_resource_reference(&up->dst, buf);
+			si_resource_reference(&up->src, stransfer->staging);
+			up->dst_offset = box->x;
+			up->src_offset = src_offset;
+			up->size = box->width;
+			return;
+		}
+
 		/* Copy the staging buffer into the original one. */
-		si_copy_buffer((struct si_context*)ctx, transfer->resource,
-			       &stransfer->staging->b.b, box->x, src_offset,
-			       box->width);
+		si_copy_buffer(sctx, transfer->resource, &stransfer->staging->b.b,
+			       box->x, src_offset, box->width);
 	}
 
 	util_range_add(&buf->valid_buffer_range, box->x,
 		       box->x + box->width);
 }
 
 static void si_buffer_flush_region(struct pipe_context *ctx,
 				   struct pipe_transfer *transfer,
 				   const struct pipe_box *rel_box)
 {
diff --git a/src/gallium/drivers/radeonsi/si_dma_cs.c b/src/gallium/drivers/radeonsi/si_dma_cs.c
index 2aafc1f09a0..bba1bd95826 100644
--- a/src/gallium/drivers/radeonsi/si_dma_cs.c
+++ b/src/gallium/drivers/radeonsi/si_dma_cs.c
@@ -133,21 +133,22 @@ void si_need_dma_space(struct si_context *ctx, unsigned num_dw,
 	if (dst) {
 		vram += dst->vram_usage;
 		gtt += dst->gart_usage;
 	}
 	if (src) {
 		vram += src->vram_usage;
 		gtt += src->gart_usage;
 	}
 
 	/* Flush the GFX IB if DMA depends on it. */
-	if (radeon_emitted(ctx->gfx_cs, ctx->initial_gfx_cs_size) &&
+	if (!ctx->sdma_uploads_in_progress &&
+	    radeon_emitted(ctx->gfx_cs, ctx->initial_gfx_cs_size) &&
 	    ((dst &&
 	      ws->cs_is_buffer_referenced(ctx->gfx_cs, dst->buf,
 					  RADEON_USAGE_READWRITE)) ||
 	     (src &&
 	      ws->cs_is_buffer_referenced(ctx->gfx_cs, src->buf,
 					  RADEON_USAGE_WRITE))))
 		si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
 
 	/* Flush if there's not enough space, or if the memory usage per IB
 	 * is too large.
@@ -155,45 +156,47 @@ void si_need_dma_space(struct si_context *ctx, unsigned num_dw,
 	 * IBs using too little memory are limited by the IB submission overhead.
 	 * IBs using too much memory are limited by the kernel/TTM overhead.
 	 * Too long IBs create CPU-GPU pipeline bubbles and add latency.
 	 *
 	 * This heuristic makes sure that DMA requests are executed
 	 * very soon after the call is made and lowers memory usage.
 	 * It improves texture upload performance by keeping the DMA
 	 * engine busy while uploads are being submitted.
 	 */
 	num_dw++; /* for emit_wait_idle below */
-	if (!ws->cs_check_space(ctx->dma_cs, num_dw) ||
-	    ctx->dma_cs->used_vram + ctx->dma_cs->used_gart > 64 * 1024 * 1024 ||
-	    !radeon_cs_memory_below_limit(ctx->screen, ctx->dma_cs, vram, gtt)) {
+	if (!ctx->sdma_uploads_in_progress &&
+	    (!ws->cs_check_space(ctx->dma_cs, num_dw) ||
+	     ctx->dma_cs->used_vram + ctx->dma_cs->used_gart > 64 * 1024 * 1024 ||
+	     !radeon_cs_memory_below_limit(ctx->screen, ctx->dma_cs, vram, gtt))) {
 		si_flush_dma_cs(ctx, PIPE_FLUSH_ASYNC, NULL);
 		assert((num_dw + ctx->dma_cs->current.cdw) <= ctx->dma_cs->current.max_dw);
 	}
 
 	/* Wait for idle if either buffer has been used in the IB before to
 	 * prevent read-after-write hazards.
 	 */
 	if ((dst &&
 	     ws->cs_is_buffer_referenced(ctx->dma_cs, dst->buf,
 					 RADEON_USAGE_READWRITE)) ||
 	    (src &&
 	     ws->cs_is_buffer_referenced(ctx->dma_cs, src->buf,
 					 RADEON_USAGE_WRITE)))
 		si_dma_emit_wait_idle(ctx);
 
+	unsigned sync = ctx->sdma_uploads_in_progress ? 0 : RADEON_USAGE_SYNCHRONIZED;
 	if (dst) {
-		radeon_add_to_buffer_list(ctx, ctx->dma_cs, dst,
-					  RADEON_USAGE_WRITE, 0);
+		ws->cs_add_buffer(ctx->dma_cs, dst->buf, RADEON_USAGE_WRITE | sync,
+				  dst->domains, 0);
 	}
 	if (src) {
-		radeon_add_to_buffer_list(ctx, ctx->dma_cs, src,
-					  RADEON_USAGE_READ, 0);
+		ws->cs_add_buffer(ctx->dma_cs, src->buf, RADEON_USAGE_READ | sync,
+				  src->domains, 0);
 	}
 
 	/* this function is called before all DMA calls, so increment this. */
 	ctx->num_dma_calls++;
 }
 
 void si_flush_dma_cs(struct si_context *ctx, unsigned flags,
 		     struct pipe_fence_handle **fence)
 {
 	struct radeon_cmdbuf *cs = ctx->dma_cs;
diff --git a/src/gallium/drivers/radeonsi/si_gfx_cs.c b/src/gallium/drivers/radeonsi/si_gfx_cs.c
index 3d64587fa2b..13d5b5a959a 100644
--- a/src/gallium/drivers/radeonsi/si_gfx_cs.c
+++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c
@@ -19,20 +19,21 @@
  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
 
 #include "si_pipe.h"
 #include "util/os_time.h"
+#include "util/u_upload_mgr.h"
 
 /* initialize */
 void si_need_gfx_cs_space(struct si_context *ctx)
 {
 	struct radeon_cmdbuf *cs = ctx->gfx_cs;
 
 	/* There is no need to flush the DMA IB here, because
 	 * si_need_dma_space always flushes the GFX IB if there is
 	 * a conflict, which means any unflushed DMA commands automatically
 	 * precede the GFX IB (= they had no dependency on the GFX IB when
@@ -57,20 +58,29 @@ void si_need_gfx_cs_space(struct si_context *ctx)
 	 * and just flush if there is not enough space left.
 	 *
 	 * Also reserve space for stopping queries at the end of IB, because
 	 * the number of active queries is mostly unlimited.
 	 */
 	unsigned need_dwords = 2048 + ctx->num_cs_dw_queries_suspend;
 	if (!ctx->ws->cs_check_space(cs, need_dwords))
 		si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
 }
 
+void si_unref_sdma_uploads(struct si_context *sctx)
+{
+	for (unsigned i = 0; i < sctx->num_sdma_uploads; i++) {
+		si_resource_reference(&sctx->sdma_uploads[i].dst, NULL);
+		si_resource_reference(&sctx->sdma_uploads[i].src, NULL);
+	}
+	sctx->num_sdma_uploads = 0;
+}
+
 void si_flush_gfx_cs(struct si_context *ctx, unsigned flags,
 		     struct pipe_fence_handle **fence)
 {
 	struct radeon_cmdbuf *cs = ctx->gfx_cs;
 	struct radeon_winsys *ws = ctx->ws;
 	unsigned wait_flags = 0;
 
 	if (ctx->gfx_flush_in_progress)
 		return;
 
@@ -91,31 +101,51 @@ void si_flush_gfx_cs(struct si_context *ctx, unsigned flags,
 	if (!radeon_emitted(cs, ctx->initial_gfx_cs_size) &&
 	    (!wait_flags || !ctx->gfx_last_ib_is_busy))
 		return;
 
 	if (si_check_device_reset(ctx))
 		return;
 
 	if (ctx->screen->debug_flags & DBG(CHECK_VM))
 		flags &= ~PIPE_FLUSH_ASYNC;
 
+	ctx->gfx_flush_in_progress = true;
+
 	/* If the state tracker is flushing the GFX IB, si_flush_from_st is
 	 * responsible for flushing the DMA IB and merging the fences from both.
-	 * This code is only needed when the driver flushes the GFX IB
-	 * internally, and it never asks for a fence handle.
+	 * If the driver flushes the GFX IB internally, it should never ask
+	 * for a fence handle.
 	 */
-	if (radeon_emitted(ctx->dma_cs, 0)) {
-		assert(fence == NULL); /* internal flushes only */
-		si_flush_dma_cs(ctx, flags, NULL);
+	assert(!radeon_emitted(ctx->dma_cs, 0) || fence == NULL);
+
+	/* Update the sdma_uploads list by flushing the uploader. */
+	u_upload_unmap(ctx->b.const_uploader);
+
+	/* Execute SDMA uploads. */
+	ctx->sdma_uploads_in_progress = true;
+	for (unsigned i = 0; i < ctx->num_sdma_uploads; i++) {
+		struct si_sdma_upload *up = &ctx->sdma_uploads[i];
+		struct pipe_box box;
+
+		assert(up->src_offset % 4 == 0 && up->dst_offset % 4 == 0 &&
+		       up->size % 4 == 0);
+
+		u_box_1d(up->src_offset, up->size, &box);
+		ctx->dma_copy(&ctx->b, &up->dst->b.b, 0, up->dst_offset, 0, 0,
+			      &up->src->b.b, 0, &box);
 	}
+	ctx->sdma_uploads_in_progress = false;
+	si_unref_sdma_uploads(ctx);
 
-	ctx->gfx_flush_in_progress = true;
+	/* Flush SDMA (preamble IB). */
+	if (radeon_emitted(ctx->dma_cs, 0))
+		si_flush_dma_cs(ctx, flags, NULL);
 
 	if (!LIST_IS_EMPTY(&ctx->active_queries))
 		si_suspend_queries(ctx);
 
 	ctx->streamout.suspended = false;
 	if (ctx->streamout.begin_emitted) {
 		si_emit_streamout_end(ctx);
 		ctx->streamout.suspended = true;
 	}
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index c6f93e7b15e..c0ee2b1a6dc 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -257,20 +257,21 @@ static void si_destroy_context(struct pipe_context *context)
 	si_saved_cs_reference(&sctx->current_saved_cs, NULL);
 
 	_mesa_hash_table_destroy(sctx->tex_handles, NULL);
 	_mesa_hash_table_destroy(sctx->img_handles, NULL);
 
 	util_dynarray_fini(&sctx->resident_tex_handles);
 	util_dynarray_fini(&sctx->resident_img_handles);
 	util_dynarray_fini(&sctx->resident_tex_needs_color_decompress);
 	util_dynarray_fini(&sctx->resident_img_needs_color_decompress);
 	util_dynarray_fini(&sctx->resident_tex_needs_depth_decompress);
+	si_unref_sdma_uploads(sctx);
 	FREE(sctx);
 }
 
 static enum pipe_reset_status si_get_reset_status(struct pipe_context *ctx)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
 
 	if (sctx->screen->info.has_gpu_reset_status_query)
 		return sctx->ws->ctx_query_reset_status(sctx->ctx);
 
@@ -436,43 +437,49 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen,
 					    SI_RESOURCE_FLAG_CLEAR, false);
 	if (!sctx->allocator_zeroed_memory)
 		goto fail;
 
 	sctx->b.stream_uploader = u_upload_create(&sctx->b, 1024 * 1024,
 						  0, PIPE_USAGE_STREAM,
 						  SI_RESOURCE_FLAG_READ_ONLY);
 	if (!sctx->b.stream_uploader)
 		goto fail;
 
-	sctx->b.const_uploader = u_upload_create(&sctx->b, 128 * 1024,
-						 0, PIPE_USAGE_DEFAULT,
-						 SI_RESOURCE_FLAG_32BIT |
-						 (sscreen->cpdma_prefetch_writes_memory ?
-						  0 : SI_RESOURCE_FLAG_READ_ONLY));
-	if (!sctx->b.const_uploader)
-		goto fail;
-
 	sctx->cached_gtt_allocator = u_upload_create(&sctx->b, 16 * 1024,
 						     0, PIPE_USAGE_STAGING, 0);
 	if (!sctx->cached_gtt_allocator)
 		goto fail;
 
 	sctx->ctx = sctx->ws->ctx_create(sctx->ws);
 	if (!sctx->ctx)
 		goto fail;
 
 	if (sscreen->info.num_sdma_rings &&
 	    !(sscreen->debug_flags & DBG(NO_ASYNC_DMA))) {
 		sctx->dma_cs = sctx->ws->cs_create(sctx->ctx, RING_DMA,
 						   (void*)si_flush_dma_cs,
 						   sctx, stop_exec_on_failure);
 	}
 
+	bool use_sdma_upload = sscreen->info.has_dedicated_vram && sctx->dma_cs && debug_get_bool_option("SDMA", true);
+	sctx->b.const_uploader = u_upload_create(&sctx->b, 256 * 1024,
+						 0, PIPE_USAGE_DEFAULT,
+						 SI_RESOURCE_FLAG_32BIT |
+						 (use_sdma_upload ?
+						  SI_RESOURCE_FLAG_UPLOAD_FLUSH_EXPLICIT_VIA_SDMA :
+						  (sscreen->cpdma_prefetch_writes_memory ?
+						   0 : SI_RESOURCE_FLAG_READ_ONLY)));
+	if (!sctx->b.const_uploader)
+		goto fail;
+
+	if (use_sdma_upload)
+		u_upload_enable_flush_explicit(sctx->b.const_uploader);
+
 	si_init_buffer_functions(sctx);
 	si_init_clear_functions(sctx);
 	si_init_blit_functions(sctx);
 	si_init_compute_functions(sctx);
 	si_init_compute_blit_functions(sctx);
 	si_init_debug_functions(sctx);
 	si_init_msaa_functions(sctx);
 	si_init_streamout_functions(sctx);
 
 	if (sscreen->info.has_hw_decode) {
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index b01d5744752..b208bdeb848 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -103,20 +103,22 @@
 #define SI_MAX_VARIABLE_THREADS_PER_BLOCK 1024
 
 #define SI_RESOURCE_FLAG_TRANSFER	(PIPE_RESOURCE_FLAG_DRV_PRIV << 0)
 #define SI_RESOURCE_FLAG_FLUSHED_DEPTH	(PIPE_RESOURCE_FLAG_DRV_PRIV << 1)
 #define SI_RESOURCE_FLAG_FORCE_MSAA_TILING (PIPE_RESOURCE_FLAG_DRV_PRIV << 2)
 #define SI_RESOURCE_FLAG_DISABLE_DCC	(PIPE_RESOURCE_FLAG_DRV_PRIV << 3)
 #define SI_RESOURCE_FLAG_UNMAPPABLE	(PIPE_RESOURCE_FLAG_DRV_PRIV << 4)
 #define SI_RESOURCE_FLAG_READ_ONLY	(PIPE_RESOURCE_FLAG_DRV_PRIV << 5)
 #define SI_RESOURCE_FLAG_32BIT		(PIPE_RESOURCE_FLAG_DRV_PRIV << 6)
 #define SI_RESOURCE_FLAG_CLEAR		(PIPE_RESOURCE_FLAG_DRV_PRIV << 7)
+/* For const_uploader, upload data via GTT and copy to VRAM on context flush via SDMA. */
+#define SI_RESOURCE_FLAG_UPLOAD_FLUSH_EXPLICIT_VIA_SDMA (PIPE_RESOURCE_FLAG_DRV_PRIV << 8)
 
 enum si_clear_code
 {
 	DCC_CLEAR_COLOR_0000	= 0x00000000,
 	DCC_CLEAR_COLOR_0001	= 0x40404040,
 	DCC_CLEAR_COLOR_1110	= 0x80808080,
 	DCC_CLEAR_COLOR_1111	= 0xC0C0C0C0,
 	DCC_CLEAR_COLOR_REG	= 0x20202020,
 	DCC_UNCOMPRESSED	= 0xFFFFFFFF,
 };
 
@@ -769,20 +771,28 @@ struct si_saved_cs {
 	struct si_context	*ctx;
 	struct radeon_saved_cs	gfx;
 	struct si_resource	*trace_buf;
 	unsigned		trace_id;
 
 	unsigned		gfx_last_dw;
 	bool			flushed;
 	int64_t			time_flush;
 };
 
+struct si_sdma_upload {
+	struct si_resource	*dst;
+	struct si_resource	*src;
+	unsigned		src_offset;
+	unsigned		dst_offset;
+	unsigned		size;
+};
+
 struct si_context {
 	struct pipe_context		b; /* base class */
 
 	enum radeon_family		family;
 	enum chip_class			chip_class;
 
 	struct radeon_winsys		*ws;
 	struct radeon_winsys_ctx	*ctx;
 	struct radeon_cmdbuf		*gfx_cs;
 	struct radeon_cmdbuf		*dma_cs;
@@ -1074,20 +1084,26 @@ struct si_context {
 	int				num_perfect_occlusion_queries;
 	struct list_head		active_queries;
 	unsigned			num_cs_dw_queries_suspend;
 
 	/* Render condition. */
 	struct pipe_query		*render_cond;
 	unsigned			render_cond_mode;
 	bool				render_cond_invert;
 	bool				render_cond_force_off; /* for u_blitter */
 
+	/* For uploading data via GTT and copying to VRAM on context flush via SDMA. */
+	bool				sdma_uploads_in_progress;
+	struct si_sdma_upload		*sdma_uploads;
+	unsigned			num_sdma_uploads;
+	unsigned			max_sdma_uploads;
+
 	/* Statistics gathering for the DCC enablement heuristic. It can't be
 	 * in si_texture because si_texture can be shared by multiple
 	 * contexts. This is for back buffers only. We shouldn't get too many
 	 * of those.
 	 *
 	 * X11 DRI3 rotates among a finite set of back buffers. They should
 	 * all fit in this array. If they don't, separate DCC might never be
 	 * enabled by DCC stat gathering.
 	 */
 	struct {
@@ -1273,20 +1289,21 @@ struct pipe_fence_handle *si_create_fence(struct pipe_context *ctx,
 					  struct tc_unflushed_batch_token *tc_token);
 
 /* si_get.c */
 void si_init_screen_get_functions(struct si_screen *sscreen);
 
 /* si_gfx_cs.c */
 void si_flush_gfx_cs(struct si_context *ctx, unsigned flags,
 		     struct pipe_fence_handle **fence);
 void si_begin_new_gfx_cs(struct si_context *ctx);
 void si_need_gfx_cs_space(struct si_context *ctx);
+void si_unref_sdma_uploads(struct si_context *sctx);
 
 /* si_gpu_load.c */
 void si_gpu_load_kill_thread(struct si_screen *sscreen);
 uint64_t si_begin_counter(struct si_screen *sscreen, unsigned type);
 unsigned si_end_counter(struct si_screen *sscreen, unsigned type,
 			uint64_t begin);
 
 /* si_compute.c */
 void si_init_compute_functions(struct si_context *sctx);
-- 
2.17.1

_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev