From: Marek Olšák <marek.ol...@amd.com>

---
 src/gallium/drivers/radeonsi/si_buffer.c        |  6 +++---
 src/gallium/drivers/radeonsi/si_dma_cs.c        |  2 +-
 src/gallium/drivers/radeonsi/si_fence.c         |  5 ++++-
 src/gallium/drivers/radeonsi/si_gfx_cs.c        |  4 ++--
 src/gallium/drivers/radeonsi/si_pipe.h          |  2 +-
 src/gallium/drivers/radeonsi/si_state_shaders.c |  4 ++--
 src/gallium/drivers/radeonsi/si_texture.c       |  2 +-
 src/gallium/winsys/amdgpu/drm/amdgpu_bo.c       | 12 ++++++++----
 src/gallium/winsys/radeon/drm/radeon_drm_bo.c   | 12 ++++++++----
 src/gallium/winsys/radeon/drm/radeon_drm_cs.c   |  3 ++-
 10 files changed, 32 insertions(+), 20 deletions(-)
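(Context for readers: every gfx flush touched below used to pass PIPE_FLUSH_ASYNC or 0 and now passes one of the RADEON_FLUSH_*_START_NEXT_GFX_IB_NOW flags, which additionally ask the winsys to start preparing the next gfx IB as soon as the current one is handed off for submission. The flags are presumably defined in radeon_winsys.h by an earlier patch in this series, roughly as sketched here; the exact bit value is an assumption, only the composition matters:)

/* Sketch of the assumed flag definitions (radeon_winsys.h, not part of
 * this patch); the bit position is illustrative.
 */
#define RADEON_FLUSH_START_NEXT_GFX_IB_NOW       (1u << 31)
#define RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW \
        (PIPE_FLUSH_ASYNC | RADEON_FLUSH_START_NEXT_GFX_IB_NOW)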
diff --git a/src/gallium/drivers/radeonsi/si_buffer.c b/src/gallium/drivers/radeonsi/si_buffer.c
index 1420702d8d4..d17b2c6a831 100644
--- a/src/gallium/drivers/radeonsi/si_buffer.c
+++ b/src/gallium/drivers/radeonsi/si_buffer.c
@@ -57,24 +57,24 @@ void *si_buffer_map_sync_with_rings(struct si_context *sctx,
         if (!(usage & PIPE_TRANSFER_WRITE)) {
                 /* have to wait for the last write */
                 rusage = RADEON_USAGE_WRITE;
         }

         if (radeon_emitted(sctx->gfx_cs, sctx->initial_gfx_cs_size) &&
             sctx->ws->cs_is_buffer_referenced(sctx->gfx_cs,
                                               resource->buf, rusage)) {
                 if (usage & PIPE_TRANSFER_DONTBLOCK) {
-                        si_flush_gfx_cs(sctx, PIPE_FLUSH_ASYNC, NULL);
+                        si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
                         return NULL;
                 } else {
-                        si_flush_gfx_cs(sctx, 0, NULL);
+                        si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
                         busy = true;
                 }
         }
         if (radeon_emitted(sctx->dma_cs, 0) &&
             sctx->ws->cs_is_buffer_referenced(sctx->dma_cs,
                                               resource->buf, rusage)) {
                 if (usage & PIPE_TRANSFER_DONTBLOCK) {
                         si_flush_dma_cs(sctx, PIPE_FLUSH_ASYNC, NULL);
                         return NULL;
                 } else {
@@ -718,21 +718,21 @@ static bool si_resource_commit(struct pipe_context *pctx,
         /*
          * Since buffer commitment changes cannot be pipelined, we need to
          * (a) flush any pending commands that refer to the buffer we're about
         * to change, and
          * (b) wait for threaded submit to finish, including those that were
          * triggered by some other, earlier operation.
          */
         if (radeon_emitted(ctx->gfx_cs, ctx->initial_gfx_cs_size) &&
             ctx->ws->cs_is_buffer_referenced(ctx->gfx_cs,
                                              res->buf, RADEON_USAGE_READWRITE)) {
-                si_flush_gfx_cs(ctx, PIPE_FLUSH_ASYNC, NULL);
+                si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
         }
         if (radeon_emitted(ctx->dma_cs, 0) &&
             ctx->ws->cs_is_buffer_referenced(ctx->dma_cs,
                                              res->buf, RADEON_USAGE_READWRITE)) {
                 si_flush_dma_cs(ctx, PIPE_FLUSH_ASYNC, NULL);
         }

         ctx->ws->cs_sync_flush(ctx->dma_cs);
         ctx->ws->cs_sync_flush(ctx->gfx_cs);
diff --git a/src/gallium/drivers/radeonsi/si_dma_cs.c b/src/gallium/drivers/radeonsi/si_dma_cs.c
index 7af7c5623b7..1eefaeb6ad5 100644
--- a/src/gallium/drivers/radeonsi/si_dma_cs.c
+++ b/src/gallium/drivers/radeonsi/si_dma_cs.c
@@ -51,21 +51,21 @@ void si_need_dma_space(struct si_context *ctx, unsigned num_dw,
         }

         /* Flush the GFX IB if DMA depends on it. */
         if (radeon_emitted(ctx->gfx_cs, ctx->initial_gfx_cs_size) &&
             ((dst &&
               ctx->ws->cs_is_buffer_referenced(ctx->gfx_cs, dst->buf,
                                                RADEON_USAGE_READWRITE)) ||
              (src &&
               ctx->ws->cs_is_buffer_referenced(ctx->gfx_cs, src->buf,
                                                RADEON_USAGE_WRITE))))
-                si_flush_gfx_cs(ctx, PIPE_FLUSH_ASYNC, NULL);
+                si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);

         /* Flush if there's not enough space, or if the memory usage per IB
          * is too large.
          *
          * IBs using too little memory are limited by the IB submission overhead.
          * IBs using too much memory are limited by the kernel/TTM overhead.
          * Too long IBs create CPU-GPU pipeline bubbles and add latency.
          *
          * This heuristic makes sure that DMA requests are executed
          * very soon after the call is made and lowers memory usage.
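(The si_need_dma_space hunk is cut off before the check that the trailing comment describes; it presumably looks roughly like the following. This is a sketch only: the 64 MB per-IB cap and the exact fields passed to the memory-limit helper are assumptions, not part of this patch.)

/* Sketch of the "not enough space / too much memory per IB" check
 * described by the comment above; constants are illustrative. */
if (!ctx->ws->cs_check_space(ctx->dma_cs, num_dw) ||
    ctx->dma_cs->used_vram + ctx->dma_cs->used_gart > 64 * 1024 * 1024 ||
    !radeon_cs_memory_below_limit(ctx->screen, ctx->dma_cs,
                                  ctx->dma_cs->used_vram,
                                  ctx->dma_cs->used_gart))
        si_flush_dma_cs(ctx, PIPE_FLUSH_ASYNC, NULL);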
diff --git a/src/gallium/drivers/radeonsi/si_fence.c b/src/gallium/drivers/radeonsi/si_fence.c
index 26d6c43b34d..19fcb96041f 100644
--- a/src/gallium/drivers/radeonsi/si_fence.c
+++ b/src/gallium/drivers/radeonsi/si_fence.c
@@ -367,21 +367,24 @@ static boolean si_fence_finish(struct pipe_screen *screen,
                  *
                  *    and the calls to ClientWaitSync and FenceSync were
                  *    issued from the same context,
                  *
                  *    then the GL will behave as if the equivalent of Flush
                  *    were inserted immediately after the creation of sync."
                  *
                  * This means we need to flush for such fences even when we're
                  * not going to wait.
                  */
                 threaded_context_unwrap_sync(ctx);
-                si_flush_gfx_cs(sctx, timeout ? 0 : PIPE_FLUSH_ASYNC, NULL);
+                si_flush_gfx_cs(sctx,
+                                (timeout ? 0 : PIPE_FLUSH_ASYNC) |
+                                RADEON_FLUSH_START_NEXT_GFX_IB_NOW,
+                                NULL);
                 rfence->gfx_unflushed.ctx = NULL;

                 if (!timeout)
                         return false;

                 /* Recompute the timeout after all that. */
                 if (timeout && timeout != PIPE_TIMEOUT_INFINITE) {
                         int64_t time = os_time_get_nano();
                         timeout = abs_timeout > time ? abs_timeout - time : 0;
                 }
diff --git a/src/gallium/drivers/radeonsi/si_gfx_cs.c b/src/gallium/drivers/radeonsi/si_gfx_cs.c
index 63bff29e63a..0173c64631b 100644
--- a/src/gallium/drivers/radeonsi/si_gfx_cs.c
+++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c
@@ -40,35 +40,35 @@ void si_need_gfx_cs_space(struct si_context *ctx)
          */

         /* There are two memory usage counters in the winsys for all buffers
          * that have been added (cs_add_buffer) and two counters in the pipe
          * driver for those that haven't been added yet.
          */
         if (unlikely(!radeon_cs_memory_below_limit(ctx->screen, ctx->gfx_cs,
                                                    ctx->vram, ctx->gtt))) {
                 ctx->gtt = 0;
                 ctx->vram = 0;
-                si_flush_gfx_cs(ctx, PIPE_FLUSH_ASYNC, NULL);
+                si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
                 return;
         }
         ctx->gtt = 0;
         ctx->vram = 0;

         /* If the IB is sufficiently large, don't count the space needed
          * and just flush if there is not enough space left.
          *
          * Also reserve space for stopping queries at the end of IB, because
          * the number of active queries is mostly unlimited.
          */
         unsigned need_dwords = 2048 + ctx->num_cs_dw_queries_suspend;
         if (!ctx->ws->cs_check_space(cs, need_dwords))
-                si_flush_gfx_cs(ctx, PIPE_FLUSH_ASYNC, NULL);
+                si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
 }

 void si_flush_gfx_cs(struct si_context *ctx, unsigned flags,
                      struct pipe_fence_handle **fence)
 {
         struct radeon_winsys_cs *cs = ctx->gfx_cs;
         struct radeon_winsys *ws = ctx->ws;
         unsigned wait_flags = 0;

         if (ctx->gfx_flush_in_progress)
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index f0f323ff3a7..5e24d6cbb7e 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -1321,19 +1321,19 @@ static inline void
 radeon_add_to_gfx_buffer_list_check_mem(struct si_context *sctx,
                                         struct r600_resource *rbo,
                                         enum radeon_bo_usage usage,
                                         enum radeon_bo_priority priority,
                                         bool check_mem)
 {
         if (check_mem &&
             !radeon_cs_memory_below_limit(sctx->screen, sctx->gfx_cs,
                                           sctx->vram + rbo->vram_usage,
                                           sctx->gtt + rbo->gart_usage))
-                si_flush_gfx_cs(sctx, PIPE_FLUSH_ASYNC, NULL);
+                si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);

         radeon_add_to_buffer_list(sctx, sctx->gfx_cs, rbo, usage, priority);
 }

 #define PRINT_ERR(fmt, args...) \
         fprintf(stderr, "EE %s:%d %s - " fmt, __FILE__, __LINE__, __func__, ##args)

 #endif
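(A note on the si_fence_finish hunk above: the flush flags are now composed instead of passed verbatim, so the behavior is, restated in equivalent form:)

/* Equivalent expansion of the new si_fence_finish flush flags:
 *   timeout == 0 -> PIPE_FLUSH_ASYNC | RADEON_FLUSH_START_NEXT_GFX_IB_NOW
 *                   (don't block on the submission thread; we won't wait)
 *   timeout != 0 -> RADEON_FLUSH_START_NEXT_GFX_IB_NOW only
 *                   (synchronous flush; we're about to wait on the fence)
 */
unsigned flags = (timeout ? 0 : PIPE_FLUSH_ASYNC) |
                 RADEON_FLUSH_START_NEXT_GFX_IB_NOW;
si_flush_gfx_cs(sctx, flags, NULL);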
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 7e1660415f5..67ab75bbd2d 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -2767,21 +2767,21 @@ static bool si_update_gs_ring_buffers(struct si_context *sctx)
         si_pm4_free_state(sctx, sctx->init_config_gs_rings, ~0);
         sctx->init_config_gs_rings = pm4;

         if (!sctx->init_config_has_vgt_flush) {
                 si_init_config_add_vgt_flush(sctx);
                 si_pm4_upload_indirect_buffer(sctx, sctx->init_config);
         }

         /* Flush the context to re-emit both init_config states. */
         sctx->initial_gfx_cs_size = 0; /* force flush */
-        si_flush_gfx_cs(sctx, PIPE_FLUSH_ASYNC, NULL);
+        si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);

         /* Set ring bindings. */
         if (sctx->esgs_ring) {
                 assert(sctx->chip_class <= VI);
                 si_set_ring_buffer(sctx, SI_ES_RING_ESGS, sctx->esgs_ring, 0,
                                    sctx->esgs_ring->width0, true, true, 4, 64, 0);
                 si_set_ring_buffer(sctx, SI_GS_RING_ESGS, sctx->esgs_ring, 0,
                                    sctx->esgs_ring->width0, false, false, 0, 0, 0);
@@ -3044,21 +3044,21 @@ static void si_init_tess_factor_ring(struct si_context *sctx)
                                        factor_va >> 8);
                 si_pm4_set_reg(sctx->init_config, R_0089B0_VGT_HS_OFFCHIP_PARAM,
                                sctx->screen->vgt_hs_offchip_param);
         }

         /* Flush the context to re-emit the init_config state.
          * This is done only once in a lifetime of a context.
          */
         si_pm4_upload_indirect_buffer(sctx, sctx->init_config);
         sctx->initial_gfx_cs_size = 0; /* force flush */
-        si_flush_gfx_cs(sctx, PIPE_FLUSH_ASYNC, NULL);
+        si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
 }

 /**
  * This is used when TCS is NULL in the VS->TCS->TES chain. In this case,
  * VS passes its outputs to TES directly, so the fixed-function shader only
  * has to write TESSOUTER and TESSINNER.
  */
 static void si_generate_fixed_func_tcs(struct si_context *sctx)
 {
         struct ureg_src outer, inner;
diff --git a/src/gallium/drivers/radeonsi/si_texture.c b/src/gallium/drivers/radeonsi/si_texture.c
index 1f0de5e71ec..8964c6b730c 100644
--- a/src/gallium/drivers/radeonsi/si_texture.c
+++ b/src/gallium/drivers/radeonsi/si_texture.c
@@ -1862,21 +1862,21 @@ static void si_texture_transfer_unmap(struct pipe_context *ctx,
          * The idea is that we don't want to build IBs that use too much
          * memory and put pressure on the kernel memory manager and we also
          * want to make temporary and invalidated buffers go idle ASAP to
          * decrease the total memory usage or make them reusable. The memory
          * usage will be slightly higher than given here because of the buffer
          * cache in the winsys.
          *
          * The result is that the kernel memory manager is never a bottleneck.
          */
         if (sctx->num_alloc_tex_transfer_bytes > sctx->screen->info.gart_size / 4) {
-                si_flush_gfx_cs(sctx, PIPE_FLUSH_ASYNC, NULL);
+                si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
                 sctx->num_alloc_tex_transfer_bytes = 0;
         }

         pipe_resource_reference(&transfer->resource, NULL);
         FREE(transfer);
 }

 static const struct u_resource_vtbl si_texture_vtbl =
 {
         NULL,                           /* get_handle */
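(The "sctx->initial_gfx_cs_size = 0; /* force flush */" idiom above works because the radeon_emitted() checks used throughout the driver compare the IB size against that value; from memory, the helper in radeon_winsys.h is roughly the following, which may differ slightly from the tree:)

/* Rough recollection of radeon_emitted(): the IB counts as non-empty once
 * it has grown past num_dw dwords, so passing initial_gfx_cs_size == 0
 * makes any non-empty IB qualify and guarantees the flush happens. */
static inline bool radeon_emitted(struct radeon_winsys_cs *cs, unsigned num_dw)
{
        return cs && (cs->prev_dw + cs->current.cdw > num_dw);
}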
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
index 22b5a73143d..9b6d6e83032 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
@@ -232,31 +232,33 @@ static void *amdgpu_bo_map(struct pb_buffer *buf,
       if (!(usage & PIPE_TRANSFER_WRITE)) {
          /* Mapping for read.
           *
           * Since we are mapping for read, we don't need to wait
           * if the GPU is using the buffer for read too
           * (neither one is changing it).
           *
           * Only check whether the buffer is being used for write.
           */
          if (cs && amdgpu_bo_is_referenced_by_cs_with_usage(cs, bo,
                                                             RADEON_USAGE_WRITE)) {
-            cs->flush_cs(cs->flush_data, PIPE_FLUSH_ASYNC, NULL);
+            cs->flush_cs(cs->flush_data,
+                         RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
             return NULL;
          }

          if (!amdgpu_bo_wait((struct pb_buffer*)bo, 0, RADEON_USAGE_WRITE)) {
             return NULL;
          }
       } else {
          if (cs && amdgpu_bo_is_referenced_by_cs(cs, bo)) {
-            cs->flush_cs(cs->flush_data, PIPE_FLUSH_ASYNC, NULL);
+            cs->flush_cs(cs->flush_data,
+                         RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
             return NULL;
          }

          if (!amdgpu_bo_wait((struct pb_buffer*)bo, 0, RADEON_USAGE_READWRITE)) {
             return NULL;
          }
       }
    } else {
       uint64_t time = os_time_get_nano();
@@ -265,35 +267,37 @@ static void *amdgpu_bo_map(struct pb_buffer *buf,
          /* Mapping for read.
           *
           * Since we are mapping for read, we don't need to wait
           * if the GPU is using the buffer for read too
           * (neither one is changing it).
           *
           * Only check whether the buffer is being used for write.
           */
          if (cs) {
             if (amdgpu_bo_is_referenced_by_cs_with_usage(cs, bo,
                                                          RADEON_USAGE_WRITE)) {
-               cs->flush_cs(cs->flush_data, 0, NULL);
+               cs->flush_cs(cs->flush_data,
+                            RADEON_FLUSH_START_NEXT_GFX_IB_NOW, NULL);
             } else {
                /* Try to avoid busy-waiting in amdgpu_bo_wait. */
                if (p_atomic_read(&bo->num_active_ioctls))
                   amdgpu_cs_sync_flush(rcs);
             }
          }

          amdgpu_bo_wait((struct pb_buffer*)bo, PIPE_TIMEOUT_INFINITE,
                         RADEON_USAGE_WRITE);
       } else {
          /* Mapping for write. */
          if (cs) {
             if (amdgpu_bo_is_referenced_by_cs(cs, bo)) {
-               cs->flush_cs(cs->flush_data, 0, NULL);
+               cs->flush_cs(cs->flush_data,
+                            RADEON_FLUSH_START_NEXT_GFX_IB_NOW, NULL);
             } else {
                /* Try to avoid busy-waiting in amdgpu_bo_wait. */
                if (p_atomic_read(&bo->num_active_ioctls))
                   amdgpu_cs_sync_flush(rcs);
             }
          }

          amdgpu_bo_wait((struct pb_buffer*)bo, PIPE_TIMEOUT_INFINITE,
                         RADEON_USAGE_READWRITE);
       }
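(The amdgpu hunks above and the radeon ones below implement the same synchronization matrix in bo_map(); condensed, it looks roughly like the pseudocode here. "referenced()", "flush()", "bo_wait()" and "relevant_usage" are hypothetical stand-ins for the per-winsys helpers, where relevant_usage is RADEON_USAGE_WRITE for read maps, since only GPU writes conflict with a CPU read, and RADEON_USAGE_READWRITE for write maps.)

/* Condensed pseudocode of the bo_map() sync logic in both winsyses;
 * only the choice of flush flag is the point here. */
if (usage & PIPE_TRANSFER_DONTBLOCK) {
        if (cs && referenced(cs, bo)) {
                flush(cs, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW);
                return NULL;                 /* don't stall the caller */
        }
        if (!bo_wait(bo, 0, relevant_usage))
                return NULL;                 /* still busy on the GPU */
} else {
        if (cs && referenced(cs, bo))
                flush(cs, RADEON_FLUSH_START_NEXT_GFX_IB_NOW);
        else if (cs && p_atomic_read(&bo->num_active_ioctls))
                cs_sync_flush(cs);           /* avoid busy-waiting */
        bo_wait(bo, PIPE_TIMEOUT_INFINITE, relevant_usage);
}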
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
index 1617a2fe32e..6652977e586 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
@@ -509,60 +509,64 @@ static void *radeon_bo_map(struct pb_buffer *buf,
     if (usage & PIPE_TRANSFER_DONTBLOCK) {
         if (!(usage & PIPE_TRANSFER_WRITE)) {
             /* Mapping for read.
              *
              * Since we are mapping for read, we don't need to wait
              * if the GPU is using the buffer for read too
              * (neither one is changing it).
              *
              * Only check whether the buffer is being used for write.
              */
             if (cs && radeon_bo_is_referenced_by_cs_for_write(cs, bo)) {
-                cs->flush_cs(cs->flush_data, PIPE_FLUSH_ASYNC, NULL);
+                cs->flush_cs(cs->flush_data,
+                             RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
                 return NULL;
             }

             if (!radeon_bo_wait((struct pb_buffer*)bo, 0, RADEON_USAGE_WRITE)) {
                 return NULL;
             }
         } else {
             if (cs && radeon_bo_is_referenced_by_cs(cs, bo)) {
-                cs->flush_cs(cs->flush_data, PIPE_FLUSH_ASYNC, NULL);
+                cs->flush_cs(cs->flush_data,
+                             RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
                 return NULL;
             }

             if (!radeon_bo_wait((struct pb_buffer*)bo, 0, RADEON_USAGE_READWRITE)) {
                 return NULL;
             }
         }
     } else {
         uint64_t time = os_time_get_nano();

         if (!(usage & PIPE_TRANSFER_WRITE)) {
             /* Mapping for read.
              *
              * Since we are mapping for read, we don't need to wait
              * if the GPU is using the buffer for read too
              * (neither one is changing it).
              *
              * Only check whether the buffer is being used for write.
              */
             if (cs && radeon_bo_is_referenced_by_cs_for_write(cs, bo)) {
-                cs->flush_cs(cs->flush_data, 0, NULL);
+                cs->flush_cs(cs->flush_data,
+                             RADEON_FLUSH_START_NEXT_GFX_IB_NOW, NULL);
             }

             radeon_bo_wait((struct pb_buffer*)bo, PIPE_TIMEOUT_INFINITE,
                            RADEON_USAGE_WRITE);
         } else {
             /* Mapping for write. */
             if (cs) {
                 if (radeon_bo_is_referenced_by_cs(cs, bo)) {
-                    cs->flush_cs(cs->flush_data, 0, NULL);
+                    cs->flush_cs(cs->flush_data,
+                                 RADEON_FLUSH_START_NEXT_GFX_IB_NOW, NULL);
                 } else {
                     /* Try to avoid busy-waiting in radeon_bo_wait. */
                     if (p_atomic_read(&bo->num_active_ioctls))
                         radeon_drm_cs_sync_flush(rcs);
                 }
             }

             radeon_bo_wait((struct pb_buffer*)bo, PIPE_TIMEOUT_INFINITE,
                            RADEON_USAGE_READWRITE);
         }
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
index a1975dff8df..9070464bec8 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
@@ -400,21 +400,22 @@ static bool radeon_drm_cs_validate(struct radeon_winsys_cs *rcs)
         unsigned i;
         for (i = cs->csc->num_validated_relocs; i < cs->csc->num_relocs; i++) {
             p_atomic_dec(&cs->csc->relocs_bo[i].bo->num_cs_references);
             radeon_bo_reference(&cs->csc->relocs_bo[i].bo, NULL);
         }
         cs->csc->num_relocs = cs->csc->num_validated_relocs;

         /* Flush if there are any relocs. Clean up otherwise. */
         if (cs->csc->num_relocs) {
-            cs->flush_cs(cs->flush_data, PIPE_FLUSH_ASYNC, NULL);
+            cs->flush_cs(cs->flush_data,
+                         RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
         } else {
             radeon_cs_context_cleanup(cs->csc);

             cs->base.used_vram = 0;
             cs->base.used_gart = 0;

             assert(cs->base.current.cdw == 0);
             if (cs->base.current.cdw != 0) {
                 fprintf(stderr, "radeon: Unexpected error in %s.\n", __func__);
             }
         }
-- 
2.15.1

_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev