From: Nicolai Hähnle <nicolai.haeh...@amd.com>

When a buffer is added to a CS without the SYNCHRONIZED usage flag, we
now no longer add a dependency on the buffer's fence(s).

However, we still need to add a fence to the buffer during flush, so
that cache reclaim works correctly (and in the hypothetical case that
the buffer is later added to a CS _with_ the SYNCHRONIZED flag).

It is now possible that the submissions referring to a buffer are no
longer linearly ordered, and so we may have to keep multiple fences
around. We keep the fences in a FIFO. It should usually stay quite
short (# of contexts * 2, for gfx + dma rings).

While we're at it, extract amdgpu_add_fence_dependency for a single
buffer, which will make adding the distinction between real buffer and
slab cases easier.
---
 src/gallium/winsys/amdgpu/drm/amdgpu_bo.c | 73 ++++++++++++++++++---------
 src/gallium/winsys/amdgpu/drm/amdgpu_bo.h |  6 ++-
 src/gallium/winsys/amdgpu/drm/amdgpu_cs.c | 84 ++++++++++++++++++++++++-------
 3 files changed, 118 insertions(+), 45 deletions(-)
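
[Note for reviewers, not part of the commit message.] The per-buffer
bookkeeping introduced below is easier to see stripped of the winsys
details. Here is a minimal, self-contained C sketch of the same idea:
append a fence at flush time, pop signalled fences off the front of the
FIFO on an idle check. All names here (toy_fence, toy_buffer, ...) are
invented for illustration only; locking, fence reference counting and
the drop-oldest fallback on allocation failure are deliberately
simplified compared to the real amdgpu_add_fence / amdgpu_bo_wait
changes in the diff:

  #include <stdbool.h>
  #include <stdlib.h>
  #include <string.h>

  struct toy_fence { bool signalled; };

  struct toy_buffer {
     unsigned num_fences;
     unsigned max_fences;
     struct toy_fence **fences; /* FIFO, oldest submission first */
  };

  /* Append a fence at flush time, growing the array geometrically. */
  static bool toy_add_fence(struct toy_buffer *bo, struct toy_fence *fence)
  {
     if (bo->num_fences >= bo->max_fences) {
        unsigned new_max = bo->max_fences ? bo->max_fences * 2 : 1;
        struct toy_fence **f = realloc(bo->fences, new_max * sizeof(*f));
        if (!f)
           return false; /* the real code drops the oldest fence instead */
        bo->fences = f;
        bo->max_fences = new_max;
     }
     bo->fences[bo->num_fences++] = fence;
     return true;
  }

  /* Non-blocking idle check: release the signalled prefix of the FIFO
   * so those fences are never examined again. */
  static bool toy_buffer_is_idle(struct toy_buffer *bo)
  {
     unsigned idle = 0;

     while (idle < bo->num_fences && bo->fences[idle]->signalled)
        idle++;

     memmove(&bo->fences[0], &bo->fences[idle],
             (bo->num_fences - idle) * sizeof(*bo->fences));
     bo->num_fences -= idle;
     return bo->num_fences == 0;
  }

Keeping the array ordered by submission time is what lets the
timeout == 0 path release a whole prefix of the array instead of
re-checking every fence on every call.
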
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
index 0dbd0fb..37a7ba1 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
@@ -66,83 +66,108 @@ static bool amdgpu_bo_wait(struct pb_buffer *_buf, uint64_t timeout,
       int r;
 
       r = amdgpu_bo_wait_for_idle(bo->bo, timeout, &buffer_busy);
       if (r)
          fprintf(stderr, "%s: amdgpu_bo_wait_for_idle failed %i\n",
                  __func__, r);
       return !buffer_busy;
    }
 
    if (timeout == 0) {
+      unsigned idle_fences;
+      bool buffer_idle;
+
       pipe_mutex_lock(ws->bo_fence_lock);
-      if (bo->fence) {
-         if (amdgpu_fence_wait(bo->fence, 0, false)) {
-            /* Release the idle fence to avoid checking it again later. */
-            amdgpu_fence_reference(&bo->fence, NULL);
-         } else {
-            pipe_mutex_unlock(ws->bo_fence_lock);
-            return false;
-         }
+
+      for (idle_fences = 0; idle_fences < bo->num_fences; ++idle_fences) {
+         if (!amdgpu_fence_wait(bo->fences[idle_fences], 0, false))
+            break;
       }
+
+      /* Release the idle fences to avoid checking them again later. */
+      for (unsigned i = 0; i < idle_fences; ++i)
+         amdgpu_fence_reference(&bo->fences[i], NULL);
+
+      memmove(&bo->fences[0], &bo->fences[idle_fences],
+              (bo->num_fences - idle_fences) * sizeof(*bo->fences));
+      bo->num_fences -= idle_fences;
+
+      buffer_idle = !bo->num_fences;
       pipe_mutex_unlock(ws->bo_fence_lock);
 
-      return true;
+      return buffer_idle;
    } else {
-      struct pipe_fence_handle *fence = NULL;
-      bool fence_idle = false;
       bool buffer_idle = true;
 
-      /* Take a reference to the fences, so that we can wait for it
-       * without the lock. */
       pipe_mutex_lock(ws->bo_fence_lock);
-      amdgpu_fence_reference(&fence, bo->fence);
-      pipe_mutex_unlock(ws->bo_fence_lock);
+      while (bo->num_fences && buffer_idle) {
+         struct pipe_fence_handle *fence = NULL;
+         bool fence_idle = false;
 
-      /* Now wait for the fence. */
-      if (fence) {
+         amdgpu_fence_reference(&fence, bo->fences[0]);
+
+         /* Wait for the fence. */
+         pipe_mutex_unlock(ws->bo_fence_lock);
          if (amdgpu_fence_wait(fence, abs_timeout, true))
             fence_idle = true;
          else
             buffer_idle = false;
-      }
+         pipe_mutex_lock(ws->bo_fence_lock);
+
+         /* Release an idle fence to avoid checking it again later, keeping in
+          * mind that the fence array may have been modified by other threads.
+          */
+         if (fence_idle && bo->num_fences && bo->fences[0] == fence) {
+            amdgpu_fence_reference(&bo->fences[0], NULL);
+            memmove(&bo->fences[0], &bo->fences[1],
+                    (bo->num_fences - 1) * sizeof(*bo->fences));
+            bo->num_fences--;
+         }
 
-      /* Release idle fences to avoid checking them again later. */
-      pipe_mutex_lock(ws->bo_fence_lock);
-      if (fence == bo->fence && fence_idle)
-         amdgpu_fence_reference(&bo->fence, NULL);
-      amdgpu_fence_reference(&fence, NULL);
+         amdgpu_fence_reference(&fence, NULL);
+      }
       pipe_mutex_unlock(ws->bo_fence_lock);
 
       return buffer_idle;
    }
 }
 
 static enum radeon_bo_domain amdgpu_bo_get_initial_domain(
       struct pb_buffer *buf)
 {
    return ((struct amdgpu_winsys_bo*)buf)->initial_domain;
 }
 
+static void amdgpu_bo_remove_fences(struct amdgpu_winsys_bo *bo)
+{
+   for (unsigned i = 0; i < bo->num_fences; ++i)
+      amdgpu_fence_reference(&bo->fences[i], NULL);
+
+   FREE(bo->fences);
+   bo->num_fences = 0;
+   bo->max_fences = 0;
+}
+
 void amdgpu_bo_destroy(struct pb_buffer *_buf)
 {
    struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
 
    pipe_mutex_lock(bo->ws->global_bo_list_lock);
    LIST_DEL(&bo->global_list_item);
    bo->ws->num_buffers--;
    pipe_mutex_unlock(bo->ws->global_bo_list_lock);
 
    amdgpu_bo_va_op(bo->bo, 0, bo->base.size, bo->va, 0, AMDGPU_VA_OP_UNMAP);
    amdgpu_va_range_free(bo->va_handle);
    amdgpu_bo_free(bo->bo);
 
-   amdgpu_fence_reference(&bo->fence, NULL);
+   amdgpu_bo_remove_fences(bo);
 
    if (bo->initial_domain & RADEON_DOMAIN_VRAM)
       bo->ws->allocated_vram -= align64(bo->base.size,
                                         bo->ws->info.gart_page_size);
    else if (bo->initial_domain & RADEON_DOMAIN_GTT)
       bo->ws->allocated_gtt -= align64(bo->base.size,
                                        bo->ws->info.gart_page_size);
 
    if (bo->map_count >= 1) {
       if (bo->initial_domain & RADEON_DOMAIN_VRAM)
          bo->ws->mapped_vram -= bo->base.size;
       else if (bo->initial_domain & RADEON_DOMAIN_GTT)
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h
index 07403dd..93cc83a 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h
@@ -55,22 +55,24 @@ struct amdgpu_winsys_bo {
 
    /* how many command streams, which are being emitted in a separate
    * thread, is this bo referenced in? */
    volatile int num_active_ioctls;
 
    /* whether buffer_get_handle or buffer_from_handle was called,
    * it can only transition from false to true */
    volatile int is_shared; /* bool (int for atomicity) */
 
-   /* Fence for buffer synchronization. */
-   struct pipe_fence_handle *fence;
+   /* Fences for buffer synchronization. */
+   unsigned num_fences;
+   unsigned max_fences;
+   struct pipe_fence_handle **fences;
 
    struct list_head global_list_item;
 };
 
 bool amdgpu_bo_can_reclaim(struct pb_buffer *_buf);
 void amdgpu_bo_destroy(struct pb_buffer *_buf);
 void amdgpu_bo_init_functions(struct amdgpu_winsys *ws);
 
 static inline struct amdgpu_winsys_bo *amdgpu_winsys_bo(struct pb_buffer *bo)
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
index e2f2974..d5ea705 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
@@ -794,63 +794,109 @@ static unsigned amdgpu_cs_get_buffer_list(struct radeon_winsys_cs *rcs,
          list[i].bo_size = cs->buffers[i].bo->base.size;
          list[i].vm_address = cs->buffers[i].bo->va;
          list[i].priority_usage = cs->buffers[i].priority_usage;
       }
    }
 
    return cs->num_buffers;
 }
 
 DEBUG_GET_ONCE_BOOL_OPTION(all_bos, "RADEON_ALL_BOS", false)
 
-/* Since the kernel driver doesn't synchronize execution between different
- * rings automatically, we have to add fence dependencies manually.
- */
-static void amdgpu_add_fence_dependencies(struct amdgpu_cs *acs)
+static void amdgpu_add_fence_dependency(struct amdgpu_cs *acs,
+                                        struct amdgpu_cs_buffer *buffer)
 {
    struct amdgpu_cs_context *cs = acs->csc;
-   int i;
+   struct amdgpu_winsys_bo *bo = buffer->bo;
+   struct amdgpu_cs_fence *dep;
+   unsigned new_num_fences = 0;
 
-   cs->request.number_of_dependencies = 0;
-
-   for (i = 0; i < cs->num_buffers; i++) {
-      struct amdgpu_cs_fence *dep;
+   for (unsigned j = 0; j < bo->num_fences; ++j) {
+      struct amdgpu_fence *bo_fence = (void *)bo->fences[j];
       unsigned idx;
 
-      struct amdgpu_fence *bo_fence = (void *)cs->buffers[i].bo->fence;
-      if (!bo_fence)
-         continue;
-
       if (bo_fence->ctx == acs->ctx &&
-         bo_fence->fence.ip_type == cs->request.ip_type &&
-         bo_fence->fence.ip_instance == cs->request.ip_instance &&
-         bo_fence->fence.ring == cs->request.ring)
+          bo_fence->fence.ip_type == cs->request.ip_type &&
+          bo_fence->fence.ip_instance == cs->request.ip_instance &&
+          bo_fence->fence.ring == cs->request.ring)
          continue;
 
       if (amdgpu_fence_wait((void *)bo_fence, 0, false))
          continue;
 
+      amdgpu_fence_reference(&bo->fences[new_num_fences], bo->fences[j]);
+      new_num_fences++;
+
+      if (!(buffer->usage & RADEON_USAGE_SYNCHRONIZED))
+         continue;
+
       if (bo_fence->submission_in_progress)
          os_wait_until_zero(&bo_fence->submission_in_progress,
                             PIPE_TIMEOUT_INFINITE);
 
       idx = cs->request.number_of_dependencies++;
       if (idx >= cs->max_dependencies) {
          unsigned size;
 
          cs->max_dependencies = idx + 8;
          size = cs->max_dependencies * sizeof(struct amdgpu_cs_fence);
          cs->request.dependencies = realloc(cs->request.dependencies, size);
       }
 
       dep = &cs->request.dependencies[idx];
       memcpy(dep, &bo_fence->fence, sizeof(*dep));
    }
+
+   for (unsigned j = new_num_fences; j < bo->num_fences; ++j)
+      amdgpu_fence_reference(&bo->fences[j], NULL);
+
+   bo->num_fences = new_num_fences;
+}
+
+/* Since the kernel driver doesn't synchronize execution between different
+ * rings automatically, we have to add fence dependencies manually.
+ */
+static void amdgpu_add_fence_dependencies(struct amdgpu_cs *acs)
+{
+   struct amdgpu_cs_context *cs = acs->csc;
+   int i;
+
+   cs->request.number_of_dependencies = 0;
+
+   for (i = 0; i < cs->num_buffers; i++)
+      amdgpu_add_fence_dependency(acs, &cs->buffers[i]);
+}
+
+static void amdgpu_add_fence(struct amdgpu_winsys_bo *bo,
+                             struct pipe_fence_handle *fence)
+{
+   if (bo->num_fences >= bo->max_fences) {
+      unsigned new_max_fences = MAX2(1, bo->max_fences * 2);
+      struct pipe_fence_handle **new_fences =
+         REALLOC(bo->fences,
+                 bo->num_fences * sizeof(*new_fences),
+                 new_max_fences * sizeof(*new_fences));
+      if (new_fences) {
+         bo->fences = new_fences;
+         bo->max_fences = new_max_fences;
+      } else {
+         fprintf(stderr, "amdgpu_add_fence: allocation failure, dropping fence\n");
+         if (!bo->num_fences)
+            return;
+
+         bo->num_fences--; /* prefer to keep a more recent fence if possible */
+         amdgpu_fence_reference(&bo->fences[bo->num_fences], NULL);
+      }
+   }
+
+   bo->fences[bo->num_fences] = NULL;
+   amdgpu_fence_reference(&bo->fences[bo->num_fences], fence);
+   bo->num_fences++;
 }
 
 void amdgpu_cs_submit_ib(void *job, int thread_index)
 {
    struct amdgpu_cs *acs = (struct amdgpu_cs*)job;
    struct amdgpu_winsys *ws = acs->ctx->ws;
    struct amdgpu_cs_context *cs = acs->cst;
    int i, r;
 
    cs->request.fence_info.handle = NULL;
@@ -1024,23 +1070,23 @@ static int amdgpu_cs_flush(struct radeon_winsys_cs *rcs,
                    cur->request.ip_instance, cur->request.ring);
       }
       if (fence)
          amdgpu_fence_reference(fence, cur->fence);
 
       /* Prepare buffers. */
       pipe_mutex_lock(ws->bo_fence_lock);
       amdgpu_add_fence_dependencies(cs);
       for (i = 0; i < num_buffers; i++) {
-         p_atomic_inc(&cur->buffers[i].bo->num_active_ioctls);
-         amdgpu_fence_reference(&cur->buffers[i].bo->fence,
-                                cur->fence);
+         struct amdgpu_winsys_bo *bo = cur->buffers[i].bo;
+         p_atomic_inc(&bo->num_active_ioctls);
+         amdgpu_add_fence(bo, cur->fence);
       }
       pipe_mutex_unlock(ws->bo_fence_lock);
 
       amdgpu_cs_sync_flush(rcs);
 
       /* Swap command streams. "cst" is going to be submitted. */
       cs->csc = cs->cst;
       cs->cst = cur;
 
       /* Submit. */
-- 
2.7.4