From: Marek Olšák <marek.ol...@amd.com>

First this happens:

1) amdgpu_cs_flush (lock bo_fence_lock)
   -> amdgpu_add_fence_dependency
   -> os_wait_until_zero (wait for submission_in_progress) - WAITING

2) amdgpu_bo_create
   -> pb_cache_reclaim_buffer (lock pb_cache::mutex)
   -> pb_cache_is_buffer_compat
   -> amdgpu_bo_wait (lock bo_fence_lock) - WAITING

So both bo_fence_lock and pb_cache::mutex are held. amdgpu_bo_create can't
continue. amdgpu_cs_flush is waiting for the CS ioctl to finish the job,
but the CS ioctl is trying to release a buffer:

3) amdgpu_cs_submit_ib (CS thread - job entrypoint)
   -> amdgpu_cs_context_cleanup
   -> pb_reference
   -> pb_destroy
   -> amdgpu_bo_destroy_or_cache
   -> pb_cache_add_buffer (lock pb_cache::mutex) - DEADLOCK

The simple solution is not to wait for submission_in_progress, which we
need in order to create the list of dependencies for the CS ioctl.

Instead of building the list of dependencies as a direct input to the
CS ioctl, build the list of dependencies as a list of fences, and make
the final list of dependencies in the CS thread itself.

Therefore, amdgpu_cs_flush doesn't have to wait and can continue. Then,
amdgpu_bo_create can continue and return. And then amdgpu_cs_submit_ib
can continue.
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=101294 --- src/gallium/winsys/amdgpu/drm/amdgpu_cs.c | 55 ++++++++++++++++++++++--------- src/gallium/winsys/amdgpu/drm/amdgpu_cs.h | 4 ++- 2 files changed, 43 insertions(+), 16 deletions(-) diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c index a0ef826..c88be06 100644 --- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c @@ -745,39 +745,42 @@ static void amdgpu_cs_context_cleanup(struct amdgpu_cs_context *cs) amdgpu_winsys_bo_reference(&cs->real_buffers[i].bo, NULL); } for (i = 0; i < cs->num_slab_buffers; i++) { p_atomic_dec(&cs->slab_buffers[i].bo->num_cs_references); amdgpu_winsys_bo_reference(&cs->slab_buffers[i].bo, NULL); } for (i = 0; i < cs->num_sparse_buffers; i++) { p_atomic_dec(&cs->sparse_buffers[i].bo->num_cs_references); amdgpu_winsys_bo_reference(&cs->sparse_buffers[i].bo, NULL); } + for (i = 0; i < cs->num_fence_dependencies; i++) + amdgpu_fence_reference(&cs->fence_dependencies[i], NULL); cs->num_real_buffers = 0; cs->num_slab_buffers = 0; cs->num_sparse_buffers = 0; + cs->num_fence_dependencies = 0; amdgpu_fence_reference(&cs->fence, NULL); memset(cs->buffer_indices_hashlist, -1, sizeof(cs->buffer_indices_hashlist)); cs->last_added_bo = NULL; } static void amdgpu_destroy_cs_context(struct amdgpu_cs_context *cs) { amdgpu_cs_context_cleanup(cs); FREE(cs->flags); FREE(cs->real_buffers); FREE(cs->handles); FREE(cs->slab_buffers); FREE(cs->sparse_buffers); - FREE(cs->request.dependencies); + FREE(cs->fence_dependencies); } static struct radeon_winsys_cs * amdgpu_cs_create(struct radeon_winsys_ctx *rwctx, enum ring_type ring_type, void (*flush)(void *ctx, unsigned flags, struct pipe_fence_handle **fence), void *flush_ctx) { @@ -974,21 +977,20 @@ static unsigned amdgpu_cs_get_buffer_list(struct radeon_winsys_cs *rcs, return cs->num_real_buffers; } DEBUG_GET_ONCE_BOOL_OPTION(all_bos, "RADEON_ALL_BOS", false) 
static void amdgpu_add_fence_dependency(struct amdgpu_cs *acs, struct amdgpu_cs_buffer *buffer) { struct amdgpu_cs_context *cs = acs->csc; struct amdgpu_winsys_bo *bo = buffer->bo; - struct amdgpu_cs_fence *dep; unsigned new_num_fences = 0; for (unsigned j = 0; j < bo->num_fences; ++j) { struct amdgpu_fence *bo_fence = (void *)bo->fences[j]; unsigned idx; if (bo_fence->ctx == acs->ctx && bo_fence->fence.ip_type == cs->request.ip_type && bo_fence->fence.ip_instance == cs->request.ip_instance && bo_fence->fence.ring == cs->request.ring) @@ -996,35 +998,35 @@ static void amdgpu_add_fence_dependency(struct amdgpu_cs *acs, if (amdgpu_fence_wait((void *)bo_fence, 0, false)) continue; amdgpu_fence_reference(&bo->fences[new_num_fences], bo->fences[j]); new_num_fences++; if (!(buffer->usage & RADEON_USAGE_SYNCHRONIZED)) continue; - if (bo_fence->submission_in_progress) - os_wait_until_zero(&bo_fence->submission_in_progress, - PIPE_TIMEOUT_INFINITE); - - idx = cs->request.number_of_dependencies++; - if (idx >= cs->max_dependencies) { + idx = cs->num_fence_dependencies++; + if (idx >= cs->max_fence_dependencies) { unsigned size; - - cs->max_dependencies = idx + 8; - size = cs->max_dependencies * sizeof(struct amdgpu_cs_fence); - cs->request.dependencies = realloc(cs->request.dependencies, size); + const unsigned increment = 8; + + cs->max_fence_dependencies = idx + increment; + size = cs->max_fence_dependencies * sizeof(cs->fence_dependencies[0]); + cs->fence_dependencies = realloc(cs->fence_dependencies, size); + /* Clear the newly-allocated elements. 
*/ + memset(cs->fence_dependencies + idx, 0, + increment * sizeof(cs->fence_dependencies[0])); } - dep = &cs->request.dependencies[idx]; - memcpy(dep, &bo_fence->fence, sizeof(*dep)); + amdgpu_fence_reference(&cs->fence_dependencies[idx], + (struct pipe_fence_handle*)bo_fence); } for (unsigned j = new_num_fences; j < bo->num_fences; ++j) amdgpu_fence_reference(&bo->fences[j], NULL); bo->num_fences = new_num_fences; } /* Add the given list of fences to the buffer's fence list. * @@ -1081,21 +1083,21 @@ static void amdgpu_add_fence_dependencies_list(struct amdgpu_cs *acs, } } /* Since the kernel driver doesn't synchronize execution between different * rings automatically, we have to add fence dependencies manually. */ static void amdgpu_add_fence_dependencies(struct amdgpu_cs *acs) { struct amdgpu_cs_context *cs = acs->csc; - cs->request.number_of_dependencies = 0; + cs->num_fence_dependencies = 0; amdgpu_add_fence_dependencies_list(acs, cs->fence, cs->num_real_buffers, cs->real_buffers); amdgpu_add_fence_dependencies_list(acs, cs->fence, cs->num_slab_buffers, cs->slab_buffers); amdgpu_add_fence_dependencies_list(acs, cs->fence, cs->num_sparse_buffers, cs->sparse_buffers); } /* Add backing of sparse buffers to the buffer list. * * This is done late, during submission, to keep the buffer list short before * submit, and to avoid managing fences for the backing buffers. @@ -1129,21 +1131,44 @@ static bool amdgpu_add_sparse_backing_buffers(struct amdgpu_cs_context *cs) return true; } void amdgpu_cs_submit_ib(void *job, int thread_index) { struct amdgpu_cs *acs = (struct amdgpu_cs*)job; struct amdgpu_winsys *ws = acs->ctx->ws; struct amdgpu_cs_context *cs = acs->cst; int i, r; + struct amdgpu_cs_fence *dependencies = NULL; + + /* Set dependencies (input fences). 
*/ + if (cs->num_fence_dependencies) { + dependencies = alloca(sizeof(dependencies[0]) * + cs->num_fence_dependencies); + unsigned num = 0; + + for (i = 0; i < cs->num_fence_dependencies; i++) { + struct amdgpu_fence *fence = + (struct amdgpu_fence*)cs->fence_dependencies[i]; + + /* Past fences can't be unsubmitted because we have only 1 CS thread. */ + assert(!fence->submission_in_progress); + memcpy(&dependencies[num++], &fence->fence, sizeof(dependencies[0])); + } + cs->request.dependencies = dependencies; + cs->request.number_of_dependencies = num; + } else { + cs->request.dependencies = NULL; + cs->request.number_of_dependencies = 0; + } + /* Set the output fence. */ cs->request.fence_info.handle = NULL; if (amdgpu_cs_has_user_fence(cs)) { cs->request.fence_info.handle = acs->ctx->user_fence_bo; cs->request.fence_info.offset = acs->ring_type; } /* Create the buffer list. * Use a buffer list containing all allocated buffers if requested. */ if (debug_get_option_all_bos()) { diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h index d700b8c..d83c1e0 100644 --- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h @@ -98,21 +98,23 @@ struct amdgpu_cs_context { unsigned max_sparse_buffers; struct amdgpu_cs_buffer *sparse_buffers; int buffer_indices_hashlist[4096]; struct amdgpu_winsys_bo *last_added_bo; unsigned last_added_bo_index; unsigned last_added_bo_usage; uint64_t last_added_bo_priority_usage; - unsigned max_dependencies; + struct pipe_fence_handle **fence_dependencies; + unsigned num_fence_dependencies; + unsigned max_fence_dependencies; struct pipe_fence_handle *fence; /* the error returned from cs_flush for non-async submissions */ int error_code; }; struct amdgpu_cs { struct amdgpu_ib main; /* must be first because this is inherited */ struct amdgpu_ib const_ib; /* optional constant engine IB */ -- 2.7.4 _______________________________________________ mesa-dev mailing 
list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev