From: Nicolai Hähnle <nicolai.haeh...@amd.com>

... and implement the corresponding fence handling.

v2: - add missing bit in amdgpu_bo_is_referenced_by_cs_with_usage
    - remove pipe_mutex_*
---
 src/gallium/winsys/amdgpu/drm/amdgpu_cs.c | 147 +++++++++++++++++++++++++++---
 src/gallium/winsys/amdgpu/drm/amdgpu_cs.h |   9 +-
 2 files changed, 140 insertions(+), 16 deletions(-)

diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
index bffa725..3ae5d33 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
@@ -279,23 +279,26 @@ static unsigned amdgpu_cs_epilog_dws(enum ring_type ring_type)
 int amdgpu_lookup_buffer(struct amdgpu_cs_context *cs, struct amdgpu_winsys_bo *bo)
 {
    unsigned hash = bo->unique_id & (ARRAY_SIZE(cs->buffer_indices_hashlist)-1);
    int i = cs->buffer_indices_hashlist[hash];
    struct amdgpu_cs_buffer *buffers;
    int num_buffers;
 
    if (bo->bo) {
       buffers = cs->real_buffers;
       num_buffers = cs->num_real_buffers;
-   } else {
+   } else if (!bo->sparse) {
       buffers = cs->slab_buffers;
       num_buffers = cs->num_slab_buffers;
+   } else {
+      buffers = cs->sparse_buffers;
+      num_buffers = cs->num_sparse_buffers;
    }
 
    /* not found or found */
    if (i < 0 || (i < num_buffers && buffers[i].bo == bo))
       return i;
 
    /* Hash collision, look for the BO in the list of buffers linearly. */
    for (i = num_buffers - 1; i >= 0; i--) {
       if (buffers[i].bo == bo) {
          /* Put this buffer in the hash list.
@@ -418,20 +421,77 @@ static int amdgpu_lookup_or_add_slab_buffer(struct amdgpu_cs *acs,
    buffer->u.slab.real_idx = real_idx;
    p_atomic_inc(&bo->num_cs_references);
    cs->num_slab_buffers++;
 
    hash = bo->unique_id & (ARRAY_SIZE(cs->buffer_indices_hashlist)-1);
    cs->buffer_indices_hashlist[hash] = idx;
 
    return idx;
 }
 
+static int amdgpu_lookup_or_add_sparse_buffer(struct amdgpu_cs *acs,
+                                              struct amdgpu_winsys_bo *bo)
+{
+   struct amdgpu_cs_context *cs = acs->csc;
+   struct amdgpu_cs_buffer *buffer;
+   unsigned hash;
+   int idx = amdgpu_lookup_buffer(cs, bo);
+
+   if (idx >= 0)
+      return idx;
+
+   /* New buffer, check if the backing array is large enough. */
+   if (cs->num_sparse_buffers >= cs->max_sparse_buffers) {
+      unsigned new_max =
+         MAX2(cs->max_sparse_buffers + 16, (unsigned)(cs->max_sparse_buffers * 1.3));
+      struct amdgpu_cs_buffer *new_buffers;
+
+      new_buffers = REALLOC(cs->sparse_buffers,
+                            cs->max_sparse_buffers * sizeof(*new_buffers),
+                            new_max * sizeof(*new_buffers));
+      if (!new_buffers) {
+         fprintf(stderr, "amdgpu_lookup_or_add_sparse_buffer: allocation failed\n");
+         return -1;
+      }
+
+      cs->max_sparse_buffers = new_max;
+      cs->sparse_buffers = new_buffers;
+   }
+
+   idx = cs->num_sparse_buffers;
+   buffer = &cs->sparse_buffers[idx];
+
+   memset(buffer, 0, sizeof(*buffer));
+   amdgpu_winsys_bo_reference(&buffer->bo, bo);
+   p_atomic_inc(&bo->num_cs_references);
+   cs->num_sparse_buffers++;
+
+   hash = bo->unique_id & (ARRAY_SIZE(cs->buffer_indices_hashlist)-1);
+   cs->buffer_indices_hashlist[hash] = idx;
+
+   /* We delay adding the backing buffers until we really have to. However,
+    * we cannot delay accounting for memory use.
+    */
+   mtx_lock(&bo->u.sparse.commit_lock);
+
+   list_for_each_entry(struct amdgpu_sparse_backing, backing, &bo->u.sparse.backing, list) {
+      if (bo->initial_domain & RADEON_DOMAIN_VRAM)
+         acs->main.base.used_vram += backing->bo->base.size;
+      else if (bo->initial_domain & RADEON_DOMAIN_GTT)
+         acs->main.base.used_gart += backing->bo->base.size;
+   }
+
+   mtx_unlock(&bo->u.sparse.commit_lock);
+
+   return idx;
+}
+
 static unsigned amdgpu_cs_add_buffer(struct radeon_winsys_cs *rcs,
                                      struct pb_buffer *buf,
                                      enum radeon_bo_usage usage,
                                      enum radeon_bo_domain domains,
                                      enum radeon_bo_priority priority)
 {
    /* Don't use the "domains" parameter. Amdgpu doesn't support changing
     * the buffer placement during command submission.
     */
    struct amdgpu_cs *acs = amdgpu_cs(rcs);
@@ -442,39 +502,49 @@ static unsigned amdgpu_cs_add_buffer(struct radeon_winsys_cs *rcs,
 
    /* Fast exit for no-op calls.
     * This is very effective with suballocators and linear uploaders that
     * are outside of the winsys.
     */
    if (bo == cs->last_added_bo &&
        (usage & cs->last_added_bo_usage) == usage &&
        (1ull << priority) & cs->last_added_bo_priority_usage)
       return cs->last_added_bo_index;
 
-   if (!bo->bo) {
-      index = amdgpu_lookup_or_add_slab_buffer(acs, bo);
-      if (index < 0)
-         return 0;
+   if (!bo->sparse) {
+      if (!bo->bo) {
+         index = amdgpu_lookup_or_add_slab_buffer(acs, bo);
+         if (index < 0)
+            return 0;
 
-      buffer = &cs->slab_buffers[index];
-      buffer->usage |= usage;
+         buffer = &cs->slab_buffers[index];
+         buffer->usage |= usage;
 
-      usage &= ~RADEON_USAGE_SYNCHRONIZED;
-      index = buffer->u.slab.real_idx;
+         usage &= ~RADEON_USAGE_SYNCHRONIZED;
+         index = buffer->u.slab.real_idx;
+      } else {
+         index = amdgpu_lookup_or_add_real_buffer(acs, bo);
+         if (index < 0)
+            return 0;
+      }
+
+      buffer = &cs->real_buffers[index];
+      buffer->u.real.priority_usage |= 1llu << priority;
+      buffer->usage |= usage;
    } else {
-      index = amdgpu_lookup_or_add_real_buffer(acs, bo);
+      index = amdgpu_lookup_or_add_sparse_buffer(acs, bo);
       if (index < 0)
          return 0;
-   }
 
-   buffer = &cs->real_buffers[index];
-   buffer->u.real.priority_usage |= 1llu << priority;
-   buffer->usage |= usage;
+      buffer = &cs->sparse_buffers[index];
+      buffer->usage |= usage;
+      buffer->u.real.priority_usage |= 1llu << priority;
+   }
 
    cs->last_added_bo = bo;
    cs->last_added_bo_index = index;
    cs->last_added_bo_usage = buffer->usage;
    cs->last_added_bo_priority_usage = buffer->u.real.priority_usage;
    return index;
 }
 
 static bool amdgpu_ib_new_buffer(struct amdgpu_winsys *ws, struct amdgpu_ib *ib)
 {
@@ -671,38 +741,44 @@ static void amdgpu_cs_context_cleanup(struct amdgpu_cs_context *cs)
    unsigned i;
 
    for (i = 0; i < cs->num_real_buffers; i++) {
       p_atomic_dec(&cs->real_buffers[i].bo->num_cs_references);
       amdgpu_winsys_bo_reference(&cs->real_buffers[i].bo, NULL);
    }
    for (i = 0; i < cs->num_slab_buffers; i++) {
       p_atomic_dec(&cs->slab_buffers[i].bo->num_cs_references);
      amdgpu_winsys_bo_reference(&cs->slab_buffers[i].bo, NULL);
    }
+   for (i = 0; i < cs->num_sparse_buffers; i++) {
+      p_atomic_dec(&cs->sparse_buffers[i].bo->num_cs_references);
+      amdgpu_winsys_bo_reference(&cs->sparse_buffers[i].bo, NULL);
+   }
 
    cs->num_real_buffers = 0;
    cs->num_slab_buffers = 0;
+   cs->num_sparse_buffers = 0;
    amdgpu_fence_reference(&cs->fence, NULL);
 
    for (i = 0; i < ARRAY_SIZE(cs->buffer_indices_hashlist); i++) {
       cs->buffer_indices_hashlist[i] = -1;
    }
    cs->last_added_bo = NULL;
 }
 
 static void amdgpu_destroy_cs_context(struct amdgpu_cs_context *cs)
 {
    amdgpu_cs_context_cleanup(cs);
    FREE(cs->flags);
    FREE(cs->real_buffers);
    FREE(cs->handles);
    FREE(cs->slab_buffers);
+   FREE(cs->sparse_buffers);
    FREE(cs->request.dependencies);
 }
 
 
 static struct radeon_winsys_cs *
 amdgpu_cs_create(struct radeon_winsys_ctx *rwctx,
                  enum ring_type ring_type,
                  void (*flush)(void *ctx, unsigned flags,
                                struct pipe_fence_handle **fence),
                  void *flush_ctx)
@@ -1011,20 +1087,56 @@ static void amdgpu_add_fence_dependencies_list(struct amdgpu_cs *acs,
  * rings automatically, we have to add fence dependencies manually.
  */
 static void amdgpu_add_fence_dependencies(struct amdgpu_cs *acs)
 {
    struct amdgpu_cs_context *cs = acs->csc;
 
    cs->request.number_of_dependencies = 0;
 
    amdgpu_add_fence_dependencies_list(acs, cs->fence, cs->num_real_buffers, cs->real_buffers);
    amdgpu_add_fence_dependencies_list(acs, cs->fence, cs->num_slab_buffers, cs->slab_buffers);
+   amdgpu_add_fence_dependencies_list(acs, cs->fence, cs->num_sparse_buffers, cs->sparse_buffers);
+}
+
+/* Add backing of sparse buffers to the buffer list.
+ *
+ * This is done late, during submission, to keep the buffer list short before
+ * submit, and to avoid managing fences for the backing buffers.
+ */
+static bool amdgpu_add_sparse_backing_buffers(struct amdgpu_cs_context *cs)
+{
+   for (unsigned i = 0; i < cs->num_sparse_buffers; ++i) {
+      struct amdgpu_cs_buffer *buffer = &cs->sparse_buffers[i];
+      struct amdgpu_winsys_bo *bo = buffer->bo;
+
+      mtx_lock(&bo->u.sparse.commit_lock);
+
+      list_for_each_entry(struct amdgpu_sparse_backing, backing, &bo->u.sparse.backing, list) {
+         /* We can directly add the buffer here, because we know that each
+          * backing buffer occurs only once.
+          */
+         int idx = amdgpu_do_add_real_buffer(cs, backing->bo);
+         if (idx < 0) {
+            fprintf(stderr, "%s: failed to add buffer\n", __FUNCTION__);
+            mtx_unlock(&bo->u.sparse.commit_lock);
+            return false;
+         }
+
+         cs->real_buffers[idx].usage = buffer->usage & ~RADEON_USAGE_SYNCHRONIZED;
+         cs->real_buffers[idx].u.real.priority_usage = buffer->u.real.priority_usage;
+         p_atomic_inc(&backing->bo->num_active_ioctls);
+      }
+
+      mtx_unlock(&bo->u.sparse.commit_lock);
+   }
+
+   return true;
 }
 
 void amdgpu_cs_submit_ib(void *job, int thread_index)
 {
    struct amdgpu_cs *acs = (struct amdgpu_cs*)job;
    struct amdgpu_winsys *ws = acs->ctx->ws;
    struct amdgpu_cs_context *cs = acs->cst;
    int i, r;
 
    cs->request.fence_info.handle = NULL;
@@ -1055,20 +1167,25 @@ void amdgpu_cs_submit_ib(void *job, int thread_index)
          assert(num < ws->num_buffers);
          handles[num++] = bo->bo;
       }
 
       r = amdgpu_bo_list_create(ws->dev, ws->num_buffers,
                                 handles, NULL,
                                 &cs->request.resources);
       free(handles);
       mtx_unlock(&ws->global_bo_list_lock);
    } else {
+      if (!amdgpu_add_sparse_backing_buffers(cs)) {
+         r = -ENOMEM;
+         goto bo_list_error;
+      }
+
      if (cs->max_real_submit < cs->num_real_buffers) {
         FREE(cs->handles);
         FREE(cs->flags);
 
         cs->handles = MALLOC(sizeof(*cs->handles) * cs->num_real_buffers);
         cs->flags = MALLOC(sizeof(*cs->flags) * cs->num_real_buffers);
 
         if (!cs->handles || !cs->flags) {
            cs->max_real_submit = 0;
            r = -ENOMEM;
@@ -1129,20 +1246,22 @@ bo_list_error:
 
    /* Cleanup. */
    if (cs->request.resources)
       amdgpu_bo_list_destroy(cs->request.resources);
 
 cleanup:
    for (i = 0; i < cs->num_real_buffers; i++)
       p_atomic_dec(&cs->real_buffers[i].bo->num_active_ioctls);
    for (i = 0; i < cs->num_slab_buffers; i++)
       p_atomic_dec(&cs->slab_buffers[i].bo->num_active_ioctls);
+   for (i = 0; i < cs->num_sparse_buffers; i++)
+      p_atomic_dec(&cs->sparse_buffers[i].bo->num_active_ioctls);
 
    amdgpu_cs_context_cleanup(cs);
 }
 
 /* Make sure the previous submission is completed. */
 void amdgpu_cs_sync_flush(struct radeon_winsys_cs *rcs)
 {
    struct amdgpu_cs *cs = amdgpu_cs(rcs);
 
    /* Wait for any pending ioctl of this CS to complete. */
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h
index 242410f..d700b8c 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h
@@ -87,20 +87,24 @@ struct amdgpu_cs_context {
    struct amdgpu_cs_buffer *real_buffers;
 
    unsigned max_real_submit;
    amdgpu_bo_handle *handles;
    uint8_t *flags;
 
    unsigned num_slab_buffers;
    unsigned max_slab_buffers;
    struct amdgpu_cs_buffer *slab_buffers;
 
+   unsigned num_sparse_buffers;
+   unsigned max_sparse_buffers;
+   struct amdgpu_cs_buffer *sparse_buffers;
+
    int buffer_indices_hashlist[4096];
 
    struct amdgpu_winsys_bo *last_added_bo;
    unsigned last_added_bo_index;
    unsigned last_added_bo_usage;
    uint64_t last_added_bo_priority_usage;
 
    unsigned max_dependencies;
 
    struct pipe_fence_handle *fence;
@@ -219,22 +223,23 @@ amdgpu_bo_is_referenced_by_cs_with_usage(struct amdgpu_cs *cs,
    int index;
    struct amdgpu_cs_buffer *buffer;
 
    if (!bo->num_cs_references)
      return false;
 
    index = amdgpu_lookup_buffer(cs->csc, bo);
    if (index == -1)
       return false;
 
-   buffer = bo->bo ? &cs->csc->real_buffers[index]
-                   : &cs->csc->slab_buffers[index];
+   buffer = bo->bo ? &cs->csc->real_buffers[index] :
+            bo->sparse ? &cs->csc->sparse_buffers[index] :
+            &cs->csc->slab_buffers[index];
 
    return (buffer->usage & usage) != 0;
 }
 
 static inline bool amdgpu_bo_is_referenced_by_any_cs(struct amdgpu_winsys_bo *bo)
 {
    return bo->num_cs_references != 0;
 }
 
-- 
2.9.3

_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev