From: Nicolai Hähnle <nicolai.haeh...@amd.com>

---
 src/gallium/drivers/radeon/radeon_winsys.h |   3 +
 src/gallium/winsys/amdgpu/drm/amdgpu_cs.c  | 168 ++++++++++++++++++++++-------
 src/gallium/winsys/amdgpu/drm/amdgpu_cs.h  |  26 +++--
 3 files changed, 152 insertions(+), 45 deletions(-)
diff --git a/src/gallium/drivers/radeon/radeon_winsys.h b/src/gallium/drivers/radeon/radeon_winsys.h
index 8196358..0ab1f01 100644
--- a/src/gallium/drivers/radeon/radeon_winsys.h
+++ b/src/gallium/drivers/radeon/radeon_winsys.h
@@ -634,20 +634,23 @@ struct radeon_winsys {
     * Optionally chain a new chunk of the IB if necessary and supported.
     *
     * \param cs      A command stream.
     * \param dw      Number of CS dwords requested by the caller.
     */
    bool (*cs_check_space)(struct radeon_winsys_cs *cs, unsigned dw);

    /**
     * Return the buffer list.
     *
+    * This is the buffer list as passed to the kernel, i.e. it only contains
+    * the parent buffers of sub-allocated buffers.
+    *
     * \param cs    Command stream
     * \param list  Returned buffer list. Set to NULL to query the count only.
     * \return      The buffer count.
     */
    unsigned (*cs_get_buffer_list)(struct radeon_winsys_cs *cs,
                                   struct radeon_bo_list_item *list);

    /**
     * Flush a command stream.
     *
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
index 6fc47aa..c0e810c 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
@@ -260,129 +260,202 @@ static unsigned amdgpu_cs_epilog_dws(enum ring_type ring_type)
    if (ring_type == RING_GFX)
       return 4; /* for chaining */

    return 0;
 }

 int amdgpu_lookup_buffer(struct amdgpu_cs_context *cs, struct amdgpu_winsys_bo *bo)
 {
    unsigned hash = bo->unique_id & (ARRAY_SIZE(cs->buffer_indices_hashlist)-1);
    int i = cs->buffer_indices_hashlist[hash];
+   struct amdgpu_cs_buffer *buffers;
+   int num_buffers;
+
+   if (bo->bo) {
+      buffers = cs->real_buffers;
+      num_buffers = cs->num_real_buffers;
+   } else {
+      buffers = cs->slab_buffers;
+      num_buffers = cs->num_slab_buffers;
+   }

    /* not found or found */
-   if (i == -1 || cs->buffers[i].bo == bo)
+   if (i < 0 || (i < num_buffers && buffers[i].bo == bo))
       return i;

    /* Hash collision, look for the BO in the list of buffers linearly. */
-   for (i = cs->num_buffers - 1; i >= 0; i--) {
-      if (cs->buffers[i].bo == bo) {
+   for (i = num_buffers - 1; i >= 0; i--) {
+      if (buffers[i].bo == bo) {
          /* Put this buffer in the hash list.
          * This will prevent additional hash collisions if there are
          * several consecutive lookup_buffer calls for the same buffer.
          *
          * Example: Assuming buffers A,B,C collide in the hash list,
          * the following sequence of buffers:
          *         AAAAAAAAAAABBBBBBBBBBBBBBCCCCCCCC
          * will collide here: ^ and here:   ^,
          * meaning that we should get very few collisions in the end. */
         cs->buffer_indices_hashlist[hash] = i;
         return i;
      }
   }

   return -1;
 }

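For reviewers less familiar with this scheme: buffer_indices_hashlist is not a hash table in the classical sense, it only caches a candidate index, and the buffer arrays stay the single source of truth. A minimal self-contained sketch of the idea, with illustrative types (ctx, buf) rather than the winsys ones:

   #include <stddef.h>

   #define HASHLIST_SIZE 4096   /* power of two, so the mask below works */

   struct buf { unsigned unique_id; };

   struct ctx {
      struct buf **buffers;
      int num_buffers;
      int hashlist[HASHLIST_SIZE];   /* -1 = empty, else a candidate index */
   };

   /* Return the index of "bo" in ctx->buffers, or -1. The cached index
    * is only a hint and must be validated against the array. */
   static int lookup(struct ctx *ctx, struct buf *bo)
   {
      unsigned hash = bo->unique_id & (HASHLIST_SIZE - 1);
      int i = ctx->hashlist[hash];

      if (i >= 0 && i < ctx->num_buffers && ctx->buffers[i] == bo)
         return i;   /* fast path: the cached candidate was right */

      /* Slow path: linear search, then re-seed the cache so repeated
       * lookups of the same buffer hit the fast path again. */
      for (i = ctx->num_buffers - 1; i >= 0; i--) {
         if (ctx->buffers[i] == bo) {
            ctx->hashlist[hash] = i;
            return i;
         }
      }
      return -1;
   }

Note that the patch has to add the "i < num_buffers" bounds check: the hashlist is now shared by two lists of different lengths, so a cached index that is valid for one list can be out of range for the other.
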
 static int
-amdgpu_lookup_or_add_buffer(struct amdgpu_cs *acs, struct amdgpu_winsys_bo *bo)
+amdgpu_lookup_or_add_real_buffer(struct amdgpu_cs *acs, struct amdgpu_winsys_bo *bo)
 {
    struct amdgpu_cs_context *cs = acs->csc;
    struct amdgpu_cs_buffer *buffer;
    unsigned hash;
    int idx = amdgpu_lookup_buffer(cs, bo);

    if (idx >= 0)
       return idx;

    /* New buffer, check if the backing array is large enough. */
-   if (cs->num_buffers >= cs->max_num_buffers) {
+   if (cs->num_real_buffers >= cs->max_real_buffers) {
       unsigned new_max =
-         MAX2(cs->max_num_buffers + 16, (unsigned)(cs->max_num_buffers * 1.3));
+         MAX2(cs->max_real_buffers + 16, (unsigned)(cs->max_real_buffers * 1.3));
       struct amdgpu_cs_buffer *new_buffers;
       amdgpu_bo_handle *new_handles;
       uint8_t *new_flags;

       new_buffers = MALLOC(new_max * sizeof(*new_buffers));
       new_handles = MALLOC(new_max * sizeof(*new_handles));
       new_flags = MALLOC(new_max * sizeof(*new_flags));

       if (!new_buffers || !new_handles || !new_flags) {
          fprintf(stderr, "amdgpu_lookup_or_add_buffer: allocation failed\n");
          FREE(new_buffers);
          FREE(new_handles);
          FREE(new_flags);
          return -1;
       }

-      memcpy(new_buffers, cs->buffers, cs->num_buffers * sizeof(*new_buffers));
-      memcpy(new_handles, cs->handles, cs->num_buffers * sizeof(*new_handles));
-      memcpy(new_flags, cs->flags, cs->num_buffers * sizeof(*new_flags));
+      memcpy(new_buffers, cs->real_buffers, cs->num_real_buffers * sizeof(*new_buffers));
+      memcpy(new_handles, cs->handles, cs->num_real_buffers * sizeof(*new_handles));
+      memcpy(new_flags, cs->flags, cs->num_real_buffers * sizeof(*new_flags));

-      FREE(cs->buffers);
+      FREE(cs->real_buffers);
       FREE(cs->handles);
       FREE(cs->flags);

-      cs->max_num_buffers = new_max;
-      cs->buffers = new_buffers;
+      cs->max_real_buffers = new_max;
+      cs->real_buffers = new_buffers;
       cs->handles = new_handles;
       cs->flags = new_flags;
    }

-   idx = cs->num_buffers;
-   buffer = &cs->buffers[idx];
+   idx = cs->num_real_buffers;
+   buffer = &cs->real_buffers[idx];
+   memset(buffer, 0, sizeof(*buffer));
    amdgpu_winsys_bo_reference(&buffer->bo, bo);
    cs->handles[idx] = bo->bo;
    cs->flags[idx] = 0;
    p_atomic_inc(&bo->num_cs_references);
-   cs->num_buffers++;
+   cs->num_real_buffers++;

    hash = bo->unique_id & (ARRAY_SIZE(cs->buffer_indices_hashlist)-1);
    cs->buffer_indices_hashlist[hash] = idx;

    if (bo->initial_domain & RADEON_DOMAIN_VRAM)
       acs->main.base.used_vram += bo->base.size;
    else if (bo->initial_domain & RADEON_DOMAIN_GTT)
       acs->main.base.used_gart += bo->base.size;

    return idx;
 }

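Both resize paths in this patch use the same growth policy: at least 16 additional slots, otherwise a factor of 1.3. A standalone sketch of just that policy, with MAX2 spelled out (generic names; the winsys MALLOC/REALLOC wrappers are not assumed here):

   #include <stdlib.h>

   /* Grow a dynamic array so it can hold at least one more element,
    * using new_max = max(old_max + 16, old_max * 1.3). Returns the
    * (possibly moved) array, or NULL with the old array left intact. */
   static void *grow(void *array, unsigned *max, size_t elem_size)
   {
      unsigned by_step = *max + 16;
      unsigned by_factor = (unsigned)(*max * 1.3);
      unsigned new_max = by_step > by_factor ? by_step : by_factor;
      void *tmp = realloc(array, new_max * elem_size);

      if (!tmp)
         return NULL;

      *max = new_max;
      return tmp;
   }

The two call sites differ in mechanism for a reason: the real-buffer path grows three parallel arrays (buffers, handles, flags) with MALLOC plus memcpy so that a mid-way failure leaves all three consistent, while the slab path below has a single array and can simply REALLOC it.
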
+static int amdgpu_lookup_or_add_slab_buffer(struct amdgpu_cs *acs,
+                                            struct amdgpu_winsys_bo *bo)
+{
+   struct amdgpu_cs_context *cs = acs->csc;
+   struct amdgpu_cs_buffer *buffer;
+   unsigned hash;
+   int idx = amdgpu_lookup_buffer(cs, bo);
+   int real_idx;
+
+   if (idx >= 0)
+      return idx;
+
+   real_idx = amdgpu_lookup_or_add_real_buffer(acs, bo->u.slab.real);
+   if (real_idx < 0)
+      return -1;
+
+   /* New buffer, check if the backing array is large enough. */
+   if (cs->num_slab_buffers >= cs->max_slab_buffers) {
+      unsigned new_max =
+         MAX2(cs->max_slab_buffers + 16, (unsigned)(cs->max_slab_buffers * 1.3));
+      struct amdgpu_cs_buffer *new_buffers;
+
+      new_buffers = REALLOC(cs->slab_buffers,
+                            cs->max_slab_buffers * sizeof(*new_buffers),
+                            new_max * sizeof(*new_buffers));
+      if (!new_buffers) {
+         fprintf(stderr, "amdgpu_lookup_or_add_slab_buffer: allocation failed\n");
+         return -1;
+      }
+
+      cs->max_slab_buffers = new_max;
+      cs->slab_buffers = new_buffers;
+   }
+
+   idx = cs->num_slab_buffers;
+   buffer = &cs->slab_buffers[idx];
+
+   memset(buffer, 0, sizeof(*buffer));
+   amdgpu_winsys_bo_reference(&buffer->bo, bo);
+   buffer->u.slab.real_idx = real_idx;
+   p_atomic_inc(&bo->num_cs_references);
+   cs->num_slab_buffers++;
+
+   hash = bo->unique_id & (ARRAY_SIZE(cs->buffer_indices_hashlist)-1);
+   cs->buffer_indices_hashlist[hash] = idx;
+
+   return idx;
+}
+
 static unsigned amdgpu_cs_add_buffer(struct radeon_winsys_cs *rcs,
                                      struct pb_buffer *buf,
                                      enum radeon_bo_usage usage,
                                      enum radeon_bo_domain domains,
                                      enum radeon_bo_priority priority)
 {
    /* Don't use the "domains" parameter. Amdgpu doesn't support changing
     * the buffer placement during command submission.
     */
    struct amdgpu_cs *acs = amdgpu_cs(rcs);
    struct amdgpu_cs_context *cs = acs->csc;
    struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf;
    struct amdgpu_cs_buffer *buffer;
-   int index = amdgpu_lookup_or_add_buffer(acs, bo);
+   int index;

-   if (index < 0)
-      return 0;
+   if (!bo->bo) {
+      index = amdgpu_lookup_or_add_slab_buffer(acs, bo);
+      if (index < 0)
+         return 0;

-   buffer = &cs->buffers[index];
-   buffer->priority_usage |= 1llu << priority;
+      buffer = &cs->slab_buffers[index];
+      buffer->usage |= usage;
+
+      usage &= ~RADEON_USAGE_SYNCHRONIZED;
+      index = buffer->u.slab.real_idx;
+   } else {
+      index = amdgpu_lookup_or_add_real_buffer(acs, bo);
+      if (index < 0)
+         return 0;
+   }
+
+   buffer = &cs->real_buffers[index];
+   buffer->u.real.priority_usage |= 1llu << priority;
    buffer->usage |= usage;
    cs->flags[index] = MAX2(cs->flags[index], priority / 4);
    return index;
 }

 static bool amdgpu_ib_new_buffer(struct amdgpu_winsys *ws, struct amdgpu_ib *ib)
 {
    struct pb_buffer *pb;
    uint8_t *mapped;
    unsigned buffer_size;
@@ -567,41 +640,45 @@ static bool amdgpu_init_cs_context(struct amdgpu_cs_context *cs,
       cs->ib[IB_CONST_PREAMBLE].flags = AMDGPU_IB_FLAG_CE |
                                         AMDGPU_IB_FLAG_PREAMBLE;

    return true;
 }

 static void amdgpu_cs_context_cleanup(struct amdgpu_cs_context *cs)
 {
    unsigned i;

-   for (i = 0; i < cs->num_buffers; i++) {
-      p_atomic_dec(&cs->buffers[i].bo->num_cs_references);
-      amdgpu_winsys_bo_reference(&cs->buffers[i].bo, NULL);
-      cs->handles[i] = NULL;
-      cs->flags[i] = 0;
+   for (i = 0; i < cs->num_real_buffers; i++) {
+      p_atomic_dec(&cs->real_buffers[i].bo->num_cs_references);
+      amdgpu_winsys_bo_reference(&cs->real_buffers[i].bo, NULL);
+   }
+   for (i = 0; i < cs->num_slab_buffers; i++) {
+      p_atomic_dec(&cs->slab_buffers[i].bo->num_cs_references);
+      amdgpu_winsys_bo_reference(&cs->slab_buffers[i].bo, NULL);
    }

-   cs->num_buffers = 0;
+   cs->num_real_buffers = 0;
+   cs->num_slab_buffers = 0;
    amdgpu_fence_reference(&cs->fence, NULL);

    for (i = 0; i < ARRAY_SIZE(cs->buffer_indices_hashlist); i++) {
       cs->buffer_indices_hashlist[i] = -1;
    }
 }

 static void amdgpu_destroy_cs_context(struct amdgpu_cs_context *cs)
 {
    amdgpu_cs_context_cleanup(cs);
    FREE(cs->flags);
-   FREE(cs->buffers);
+   FREE(cs->real_buffers);
    FREE(cs->handles);
+   FREE(cs->slab_buffers);
    FREE(cs->request.dependencies);
 }

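The interesting part of amdgpu_cs_add_buffer is the double bookkeeping for sub-allocated buffers: the full usage mask is recorded on the slab entry, and the usage propagated to the parent's real entry has RADEON_USAGE_SYNCHRONIZED masked off. The sketch below restates that flow with illustrative types; the rationale in the comments is my reading of how amdgpu_add_fence_dependency consumes the flag, not wording from the patch itself:

   #include <stdint.h>

   enum usage_flags {
      USAGE_READ         = 1 << 0,
      USAGE_WRITE        = 1 << 1,
      USAGE_SYNCHRONIZED = 1 << 2,   /* serialize against prior users */
   };

   struct entry {
      unsigned usage;
      uint64_t priority_usage;
   };

   /* Record "usage" for a sub-allocated buffer. The slab entry keeps
    * the full mask, so fence dependencies are still tracked per
    * sub-allocation. The parent entry (the one the kernel's buffer
    * list actually sees) drops SYNCHRONIZED, so that unrelated
    * sub-allocations sharing one parent BO do not serialize against
    * each other. */
   static void add_slab_usage(struct entry *slab, struct entry *real,
                              unsigned usage, unsigned priority)
   {
      slab->usage |= usage;

      real->usage |= usage & ~USAGE_SYNCHRONIZED;
      real->priority_usage |= UINT64_C(1) << priority;
   }
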
 static struct radeon_winsys_cs *
 amdgpu_cs_create(struct radeon_winsys_ctx *rwctx,
                  enum ring_type ring_type,
                  void (*flush)(void *ctx, unsigned flags,
                                struct pipe_fence_handle **fence),
                  void *flush_ctx)
@@ -783,27 +860,27 @@ static bool amdgpu_cs_check_space(struct radeon_winsys_cs *rcs, unsigned dw)

    return true;
 }

 static unsigned amdgpu_cs_get_buffer_list(struct radeon_winsys_cs *rcs,
                                           struct radeon_bo_list_item *list)
 {
    struct amdgpu_cs_context *cs = amdgpu_cs(rcs)->csc;
    int i;

    if (list) {
-      for (i = 0; i < cs->num_buffers; i++) {
-         list[i].bo_size = cs->buffers[i].bo->base.size;
-         list[i].vm_address = cs->buffers[i].bo->va;
-         list[i].priority_usage = cs->buffers[i].priority_usage;
+      for (i = 0; i < cs->num_real_buffers; i++) {
+         list[i].bo_size = cs->real_buffers[i].bo->base.size;
+         list[i].vm_address = cs->real_buffers[i].bo->va;
+         list[i].priority_usage = cs->real_buffers[i].u.real.priority_usage;
       }
    }
-   return cs->num_buffers;
+   return cs->num_real_buffers;
 }

 DEBUG_GET_ONCE_BOOL_OPTION(all_bos, "RADEON_ALL_BOS", false)

 static void amdgpu_add_fence_dependency(struct amdgpu_cs *acs,
                                         struct amdgpu_cs_buffer *buffer)
 {
    struct amdgpu_cs_context *cs = acs->csc;
    struct amdgpu_winsys_bo *bo = buffer->bo;
    struct amdgpu_cs_fence *dep;
@@ -854,22 +931,24 @@ static void amdgpu_add_fence_dependency(struct amdgpu_cs *acs,

 /* Since the kernel driver doesn't synchronize execution between different
  * rings automatically, we have to add fence dependencies manually.
  */
 static void amdgpu_add_fence_dependencies(struct amdgpu_cs *acs)
 {
    struct amdgpu_cs_context *cs = acs->csc;
    int i;

    cs->request.number_of_dependencies = 0;

-   for (i = 0; i < cs->num_buffers; i++)
-      amdgpu_add_fence_dependency(acs, &cs->buffers[i]);
+   for (i = 0; i < cs->num_real_buffers; i++)
+      amdgpu_add_fence_dependency(acs, &cs->real_buffers[i]);
+   for (i = 0; i < cs->num_slab_buffers; i++)
+      amdgpu_add_fence_dependency(acs, &cs->slab_buffers[i]);
 }

 static void amdgpu_add_fence(struct amdgpu_winsys_bo *bo,
                              struct pipe_fence_handle *fence)
 {
    if (bo->num_fences >= bo->max_fences) {
       unsigned new_max_fences = MAX2(1, bo->max_fences * 2);
       struct pipe_fence_handle **new_fences =
          REALLOC(bo->fences,
                  bo->num_fences * sizeof(*new_fences),
@@ -927,21 +1006,21 @@ void amdgpu_cs_submit_ib(void *job, int thread_index)
          assert(num < ws->num_buffers);
          handles[num++] = bo->bo;
       }

       r = amdgpu_bo_list_create(ws->dev, ws->num_buffers,
                                 handles, NULL,
                                 &cs->request.resources);
       free(handles);
       pipe_mutex_unlock(ws->global_bo_list_lock);
    } else {
-      r = amdgpu_bo_list_create(ws->dev, cs->num_buffers,
+      r = amdgpu_bo_list_create(ws->dev, cs->num_real_buffers,
                                 cs->handles, cs->flags,
                                 &cs->request.resources);
    }

    if (r) {
       fprintf(stderr, "amdgpu: buffer list creation failed (%d)\n", r);
       cs->request.resources = NULL;
       amdgpu_fence_signalled(cs->fence);
       cs->error_code = r;
       goto cleanup;
@@ -964,22 +1043,24 @@ void amdgpu_cs_submit_ib(void *job, int thread_index)
       user_fence = acs->ctx->user_fence_cpu_address_base +
                    cs->request.fence_info.offset;
       amdgpu_fence_submitted(cs->fence, &cs->request, user_fence);
    }

    /* Cleanup. */
    if (cs->request.resources)
       amdgpu_bo_list_destroy(cs->request.resources);

 cleanup:
-   for (i = 0; i < cs->num_buffers; i++)
-      p_atomic_dec(&cs->buffers[i].bo->num_active_ioctls);
+   for (i = 0; i < cs->num_real_buffers; i++)
+      p_atomic_dec(&cs->real_buffers[i].bo->num_active_ioctls);
+   for (i = 0; i < cs->num_slab_buffers; i++)
+      p_atomic_dec(&cs->slab_buffers[i].bo->num_active_ioctls);

    amdgpu_cs_context_cleanup(cs);
 }

 /* Make sure the previous submission is completed. */
 void amdgpu_cs_sync_flush(struct radeon_winsys_cs *rcs)
 {
    struct amdgpu_cs *cs = amdgpu_cs(rcs);
    struct amdgpu_winsys *ws = cs->ctx->ws;

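amdgpu_add_fence above is a plain append-with-doubling; only its head is visible in the context lines. A generic sketch of the pattern, under the assumption that fence lifetimes are managed elsewhere (the real code holds references on pipe fences; fence is left opaque here):

   #include <stdlib.h>

   struct fence;

   struct fenced_bo {
      struct fence **fences;
      unsigned num_fences;
      unsigned max_fences;
   };

   /* Append one fence, doubling the capacity on demand. Capacity
    * starts at 1, matching MAX2(1, bo->max_fences * 2). Returns 0 and
    * leaves the old array valid if the allocation fails. */
   static int bo_add_fence(struct fenced_bo *bo, struct fence *f)
   {
      if (bo->num_fences >= bo->max_fences) {
         unsigned new_max = bo->max_fences ? bo->max_fences * 2 : 1;
         struct fence **tmp = realloc(bo->fences, new_max * sizeof(*tmp));

         if (!tmp)
            return 0;

         bo->fences = tmp;
         bo->max_fences = new_max;
      }

      bo->fences[bo->num_fences++] = f;
      return 1;
   }
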
@@ -1040,21 +1121,21 @@ static int amdgpu_cs_flush(struct radeon_winsys_cs *rcs,
    if (rcs->current.cdw > rcs->current.max_dw) {
       fprintf(stderr, "amdgpu: command stream overflowed\n");
    }

    /* If the CS is not empty or overflowed.... */
    if (radeon_emitted(&cs->main.base, 0) &&
        cs->main.base.current.cdw <= cs->main.base.current.max_dw &&
        !debug_get_option_noop()) {
       struct amdgpu_cs_context *cur = cs->csc;
-      unsigned i, num_buffers = cur->num_buffers;
+      unsigned i, num_buffers;

       /* Set IB sizes. */
       amdgpu_ib_finalize(&cs->main);

       if (cs->const_ib.ib_mapped)
          amdgpu_ib_finalize(&cs->const_ib);

       if (cs->const_preamble_ib.ib_mapped)
          amdgpu_ib_finalize(&cs->const_preamble_ib);
@@ -1069,22 +1150,31 @@ static int amdgpu_cs_flush(struct radeon_winsys_cs *rcs,
                 cur->request.ip_type, cur->request.ip_instance,
                 cur->request.ring);
       }

       if (fence)
          amdgpu_fence_reference(fence, cur->fence);

       /* Prepare buffers. */
       pipe_mutex_lock(ws->bo_fence_lock);
       amdgpu_add_fence_dependencies(cs);
+
+      num_buffers = cur->num_real_buffers;
+      for (i = 0; i < num_buffers; i++) {
+         struct amdgpu_winsys_bo *bo = cur->real_buffers[i].bo;
+         p_atomic_inc(&bo->num_active_ioctls);
+         amdgpu_add_fence(bo, cur->fence);
+      }
+
+      num_buffers = cur->num_slab_buffers;
       for (i = 0; i < num_buffers; i++) {
-         struct amdgpu_winsys_bo *bo = cur->buffers[i].bo;
+         struct amdgpu_winsys_bo *bo = cur->slab_buffers[i].bo;
          p_atomic_inc(&bo->num_active_ioctls);
          amdgpu_add_fence(bo, cur->fence);
       }
       pipe_mutex_unlock(ws->bo_fence_lock);

       amdgpu_cs_sync_flush(rcs);

       /* Swap command streams. "cst" is going to be submitted. */
       cs->csc = cs->cst;
       cs->cst = cur;
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h
index 51753db..5f181a5 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h
@@ -38,21 +38,28 @@ struct amdgpu_ctx {
    struct amdgpu_winsys *ws;
    amdgpu_context_handle ctx;
    amdgpu_bo_handle user_fence_bo;
    uint64_t *user_fence_cpu_address_base;
    int refcount;
 };

 struct amdgpu_cs_buffer {
    struct amdgpu_winsys_bo *bo;
-   uint64_t priority_usage;
+   union {
+      struct {
+         uint64_t priority_usage;
+      } real;
+      struct {
+         uint32_t real_idx; /* index of underlying real BO */
+      } slab;
+   } u;
    enum radeon_bo_usage usage;
 };

 enum ib_type {
    IB_CONST_PREAMBLE = 0,
    IB_CONST = 1, /* the const IB must be first */
    IB_MAIN = 2,
    IB_NUM
 };

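The new union in amdgpu_cs_buffer is deliberately untagged: whether an entry is real or sub-allocated is already encoded in the BO itself, since a sub-allocated amdgpu_winsys_bo has no kernel handle (bo->bo is NULL), so every access site can recover the variant. A reduced sketch of the convention, with illustrative types:

   #include <stdint.h>

   struct winsys_bo {
      void *kernel_handle;   /* NULL iff sub-allocated from a slab */
   };

   struct cs_buffer {
      struct winsys_bo *bo;
      union {
         struct { uint64_t priority_usage; } real;
         struct { uint32_t real_idx; } slab;   /* index into the real list */
      } u;
      unsigned usage;
   };

   /* The discriminant lives on the BO, not in the entry: the same
    * test (bo->bo != NULL in the patch) selects the list to search,
    * the union member that is valid, and the branch taken in
    * amdgpu_cs_add_buffer. */
   static int entry_is_real(const struct cs_buffer *buf)
   {
      return buf->bo->kernel_handle != NULL;
   }

The same test appears again in amdgpu_bo_is_referenced_by_cs_with_usage below, which picks real_buffers or slab_buffers before reading the shared usage field.
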
@@ -66,28 +73,31 @@ struct amdgpu_ib {
    unsigned max_ib_size;
    uint32_t *ptr_ib_size;
    enum ib_type ib_type;
 };

 struct amdgpu_cs_context {
    struct amdgpu_cs_request    request;
    struct amdgpu_cs_ib_info    ib[IB_NUM];

    /* Buffers. */
-   unsigned                    max_num_buffers;
-   unsigned                    num_buffers;
+   unsigned                    max_real_buffers;
+   unsigned                    num_real_buffers;
    amdgpu_bo_handle            *handles;
    uint8_t                     *flags;
-   struct amdgpu_cs_buffer     *buffers;
+   struct amdgpu_cs_buffer     *real_buffers;

-   int                         buffer_indices_hashlist[4096];
+   unsigned                    num_slab_buffers;
+   unsigned                    max_slab_buffers;
+   struct amdgpu_cs_buffer     *slab_buffers;

+   int                         buffer_indices_hashlist[4096];

    unsigned                    max_dependencies;

    struct pipe_fence_handle    *fence;

    /* the error returned from cs_flush for non-async submissions */
    int                         error_code;
 };

 struct amdgpu_cs {
@@ -191,29 +201,33 @@ amdgpu_bo_is_referenced_by_cs(struct amdgpu_cs *cs,
    return num_refs == bo->ws->num_cs ||
      (num_refs && amdgpu_lookup_buffer(cs->csc, bo) != -1);
 }

 static inline bool
 amdgpu_bo_is_referenced_by_cs_with_usage(struct amdgpu_cs *cs,
                                          struct amdgpu_winsys_bo *bo,
                                          enum radeon_bo_usage usage)
 {
    int index;
+   struct amdgpu_cs_buffer *buffer;

    if (!bo->num_cs_references)
       return false;

    index = amdgpu_lookup_buffer(cs->csc, bo);
    if (index == -1)
       return false;

-   return (cs->csc->buffers[index].usage & usage) != 0;
+   buffer = bo->bo ? &cs->csc->real_buffers[index]
+                   : &cs->csc->slab_buffers[index];
+
+   return (buffer->usage & usage) != 0;
 }

 static inline bool
 amdgpu_bo_is_referenced_by_any_cs(struct amdgpu_winsys_bo *bo)
 {
    return bo->num_cs_references != 0;
 }

 bool amdgpu_fence_wait(struct pipe_fence_handle *fence, uint64_t timeout,
                        bool absolute);
-- 
2.7.4