On Sat, May 7, 2016 at 5:12 PM, Nicolai Hähnle <nhaeh...@gmail.com> wrote: > Looks good to me, just two remarks below... > > > On 06.05.2016 13:31, Marek Olšák wrote: >> >> From: Marek Olšák <marek.ol...@amd.com> >> >> Ported from the initial amdgpu winsys from the private AMD branch. >> >> The thread creates the buffer list, submits IBs, and cleans up >> the submission context, which can also destroy buffers. >> >> 3-5% reduction in CPU overhead is expected for apps submitting a lot >> of IBs per frame. This is most visible with DMA IBs. >> --- >> src/gallium/winsys/amdgpu/drm/amdgpu_bo.c | 26 ++- >> src/gallium/winsys/amdgpu/drm/amdgpu_bo.h | 4 + >> src/gallium/winsys/amdgpu/drm/amdgpu_cs.c | 311 >> +++++++++++++++++--------- >> src/gallium/winsys/amdgpu/drm/amdgpu_cs.h | 52 +++-- >> src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c | 61 +++++ >> src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h | 9 + >> 6 files changed, 333 insertions(+), 130 deletions(-) >> >> diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c >> b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c >> index 37a41c0..ec5fa6a 100644 >> --- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c >> +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c >> @@ -43,8 +43,21 @@ static bool amdgpu_bo_wait(struct pb_buffer *_buf, >> uint64_t timeout, >> { >> struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf); >> struct amdgpu_winsys *ws = bo->ws; >> + int64_t abs_timeout; >> int i; >> >> + if (timeout == 0) { >> + if (p_atomic_read(&bo->num_active_ioctls)) >> + return false; >> + >> + } else { >> + abs_timeout = os_time_get_absolute_timeout(timeout); >> + >> + /* Wait if any ioctl is being submitted with this buffer. */ >> + if (!os_wait_until_zero_abs_timeout(&bo->num_active_ioctls, >> abs_timeout)) >> + return false; >> + } > > > I'd suggest to do the cs_sync_flush here instead of below - there is less > action at a distance, and some additional code paths end up covered by a > flush as well.
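For reference, doing it in amdgpu_bo_wait would look roughly like this (sketch only; the CS parameter is hypothetical, the function doesn't take one today):

   static bool amdgpu_bo_wait(struct pb_buffer *_buf, uint64_t timeout,
                              enum radeon_bo_usage usage,
                              struct radeon_winsys_cs *rcs) /* hypothetical */
   {
      struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);

      if (timeout == 0) {
         if (p_atomic_read(&bo->num_active_ioctls))
            return false;
      } else {
         /* Try to avoid busy-waiting below if our own submission is
          * still in flight in the other thread. */
         if (rcs && p_atomic_read(&bo->num_active_ioctls))
            amdgpu_cs_sync_flush(rcs);

         /* Wait if any ioctl is being submitted with this buffer. */
         if (!os_wait_until_zero_abs_timeout(&bo->num_active_ioctls,
                                 os_time_get_absolute_timeout(timeout)))
            return false;
      }
      /* ... fence handling continues as in the patch ... */
   }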
Unfortunately, amdgpu_bo_wait is exposed via the winsys interface and doesn't accept a CS. We could extend it to accept a CS or two, but that would mean adding most of what r600_buffer_map_sync_with_rings is doing. It would be a bigger cleanup and I think it should be done as a separate patch if we wanted to go down that road. > > >> + >> if (bo->is_shared) { >> /* We can't use user fences for shared buffers, because user >> fences >> * are local to this process only. If we want to wait for all >> buffer >> @@ -61,7 +74,6 @@ static bool amdgpu_bo_wait(struct pb_buffer *_buf, >> uint64_t timeout, >> } >> >> if (timeout == 0) { >> - /* Timeout == 0 is quite simple. */ >> pipe_mutex_lock(ws->bo_fence_lock); >> for (i = 0; i < RING_LAST; i++) >> if (bo->fence[i]) { >> @@ -80,7 +92,6 @@ static bool amdgpu_bo_wait(struct pb_buffer *_buf, >> uint64_t timeout, >> struct pipe_fence_handle *fence[RING_LAST] = {}; >> bool fence_idle[RING_LAST] = {}; >> bool buffer_idle = true; >> - int64_t abs_timeout = os_time_get_absolute_timeout(timeout); >> >> /* Take references to all fences, so that we can wait for them >> * without the lock. */ >> @@ -214,8 +225,15 @@ static void *amdgpu_bo_map(struct pb_buffer *buf, >> RADEON_USAGE_WRITE); >> } else { >> /* Mapping for write. */ >> - if (cs && amdgpu_bo_is_referenced_by_cs(cs, bo)) >> - cs->flush_cs(cs->flush_data, 0, NULL); >> + if (cs) { >> + if (amdgpu_bo_is_referenced_by_cs(cs, bo)) { >> + cs->flush_cs(cs->flush_data, 0, NULL); >> + } else { >> + /* Try to avoid busy-waiting in radeon_bo_wait. */ >> + if (p_atomic_read(&bo->num_active_ioctls)) >> + amdgpu_cs_sync_flush(rcs); >> + } >> + } >> >> amdgpu_bo_wait((struct pb_buffer*)bo, PIPE_TIMEOUT_INFINITE, >> RADEON_USAGE_READWRITE); >> diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h >> b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h >> index 69ada10..a768771 100644 >> --- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h >> +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h >> @@ -53,6 +53,10 @@ struct amdgpu_winsys_bo { >> /* how many command streams is this bo referenced in? */ >> int num_cs_references; >> >> + /* how many command streams, which are being emitted in a separate >> + * thread, is this bo referenced in? 
*/ >> + volatile int num_active_ioctls; >> + >> /* whether buffer_get_handle or buffer_from_handle was called, >> * it can only transition from false to true >> */ >> diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c >> b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c >> index 03e45a9..419523b 100644 >> --- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c >> +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c >> @@ -50,6 +50,7 @@ amdgpu_fence_create(struct amdgpu_ctx *ctx, unsigned >> ip_type, >> fence->fence.ip_type = ip_type; >> fence->fence.ip_instance = ip_instance; >> fence->fence.ring = ring; >> + fence->submission_in_progress = true; >> p_atomic_inc(&ctx->refcount); >> return (struct pipe_fence_handle *)fence; >> } >> @@ -62,6 +63,7 @@ static void amdgpu_fence_submitted(struct >> pipe_fence_handle *fence, >> >> rfence->fence.fence = request->seq_no; >> rfence->user_fence_cpu_address = user_fence_cpu_address; >> + rfence->submission_in_progress = false; >> } >> >> static void amdgpu_fence_signalled(struct pipe_fence_handle *fence) >> @@ -69,6 +71,7 @@ static void amdgpu_fence_signalled(struct >> pipe_fence_handle *fence) >> struct amdgpu_fence *rfence = (struct amdgpu_fence*)fence; >> >> rfence->signalled = true; >> + rfence->submission_in_progress = false; >> } >> >> bool amdgpu_fence_wait(struct pipe_fence_handle *fence, uint64_t >> timeout, >> @@ -88,6 +91,13 @@ bool amdgpu_fence_wait(struct pipe_fence_handle *fence, >> uint64_t timeout, >> else >> abs_timeout = os_time_get_absolute_timeout(timeout); >> >> + /* The fence might not have a number assigned if its IB is being >> + * submitted in the other thread right now. Wait until the submission >> + * is done. */ >> + if (!os_wait_until_zero_abs_timeout(&rfence->submission_in_progress, >> + abs_timeout)) >> + return false; >> + >> user_fence_cpu = rfence->user_fence_cpu_address; >> if (user_fence_cpu && *user_fence_cpu >= rfence->fence.fence) { >> rfence->signalled = true; >> @@ -257,7 +267,7 @@ static bool amdgpu_get_new_ib(struct radeon_winsys >> *ws, struct amdgpu_ib *ib, >> return true; >> } >> >> -static boolean amdgpu_init_cs_context(struct amdgpu_cs *cs, >> +static boolean amdgpu_init_cs_context(struct amdgpu_cs_context *cs, >> enum ring_type ring_type) >> { >> int i; >> @@ -308,10 +318,18 @@ static boolean amdgpu_init_cs_context(struct >> amdgpu_cs *cs, >> for (i = 0; i < Elements(cs->buffer_indices_hashlist); i++) { >> cs->buffer_indices_hashlist[i] = -1; >> } >> + >> + cs->request.number_of_ibs = 1; >> + cs->request.ibs = &cs->ib[IB_MAIN]; >> + >> + cs->ib[IB_CONST].flags = AMDGPU_IB_FLAG_CE; >> + cs->ib[IB_CONST_PREAMBLE].flags = AMDGPU_IB_FLAG_CE | >> + AMDGPU_IB_FLAG_PREAMBLE; >> + >> return TRUE; >> } >> >> -static void amdgpu_cs_context_cleanup(struct amdgpu_cs *cs) >> +static void amdgpu_cs_context_cleanup(struct amdgpu_cs_context *cs) >> { >> unsigned i; >> >> @@ -325,13 +343,14 @@ static void amdgpu_cs_context_cleanup(struct >> amdgpu_cs *cs) >> cs->num_buffers = 0; >> cs->used_gart = 0; >> cs->used_vram = 0; >> + amdgpu_fence_reference(&cs->fence, NULL); >> >> for (i = 0; i < Elements(cs->buffer_indices_hashlist); i++) { >> cs->buffer_indices_hashlist[i] = -1; >> } >> } >> >> -static void amdgpu_destroy_cs_context(struct amdgpu_cs *cs) >> +static void amdgpu_destroy_cs_context(struct amdgpu_cs_context *cs) >> { >> amdgpu_cs_context_cleanup(cs); >> FREE(cs->flags); >> @@ -356,24 +375,35 @@ amdgpu_cs_create(struct radeon_winsys_ctx *rwctx, >> return NULL; >> } >> >> + pipe_semaphore_init(&cs->flush_completed, 1); >> + >> 
cs->ctx = ctx; >> cs->flush_cs = flush; >> cs->flush_data = flush_ctx; >> cs->ring_type = ring_type; >> >> - if (!amdgpu_init_cs_context(cs, ring_type)) { >> + if (!amdgpu_init_cs_context(&cs->csc1, ring_type)) { >> FREE(cs); >> return NULL; >> } >> >> - if (!amdgpu_get_new_ib(&ctx->ws->base, &cs->main, &cs->ib[IB_MAIN], >> IB_MAIN)) { >> - amdgpu_destroy_cs_context(cs); >> + if (!amdgpu_init_cs_context(&cs->csc2, ring_type)) { >> + amdgpu_destroy_cs_context(&cs->csc1); >> FREE(cs); >> return NULL; >> } >> >> - cs->request.number_of_ibs = 1; >> - cs->request.ibs = &cs->ib[IB_MAIN]; >> + /* Set the first submission context as current. */ >> + cs->csc = &cs->csc1; >> + cs->cst = &cs->csc2; >> + >> + if (!amdgpu_get_new_ib(&ctx->ws->base, &cs->main, >> &cs->csc->ib[IB_MAIN], >> + IB_MAIN)) { >> + amdgpu_destroy_cs_context(&cs->csc2); >> + amdgpu_destroy_cs_context(&cs->csc1); >> + FREE(cs); >> + return NULL; >> + } >> >> p_atomic_inc(&ctx->ws->num_cs); >> return &cs->main.base; >> @@ -389,12 +419,15 @@ amdgpu_cs_add_const_ib(struct radeon_winsys_cs *rcs) >> if (cs->ring_type != RING_GFX || cs->const_ib.ib_mapped) >> return NULL; >> >> - if (!amdgpu_get_new_ib(&ws->base, &cs->const_ib, &cs->ib[IB_CONST], >> IB_CONST)) >> + if (!amdgpu_get_new_ib(&ws->base, &cs->const_ib, >> &cs->csc->ib[IB_CONST], >> + IB_CONST)) >> return NULL; >> >> - cs->request.number_of_ibs = 2; >> - cs->request.ibs = &cs->ib[IB_CONST]; >> - cs->ib[IB_CONST].flags = AMDGPU_IB_FLAG_CE; >> + cs->csc->request.number_of_ibs = 2; >> + cs->csc->request.ibs = &cs->csc->ib[IB_CONST]; >> + >> + cs->cst->request.number_of_ibs = 2; >> + cs->cst->request.ibs = &cs->cst->ib[IB_CONST]; >> >> return &cs->const_ib.base; >> } >> @@ -412,19 +445,21 @@ amdgpu_cs_add_const_preamble_ib(struct >> radeon_winsys_cs *rcs) >> return NULL; >> >> if (!amdgpu_get_new_ib(&ws->base, &cs->const_preamble_ib, >> - &cs->ib[IB_CONST_PREAMBLE], >> IB_CONST_PREAMBLE)) >> + &cs->csc->ib[IB_CONST_PREAMBLE], >> IB_CONST_PREAMBLE)) >> return NULL; >> >> - cs->request.number_of_ibs = 3; >> - cs->request.ibs = &cs->ib[IB_CONST_PREAMBLE]; >> - cs->ib[IB_CONST_PREAMBLE].flags = AMDGPU_IB_FLAG_CE | >> AMDGPU_IB_FLAG_PREAMBLE; >> + cs->csc->request.number_of_ibs = 3; >> + cs->csc->request.ibs = &cs->csc->ib[IB_CONST_PREAMBLE]; >> + >> + cs->cst->request.number_of_ibs = 3; >> + cs->cst->request.ibs = &cs->cst->ib[IB_CONST_PREAMBLE]; >> >> return &cs->const_preamble_ib.base; >> } >> >> #define OUT_CS(cs, value) (cs)->buf[(cs)->cdw++] = (value) >> >> -int amdgpu_lookup_buffer(struct amdgpu_cs *cs, struct amdgpu_winsys_bo >> *bo) >> +int amdgpu_lookup_buffer(struct amdgpu_cs_context *cs, struct >> amdgpu_winsys_bo *bo) >> { >> unsigned hash = bo->unique_id & >> (Elements(cs->buffer_indices_hashlist)-1); >> int i = cs->buffer_indices_hashlist[hash]; >> @@ -452,13 +487,14 @@ int amdgpu_lookup_buffer(struct amdgpu_cs *cs, >> struct amdgpu_winsys_bo *bo) >> return -1; >> } >> >> -static unsigned amdgpu_add_buffer(struct amdgpu_cs *cs, >> +static unsigned amdgpu_add_buffer(struct amdgpu_cs *acs, >> struct amdgpu_winsys_bo *bo, >> enum radeon_bo_usage usage, >> enum radeon_bo_domain domains, >> unsigned priority, >> enum radeon_bo_domain *added_domains) >> { >> + struct amdgpu_cs_context *cs = acs->csc; >> struct amdgpu_cs_buffer *buffer; >> unsigned hash = bo->unique_id & >> (Elements(cs->buffer_indices_hashlist)-1); >> int i = -1; >> @@ -526,9 +562,9 @@ static unsigned amdgpu_cs_add_buffer(struct >> radeon_winsys_cs *rcs, >> priority, &added_domains); >> >> if (added_domains & 
RADEON_DOMAIN_VRAM) >> - cs->used_vram += bo->base.size; >> + cs->csc->used_vram += bo->base.size; >> else if (added_domains & RADEON_DOMAIN_GTT) >> - cs->used_gart += bo->base.size; >> + cs->csc->used_gart += bo->base.size; >> >> return index; >> } >> @@ -538,7 +574,7 @@ static int amdgpu_cs_lookup_buffer(struct >> radeon_winsys_cs *rcs, >> { >> struct amdgpu_cs *cs = amdgpu_cs(rcs); >> >> - return amdgpu_lookup_buffer(cs, (struct amdgpu_winsys_bo*)buf); >> + return amdgpu_lookup_buffer(cs->csc, (struct amdgpu_winsys_bo*)buf); >> } >> >> static boolean amdgpu_cs_validate(struct radeon_winsys_cs *rcs) >> @@ -551,8 +587,8 @@ static boolean amdgpu_cs_memory_below_limit(struct >> radeon_winsys_cs *rcs, uint64 >> struct amdgpu_cs *cs = amdgpu_cs(rcs); >> struct amdgpu_winsys *ws = cs->ctx->ws; >> >> - vram += cs->used_vram; >> - gtt += cs->used_gart; >> + vram += cs->csc->used_vram; >> + gtt += cs->csc->used_gart; >> >> /* Anything that goes above the VRAM size should go to GTT. */ >> if (vram > ws->info.vram_size) >> @@ -565,7 +601,7 @@ static boolean amdgpu_cs_memory_below_limit(struct >> radeon_winsys_cs *rcs, uint64 >> static unsigned amdgpu_cs_get_buffer_list(struct radeon_winsys_cs *rcs, >> struct radeon_bo_list_item >> *list) >> { >> - struct amdgpu_cs *cs = amdgpu_cs(rcs); >> + struct amdgpu_cs_context *cs = amdgpu_cs(rcs)->csc; >> int i; >> >> if (list) { >> @@ -578,26 +614,18 @@ static unsigned amdgpu_cs_get_buffer_list(struct >> radeon_winsys_cs *rcs, >> return cs->num_buffers; >> } >> >> -static void amdgpu_cs_do_submission(struct amdgpu_cs *cs, >> - struct pipe_fence_handle **out_fence) >> -{ >> - struct amdgpu_winsys *ws = cs->ctx->ws; >> - struct pipe_fence_handle *fence; >> - int i, j, r; >> +DEBUG_GET_ONCE_BOOL_OPTION(all_bos, "RADEON_ALL_BOS", FALSE) >> >> - /* Create a fence. */ >> - fence = amdgpu_fence_create(cs->ctx, >> - cs->request.ip_type, >> - cs->request.ip_instance, >> - cs->request.ring); >> - if (out_fence) >> - amdgpu_fence_reference(out_fence, fence); >> +/* Since the kernel driver doesn't synchronize execution between >> different >> + * rings automatically, we have to add fence dependencies manually. >> + */ >> +static void amdgpu_add_fence_dependencies(struct amdgpu_cs *acs) >> +{ >> + struct amdgpu_cs_context *cs = acs->csc; >> + int i, j; >> >> cs->request.number_of_dependencies = 0; >> >> - /* Since the kernel driver doesn't synchronize execution between >> different >> - * rings automatically, we have to add fence dependencies manually. 
*/ >> - pipe_mutex_lock(ws->bo_fence_lock); >> for (i = 0; i < cs->num_buffers; i++) { >> for (j = 0; j < RING_LAST; j++) { >> struct amdgpu_cs_fence *dep; >> @@ -607,7 +635,7 @@ static void amdgpu_cs_do_submission(struct amdgpu_cs >> *cs, >> if (!bo_fence) >> continue; >> >> - if (bo_fence->ctx == cs->ctx && >> + if (bo_fence->ctx == acs->ctx && >> bo_fence->fence.ip_type == cs->request.ip_type && >> bo_fence->fence.ip_instance == cs->request.ip_instance && >> bo_fence->fence.ring == cs->request.ring) >> @@ -616,6 +644,10 @@ static void amdgpu_cs_do_submission(struct amdgpu_cs >> *cs, >> if (amdgpu_fence_wait((void *)bo_fence, 0, false)) >> continue; >> >> + if (bo_fence->submission_in_progress) >> + os_wait_until_zero(&bo_fence->submission_in_progress, >> + PIPE_TIMEOUT_INFINITE); >> + >> idx = cs->request.number_of_dependencies++; >> if (idx >= cs->max_dependencies) { >> unsigned size; >> @@ -629,14 +661,62 @@ static void amdgpu_cs_do_submission(struct amdgpu_cs >> *cs, >> memcpy(dep, &bo_fence->fence, sizeof(*dep)); >> } >> } >> +} >> + >> +void amdgpu_cs_submit_ib(struct amdgpu_cs *acs) >> +{ >> + struct amdgpu_winsys *ws = acs->ctx->ws; >> + struct amdgpu_cs_context *cs = acs->cst; >> + int i, r; >> >> cs->request.fence_info.handle = NULL; >> - if (cs->request.ip_type != AMDGPU_HW_IP_UVD && cs->request.ip_type != >> AMDGPU_HW_IP_VCE) { >> - cs->request.fence_info.handle = cs->ctx->user_fence_bo; >> - cs->request.fence_info.offset = cs->ring_type; >> + if (cs->request.ip_type != AMDGPU_HW_IP_UVD && >> + cs->request.ip_type != AMDGPU_HW_IP_VCE) { >> + cs->request.fence_info.handle = acs->ctx->user_fence_bo; >> + cs->request.fence_info.offset = acs->ring_type; >> + } >> + >> + /* Create the buffer list. >> + * Use a buffer list containing all allocated buffers if requested. >> + */ >> + if (debug_get_option_all_bos()) { >> + struct amdgpu_winsys_bo *bo; >> + amdgpu_bo_handle *handles; >> + unsigned num = 0; >> + >> + pipe_mutex_lock(ws->global_bo_list_lock); >> + >> + handles = malloc(sizeof(handles[0]) * ws->num_buffers); >> + if (!handles) { >> + pipe_mutex_unlock(ws->global_bo_list_lock); >> + amdgpu_cs_context_cleanup(cs); >> + return; >> + } >> + >> + LIST_FOR_EACH_ENTRY(bo, &ws->global_bo_list, global_list_item) { >> + assert(num < ws->num_buffers); >> + handles[num++] = bo->bo; >> + } >> + >> + r = amdgpu_bo_list_create(ws->dev, ws->num_buffers, >> + handles, NULL, >> + &cs->request.resources); >> + free(handles); >> + pipe_mutex_unlock(ws->global_bo_list_lock); >> + } else { >> + r = amdgpu_bo_list_create(ws->dev, cs->num_buffers, >> + cs->handles, cs->flags, >> + &cs->request.resources); >> } >> >> - r = amdgpu_cs_submit(cs->ctx->ctx, 0, &cs->request, 1); >> + if (r) { >> + fprintf(stderr, "amdgpu: buffer list creation failed (%d)\n", r); >> + cs->request.resources = NULL; >> + amdgpu_fence_signalled(cs->fence); >> + goto cleanup; >> + } >> + >> + r = amdgpu_cs_submit(acs->ctx->ctx, 0, &cs->request, 1); >> if (r) { >> if (r == -ENOMEM) >> fprintf(stderr, "amdgpu: Not enough memory for command >> submission.\n"); >> @@ -644,30 +724,43 @@ static void amdgpu_cs_do_submission(struct amdgpu_cs >> *cs, >> fprintf(stderr, "amdgpu: The CS has been rejected, " >> "see dmesg for more information.\n"); >> >> - amdgpu_fence_signalled(fence); >> + amdgpu_fence_signalled(cs->fence); >> } else { >> /* Success. 
*/ >> uint64_t *user_fence = NULL; >> - if (cs->request.ip_type != AMDGPU_HW_IP_UVD && cs->request.ip_type >> != AMDGPU_HW_IP_VCE) >> - user_fence = cs->ctx->user_fence_cpu_address_base + >> + if (cs->request.ip_type != AMDGPU_HW_IP_UVD && >> + cs->request.ip_type != AMDGPU_HW_IP_VCE) >> + user_fence = acs->ctx->user_fence_cpu_address_base + >> cs->request.fence_info.offset; >> - amdgpu_fence_submitted(fence, &cs->request, user_fence); >> - >> - for (i = 0; i < cs->num_buffers; i++) >> - amdgpu_fence_reference(&cs->buffers[i].bo->fence[cs->ring_type], >> - fence); >> + amdgpu_fence_submitted(cs->fence, &cs->request, user_fence); >> } >> - pipe_mutex_unlock(ws->bo_fence_lock); >> - amdgpu_fence_reference(&fence, NULL); >> + >> + /* Cleanup. */ >> + if (cs->request.resources) >> + amdgpu_bo_list_destroy(cs->request.resources); >> + >> +cleanup: >> + for (i = 0; i < cs->num_buffers; i++) >> + p_atomic_dec(&cs->buffers[i].bo->num_active_ioctls); >> + >> + amdgpu_cs_context_cleanup(cs); >> } >> >> -static void amdgpu_cs_sync_flush(struct radeon_winsys_cs *rcs) >> +/* Make sure the previous submission is completed. */ >> +void amdgpu_cs_sync_flush(struct radeon_winsys_cs *rcs) >> { >> - /* no-op */ >> + struct amdgpu_cs *cs = amdgpu_cs(rcs); >> + >> + /* Wait for any pending ioctl of this CS to complete. */ >> + if (cs->ctx->ws->thread) { >> + /* wait and set the semaphore to "busy" */ >> + pipe_semaphore_wait(&cs->flush_completed); >> + /* set the semaphore to "idle" */ >> + pipe_semaphore_signal(&cs->flush_completed); >> + } >> } >> >> DEBUG_GET_ONCE_BOOL_OPTION(noop, "RADEON_NOOP", FALSE) >> -DEBUG_GET_ONCE_BOOL_OPTION(all_bos, "RADEON_ALL_BOS", FALSE) >> >> static void amdgpu_cs_flush(struct radeon_winsys_cs *rcs, >> unsigned flags, >> @@ -720,74 +813,69 @@ static void amdgpu_cs_flush(struct radeon_winsys_cs >> *rcs, >> RADEON_USAGE_READ, 0, RADEON_PRIO_IB1); >> >> /* If the CS is not empty or overflowed.... */ >> - if (cs->main.base.cdw && cs->main.base.cdw <= cs->main.base.max_dw && >> !debug_get_option_noop()) { >> - int r; >> - >> - /* Use a buffer list containing all allocated buffers if requested. >> */ >> - if (debug_get_option_all_bos()) { >> - struct amdgpu_winsys_bo *bo; >> - amdgpu_bo_handle *handles; >> - unsigned num = 0; >> - >> - pipe_mutex_lock(ws->global_bo_list_lock); >> - >> - handles = malloc(sizeof(handles[0]) * ws->num_buffers); >> - if (!handles) { >> - pipe_mutex_unlock(ws->global_bo_list_lock); >> - goto cleanup; >> - } >> + if (cs->main.base.cdw && cs->main.base.cdw <= cs->main.base.max_dw && >> + !debug_get_option_noop()) { >> + struct amdgpu_cs_context *cur = cs->csc; >> + unsigned i, num_buffers = cur->num_buffers; >> >> - LIST_FOR_EACH_ENTRY(bo, &ws->global_bo_list, global_list_item) { >> - assert(num < ws->num_buffers); >> - handles[num++] = bo->bo; >> - } >> - >> - r = amdgpu_bo_list_create(ws->dev, ws->num_buffers, >> - handles, NULL, >> - &cs->request.resources); >> - free(handles); >> - pipe_mutex_unlock(ws->global_bo_list_lock); >> - } else { >> - r = amdgpu_bo_list_create(ws->dev, cs->num_buffers, >> - cs->handles, cs->flags, >> - &cs->request.resources); >> - } >> - >> - if (r) { >> - fprintf(stderr, "amdgpu: resource list creation failed (%d)\n", >> r); >> - cs->request.resources = NULL; >> - goto cleanup; >> - } >> - >> - cs->ib[IB_MAIN].size = cs->main.base.cdw; >> + /* Set IB sizes. 
*/ >> + cur->ib[IB_MAIN].size = cs->main.base.cdw; >> cs->main.used_ib_space += cs->main.base.cdw * 4; >> >> if (cs->const_ib.ib_mapped) { >> - cs->ib[IB_CONST].size = cs->const_ib.base.cdw; >> + cur->ib[IB_CONST].size = cs->const_ib.base.cdw; >> cs->const_ib.used_ib_space += cs->const_ib.base.cdw * 4; >> } >> >> if (cs->const_preamble_ib.ib_mapped) { >> - cs->ib[IB_CONST_PREAMBLE].size = cs->const_preamble_ib.base.cdw; >> + cur->ib[IB_CONST_PREAMBLE].size = >> cs->const_preamble_ib.base.cdw; >> cs->const_preamble_ib.used_ib_space += >> cs->const_preamble_ib.base.cdw * 4; >> } >> >> - amdgpu_cs_do_submission(cs, fence); >> + /* Create a fence. */ >> + amdgpu_fence_reference(&cur->fence, NULL); >> + cur->fence = amdgpu_fence_create(cs->ctx, >> + cur->request.ip_type, >> + cur->request.ip_instance, >> + cur->request.ring); >> + if (fence) >> + amdgpu_fence_reference(fence, cur->fence); >> + >> + /* Prepare buffers. */ >> + pipe_mutex_lock(ws->bo_fence_lock); >> + amdgpu_add_fence_dependencies(cs); >> + for (i = 0; i < num_buffers; i++) { >> + p_atomic_inc(&cur->buffers[i].bo->num_active_ioctls); >> + >> amdgpu_fence_reference(&cur->buffers[i].bo->fence[cs->ring_type], >> + cur->fence); >> + } >> + pipe_mutex_unlock(ws->bo_fence_lock); >> >> - /* Cleanup. */ >> - if (cs->request.resources) >> - amdgpu_bo_list_destroy(cs->request.resources); >> - } >> + amdgpu_cs_sync_flush(rcs); >> >> -cleanup: >> - amdgpu_cs_context_cleanup(cs); >> + /* Swap command streams. "cst" is going to be submitted. */ >> + cs->csc = cs->cst; >> + cs->cst = cur; >> + >> + /* Submit. */ >> + if (ws->thread && (flags & RADEON_FLUSH_ASYNC)) { >> + /* Set the semaphore to "busy". */ >> + pipe_semaphore_wait(&cs->flush_completed); >> + amdgpu_ws_queue_cs(ws, cs); >> + } else { >> + amdgpu_cs_submit_ib(cs); >> + } >> + } else { >> + amdgpu_cs_context_cleanup(cs->csc); >> + } >> >> - amdgpu_get_new_ib(&ws->base, &cs->main, &cs->ib[IB_MAIN], IB_MAIN); >> + amdgpu_get_new_ib(&ws->base, &cs->main, &cs->csc->ib[IB_MAIN], >> IB_MAIN); >> if (cs->const_ib.ib_mapped) >> - amdgpu_get_new_ib(&ws->base, &cs->const_ib, &cs->ib[IB_CONST], >> IB_CONST); >> + amdgpu_get_new_ib(&ws->base, &cs->const_ib, &cs->csc->ib[IB_CONST], >> + IB_CONST); >> if (cs->const_preamble_ib.ib_mapped) >> amdgpu_get_new_ib(&ws->base, &cs->const_preamble_ib, >> - &cs->ib[IB_CONST_PREAMBLE], >> IB_CONST_PREAMBLE); >> + &cs->csc->ib[IB_CONST_PREAMBLE], >> IB_CONST_PREAMBLE); >> >> ws->num_cs_flushes++; >> } >> @@ -796,11 +884,14 @@ static void amdgpu_cs_destroy(struct >> radeon_winsys_cs *rcs) >> { >> struct amdgpu_cs *cs = amdgpu_cs(rcs); >> >> - amdgpu_destroy_cs_context(cs); >> + amdgpu_cs_sync_flush(rcs); >> + pipe_semaphore_destroy(&cs->flush_completed); >> p_atomic_dec(&cs->ctx->ws->num_cs); >> pb_reference(&cs->main.big_ib_buffer, NULL); >> pb_reference(&cs->const_ib.big_ib_buffer, NULL); >> pb_reference(&cs->const_preamble_ib.big_ib_buffer, NULL); >> + amdgpu_destroy_cs_context(&cs->csc1); >> + amdgpu_destroy_cs_context(&cs->csc2); >> FREE(cs); >> } >> >> diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h >> b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h >> index 4ed830b..1caec0a 100644 >> --- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h >> +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h >> @@ -66,18 +66,7 @@ enum { >> IB_NUM >> }; >> >> -struct amdgpu_cs { >> - struct amdgpu_ib main; /* must be first because this is inherited */ >> - struct amdgpu_ib const_ib; /* optional constant engine IB */ >> - struct amdgpu_ib const_preamble_ib; >> - struct 
amdgpu_ctx *ctx; >> - >> - /* Flush CS. */ >> - void (*flush_cs)(void *ctx, unsigned flags, struct pipe_fence_handle >> **fence); >> - void *flush_data; >> - >> - /* amdgpu_cs_submit parameters */ >> - enum ring_type ring_type; >> +struct amdgpu_cs_context { >> struct amdgpu_cs_request request; >> struct amdgpu_cs_ib_info ib[IB_NUM]; >> >> @@ -94,6 +83,32 @@ struct amdgpu_cs { >> uint64_t used_gart; >> >> unsigned max_dependencies; >> + >> + struct pipe_fence_handle *fence; >> +}; >> + >> +struct amdgpu_cs { >> + struct amdgpu_ib main; /* must be first because this is inherited */ >> + struct amdgpu_ib const_ib; /* optional constant engine IB */ >> + struct amdgpu_ib const_preamble_ib; >> + struct amdgpu_ctx *ctx; >> + enum ring_type ring_type; >> + >> + /* We flip between these two CS. While one is being consumed >> + * by the kernel in another thread, the other one is being filled >> + * by the pipe driver. */ >> + struct amdgpu_cs_context csc1; >> + struct amdgpu_cs_context csc2; >> + /* The currently-used CS. */ >> + struct amdgpu_cs_context *csc; >> + /* The CS being currently-owned by the other thread. */ >> + struct amdgpu_cs_context *cst; >> + >> + /* Flush CS. */ >> + void (*flush_cs)(void *ctx, unsigned flags, struct pipe_fence_handle >> **fence); >> + void *flush_data; >> + >> + pipe_semaphore flush_completed; >> }; >> >> struct amdgpu_fence { >> @@ -103,6 +118,9 @@ struct amdgpu_fence { >> struct amdgpu_cs_fence fence; >> uint64_t *user_fence_cpu_address; >> >> + /* If the fence is unknown due to an IB still being submitted >> + * in the other thread. */ >> + volatile int submission_in_progress; /* bool (int for atomicity) */ >> volatile int signalled; /* bool (int for atomicity) */ >> }; >> >> @@ -128,7 +146,7 @@ static inline void amdgpu_fence_reference(struct >> pipe_fence_handle **dst, >> *rdst = rsrc; >> } >> >> -int amdgpu_lookup_buffer(struct amdgpu_cs *csc, struct amdgpu_winsys_bo >> *bo); >> +int amdgpu_lookup_buffer(struct amdgpu_cs_context *cs, struct >> amdgpu_winsys_bo *bo); >> >> static inline struct amdgpu_cs * >> amdgpu_cs(struct radeon_winsys_cs *base) >> @@ -142,7 +160,7 @@ amdgpu_bo_is_referenced_by_cs(struct amdgpu_cs *cs, >> { >> int num_refs = bo->num_cs_references; >> return num_refs == bo->ws->num_cs || >> - (num_refs && amdgpu_lookup_buffer(cs, bo) != -1); >> + (num_refs && amdgpu_lookup_buffer(cs->csc, bo) != -1); >> } >> >> static inline boolean >> @@ -155,11 +173,11 @@ amdgpu_bo_is_referenced_by_cs_with_usage(struct >> amdgpu_cs *cs, >> if (!bo->num_cs_references) >> return FALSE; >> >> - index = amdgpu_lookup_buffer(cs, bo); >> + index = amdgpu_lookup_buffer(cs->csc, bo); >> if (index == -1) >> return FALSE; >> >> - return (cs->buffers[index].usage & usage) != 0; >> + return (cs->csc->buffers[index].usage & usage) != 0; >> } >> >> static inline boolean >> @@ -170,6 +188,8 @@ amdgpu_bo_is_referenced_by_any_cs(struct >> amdgpu_winsys_bo *bo) >> >> bool amdgpu_fence_wait(struct pipe_fence_handle *fence, uint64_t >> timeout, >> bool absolute); >> +void amdgpu_cs_sync_flush(struct radeon_winsys_cs *rcs); >> void amdgpu_cs_init_functions(struct amdgpu_winsys *ws); >> +void amdgpu_cs_submit_ib(struct amdgpu_cs *cs); >> >> #endif >> diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c >> b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c >> index e6db145..4bd3165 100644 >> --- a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c >> +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c >> @@ -308,6 +308,13 @@ static void amdgpu_winsys_destroy(struct >> 
radeon_winsys *rws) >> { >> struct amdgpu_winsys *ws = (struct amdgpu_winsys*)rws; >> >> + if (ws->thread) { >> + ws->kill_thread = 1; >> + pipe_semaphore_signal(&ws->cs_queued); >> + pipe_thread_wait(ws->thread); >> + } >> + pipe_semaphore_destroy(&ws->cs_queued); >> + pipe_mutex_destroy(ws->cs_queue_lock); >> pipe_mutex_destroy(ws->bo_fence_lock); >> pb_cache_deinit(&ws->bo_cache); >> pipe_mutex_destroy(ws->global_bo_list_lock); >> @@ -392,6 +399,55 @@ static int compare_dev(void *key1, void *key2) >> return key1 != key2; >> } >> >> +void amdgpu_ws_queue_cs(struct amdgpu_winsys *ws, struct amdgpu_cs *cs) >> +{ >> +retry: >> + pipe_mutex_lock(ws->cs_queue_lock); >> + if (ws->num_enqueued_cs >= ARRAY_SIZE(ws->cs_queue)) { >> + /* no room left for a flush */ >> + pipe_mutex_unlock(ws->cs_queue_lock); >> + goto retry; >> + } > > > This is a busy loop - shouldn't happen in normal programs, but it's still > not very nice. How about adding a cs_queue_space semaphore that is > initialized with the array size? Alternatively, by open-coding that > semaphore (mutex + condvar), one additional mutex could be avoided, but I > don't feel strongly either way. OK, I'll add the semaphore. Roughly like this (untested sketch; assuming the enqueue is a plain array append):
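   /* At winsys init, next to the existing cs_queued semaphore: */
   pipe_semaphore_init(&ws->cs_queue_space, ARRAY_SIZE(ws->cs_queue));

   void amdgpu_ws_queue_cs(struct amdgpu_winsys *ws, struct amdgpu_cs *cs)
   {
      /* Sleep while the queue is full instead of spinning; the
       * submission thread signals cs_queue_space each time it frees
       * a slot. */
      pipe_semaphore_wait(&ws->cs_queue_space);

      pipe_mutex_lock(ws->cs_queue_lock);
      ws->cs_queue[ws->num_enqueued_cs++] = cs;
      pipe_mutex_unlock(ws->cs_queue_lock);

      pipe_semaphore_signal(&ws->cs_queued);
   }

   /* ...and in the submission thread, right after a CS is taken off
    * the queue: */
   pipe_semaphore_signal(&ws->cs_queue_space);

plus a matching pipe_semaphore_destroy(&ws->cs_queue_space) next to the other destroys in amdgpu_winsys_destroy.

Marek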