[AMD Official Use Only - General]
-----Original Message----- From: amd-gfx <amd-gfx-boun...@lists.freedesktop.org> On Behalf Of Christian König Sent: Thursday, March 3, 2022 1:53 PM To: amd-gfx@lists.freedesktop.org; Olsak, Marek <marek.ol...@amd.com> Cc: Koenig, Christian <christian.koe...@amd.com> Subject: [PATCH 10/10] drm/amdgpu: add gang submit frontend Allows submitting jobs as gang which needs to run on multiple engines at the same time. All members of the gang get the same implicit, explicit and VM dependencies. So no gang member will start running until everything else is ready. The last job is considered the gang leader (usually a submission to the GFX ring) and used for signaling output dependencies. Each job is remembered individually as user of a buffer object, so there is no joining of work at the end. Signed-off-by: Christian König <christian.koe...@amd.com> --- drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 244 ++++++++++++++-------- drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h | 9 +- drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h | 12 +- 3 files changed, 173 insertions(+), 92 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c index c6541f7b8f54..7429e64919fe 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c @@ -69,6 +69,7 @@ static int amdgpu_cs_p1_ib(struct amdgpu_cs_parser *p, unsigned int *num_ibs) { struct drm_sched_entity *entity; + unsigned int i; int r; r = amdgpu_ctx_get_entity(p->ctx, chunk_ib->ip_type, @@ -83,11 +84,19 @@ static int amdgpu_cs_p1_ib(struct amdgpu_cs_parser *p, return -EINVAL; /* Currently we don't support submitting to multiple entities */ - if (p->entity && p->entity != entity) + for (i = 0; i < p->gang_size; ++i) { + if (p->entities[i] == entity) + goto found; + } + + if (i == AMDGPU_CS_GANG_SIZE) return -EINVAL; - p->entity = entity; - ++(*num_ibs); + p->entities[i] = entity; + p->gang_size = i + 1; + +found: + ++(num_ibs[i]); return 0; } @@ -161,11 +170,12 @@ static int amdgpu_cs_pass1(struct amdgpu_cs_parser *p, union drm_amdgpu_cs *cs) { struct amdgpu_fpriv *fpriv = p->filp->driver_priv; + unsigned int num_ibs[AMDGPU_CS_GANG_SIZE] = { }; struct amdgpu_vm *vm = &fpriv->vm; uint64_t *chunk_array_user; uint64_t *chunk_array; - unsigned size, num_ibs = 0; uint32_t uf_offset = 0; + unsigned int size; int ret; int i; @@ -228,7 +238,7 @@ static int amdgpu_cs_pass1(struct amdgpu_cs_parser *p, if (size < sizeof(struct drm_amdgpu_cs_chunk_ib)) goto free_partial_kdata; - ret = amdgpu_cs_p1_ib(p, p->chunks[i].kdata, &num_ibs); + ret = amdgpu_cs_p1_ib(p, p->chunks[i].kdata, num_ibs); if (ret) goto free_partial_kdata; break; @@ -265,21 +275,27 @@ static int amdgpu_cs_pass1(struct amdgpu_cs_parser *p, } } - ret = amdgpu_job_alloc(p->adev, num_ibs, &p->job, vm); - if (ret) - goto free_all_kdata; + if (!p->gang_size) + return -EINVAL; - ret = drm_sched_job_init(&p->job->base, p->entity, &fpriv->vm); - if (ret) - goto free_all_kdata; + for (i = 0; i < p->gang_size; ++i) { + ret = amdgpu_job_alloc(p->adev, num_ibs[i], &p->jobs[i], vm); + if (ret) + goto free_all_kdata; + + ret = drm_sched_job_init(&p->jobs[i]->base, p->entities[i], + &fpriv->vm); + if (ret) + goto free_all_kdata; + } - if (p->ctx->vram_lost_counter != p->job->vram_lost_counter) { + if (p->ctx->vram_lost_counter != p->jobs[0]->vram_lost_counter) { ret = -ECANCELED; goto free_all_kdata; } if (p->uf_entry.tv.bo) - p->job->uf_addr = uf_offset; + p->jobs[p->gang_size - 1]->uf_addr = uf_offset; kvfree(chunk_array); /* Use this opportunity to fill in task info for the vm */ @@ -301,22 +317,18 @@ static int amdgpu_cs_pass1(struct amdgpu_cs_parser *p, return ret; } -static int amdgpu_cs_p2_ib(struct amdgpu_cs_parser *p, - struct amdgpu_cs_chunk *chunk, - unsigned int *num_ibs, - unsigned int *ce_preempt, - unsigned int *de_preempt) +static int amdgpu_cs_p2_ib(struct amdgpu_cs_parser *p, struct amdgpu_job *job, + struct amdgpu_ib *ib, struct amdgpu_cs_chunk *chunk, + unsigned int *ce_preempt, unsigned int *de_preempt) { - struct amdgpu_ring *ring = to_amdgpu_ring(p->job->base.sched); + struct amdgpu_ring *ring = to_amdgpu_ring(job->base.sched); struct drm_amdgpu_cs_chunk_ib *chunk_ib = chunk->kdata; struct amdgpu_fpriv *fpriv = p->filp->driver_priv; - struct amdgpu_ib *ib = &p->job->ibs[*num_ibs]; struct amdgpu_vm *vm = &fpriv->vm; int r; - /* MM engine doesn't support user fences */ - if (p->job->uf_addr && ring->funcs->no_user_fence) + if (job->uf_addr && ring->funcs->no_user_fence) return -EINVAL; if (chunk_ib->ip_type == AMDGPU_HW_IP_GFX && @@ -333,7 +345,7 @@ static int amdgpu_cs_p2_ib(struct amdgpu_cs_parser *p, } if (chunk_ib->flags & AMDGPU_IB_FLAG_PREAMBLE) - p->job->preamble_status |= AMDGPU_PREAMBLE_IB_PRESENT; + job->preamble_status |= AMDGPU_PREAMBLE_IB_PRESENT; r = amdgpu_ib_get(p->adev, vm, ring->funcs->parse_cs ? chunk_ib->ib_bytes : 0, @@ -346,8 +358,6 @@ static int amdgpu_cs_p2_ib(struct amdgpu_cs_parser *p, ib->gpu_addr = chunk_ib->va_start; ib->length_dw = chunk_ib->ib_bytes / 4; ib->flags = chunk_ib->flags; - - (*num_ibs)++; return 0; } @@ -396,7 +406,7 @@ static int amdgpu_cs_p2_dependencies(struct amdgpu_cs_parser *p, dma_fence_put(old); } - r = amdgpu_sync_fence(&p->job->sync, fence); + r = amdgpu_sync_fence(&p->jobs[0]->sync, fence); dma_fence_put(fence); if (r) return r; @@ -418,7 +428,7 @@ static int amdgpu_syncobj_lookup_and_add(struct amdgpu_cs_parser *p, return r; } - r = amdgpu_sync_fence(&p->job->sync, fence); + r = amdgpu_sync_fence(&p->jobs[0]->sync, fence); dma_fence_put(fence); return r; @@ -541,20 +551,30 @@ static int amdgpu_cs_p2_syncobj_timeline_signal(struct amdgpu_cs_parser *p, static int amdgpu_cs_pass2(struct amdgpu_cs_parser *p) { - unsigned int num_ibs = 0, ce_preempt = 0, de_preempt = 0; + unsigned int ce_preempt = 0, de_preempt = 0; + unsigned int job_idx = 0, ib_idx = 0; int i, r; for (i = 0; i < p->nchunks; ++i) { struct amdgpu_cs_chunk *chunk; + struct amdgpu_job *job; chunk = &p->chunks[i]; switch (chunk->chunk_id) { case AMDGPU_CHUNK_ID_IB: - r = amdgpu_cs_p2_ib(p, chunk, &num_ibs, + job = p->jobs[job_idx]; + r = amdgpu_cs_p2_ib(p, job, &job->ibs[ib_idx], chunk, &ce_preempt, &de_preempt); if (r) return r; + + if (++ib_idx == job->num_ibs) { + ++job_idx; + ib_idx = 0; + ce_preempt = 0; + de_preempt = 0; + } break; case AMDGPU_CHUNK_ID_DEPENDENCIES: case AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES: @@ -825,6 +845,7 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p, struct amdgpu_vm *vm = &fpriv->vm; struct amdgpu_bo_list_entry *e; struct list_head duplicates; + unsigned int i; int r; INIT_LIST_HEAD(&p->validated); @@ -905,16 +926,6 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p, e->bo_va = amdgpu_vm_bo_find(vm, bo); } - /* Move fence waiting after getting reservation lock of - * PD root. Then there is no need on a ctx mutex lock. - */ - r = amdgpu_ctx_wait_prev_fence(p->ctx, p->entity); - if (unlikely(r != 0)) { - if (r != -ERESTARTSYS) - DRM_ERROR("amdgpu_ctx_wait_prev_fence failed.\n"); - goto error_validate; - } - amdgpu_cs_get_threshold_for_moves(p->adev, &p->bytes_moved_threshold, &p->bytes_moved_vis_threshold); p->bytes_moved = 0; @@ -938,14 +949,16 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p, amdgpu_cs_report_moved_bytes(p->adev, p->bytes_moved, p->bytes_moved_vis); - amdgpu_job_set_resources(p->job, p->bo_list->gds_obj, - p->bo_list->gws_obj, p->bo_list->oa_obj); + for (i = 0; i < p->gang_size; ++i) + amdgpu_job_set_resources(p->jobs[i], p->bo_list->gds_obj, + p->bo_list->gws_obj, + p->bo_list->oa_obj); if (!r && p->uf_entry.tv.bo) { struct amdgpu_bo *uf = ttm_to_amdgpu_bo(p->uf_entry.tv.bo); r = amdgpu_ttm_alloc_gart(&uf->tbo); - p->job->uf_addr += amdgpu_bo_gpu_offset(uf); + p->jobs[p->gang_size - 1]->uf_addr += amdgpu_bo_gpu_offset(uf); } error_validate: @@ -955,20 +968,24 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p, return r; } -static void trace_amdgpu_cs_ibs(struct amdgpu_cs_parser *parser) +static void trace_amdgpu_cs_ibs(struct amdgpu_cs_parser *p) { - int i; + int i, j; if (!trace_amdgpu_cs_enabled()) return; - for (i = 0; i < parser->job->num_ibs; i++) - trace_amdgpu_cs(parser, i); + for (i = 0; i < p->gang_size; ++i) { + struct amdgpu_job *job = p->jobs[i]; + + for (j = 0; j < job->num_ibs; ++j) + trace_amdgpu_cs(p, job, &job->ibs[j]); + } } -static int amdgpu_cs_patch_ibs(struct amdgpu_cs_parser *p) +static int amdgpu_cs_patch_ibs(struct amdgpu_cs_parser *p, + struct amdgpu_job *job) { - struct amdgpu_job *job = p->job; struct amdgpu_ring *ring = to_amdgpu_ring(job->base.sched); unsigned int i; int r; @@ -1007,14 +1024,13 @@ static int amdgpu_cs_patch_ibs(struct amdgpu_cs_parser *p) memcpy(ib->ptr, kptr, job->ibs[i].length_dw * 4); amdgpu_bo_kunmap(aobj); - r = amdgpu_ring_parse_cs(ring, p, p->job, - &p->job->ibs[i]); + r = amdgpu_ring_parse_cs(ring, p, job, &job->ibs[i]); if (r) return r; } else { ib->ptr = (uint32_t *)kptr; - r = amdgpu_ring_patch_cs_in_place(ring, p, p->job, - &p->job->ibs[i]); + r = amdgpu_ring_patch_cs_in_place(ring, p, job, + &job->ibs[i]); amdgpu_bo_kunmap(aobj); if (r) return r; @@ -1024,14 +1040,29 @@ static int amdgpu_cs_patch_ibs(struct amdgpu_cs_parser *p) return 0; } +static int amdgpu_cs_patch_jobs(struct amdgpu_cs_parser *p) { + unsigned int i; + int r; + + for (i = 0; i < p->gang_size; ++i) { + r = amdgpu_cs_patch_ibs(p, p->jobs[i]); + if (r) + return r; + } + return 0; +} + static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p) { struct amdgpu_fpriv *fpriv = p->filp->driver_priv; struct amdgpu_device *adev = p->adev; + struct amdgpu_job *job = p->jobs[0]; struct amdgpu_vm *vm = &fpriv->vm; struct amdgpu_bo_list_entry *e; struct amdgpu_bo_va *bo_va; struct amdgpu_bo *bo; + unsigned int i; int r; r = amdgpu_vm_clear_freed(adev, vm, NULL); @@ -1042,7 +1073,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p) if (r) return r; - r = amdgpu_sync_vm_fence(&p->job->sync, fpriv->prt_va->last_pt_update); + r = amdgpu_sync_vm_fence(&job->sync, fpriv->prt_va->last_pt_update); if (r) return r; @@ -1052,7 +1083,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p) if (r) return r; - r = amdgpu_sync_vm_fence(&p->job->sync, bo_va->last_pt_update); + r = amdgpu_sync_vm_fence(&job->sync, bo_va->last_pt_update); if (r) return r; } @@ -1071,7 +1102,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p) if (r) return r; - r = amdgpu_sync_vm_fence(&p->job->sync, bo_va->last_pt_update); + r = amdgpu_sync_vm_fence(&job->sync, bo_va->last_pt_update); if (r) return r; } @@ -1084,11 +1115,18 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p) if (r) return r; - r = amdgpu_sync_vm_fence(&p->job->sync, vm->last_update); + r = amdgpu_sync_vm_fence(&job->sync, vm->last_update); if (r) return r; - p->job->vm_pd_addr = amdgpu_gmc_pd_addr(vm->root.bo); + for (i = 0; i < p->gang_size; ++i) { + job = p->jobs[i]; + + if (!job->vm) + continue; + + job->vm_pd_addr = amdgpu_gmc_pd_addr(vm->root.bo); + } if (amdgpu_vm_debug) { /* Invalidate all BOs to test for userspace bugs */ @@ -1109,7 +1147,9 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p) static int amdgpu_cs_sync_rings(struct amdgpu_cs_parser *p) { struct amdgpu_fpriv *fpriv = p->filp->driver_priv; + struct amdgpu_job *job = p->jobs[0]; struct amdgpu_bo_list_entry *e; + unsigned int i; int r; list_for_each_entry(e, &p->validated, tv.head) { @@ -1119,12 +1159,23 @@ static int amdgpu_cs_sync_rings(struct amdgpu_cs_parser *p) sync_mode = amdgpu_bo_explicit_sync(bo) ? AMDGPU_SYNC_EXPLICIT : AMDGPU_SYNC_NE_OWNER; - r = amdgpu_sync_resv(p->adev, &p->job->sync, resv, sync_mode, + r = amdgpu_sync_resv(p->adev, &job->sync, resv, sync_mode, &fpriv->vm); if (r) return r; } - return 0; + + for (i = 1; i < p->gang_size; ++i) { + r = amdgpu_sync_clone(&job->sync, &p->jobs[i]->sync); + if (r) + return r; + } + + r = amdgpu_ctx_wait_prev_fence(p->ctx, p->entities[p->gang_size - 1]); + if (r && r != -ERESTARTSYS) + DRM_ERROR("amdgpu_ctx_wait_prev_fence failed.\n"); + + return r; } static void amdgpu_cs_post_dependencies(struct amdgpu_cs_parser *p) @@ -1147,17 +1198,27 @@ static void amdgpu_cs_post_dependencies(struct amdgpu_cs_parser *p) static int amdgpu_cs_submit(struct amdgpu_cs_parser *p, union drm_amdgpu_cs *cs) { + struct amdgpu_job *last = p->jobs[p->gang_size - 1]; struct amdgpu_fpriv *fpriv = p->filp->driver_priv; - struct drm_sched_entity *entity = p->entity; struct amdgpu_bo_list_entry *e; - struct amdgpu_job *job; + unsigned int i; uint64_t seq; int r; - job = p->job; - p->job = NULL; + for (i = 0; i < p->gang_size; ++i) + drm_sched_job_arm(&p->jobs[i]->base); - drm_sched_job_arm(&job->base); + for (i = 0; i < (p->gang_size - 1); ++i) { + struct dma_fence *fence; + + fence = &p->jobs[i]->base.s_fence->scheduled; + r = amdgpu_sync_fence(&last->sync, fence); + if (r) + goto error_cleanup; + } + + for (i = 0; i < p->gang_size; ++i) + amdgpu_job_set_gang_leader(p->jobs[i], last); /* No memory allocation is allowed while holding the notifier lock. * The lock is held until amdgpu_cs_submit is finished and fence is @@ -1175,44 +1236,58 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p, } if (r) { r = -EAGAIN; - goto error_abort; + goto error_unlock; } - p->fence = dma_fence_get(&job->base.s_fence->finished); + p->fence = dma_fence_get(&last->base.s_fence->finished); - amdgpu_ctx_add_fence(p->ctx, entity, p->fence, &seq); + amdgpu_ctx_add_fence(p->ctx, p->entities[p->gang_size - 1], p->fence, + &seq); amdgpu_cs_post_dependencies(p); - if ((job->preamble_status & AMDGPU_PREAMBLE_IB_PRESENT) && + if ((last->preamble_status & AMDGPU_PREAMBLE_IB_PRESENT) && !p->ctx->preamble_presented) { - job->preamble_status |= AMDGPU_PREAMBLE_IB_PRESENT_FIRST; + last->preamble_status |= AMDGPU_PREAMBLE_IB_PRESENT_FIRST; p->ctx->preamble_presented = true; } cs->out.handle = seq; - job->uf_sequence = seq; - - amdgpu_job_free_resources(job); + last->uf_sequence = seq; - trace_amdgpu_cs_ioctl(job); amdgpu_vm_bo_trace_cs(&fpriv->vm, &p->ticket); - drm_sched_entity_push_job(&job->base); + for (i = 0; i < p->gang_size; ++i) { + amdgpu_job_free_resources(p->jobs[i]); + trace_amdgpu_cs_ioctl(p->jobs[i]); + drm_sched_entity_push_job(&p->jobs[i]->base); + p->jobs[i] = NULL; + } amdgpu_vm_move_to_lru_tail(p->adev, &fpriv->vm); - /* Make sure all BOs are remembered as writers */ - amdgpu_bo_list_for_each_entry(e, p->bo_list) + list_for_each_entry(e, &p->validated, tv.head) { + + /* Everybody except for the gang leader uses BOOKKEEP */ + for (i = 0; i < (p->gang_size - 1); ++i) { + dma_resv_add_fence(e->tv.bo->base.resv, + &p->jobs[i]->base.s_fence->finished, + DMA_RESV_USAGE_BOOKKEEP); + } + + /* The gang leader as remembered as writer */ e->tv.num_shared = 0; + } p->jobs[i] = NULL is done in previous loop. Now when running &p->jobs[i]->base.s_fence->finished there is NULL pointer crash. Thank you, Yogesh ttm_eu_fence_buffer_objects(&p->ticket, &p->validated, p->fence); mutex_unlock(&p->adev->notifier_lock); return 0; -error_abort: - drm_sched_job_cleanup(&job->base); +error_unlock: mutex_unlock(&p->adev->notifier_lock); - amdgpu_job_free(job); + +error_cleanup: + for (i = 0; i < p->gang_size; ++i) + drm_sched_job_cleanup(&p->jobs[i]->base); return r; } @@ -1229,17 +1304,18 @@ static void amdgpu_cs_parser_fini(struct amdgpu_cs_parser *parser) dma_fence_put(parser->fence); - if (parser->ctx) { + if (parser->ctx) amdgpu_ctx_put(parser->ctx); - } if (parser->bo_list) amdgpu_bo_list_put(parser->bo_list); for (i = 0; i < parser->nchunks; i++) kvfree(parser->chunks[i].kdata); kvfree(parser->chunks); - if (parser->job) - amdgpu_job_free(parser->job); + for (i = 0; i < parser->gang_size; ++i) { + if (parser->jobs[i]) + amdgpu_job_free(parser->jobs[i]); + } if (parser->uf_entry.tv.bo) { struct amdgpu_bo *uf = ttm_to_amdgpu_bo(parser->uf_entry.tv.bo); @@ -1283,7 +1359,7 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) goto error_fini; } - r = amdgpu_cs_patch_ibs(&parser); + r = amdgpu_cs_patch_jobs(&parser); if (r) goto error_backoff; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h index 652b5593499f..ba5860c08270 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h @@ -27,6 +27,8 @@ #include "amdgpu_bo_list.h" #include "amdgpu_ring.h" +#define AMDGPU_CS_GANG_SIZE 4 + struct amdgpu_bo_va_mapping; struct amdgpu_cs_chunk { @@ -50,9 +52,10 @@ struct amdgpu_cs_parser { unsigned nchunks; struct amdgpu_cs_chunk *chunks; - /* scheduler job object */ - struct drm_sched_entity *entity; - struct amdgpu_job *job; + /* scheduler job objects */ + unsigned int gang_size; + struct drm_sched_entity *entities[AMDGPU_CS_GANG_SIZE]; + struct amdgpu_job *jobs[AMDGPU_CS_GANG_SIZE]; /* buffer objects */ struct ww_acquire_ctx ticket; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h index d855cb53c7e0..a5167cb91ba5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h @@ -140,8 +140,10 @@ TRACE_EVENT(amdgpu_bo_create, ); TRACE_EVENT(amdgpu_cs, - TP_PROTO(struct amdgpu_cs_parser *p, int i), - TP_ARGS(p, i), + TP_PROTO(struct amdgpu_cs_parser *p, + struct amdgpu_job *job, + struct amdgpu_ib *ib), + TP_ARGS(p, job, ib), TP_STRUCT__entry( __field(struct amdgpu_bo_list *, bo_list) __field(u32, ring) @@ -151,10 +153,10 @@ TRACE_EVENT(amdgpu_cs, TP_fast_assign( __entry->bo_list = p->bo_list; - __entry->ring = to_amdgpu_ring(p->entity->rq->sched)->idx; - __entry->dw = p->job->ibs[i].length_dw; + __entry->ring = to_amdgpu_ring(job->base.sched)->idx; + __entry->dw = ib->length_dw; __entry->fences = amdgpu_fence_count_emitted( - to_amdgpu_ring(p->entity->rq->sched)); + to_amdgpu_ring(job->base.sched)); ), TP_printk("bo_list=%p, ring=%u, dw=%u, fences=%u", __entry->bo_list, __entry->ring, __entry->dw, -- 2.25.1