On 22.08.25 15:43, Pierre-Eric Pelloux-Prayer wrote:
> For hw engines that can't load balance jobs, entities are
> "statically" load balanced: on their first submit, they select
> the best scheduler based on its score.
> The score is made up of 2 parts:
> * the job queue depth (how much jobs are executing/waiting)
> * the number of entities assigned
> 
> The second part is only relevant for the static load balance:
> it's a way to consider how many entities are attached to this
> scheduler, knowing that if they ever submit jobs they will go
> to this one.
> 
> For rings that can load balance jobs freely, idle entities
> aren't a concern and shouldn't impact the scheduler's decisions.
> 
> Signed-off-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-pra...@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 23 ++++++++++++++++++-----
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h |  1 +
>  2 files changed, 19 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
> index f5d5c45ddc0d..4a078d2d98c5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
> @@ -206,9 +206,11 @@ static int amdgpu_ctx_init_entity(struct amdgpu_ctx 
> *ctx, u32 hw_ip,
>  {
>       struct drm_gpu_scheduler **scheds = NULL, *sched = NULL;
>       struct amdgpu_device *adev = ctx->mgr->adev;
> +     bool static_load_balancing = false;
>       struct amdgpu_ctx_entity *entity;
>       enum drm_sched_priority drm_prio;
>       unsigned int hw_prio, num_scheds;
> +     struct amdgpu_ring *aring;
>       int32_t ctx_prio;
>       int r;
>  
> @@ -236,17 +238,22 @@ static int amdgpu_ctx_init_entity(struct amdgpu_ctx 
> *ctx, u32 hw_ip,
>               r = amdgpu_xcp_select_scheds(adev, hw_ip, hw_prio, fpriv,
>                                               &num_scheds, &scheds);
>               if (r)
> -                     goto cleanup_entity;
> +                     goto error_free_entity;
>       }
>  
>       /* disable load balance if the hw engine retains context among 
> dependent jobs */
> -     if (hw_ip == AMDGPU_HW_IP_VCN_ENC ||
> -         hw_ip == AMDGPU_HW_IP_VCN_DEC ||
> -         hw_ip == AMDGPU_HW_IP_UVD_ENC ||
> -         hw_ip == AMDGPU_HW_IP_UVD) {
> +     static_load_balancing = hw_ip == AMDGPU_HW_IP_VCN_ENC ||
> +                             hw_ip == AMDGPU_HW_IP_VCN_DEC ||
> +                             hw_ip == AMDGPU_HW_IP_UVD_ENC ||
> +                             hw_ip == AMDGPU_HW_IP_UVD;

Please make that a property of the, e.g. a bool in struct amdgpu_ring_funcs.

Making it depend on the HW IP type was a bad idea in the first place since this 
all depends on the HW/FW version and not the general IP type.

Apart from that looks good to me.

Regards,
Christian.

> +
> +     if (static_load_balancing) {
>               sched = drm_sched_pick_best(scheds, num_scheds);
>               scheds = &sched;
>               num_scheds = 1;
> +             aring = container_of(sched, struct amdgpu_ring, sched);
> +             entity->sched_ring_score = aring->sched_score;
> +             atomic_inc(entity->sched_ring_score);
>       }
>  
>       r = drm_sched_entity_init(&entity->entity, drm_prio, scheds, num_scheds,
> @@ -264,6 +271,9 @@ static int amdgpu_ctx_init_entity(struct amdgpu_ctx *ctx, 
> u32 hw_ip,
>       drm_sched_entity_fini(&entity->entity);
>  
>  error_free_entity:
> +     if (static_load_balancing)
> +             atomic_dec(entity->sched_ring_score);
> +
>       kfree(entity);
>  
>       return r;
> @@ -514,6 +524,9 @@ static void amdgpu_ctx_do_release(struct kref *ref)
>                       if (!ctx->entities[i][j])
>                               continue;
>  
> +                     if (ctx->entities[i][j]->sched_ring_score)
> +                             
> atomic_dec(ctx->entities[i][j]->sched_ring_score);
> +
>                       drm_sched_entity_destroy(&ctx->entities[i][j]->entity);
>               }
>       }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
> index 090dfe86f75b..076a0e165ce0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
> @@ -39,6 +39,7 @@ struct amdgpu_ctx_entity {
>       uint32_t                hw_ip;
>       uint64_t                sequence;
>       struct drm_sched_entity entity;
> +     atomic_t                *sched_ring_score;
>       struct dma_fence        *fences[];
>  };
>  

Reply via email to