During process kill, drm_sched_entity_flush() will kill the vm entities. The following job submissions of this process will fail, and the resources of these jobs have not been released, nor have the fences been signalled, causing tasks to hang.
Fix by not doing job init when the entity is stopped. And when the job is already submitted, free the job resource if the entity is stopped. Signed-off-by: Liu01 Tong <tong.li...@amd.com> Signed-off-by: Lin.Cao <linca...@amd.com> --- drivers/gpu/drm/scheduler/sched_entity.c | 13 +++++++------ drivers/gpu/drm/scheduler/sched_main.c | 5 +++++ 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c index ac678de7fe5e..1e744b2eb2db 100644 --- a/drivers/gpu/drm/scheduler/sched_entity.c +++ b/drivers/gpu/drm/scheduler/sched_entity.c @@ -570,6 +570,13 @@ void drm_sched_entity_push_job(struct drm_sched_job *sched_job) bool first; ktime_t submit_ts; + if (entity->stopped) { + DRM_ERROR("Trying to push job to a killed entity\n"); + INIT_WORK(&sched_job->work, drm_sched_entity_kill_jobs_work); + schedule_work(&sched_job->work); + return; + } + trace_drm_sched_job(sched_job, entity); atomic_inc(entity->rq->sched->score); WRITE_ONCE(entity->last_user, current->group_leader); @@ -589,12 +596,6 @@ void drm_sched_entity_push_job(struct drm_sched_job *sched_job) /* Add the entity to the run queue */ spin_lock(&entity->lock); - if (entity->stopped) { - spin_unlock(&entity->lock); - - DRM_ERROR("Trying to push to a killed entity\n"); - return; - } rq = entity->rq; sched = rq->sched; diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c index bfea608a7106..c15b17d9ffe3 100644 --- a/drivers/gpu/drm/scheduler/sched_main.c +++ b/drivers/gpu/drm/scheduler/sched_main.c @@ -795,6 +795,11 @@ int drm_sched_job_init(struct drm_sched_job *job, return -ENOENT; } + if (unlikely(entity->stopped)) { + pr_err("*ERROR* %s: entity is stopped!\n", __func__); + return -EINVAL; + } + if (unlikely(!credits)) { pr_err("*ERROR* %s: credits cannot be 0!\n", __func__); return -EINVAL; -- 2.34.1