Re: [PATCH v2] drm/scheduler: remove timeout work_struct from drm_sched_job

Lucas Stach Wed, 26 Sep 2018 00:40:18 -0700

Hi Nayan,

Am Mittwoch, den 26.09.2018, 02:09 +0900 schrieb Nayan Deshmukh:
> Having a delayed work item per job is redundant, as we only need one
> per scheduler to track the timeout of the currently executing job.
> 
> v2: the first element of the ring mirror list is the currently
> executing job, so we don't need an additional variable for it
> 
> Signed-off-by: Nayan Deshmukh <nayan26deshm...@gmail.com>
> Suggested-by: Christian König <christian.koe...@amd.com>
> ---
>  drivers/gpu/drm/scheduler/sched_main.c | 31 ++++++++++++++++---------------
>  include/drm/gpu_scheduler.h            |  6 +++---
>  2 files changed, 19 insertions(+), 18 deletions(-)
> 
> diff --git a/drivers/gpu/drm/scheduler/sched_main.c 
> b/drivers/gpu/drm/scheduler/sched_main.c
> index 9ca741f3a0bc..4e8505d51795 100644
> --- a/drivers/gpu/drm/scheduler/sched_main.c
> +++ b/drivers/gpu/drm/scheduler/sched_main.c
> @@ -197,19 +197,15 @@ static void drm_sched_job_finish(struct work_struct 
> *work)
>        * manages to find this job as the next job in the list, the fence
>        * signaled check below will prevent the timeout to be restarted.
>        */
> -     cancel_delayed_work_sync(&s_job->work_tdr);
> +     cancel_delayed_work_sync(&sched->work_tdr);
>  
>       spin_lock(&sched->job_list_lock);
> -     /* queue TDR for next job */
> -     if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
> -         !list_is_last(&s_job->node, &sched->ring_mirror_list)) {
> -             struct drm_sched_job *next = list_next_entry(s_job, node);
> -
> -             if (!dma_fence_is_signaled(&next->s_fence->finished))
> -                     schedule_delayed_work(&next->work_tdr, sched->timeout);
> -     }
>       /* remove job from ring_mirror_list */
>       list_del(&s_job->node);
> +     /* queue TDR for next job */
> +     if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
> +         !list_empty(&sched->ring_mirror_list))
> +             schedule_delayed_work(&sched->work_tdr, sched->timeout);
>       spin_unlock(&sched->job_list_lock);
>  
>       dma_fence_put(&s_job->s_fence->finished);
> @@ -236,16 +232,21 @@ static void drm_sched_job_begin(struct drm_sched_job 
> *s_job)
>       if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>           list_first_entry_or_null(&sched->ring_mirror_list,
>                                    struct drm_sched_job, node) == s_job)
> -             schedule_delayed_work(&s_job->work_tdr, sched->timeout);
> +             schedule_delayed_work(&sched->work_tdr, sched->timeout);
>       spin_unlock(&sched->job_list_lock);
>  }
>  
>  static void drm_sched_job_timedout(struct work_struct *work)
>  {
> -     struct drm_sched_job *job = container_of(work, struct drm_sched_job,
> -                                              work_tdr.work);
> +     struct drm_gpu_scheduler *sched;
> +     struct drm_sched_job *job;
> +
> +     sched = container_of(work, struct drm_gpu_scheduler, work_tdr.work);
> +     job = list_first_entry_or_null(&sched->ring_mirror_list,
> +                                    struct drm_sched_job, node);
>  
> -     job->sched->ops->timedout_job(job);
> +     if (job)
> +             job->sched->ops->timedout_job(job);


I don't think this is fully robust. Jobs are only removed from the
ring_mirror_list once the job_finish worker has run. If execution of
this worker is delayed for any reason (although a delay as long as the
job timeout is really unlikely), the first entry in the list may be a
job that has already completed, and you would be blaming the wrong job
here.

So I think what you need to do is find the first job in the ring mirror
list with an unsignaled finish fence to robustly find the stuck job.

Regards,
Lucas

>  }
>  
>  /**
> @@ -315,7 +316,7 @@ void drm_sched_job_recovery(struct drm_gpu_scheduler 
> *sched)
>       s_job = list_first_entry_or_null(&sched->ring_mirror_list,
>                                        struct drm_sched_job, node);
>       if (s_job && sched->timeout != MAX_SCHEDULE_TIMEOUT)
> -             schedule_delayed_work(&s_job->work_tdr, sched->timeout);
> +             schedule_delayed_work(&sched->work_tdr, sched->timeout);
>  
>       list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) {
>               struct drm_sched_fence *s_fence = s_job->s_fence;
> @@ -384,7 +385,6 @@ int drm_sched_job_init(struct drm_sched_job *job,
>  
>       INIT_WORK(&job->finish_work, drm_sched_job_finish);
>       INIT_LIST_HEAD(&job->node);
> -     INIT_DELAYED_WORK(&job->work_tdr, drm_sched_job_timedout);
>  
>       return 0;
>  }
> @@ -575,6 +575,7 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
>       INIT_LIST_HEAD(&sched->ring_mirror_list);
>       spin_lock_init(&sched->job_list_lock);
>       atomic_set(&sched->hw_rq_count, 0);
> +     INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
>       atomic_set(&sched->num_jobs, 0);
>       atomic64_set(&sched->job_id_count, 0);
>  
> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
> index daec50f887b3..d87b268f1781 100644
> --- a/include/drm/gpu_scheduler.h
> +++ b/include/drm/gpu_scheduler.h
> @@ -175,8 +175,6 @@ struct drm_sched_fence *to_drm_sched_fence(struct 
> dma_fence *f);
>   *               finished to remove the job from the
>   *               @drm_gpu_scheduler.ring_mirror_list.
>   * @node: used to append this struct to the 
> @drm_gpu_scheduler.ring_mirror_list.
> - * @work_tdr: schedules a delayed call to @drm_sched_job_timedout after the 
> timeout
> - *            interval is over.
>   * @id: a unique id assigned to each job scheduled on the scheduler.
>   * @karma: increment on every hang caused by this job. If this exceeds the 
> hang
>   *         limit of the scheduler then the job is marked guilty and will not
> @@ -195,7 +193,6 @@ struct drm_sched_job {
>       struct dma_fence_cb             finish_cb;
>       struct work_struct              finish_work;
>       struct list_head                node;
> -     struct delayed_work             work_tdr;
>       uint64_t                        id;
>       atomic_t                        karma;
>       enum drm_sched_priority         s_priority;
> @@ -259,6 +256,8 @@ struct drm_sched_backend_ops {
>   *                 finished.
>   * @hw_rq_count: the number of jobs currently in the hardware queue.
>   * @job_id_count: used to assign unique id to the each job.
> + * @work_tdr: schedules a delayed call to @drm_sched_job_timedout after the
> + *            timeout interval is over.
>   * @thread: the kthread on which the scheduler which run.
>   * @ring_mirror_list: the list of jobs which are currently in the job queue.
>   * @job_list_lock: lock to protect the ring_mirror_list.
> @@ -278,6 +277,7 @@ struct drm_gpu_scheduler {
>       wait_queue_head_t               job_scheduled;
>       atomic_t                        hw_rq_count;
>       atomic64_t                      job_id_count;
> +     struct delayed_work             work_tdr;
>       struct task_struct              *thread;
>       struct list_head                ring_mirror_list;
>       spinlock_t                      job_list_lock;

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

Re: [PATCH v2] drm/scheduler: remove timeout work_struct from drm_sched_job

Reply via email to