On 2020-12-04 3:13 a.m., Christian König wrote:
> Thinking more about that I came to the conclusion that the whole 
> approach here isn't correct.
> 
> See even when the job has been completed or canceled we still want to 
> restart the timer.
> 
> The reason for this is that the timer is then not restarted for the 
> current job, but for the next job in the queue.

Got it. I'll make that change in patch 5/5 as this patch, 4/5,
only changes the timer timeout function from void to enum, and
doesn't affect behaviour.

> The only valid reason to not restart the timer is that the whole device 
> was hot plugged and we return -ENODEV here. E.g. what Andrey has been 
> working on.

Yes, perhaps something like DRM_TASK_STATUS_ENODEV.
We can add it now or later when Andrey adds his
hotplug/unplug patches.

Regards,
Luben

> 
> Regards,
> Christian.
> 
> Am 04.12.20 um 04:17 schrieb Luben Tuikov:
>> The driver's job timeout handler now returns
>> status indicating back to the DRM layer whether
>> the task (job) was successfully aborted or whether
>> more time should be given to the task to complete.
>>
>> Default behaviour as of this patch, is preserved,
>> except in obvious-by-comment case in the Panfrost
>> driver, as documented below.
>>
>> All drivers which make use of the
>> drm_sched_backend_ops' .timedout_job() callback
>> have been accordingly renamed and return the
>> would've-been default value of
>> DRM_TASK_STATUS_ALIVE to restart the task's
>> timeout timer--this is the old behaviour, and
>> is preserved by this patch.
>>
>> In the case of the Panfrost driver, its timedout
>> callback correctly first checks if the job had
>> completed in due time and if so, it now returns
>> DRM_TASK_STATUS_COMPLETE to notify the DRM layer
>> that the task can be moved to the done list, to be
>> freed later. In the other two subsequent checks,
>> the value of DRM_TASK_STATUS_ALIVE is returned, as
>> per the default behaviour.
>>
>> A more involved driver's solutions can be had
>> in subequent patches.
>>
>> Signed-off-by: Luben Tuikov <luben.tui...@amd.com>
>> Reported-by: kernel test robot <l...@intel.com>
>>
>> Cc: Alexander Deucher <alexander.deuc...@amd.com>
>> Cc: Andrey Grodzovsky <andrey.grodzov...@amd.com>
>> Cc: Christian König <christian.koe...@amd.com>
>> Cc: Daniel Vetter <daniel.vet...@ffwll.ch>
>> Cc: Lucas Stach <l.st...@pengutronix.de>
>> Cc: Russell King <linux+etna...@armlinux.org.uk>
>> Cc: Christian Gmeiner <christian.gmei...@gmail.com>
>> Cc: Qiang Yu <yuq...@gmail.com>
>> Cc: Rob Herring <r...@kernel.org>
>> Cc: Tomeu Vizoso <tomeu.viz...@collabora.com>
>> Cc: Steven Price <steven.pr...@arm.com>
>> Cc: Alyssa Rosenzweig <alyssa.rosenzw...@collabora.com>
>> Cc: Eric Anholt <e...@anholt.net>
>>
>> v2: Use enum as the status of a driver's job
>>      timeout callback method.
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c |  6 +++--
>>   drivers/gpu/drm/etnaviv/etnaviv_sched.c | 10 +++++++-
>>   drivers/gpu/drm/lima/lima_sched.c       |  4 +++-
>>   drivers/gpu/drm/panfrost/panfrost_job.c |  9 ++++---
>>   drivers/gpu/drm/scheduler/sched_main.c  |  4 +---
>>   drivers/gpu/drm/v3d/v3d_sched.c         | 32 +++++++++++++------------
>>   include/drm/gpu_scheduler.h             | 20 +++++++++++++---
>>   7 files changed, 57 insertions(+), 28 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> index ff48101bab55..a111326cbdde 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> @@ -28,7 +28,7 @@
>>   #include "amdgpu.h"
>>   #include "amdgpu_trace.h"
>>   
>> -static void amdgpu_job_timedout(struct drm_sched_job *s_job)
>> +static enum drm_task_status amdgpu_job_timedout(struct drm_sched_job *s_job)
>>   {
>>      struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched);
>>      struct amdgpu_job *job = to_amdgpu_job(s_job);
>> @@ -41,7 +41,7 @@ static void amdgpu_job_timedout(struct drm_sched_job 
>> *s_job)
>>          amdgpu_ring_soft_recovery(ring, job->vmid, s_job->s_fence->parent)) 
>> {
>>              DRM_ERROR("ring %s timeout, but soft recovered\n",
>>                        s_job->sched->name);
>> -            return;
>> +            return DRM_TASK_STATUS_ALIVE;
>>      }
>>   
>>      amdgpu_vm_get_task_info(ring->adev, job->pasid, &ti);
>> @@ -53,10 +53,12 @@ static void amdgpu_job_timedout(struct drm_sched_job 
>> *s_job)
>>   
>>      if (amdgpu_device_should_recover_gpu(ring->adev)) {
>>              amdgpu_device_gpu_recover(ring->adev, job);
>> +            return DRM_TASK_STATUS_ALIVE;
>>      } else {
>>              drm_sched_suspend_timeout(&ring->sched);
>>              if (amdgpu_sriov_vf(adev))
>>                      adev->virt.tdr_debug = true;
>> +            return DRM_TASK_STATUS_ALIVE;
>>      }
>>   }
>>   
>> diff --git a/drivers/gpu/drm/etnaviv/etnaviv_sched.c 
>> b/drivers/gpu/drm/etnaviv/etnaviv_sched.c
>> index cd46c882269c..c49516942328 100644
>> --- a/drivers/gpu/drm/etnaviv/etnaviv_sched.c
>> +++ b/drivers/gpu/drm/etnaviv/etnaviv_sched.c
>> @@ -82,7 +82,8 @@ static struct dma_fence *etnaviv_sched_run_job(struct 
>> drm_sched_job *sched_job)
>>      return fence;
>>   }
>>   
>> -static void etnaviv_sched_timedout_job(struct drm_sched_job *sched_job)
>> +static enum drm_task_status etnaviv_sched_timedout_job(struct drm_sched_job
>> +                                                   *sched_job)
>>   {
>>      struct etnaviv_gem_submit *submit = to_etnaviv_submit(sched_job);
>>      struct etnaviv_gpu *gpu = submit->gpu;
>> @@ -120,9 +121,16 @@ static void etnaviv_sched_timedout_job(struct 
>> drm_sched_job *sched_job)
>>   
>>      drm_sched_resubmit_jobs(&gpu->sched);
>>   
>> +    /* Tell the DRM scheduler that this task needs
>> +     * more time.
>> +     */
>> +    drm_sched_start(&gpu->sched, true);
>> +    return DRM_TASK_STATUS_ALIVE;
>> +
>>   out_no_timeout:
>>      /* restart scheduler after GPU is usable again */
>>      drm_sched_start(&gpu->sched, true);
>> +    return DRM_TASK_STATUS_ALIVE;
>>   }
>>   
>>   static void etnaviv_sched_free_job(struct drm_sched_job *sched_job)
>> diff --git a/drivers/gpu/drm/lima/lima_sched.c 
>> b/drivers/gpu/drm/lima/lima_sched.c
>> index 63b4c5643f9c..66d9236b8760 100644
>> --- a/drivers/gpu/drm/lima/lima_sched.c
>> +++ b/drivers/gpu/drm/lima/lima_sched.c
>> @@ -415,7 +415,7 @@ static void lima_sched_build_error_task_list(struct 
>> lima_sched_task *task)
>>      mutex_unlock(&dev->error_task_list_lock);
>>   }
>>   
>> -static void lima_sched_timedout_job(struct drm_sched_job *job)
>> +static enum drm_task_status lima_sched_timedout_job(struct drm_sched_job 
>> *job)
>>   {
>>      struct lima_sched_pipe *pipe = to_lima_pipe(job->sched);
>>      struct lima_sched_task *task = to_lima_task(job);
>> @@ -449,6 +449,8 @@ static void lima_sched_timedout_job(struct drm_sched_job 
>> *job)
>>   
>>      drm_sched_resubmit_jobs(&pipe->base);
>>      drm_sched_start(&pipe->base, true);
>> +
>> +    return DRM_TASK_STATUS_ALIVE;
>>   }
>>   
>>   static void lima_sched_free_job(struct drm_sched_job *job)
>> diff --git a/drivers/gpu/drm/panfrost/panfrost_job.c 
>> b/drivers/gpu/drm/panfrost/panfrost_job.c
>> index 04e6f6f9b742..845148a722e4 100644
>> --- a/drivers/gpu/drm/panfrost/panfrost_job.c
>> +++ b/drivers/gpu/drm/panfrost/panfrost_job.c
>> @@ -432,7 +432,8 @@ static void panfrost_scheduler_start(struct 
>> panfrost_queue_state *queue)
>>      mutex_unlock(&queue->lock);
>>   }
>>   
>> -static void panfrost_job_timedout(struct drm_sched_job *sched_job)
>> +static enum drm_task_status panfrost_job_timedout(struct drm_sched_job
>> +                                              *sched_job)
>>   {
>>      struct panfrost_job *job = to_panfrost_job(sched_job);
>>      struct panfrost_device *pfdev = job->pfdev;
>> @@ -443,7 +444,7 @@ static void panfrost_job_timedout(struct drm_sched_job 
>> *sched_job)
>>       * spurious. Bail out.
>>       */
>>      if (dma_fence_is_signaled(job->done_fence))
>> -            return;
>> +            return DRM_TASK_STATUS_COMPLETE;
>>   
>>      dev_err(pfdev->dev, "gpu sched timeout, js=%d, config=0x%x, 
>> status=0x%x, head=0x%x, tail=0x%x, sched_job=%p",
>>              js,
>> @@ -455,11 +456,13 @@ static void panfrost_job_timedout(struct drm_sched_job 
>> *sched_job)
>>   
>>      /* Scheduler is already stopped, nothing to do. */
>>      if (!panfrost_scheduler_stop(&pfdev->js->queue[js], sched_job))
>> -            return;
>> +            return DRM_TASK_STATUS_ALIVE;
>>   
>>      /* Schedule a reset if there's no reset in progress. */
>>      if (!atomic_xchg(&pfdev->reset.pending, 1))
>>              schedule_work(&pfdev->reset.work);
>> +
>> +    return DRM_TASK_STATUS_ALIVE;
>>   }
>>   
>>   static const struct drm_sched_backend_ops panfrost_sched_ops = {
>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c 
>> b/drivers/gpu/drm/scheduler/sched_main.c
>> index 3eb7618a627d..b9876cad94f2 100644
>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>> @@ -526,7 +526,7 @@ void drm_sched_start(struct drm_gpu_scheduler *sched, 
>> bool full_recovery)
>>   EXPORT_SYMBOL(drm_sched_start);
>>   
>>   /**
>> - * drm_sched_resubmit_jobs - helper to relunch job from pending ring list
>> + * drm_sched_resubmit_jobs - helper to relaunch jobs from the pending list
>>    *
>>    * @sched: scheduler instance
>>    *
>> @@ -560,8 +560,6 @@ void drm_sched_resubmit_jobs(struct drm_gpu_scheduler 
>> *sched)
>>              } else {
>>                      s_job->s_fence->parent = fence;
>>              }
>> -
>> -
>>      }
>>   }
>>   EXPORT_SYMBOL(drm_sched_resubmit_jobs);
>> diff --git a/drivers/gpu/drm/v3d/v3d_sched.c 
>> b/drivers/gpu/drm/v3d/v3d_sched.c
>> index 452682e2209f..3740665ec479 100644
>> --- a/drivers/gpu/drm/v3d/v3d_sched.c
>> +++ b/drivers/gpu/drm/v3d/v3d_sched.c
>> @@ -259,7 +259,7 @@ v3d_cache_clean_job_run(struct drm_sched_job *sched_job)
>>      return NULL;
>>   }
>>   
>> -static void
>> +static enum drm_task_status
>>   v3d_gpu_reset_for_timeout(struct v3d_dev *v3d, struct drm_sched_job 
>> *sched_job)
>>   {
>>      enum v3d_queue q;
>> @@ -285,6 +285,8 @@ v3d_gpu_reset_for_timeout(struct v3d_dev *v3d, struct 
>> drm_sched_job *sched_job)
>>      }
>>   
>>      mutex_unlock(&v3d->reset_lock);
>> +
>> +    return DRM_TASK_STATUS_ALIVE;
>>   }
>>   
>>   /* If the current address or return address have changed, then the GPU
>> @@ -292,7 +294,7 @@ v3d_gpu_reset_for_timeout(struct v3d_dev *v3d, struct 
>> drm_sched_job *sched_job)
>>    * could fail if the GPU got in an infinite loop in the CL, but that
>>    * is pretty unlikely outside of an i-g-t testcase.
>>    */
>> -static void
>> +static enum drm_task_status
>>   v3d_cl_job_timedout(struct drm_sched_job *sched_job, enum v3d_queue q,
>>                  u32 *timedout_ctca, u32 *timedout_ctra)
>>   {
>> @@ -304,39 +306,39 @@ v3d_cl_job_timedout(struct drm_sched_job *sched_job, 
>> enum v3d_queue q,
>>      if (*timedout_ctca != ctca || *timedout_ctra != ctra) {
>>              *timedout_ctca = ctca;
>>              *timedout_ctra = ctra;
>> -            return;
>> +            return DRM_TASK_STATUS_ALIVE;
>>      }
>>   
>> -    v3d_gpu_reset_for_timeout(v3d, sched_job);
>> +    return v3d_gpu_reset_for_timeout(v3d, sched_job);
>>   }
>>   
>> -static void
>> +static enum drm_task_status
>>   v3d_bin_job_timedout(struct drm_sched_job *sched_job)
>>   {
>>      struct v3d_bin_job *job = to_bin_job(sched_job);
>>   
>> -    v3d_cl_job_timedout(sched_job, V3D_BIN,
>> -                        &job->timedout_ctca, &job->timedout_ctra);
>> +    return v3d_cl_job_timedout(sched_job, V3D_BIN,
>> +                               &job->timedout_ctca, &job->timedout_ctra);
>>   }
>>   
>> -static void
>> +static enum drm_task_status
>>   v3d_render_job_timedout(struct drm_sched_job *sched_job)
>>   {
>>      struct v3d_render_job *job = to_render_job(sched_job);
>>   
>> -    v3d_cl_job_timedout(sched_job, V3D_RENDER,
>> -                        &job->timedout_ctca, &job->timedout_ctra);
>> +    return v3d_cl_job_timedout(sched_job, V3D_RENDER,
>> +                               &job->timedout_ctca, &job->timedout_ctra);
>>   }
>>   
>> -static void
>> +static enum drm_task_status
>>   v3d_generic_job_timedout(struct drm_sched_job *sched_job)
>>   {
>>      struct v3d_job *job = to_v3d_job(sched_job);
>>   
>> -    v3d_gpu_reset_for_timeout(job->v3d, sched_job);
>> +    return v3d_gpu_reset_for_timeout(job->v3d, sched_job);
>>   }
>>   
>> -static void
>> +static enum drm_task_status
>>   v3d_csd_job_timedout(struct drm_sched_job *sched_job)
>>   {
>>      struct v3d_csd_job *job = to_csd_job(sched_job);
>> @@ -348,10 +350,10 @@ v3d_csd_job_timedout(struct drm_sched_job *sched_job)
>>       */
>>      if (job->timedout_batches != batches) {
>>              job->timedout_batches = batches;
>> -            return;
>> +            return DRM_TASK_STATUS_ALIVE;
>>      }
>>   
>> -    v3d_gpu_reset_for_timeout(v3d, sched_job);
>> +    return v3d_gpu_reset_for_timeout(v3d, sched_job);
>>   }
>>   
>>   static const struct drm_sched_backend_ops v3d_bin_sched_ops = {
>> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
>> index 2e0c368e19f6..cedfc5394e52 100644
>> --- a/include/drm/gpu_scheduler.h
>> +++ b/include/drm/gpu_scheduler.h
>> @@ -206,6 +206,11 @@ static inline bool drm_sched_invalidate_job(struct 
>> drm_sched_job *s_job,
>>      return s_job && atomic_inc_return(&s_job->karma) > threshold;
>>   }
>>   
>> +enum drm_task_status {
>> +    DRM_TASK_STATUS_COMPLETE,
>> +    DRM_TASK_STATUS_ALIVE
>> +};
>> +
>>   /**
>>    * struct drm_sched_backend_ops
>>    *
>> @@ -230,10 +235,19 @@ struct drm_sched_backend_ops {
>>      struct dma_fence *(*run_job)(struct drm_sched_job *sched_job);
>>   
>>      /**
>> -         * @timedout_job: Called when a job has taken too long to execute,
>> -         * to trigger GPU recovery.
>> +     * @timedout_job: Called when a job has taken too long to execute,
>> +     * to trigger GPU recovery.
>> +     *
>> +     * Return DRM_TASK_STATUS_ALIVE, if the task (job) is healthy
>> +     * and executing in the hardware, i.e. it needs more time.
>> +     *
>> +     * Return DRM_TASK_STATUS_COMPLETE, if the task (job) has
>> +     * been aborted or is unknown to the hardware, i.e. if
>> +     * the task is out of the hardware, and maybe it is now
>> +     * in the done list, or it was completed long ago, or
>> +     * if it is unknown to the hardware.
>>       */
>> -    void (*timedout_job)(struct drm_sched_job *sched_job);
>> +    enum drm_task_status (*timedout_job)(struct drm_sched_job *sched_job);
>>   
>>      /**
>>            * @free_job: Called once the job's finished fence has been 
>> signaled
> 

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Reply via email to