Applied to drm-misc-next

On 4/16/2025 12:25 PM, Maciej Falkowski wrote:
> From: Karol Wachowski <karol.wachow...@intel.com>
> 
> Introduce a heartbeat-based Timeout Detection and Recovery (TDR) mechanism.
> The enhancement aims to improve the reliability of device hang detection by
> monitoring heartbeat updates.
> 
> Each progressing inference updates the heartbeat counter, allowing the driver
> to monitor its progression. Limit the maximum number of reschedules to 30
> when the heartbeat indicates progression.
> 
> The heartbeat mechanism provides a more robust method for detecting device
> hangs, potentially reducing false-positive recoveries due to long-running
> inferences.
> 
> Signed-off-by: Karol Wachowski <karol.wachow...@intel.com>
> Signed-off-by: Maciej Falkowski <maciej.falkow...@linux.intel.com>
> ---
>  drivers/accel/ivpu/ivpu_drv.c |  4 ++++
>  drivers/accel/ivpu/ivpu_drv.h |  1 +
>  drivers/accel/ivpu/ivpu_fw.h  |  1 +
>  drivers/accel/ivpu/ivpu_pm.c  | 20 ++++++++++++++++++++
>  4 files changed, 26 insertions(+)
> 
> diff --git a/drivers/accel/ivpu/ivpu_drv.c b/drivers/accel/ivpu/ivpu_drv.c
> index eff1d3ca075f..0e7748c5e117 100644
> --- a/drivers/accel/ivpu/ivpu_drv.c
> +++ b/drivers/accel/ivpu/ivpu_drv.c
> @@ -374,6 +374,9 @@ int ivpu_boot(struct ivpu_device *vdev)
>  {
>       int ret;
>  
> +     drm_WARN_ON(&vdev->drm, atomic_read(&vdev->job_timeout_counter));
> +     drm_WARN_ON(&vdev->drm, !xa_empty(&vdev->submitted_jobs_xa));
> +
>       /* Update boot params located at first 4KB of FW memory */
>       ivpu_fw_boot_params_setup(vdev, ivpu_bo_vaddr(vdev->fw->mem));
>  
> @@ -573,6 +576,7 @@ static int ivpu_dev_init(struct ivpu_device *vdev)
>       vdev->context_xa_limit.min = IVPU_USER_CONTEXT_MIN_SSID;
>       vdev->context_xa_limit.max = IVPU_USER_CONTEXT_MAX_SSID;
>       atomic64_set(&vdev->unique_id_counter, 0);
> +     atomic_set(&vdev->job_timeout_counter, 0);
>       xa_init_flags(&vdev->context_xa, XA_FLAGS_ALLOC | XA_FLAGS_LOCK_IRQ);
>       xa_init_flags(&vdev->submitted_jobs_xa, XA_FLAGS_ALLOC1);
>       xa_init_flags(&vdev->db_xa, XA_FLAGS_ALLOC1);
> diff --git a/drivers/accel/ivpu/ivpu_drv.h b/drivers/accel/ivpu/ivpu_drv.h
> index 92753effb1c9..5497e7030e91 100644
> --- a/drivers/accel/ivpu/ivpu_drv.h
> +++ b/drivers/accel/ivpu/ivpu_drv.h
> @@ -154,6 +154,7 @@ struct ivpu_device {
>       struct mutex submitted_jobs_lock; /* Protects submitted_jobs */
>       struct xarray submitted_jobs_xa;
>       struct ivpu_ipc_consumer job_done_consumer;
> +     atomic_t job_timeout_counter;
>  
>       atomic64_t unique_id_counter;
>  
> diff --git a/drivers/accel/ivpu/ivpu_fw.h b/drivers/accel/ivpu/ivpu_fw.h
> index 1d0b2bd9d65c..9a3935be1c05 100644
> --- a/drivers/accel/ivpu/ivpu_fw.h
> +++ b/drivers/accel/ivpu/ivpu_fw.h
> @@ -39,6 +39,7 @@ struct ivpu_fw_info {
>       u64 read_only_addr;
>       u32 read_only_size;
>       u32 sched_mode;
> +     u64 last_heartbeat;
>  };
>  
>  int ivpu_fw_init(struct ivpu_device *vdev);
> diff --git a/drivers/accel/ivpu/ivpu_pm.c b/drivers/accel/ivpu/ivpu_pm.c
> index b5891e91f7ab..1fe03fc16bbc 100644
> --- a/drivers/accel/ivpu/ivpu_pm.c
> +++ b/drivers/accel/ivpu/ivpu_pm.c
> @@ -34,6 +34,7 @@ module_param_named(tdr_timeout_ms, ivpu_tdr_timeout_ms, 
> ulong, 0644);
>  MODULE_PARM_DESC(tdr_timeout_ms, "Timeout for device hang detection, in 
> milliseconds, 0 - default");
>  
>  #define PM_RESCHEDULE_LIMIT     5
> +#define PM_TDR_HEARTBEAT_LIMIT  30
>  
>  static void ivpu_pm_prepare_cold_boot(struct ivpu_device *vdev)
>  {
> @@ -44,6 +45,7 @@ static void ivpu_pm_prepare_cold_boot(struct ivpu_device 
> *vdev)
>       ivpu_fw_log_reset(vdev);
>       ivpu_fw_load(vdev);
>       fw->entry_point = fw->cold_boot_entry_point;
> +     fw->last_heartbeat = 0;
>  }
>  
>  static void ivpu_pm_prepare_warm_boot(struct ivpu_device *vdev)
> @@ -189,7 +191,24 @@ static void ivpu_job_timeout_work(struct work_struct 
> *work)
>  {
>       struct ivpu_pm_info *pm = container_of(work, struct ivpu_pm_info, 
> job_timeout_work.work);
>       struct ivpu_device *vdev = pm->vdev;
> +     u64 heartbeat;
>  
> +     if (ivpu_jsm_get_heartbeat(vdev, 0, &heartbeat) || heartbeat <= 
> vdev->fw->last_heartbeat) {
> +             ivpu_err(vdev, "Job timeout detected, heartbeat not 
> progressed\n");
> +             goto recovery;
> +     }
> +
> +     if (atomic_fetch_inc(&vdev->job_timeout_counter) > 
> PM_TDR_HEARTBEAT_LIMIT) {
> +             ivpu_err(vdev, "Job timeout detected, heartbeat limit 
> exceeded\n");
> +             goto recovery;
> +     }
> +
> +     vdev->fw->last_heartbeat = heartbeat;
> +     ivpu_start_job_timeout_detection(vdev);
> +     return;
> +
> +recovery:
> +     atomic_set(&vdev->job_timeout_counter, 0);
>       ivpu_pm_trigger_recovery(vdev, "TDR");
>  }
>  
> @@ -204,6 +223,7 @@ void ivpu_start_job_timeout_detection(struct ivpu_device 
> *vdev)
>  void ivpu_stop_job_timeout_detection(struct ivpu_device *vdev)
>  {
>       cancel_delayed_work_sync(&vdev->pm->job_timeout_work);
> +     atomic_set(&vdev->job_timeout_counter, 0);
>  }
>  
>  int ivpu_pm_suspend_cb(struct device *dev)

Reply via email to