Applied to drm-misc-next
On 4/16/2025 12:25 PM, Maciej Falkowski wrote:
> From: Karol Wachowski <karol.wachow...@intel.com>
>
> Introduce a heartbeat-based Timeout Detection and Recovery (TDR) mechanism.
> The enhancement aims to improve the reliability of device hang detection by
> monitoring heartbeat updates.
>
> Each progressing inference updates the heartbeat counter, allowing the driver
> to monitor its progression. Limit the maximum number of reschedules to 30 when
> the heartbeat indicates progression.
>
> The heartbeat mechanism provides a more robust method for detecting device
> hangs, potentially reducing false-positive recoveries caused by long-running
> inferences.
>
> Signed-off-by: Karol Wachowski <karol.wachow...@intel.com>
> Signed-off-by: Maciej Falkowski <maciej.falkow...@linux.intel.com>
> ---
> drivers/accel/ivpu/ivpu_drv.c | 4 ++++
> drivers/accel/ivpu/ivpu_drv.h | 1 +
> drivers/accel/ivpu/ivpu_fw.h | 1 +
> drivers/accel/ivpu/ivpu_pm.c | 20 ++++++++++++++++++++
> 4 files changed, 26 insertions(+)
>
> diff --git a/drivers/accel/ivpu/ivpu_drv.c b/drivers/accel/ivpu/ivpu_drv.c
> index eff1d3ca075f..0e7748c5e117 100644
> --- a/drivers/accel/ivpu/ivpu_drv.c
> +++ b/drivers/accel/ivpu/ivpu_drv.c
> @@ -374,6 +374,9 @@ int ivpu_boot(struct ivpu_device *vdev)
> {
> int ret;
>
> + drm_WARN_ON(&vdev->drm, atomic_read(&vdev->job_timeout_counter));
> + drm_WARN_ON(&vdev->drm, !xa_empty(&vdev->submitted_jobs_xa));
> +
> /* Update boot params located at first 4KB of FW memory */
> ivpu_fw_boot_params_setup(vdev, ivpu_bo_vaddr(vdev->fw->mem));
>
> @@ -573,6 +576,7 @@ static int ivpu_dev_init(struct ivpu_device *vdev)
> vdev->context_xa_limit.min = IVPU_USER_CONTEXT_MIN_SSID;
> vdev->context_xa_limit.max = IVPU_USER_CONTEXT_MAX_SSID;
> atomic64_set(&vdev->unique_id_counter, 0);
> + atomic_set(&vdev->job_timeout_counter, 0);
> xa_init_flags(&vdev->context_xa, XA_FLAGS_ALLOC | XA_FLAGS_LOCK_IRQ);
> xa_init_flags(&vdev->submitted_jobs_xa, XA_FLAGS_ALLOC1);
> xa_init_flags(&vdev->db_xa, XA_FLAGS_ALLOC1);
> diff --git a/drivers/accel/ivpu/ivpu_drv.h b/drivers/accel/ivpu/ivpu_drv.h
> index 92753effb1c9..5497e7030e91 100644
> --- a/drivers/accel/ivpu/ivpu_drv.h
> +++ b/drivers/accel/ivpu/ivpu_drv.h
> @@ -154,6 +154,7 @@ struct ivpu_device {
> struct mutex submitted_jobs_lock; /* Protects submitted_jobs */
> struct xarray submitted_jobs_xa;
> struct ivpu_ipc_consumer job_done_consumer;
> + atomic_t job_timeout_counter;
>
> atomic64_t unique_id_counter;
>
> diff --git a/drivers/accel/ivpu/ivpu_fw.h b/drivers/accel/ivpu/ivpu_fw.h
> index 1d0b2bd9d65c..9a3935be1c05 100644
> --- a/drivers/accel/ivpu/ivpu_fw.h
> +++ b/drivers/accel/ivpu/ivpu_fw.h
> @@ -39,6 +39,7 @@ struct ivpu_fw_info {
> u64 read_only_addr;
> u32 read_only_size;
> u32 sched_mode;
> + u64 last_heartbeat;
> };
>
> int ivpu_fw_init(struct ivpu_device *vdev);
> diff --git a/drivers/accel/ivpu/ivpu_pm.c b/drivers/accel/ivpu/ivpu_pm.c
> index b5891e91f7ab..1fe03fc16bbc 100644
> --- a/drivers/accel/ivpu/ivpu_pm.c
> +++ b/drivers/accel/ivpu/ivpu_pm.c
> @@ -34,6 +34,7 @@ module_param_named(tdr_timeout_ms, ivpu_tdr_timeout_ms,
> ulong, 0644);
> MODULE_PARM_DESC(tdr_timeout_ms, "Timeout for device hang detection, in
> milliseconds, 0 - default");
>
> #define PM_RESCHEDULE_LIMIT 5
> +#define PM_TDR_HEARTBEAT_LIMIT 30
>
> static void ivpu_pm_prepare_cold_boot(struct ivpu_device *vdev)
> {
> @@ -44,6 +45,7 @@ static void ivpu_pm_prepare_cold_boot(struct ivpu_device
> *vdev)
> ivpu_fw_log_reset(vdev);
> ivpu_fw_load(vdev);
> fw->entry_point = fw->cold_boot_entry_point;
> + fw->last_heartbeat = 0;
> }
>
> static void ivpu_pm_prepare_warm_boot(struct ivpu_device *vdev)
> @@ -189,7 +191,24 @@ static void ivpu_job_timeout_work(struct work_struct
> *work)
> {
> struct ivpu_pm_info *pm = container_of(work, struct ivpu_pm_info,
> job_timeout_work.work);
> struct ivpu_device *vdev = pm->vdev;
> + u64 heartbeat;
>
> + if (ivpu_jsm_get_heartbeat(vdev, 0, &heartbeat) || heartbeat <=
> vdev->fw->last_heartbeat) {
> + ivpu_err(vdev, "Job timeout detected, heartbeat not
> progressed\n");
> + goto recovery;
> + }
> +
> + if (atomic_fetch_inc(&vdev->job_timeout_counter) >
> PM_TDR_HEARTBEAT_LIMIT) {
> + ivpu_err(vdev, "Job timeout detected, heartbeat limit
> exceeded\n");
> + goto recovery;
> + }
> +
> + vdev->fw->last_heartbeat = heartbeat;
> + ivpu_start_job_timeout_detection(vdev);
> + return;
> +
> +recovery:
> + atomic_set(&vdev->job_timeout_counter, 0);
> ivpu_pm_trigger_recovery(vdev, "TDR");
> }
>
> @@ -204,6 +223,7 @@ void ivpu_start_job_timeout_detection(struct ivpu_device
> *vdev)
> void ivpu_stop_job_timeout_detection(struct ivpu_device *vdev)
> {
> cancel_delayed_work_sync(&vdev->pm->job_timeout_work);
> + atomic_set(&vdev->job_timeout_counter, 0);
> }
>
> int ivpu_pm_suspend_cb(struct device *dev)