drm/amd: Unwind for failed device suspend

Alex Deucher Wed, 29 Oct 2025 14:19:30 -0700

On Thu, Oct 23, 2025 at 12:53 PM Mario Limonciello
<[email protected]> wrote:
>
> If device suspend has failed, add a recovery flow that will attempt
> to unwind the suspend and get things back up and running.
>
> Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/4627
> Signed-off-by: Mario Limonciello <[email protected]>


Patch is:
Acked-by: Alex Deucher <[email protected]>

> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 80 +++++++++++++++++++---
>  1 file changed, 72 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 3ffb9bb1ec0b..645b15aa34f1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -5231,7 +5231,7 @@ void amdgpu_device_complete(struct drm_device *dev)
>  int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients)
>  {
>         struct amdgpu_device *adev = drm_to_adev(dev);
> -       int r = 0;
> +       int r, rec;
>
>         if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
>                 return 0;
> @@ -5247,8 +5247,9 @@ int amdgpu_device_suspend(struct drm_device *dev, bool 
> notify_clients)
>                         return r;
>         }
>
> -       if (amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D3))
> -               dev_warn(adev->dev, "smart shift update failed\n");
> +       r = amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D3);
> +       if (r)
> +               goto unwind_sriov;
>
>         if (notify_clients)
>                 drm_client_dev_suspend(adev_to_drm(adev), false);
> @@ -5259,16 +5260,16 @@ int amdgpu_device_suspend(struct drm_device *dev, 
> bool notify_clients)
>
>         r = amdgpu_device_ip_suspend_phase1(adev);
>         if (r)
> -               return r;
> +               goto unwind_smartshift;
>
>         amdgpu_amdkfd_suspend(adev, !amdgpu_sriov_vf(adev) && 
> !adev->in_runpm);
>         r = amdgpu_userq_suspend(adev);
>         if (r)
> -               return r;
> +               goto unwind_ip_phase1;
>
>         r = amdgpu_device_evict_resources(adev);
>         if (r)
> -               return r;
> +               goto unwind_userq;
>
>         amdgpu_ttm_set_buffer_funcs_status(adev, false);
>
> @@ -5276,16 +5277,79 @@ int amdgpu_device_suspend(struct drm_device *dev, 
> bool notify_clients)
>
>         r = amdgpu_device_ip_suspend_phase2(adev);
>         if (r)
> -               return r;
> +               goto unwind_evict;
>
>         if (amdgpu_sriov_vf(adev))
>                 amdgpu_virt_release_full_gpu(adev, false);
>
>         r = amdgpu_dpm_notify_rlc_state(adev, false);
>         if (r)
> -               return r;
> +               goto unwind_ip_phase2;
>
>         return 0;
> +
> +unwind_ip_phase2:
> +       /* suspend phase 2 = resume phase 2 + resume phase 1 */
> +       rec = amdgpu_device_ip_resume_phase2(adev);
> +       if (rec) {
> +               dev_warn(adev->dev, "failed to re-initialize IPs phase2: 
> %d\n", rec);
> +               return r;
> +       }
> +       rec = amdgpu_device_fw_loading(adev);
> +       if (rec) {
> +               dev_warn(adev->dev, "failed to reload firmwares: %d\n", rec);
> +               return r;
> +       }
> +       rec = amdgpu_device_ip_resume_phase1(adev);
> +       if (rec) {
> +               dev_warn(adev->dev, "failed to re-initialize IPs phase1: 
> %d\n", rec);
> +               return r;
> +       }
> +
> +unwind_evict:
> +       if (adev->mman.buffer_funcs_ring->sched.ready)
> +               amdgpu_ttm_set_buffer_funcs_status(adev, true);
> +       amdgpu_fence_driver_hw_init(adev);
> +
> +unwind_userq:
> +       rec = amdgpu_userq_resume(adev);
> +       if (rec) {
> +               dev_warn(adev->dev, "failed to re-initialize user queues: 
> %d\n", rec);
> +               return r;
> +       }
> +       rec = amdgpu_amdkfd_resume(adev, !amdgpu_sriov_vf(adev) && 
> !adev->in_runpm);
> +       if (rec) {
> +               dev_warn(adev->dev, "failed to re-initialize kfd: %d\n", rec);
> +               return r;
> +       }
> +
> +unwind_ip_phase1:
> +       /* suspend phase 1 = resume phase 3 */
> +       rec = amdgpu_device_ip_resume_phase3(adev);
> +       if (rec) {
> +               dev_warn(adev->dev, "failed to re-initialize IPs phase1: 
> %d\n", rec);
> +               return r;
> +       }
> +
> +unwind_smartshift:
> +       rec = amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D0);
> +       if (rec) {
> +               dev_warn(adev->dev, "failed to re-update smart shift: %d\n", 
> rec);
> +               return r;
> +       }
> +
> +unwind_sriov:
> +       if (amdgpu_sriov_vf(adev)) {
> +               rec = amdgpu_virt_request_full_gpu(adev, true);
> +               if (rec) {
> +                       dev_warn(adev->dev, "failed to reinitialize sriov: 
> %d\n", rec);
> +                       return r;
> +               }
> +       }
> +
> +       adev->in_suspend = adev->in_s0ix = adev->in_s3 = false;
> +
> +       return r;
>  }
>
>  static inline int amdgpu_virt_resume(struct amdgpu_device *adev)
> --
> 2.51.1
>

Re: [PATCH v4 1/2] drm/amd: Unwind for failed device suspend

Reply via email to