On Thu, Oct 23, 2025 at 12:53 PM Mario Limonciello <[email protected]> wrote:
>
> If device suspend has failed, add a recovery flow that will attempt
> to unwind the suspend and get things back up and running.
>
> Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/4627
> Signed-off-by: Mario Limonciello <[email protected]>

Patch is:
Acked-by: Alex Deucher <[email protected]>

> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 80 +++++++++++++++++++---
>  1 file changed, 72 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 3ffb9bb1ec0b..645b15aa34f1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -5231,7 +5231,7 @@ void amdgpu_device_complete(struct drm_device *dev)
>  int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients)
>  {
>         struct amdgpu_device *adev = drm_to_adev(dev);
> -       int r = 0;
> +       int r, rec;
>
>         if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
>                 return 0;
> @@ -5247,8 +5247,9 @@ int amdgpu_device_suspend(struct drm_device *dev, bool
> notify_clients)
>                         return r;
>         }
>
> -       if (amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D3))
> -               dev_warn(adev->dev, "smart shift update failed\n");
> +       r = amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D3);
> +       if (r)
> +               goto unwind_sriov;
>
>         if (notify_clients)
>                 drm_client_dev_suspend(adev_to_drm(adev), false);
> @@ -5259,16 +5260,16 @@ int amdgpu_device_suspend(struct drm_device *dev,
> bool notify_clients)
>
>         r = amdgpu_device_ip_suspend_phase1(adev);
>         if (r)
> -               return r;
> +               goto unwind_smartshift;
>
>         amdgpu_amdkfd_suspend(adev, !amdgpu_sriov_vf(adev) &&
> !adev->in_runpm);
>         r = amdgpu_userq_suspend(adev);
>         if (r)
> -               return r;
> +               goto unwind_ip_phase1;
>
>         r = amdgpu_device_evict_resources(adev);
>         if (r)
> -               return r;
> +               goto unwind_userq;
>
>         amdgpu_ttm_set_buffer_funcs_status(adev, false);
>
> @@ -5276,16 +5277,79 @@ int amdgpu_device_suspend(struct drm_device *dev,
> bool notify_clients)
>
>         r = amdgpu_device_ip_suspend_phase2(adev);
>         if (r)
> -               return r;
> +               goto unwind_evict;
>
>         if (amdgpu_sriov_vf(adev))
>                 amdgpu_virt_release_full_gpu(adev, false);
>
>         r = amdgpu_dpm_notify_rlc_state(adev, false);
>         if (r)
> -               return r;
> +               goto unwind_ip_phase2;
>
>         return 0;
> +
> +unwind_ip_phase2:
> +       /* suspend phase 2 = resume phase 2 + resume phase 1 */
> +       rec = amdgpu_device_ip_resume_phase2(adev);
> +       if (rec) {
> +               dev_warn(adev->dev, "failed to re-initialize IPs phase2:
> %d\n", rec);
> +               return r;
> +       }
> +       rec = amdgpu_device_fw_loading(adev);
> +       if (rec) {
> +               dev_warn(adev->dev, "failed to reload firmwares: %d\n", rec);
> +               return r;
> +       }
> +       rec = amdgpu_device_ip_resume_phase1(adev);
> +       if (rec) {
> +               dev_warn(adev->dev, "failed to re-initialize IPs phase1:
> %d\n", rec);
> +               return r;
> +       }
> +
> +unwind_evict:
> +       if (adev->mman.buffer_funcs_ring->sched.ready)
> +               amdgpu_ttm_set_buffer_funcs_status(adev, true);
> +       amdgpu_fence_driver_hw_init(adev);
> +
> +unwind_userq:
> +       rec = amdgpu_userq_resume(adev);
> +       if (rec) {
> +               dev_warn(adev->dev, "failed to re-initialize user queues:
> %d\n", rec);
> +               return r;
> +       }
> +       rec = amdgpu_amdkfd_resume(adev, !amdgpu_sriov_vf(adev) &&
> !adev->in_runpm);
> +       if (rec) {
> +               dev_warn(adev->dev, "failed to re-initialize kfd: %d\n", rec);
> +               return r;
> +       }
> +
> +unwind_ip_phase1:
> +       /* suspend phase 1 = resume phase 3 */
> +       rec = amdgpu_device_ip_resume_phase3(adev);
> +       if (rec) {
> +               dev_warn(adev->dev, "failed to re-initialize IPs phase1:
> %d\n", rec);
> +               return r;
> +       }
> +
> +unwind_smartshift:
> +       rec = amdgpu_acpi_smart_shift_update(adev, AMDGPU_SS_DEV_D0);
> +       if (rec) {
> +               dev_warn(adev->dev, "failed to re-update smart shift: %d\n",
> rec);
> +               return r;
> +       }
> +
> +unwind_sriov:
> +       if (amdgpu_sriov_vf(adev)) {
> +               rec = amdgpu_virt_request_full_gpu(adev, true);
> +               if (rec) {
> +                       dev_warn(adev->dev, "failed to reinitialize sriov:
> %d\n", rec);
> +                       return r;
> +               }
> +       }
> +
> +       adev->in_suspend = adev->in_s0ix = adev->in_s3 = false;
> +
> +       return r;
>  }
>
>  static inline int amdgpu_virt_resume(struct amdgpu_device *adev)
> --
> 2.51.1
>
