A patch that would fix the GPU hangs occurring during pm suspend/resume cycles on SI/CIK hardware has been lying dormant for more than a year. See: https://gitlab.freedesktop.org/drm/amd/-/issues/2524
I'm not the original author of the patch; however, the author has not responded in the months since Alex Deucher suggested that he send the patch to the amd-gfx mailing list for review. This patch adds the mutex lock/unlock calls needed to prevent the suspend/resume hangs. It mimics what was done in commit 3712e7a494596b26861f4dc9b81676d1d0272eaf Author: Evan Quan <evan.q...@amd.com> Date: Tue Nov 16 14:30:20 2021 +0800 drm/amd/pm: unified lock protections in amdgpu_dpm.c While you could add my "Reviewed-by", I have not tested it, and it may be possible to narrow the locked regions so that they cover fewer calls. I'm willing to test it on Pitcairn and Tahiti for regressions if requested. Alexandre Demers ---- >From e62461803e84c181d6d237e27a215b788d72fa41 Mon Sep 17 00:00:00 2001 From: "chr[]" <chris@socke> Date: Sun, 23 Apr 2023 06:13:47 +0200 Subject: [PATCH] amdgpu: fix suspend/resume issues resume and irq handler happily races in set_power_state() * amdgpu_legacy_dpm_compute_clocks() needs lock * protect irq work handler * fix dpm_enabled usage --- drivers/gpu/drm/amd/pm/legacy-dpm/kv_dpm.c | 19 ++++++++++++++---- .../gpu/drm/amd/pm/legacy-dpm/legacy_dpm.c | 2 ++ drivers/gpu/drm/amd/pm/legacy-dpm/si_dpm.c | 20 +++++++++++++++---- 3 files changed, 33 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/legacy-dpm/kv_dpm.c b/drivers/gpu/drm/amd/pm/legacy-dpm/kv_dpm.c index f5e08b60f66e..e260224b6152 100644 --- a/drivers/gpu/drm/amd/pm/legacy-dpm/kv_dpm.c +++ b/drivers/gpu/drm/amd/pm/legacy-dpm/kv_dpm.c @@ -3056,6 +3056,7 @@ static int kv_dpm_hw_init(void *handle) if (!amdgpu_dpm) return 0; + mutex_lock(&adev->pm.mutex); kv_dpm_setup_asic(adev); ret = kv_dpm_enable(adev); if (ret) @@ -3063,6 +3064,8 @@ static int kv_dpm_hw_init(void *handle) else adev->pm.dpm_enabled = true; amdgpu_legacy_dpm_compute_clocks(adev); + mutex_unlock(&adev->pm.mutex); + return ret; } @@ -3081,10 +3084,13 @@ static int kv_dpm_suspend(void *handle) struct 
amdgpu_device *adev = (struct amdgpu_device *)handle; if (adev->pm.dpm_enabled) { + mutex_lock(&adev->pm.mutex); + adev->pm.dpm_enabled = false; /* disable dpm */ kv_dpm_disable(adev); /* reset the power state */ adev->pm.dpm.current_ps = adev->pm.dpm.requested_ps = adev->pm.dpm.boot_ps; + mutex_unlock(&adev->pm.mutex); } return 0; } @@ -3094,18 +3100,23 @@ static int kv_dpm_resume(void *handle) int ret; struct amdgpu_device *adev = (struct amdgpu_device *)handle; - if (adev->pm.dpm_enabled) { + if (!amdgpu_dpm) + return 0; + + if (!adev->pm.dpm_enabled) { + mutex_lock(&adev->pm.mutex); /* asic init will reset to the boot state */ kv_dpm_setup_asic(adev); ret = kv_dpm_enable(adev); if (ret) adev->pm.dpm_enabled = false; - else + else { adev->pm.dpm_enabled = true; - if (adev->pm.dpm_enabled) amdgpu_legacy_dpm_compute_clocks(adev); + } + mutex_unlock(&adev->pm.mutex); } - return 0; + return ret; } static bool kv_dpm_is_idle(void *handle) diff --git a/drivers/gpu/drm/amd/pm/legacy-dpm/legacy_dpm.c b/drivers/gpu/drm/amd/pm/legacy-dpm/legacy_dpm.c index d3fe149d8476..665c218d9003 100644 --- a/drivers/gpu/drm/amd/pm/legacy-dpm/legacy_dpm.c +++ b/drivers/gpu/drm/amd/pm/legacy-dpm/legacy_dpm.c @@ -1047,6 +1047,7 @@ void amdgpu_dpm_thermal_work_handler(struct work_struct *work) if (!adev->pm.dpm_enabled) return; + mutex_lock(&adev->pm.mutex); if (!pp_funcs->read_sensor(adev->powerplay.pp_handle, AMDGPU_PP_SENSOR_GPU_TEMP, (void *)&temp, @@ -1068,4 +1069,5 @@ void amdgpu_dpm_thermal_work_handler(struct work_struct *work) adev->pm.dpm.state = dpm_state; amdgpu_legacy_dpm_compute_clocks(adev->powerplay.pp_handle); + mutex_unlock(&adev->pm.mutex); } diff --git a/drivers/gpu/drm/amd/pm/legacy-dpm/si_dpm.c b/drivers/gpu/drm/amd/pm/legacy-dpm/si_dpm.c index 49c398ec0aaf..15084872975b 100644 --- a/drivers/gpu/drm/amd/pm/legacy-dpm/si_dpm.c +++ b/drivers/gpu/drm/amd/pm/legacy-dpm/si_dpm.c @@ -7797,6 +7797,7 @@ static int si_dpm_hw_init(void *handle) if (!amdgpu_dpm) return 0; + 
mutex_lock(&adev->pm.mutex); si_dpm_setup_asic(adev); ret = si_dpm_enable(adev); if (ret) @@ -7804,6 +7805,7 @@ static int si_dpm_hw_init(void *handle) else adev->pm.dpm_enabled = true; amdgpu_legacy_dpm_compute_clocks(adev); + mutex_unlock(&adev->pm.mutex); return ret; } @@ -7822,11 +7824,15 @@ static int si_dpm_suspend(void *handle) struct amdgpu_device *adev = (struct amdgpu_device *)handle; if (adev->pm.dpm_enabled) { + mutex_lock(&adev->pm.mutex); + adev->pm.dpm_enabled = false; /* disable dpm */ si_dpm_disable(adev); /* reset the power state */ adev->pm.dpm.current_ps = adev->pm.dpm.requested_ps = adev->pm.dpm.boot_ps; + mutex_unlock(&adev->pm.mutex); } + return 0; } @@ -7835,18 +7841,24 @@ static int si_dpm_resume(void *handle) int ret; struct amdgpu_device *adev = (struct amdgpu_device *)handle; - if (adev->pm.dpm_enabled) { + if (!amdgpu_dpm) + return 0; + + if (!adev->pm.dpm_enabled) { /* asic init will reset to the boot state */ + mutex_lock(&adev->pm.mutex); si_dpm_setup_asic(adev); ret = si_dpm_enable(adev); if (ret) adev->pm.dpm_enabled = false; - else + else { adev->pm.dpm_enabled = true; - if (adev->pm.dpm_enabled) amdgpu_legacy_dpm_compute_clocks(adev); + } + mutex_unlock(&adev->pm.mutex); } - return 0; + + return ret; } static bool si_dpm_is_idle(void *handle) -- 2.30.2