A patch that would fix the GPU hangs seen on SI/CIK during pm
suspend/resume cycles has been lying dormant for more than a year.
See: https://gitlab.freedesktop.org/drm/amd/-/issues/2524

I'm not the original author of the patch; however, that person has not
responded for a few months, ever since Alex Deucher suggested that he
send the patch to the amd-gfx mailing list for review.

This patch handles the mutex locks/unlocks needed to prevent the
suspend/resume hangs. It mimics what was included in

commit 3712e7a494596b26861f4dc9b81676d1d0272eaf
Author: Evan Quan <evan.q...@amd.com>
Date:   Tue Nov 16 14:30:20 2021 +0800

    drm/amd/pm: unified lock protections in amdgpu_dpm.c

You may add my "Reviewed-by", but note that I have not tested the patch,
and it may be possible to narrow the locked sections so that they cover
fewer calls.

If requested, I'm willing to test it on Pitcairn and Tahiti to look for
any regressions.

Alexandre Demers

----

>From e62461803e84c181d6d237e27a215b788d72fa41 Mon Sep 17 00:00:00 2001
From: "chr[]" <chris@socke>
Date: Sun, 23 Apr 2023 06:13:47 +0200
Subject: [PATCH] amdgpu: fix suspend/resume issues

The resume path and the irq handler happily race in set_power_state()

* amdgpu_legacy_dpm_compute_clocks() needs lock
* protect irq work handler
* fix dpm_enabled usage
---
 drivers/gpu/drm/amd/pm/legacy-dpm/kv_dpm.c    | 19 ++++++++++++++----
 .../gpu/drm/amd/pm/legacy-dpm/legacy_dpm.c    |  2 ++
 drivers/gpu/drm/amd/pm/legacy-dpm/si_dpm.c    | 20 +++++++++++++++----
 3 files changed, 33 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/legacy-dpm/kv_dpm.c
b/drivers/gpu/drm/amd/pm/legacy-dpm/kv_dpm.c
index f5e08b60f66e..e260224b6152 100644
--- a/drivers/gpu/drm/amd/pm/legacy-dpm/kv_dpm.c
+++ b/drivers/gpu/drm/amd/pm/legacy-dpm/kv_dpm.c
@@ -3056,6 +3056,7 @@ static int kv_dpm_hw_init(void *handle)
    if (!amdgpu_dpm)
        return 0;

+   mutex_lock(&adev->pm.mutex);
    kv_dpm_setup_asic(adev);
    ret = kv_dpm_enable(adev);
    if (ret)
@@ -3063,6 +3064,8 @@ static int kv_dpm_hw_init(void *handle)
    else
        adev->pm.dpm_enabled = true;
    amdgpu_legacy_dpm_compute_clocks(adev);
+   mutex_unlock(&adev->pm.mutex);
+
    return ret;
 }

@@ -3081,10 +3084,13 @@ static int kv_dpm_suspend(void *handle)
    struct amdgpu_device *adev = (struct amdgpu_device *)handle;

    if (adev->pm.dpm_enabled) {
+       mutex_lock(&adev->pm.mutex);
+       adev->pm.dpm_enabled = false;
        /* disable dpm */
        kv_dpm_disable(adev);
        /* reset the power state */
        adev->pm.dpm.current_ps = adev->pm.dpm.requested_ps =
adev->pm.dpm.boot_ps;
+       mutex_unlock(&adev->pm.mutex);
    }
    return 0;
 }
@@ -3094,18 +3100,23 @@ static int kv_dpm_resume(void *handle)
    int ret;
    struct amdgpu_device *adev = (struct amdgpu_device *)handle;

-   if (adev->pm.dpm_enabled) {
+   if (!amdgpu_dpm)
+       return 0;
+
+   if (!adev->pm.dpm_enabled) {
+       mutex_lock(&adev->pm.mutex);
        /* asic init will reset to the boot state */
        kv_dpm_setup_asic(adev);
        ret = kv_dpm_enable(adev);
        if (ret)
            adev->pm.dpm_enabled = false;
-       else
+       else {
            adev->pm.dpm_enabled = true;
-       if (adev->pm.dpm_enabled)
            amdgpu_legacy_dpm_compute_clocks(adev);
+       }
+       mutex_unlock(&adev->pm.mutex);
    }
-   return 0;
+   return ret;
 }

 static bool kv_dpm_is_idle(void *handle)
diff --git a/drivers/gpu/drm/amd/pm/legacy-dpm/legacy_dpm.c
b/drivers/gpu/drm/amd/pm/legacy-dpm/legacy_dpm.c
index d3fe149d8476..665c218d9003 100644
--- a/drivers/gpu/drm/amd/pm/legacy-dpm/legacy_dpm.c
+++ b/drivers/gpu/drm/amd/pm/legacy-dpm/legacy_dpm.c
@@ -1047,6 +1047,7 @@ void amdgpu_dpm_thermal_work_handler(struct
work_struct *work)
    if (!adev->pm.dpm_enabled)
        return;

+   mutex_lock(&adev->pm.mutex);
    if (!pp_funcs->read_sensor(adev->powerplay.pp_handle,
                   AMDGPU_PP_SENSOR_GPU_TEMP,
                   (void *)&temp,
@@ -1068,4 +1069,5 @@ void amdgpu_dpm_thermal_work_handler(struct
work_struct *work)
    adev->pm.dpm.state = dpm_state;

    amdgpu_legacy_dpm_compute_clocks(adev->powerplay.pp_handle);
+   mutex_unlock(&adev->pm.mutex);
 }
diff --git a/drivers/gpu/drm/amd/pm/legacy-dpm/si_dpm.c
b/drivers/gpu/drm/amd/pm/legacy-dpm/si_dpm.c
index 49c398ec0aaf..15084872975b 100644
--- a/drivers/gpu/drm/amd/pm/legacy-dpm/si_dpm.c
+++ b/drivers/gpu/drm/amd/pm/legacy-dpm/si_dpm.c
@@ -7797,6 +7797,7 @@ static int si_dpm_hw_init(void *handle)
    if (!amdgpu_dpm)
        return 0;

+   mutex_lock(&adev->pm.mutex);
    si_dpm_setup_asic(adev);
    ret = si_dpm_enable(adev);
    if (ret)
@@ -7804,6 +7805,7 @@ static int si_dpm_hw_init(void *handle)
    else
        adev->pm.dpm_enabled = true;
    amdgpu_legacy_dpm_compute_clocks(adev);
+   mutex_unlock(&adev->pm.mutex);
    return ret;
 }

@@ -7822,11 +7824,15 @@ static int si_dpm_suspend(void *handle)
    struct amdgpu_device *adev = (struct amdgpu_device *)handle;

    if (adev->pm.dpm_enabled) {
+       mutex_lock(&adev->pm.mutex);
+       adev->pm.dpm_enabled = false;
        /* disable dpm */
        si_dpm_disable(adev);
        /* reset the power state */
        adev->pm.dpm.current_ps = adev->pm.dpm.requested_ps =
adev->pm.dpm.boot_ps;
+       mutex_unlock(&adev->pm.mutex);
    }
+
    return 0;
 }

@@ -7835,18 +7841,24 @@ static int si_dpm_resume(void *handle)
    int ret;
    struct amdgpu_device *adev = (struct amdgpu_device *)handle;

-   if (adev->pm.dpm_enabled) {
+   if (!amdgpu_dpm)
+       return 0;
+
+   if (!adev->pm.dpm_enabled) {
        /* asic init will reset to the boot state */
+       mutex_lock(&adev->pm.mutex);
        si_dpm_setup_asic(adev);
        ret = si_dpm_enable(adev);
        if (ret)
            adev->pm.dpm_enabled = false;
-       else
+       else {
            adev->pm.dpm_enabled = true;
-       if (adev->pm.dpm_enabled)
            amdgpu_legacy_dpm_compute_clocks(adev);
+       }
+       mutex_unlock(&adev->pm.mutex);
    }
-   return 0;
+
+   return ret;
 }

 static bool si_dpm_is_idle(void *handle)
-- 
2.30.2

Reply via email to