Make the device state machine work in stack like way to better support
suspend/resume by following changes:

1. amdgpu_driver_load_kms()
        amdgpu_device_init()
                amdgpu_device_ip_early_init()
                        ip_blocks[i].early_init()
                        ip_blocks[i].status.valid = true
                amdgpu_device_ip_init()
                        amdgpu_ras_init()
                        ip_blocks[i].sw_init()
                        ip_blocks[i].status.sw = true
                        ip_blocks[i].hw_init()
                        ip_blocks[i].status.hw = true
                amdgpu_device_ip_late_init()
                        ip_blocks[i].late_init()
                        ip_blocks[i].status.late_initialized = true
                        amdgpu_ras_late_init()
                                ras_blocks[i].ras_late_init()
                                        amdgpu_ras_feature_enable_on_boot()

2. amdgpu_pmops_suspend()/amdgpu_pmops_freeze()/amdgpu_pmops_poweroff()
        amdgpu_device_suspend()
                amdgpu_ras_early_fini()
                        ras_blocks[i].ras_early_fini()
                                amdgpu_ras_feature_disable()
                amdgpu_ras_suspend()
                        amdgpu_ras_disable_all_features()
+++             ip_blocks[i].early_fini()
+++             ip_blocks[i].status.late_initialized = false
                ip_blocks[i].suspend()

3. amdgpu_pmops_resume()/amdgpu_pmops_thaw()/amdgpu_pmops_restore()
        amdgpu_device_resume()
                amdgpu_device_ip_resume()
                        ip_blocks[i].resume()
                amdgpu_device_ip_late_init()
                        ip_blocks[i].late_init()
                        ip_blocks[i].status.late_initialized = true
                        amdgpu_ras_late_init()
                                ras_blocks[i].ras_late_init()
                                        amdgpu_ras_feature_enable_on_boot()
                amdgpu_ras_resume()
                        amdgpu_ras_enable_all_features()

4. amdgpu_driver_unload_kms()
        amdgpu_device_fini_hw()
                amdgpu_ras_early_fini()
                        ras_blocks[i].ras_early_fini()
+++             ip_blocks[i].early_fini()
+++             ip_blocks[i].status.late_initialized = false
                ip_blocks[i].hw_fini()
                ip_blocks[i].status.hw = false

5. amdgpu_driver_release_kms()
        amdgpu_device_fini_sw()
                amdgpu_device_ip_fini()
                        ip_blocks[i].sw_fini()
                        ip_blocks[i].status.sw = false
---                     ip_blocks[i].status.valid = false
+++                     amdgpu_ras_fini()
                        ip_blocks[i].late_fini()
+++                     ip_blocks[i].status.valid = false
---                     ip_blocks[i].status.late_initialized = false
---                     amdgpu_ras_fini()

The main changes include:
1) invoke ip_blocks[i].early_fini in amdgpu_pmops_suspend().
2) set ip_blocks[i].status.late_initialized to false after calling
   callback `early_fini`. We have auditted all usages of the
   late_initialized flag and no functional changes found.
3) only set ip_blocks[i].status.valid = false after calling the
   `late_fini` callback.
4) call amdgpu_ras_fini() before invoking ip_blocks[i].late_fini.

There's one more task left to analyze GPU reset related state machine
transitions.

Signed-off-by: Jiang Liu <ge...@linux.alibaba.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 6b503fb7e366..c2e4057ecd82 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3449,6 +3449,8 @@ static int amdgpu_device_ip_fini(struct amdgpu_device 
*adev)
                adev->ip_blocks[i].status.sw = false;
        }
 
+       amdgpu_ras_fini(adev);
+
        for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
                if (!adev->ip_blocks[i].status.valid)
                        continue;
@@ -3457,8 +3459,6 @@ static int amdgpu_device_ip_fini(struct amdgpu_device 
*adev)
                adev->ip_blocks[i].status.valid = false;
        }
 
-       amdgpu_ras_fini(adev);
-
        return 0;
 }
 
@@ -3516,6 +3516,24 @@ static int amdgpu_device_ip_suspend_phase1(struct 
amdgpu_device *adev)
        if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
                dev_warn(adev->dev, "Failed to disallow df cstate");
 
+       for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
+               if (!adev->ip_blocks[i].status.valid)
+                       continue;
+               if (!adev->ip_blocks[i].status.late_initialized)
+                       continue;
+
+               if (adev->ip_blocks[i].version->funcs->early_fini) {
+                       r = 
adev->ip_blocks[i].version->funcs->early_fini(&adev->ip_blocks[i]);
+                       if (r) {
+                               DRM_ERROR(" of IP block <%s> failed %d\n",
+                                         
adev->ip_blocks[i].version->funcs->name, r);
+                               return r;
+                       }
+               }
+
+               adev->ip_blocks[i].status.late_initialized = false;
+       }
+
        for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
                if (!adev->ip_blocks[i].status.valid)
                        continue;
-- 
2.43.5

Reply via email to