On 1/13/2025 7:12 AM, Jiang Liu wrote:
> Make the device state machine work in stack like way to better support
> suspend/resume by following changes:
> 
> 1. amdgpu_driver_load_kms()
>       amdgpu_device_init()
>               amdgpu_device_ip_early_init()
>                       ip_blocks[i].early_init()
>                       ip_blocks[i].status.valid = true
>               amdgpu_device_ip_init()
>                       amdgpu_ras_init()
>                       ip_blocks[i].sw_init()
>                       ip_blocks[i].status.sw = true
>                       ip_blocks[i].hw_init()
>                       ip_blocks[i].status.hw = true
>               amdgpu_device_ip_late_init()
>                       ip_blocks[i].late_init()
>                       ip_blocks[i].status.late_initialized = true
>                       amdgpu_ras_late_init()
>                               ras_blocks[i].ras_late_init()
>                                       amdgpu_ras_feature_enable_on_boot()
> 
> 2. amdgpu_pmops_suspend()/amdgpu_pmops_freeze()/amdgpu_pmops_poweroff()
>       amdgpu_device_suspend()
>               amdgpu_ras_early_fini()
>                       ras_blocks[i].ras_early_fini()
>                               amdgpu_ras_feature_disable()
>               amdgpu_ras_suspend()
>                       amdgpu_ras_disable_all_features()
> +++           ip_blocks[i].early_fini()
> +++           ip_blocks[i].status.late_initialized = false

As said in the previous patch, please don't add confusion. You could
maintain a state machine like early fini done/late fini done etc, but
please don't introduce this kind of confusing things.

Thanks,
Lijo

>               ip_blocks[i].suspend()
> 
> 3. amdgpu_pmops_resume()/amdgpu_pmops_thaw()/amdgpu_pmops_restore()
>       amdgpu_device_resume()
>               amdgpu_device_ip_resume()
>                       ip_blocks[i].resume()
>               amdgpu_device_ip_late_init()
>                       ip_blocks[i].late_init()
>                       ip_blocks[i].status.late_initialized = true
>                       amdgpu_ras_late_init()
>                               ras_blocks[i].ras_late_init()
>                                       amdgpu_ras_feature_enable_on_boot()
>               amdgpu_ras_resume()
>                       amdgpu_ras_enable_all_features()
> 
> 4. amdgpu_driver_unload_kms()
>       amdgpu_device_fini_hw()
>               amdgpu_ras_early_fini()
>                       ras_blocks[i].ras_early_fini()
> +++           ip_blocks[i].early_fini()
> +++           ip_blocks[i].status.late_initialized = false
>               ip_blocks[i].hw_fini()
>               ip_blocks[i].status.hw = false
> 
> 5. amdgpu_driver_release_kms()
>       amdgpu_device_fini_sw()
>               amdgpu_device_ip_fini()
>                       ip_blocks[i].sw_fini()
>                       ip_blocks[i].status.sw = false
> ---                   ip_blocks[i].status.valid = false
> +++                   amdgpu_ras_fini()
>                       ip_blocks[i].late_fini()
> +++                   ip_blocks[i].status.valid = false
> ---                   ip_blocks[i].status.late_initialized = false
> ---                   amdgpu_ras_fini()
> 
> The main changes include:
> 1) invoke ip_blocks[i].early_fini in amdgpu_pmops_suspend().
> 2) set ip_blocks[i].status.late_initialized to false after calling
>    callback `early_fini`. We have auditted all usages of the
>    late_initialized flag and no functional changes found.
> 3) only set ip_blocks[i].status.valid = false after calling the
>    `late_fini` callback.
> 4) call amdgpu_ras_fini() before invoking ip_blocks[i].late_fini.
> 
> There's one more task left to analyze GPU reset related state machine
> transitions.
> 
> Signed-off-by: Jiang Liu <ge...@linux.alibaba.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 22 ++++++++++++++++++++--
>  1 file changed, 20 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 6b503fb7e366..c2e4057ecd82 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -3449,6 +3449,8 @@ static int amdgpu_device_ip_fini(struct amdgpu_device 
> *adev)
>               adev->ip_blocks[i].status.sw = false;
>       }
>  
> +     amdgpu_ras_fini(adev);
> +
>       for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
>               if (!adev->ip_blocks[i].status.valid)
>                       continue;
> @@ -3457,8 +3459,6 @@ static int amdgpu_device_ip_fini(struct amdgpu_device 
> *adev)
>               adev->ip_blocks[i].status.valid = false;
>       }
>  
> -     amdgpu_ras_fini(adev);
> -
>       return 0;
>  }
>  
> @@ -3516,6 +3516,24 @@ static int amdgpu_device_ip_suspend_phase1(struct 
> amdgpu_device *adev)
>       if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
>               dev_warn(adev->dev, "Failed to disallow df cstate");
>  
> +     for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
> +             if (!adev->ip_blocks[i].status.valid)
> +                     continue;
> +             if (!adev->ip_blocks[i].status.late_initialized)
> +                     continue;
> +
> +             if (adev->ip_blocks[i].version->funcs->early_fini) {
> +                     r = 
> adev->ip_blocks[i].version->funcs->early_fini(&adev->ip_blocks[i]);
> +                     if (r) {
> +                             DRM_ERROR(" of IP block <%s> failed %d\n",
> +                                       
> adev->ip_blocks[i].version->funcs->name, r);
> +                             return r;
> +                     }
> +             }
> +
> +             adev->ip_blocks[i].status.late_initialized = false;
> +     }
> +
>       for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
>               if (!adev->ip_blocks[i].status.valid)
>                       continue;

Reply via email to