On 1/13/2025 7:12 AM, Jiang Liu wrote:
> Enhance amdgpu_ras_pre_fini() to better support suspend/resume by:
> 1) fix possible resource leakage. amdgpu_release_ras_context() only
>    kfree(con) but doesn't release resources associated with the con
>    object.
> 2) call amdgpu_ras_pre_fini() in amdgpu_device_suspend() to undo what
>    has been done by amdgpu_ras_late_init(), because amdgpu_device_resume()
>    will invoke amdgpu_ras_late_init() on resume.
> 3) move amdgpu_ras_recovery_fini() from amdgpu_ras_pre_fini() to
>    amdgpu_ras_fini()
> 4) move calling of `obj->ras_fini()` from amdgpu_ras_fini() to
>    amdgpu_ras_pre_fini().
> 
> Signed-off-by: Jiang Liu <ge...@linux.alibaba.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  6 ++-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c    | 44 +++++++++++++---------
>  2 files changed, 31 insertions(+), 19 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 0a121aab5c74..2bfe113e17c7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -4613,6 +4613,8 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>       return 0;
>  
>  release_ras_con:
> +     amdgpu_ras_pre_fini(adev);
> +     amdgpu_ras_fini(adev);
>       if (amdgpu_sriov_vf(adev))
>               amdgpu_virt_release_full_gpu(adev, true);
>  
> @@ -4627,8 +4629,6 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>               adev->virt.ops = NULL;
>               r = -EAGAIN;
>       }
> -     amdgpu_release_ras_context(adev);
> -
>  failed:
>       amdgpu_vf_error_trans_all(adev);
>  
> @@ -4921,6 +4921,8 @@ int amdgpu_device_suspend(struct drm_device *dev, bool 
> notify_clients)
>  
>       cancel_delayed_work_sync(&adev->delayed_init_work);
>  
> +     /* disable ras feature must before hw fini */
> +     amdgpu_ras_pre_fini(adev);
>       amdgpu_ras_suspend(adev);

Based on the usages above, it makes more sense to keep
amdgpu_ras_pre_fini as a static function and call in
ras_fini/ras_suspend (contain the calls at ras layer and avoid another
new public interface).

Copying Tao to take a look.

Thanks,
Lijo

>  
>       amdgpu_device_ip_suspend_phase1(adev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 7bbab7297c97..5ac63f9cffda 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -4270,42 +4270,49 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev)
>  int amdgpu_ras_pre_fini(struct amdgpu_device *adev)
>  {
>       struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> +     struct amdgpu_ras_block_list *node, *tmp;
> +     struct amdgpu_ras_block_object *obj;
>  
> -     if (!adev->ras_enabled || !con)
> -             return 0;
> +     if (amdgpu_sriov_vf(adev) && !amdgpu_sriov_ras_telemetry_en(adev))
> +             goto disable;
>  
> +     list_for_each_entry_safe(node, tmp, &adev->ras_list, node) {
> +             obj = node->ras_obj;
> +             if (!obj)
> +                     continue;
> +
> +             if (!amdgpu_ras_is_supported(adev, obj->ras_comm.block))
> +                     continue;
> +
> +             if (obj->ras_fini)
> +                     obj->ras_fini(adev, &obj->ras_comm);
> +             else
> +                     amdgpu_ras_block_late_fini_default(adev, 
> &obj->ras_comm);
> +     }
>  
> +disable:
>       /* Need disable ras on all IPs here before ip [hw/sw]fini */
> -     if (AMDGPU_RAS_GET_FEATURES(con->features))
> +     if (con && AMDGPU_RAS_GET_FEATURES(con->features))
>               amdgpu_ras_disable_all_features(adev, 0);
> -     amdgpu_ras_recovery_fini(adev);
> +
>       return 0;
>  }
>  
>  int amdgpu_ras_fini(struct amdgpu_device *adev)
>  {
>       struct amdgpu_ras_block_list *ras_node, *tmp;
> -     struct amdgpu_ras_block_object *obj = NULL;
>       struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
>  
>       if (!adev->ras_enabled || !con)
> -             return 0;
> +             goto out_free_context;
>  
>       list_for_each_entry_safe(ras_node, tmp, &adev->ras_list, node) {
> -             if (ras_node->ras_obj) {
> -                     obj = ras_node->ras_obj;
> -                     if (amdgpu_ras_is_supported(adev, obj->ras_comm.block) 
> &&
> -                         obj->ras_fini)
> -                             obj->ras_fini(adev, &obj->ras_comm);
> -                     else
> -                             amdgpu_ras_block_late_fini_default(adev, 
> &obj->ras_comm);
> -             }
> -
>               /* Clear ras blocks from ras_list and free ras block list node 
> */
>               list_del(&ras_node->node);
>               kfree(ras_node);
>       }
>  
> +     amdgpu_ras_recovery_fini(adev);
>       amdgpu_ras_fs_fini(adev);
>       amdgpu_ras_interrupt_remove_all(adev);
>  
> @@ -4323,8 +4330,11 @@ int amdgpu_ras_fini(struct amdgpu_device *adev)
>  
>       cancel_delayed_work_sync(&con->ras_counte_delay_work);
>  
> -     amdgpu_ras_set_context(adev, NULL);
> -     kfree(con);
> +out_free_context:
> +     if (con) {
> +             amdgpu_ras_set_context(adev, NULL);
> +             kfree(con);
> +     }
>  
>       return 0;
>  }

Reply via email to