On 1/12/2025 19:42, Jiang Liu wrote:
Enhance amdgpu_ras_block_late_fini() to revert what has been done
by amdgpu_ras_block_late_init(), and fix a possible resource leakage
in function amdgpu_ras_block_late_init().

Signed-off-by: Jiang Liu <ge...@linux.alibaba.com>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 16 ++++++++++------
  1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index f0cd14ff78a7..7bbab7297c97 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -4104,13 +4104,13 @@ int amdgpu_ras_block_late_init(struct amdgpu_device *adev,
             ras_obj->hw_ops->query_ras_error_status)) {
                r = amdgpu_ras_sysfs_create(adev, ras_block);
                if (r)
-                       goto interrupt;
+                       goto cleanup;
/* Those are the cached values at init.
                 */
                query_info = kzalloc(sizeof(*query_info), GFP_KERNEL);
                if (!query_info)
-                       return -ENOMEM;
+                       goto cleanup;

AFAICT, you still need to set "r = -ENOMEM" here for this error flow; otherwise r may still be 0 at this point and the allocation failure would be returned as success.

                memcpy(&query_info->head, ras_block, sizeof(struct 
ras_common_if));
if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count, query_info) == 0) {
@@ -4123,11 +4123,8 @@ int amdgpu_ras_block_late_init(struct amdgpu_device *adev,
        return 0;
-interrupt:
-       if (ras_obj->ras_cb)
-               amdgpu_ras_interrupt_remove_handler(adev, ras_block);
  cleanup:
-       amdgpu_ras_feature_enable(adev, ras_block, 0);
+       amdgpu_ras_block_late_fini(adev, ras_block);
        return r;
  }
@@ -4142,9 +4139,16 @@ void amdgpu_ras_block_late_fini(struct amdgpu_device *adev,
                          struct ras_common_if *ras_block)
  {
        struct amdgpu_ras_block_object *ras_obj;
+
        if (!ras_block)
                return;
+ amdgpu_ras_feature_enable(adev, ras_block, 0);
+
+       /* in resume/reset phase, no need to delete ras fs node */
+       if (adev->in_suspend || amdgpu_in_reset(adev))
+               return;
+
        amdgpu_ras_sysfs_remove(adev, ras_block);
ras_obj = container_of(ras_block, struct amdgpu_ras_block_object, ras_comm);

Reply via email to