On 8/30/19 8:24 AM, Tao Zhou wrote:
> ras recovery_init should be called after ttm and smu init,
> bad page reserve should be put in front of gpu reset since i2c
> may be unstable during gpu reset
> add cleanup for recovery_init and recovery_fini
>
> Signed-off-by: Tao Zhou <tao.zh...@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 16 +++++++++----
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c    | 28 +++++++++++++---------
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h    |  5 ++++
>   3 files changed, 33 insertions(+), 16 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 67b09cb2a9e2..4e4895ac926d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -2894,6 +2894,17 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>               goto failed;
>       }
>   
> +     /* retired pages will be loaded from eeprom and reserved here,
> +      * new bo may be created, it should be called after ttm init,
> +      * accessing eeprom also relies on smu (unlock i2c bus) and it
> +      * should be called after smu init
> +      *
> +      * Note: theoretically, this should be called before all vram 
> allocations
> +      * to protect retired page from abusing, but there are some allocations
> +      * in front of smu fw loading
> +      */
> +     amdgpu_ras_recovery_init(adev);


You should probably check the return value here.


> +
>       /* must succeed. */
>       amdgpu_ras_resume(adev);
>   
> @@ -3627,11 +3638,6 @@ static int amdgpu_do_asic_reset(struct 
> amdgpu_hive_info *hive,
>                                               break;
>                               }
>                       }
> -
> -                     list_for_each_entry(tmp_adev, device_list_handle,
> -                                     gmc.xgmi.head) {
> -                             amdgpu_ras_reserve_bad_pages(tmp_adev);
> -                     }
>               }
>       }
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 02120aa3cb5d..ad67a109122f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -1477,14 +1477,13 @@ static int amdgpu_ras_release_bad_pages(struct 
> amdgpu_device *adev)
>       return 0;
>   }
>   
> -static int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
> +int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
>   {
>       struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
>       struct ras_err_handler_data **data = &con->eh_data;
>       int ret;
>   
> -     *data = kmalloc(sizeof(**data),
> -                     GFP_KERNEL|__GFP_ZERO);
> +     *data = kmalloc(sizeof(**data), GFP_KERNEL | __GFP_ZERO);
>       if (!*data)
>               return -ENOMEM;
>   
> @@ -1495,18 +1494,29 @@ static int amdgpu_ras_recovery_init(struct 
> amdgpu_device *adev)
>   
>       ret = amdgpu_ras_eeprom_init(&adev->psp.ras.ras->eeprom_control);
>       if (ret)
> -             return ret;
> +             goto cancel_work;


Why do you need to jump to cancel_work_sync(&con->recovery_work) at such an
early stage of device init? Is RAS already active by this point, such that a
RAS interrupt might fire and trigger a reset?

Andrey


>   
>       if (adev->psp.ras.ras->eeprom_control.num_recs) {
>               ret = amdgpu_ras_load_bad_pages(adev);
>               if (ret)
> -                     return ret;
> +                     goto cancel_work;
>               ret = amdgpu_ras_reserve_bad_pages(adev);
>               if (ret)
> -                     return ret;
> +                     goto release;
>       }
>   
>       return 0;
> +
> +release:
> +     amdgpu_ras_release_bad_pages(adev);
> +cancel_work:
> +     cancel_work_sync(&con->recovery_work);
> +     con->eh_data = NULL;
> +     kfree((*data)->bps);
> +     kfree((*data)->bps_bo);
> +     kfree(*data);
> +
> +     return ret;
>   }
>   
>   static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
> @@ -1520,6 +1530,7 @@ static int amdgpu_ras_recovery_fini(struct 
> amdgpu_device *adev)
>       mutex_lock(&con->recovery_lock);
>       con->eh_data = NULL;
>       kfree(data->bps);
> +     kfree(data->bps_bo);
>       kfree(data);
>       mutex_unlock(&con->recovery_lock);
>   
> @@ -1611,9 +1622,6 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
>                       return r;
>       }
>   
> -     if (amdgpu_ras_recovery_init(adev))
> -             goto recovery_out;
> -
>       amdgpu_ras_mask &= AMDGPU_RAS_BLOCK_MASK;
>   
>       if (amdgpu_ras_fs_init(adev))
> @@ -1628,8 +1636,6 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
>                       con->hw_supported, con->supported);
>       return 0;
>   fs_out:
> -     amdgpu_ras_recovery_fini(adev);
> -recovery_out:
>       amdgpu_ras_set_context(adev, NULL);
>       kfree(con);
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index 42e1d379e21c..6d00f79b612b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -480,6 +480,7 @@ static inline int amdgpu_ras_is_supported(struct 
> amdgpu_device *adev,
>       return ras && (ras->supported & (1 << block));
>   }
>   
> +int amdgpu_ras_recovery_init(struct amdgpu_device *adev);
>   int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev,
>               unsigned int block);
>   
> @@ -500,6 +501,10 @@ static inline int amdgpu_ras_reset_gpu(struct 
> amdgpu_device *adev,
>   {
>       struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
>   
> +     /* save bad page to eeprom before gpu reset,
> +      * i2c may be unstable in gpu reset
> +      */
> +     amdgpu_ras_reserve_bad_pages(adev);
>       if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0)
>               schedule_work(&ras->recovery_work);
>       return 0;
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Reply via email to