amdgpu: Hook EEPROM table to RAS

Zhou1, Tao Sun, 01 Sep 2019 20:01:03 -0700


> -----Original Message-----
> From: Chen, Guchun <guchun.c...@amd.com>
> Sent: 2019年9月2日 10:11
> To: Zhou1, Tao <tao.zh...@amd.com>; amd-gfx@lists.freedesktop.org;
> Grodzovsky, Andrey <andrey.grodzov...@amd.com>; Li, Dennis
> <dennis...@amd.com>; Zhang, Hawking <hawking.zh...@amd.com>
> Cc: Zhou1, Tao <tao.zh...@amd.com>
> Subject: RE: [PATCH 2/4] drm/amdgpu: Hook EEPROM table to RAS
> 
> 
> 
> -----Original Message-----
> From: amd-gfx <amd-gfx-boun...@lists.freedesktop.org> On Behalf Of Tao
> Zhou
> Sent: Friday, August 30, 2019 8:25 PM
> To: amd-gfx@lists.freedesktop.org; Grodzovsky, Andrey
> <andrey.grodzov...@amd.com>; Chen, Guchun <guchun.c...@amd.com>;
> Li, Dennis <dennis...@amd.com>; Zhang, Hawking
> <hawking.zh...@amd.com>
> Cc: Zhou1, Tao <tao.zh...@amd.com>
> Subject: [PATCH 2/4] drm/amdgpu: Hook EEPROM table to RAS
> 
> Support loading and saving EEPROM records for RAS, and move the saving
> of EEPROM records into bad page reservation.
> 
> Signed-off-by: Tao Zhou <tao.zh...@amd.com>
> Signed-off-by: Andrey Grodzovsky <andrey.grodzov...@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 111 ++++++++++++++++++------
>  1 file changed, 83 insertions(+), 28 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 24663ec41248..02120aa3cb5d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -1348,6 +1348,72 @@ int amdgpu_ras_add_bad_pages(struct
> amdgpu_device *adev,
>       return ret;
>  }
> 
> +/*
> + * write error record array to eeprom, the function should be
> + * protected by recovery_lock
> + */
> +static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev) {
> +     struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> +     struct ras_err_handler_data *data;
> +     struct amdgpu_ras_eeprom_control *control =
> +                                     &adev->psp.ras.ras->eeprom_control;
> +     int save_count;
> +
> +     if (!con || !con->eh_data)
> +             return 0;
> +
> +     data = con->eh_data;
> +     if (!data)
> +             return 0;
> [Guchun] This (!data) check is redundant and not needed, since
> !con->eh_data has already been checked earlier and the whole function is
> protected by recovery_lock.


[Tao] OK, I'll remove it.

> 
> +     save_count = data->count - control->num_recs;
> +     /* only new entries are saved */
> +     if (save_count > 0)
> +             if (amdgpu_ras_eeprom_process_recods(&con-
> >eeprom_control,
> +                                                     &data->bps[control-
> >num_recs],
> +                                                     true,
> +                                                     save_count)) {
> +                     DRM_ERROR("Failed to save EEPROM table data!");
> +                     return -EIO;
> +             }
> +
> +     return 0;
> +}
> +
> +/*
> + * read error record array in eeprom and reserve enough space for
> + * storing new bad pages
> + */
> +static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev) {
> +     struct amdgpu_ras_eeprom_control *control =
> +                                     &adev->psp.ras.ras->eeprom_control;
> +     struct eeprom_table_record *bps = NULL;
> +     int ret = 0;
> +
> +     /* no bad page record, skip eeprom access */
> +     if (!control->num_recs)
> +             return ret;
> +
> +     bps = kcalloc(control->num_recs, sizeof(*bps), GFP_KERNEL);
> +     if (!bps)
> +             return -ENOMEM;
> +
> +     if (amdgpu_ras_eeprom_process_recods(control, bps, false,
> +             control->num_recs)) {
> +             DRM_ERROR("Failed to load EEPROM table records!");
> +             ret = -EIO;
> +             goto out;
> +     }
> +
> +     ret = amdgpu_ras_add_bad_pages(adev, bps, control->num_recs);
> +
> +out:
> +     kfree(bps);
> +     return ret;
> +}
> +
>  /* called in gpu recovery/init */
>  int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev)  { @@ -
> 1355,7 +1421,7 @@ int amdgpu_ras_reserve_bad_pages(struct
> amdgpu_device *adev)
>       struct ras_err_handler_data *data;
>       uint64_t bp;
>       struct amdgpu_bo *bo;
> -     int i;
> +     int i, ret = 0;
> 
>       if (!con || !con->eh_data)
>               return 0;
> @@ -1375,9 +1441,11 @@ int amdgpu_ras_reserve_bad_pages(struct
> amdgpu_device *adev)
>               data->bps_bo[i] = bo;
>               data->last_reserved = i + 1;
>       }
> +
> +     ret = amdgpu_ras_save_bad_pages(adev);
>  out:
>       mutex_unlock(&con->recovery_lock);
> -     return 0;
> +     return ret;
>  }
> 
>  /* called when driver unload */
> @@ -1409,33 +1477,11 @@ static int amdgpu_ras_release_bad_pages(struct
> amdgpu_device *adev)
>       return 0;
>  }
> 
> -static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev) -{
> -     /* TODO
> -      * write the array to eeprom when SMU disabled.
> -      */
> -     return 0;
> -}
> -
> -/*
> - * read error record array in eeprom and reserve enough space for
> - * storing new bad pages
> - */
> -static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev) -{
> -     struct eeprom_table_record *bps = NULL;
> -     int ret;
> -
> -     ret = amdgpu_ras_add_bad_pages(adev, bps,
> -                             adev->umc.max_ras_err_cnt_per_query);
> -
> -     return ret;
> -}
> -
>  static int amdgpu_ras_recovery_init(struct amdgpu_device *adev)  {
>       struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
>       struct ras_err_handler_data **data = &con->eh_data;
> +     int ret;
> 
>       *data = kmalloc(sizeof(**data),
>                       GFP_KERNEL|__GFP_ZERO);
> @@ -1447,8 +1493,18 @@ static int amdgpu_ras_recovery_init(struct
> amdgpu_device *adev)
>       atomic_set(&con->in_recovery, 0);
>       con->adev = adev;
> 
> -     amdgpu_ras_load_bad_pages(adev);
> -     amdgpu_ras_reserve_bad_pages(adev);
> +     ret = amdgpu_ras_eeprom_init(&adev->psp.ras.ras-
> >eeprom_control);
> +     if (ret)
> +             return ret;
> +
> +     if (adev->psp.ras.ras->eeprom_control.num_recs) {
> +             ret = amdgpu_ras_load_bad_pages(adev);
> +             if (ret)
> +                     return ret;
> +             ret = amdgpu_ras_reserve_bad_pages(adev);
> +             if (ret)
> +                     return ret;
> +     }
> 
>       return 0;
>  }
> @@ -1459,7 +1515,6 @@ static int amdgpu_ras_recovery_fini(struct
> amdgpu_device *adev)
>       struct ras_err_handler_data *data = con->eh_data;
> 
>       cancel_work_sync(&con->recovery_work);
> -     amdgpu_ras_save_bad_pages(adev);
>       amdgpu_ras_release_bad_pages(adev);
> 
>       mutex_lock(&con->recovery_lock);
> --
> 2.17.1
> 
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

RE: [PATCH 2/4] drm/amdgpu: Hook EEPROM table to RAS

Reply via email to