RE: [PATCH] drm/amdgpu: allow query error counters for specific IP block

Zhou1, Tao Tue, 03 Jan 2023 19:15:42 -0800

[AMD Official Use Only - General]

Reviewed-by: Tao Zhou <tao.zh...@amd.com>


> -----Original Message-----
> From: Zhang, Hawking <hawking.zh...@amd.com>
> Sent: Wednesday, January 4, 2023 12:25 AM
> To: amd-gfx@lists.freedesktop.org; Zhou1, Tao <tao.zh...@amd.com>; Yang,
> Stanley <stanley.y...@amd.com>; Li, Candice <candice...@amd.com>; Chai,
> Thomas <yipeng.c...@amd.com>
> Cc: Zhang, Hawking <hawking.zh...@amd.com>
> Subject: [PATCH] drm/amdgpu: allow query error counters for specific IP block
> 
> amdgpu_ras_block_late_init will be invoked in IP specific ras_late_init call 
> as a
> common helper for all the IP blocks.
> 
> However, when amdgpu_ras_block_late_init call
> amdgpu_ras_query_error_count to query ras error counters,
> amdgpu_ras_query_error_count queries all the IP blocks that support ras query
> interface.
> 
> This results to wrong error counters cached in software copies when there are
> ras errors detected at time zero or warm reset procedure. i.e., in
> sdma_ras_late_init phase, it counts on sdma/mmhub errors, while, in
> mmhub_ras_late_init phase, it still counts on sdma/mmhub errors.
> 
> The change updates amdgpu_ras_query_error_count interface to allow query
> specific ip error counter.
> It introduces a new input parameter: query_info. if query_info is NULL,  it 
> means
> query all the IP blocks, otherwise, only query the ip block specified by 
> query_info.
> 
> Signed-off-by: Hawking Zhang <hawking.zh...@amd.com>
> Reviewed-by: Tao Zhou <tao.zh...@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 89 +++++++++++++++++++------
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  3 +-
>  2 files changed, 71 insertions(+), 21 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 35b9f2ed2838..7fed63dc09bf 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -1130,11 +1130,54 @@ int amdgpu_ras_error_inject(struct amdgpu_device
> *adev,  }
> 
>  /**
> - * amdgpu_ras_query_error_count -- Get error counts of all IPs
> + * amdgpu_ras_query_error_count_helper -- Get error counter for
> +specific IP
> + * @adev: pointer to AMD GPU device
> + * @ce_count: pointer to an integer to be set to the count of correctible 
> errors.
> + * @ue_count: pointer to an integer to be set to the count of uncorrectible
> errors.
> + * @query_info: pointer to ras_query_if
> + *
> + * Return 0 for query success or do nothing, otherwise return an error
> + * on failures
> + */
> +static int amdgpu_ras_query_error_count_helper(struct amdgpu_device *adev,
> +                                            unsigned long *ce_count,
> +                                            unsigned long *ue_count,
> +                                            struct ras_query_if *query_info) 
> {
> +     int ret;
> +
> +     if (!query_info)
> +             /* do nothing if query_info is not specified */
> +             return 0;
> +
> +     ret = amdgpu_ras_query_error_status(adev, query_info);
> +     if (ret)
> +             return ret;
> +
> +     *ce_count += query_info->ce_count;
> +     *ue_count += query_info->ue_count;
> +
> +     /* some hardware/IP supports read to clear
> +      * no need to explictly reset the err status after the query call */
> +     if (adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) &&
> +         adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) {
> +             if (amdgpu_ras_reset_error_status(adev, query_info-
> >head.block))
> +                     dev_warn(adev->dev,
> +                              "Failed to reset error counter and error
> status\n");
> +     }
> +
> +     return 0;
> +}
> +
> +/**
> + * amdgpu_ras_query_error_count -- Get error counts of all IPs or
> +specific IP
>   * @adev: pointer to AMD GPU device
>   * @ce_count: pointer to an integer to be set to the count of correctible 
> errors.
>   * @ue_count: pointer to an integer to be set to the count of uncorrectible
>   * errors.
> + * @query_info: pointer to ras_query_if if the query request is only
> + for
> + * specific ip block; if info is NULL, then the qurey request is for
> + * all the ip blocks that support query ras error counters/status
>   *
>   * If set, @ce_count or @ue_count, count and return the corresponding
>   * error counts in those integer pointers. Return 0 if the device @@ -1142,11
> +1185,13 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
>   */
>  int amdgpu_ras_query_error_count(struct amdgpu_device *adev,
>                                unsigned long *ce_count,
> -                              unsigned long *ue_count)
> +                              unsigned long *ue_count,
> +                              struct ras_query_if *query_info)
>  {
>       struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
>       struct ras_manager *obj;
>       unsigned long ce, ue;
> +     int ret;
> 
>       if (!adev->ras_enabled || !con)
>               return -EOPNOTSUPP;
> @@ -1158,26 +1203,23 @@ int amdgpu_ras_query_error_count(struct
> amdgpu_device *adev,
> 
>       ce = 0;
>       ue = 0;
> -     list_for_each_entry(obj, &con->head, node) {
> -             struct ras_query_if info = {
> -                     .head = obj->head,
> -             };
> -             int res;
> -
> -             res = amdgpu_ras_query_error_status(adev, &info);
> -             if (res)
> -                     return res;
> +     if (!query_info) {
> +             /* query all the ip blocks that support ras query interface */
> +             list_for_each_entry(obj, &con->head, node) {
> +                     struct ras_query_if info = {
> +                             .head = obj->head,
> +                     };
> 
> -             if (adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2)
> &&
> -                 adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) {
> -                     if (amdgpu_ras_reset_error_status(adev,
> info.head.block))
> -                             dev_warn(adev->dev, "Failed to reset error
> counter and error status");
> +                     ret = amdgpu_ras_query_error_count_helper(adev, &ce,
> &ue, &info);
>               }
> -
> -             ce += info.ce_count;
> -             ue += info.ue_count;
> +     } else {
> +             /* query specific ip block */
> +             ret = amdgpu_ras_query_error_count_helper(adev, &ce, &ue,
> +query_info);
>       }
> 
> +     if (ret)
> +             return ret;
> +
>       if (ce_count)
>               *ce_count = ce;
> 
> @@ -2408,7 +2450,7 @@ static void amdgpu_ras_counte_dw(struct
> work_struct *work)
> 
>       /* Cache new values.
>        */
> -     if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count) == 0)
> {
> +     if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count, NULL)
> ==
> +0) {
>               atomic_set(&con->ras_ce_count, ce_count);
>               atomic_set(&con->ras_ue_count, ue_count);
>       }
> @@ -2589,6 +2631,7 @@ int amdgpu_ras_block_late_init(struct
> amdgpu_device *adev,  {
>       struct amdgpu_ras_block_object *ras_obj = NULL;
>       struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> +     struct ras_query_if *query_info;
>       unsigned long ue_count, ce_count;
>       int r;
> 
> @@ -2630,11 +2673,17 @@ int amdgpu_ras_block_late_init(struct
> amdgpu_device *adev,
> 
>       /* Those are the cached values at init.
>        */
> -     if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count) == 0)
> {
> +     query_info = kzalloc(sizeof(struct ras_query_if), GFP_KERNEL);
> +     if (!query_info)
> +             return -ENOMEM;
> +     memcpy(&query_info->head, ras_block, sizeof(struct ras_common_if));
> +
> +     if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count,
> +query_info) == 0) {
>               atomic_set(&con->ras_ce_count, ce_count);
>               atomic_set(&con->ras_ue_count, ue_count);
>       }
> 
> +     kfree(query_info);
>       return 0;
> 
>  interrupt:
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index bf5a95104ec1..f2ad999993f6 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -540,7 +540,8 @@ void amdgpu_ras_suspend(struct amdgpu_device
> *adev);
> 
>  int amdgpu_ras_query_error_count(struct amdgpu_device *adev,
>                                unsigned long *ce_count,
> -                              unsigned long *ue_count);
> +                              unsigned long *ue_count,
> +                              struct ras_query_if *query_info);
> 
>  /* error handling functions */
>  int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
> --
> 2.17.1

RE: [PATCH] drm/amdgpu: allow query error counters for specific IP block

Reply via email to