[AMD Official Use Only]


> -----Original Message-----
> From: Yang, Stanley <stanley.y...@amd.com>
> Sent: Tuesday, December 21, 2021 2:05 PM
> To: Zhou1, Tao <tao.zh...@amd.com>; amd-gfx@lists.freedesktop.org; Zhang,
> Hawking <hawking.zh...@amd.com>; Chai, Thomas <yipeng.c...@amd.com>
> Subject: 回复: [PATCH] drm/amdgpu: save error count in RAS poison handler
> 
> [AMD Official Use Only]
> 
> > +void amdgpu_umc_ras_fini(struct amdgpu_device *adev) {
> > +   if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC)
> > &&
> > +                   adev->umc.ras_if) {
> > +           struct ras_common_if *ras_if = adev->umc.ras_if;
> > +           struct ras_ih_if ih_info = {
> > +                   .head = *ras_if,
> > +                   .cb = amdgpu_umc_process_ras_data_cb,
> > +           };
> > +
> > +           amdgpu_ras_late_fini(adev, ras_if, &ih_info);
> > +           kfree(ras_if);
> > +   }
> > +}
> > +
> > +
> > +
> [Yang, Stanley] It would be better to remove the extra blank lines.
> >  int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,
> >             struct amdgpu_irq_src *source,
> >             struct amdgpu_iv_entry *entry)
> 
> Other than the above, the patch is Reviewed-by: Stanley.Yang <stanley.y...@amd.com>
> 
> > -----邮件原件-----
> > 发件人: Zhou1, Tao <tao.zh...@amd.com>
> > 发送时间: Monday, December 20, 2021 4:51 PM
> > 收件人: amd-gfx@lists.freedesktop.org; Zhang, Hawking
> > <hawking.zh...@amd.com>; Yang, Stanley <stanley.y...@amd.com>; Chai,
> > Thomas <yipeng.c...@amd.com>
> > 抄送: Zhou1, Tao <tao.zh...@amd.com>
> > 主题: [PATCH] drm/amdgpu: save error count in RAS poison handler
> >
> > Otherwise the RAS error count couldn't be queried from sysfs.
> >
> > Signed-off-by: Tao Zhou <tao.zh...@amd.com>
> > ---
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c |   2 +-
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c    | 170 ++++++++++++------
> > ---
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h    |   3 +-
> >  3 files changed, 99 insertions(+), 76 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> > index 0bf09a94d944..776a947b45df 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> > @@ -727,7 +727,7 @@ void
> > amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device
> > *adev, bo
> >
> >     /* CPU MCA will handle page retirement if connected_to_cpu is 1 */
> >     if (!adev->gmc.xgmi.connected_to_cpu)
> > -           amdgpu_umc_do_page_retirement(adev, &err_data, NULL,
> > reset);
> > +           amdgpu_umc_poison_handler(adev, &err_data, reset);
> >     else if (reset)
> >             amdgpu_amdkfd_gpu_reset(adev);
> >  }
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> > index 0c33f367a4e5..1c2dbd00f647 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> > @@ -23,79 +23,7 @@
> >
> >  #include "amdgpu_ras.h"
> >
> > -static int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
> > -           void *ras_error_status,
> > -           struct amdgpu_iv_entry *entry)
> > -{
> > -   return amdgpu_umc_do_page_retirement(adev, ras_error_status,
> > entry, true);
> > -}
> > -
> > -int amdgpu_umc_ras_late_init(struct amdgpu_device *adev) -{
> > -   int r;
> > -   struct ras_fs_if fs_info = {
> > -           .sysfs_name = "umc_err_count",
> > -   };
> > -   struct ras_ih_if ih_info = {
> > -           .cb = amdgpu_umc_process_ras_data_cb,
> > -   };
> > -
> > -   if (!adev->umc.ras_if) {
> > -           adev->umc.ras_if =
> > -                   kmalloc(sizeof(struct ras_common_if), GFP_KERNEL);
> > -           if (!adev->umc.ras_if)
> > -                   return -ENOMEM;
> > -           adev->umc.ras_if->block = AMDGPU_RAS_BLOCK__UMC;
> > -           adev->umc.ras_if->type =
> > AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
> > -           adev->umc.ras_if->sub_block_index = 0;
> > -   }
> > -   ih_info.head = fs_info.head = *adev->umc.ras_if;
> > -
> > -   r = amdgpu_ras_late_init(adev, adev->umc.ras_if,
> > -                            &fs_info, &ih_info);
> > -   if (r)
> > -           goto free;
> > -
> > -   if (amdgpu_ras_is_supported(adev, adev->umc.ras_if->block)) {
> > -           r = amdgpu_irq_get(adev, &adev->gmc.ecc_irq, 0);
> > -           if (r)
> > -                   goto late_fini;
> > -   } else {
> > -           r = 0;
> > -           goto free;
> > -   }
> > -
> > -   /* ras init of specific umc version */
> > -   if (adev->umc.ras_funcs &&
> > -       adev->umc.ras_funcs->err_cnt_init)
> > -           adev->umc.ras_funcs->err_cnt_init(adev);
> > -
> > -   return 0;
> > -
> > -late_fini:
> > -   amdgpu_ras_late_fini(adev, adev->umc.ras_if, &ih_info);
> > -free:
> > -   kfree(adev->umc.ras_if);
> > -   adev->umc.ras_if = NULL;
> > -   return r;
> > -}
> > -
> > -void amdgpu_umc_ras_fini(struct amdgpu_device *adev) -{
> > -   if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC)
> > &&
> > -                   adev->umc.ras_if) {
> > -           struct ras_common_if *ras_if = adev->umc.ras_if;
> > -           struct ras_ih_if ih_info = {
> > -                   .head = *ras_if,
> > -                   .cb = amdgpu_umc_process_ras_data_cb,
> > -           };
> > -
> > -           amdgpu_ras_late_fini(adev, ras_if, &ih_info);
> > -           kfree(ras_if);
> > -   }
> > -}
> > -
> > -int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
> > +static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
> >             void *ras_error_status,
> >             struct amdgpu_iv_entry *entry,
> >             bool reset)
> > @@ -180,6 +108,102 @@ int amdgpu_umc_do_page_retirement(struct
> > amdgpu_device *adev,
> >     return AMDGPU_RAS_SUCCESS;
> >  }
> >
> > +int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
> > +           void *ras_error_status,
> > +           bool reset)
> > +{
> > +   int ret;
> > +   struct ras_err_data *err_data = (struct ras_err_data
> > *)ras_error_status;
> > +   struct ras_common_if head = {
> > +           .block = AMDGPU_RAS_BLOCK__UMC,
> > +   };
> > +   struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head);
> > +
> > +   ret =
> > +           amdgpu_umc_do_page_retirement(adev, ras_error_status,
> > NULL, reset);
> > +
> > +   if (ret == AMDGPU_RAS_SUCCESS && obj) {
> > +           obj->err_data.ue_count += err_data->ue_count;
> > +           obj->err_data.ce_count += err_data->ce_count;
> > +   }
> > +
> > +   return ret;
> > +}
> > +
> > +static int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
> > +           void *ras_error_status,
> > +           struct amdgpu_iv_entry *entry)
> > +{
> > +   return amdgpu_umc_do_page_retirement(adev, ras_error_status,
> > entry,
> > +true); }
> > +
> > +int amdgpu_umc_ras_late_init(struct amdgpu_device *adev) {
> > +   int r;
> > +   struct ras_fs_if fs_info = {
> > +           .sysfs_name = "umc_err_count",
> > +   };
> > +   struct ras_ih_if ih_info = {
> > +           .cb = amdgpu_umc_process_ras_data_cb,
> > +   };
> > +
> > +   if (!adev->umc.ras_if) {
> > +           adev->umc.ras_if =
> > +                   kmalloc(sizeof(struct ras_common_if), GFP_KERNEL);
> > +           if (!adev->umc.ras_if)
> > +                   return -ENOMEM;
> > +           adev->umc.ras_if->block = AMDGPU_RAS_BLOCK__UMC;
> > +           adev->umc.ras_if->type =
> > AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
> > +           adev->umc.ras_if->sub_block_index = 0;
> > +   }
> > +   ih_info.head = fs_info.head = *adev->umc.ras_if;
> > +
> > +   r = amdgpu_ras_late_init(adev, adev->umc.ras_if,
> > +                            &fs_info, &ih_info);
> > +   if (r)
> > +           goto free;
> > +
> > +   if (amdgpu_ras_is_supported(adev, adev->umc.ras_if->block)) {
> > +           r = amdgpu_irq_get(adev, &adev->gmc.ecc_irq, 0);
> > +           if (r)
> > +                   goto late_fini;
> > +   } else {
> > +           r = 0;
> > +           goto free;
> > +   }
> > +
> > +   /* ras init of specific umc version */
> > +   if (adev->umc.ras_funcs &&
> > +       adev->umc.ras_funcs->err_cnt_init)
> > +           adev->umc.ras_funcs->err_cnt_init(adev);
> > +
> > +   return 0;
> > +
> > +late_fini:
> > +   amdgpu_ras_late_fini(adev, adev->umc.ras_if, &ih_info);
> > +free:
> > +   kfree(adev->umc.ras_if);
> > +   adev->umc.ras_if = NULL;
> > +   return r;
> > +}
> > +
> > +void amdgpu_umc_ras_fini(struct amdgpu_device *adev) {
> > +   if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC)
> > &&
> > +                   adev->umc.ras_if) {
> > +           struct ras_common_if *ras_if = adev->umc.ras_if;
> > +           struct ras_ih_if ih_info = {
> > +                   .head = *ras_if,
> > +                   .cb = amdgpu_umc_process_ras_data_cb,
> > +           };
> > +
> > +           amdgpu_ras_late_fini(adev, ras_if, &ih_info);
> > +           kfree(ras_if);
> > +   }
> > +}
> > +
> > +
> > +
> [Yang, Stanley] It would be better to remove the extra blank lines.

[Tao] Thanks for the reminder; I'll remove them before pushing.

> 
> >  int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,
> >             struct amdgpu_irq_src *source,
> >             struct amdgpu_iv_entry *entry)
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
> > index 8d18d5121f66..b72194e8bfe5 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
> > @@ -78,9 +78,8 @@ struct amdgpu_umc {
> >
> >  int amdgpu_umc_ras_late_init(struct amdgpu_device *adev);  void
> > amdgpu_umc_ras_fini(struct amdgpu_device *adev); -int
> > amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
> > +int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
> >             void *ras_error_status,
> > -           struct amdgpu_iv_entry *entry,
> >             bool reset);
> >  int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,
> >             struct amdgpu_irq_src *source,
> > --
> > 2.17.1

Reply via email to