Otherwise, the RAS error count cannot be queried from sysfs.

Signed-off-by: Tao Zhou <tao.zh...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c |   2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c    | 170 ++++++++++++---------
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h    |   3 +-
 3 files changed, 99 insertions(+), 76 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 0bf09a94d944..776a947b45df 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -727,7 +727,7 @@ void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev, bo
 
        /* CPU MCA will handle page retirement if connected_to_cpu is 1 */
        if (!adev->gmc.xgmi.connected_to_cpu)
-               amdgpu_umc_do_page_retirement(adev, &err_data, NULL, reset);
+               amdgpu_umc_poison_handler(adev, &err_data, reset);
        else if (reset)
                amdgpu_amdkfd_gpu_reset(adev);
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index 0c33f367a4e5..1c2dbd00f647 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -23,79 +23,7 @@
 
 #include "amdgpu_ras.h"
 
-static int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
-               void *ras_error_status,
-               struct amdgpu_iv_entry *entry)
-{
-       return amdgpu_umc_do_page_retirement(adev, ras_error_status, entry, true);
-}
-
-int amdgpu_umc_ras_late_init(struct amdgpu_device *adev)
-{
-       int r;
-       struct ras_fs_if fs_info = {
-               .sysfs_name = "umc_err_count",
-       };
-       struct ras_ih_if ih_info = {
-               .cb = amdgpu_umc_process_ras_data_cb,
-       };
-
-       if (!adev->umc.ras_if) {
-               adev->umc.ras_if =
-                       kmalloc(sizeof(struct ras_common_if), GFP_KERNEL);
-               if (!adev->umc.ras_if)
-                       return -ENOMEM;
-               adev->umc.ras_if->block = AMDGPU_RAS_BLOCK__UMC;
-               adev->umc.ras_if->type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
-               adev->umc.ras_if->sub_block_index = 0;
-       }
-       ih_info.head = fs_info.head = *adev->umc.ras_if;
-
-       r = amdgpu_ras_late_init(adev, adev->umc.ras_if,
-                                &fs_info, &ih_info);
-       if (r)
-               goto free;
-
-       if (amdgpu_ras_is_supported(adev, adev->umc.ras_if->block)) {
-               r = amdgpu_irq_get(adev, &adev->gmc.ecc_irq, 0);
-               if (r)
-                       goto late_fini;
-       } else {
-               r = 0;
-               goto free;
-       }
-
-       /* ras init of specific umc version */
-       if (adev->umc.ras_funcs &&
-           adev->umc.ras_funcs->err_cnt_init)
-               adev->umc.ras_funcs->err_cnt_init(adev);
-
-       return 0;
-
-late_fini:
-       amdgpu_ras_late_fini(adev, adev->umc.ras_if, &ih_info);
-free:
-       kfree(adev->umc.ras_if);
-       adev->umc.ras_if = NULL;
-       return r;
-}
-
-void amdgpu_umc_ras_fini(struct amdgpu_device *adev)
-{
-       if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC) &&
-                       adev->umc.ras_if) {
-               struct ras_common_if *ras_if = adev->umc.ras_if;
-               struct ras_ih_if ih_info = {
-                       .head = *ras_if,
-                       .cb = amdgpu_umc_process_ras_data_cb,
-               };
-
-               amdgpu_ras_late_fini(adev, ras_if, &ih_info);
-               kfree(ras_if);
-       }
-}
-
-int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
+static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
                void *ras_error_status,
                struct amdgpu_iv_entry *entry,
                bool reset)
@@ -180,6 +108,102 @@ int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
        return AMDGPU_RAS_SUCCESS;
 }
 
+int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
+               void *ras_error_status,
+               bool reset)
+{
+       int ret;
+       struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
+       struct ras_common_if head = {
+               .block = AMDGPU_RAS_BLOCK__UMC,
+       };
+       struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head);
+
+       ret =
+               amdgpu_umc_do_page_retirement(adev, ras_error_status, NULL, reset);
+
+       if (ret == AMDGPU_RAS_SUCCESS && obj) {
+               obj->err_data.ue_count += err_data->ue_count;
+               obj->err_data.ce_count += err_data->ce_count;
+       }
+
+       return ret;
+}
+
+static int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
+               void *ras_error_status,
+               struct amdgpu_iv_entry *entry)
+{
+       return amdgpu_umc_do_page_retirement(adev, ras_error_status, entry, true);
+}
+
+int amdgpu_umc_ras_late_init(struct amdgpu_device *adev)
+{
+       int r;
+       struct ras_fs_if fs_info = {
+               .sysfs_name = "umc_err_count",
+       };
+       struct ras_ih_if ih_info = {
+               .cb = amdgpu_umc_process_ras_data_cb,
+       };
+
+       if (!adev->umc.ras_if) {
+               adev->umc.ras_if =
+                       kmalloc(sizeof(struct ras_common_if), GFP_KERNEL);
+               if (!adev->umc.ras_if)
+                       return -ENOMEM;
+               adev->umc.ras_if->block = AMDGPU_RAS_BLOCK__UMC;
+               adev->umc.ras_if->type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
+               adev->umc.ras_if->sub_block_index = 0;
+       }
+       ih_info.head = fs_info.head = *adev->umc.ras_if;
+
+       r = amdgpu_ras_late_init(adev, adev->umc.ras_if,
+                                &fs_info, &ih_info);
+       if (r)
+               goto free;
+
+       if (amdgpu_ras_is_supported(adev, adev->umc.ras_if->block)) {
+               r = amdgpu_irq_get(adev, &adev->gmc.ecc_irq, 0);
+               if (r)
+                       goto late_fini;
+       } else {
+               r = 0;
+               goto free;
+       }
+
+       /* ras init of specific umc version */
+       if (adev->umc.ras_funcs &&
+           adev->umc.ras_funcs->err_cnt_init)
+               adev->umc.ras_funcs->err_cnt_init(adev);
+
+       return 0;
+
+late_fini:
+       amdgpu_ras_late_fini(adev, adev->umc.ras_if, &ih_info);
+free:
+       kfree(adev->umc.ras_if);
+       adev->umc.ras_if = NULL;
+       return r;
+}
+
+void amdgpu_umc_ras_fini(struct amdgpu_device *adev)
+{
+       if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC) &&
+                       adev->umc.ras_if) {
+               struct ras_common_if *ras_if = adev->umc.ras_if;
+               struct ras_ih_if ih_info = {
+                       .head = *ras_if,
+                       .cb = amdgpu_umc_process_ras_data_cb,
+               };
+
+               amdgpu_ras_late_fini(adev, ras_if, &ih_info);
+               kfree(ras_if);
+       }
+}
+
+
+
 int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,
                struct amdgpu_irq_src *source,
                struct amdgpu_iv_entry *entry)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
index 8d18d5121f66..b72194e8bfe5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
@@ -78,9 +78,8 @@ struct amdgpu_umc {
 
 int amdgpu_umc_ras_late_init(struct amdgpu_device *adev);
 void amdgpu_umc_ras_fini(struct amdgpu_device *adev);
-int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
+int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
                void *ras_error_status,
-               struct amdgpu_iv_entry *entry,
                bool reset);
 int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,
                struct amdgpu_irq_src *source,
-- 
2.17.1

Reply via email to