If a UMC bad page is reserved but not freed by an application, the
application may trigger uncorrectable error repeatly by accessing the page.

v2: add specific function to do the check.
v3: remove duplicate pages, calculate new added bad page number.
v4: reuse save_bad_pages to calculate new added bad page number.

Signed-off-by: Tao Zhou <tao.zh...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 16 +++++++++++++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  3 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c |  5 +++--
 3 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 6e543558386d..5c02c6c9f773 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -176,7 +176,7 @@ static int amdgpu_reserve_page_direct(struct amdgpu_device 
*adev, uint64_t addre
        if (amdgpu_bad_page_threshold != 0) {
                amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
                                         err_data.err_addr_cnt);
-               amdgpu_ras_save_bad_pages(adev);
+               amdgpu_ras_save_bad_pages(adev, NULL);
        }
 
        dev_warn(adev->dev, "WARNING: THIS IS ONLY FOR TEST PURPOSES AND WILL 
CORRUPT RAS EEPROM\n");
@@ -2084,22 +2084,32 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
 /*
  * write error record array to eeprom, the function should be
  * protected by recovery_lock
+ * new_cnt: new added UE count, excluding reserved bad pages, can be NULL
  */
-int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev)
+int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev,
+               unsigned long *new_cnt)
 {
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
        struct ras_err_handler_data *data;
        struct amdgpu_ras_eeprom_control *control;
        int save_count;
 
-       if (!con || !con->eh_data)
+       if (!con || !con->eh_data) {
+               if (new_cnt)
+                       *new_cnt = 0;
+
                return 0;
+       }
 
        mutex_lock(&con->recovery_lock);
        control = &con->eeprom_control;
        data = con->eh_data;
        save_count = data->count - control->ras_num_recs;
        mutex_unlock(&con->recovery_lock);
+
+       if (new_cnt)
+               *new_cnt = save_count / adev->umc.retire_unit;
+
        /* only new entries are saved */
        if (save_count > 0) {
                if (amdgpu_ras_eeprom_append(control,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index f2ad999993f6..ef38f4c93df0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -547,7 +547,8 @@ int amdgpu_ras_query_error_count(struct amdgpu_device *adev,
 int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
                struct eeprom_table_record *bps, int pages);
 
-int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev);
+int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev,
+               unsigned long *new_cnt);
 
 static inline enum ta_ras_block
 amdgpu_ras_block_to_ta(enum amdgpu_ras_block block) {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index 1c7fcb4f2380..7c6fc3214339 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -68,7 +68,7 @@ int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev,
        if (amdgpu_bad_page_threshold != 0) {
                amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
                                                err_data.err_addr_cnt);
-               amdgpu_ras_save_bad_pages(adev);
+               amdgpu_ras_save_bad_pages(adev, NULL);
        }
 
 out:
@@ -147,7 +147,8 @@ static int amdgpu_umc_do_page_retirement(struct 
amdgpu_device *adev,
                        err_data->err_addr_cnt) {
                        amdgpu_ras_add_bad_pages(adev, err_data->err_addr,
                                                err_data->err_addr_cnt);
-                       amdgpu_ras_save_bad_pages(adev);
+
+                       amdgpu_ras_save_bad_pages(adev, &(err_data->ue_count));
 
                        amdgpu_dpm_send_hbm_bad_pages_num(adev, 
con->eeprom_control.ras_num_recs);
 
-- 
2.35.1

Reply via email to