If a UMC bad page is reserved but not freed by an application, the
application may trigger uncorrectable error repeatly by accessing the page.

v2: add specific function to do the check.

Signed-off-by: Tao Zhou <tao.zh...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 24 ++++++++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  2 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c |  4 ++++
 3 files changed, 30 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 6e543558386d..5214034e1b16 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2115,6 +2115,30 @@ int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev)
        return 0;
 }
 
+/* Return false if all pages have been reserved before, no new bad page
+ * is found, otherwise return true.
+ * Note: the function should be called between amdgpu_ras_add_bad_pages
+ * and amdgpu_ras_save_bad_pages.
+ */
+bool amdgpu_ras_new_bad_page_is_added(struct amdgpu_device *adev)
+{
+       struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+       struct ras_err_handler_data *data;
+       struct amdgpu_ras_eeprom_control *control;
+       int save_count;
+
+       if (!con || !con->eh_data)
+               return false;
+
+       mutex_lock(&con->recovery_lock);
+       control = &con->eeprom_control;
+       data = con->eh_data;
+       save_count = data->count - control->ras_num_recs;
+       mutex_unlock(&con->recovery_lock);
+
+       return (save_count ? true : false);
+}
+
 /*
  * read error record array in eeprom and reserve enough space for
  * storing new bad pages
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index f2ad999993f6..606b75c36848 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -549,6 +549,8 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
 
 int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev);
 
+bool amdgpu_ras_new_bad_page_is_added(struct amdgpu_device *adev);
+
 static inline enum ta_ras_block
 amdgpu_ras_block_to_ta(enum amdgpu_ras_block block) {
        switch (block) {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index 1c7fcb4f2380..1146e65c22be 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -147,6 +147,10 @@ static int amdgpu_umc_do_page_retirement(struct 
amdgpu_device *adev,
                        err_data->err_addr_cnt) {
                        amdgpu_ras_add_bad_pages(adev, err_data->err_addr,
                                                err_data->err_addr_cnt);
+                       /* if no new bad page is found, no need to increase ue 
count */
+                       if (!amdgpu_ras_new_bad_page_is_added(adev))
+                               err_data->ue_count = 0;
+
                        amdgpu_ras_save_bad_pages(adev);
 
                        amdgpu_dpm_send_hbm_bad_pages_num(adev, 
con->eeprom_control.ras_num_recs);
-- 
2.35.1

Reply via email to