When EEPROM initialization fails, still support RAS error injection and
reserve bad pages, but do not save bad pages to the EEPROM.

Signed-off-by: ganglxie <gangl...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c       | 22 ++++++++++++++-----
 .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h    |  2 ++
 2 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 514b56e5d8ba..d24567787f9e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -3009,6 +3009,15 @@ int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev,
                return 0;
        }
 
+       if (!con->eeprom_control.is_eeprom_valid) {
+               dev_err(adev->dev,
+                       "Failed to save EEPROM table data because of EEPROM 
data corruption!");
+               if (new_cnt)
+                       *new_cnt = 0;
+
+               return 0;
+       }
+
        mutex_lock(&con->recovery_lock);
        control = &con->eeprom_control;
        data = con->eh_data;
@@ -3502,8 +3511,7 @@ int amdgpu_ras_init_badpage_info(struct amdgpu_device 
*adev)
 
        control = &con->eeprom_control;
        ret = amdgpu_ras_eeprom_init(control);
-       if (ret)
-               return ret;
+       control->is_eeprom_valid = !ret;
 
        if (!adev->umc.ras || !adev->umc.ras->convert_ras_err_addr)
                control->ras_num_pa_recs = control->ras_num_recs;
@@ -3512,10 +3520,12 @@ int amdgpu_ras_init_badpage_info(struct amdgpu_device 
*adev)
            adev->umc.ras->get_retire_flip_bits)
                adev->umc.ras->get_retire_flip_bits(adev);
 
-       if (control->ras_num_recs) {
+       if (control->ras_num_recs && control->is_eeprom_valid) {
                ret = amdgpu_ras_load_bad_pages(adev);
-               if (ret)
-                       return ret;
+               if (ret) {
+                       control->is_eeprom_valid = false;
+                       return 0;
+               }
 
                amdgpu_dpm_send_hbm_bad_pages_num(
                        adev, control->ras_num_bad_pages);
@@ -3534,7 +3544,7 @@ int amdgpu_ras_init_badpage_info(struct amdgpu_device 
*adev)
                                        dev_warn(adev->dev, "Failed to format 
RAS EEPROM data in V3 version!\n");
        }
 
-       return ret;
+       return 0;
 }
 
 int amdgpu_ras_recovery_init(struct amdgpu_device *adev, bool init_bp_info)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
index ec6d7ea37ad0..35c69ac3dbeb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
@@ -114,6 +114,8 @@ struct amdgpu_ras_eeprom_control {
        /* Record channel info which occurred bad pages
         */
        u32 bad_channel_bitmap;
+
+       bool is_eeprom_valid;
 };
 
 /*
-- 
2.34.1

Reply via email to