there is only MCA records in V3, no need to care about PA records.
recalculate the value of ras_num_bad_pages when parsing failed and
go on with the left records instead of quit.

Signed-off-by: ganglxie <gangl...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 61 +++++++++++++++----------
 1 file changed, 36 insertions(+), 25 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index e85143acf3a2..32f41f151c82 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2889,6 +2889,7 @@ static int __amdgpu_ras_convert_rec_from_rom(struct 
amdgpu_device *adev,
                if (amdgpu_ras_mca2pa_by_idx(adev, bps, err_data))
                        return -EINVAL;
        }
+
        return __amdgpu_ras_restore_bad_pages(adev, err_data->err_addr,
                                                                        
adev->umc.retire_unit);
 }
@@ -2903,7 +2904,7 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
                        &adev->psp.ras_context.ras->eeprom_control;
        enum amdgpu_memory_partition nps = AMDGPU_NPS1_PARTITION_MODE;
        int ret = 0;
-       uint32_t i;
+       uint32_t i = 0;
 
        if (!con || !con->eh_data || !bps || pages <= 0)
                return 0;
@@ -2924,28 +2925,31 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
        mutex_lock(&con->recovery_lock);
 
        if (from_rom) {
-               for (i = 0; i < pages; i++) {
-                       if (control->ras_num_recs - i >= adev->umc.retire_unit) 
{
-                               if ((bps[i].address == bps[i + 1].address) &&
-                                   (bps[i].mem_channel == bps[i + 
1].mem_channel)) {
-                                       //deal with retire_unit records a time
-                                       ret = 
__amdgpu_ras_convert_rec_array_from_rom(adev,
-                                                                       
&bps[i], &err_data, nps);
-                                       if (ret)
-                                               goto free;
-                                       i += (adev->umc.retire_unit - 1);
+               /* there is no pa recs in V3, so skip pa recs processing */
+               if (control->tbl_hdr.version < RAS_TABLE_VER_V3) {
+                       for (i = 0; i < pages; i++) {
+                               if (control->ras_num_recs - i >= 
adev->umc.retire_unit) {
+                                       if ((bps[i].address == bps[i + 
1].address) &&
+                                               (bps[i].mem_channel == bps[i + 
1].mem_channel)) {
+                                               /* deal with retire_unit 
records a time */
+                                               ret = 
__amdgpu_ras_convert_rec_array_from_rom(adev,
+                                                                               
&bps[i], &err_data, nps);
+                                               if (ret)
+                                                       
control->ras_num_bad_pages -= adev->umc.retire_unit;
+                                               i += (adev->umc.retire_unit - 
1);
+                                       } else {
+                                               break;
+                                       }
                                } else {
                                        break;
                                }
-                       } else {
-                               break;
                        }
                }
                for (; i < pages; i++) {
                        ret = __amdgpu_ras_convert_rec_from_rom(adev,
                                &bps[i], &err_data, nps);
                        if (ret)
-                               goto free;
+                               control->ras_num_bad_pages -= 
adev->umc.retire_unit;
                }
        } else {
                ret = __amdgpu_ras_restore_bad_pages(adev, bps, pages);
@@ -3040,21 +3044,28 @@ static int amdgpu_ras_load_bad_pages(struct 
amdgpu_device *adev)
                dev_err(adev->dev, "Failed to load EEPROM table records!");
        } else {
                if (adev->umc.ras && adev->umc.ras->convert_ras_err_addr) {
-                       for (i = 0; i < control->ras_num_recs; i++) {
-                               if ((control->ras_num_recs - i) >= 
adev->umc.retire_unit) {
-                                       if ((bps[i].address == bps[i + 
1].address) &&
-                                               (bps[i].mem_channel == bps[i + 
1].mem_channel)) {
-                                               control->ras_num_pa_recs += 
adev->umc.retire_unit;
-                                               i += (adev->umc.retire_unit - 
1);
+                       /*In V3, there is no pa recs, and some cases(when 
address==0) may be parsed
+                       as pa recs, so add verion check to avoid it.
+                       */
+                       if (control->tbl_hdr.version < RAS_TABLE_VER_V3) {
+                               for (i = 0; i < control->ras_num_recs; i++) {
+                                       if ((control->ras_num_recs - i) >= 
adev->umc.retire_unit) {
+                                               if ((bps[i].address == bps[i + 
1].address) &&
+                                                       (bps[i].mem_channel == 
bps[i + 1].mem_channel)) {
+                                                       
control->ras_num_pa_recs += adev->umc.retire_unit;
+                                                       i += 
(adev->umc.retire_unit - 1);
+                                               } else {
+                                                       
control->ras_num_mca_recs +=
+                                                                               
(control->ras_num_recs - i);
+                                                       break;
+                                               }
                                        } else {
-                                               control->ras_num_mca_recs +=
-                                                                       
(control->ras_num_recs - i);
+                                               control->ras_num_mca_recs += 
(control->ras_num_recs - i);
                                                break;
                                        }
-                               } else {
-                                       control->ras_num_mca_recs += 
(control->ras_num_recs - i);
-                                       break;
                                }
+                       } else {
+                               control->ras_num_mca_recs = 
control->ras_num_recs;
                        }
                }
 
-- 
2.34.1

Reply via email to