After eeprom records format upgrades to V3, records that have 'address == 0' 
should be
supported in NPS1

Signed-off-by: ganglxie <gangl...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 73 ++++++++++++++++---------
 1 file changed, 48 insertions(+), 25 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index e85143acf3a2..4f4e12452ecc 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2881,6 +2881,17 @@ static int __amdgpu_ras_convert_rec_from_rom(struct 
amdgpu_device *adev,
        save_nps = (bps->retired_page >> UMC_NPS_SHIFT) & UMC_NPS_MASK;
        bps->retired_page &= ~(UMC_NPS_MASK << UMC_NPS_SHIFT);
 
+       /* if address== 0, the rec must be a PA rec under NPS1 */
+       if (bps->address == 0) {
+               if (nps == AMDGPU_NPS1_PARTITION_MODE) {
+                       if (amdgpu_umc_pages_in_a_row(adev, err_data,
+                               bps->retired_page << AMDGPU_GPU_PAGE_SHIFT))
+                               return -EINVAL;
+                       goto out;
+               } else {
+                       return -EOPNOTSUPP;
+               }
+       }
        if (save_nps == nps) {
                if (amdgpu_umc_pages_in_a_row(adev, err_data,
                                bps->retired_page << AMDGPU_GPU_PAGE_SHIFT))
@@ -2889,6 +2900,8 @@ static int __amdgpu_ras_convert_rec_from_rom(struct 
amdgpu_device *adev,
                if (amdgpu_ras_mca2pa_by_idx(adev, bps, err_data))
                        return -EINVAL;
        }
+
+out:
        return __amdgpu_ras_restore_bad_pages(adev, err_data->err_addr,
                                                                        
adev->umc.retire_unit);
 }
@@ -2903,7 +2916,7 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
                        &adev->psp.ras_context.ras->eeprom_control;
        enum amdgpu_memory_partition nps = AMDGPU_NPS1_PARTITION_MODE;
        int ret = 0;
-       uint32_t i;
+       uint32_t i = 0;
 
        if (!con || !con->eh_data || !bps || pages <= 0)
                return 0;
@@ -2924,28 +2937,31 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
        mutex_lock(&con->recovery_lock);
 
        if (from_rom) {
-               for (i = 0; i < pages; i++) {
-                       if (control->ras_num_recs - i >= adev->umc.retire_unit) 
{
-                               if ((bps[i].address == bps[i + 1].address) &&
-                                   (bps[i].mem_channel == bps[i + 
1].mem_channel)) {
-                                       //deal with retire_unit records a time
-                                       ret = 
__amdgpu_ras_convert_rec_array_from_rom(adev,
-                                                                       
&bps[i], &err_data, nps);
-                                       if (ret)
-                                               goto free;
-                                       i += (adev->umc.retire_unit - 1);
+               /* there is no pa recs in V3, so skip pa recs processing */
+               if (control->tbl_hdr.version < RAS_TABLE_VER_V3) {
+                       for (i = 0; i < pages; i++) {
+                               if (control->ras_num_recs - i >= 
adev->umc.retire_unit) {
+                                       if ((bps[i].address == bps[i + 
1].address) &&
+                                               (bps[i].mem_channel == bps[i + 
1].mem_channel)) {
+                                               /* deal with retire_unit 
records a time */
+                                               ret = 
__amdgpu_ras_convert_rec_array_from_rom(adev,
+                                                                               
&bps[i], &err_data, nps);
+                                               if (ret)
+                                                       
control->ras_num_bad_pages -= adev->umc.retire_unit;
+                                               i += (adev->umc.retire_unit - 
1);
+                                       } else {
+                                               break;
+                                       }
                                } else {
                                        break;
                                }
-                       } else {
-                               break;
                        }
                }
                for (; i < pages; i++) {
                        ret = __amdgpu_ras_convert_rec_from_rom(adev,
                                &bps[i], &err_data, nps);
                        if (ret)
-                               goto free;
+                               control->ras_num_bad_pages -= 
adev->umc.retire_unit;
                }
        } else {
                ret = __amdgpu_ras_restore_bad_pages(adev, bps, pages);
@@ -3040,21 +3056,28 @@ static int amdgpu_ras_load_bad_pages(struct 
amdgpu_device *adev)
                dev_err(adev->dev, "Failed to load EEPROM table records!");
        } else {
                if (adev->umc.ras && adev->umc.ras->convert_ras_err_addr) {
-                       for (i = 0; i < control->ras_num_recs; i++) {
-                               if ((control->ras_num_recs - i) >= 
adev->umc.retire_unit) {
-                                       if ((bps[i].address == bps[i + 
1].address) &&
-                                               (bps[i].mem_channel == bps[i + 
1].mem_channel)) {
-                                               control->ras_num_pa_recs += 
adev->umc.retire_unit;
-                                               i += (adev->umc.retire_unit - 
1);
+                       /*In V3, there is no pa recs, and some cases(when 
address==0) may be parsed
+                       as pa recs, so add verion check to avoid it.
+                       */
+                       if (control->tbl_hdr.version < RAS_TABLE_VER_V3) {
+                               for (i = 0; i < control->ras_num_recs; i++) {
+                                       if ((control->ras_num_recs - i) >= 
adev->umc.retire_unit) {
+                                               if ((bps[i].address == bps[i + 
1].address) &&
+                                                       (bps[i].mem_channel == 
bps[i + 1].mem_channel)) {
+                                                       
control->ras_num_pa_recs += adev->umc.retire_unit;
+                                                       i += 
(adev->umc.retire_unit - 1);
+                                               } else {
+                                                       
control->ras_num_mca_recs +=
+                                                                               
(control->ras_num_recs - i);
+                                                       break;
+                                               }
                                        } else {
-                                               control->ras_num_mca_recs +=
-                                                                       
(control->ras_num_recs - i);
+                                               control->ras_num_mca_recs += 
(control->ras_num_recs - i);
                                                break;
                                        }
-                               } else {
-                                       control->ras_num_mca_recs += 
(control->ras_num_recs - i);
-                                       break;
                                }
+                       } else {
+                               control->ras_num_mca_recs = 
control->ras_num_recs;
                        }
                }
 
-- 
2.34.1

Reply via email to