All legacy RAS bad pages were generated in NPS1 mode, but new bad pages can be
generated in any NPS mode, so we can't use the retired_page stored in the EEPROM
directly in non-NPS1 mode, even for legacy data. We need to handle the two
kinds of data differently; new data can be distinguished from old data by the
UMC_CHANNEL_IDX_V2 flag.

Signed-off-by: Tao Zhou <tao.zh...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 45 +++++++++++++++++++------
 1 file changed, 35 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index be56938932d1..76cd095f96b6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2757,12 +2757,7 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
                        is_mca_add = false;
        }
 
-       mutex_lock(&con->recovery_lock);
-       data = con->eh_data;
-       if (!data)
-               goto out;
-
-       if (is_mca_add) {
+       if (from_rom) {
                err_data.err_addr =
                        kcalloc(adev->umc.retire_unit,
                                sizeof(struct eeprom_table_record), GFP_KERNEL);
@@ -2773,11 +2768,17 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
                        goto out;
                }
 
+               err_rec = err_data.err_addr;
                loop_cnt = adev->umc.retire_unit;
                if (adev->gmc.gmc_funcs->query_mem_partition_mode)
                        nps = 
adev->gmc.gmc_funcs->query_mem_partition_mode(adev);
        }
 
+       mutex_lock(&con->recovery_lock);
+       data = con->eh_data;
+       if (!data)
+               goto free;
+
        for (i = 0; i < pages; i++) {
                if (is_mca_add) {
                        if (!find_pages_per_pa) {
@@ -2799,10 +2800,34 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
                                                bps[i].retired_page << 
AMDGPU_GPU_PAGE_SHIFT))
                                        goto free;
                        }
-
-                       err_rec = err_data.err_addr;
                } else {
-                       err_rec = &bps[i];
+                       if (from_rom && !find_pages_per_pa) {
+                               if (bps[i].retired_page & UMC_CHANNEL_IDX_V2) {
+                                       /* bad page in any NPS mode in eeprom */
+                                       if (amdgpu_ras_mca2pa_by_idx(adev, 
&bps[i], &err_data))
+                                               goto free;
+                               } else {
+                                       /* legacy bad page in eeprom, generated 
only in NPS1 mode */
+                                       if (amdgpu_ras_mca2pa(adev, &bps[i], 
&err_data)) {
+                                               /* old RAS TA or ASICs which 
don't support converting the address
+                                                * via mca address
+                                                */
+                                               if (!i && nps == 
AMDGPU_NPS1_PARTITION_MODE) {
+                                                       find_pages_per_pa = 
true;
+                                                       err_rec = &bps[i];
+                                                       loop_cnt = 1;
+                                               } else {
+                                                       /* non-nps1 mode, old 
RAS TA can't support it */
+                                                       goto free;
+                                               }
+                                       }
+                               }
+
+                               if (!find_pages_per_pa)
+                                       i += (adev->umc.retire_unit - 1);
+                       } else {
+                               err_rec = &bps[i];
+                       }
                }
 
                for (j = 0; j < loop_cnt; j++) {
@@ -2826,7 +2851,7 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
        }
 
 free:
-       if (is_mca_add)
+       if (from_rom)
                kfree(err_data.err_addr);
 out:
        mutex_unlock(&con->recovery_lock);
-- 
2.34.1

Reply via email to