One UMC MCA address could map to multiple physical addresses (PAs):

AMDGPU_RAS_EEPROM_REC_PA: one record stores one PA
AMDGPU_RAS_EEPROM_REC_MCA: one record stores one MCA address; the PA
is not taken into account

Signed-off-by: Tao Zhou <tao.zh...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c       | 33 +++++++++++++++----
 .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h    | 14 ++++++++
 2 files changed, 40 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 6de61e7b4256..a529aec19527 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2768,10 +2768,20 @@ static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
                return -ENOMEM;
 
        ret = amdgpu_ras_eeprom_read(control, bps, control->ras_num_recs);
-       if (ret)
+       if (ret) {
                dev_err(adev->dev, "Failed to load EEPROM table records!");
-       else
+       } else {
+               if (control->ras_num_recs > 1 &&
+                   adev->umc.ras && adev->umc.ras->convert_ras_err_addr) {
+                       if ((bps[0].address == bps[1].address) &&
+                           (bps[0].mem_channel == bps[1].mem_channel))
+                               control->rec_type = AMDGPU_RAS_EEPROM_REC_PA;
+                       else
+                               control->rec_type = AMDGPU_RAS_EEPROM_REC_MCA;
+               }
+
                ret = amdgpu_ras_add_bad_pages(adev, bps, 
control->ras_num_recs);
+       }
 
        kfree(bps);
        return ret;
@@ -3160,13 +3170,14 @@ static int amdgpu_ras_page_retirement_thread(void *param)
 int amdgpu_ras_init_badpage_info(struct amdgpu_device *adev)
 {
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+       struct amdgpu_ras_eeprom_control *control;
        int ret;
 
        if (!con || amdgpu_sriov_vf(adev))
                return 0;
 
-       ret = amdgpu_ras_eeprom_init(&con->eeprom_control);
-
+       control = &con->eeprom_control;
+       ret = amdgpu_ras_eeprom_init(control);
        if (ret)
                return ret;
 
@@ -3174,17 +3185,25 @@ int amdgpu_ras_init_badpage_info(struct amdgpu_device *adev)
        if (amdgpu_ras_is_rma(adev))
                return -EHWPOISON;
 
-       if (con->eeprom_control.ras_num_recs) {
+       if (!adev->umc.ras || !adev->umc.ras->convert_ras_err_addr)
+               control->rec_type = AMDGPU_RAS_EEPROM_REC_PA;
+
+       /* default status is MCA storage */
+       if (control->ras_num_recs <= 1 &&
+           adev->umc.ras && adev->umc.ras->convert_ras_err_addr)
+               control->rec_type = AMDGPU_RAS_EEPROM_REC_MCA;
+
+       if (control->ras_num_recs) {
                ret = amdgpu_ras_load_bad_pages(adev);
                if (ret)
                        return ret;
 
                amdgpu_dpm_send_hbm_bad_pages_num(
-                       adev, con->eeprom_control.ras_num_recs);
+                       adev, control->ras_num_recs);
 
                if (con->update_channel_flag == true) {
                        amdgpu_dpm_send_hbm_bad_channel_flag(
-                               adev, con->eeprom_control.bad_channel_bitmap);
+                               adev, control->bad_channel_bitmap);
                        con->update_channel_flag = false;
                }
        }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
index b9ebda577797..d3a6f7205a2f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
@@ -43,6 +43,19 @@ enum amdgpu_ras_eeprom_err_type {
        AMDGPU_RAS_EEPROM_ERR_COUNT,
 };
 
+/*
+ * One UMC MCA address could map to multiple physical addresses (PAs),
+ * e.g. 1:16. We use eeprom_table_record.address to store the MCA
+ * address and eeprom_table_record.retired_page to save the PA.
+ *
+ * AMDGPU_RAS_EEPROM_REC_PA: one record stores one PA
+ * AMDGPU_RAS_EEPROM_REC_MCA: one record stores one MCA address
+ */
+enum amdgpu_ras_eeprom_rec_type {
+       AMDGPU_RAS_EEPROM_REC_PA,
+       AMDGPU_RAS_EEPROM_REC_MCA,
+};
+
 struct amdgpu_ras_eeprom_table_header {
        uint32_t header;
        uint32_t version;
@@ -102,6 +115,7 @@ struct amdgpu_ras_eeprom_control {
        /* Record channel info which occurred bad pages
         */
        u32 bad_channel_bitmap;
+       enum amdgpu_ras_eeprom_rec_type rec_type;
 };
 
 /*
-- 
2.34.1

Reply via email to