add nps info into eeprom records, and refine adding bad page
logic based on nps info.

Signed-off-by: ganglxie <gangl...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c       | 244 +++++++++---------
 .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c    |  25 +-
 .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h    |  20 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h       |   7 +
 4 files changed, 153 insertions(+), 143 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 5420e2d6d244..3945dda54b3f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2801,20 +2801,101 @@ static int amdgpu_ras_mca2pa(struct amdgpu_device 
*adev,
                return  -EINVAL;
 }
 
+static int __amdgpu_ras_restore_bad_pages(struct amdgpu_device *adev,
+                                       struct eeprom_table_record *bps, int 
count)
+{
+       int j;
+       struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+       struct ras_err_handler_data *data = con->eh_data;
+
+       for (j = 0; j < count; j++) {
+               if (amdgpu_ras_check_bad_page_unlock(con,
+                       bps[j].retired_page << AMDGPU_GPU_PAGE_SHIFT))
+                       continue;
+
+               if (!data->space_left &&
+                       amdgpu_ras_realloc_eh_data_space(adev, data, 256)) {
+                       return -ENOMEM;
+               }
+
+               amdgpu_ras_reserve_page(adev, bps[j].retired_page);
+
+               memcpy(&data->bps[data->count], &(bps[j]),
+                               sizeof(struct eeprom_table_record));
+               data->count++;
+               data->space_left--;
+       }
+
+       return 0;
+}
+
+static int __amdgpu_ras_convert_unit_rec_from_rom(struct amdgpu_device *adev,
+                               struct eeprom_table_record *bps, struct 
ras_err_data *err_data,
+                               enum amdgpu_memory_partition nps)
+{
+       int i = 0;
+       int ret = 0;
+       enum amdgpu_memory_partition save_nps;
+
+       save_nps = (bps[0].retired_page >> UMC_NPS_SHIFT) & UMC_NPS_MASK;
+
+       for (i = 0; i < adev->umc.retire_unit; i++)
+               bps[i].retired_page &= ~(UMC_NPS_MASK << UMC_NPS_SHIFT);
+
+       if (save_nps) {
+               if (save_nps == nps) {
+                       if (amdgpu_umc_pages_in_a_row(adev, err_data,
+                                       bps[0].retired_page << 
AMDGPU_GPU_PAGE_SHIFT))
+                               return -EINVAL;
+               } else {
+                       if (amdgpu_ras_mca2pa_by_idx(adev, &bps[0], err_data))
+                               return -EINVAL;
+               }
+       } else {
+               if (amdgpu_ras_mca2pa(adev, &bps[0], err_data)) {
+                       if (nps == AMDGPU_NPS1_PARTITION_MODE)
+                               memcpy(err_data->err_addr, bps,
+                                       sizeof(struct eeprom_table_record) * 
adev->umc.retire_unit);
+                       else
+                               return -EOPNOTSUPP;
+               }
+       }
+
+       return __amdgpu_ras_restore_bad_pages(adev, err_data->err_addr, 
adev->umc.retire_unit);
+}
+
+static int __amdgpu_ras_convert_single_rec_from_rom(struct amdgpu_device *adev,
+                               struct eeprom_table_record *bps, struct 
ras_err_data *err_data,
+                               enum amdgpu_memory_partition nps)
+{
+       enum amdgpu_memory_partition save_nps;
+
+       save_nps = (bps->retired_page >> UMC_NPS_SHIFT) & UMC_NPS_MASK;
+       bps->retired_page &= ~(UMC_NPS_MASK << UMC_NPS_SHIFT);
+
+       if (save_nps == nps) {
+               if (amdgpu_umc_pages_in_a_row(adev, err_data,
+                               bps->retired_page << AMDGPU_GPU_PAGE_SHIFT))
+                       return -EINVAL;
+       } else {
+               if (amdgpu_ras_mca2pa_by_idx(adev, bps, err_data))
+                       return -EINVAL;
+       }
+       return __amdgpu_ras_restore_bad_pages(adev, err_data->err_addr,
+                                                                       
adev->umc.retire_unit);
+}
+
 /* it deal with vram only. */
 int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
                struct eeprom_table_record *bps, int pages, bool from_rom)
 {
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
-       struct ras_err_handler_data *data;
        struct ras_err_data err_data;
-       struct eeprom_table_record *err_rec;
        struct amdgpu_ras_eeprom_control *control =
                        &adev->psp.ras_context.ras->eeprom_control;
        enum amdgpu_memory_partition nps = AMDGPU_NPS1_PARTITION_MODE;
        int ret = 0;
-       uint32_t i, j, loop_cnt = 1;
-       bool find_pages_per_pa = false;
+       uint32_t i;
 
        if (!con || !con->eh_data || !bps || pages <= 0)
                return 0;
@@ -2825,108 +2906,41 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device 
*adev,
                                sizeof(struct eeprom_table_record), GFP_KERNEL);
                if (!err_data.err_addr) {
                        dev_warn(adev->dev, "Failed to alloc UMC error address 
record in mca2pa conversion!\n");
-                       ret = -ENOMEM;
-                       goto out;
+                       return -ENOMEM;
                }
 
-               err_rec = err_data.err_addr;
-               loop_cnt = adev->umc.retire_unit;
                if (adev->gmc.gmc_funcs->query_mem_partition_mode)
                        nps = 
adev->gmc.gmc_funcs->query_mem_partition_mode(adev);
        }
 
        mutex_lock(&con->recovery_lock);
-       data = con->eh_data;
-       if (!data) {
-               /* Returning 0 as the absence of eh_data is acceptable */
-               goto free;
-       }
-
-       for (i = 0; i < pages; i++) {
-               if (from_rom &&
-                   control->rec_type == AMDGPU_RAS_EEPROM_REC_MCA) {
-                       if (!find_pages_per_pa) {
-                               if (amdgpu_ras_mca2pa_by_idx(adev, &bps[i], 
&err_data)) {
-                                       if (!i && nps == 
AMDGPU_NPS1_PARTITION_MODE) {
-                                               /* may use old RAS TA, use PA 
to find pages in
-                                                * one row
-                                                */
-                                               if 
(amdgpu_umc_pages_in_a_row(adev, &err_data,
-                                                                             
bps[i].retired_page <<
-                                                                             
AMDGPU_GPU_PAGE_SHIFT)) {
-                                                       ret = -EINVAL;
-                                                       goto free;
-                                               } else {
-                                                       find_pages_per_pa = 
true;
-                                               }
-                                       } else {
-                                               /* unsupported cases */
-                                               ret = -EOPNOTSUPP;
-                                               goto free;
-                                       }
-                               }
-                       } else {
-                               if (amdgpu_umc_pages_in_a_row(adev, &err_data,
-                                               bps[i].retired_page << 
AMDGPU_GPU_PAGE_SHIFT)) {
-                                       ret = -EINVAL;
-                                       goto free;
-                               }
-                       }
-               } else {
-                       if (from_rom && !find_pages_per_pa) {
-                               if (bps[i].retired_page & UMC_CHANNEL_IDX_V2) {
-                                       /* bad page in any NPS mode in eeprom */
-                                       if (amdgpu_ras_mca2pa_by_idx(adev, 
&bps[i], &err_data)) {
-                                               ret = -EINVAL;
+
+       if (from_rom) {
+               for (i = 0; i < pages; i++) {
+                       if (control->ras_num_recs - i >= adev->umc.retire_unit) 
{
+                               if ((bps[i].address == bps[i + 1].address) &&
+                                       (bps[i].mem_channel == bps[i + 
1].mem_channel)) {
+                                       //deal retire_unit records a time
+                                       ret = 
__amdgpu_ras_convert_unit_rec_from_rom(adev,
+                                                                       
&bps[i], &err_data, nps);
+                                       if (ret)
                                                goto free;
-                                       }
+                                       i += (adev->umc.retire_unit - 1);
                                } else {
-                                       /* legacy bad page in eeprom, generated 
only in
-                                        * NPS1 mode
-                                        */
-                                       if (amdgpu_ras_mca2pa(adev, &bps[i], 
&err_data)) {
-                                               /* old RAS TA or ASICs which 
don't support to
-                                                * convert addrss via mca 
address
-                                                */
-                                               if (!i && nps == 
AMDGPU_NPS1_PARTITION_MODE) {
-                                                       find_pages_per_pa = 
true;
-                                                       err_rec = &bps[i];
-                                                       loop_cnt = 1;
-                                               } else {
-                                                       /* non-nps1 mode, old 
RAS TA
-                                                        * can't support it
-                                                        */
-                                                       ret = -EOPNOTSUPP;
-                                                       goto free;
-                                               }
-                                       }
+                                       break;
                                }
-
-                               if (!find_pages_per_pa)
-                                       i += (adev->umc.retire_unit - 1);
                        } else {
-                               err_rec = &bps[i];
+                               break;
                        }
                }
-
-               for (j = 0; j < loop_cnt; j++) {
-                       if (amdgpu_ras_check_bad_page_unlock(con,
-                               err_rec[j].retired_page << 
AMDGPU_GPU_PAGE_SHIFT))
-                               continue;
-
-                       if (!data->space_left &&
-                           amdgpu_ras_realloc_eh_data_space(adev, data, 256)) {
-                               ret = -ENOMEM;
+               for (; i < pages; i++) {
+                       ret = __amdgpu_ras_convert_single_rec_from_rom(adev,
+                               &bps[i], &err_data, nps);
+                       if (ret)
                                goto free;
-                       }
-
-                       amdgpu_ras_reserve_page(adev, err_rec[j].retired_page);
-
-                       memcpy(&data->bps[data->count], &(err_rec[j]),
-                                       sizeof(struct eeprom_table_record));
-                       data->count++;
-                       data->space_left--;
                }
+       } else {
+               ret = __amdgpu_ras_restore_bad_pages(adev, bps, pages);
        }
 
 free:
@@ -2971,24 +2985,14 @@ int amdgpu_ras_save_bad_pages(struct amdgpu_device 
*adev,
 
        /* only new entries are saved */
        if (save_count > 0) {
-               if (control->rec_type == AMDGPU_RAS_EEPROM_REC_PA) {
+               for (i = 0; i < unit_num; i++) {
                        if (amdgpu_ras_eeprom_append(control,
-                                                    
&data->bps[control->ras_num_recs],
-                                                    save_count)) {
+                                       &data->bps[bad_page_num + i * 
adev->umc.retire_unit],
+                                       1)) {
                                dev_err(adev->dev, "Failed to save EEPROM table 
data!");
                                return -EIO;
                        }
-               } else {
-                       for (i = 0; i < unit_num; i++) {
-                               if (amdgpu_ras_eeprom_append(control,
-                                               &data->bps[bad_page_num + i * 
adev->umc.retire_unit],
-                                               1)) {
-                                       dev_err(adev->dev, "Failed to save 
EEPROM table data!");
-                                       return -EIO;
-                               }
-                       }
                }
-
                dev_info(adev->dev, "Saved %d pages to EEPROM table.\n", 
save_count);
        }
 
@@ -3005,6 +3009,7 @@ static int amdgpu_ras_load_bad_pages(struct amdgpu_device 
*adev)
                &adev->psp.ras_context.ras->eeprom_control;
        struct eeprom_table_record *bps;
        int ret;
+       int i = 0;
 
        /* no bad page record, skip eeprom access */
        if (control->ras_num_recs == 0 || amdgpu_bad_page_threshold == 0)
@@ -3018,13 +3023,23 @@ static int amdgpu_ras_load_bad_pages(struct 
amdgpu_device *adev)
        if (ret) {
                dev_err(adev->dev, "Failed to load EEPROM table records!");
        } else {
-               if (control->ras_num_recs > 1 &&
-                   adev->umc.ras && adev->umc.ras->convert_ras_err_addr) {
-                       if ((bps[0].address == bps[1].address) &&
-                           (bps[0].mem_channel == bps[1].mem_channel))
-                               control->rec_type = AMDGPU_RAS_EEPROM_REC_PA;
-                       else
-                               control->rec_type = AMDGPU_RAS_EEPROM_REC_MCA;
+               if (adev->umc.ras && adev->umc.ras->convert_ras_err_addr) {
+                       for (i = 0; i < control->ras_num_recs; i++) {
+                               if ((control->ras_num_recs - i) >= 
adev->umc.retire_unit) {
+                                       if ((bps[i].address == bps[i + 
1].address) &&
+                                               (bps[i].mem_channel == bps[i + 
1].mem_channel)) {
+                                               control->ras_num_pa_recs += 
adev->umc.retire_unit;
+                                               i += (adev->umc.retire_unit - 
1);
+                                       } else {
+                                               control->ras_num_mca_recs +=
+                                                                       
(control->ras_num_recs - i);
+                                               break;
+                                       }
+                               } else {
+                                       control->ras_num_mca_recs += 
(control->ras_num_recs - i);
+                                       break;
+                               }
+                       }
                }
 
                ret = amdgpu_ras_eeprom_check(control);
@@ -3438,12 +3453,7 @@ int amdgpu_ras_init_badpage_info(struct amdgpu_device 
*adev)
                return ret;
 
        if (!adev->umc.ras || !adev->umc.ras->convert_ras_err_addr)
-               control->rec_type = AMDGPU_RAS_EEPROM_REC_PA;
-
-       /* default status is MCA storage */
-       if (control->ras_num_recs <= 1 &&
-           adev->umc.ras && adev->umc.ras->convert_ras_err_addr)
-               control->rec_type = AMDGPU_RAS_EEPROM_REC_MCA;
+               control->ras_num_pa_recs = control->ras_num_recs;
 
        if (control->ras_num_recs) {
                ret = amdgpu_ras_load_bad_pages(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index 83b54efcaa87..ab27cecb5519 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -727,11 +727,9 @@ amdgpu_ras_eeprom_append_table(struct 
amdgpu_ras_eeprom_control *control,
                                     - control->ras_fri)
                % control->ras_max_record_count;
 
-       if (control->rec_type == AMDGPU_RAS_EEPROM_REC_PA)
-               control->ras_num_bad_pages = control->ras_num_recs;
-       else
-               control->ras_num_bad_pages =
-                       control->ras_num_recs * adev->umc.retire_unit;
+       control->ras_num_mca_recs += num;
+       control->ras_num_bad_pages += num * adev->umc.retire_unit;
+
 Out:
        kfree(buf);
        return res;
@@ -852,6 +850,7 @@ int amdgpu_ras_eeprom_append(struct 
amdgpu_ras_eeprom_control *control,
 {
        struct amdgpu_device *adev = to_amdgpu_device(control);
        int res, i;
+       uint64_t nps = AMDGPU_NPS1_PARTITION_MODE;
 
        if (!__is_ras_eeprom_supported(adev))
                return 0;
@@ -865,9 +864,12 @@ int amdgpu_ras_eeprom_append(struct 
amdgpu_ras_eeprom_control *control,
                return -EINVAL;
        }
 
+       if (adev->gmc.gmc_funcs->query_mem_partition_mode)
+               nps = adev->gmc.gmc_funcs->query_mem_partition_mode(adev);
+
        /* set the new channel index flag */
        for (i = 0; i < num; i++)
-               record[i].retired_page |= UMC_CHANNEL_IDX_V2;
+               record[i].retired_page |= (nps << UMC_NPS_SHIFT);
 
        mutex_lock(&control->ras_tbl_mutex);
 
@@ -881,7 +883,7 @@ int amdgpu_ras_eeprom_append(struct 
amdgpu_ras_eeprom_control *control,
 
        /* clear channel index flag, the flag is only saved on eeprom */
        for (i = 0; i < num; i++)
-               record[i].retired_page &= ~UMC_CHANNEL_IDX_V2;
+               record[i].retired_page &= ~(nps << UMC_NPS_SHIFT);
 
        return res;
 }
@@ -1392,6 +1394,8 @@ int amdgpu_ras_eeprom_init(struct 
amdgpu_ras_eeprom_control *control)
        }
        control->ras_fri = RAS_OFFSET_TO_INDEX(control, hdr->first_rec_offset);
 
+       control->ras_num_mca_recs = 0;
+       control->ras_num_pa_recs = 0;
        return 0;
 }
 
@@ -1412,11 +1416,8 @@ int amdgpu_ras_eeprom_check(struct 
amdgpu_ras_eeprom_control *control)
        if (!__get_eeprom_i2c_addr(adev, control))
                return -EINVAL;
 
-       if (control->rec_type == AMDGPU_RAS_EEPROM_REC_PA)
-               control->ras_num_bad_pages = control->ras_num_recs;
-       else
-               control->ras_num_bad_pages =
-                       control->ras_num_recs * adev->umc.retire_unit;
+       control->ras_num_bad_pages = control->ras_num_pa_recs +
+                       control->ras_num_mca_recs * adev->umc.retire_unit;
 
        if (hdr->header == RAS_TABLE_HDR_VAL) {
                DRM_DEBUG_DRIVER("Found existing EEPROM table with %d records",
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
index 81d55cb7b397..13f7eda9a696 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
@@ -43,19 +43,6 @@ enum amdgpu_ras_eeprom_err_type {
        AMDGPU_RAS_EEPROM_ERR_COUNT,
 };
 
-/*
- * one UMC MCA address could map to multiply physical address (PA),
- * such as 1:16, we use eeprom_table_record.address to store MCA
- * address and use eeprom_table_record.retired_page to save PA.
- *
- * AMDGPU_RAS_EEPROM_REC_PA: one record store one PA
- * AMDGPU_RAS_EEPROM_REC_MCA: one record store one MCA address
- */
-enum amdgpu_ras_eeprom_rec_type {
-       AMDGPU_RAS_EEPROM_REC_PA,
-       AMDGPU_RAS_EEPROM_REC_MCA,
-};
-
 struct amdgpu_ras_eeprom_table_header {
        uint32_t header;
        uint32_t version;
@@ -100,6 +87,12 @@ struct amdgpu_ras_eeprom_control {
         */
        u32 ras_num_bad_pages;
 
+       /* Number of records store mca address */
+       u32 ras_num_mca_recs;
+
+       /* Number of records store physical address */
+       u32 ras_num_pa_recs;
+
        /* First record index to read, 0-based.
         * Range is [0, num_recs-1]. This is
         * an absolute index, starting right after
@@ -120,7 +113,6 @@ struct amdgpu_ras_eeprom_control {
        /* Record channel info which occurred bad pages
         */
        u32 bad_channel_bitmap;
-       enum amdgpu_ras_eeprom_rec_type rec_type;
 };
 
 /*
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
index a4a7e61817aa..857693bcd8d4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
@@ -71,6 +71,13 @@
  */
 #define UMC_CHANNEL_IDX_V2     BIT_ULL(47)
 
+/*
+ * save nps value to eeprom_table_record.retired_page[47:40],
+ * the channel index flag above will be retired.
+ */
+#define UMC_NPS_SHIFT 40
+#define UMC_NPS_MASK 0xffULL
+
 typedef int (*umc_func)(struct amdgpu_device *adev, uint32_t node_inst,
                        uint32_t umc_inst, uint32_t ch_inst, void *data);
 
-- 
2.34.1

Reply via email to