[AMD Official Use Only - AMD Internal Distribution Only] > -----Original Message----- > From: Zhou1, Tao <tao.zh...@amd.com> > Sent: Friday, March 7, 2025 4:41 PM > To: Yang, Stanley <stanley.y...@amd.com>; amd-gfx@lists.freedesktop.org > Subject: RE: [PATCH] drm/amdgpu: format old RAS eeprom data into V3 version > > [AMD Official Use Only - AMD Internal Distribution Only] > > > -----Original Message----- > > From: Yang, Stanley <stanley.y...@amd.com> > > Sent: Friday, March 7, 2025 3:36 PM > > To: Zhou1, Tao <tao.zh...@amd.com>; amd-gfx@lists.freedesktop.org > > Cc: Zhou1, Tao <tao.zh...@amd.com> > > Subject: RE: [PATCH] drm/amdgpu: format old RAS eeprom data into V3 > > version > > > > [AMD Official Use Only - AMD Internal Distribution Only] > > > > > -----Original Message----- > > > From: amd-gfx <amd-gfx-boun...@lists.freedesktop.org> On Behalf Of > > > Tao Zhou > > > Sent: Friday, March 7, 2025 2:47 PM > > > To: amd-gfx@lists.freedesktop.org > > > Cc: Zhou1, Tao <tao.zh...@amd.com> > > > Subject: [PATCH] drm/amdgpu: format old RAS eeprom data into V3 > > > version > > > > > > Clear old data and save it in V3 format. 
> > > > > > Signed-off-by: Tao Zhou <tao.zh...@amd.com> > > > --- > > > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 5 ++++ > > > .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 26 ++++++++++--------- > > > .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h | 1 + > > > 3 files changed, 20 insertions(+), 12 deletions(-) > > > > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > > > index 837f33698b38..266f24002e07 100644 > > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > > > @@ -3465,6 +3465,11 @@ int amdgpu_ras_init_badpage_info(struct > > > amdgpu_device *adev) > > > adev, control->bad_channel_bitmap); > > > con->update_channel_flag = false; > > > } > > > + > > > + if (control->tbl_hdr.version < RAS_TABLE_VER_V3) > > > > [Stanley]: should check ip_version here; this affects all ASICs whose > > EEPROM table version is lower than V3. > > [Tao] how about "if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1)" ?
[Stanley]: if v3 is only for UMC_HWIP 12.0.0, I suggest adding ip version here. Regards, Stanley > > > > > Regards > > Stanley > > > + if (!amdgpu_ras_eeprom_reset_table(control)) > > > + if (amdgpu_ras_save_bad_pages(adev, NULL)) > > > + dev_warn(adev->dev, "Failed to > > > + save > > > EEPROM data in V3 format!\n"); > > > } > > > > > > return ret; > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c > > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c > > > index 09a6f8bc1a5a..71dddb8983ee 100644 > > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c > > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c > > > @@ -413,9 +413,11 @@ static void > > > amdgpu_ras_set_eeprom_table_version(struct > > > amdgpu_ras_eeprom_control > > > > > > switch (amdgpu_ip_version(adev, UMC_HWIP, 0)) { > > > case IP_VERSION(8, 10, 0): > > > - case IP_VERSION(12, 0, 0): > > > hdr->version = RAS_TABLE_VER_V2_1; > > > return; > > > + case IP_VERSION(12, 0, 0): > > > + hdr->version = RAS_TABLE_VER_V3; > > > + return; > > > default: > > > hdr->version = RAS_TABLE_VER_V1; > > > return; > > > @@ -443,7 +445,7 @@ int amdgpu_ras_eeprom_reset_table(struct > > > amdgpu_ras_eeprom_control *control) > > > hdr->header = RAS_TABLE_HDR_VAL; > > > amdgpu_ras_set_eeprom_table_version(control); > > > > > > - if (hdr->version == RAS_TABLE_VER_V2_1) { > > > + if (hdr->version >= RAS_TABLE_VER_V2_1) { > > > hdr->first_rec_offset = RAS_RECORD_START_V2_1; > > > hdr->tbl_size = RAS_TABLE_HEADER_SIZE + > > > RAS_TABLE_V2_1_INFO_SIZE; @@ -461,7 > > > +463,7 @@ int amdgpu_ras_eeprom_reset_table(struct > > > amdgpu_ras_eeprom_control *control) > > > } > > > > > > csum = __calc_hdr_byte_sum(control); > > > - if (hdr->version == RAS_TABLE_VER_V2_1) > > > + if (hdr->version >= RAS_TABLE_VER_V2_1) > > > csum += __calc_ras_info_byte_sum(control); > > > csum = -csum; > > > hdr->checksum = csum; > > > @@ -752,7 +754,7 @@ amdgpu_ras_eeprom_update_header(struct > > > amdgpu_ras_eeprom_control 
*control) > > > "Saved bad pages %d reaches threshold value %d\n", > > > control->ras_num_bad_pages, ras- > > > >bad_page_cnt_threshold); > > > control->tbl_hdr.header = RAS_TABLE_HDR_BAD; > > > - if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1) { > > > + if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1) { > > > control->tbl_rai.rma_status = > > > GPU_RETIRED__ECC_REACH_THRESHOLD; > > > control->tbl_rai.health_percent = 0; > > > } > > > @@ -765,7 +767,7 @@ amdgpu_ras_eeprom_update_header(struct > > > amdgpu_ras_eeprom_control *control) > > > amdgpu_dpm_send_rma_reason(adev); > > > } > > > > > > - if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1) > > > + if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1) > > > control->tbl_hdr.tbl_size = RAS_TABLE_HEADER_SIZE + > > > RAS_TABLE_V2_1_INFO_SIZE + > > > control->ras_num_recs * > > > RAS_TABLE_RECORD_SIZE; @@ -805,7 +807,7 @@ > > > amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control > > *control) > > > * now calculate gpu health percent > > > */ > > > if (amdgpu_bad_page_threshold != 0 && > > > - control->tbl_hdr.version == RAS_TABLE_VER_V2_1 && > > > + control->tbl_hdr.version >= RAS_TABLE_VER_V2_1 && > > > control->ras_num_bad_pages <= ras->bad_page_cnt_threshold) > > > control->tbl_rai.health_percent = > > > ((ras->bad_page_cnt_threshold - > > > > > > control->ras_num_bad_pages) * 100) / @@ -818,7 +820,7 @@ > > > amdgpu_ras_eeprom_update_header(struct > > > amdgpu_ras_eeprom_control *control) > > > csum += *pp; > > > > > > csum += __calc_hdr_byte_sum(control); > > > - if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1) > > > + if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1) > > > csum += __calc_ras_info_byte_sum(control); > > > /* avoid sign extension when assigning to "checksum" */ > > > csum = -csum; > > > @@ -1035,7 +1037,7 @@ uint32_t > > > amdgpu_ras_eeprom_max_record_count(struct > > > amdgpu_ras_eeprom_control *co > > > /* get available eeprom table version first before eeprom table > > 
> init */ > > > amdgpu_ras_set_eeprom_table_version(control); > > > > > > - if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1) > > > + if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1) > > > return RAS_MAX_RECORD_COUNT_V2_1; > > > else > > > return RAS_MAX_RECORD_COUNT; @@ -1280,7 +1282,7 @@ > > > static int __verify_ras_table_checksum(struct > > > amdgpu_ras_eeprom_control *control > > > int buf_size, res; > > > u8 csum, *buf, *pp; > > > > > > - if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1) > > > + if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1) > > > buf_size = RAS_TABLE_HEADER_SIZE + > > > RAS_TABLE_V2_1_INFO_SIZE + > > > control->ras_num_recs * > > > RAS_TABLE_RECORD_SIZE; @@ -1383,7 +1385,7 @@ int > > > amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control) > > > > > > __decode_table_header_from_buf(hdr, buf); > > > > > > - if (hdr->version == RAS_TABLE_VER_V2_1) { > > > + if (hdr->version >= RAS_TABLE_VER_V2_1) { > > > control->ras_num_recs = RAS_NUM_RECS_V2_1(hdr); > > > control->ras_record_offset = RAS_RECORD_START_V2_1; > > > control->ras_max_record_count = > > > RAS_MAX_RECORD_COUNT_V2_1; @@ -1423,7 +1425,7 @@ int > > > amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control) > > > DRM_DEBUG_DRIVER("Found existing EEPROM table with %d > > > records", > > > control->ras_num_bad_pages); > > > > > > - if (hdr->version == RAS_TABLE_VER_V2_1) { > > > + if (hdr->version >= RAS_TABLE_VER_V2_1) { > > > res = __read_table_ras_info(control); > > > if (res) > > > return res; @@ -1443,7 +1445,7 @@ int > > > amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control) > > > ras->bad_page_cnt_threshold); > > > } else if (hdr->header == RAS_TABLE_HDR_BAD && > > > amdgpu_bad_page_threshold != 0) { > > > - if (hdr->version == RAS_TABLE_VER_V2_1) { > > > + if (hdr->version >= RAS_TABLE_VER_V2_1) { > > > res = __read_table_ras_info(control); > > > if (res) > > > return res; diff --git > > > 
a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h > > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h > > > index 13f7eda9a696..ec6d7ea37ad0 100644 > > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h > > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h > > > @@ -28,6 +28,7 @@ > > > > > > #define RAS_TABLE_VER_V1 0x00010000 > > > #define RAS_TABLE_VER_V2_1 0x00021000 > > > +#define RAS_TABLE_VER_V3 0x00030000 > > > > > > struct amdgpu_device; > > > > > > -- > > > 2.34.1 > > >