[AMD Official Use Only - AMD Internal Distribution Only]

> -----Original Message-----
> From: Zhou1, Tao <tao.zh...@amd.com>
> Sent: Friday, March 7, 2025 4:41 PM
> To: Yang, Stanley <stanley.y...@amd.com>; amd-gfx@lists.freedesktop.org
> Subject: RE: [PATCH] drm/amdgpu: format old RAS eeprom data into V3 version
>
> [AMD Official Use Only - AMD Internal Distribution Only]
>
> > -----Original Message-----
> > From: Yang, Stanley <stanley.y...@amd.com>
> > Sent: Friday, March 7, 2025 3:36 PM
> > To: Zhou1, Tao <tao.zh...@amd.com>; amd-gfx@lists.freedesktop.org
> > Cc: Zhou1, Tao <tao.zh...@amd.com>
> > Subject: RE: [PATCH] drm/amdgpu: format old RAS eeprom data into V3
> > version
> >
> > [AMD Official Use Only - AMD Internal Distribution Only]
> >
> > > -----Original Message-----
> > > From: amd-gfx <amd-gfx-boun...@lists.freedesktop.org> On Behalf Of
> > > Tao Zhou
> > > Sent: Friday, March 7, 2025 2:47 PM
> > > To: amd-gfx@lists.freedesktop.org
> > > Cc: Zhou1, Tao <tao.zh...@amd.com>
> > > Subject: [PATCH] drm/amdgpu: format old RAS eeprom data into V3
> > > version
> > >
> > > Clear old data and save it in V3 format.
> > >
> > > Signed-off-by: Tao Zhou <tao.zh...@amd.com>
> > > ---
> > >  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c       |  5 ++++
> > >  .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c    | 26 ++++++++++---------
> > >  .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h    |  1 +
> > >  3 files changed, 20 insertions(+), 12 deletions(-)
> > >
> > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> > > index 837f33698b38..266f24002e07 100644
> > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> > > @@ -3465,6 +3465,11 @@ int amdgpu_ras_init_badpage_info(struct
> > > amdgpu_device *adev)
> > >                               adev, control->bad_channel_bitmap);
> > >                       con->update_channel_flag = false;
> > >               }
> > > +
> > > +             if (control->tbl_hdr.version < RAS_TABLE_VER_V3)
> >
> > [Stanley]: should check ip_version here; this affects all ASICs whose
> > EEPROM table version is lower than V3.
>
> [Tao] how about "if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1)" ?

[Stanley]: If V3 is only for UMC_HWIP 12.0.0, I suggest adding an IP version check here.

Regards,
Stanley
>
> >
> > Regards
> > Stanley
> > > +                     if (!amdgpu_ras_eeprom_reset_table(control))
> > > +                             if (amdgpu_ras_save_bad_pages(adev, NULL))
> > > +                                     dev_warn(adev->dev, "Failed to
> > > + save
> > > EEPROM data in V3 format!\n");
> > >       }
> > >
> > >       return ret;
> > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> > > index 09a6f8bc1a5a..71dddb8983ee 100644
> > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> > > @@ -413,9 +413,11 @@ static void
> > > amdgpu_ras_set_eeprom_table_version(struct
> > > amdgpu_ras_eeprom_control
> > >
> > >       switch (amdgpu_ip_version(adev, UMC_HWIP, 0)) {
> > >       case IP_VERSION(8, 10, 0):
> > > -     case IP_VERSION(12, 0, 0):
> > >               hdr->version = RAS_TABLE_VER_V2_1;
> > >               return;
> > > +     case IP_VERSION(12, 0, 0):
> > > +             hdr->version = RAS_TABLE_VER_V3;
> > > +             return;
> > >       default:
> > >               hdr->version = RAS_TABLE_VER_V1;
> > >               return;
> > > @@ -443,7 +445,7 @@ int amdgpu_ras_eeprom_reset_table(struct
> > > amdgpu_ras_eeprom_control *control)
> > >       hdr->header = RAS_TABLE_HDR_VAL;
> > >       amdgpu_ras_set_eeprom_table_version(control);
> > >
> > > -     if (hdr->version == RAS_TABLE_VER_V2_1) {
> > > +     if (hdr->version >= RAS_TABLE_VER_V2_1) {
> > >               hdr->first_rec_offset = RAS_RECORD_START_V2_1;
> > >               hdr->tbl_size = RAS_TABLE_HEADER_SIZE +
> > >                               RAS_TABLE_V2_1_INFO_SIZE; @@ -461,7
> > > +463,7 @@ int amdgpu_ras_eeprom_reset_table(struct
> > > amdgpu_ras_eeprom_control *control)
> > >       }
> > >
> > >       csum = __calc_hdr_byte_sum(control);
> > > -     if (hdr->version == RAS_TABLE_VER_V2_1)
> > > +     if (hdr->version >= RAS_TABLE_VER_V2_1)
> > >               csum += __calc_ras_info_byte_sum(control);
> > >       csum = -csum;
> > >       hdr->checksum = csum;
> > > @@ -752,7 +754,7 @@ amdgpu_ras_eeprom_update_header(struct
> > > amdgpu_ras_eeprom_control *control)
> > >                       "Saved bad pages %d reaches threshold value %d\n",
> > >                       control->ras_num_bad_pages, ras-
> > > >bad_page_cnt_threshold);
> > >               control->tbl_hdr.header = RAS_TABLE_HDR_BAD;
> > > -             if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1) {
> > > +             if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1) {
> > >                       control->tbl_rai.rma_status =
> > > GPU_RETIRED__ECC_REACH_THRESHOLD;
> > >                       control->tbl_rai.health_percent = 0;
> > >               }
> > > @@ -765,7 +767,7 @@ amdgpu_ras_eeprom_update_header(struct
> > > amdgpu_ras_eeprom_control *control)
> > >               amdgpu_dpm_send_rma_reason(adev);
> > >       }
> > >
> > > -     if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1)
> > > +     if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1)
> > >               control->tbl_hdr.tbl_size = RAS_TABLE_HEADER_SIZE +
> > >                                           RAS_TABLE_V2_1_INFO_SIZE +
> > >                                           control->ras_num_recs *
> > > RAS_TABLE_RECORD_SIZE; @@ -805,7 +807,7 @@
> > > amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control
> > *control)
> > >        * now calculate gpu health percent
> > >        */
> > >       if (amdgpu_bad_page_threshold != 0 &&
> > > -         control->tbl_hdr.version == RAS_TABLE_VER_V2_1 &&
> > > +         control->tbl_hdr.version >= RAS_TABLE_VER_V2_1 &&
> > >           control->ras_num_bad_pages <= ras->bad_page_cnt_threshold)
> > >               control->tbl_rai.health_percent =
> > > ((ras->bad_page_cnt_threshold -
> > >
> > > control->ras_num_bad_pages) * 100) / @@ -818,7 +820,7 @@
> > > amdgpu_ras_eeprom_update_header(struct
> > > amdgpu_ras_eeprom_control *control)
> > >               csum += *pp;
> > >
> > >       csum += __calc_hdr_byte_sum(control);
> > > -     if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1)
> > > +     if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1)
> > >               csum += __calc_ras_info_byte_sum(control);
> > >       /* avoid sign extension when assigning to "checksum" */
> > >       csum = -csum;
> > > @@ -1035,7 +1037,7 @@ uint32_t
> > > amdgpu_ras_eeprom_max_record_count(struct
> > > amdgpu_ras_eeprom_control *co
> > >       /* get available eeprom table version first before eeprom table 
> > > init */
> > >       amdgpu_ras_set_eeprom_table_version(control);
> > >
> > > -     if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1)
> > > +     if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1)
> > >               return RAS_MAX_RECORD_COUNT_V2_1;
> > >       else
> > >               return RAS_MAX_RECORD_COUNT; @@ -1280,7 +1282,7 @@
> > > static int __verify_ras_table_checksum(struct
> > > amdgpu_ras_eeprom_control *control
> > >       int buf_size, res;
> > >       u8  csum, *buf, *pp;
> > >
> > > -     if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1)
> > > +     if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1)
> > >               buf_size = RAS_TABLE_HEADER_SIZE +
> > >                          RAS_TABLE_V2_1_INFO_SIZE +
> > >                          control->ras_num_recs *
> > > RAS_TABLE_RECORD_SIZE; @@ -1383,7 +1385,7 @@ int
> > > amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control)
> > >
> > >       __decode_table_header_from_buf(hdr, buf);
> > >
> > > -     if (hdr->version == RAS_TABLE_VER_V2_1) {
> > > +     if (hdr->version >= RAS_TABLE_VER_V2_1) {
> > >               control->ras_num_recs = RAS_NUM_RECS_V2_1(hdr);
> > >               control->ras_record_offset = RAS_RECORD_START_V2_1;
> > >               control->ras_max_record_count =
> > > RAS_MAX_RECORD_COUNT_V2_1; @@ -1423,7 +1425,7 @@ int
> > > amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control)
> > >               DRM_DEBUG_DRIVER("Found existing EEPROM table with %d
> > > records",
> > >                                control->ras_num_bad_pages);
> > >
> > > -             if (hdr->version == RAS_TABLE_VER_V2_1) {
> > > +             if (hdr->version >= RAS_TABLE_VER_V2_1) {
> > >                       res = __read_table_ras_info(control);
> > >                       if (res)
> > >                               return res; @@ -1443,7 +1445,7 @@ int
> > > amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control)
> > >                                       ras->bad_page_cnt_threshold);
> > >       } else if (hdr->header == RAS_TABLE_HDR_BAD &&
> > >                  amdgpu_bad_page_threshold != 0) {
> > > -             if (hdr->version == RAS_TABLE_VER_V2_1) {
> > > +             if (hdr->version >= RAS_TABLE_VER_V2_1) {
> > >                       res = __read_table_ras_info(control);
> > >                       if (res)
> > >                               return res; diff --git
> > > a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
> > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
> > > index 13f7eda9a696..ec6d7ea37ad0 100644
> > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
> > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
> > > @@ -28,6 +28,7 @@
> > >
> > >  #define RAS_TABLE_VER_V1           0x00010000
> > >  #define RAS_TABLE_VER_V2_1         0x00021000
> > > +#define RAS_TABLE_VER_V3           0x00030000
> > >
> > >  struct amdgpu_device;
> > >
> > > --
> > > 2.34.1
> >
>

Reply via email to