[Public] Please fix up the patch title as well. E.g.,
drm/amdgpu: Add support for legacy records in eeprom format V3 Alex > -----Original Message----- > From: amd-gfx <amd-gfx-boun...@lists.freedesktop.org> On Behalf Of ganglxie > Sent: Tuesday, April 29, 2025 3:01 AM > To: amd-gfx@lists.freedesktop.org > Cc: Zhou1, Tao <tao.zh...@amd.com>; Xie, Patrick <gangliang....@amd.com> > Subject: [PATCH] Add support for leagcy records in eeprom format V3 > > After eeprom records format upgrades to V3, records that have 'address == 0' > should be supported in NPS1 > > Signed-off-by: ganglxie <gangl...@amd.com> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 73 ++++++++++++++++--------- > 1 file changed, 48 insertions(+), 25 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > index e85143acf3a2..4f4e12452ecc 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > @@ -2881,6 +2881,17 @@ static int __amdgpu_ras_convert_rec_from_rom(struct > amdgpu_device *adev, > save_nps = (bps->retired_page >> UMC_NPS_SHIFT) & > UMC_NPS_MASK; > bps->retired_page &= ~(UMC_NPS_MASK << UMC_NPS_SHIFT); > > + /* if address== 0, the rec must be a PA rec under NPS1 */ > + if (bps->address == 0) { > + if (nps == AMDGPU_NPS1_PARTITION_MODE) { > + if (amdgpu_umc_pages_in_a_row(adev, err_data, > + bps->retired_page << > AMDGPU_GPU_PAGE_SHIFT)) > + return -EINVAL; > + goto out; > + } else { > + return -EOPNOTSUPP; > + } > + } > if (save_nps == nps) { > if (amdgpu_umc_pages_in_a_row(adev, err_data, > bps->retired_page << > AMDGPU_GPU_PAGE_SHIFT)) @@ -2889,6 +2900,8 @@ static int > __amdgpu_ras_convert_rec_from_rom(struct amdgpu_device *adev, > if (amdgpu_ras_mca2pa_by_idx(adev, bps, err_data)) > return -EINVAL; > } > + > +out: > return __amdgpu_ras_restore_bad_pages(adev, err_data->err_addr, > adev- > >umc.retire_unit); > } > @@ -2903,7 +2916,7 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device > *adev, > &adev->psp.ras_context.ras->eeprom_control; > enum amdgpu_memory_partition nps = > AMDGPU_NPS1_PARTITION_MODE; > int ret = 0; > - uint32_t i; > + uint32_t i = 0; > > if (!con || !con->eh_data || !bps || pages <= 0) > return 0; > @@ -2924,28 +2937,31 @@ int amdgpu_ras_add_bad_pages(struct > amdgpu_device *adev, > mutex_lock(&con->recovery_lock); > > if (from_rom) { > - for (i = 0; i < pages; i++) { > - if (control->ras_num_recs - i >= adev->umc.retire_unit) > { > - if ((bps[i].address == bps[i + 1].address) && > - (bps[i].mem_channel == bps[i + > 1].mem_channel)) { > - //deal with retire_unit records a time > - ret = > __amdgpu_ras_convert_rec_array_from_rom(adev, > - &bps[i], > &err_data, nps); > - if (ret) > - goto free; > - i += (adev->umc.retire_unit - 1); > + /* there is no pa recs in V3, so skip pa recs processing */ > + if (control->tbl_hdr.version < RAS_TABLE_VER_V3) { > + for (i = 0; i < pages; i++) { > + if (control->ras_num_recs - i >= adev- > >umc.retire_unit) { > + if ((bps[i].address == bps[i + > 1].address) && > + (bps[i].mem_channel == bps[i + > 1].mem_channel)) { > + /* deal with retire_unit > records a time */ > + ret = > __amdgpu_ras_convert_rec_array_from_rom(adev, > + > &bps[i], > &err_data, nps); > + if (ret) > + > control->ras_num_bad_pages - > = adev->umc.retire_unit; > + i += (adev->umc.retire_unit - > 1); > + } else { > + break; > + } > } else { > break; > } > - } else { > - break; > } > } > for (; i < pages; i++) { > ret = __amdgpu_ras_convert_rec_from_rom(adev, > &bps[i], &err_data, nps); > if (ret) > - goto free; > + control->ras_num_bad_pages -= adev- > >umc.retire_unit; > } > } else { > ret = __amdgpu_ras_restore_bad_pages(adev, bps, pages); @@ - > 3040,21 +3056,28 @@ static int amdgpu_ras_load_bad_pages(struct > amdgpu_device *adev) > dev_err(adev->dev, "Failed to load EEPROM table records!"); > } else { > if (adev->umc.ras && adev->umc.ras->convert_ras_err_addr) { > - for (i = 0; i < control->ras_num_recs; i++) { > - if ((control->ras_num_recs - i) >= adev- > >umc.retire_unit) { > - if ((bps[i].address == bps[i + > 1].address) && > - (bps[i].mem_channel == bps[i + > 1].mem_channel)) { > - control->ras_num_pa_recs += > adev- > >umc.retire_unit; > - i += (adev->umc.retire_unit - > 1); > + /*In V3, there is no pa recs, and some cases(when > address==0) may be parsed > + as pa recs, so add verion check to avoid it. > + */ > + if (control->tbl_hdr.version < RAS_TABLE_VER_V3) { > + for (i = 0; i < control->ras_num_recs; i++) { > + if ((control->ras_num_recs - i) >= adev- > >umc.retire_unit) { > + if ((bps[i].address == bps[i + > 1].address) && > + (bps[i].mem_channel == > bps[i + > 1].mem_channel)) { > + > control->ras_num_pa_recs += > adev->umc.retire_unit; > + i += > (adev->umc.retire_unit - 1); > + } else { > + > control->ras_num_mca_recs > += > + > (control- > >ras_num_recs - i); > + break; > + } > } else { > - control->ras_num_mca_recs += > - > (control- > >ras_num_recs - i); > + control->ras_num_mca_recs += > (control->ras_num_recs - i); > break; > } > - } else { > - control->ras_num_mca_recs += (control- > >ras_num_recs - i); > - break; > } > + } else { > + control->ras_num_mca_recs = control- > >ras_num_recs; > } > } > > -- > 2.34.1