[AMD Official Use Only - AMD Internal Distribution Only] -----Original Message----- From: Zhou1, Tao <tao.zh...@amd.com> Sent: Tuesday, April 29, 2025 4:53 PM To: Xie, Patrick <gangliang....@amd.com>; amd-gfx@lists.freedesktop.org Subject: RE: [PATCH] Add support for leagcy records in eeprom format V3
[AMD Official Use Only - AMD Internal Distribution Only] > -----Original Message----- > From: Xie, Patrick <gangliang....@amd.com> > Sent: Tuesday, April 29, 2025 3:01 PM > To: amd-gfx@lists.freedesktop.org > Cc: Zhou1, Tao <tao.zh...@amd.com>; Xie, Patrick > <gangliang....@amd.com> > Subject: [PATCH] Add support for leagcy records in eeprom format V3 > > After eeprom records format upgrades to V3, records that have 'address > == 0' should be supported in NPS1 > > Signed-off-by: ganglxie <gangl...@amd.com> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 73 > ++++++++++++++++--------- > 1 file changed, 48 insertions(+), 25 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > index e85143acf3a2..4f4e12452ecc 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > @@ -2881,6 +2881,17 @@ static int > __amdgpu_ras_convert_rec_from_rom(struct > amdgpu_device *adev, > save_nps = (bps->retired_page >> UMC_NPS_SHIFT) & UMC_NPS_MASK; > bps->retired_page &= ~(UMC_NPS_MASK << UMC_NPS_SHIFT); > > + /* if address== 0, the rec must be a PA rec under NPS1 */ > + if (bps->address == 0) { > + if (nps == AMDGPU_NPS1_PARTITION_MODE) { > + if (amdgpu_umc_pages_in_a_row(adev, err_data, > + bps->retired_page << > AMDGPU_GPU_PAGE_SHIFT)) > + return -EINVAL; > + goto out; > + } else { > + return -EOPNOTSUPP; > + } > + } [Tao] better to add an empty line here. [Patrick] will add this empty line > if (save_nps == nps) { [Tao] is possible that both "bps->address == 0" and "(save_nps == nps " are true? [Patrick]Yes, all of the "address==0" records have save_nps=NPS1, so when loading the records in NPS1, save_nps==nps. But it does not matter, I check "address==0" ahead of "save_nps==nps" , the code won't go to "save_nps==nps"'s checking at any case. > if (amdgpu_umc_pages_in_a_row(adev, err_data, > bps->retired_page << > AMDGPU_GPU_PAGE_SHIFT)) @@ -2889,6 +2900,8 @@ static int > __amdgpu_ras_convert_rec_from_rom(struct amdgpu_device *adev, > if (amdgpu_ras_mca2pa_by_idx(adev, bps, err_data)) > return -EINVAL; > } > + > +out: > return __amdgpu_ras_restore_bad_pages(adev, err_data->err_addr, > > adev- > >umc.retire_unit); > } > @@ -2903,7 +2916,7 @@ int amdgpu_ras_add_bad_pages(struct > amdgpu_device *adev, > &adev->psp.ras_context.ras->eeprom_control; > enum amdgpu_memory_partition nps = AMDGPU_NPS1_PARTITION_MODE; > int ret = 0; > - uint32_t i; > + uint32_t i = 0; > > if (!con || !con->eh_data || !bps || pages <= 0) > return 0; > @@ -2924,28 +2937,31 @@ int amdgpu_ras_add_bad_pages(struct > amdgpu_device *adev, > mutex_lock(&con->recovery_lock); > > if (from_rom) { > - for (i = 0; i < pages; i++) { > - if (control->ras_num_recs - i >= adev->umc.retire_unit) > { > - if ((bps[i].address == bps[i + 1].address) && > - (bps[i].mem_channel == bps[i + > 1].mem_channel)) { > - //deal with retire_unit records a time > - ret = > __amdgpu_ras_convert_rec_array_from_rom(adev, > - &bps[i], > &err_data, nps); > - if (ret) > - goto free; > - i += (adev->umc.retire_unit - 1); > + /* there is no pa recs in V3, so skip pa recs processing */ > + if (control->tbl_hdr.version < RAS_TABLE_VER_V3) { > + for (i = 0; i < pages; i++) { > + if (control->ras_num_recs - i >= > + adev->umc.retire_unit) > { > + if ((bps[i].address == bps[i + > 1].address) && > + (bps[i].mem_channel == > + bps[i + > 1].mem_channel)) { > + /* deal with retire_unit > records a time */ > + ret = > __amdgpu_ras_convert_rec_array_from_rom(adev, > + > + &bps[i], > &err_data, nps); > + if (ret) > + > + control->ras_num_bad_pages - > = adev->umc.retire_unit; > + i += (adev->umc.retire_unit - > 1); > + } else { > + break; > + } > } else { > break; > } > - } else { > - break; > } > } > for (; i < pages; i++) { > ret = __amdgpu_ras_convert_rec_from_rom(adev, > &bps[i], &err_data, nps); > if (ret) > - goto free; > + control->ras_num_bad_pages -= adev- > >umc.retire_unit; > } > } else { > ret = __amdgpu_ras_restore_bad_pages(adev, bps, pages); > @@ - > 3040,21 +3056,28 @@ static int amdgpu_ras_load_bad_pages(struct > amdgpu_device *adev) > dev_err(adev->dev, "Failed to load EEPROM table records!"); > } else { > if (adev->umc.ras && adev->umc.ras->convert_ras_err_addr) { > - for (i = 0; i < control->ras_num_recs; i++) { > - if ((control->ras_num_recs - i) >= adev- > >umc.retire_unit) { > - if ((bps[i].address == bps[i + > 1].address) && > - (bps[i].mem_channel == bps[i + > 1].mem_channel)) { > - control->ras_num_pa_recs += > adev- > >umc.retire_unit; > - i += (adev->umc.retire_unit - > 1); > + /*In V3, there is no pa recs, and some > + cases(when > address==0) may be parsed > + as pa recs, so add verion check to avoid it. > + */ > + if (control->tbl_hdr.version < RAS_TABLE_VER_V3) { > + for (i = 0; i < control->ras_num_recs; i++) { > + if ((control->ras_num_recs - i) > + >= adev- > >umc.retire_unit) { > + if ((bps[i].address == > + bps[i + > 1].address) && > + > + (bps[i].mem_channel == bps[i + > 1].mem_channel)) { > + > + control->ras_num_pa_recs += > adev->umc.retire_unit; > + i += > (adev->umc.retire_unit - 1); > + } else { > + > control->ras_num_mca_recs += > + > + (control- > >ras_num_recs - i); > + break; > + } > } else { > - control->ras_num_mca_recs += > - > (control- > >ras_num_recs - i); > + > + control->ras_num_mca_recs += > (control->ras_num_recs - i); > break; > } > - } else { > - control->ras_num_mca_recs += (control- > >ras_num_recs - i); > - break; > } > + } else { > + control->ras_num_mca_recs = control- > >ras_num_recs; > } > } > > -- > 2.34.1