On Wed, Apr 30, 2025 at 12:13 AM ganglxie <gangl...@amd.com> wrote:
Please fix the patch title. Please add a drm/amdgpu prefix. E.g., drm/amdgpu: Refine RAS bad page records counting and parsing in eeprom V3 > > there is only MCA records in V3, no need to care about PA records. > recalculate the value of ras_num_bad_pages when parsing failed and > go on with the left records instead of quit. > > Signed-off-by: ganglxie <gangl...@amd.com> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 61 +++++++++++++++---------- > 1 file changed, 36 insertions(+), 25 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > index e85143acf3a2..32f41f151c82 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > @@ -2889,6 +2889,7 @@ static int __amdgpu_ras_convert_rec_from_rom(struct > amdgpu_device *adev, > if (amdgpu_ras_mca2pa_by_idx(adev, bps, err_data)) > return -EINVAL; > } > + > return __amdgpu_ras_restore_bad_pages(adev, err_data->err_addr, > > adev->umc.retire_unit); > } > @@ -2903,7 +2904,7 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev, > &adev->psp.ras_context.ras->eeprom_control; > enum amdgpu_memory_partition nps = AMDGPU_NPS1_PARTITION_MODE; > int ret = 0; > - uint32_t i; > + uint32_t i = 0; > > if (!con || !con->eh_data || !bps || pages <= 0) > return 0; > @@ -2924,28 +2925,31 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device > *adev, > mutex_lock(&con->recovery_lock); > > if (from_rom) { > - for (i = 0; i < pages; i++) { > - if (control->ras_num_recs - i >= > adev->umc.retire_unit) { > - if ((bps[i].address == bps[i + 1].address) && > - (bps[i].mem_channel == bps[i + > 1].mem_channel)) { > - //deal with retire_unit records a time > - ret = > __amdgpu_ras_convert_rec_array_from_rom(adev, > - > &bps[i], &err_data, nps); > - if (ret) > - goto free; > - i += (adev->umc.retire_unit - 1); > + /* there is no pa recs in V3, so skip pa recs processing */ > + if (control->tbl_hdr.version < RAS_TABLE_VER_V3) { > + for (i = 0; i < pages; i++) { > + if (control->ras_num_recs - i >= > adev->umc.retire_unit) { > + if ((bps[i].address == bps[i + > 1].address) && > + (bps[i].mem_channel == bps[i > + 1].mem_channel)) { > + /* deal with retire_unit > records a time */ > + ret = > __amdgpu_ras_convert_rec_array_from_rom(adev, > + > &bps[i], &err_data, nps); > + if (ret) > + > control->ras_num_bad_pages -= adev->umc.retire_unit; > + i += (adev->umc.retire_unit - > 1); > + } else { > + break; > + } > } else { > break; > } > - } else { > - break; > } > } > for (; i < pages; i++) { > ret = __amdgpu_ras_convert_rec_from_rom(adev, > &bps[i], &err_data, nps); > if (ret) > - goto free; > + control->ras_num_bad_pages -= > adev->umc.retire_unit; > } > } else { > ret = __amdgpu_ras_restore_bad_pages(adev, bps, pages); > @@ -3040,21 +3044,28 @@ static int amdgpu_ras_load_bad_pages(struct > amdgpu_device *adev) > dev_err(adev->dev, "Failed to load EEPROM table records!"); > } else { > if (adev->umc.ras && adev->umc.ras->convert_ras_err_addr) { > - for (i = 0; i < control->ras_num_recs; i++) { > - if ((control->ras_num_recs - i) >= > adev->umc.retire_unit) { > - if ((bps[i].address == bps[i + > 1].address) && > - (bps[i].mem_channel == bps[i > + 1].mem_channel)) { > - control->ras_num_pa_recs += > adev->umc.retire_unit; > - i += (adev->umc.retire_unit - > 1); > + /*In V3, there is no pa recs, and some cases(when > address==0) may be parsed > + as pa recs, so add verion check to avoid it. > + */ > + if (control->tbl_hdr.version < RAS_TABLE_VER_V3) { > + for (i = 0; i < control->ras_num_recs; i++) { > + if ((control->ras_num_recs - i) >= > adev->umc.retire_unit) { > + if ((bps[i].address == bps[i > + 1].address) && > + (bps[i].mem_channel > == bps[i + 1].mem_channel)) { > + > control->ras_num_pa_recs += adev->umc.retire_unit; > + i += > (adev->umc.retire_unit - 1); > + } else { > + > control->ras_num_mca_recs += > + > (control->ras_num_recs - i); > + break; > + } > } else { > - control->ras_num_mca_recs += > - > (control->ras_num_recs - i); > + control->ras_num_mca_recs += > (control->ras_num_recs - i); > break; > } > - } else { > - control->ras_num_mca_recs += > (control->ras_num_recs - i); > - break; > } > + } else { > + control->ras_num_mca_recs = > control->ras_num_recs; > } > } > > -- > 2.34.1 >