[AMD Official Use Only - AMD Internal Distribution Only] > -----Original Message----- > From: Xie, Patrick <gangliang....@amd.com> > Sent: Friday, June 27, 2025 5:37 PM > To: amd-gfx@lists.freedesktop.org > Cc: Zhou1, Tao <tao.zh...@amd.com>; Xie, Patrick <gangliang....@amd.com> > Subject: [PATCH] drm/amdgpu: refine ras error injection when eeprom > initialization > failed > > when eeprom initialization failed, we still support ras error injection, and > reserve bad > pages, but do not save bad pages to eeprom > > Signed-off-by: ganglxie <gangl...@amd.com> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 22 ++++++++++++++----- > .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h | 2 ++ > 2 files changed, 18 insertions(+), 6 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > index 514b56e5d8ba..d24567787f9e 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > @@ -3009,6 +3009,15 @@ int amdgpu_ras_save_bad_pages(struct > amdgpu_device *adev, > return 0; > } > > + if (!con->eeprom_control.is_eeprom_valid) { > + dev_err(adev->dev,
[Tao] Since we return 0 here, it's better to use dev_warn instead of dev_err. Other than this, the patch is: Reviewed-by: Tao Zhou <tao.zh...@amd.com> > + "Failed to save EEPROM table data because of EEPROM > data corruption!"); > + if (new_cnt) > + *new_cnt = 0; > + > + return 0; > + } > + > mutex_lock(&con->recovery_lock); > control = &con->eeprom_control; > data = con->eh_data; > @@ -3502,8 +3511,7 @@ int amdgpu_ras_init_badpage_info(struct amdgpu_device > *adev) > > control = &con->eeprom_control; > ret = amdgpu_ras_eeprom_init(control); > - if (ret) > - return ret; > + control->is_eeprom_valid = !ret; > > if (!adev->umc.ras || !adev->umc.ras->convert_ras_err_addr) > control->ras_num_pa_recs = control->ras_num_recs; @@ -3512,10 > +3520,12 @@ int amdgpu_ras_init_badpage_info(struct amdgpu_device *adev) > adev->umc.ras->get_retire_flip_bits) > adev->umc.ras->get_retire_flip_bits(adev); > > - if (control->ras_num_recs) { > + if (control->ras_num_recs && control->is_eeprom_valid) { > ret = amdgpu_ras_load_bad_pages(adev); > - if (ret) > - return ret; > + if (ret) { > + control->is_eeprom_valid = false; > + return 0; > + } > > amdgpu_dpm_send_hbm_bad_pages_num( > adev, control->ras_num_bad_pages); > @@ -3534,7 +3544,7 @@ int amdgpu_ras_init_badpage_info(struct amdgpu_device > *adev) > dev_warn(adev->dev, "Failed to format > RAS > EEPROM data in V3 version!\n"); > } > > - return ret; > + return 0; > } > > int amdgpu_ras_recovery_init(struct amdgpu_device *adev, bool init_bp_info) > diff -- > git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h > index ec6d7ea37ad0..35c69ac3dbeb 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h > @@ -114,6 +114,8 @@ struct amdgpu_ras_eeprom_control { > /* Record channel info which occurred bad pages > */ > u32 bad_channel_bitmap; > + > + bool is_eeprom_valid; > }; > > /* > -- > 2.34.1