> -----Original Message----- > From: Chen, Guchun <guchun.c...@amd.com> > Sent: 2019年9月2日 10:11 > To: Zhou1, Tao <tao.zh...@amd.com>; amd-gfx@lists.freedesktop.org; > Grodzovsky, Andrey <andrey.grodzov...@amd.com>; Li, Dennis > <dennis...@amd.com>; Zhang, Hawking <hawking.zh...@amd.com> > Cc: Zhou1, Tao <tao.zh...@amd.com> > Subject: RE: [PATCH 2/4] drm/amdgpu: Hook EEPROM table to RAS > > > > -----Original Message----- > From: amd-gfx <amd-gfx-boun...@lists.freedesktop.org> On Behalf Of Tao > Zhou > Sent: Friday, August 30, 2019 8:25 PM > To: amd-gfx@lists.freedesktop.org; Grodzovsky, Andrey > <andrey.grodzov...@amd.com>; Chen, Guchun <guchun.c...@amd.com>; > Li, Dennis <dennis...@amd.com>; Zhang, Hawking > <hawking.zh...@amd.com> > Cc: Zhou1, Tao <tao.zh...@amd.com> > Subject: [PATCH 2/4] drm/amdgpu: Hook EEPROM table to RAS > > support eeprom records load and save for ras, move EEPROM records > storing to bad page reserving > > Signed-off-by: Tao Zhou <tao.zh...@amd.com> > Signed-off-by: Andrey Grodzovsky <andrey.grodzov...@amd.com> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 111 ++++++++++++++++++-- > ---- > 1 file changed, 83 insertions(+), 28 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > index 24663ec41248..02120aa3cb5d 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > @@ -1348,6 +1348,72 @@ int amdgpu_ras_add_bad_pages(struct > amdgpu_device *adev, > return ret; > } > > +/* > + * write error record array to eeprom, the function should be > + * protected by recovery_lock > + */ > +static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev) { > + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); > + struct ras_err_handler_data *data; > + struct amdgpu_ras_eeprom_control *control = > + &adev->psp.ras.ras->eeprom_control; > + int save_count; > + > + if (!con || !con->eh_data) > + return 0; > + > + data = con->eh_data; > + 
if (!data) > + return 0; > [Guchun] This check (!data) is redundant and not needed, as we have > already checked !con->eh_data earlier and the whole function is protected by > recovery_lock.
[Tao] OK, I'll remove it. > > + save_count = data->count - control->num_recs; > + /* only new entries are saved */ > + if (save_count > 0) > + if (amdgpu_ras_eeprom_process_recods(&con- > >eeprom_control, > + &data->bps[control- > >num_recs], > + true, > + save_count)) { > + DRM_ERROR("Failed to save EEPROM table data!"); > + return -EIO; > + } > + > + return 0; > +} > + > +/* > + * read error record array in eeprom and reserve enough space for > + * storing new bad pages > + */ > +static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev) { > + struct amdgpu_ras_eeprom_control *control = > + &adev->psp.ras.ras->eeprom_control; > + struct eeprom_table_record *bps = NULL; > + int ret = 0; > + > + /* no bad page record, skip eeprom access */ > + if (!control->num_recs) > + return ret; > + > + bps = kcalloc(control->num_recs, sizeof(*bps), GFP_KERNEL); > + if (!bps) > + return -ENOMEM; > + > + if (amdgpu_ras_eeprom_process_recods(control, bps, false, > + control->num_recs)) { > + DRM_ERROR("Failed to load EEPROM table records!"); > + ret = -EIO; > + goto out; > + } > + > + ret = amdgpu_ras_add_bad_pages(adev, bps, control->num_recs); > + > +out: > + kfree(bps); > + return ret; > +} > + > /* called in gpu recovery/init */ > int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev) { @@ - > 1355,7 +1421,7 @@ int amdgpu_ras_reserve_bad_pages(struct > amdgpu_device *adev) > struct ras_err_handler_data *data; > uint64_t bp; > struct amdgpu_bo *bo; > - int i; > + int i, ret = 0; > > if (!con || !con->eh_data) > return 0; > @@ -1375,9 +1441,11 @@ int amdgpu_ras_reserve_bad_pages(struct > amdgpu_device *adev) > data->bps_bo[i] = bo; > data->last_reserved = i + 1; > } > + > + ret = amdgpu_ras_save_bad_pages(adev); > out: > mutex_unlock(&con->recovery_lock); > - return 0; > + return ret; > } > > /* called when driver unload */ > @@ -1409,33 +1477,11 @@ static int amdgpu_ras_release_bad_pages(struct > amdgpu_device *adev) > return 0; > } > > -static int 
amdgpu_ras_save_bad_pages(struct amdgpu_device *adev) -{ > - /* TODO > - * write the array to eeprom when SMU disabled. > - */ > - return 0; > -} > - > -/* > - * read error record array in eeprom and reserve enough space for > - * storing new bad pages > - */ > -static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev) -{ > - struct eeprom_table_record *bps = NULL; > - int ret; > - > - ret = amdgpu_ras_add_bad_pages(adev, bps, > - adev->umc.max_ras_err_cnt_per_query); > - > - return ret; > -} > - > static int amdgpu_ras_recovery_init(struct amdgpu_device *adev) { > struct amdgpu_ras *con = amdgpu_ras_get_context(adev); > struct ras_err_handler_data **data = &con->eh_data; > + int ret; > > *data = kmalloc(sizeof(**data), > GFP_KERNEL|__GFP_ZERO); > @@ -1447,8 +1493,18 @@ static int amdgpu_ras_recovery_init(struct > amdgpu_device *adev) > atomic_set(&con->in_recovery, 0); > con->adev = adev; > > - amdgpu_ras_load_bad_pages(adev); > - amdgpu_ras_reserve_bad_pages(adev); > + ret = amdgpu_ras_eeprom_init(&adev->psp.ras.ras- > >eeprom_control); > + if (ret) > + return ret; > + > + if (adev->psp.ras.ras->eeprom_control.num_recs) { > + ret = amdgpu_ras_load_bad_pages(adev); > + if (ret) > + return ret; > + ret = amdgpu_ras_reserve_bad_pages(adev); > + if (ret) > + return ret; > + } > > return 0; > } > @@ -1459,7 +1515,6 @@ static int amdgpu_ras_recovery_fini(struct > amdgpu_device *adev) > struct ras_err_handler_data *data = con->eh_data; > > cancel_work_sync(&con->recovery_work); > - amdgpu_ras_save_bad_pages(adev); > amdgpu_ras_release_bad_pages(adev); > > mutex_lock(&con->recovery_lock); > -- > 2.17.1 > > _______________________________________________ > amd-gfx mailing list > amd-gfx@lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/amd-gfx _______________________________________________ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx