[AMD Official Use Only - AMD Internal Distribution Only] Reviewed-by: Tao Zhou <tao.zh...@amd.com>
> -----Original Message----- > From: Sun, Ce(Overlord) <ce....@amd.com> > Sent: Tuesday, July 29, 2025 11:23 AM > To: amd-gfx@lists.freedesktop.org > Cc: Zhang, Hawking <hawking.zh...@amd.com>; Zhou1, Tao > <tao.zh...@amd.com>; Yang, Stanley <stanley.y...@amd.com>; Sun, > Ce(Overlord) <ce....@amd.com> > Subject: [PATCH] drm/amdgpu: Avoid rma causes GPU duplicate reset > > Try to ensure poison creation handler is completed in time to set device rma > value. > > Signed-off-by: Ce Sun <cesun...@amd.com> > Signed-off-by: Stanley.Yang <stanley.y...@amd.com> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 17 ++++++++++------- > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 1 + > 2 files changed, 11 insertions(+), 7 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > index ac7099d03e89..eea175874ba0 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > @@ -3356,7 +3356,6 @@ static void amdgpu_ras_do_page_retirement(struct > work_struct *work) > page_retirement_dwork.work); > struct amdgpu_device *adev = con->adev; > struct ras_err_data err_data; > - unsigned long err_cnt; > > /* If gpu reset is ongoing, delay retiring the bad pages */ > if (amdgpu_in_reset(adev) || amdgpu_ras_in_recovery(adev)) { @@ -3368,13 > +3367,9 @@ static void amdgpu_ras_do_page_retirement(struct work_struct *work) > amdgpu_ras_error_data_init(&err_data); > > amdgpu_umc_handle_bad_pages(adev, &err_data); > - err_cnt = err_data.err_addr_cnt; > > amdgpu_ras_error_data_fini(&err_data); > > - if (err_cnt && amdgpu_ras_is_rma(adev)) > - amdgpu_ras_reset_gpu(adev); > - > amdgpu_ras_schedule_retirement_dwork(con, > AMDGPU_RAS_RETIRE_PAGE_INTERVAL); > } > @@ -3428,6 +3423,9 @@ static int amdgpu_ras_poison_creation_handler(struct > amdgpu_device *adev, > if (total_detect_count) > schedule_delayed_work(&ras->page_retirement_dwork, 0); > > + if (amdgpu_ras_is_rma(adev) && 
atomic_cmpxchg(&ras->rma_in_recovery, > 0, 1) == 0) > + amdgpu_ras_reset_gpu(adev); > + > return 0; > } > > @@ -3464,6 +3462,12 @@ static int > amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev, > reset_flags |= msg.reset; > } > > + /* > + * Try to ensure poison creation handler is completed first > + * to set rma if bad page exceed threshold. > + */ > + flush_delayed_work(&con->page_retirement_dwork); > + > /* for RMA, amdgpu_ras_poison_creation_handler will trigger gpu reset */ > if (reset_flags && !amdgpu_ras_is_rma(adev)) { > if (reset_flags & AMDGPU_RAS_GPU_RESET_MODE1_RESET) > @@ -3473,8 +3477,6 @@ static int > amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev, > else > reset = reset_flags; > > - flush_delayed_work(&con->page_retirement_dwork); > - > con->gpu_reset_flags |= reset; > amdgpu_ras_reset_gpu(adev); > > @@ -3645,6 +3647,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device > *adev, bool init_bp_info) > mutex_init(&con->recovery_lock); > INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery); > atomic_set(&con->in_recovery, 0); > + atomic_set(&con->rma_in_recovery, 0); > con->eeprom_control.bad_channel_bitmap = 0; > > max_eeprom_records_count = > amdgpu_ras_eeprom_max_record_count(&con->eeprom_control); > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > index 7f10a7402160..662046ab73ba 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > @@ -515,6 +515,7 @@ struct amdgpu_ras { > /* gpu recovery */ > struct work_struct recovery_work; > atomic_t in_recovery; > + atomic_t rma_in_recovery; > struct amdgpu_device *adev; > /* error handler data */ > struct ras_err_handler_data *eh_data; > -- > 2.34.1