On 4/26/2024 9:27 AM, Yunxiang Li wrote:
> Sometimes a hung GPU causes multiple reset sources to schedule resets.
> The second source will be able to trigger an unnecessary reset if it
> schedules after we call amdgpu_device_stop_pending_resets.
> 
> Move amdgpu_device_stop_pending_resets to after the reset is done. Since
> at this point the GPU is supposedly in a good state, any reset scheduled
> after this point would be a legitimate reset.
> 
> Remove unnecessary and incorrect checks for amdgpu_in_reset that were
> kind of serving this purpose.
> 
> Signed-off-by: Yunxiang Li <yunxiang...@amd.com>

Reviewed-by: Lijo Lazar <lijo.la...@amd.com>

Thanks,
Lijo
> ---
> v2: instead of adding amdgpu_in_reset check, move when we cancel pending
> resets
> v3: no changes from v2, collect all the patches in one series for easier 
> review
> 
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 19 ++++++++++---------
>  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c   |  2 +-
>  drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c      |  2 +-
>  drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c      |  2 +-
>  drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c      |  2 +-
>  5 files changed, 14 insertions(+), 13 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 861ccff78af9..8befd10bf007 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -5070,8 +5070,6 @@ static int amdgpu_device_reset_sriov(struct 
> amdgpu_device *adev,
>  retry:
>       amdgpu_amdkfd_pre_reset(adev);
>  
> -     amdgpu_device_stop_pending_resets(adev);
> -
>       if (from_hypervisor)
>               r = amdgpu_virt_request_full_gpu(adev, true);
>       else
> @@ -5823,13 +5821,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device 
> *adev,
>                                 r, adev_to_drm(tmp_adev)->unique);
>                       tmp_adev->asic_reset_res = r;
>               }
> -
> -             if (!amdgpu_sriov_vf(tmp_adev))
> -                     /*
> -                     * Drop all pending non scheduler resets. Scheduler 
> resets
> -                     * were already dropped during drm_sched_stop
> -                     */
> -                     amdgpu_device_stop_pending_resets(tmp_adev);
>       }
>  
>       /* Actual ASIC resets if needed.*/
> @@ -5851,6 +5842,16 @@ int amdgpu_device_gpu_recover(struct amdgpu_device 
> *adev,
>                       goto retry;
>       }
>  
> +     list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
> +             /*
> +              * Drop any pending non scheduler resets queued before reset is 
> done.
> +              * Any reset scheduled after this point would be valid. 
> Scheduler resets
> +              * were already dropped during drm_sched_stop and no new ones 
> can come
> +              * in before drm_sched_start.
> +              */
> +             amdgpu_device_stop_pending_resets(tmp_adev);
> +     }
> +
>  skip_hw_reset:
>  
>       /* Post ASIC reset for all devs .*/
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> index 54ab51a4ada7..c2385178d6b3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> @@ -597,7 +597,7 @@ static void amdgpu_virt_update_vf2pf_work_item(struct 
> work_struct *work)
>       if (ret) {
>               adev->virt.vf2pf_update_retry_cnt++;
>               if ((adev->virt.vf2pf_update_retry_cnt >= 
> AMDGPU_VF2PF_UPDATE_MAX_RETRY_LIMIT) &&
> -                 amdgpu_sriov_runtime(adev) && !amdgpu_in_reset(adev)) {
> +                 amdgpu_sriov_runtime(adev)) {
>                       amdgpu_ras_set_fed(adev, true);
>                       if (amdgpu_reset_domain_schedule(adev->reset_domain,
>                                                         &adev->virt.flr_work))
> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c 
> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> index 0c7275bca8f7..c5ba9c4757a8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> @@ -319,7 +319,7 @@ static int xgpu_ai_mailbox_rcv_irq(struct amdgpu_device 
> *adev,
>  
>       switch (event) {
>               case IDH_FLR_NOTIFICATION:
> -             if (amdgpu_sriov_runtime(adev) && !amdgpu_in_reset(adev))
> +             if (amdgpu_sriov_runtime(adev))
>                       
> WARN_ONCE(!amdgpu_reset_domain_schedule(adev->reset_domain,
>                                                               
> &adev->virt.flr_work),
>                                 "Failed to queue work! at %s",
> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c 
> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
> index aba00d961627..fa9d1b02f391 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
> @@ -358,7 +358,7 @@ static int xgpu_nv_mailbox_rcv_irq(struct amdgpu_device 
> *adev,
>  
>       switch (event) {
>       case IDH_FLR_NOTIFICATION:
> -             if (amdgpu_sriov_runtime(adev) && !amdgpu_in_reset(adev))
> +             if (amdgpu_sriov_runtime(adev))
>                       
> WARN_ONCE(!amdgpu_reset_domain_schedule(adev->reset_domain,
>                                  &adev->virt.flr_work),
>                                 "Failed to queue work! at %s",
> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c 
> b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
> index 59f53c743362..14a065516ae4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
> @@ -560,7 +560,7 @@ static int xgpu_vi_mailbox_rcv_irq(struct amdgpu_device 
> *adev,
>               r = xgpu_vi_mailbox_rcv_msg(adev, IDH_FLR_NOTIFICATION);
>  
>               /* only handle FLR_NOTIFY now */
> -             if (!r && !amdgpu_in_reset(adev))
> +             if (!r)
>                       
> WARN_ONCE(!amdgpu_reset_domain_schedule(adev->reset_domain,
>                                                               
> &adev->virt.flr_work),
>                                 "Failed to queue work! at %s",

Reply via email to