These patches look good to me. JingWen will pull these patches, run some
basic TDR tests in an SR-IOV environment, and give feedback.

Best wishes
Emily Deng



>-----Original Message-----
>From: Liu, Monk <[email protected]>
>Sent: Thursday, December 23, 2021 6:14 PM
>To: Koenig, Christian <[email protected]>; Grodzovsky, Andrey
><[email protected]>; [email protected]; amd-
>[email protected]; Chen, Horace <[email protected]>; Chen,
>JingWen <[email protected]>; Deng, Emily <[email protected]>
>Cc: [email protected]
>Subject: RE: [RFC v2 8/8] drm/amd/virt: Drop concurrent GPU reset protection
>for SRIOV
>
>[AMD Official Use Only]
>
>@Chen, Horace @Chen, JingWen @Deng, Emily
>
>Please review Andrey's patch.
>
>Thanks
>-------------------------------------------------------------------
>Monk Liu | Cloud GPU & Virtualization Solution | AMD
>-------------------------------------------------------------------
>we are hiring software manager for CVS core team
>-------------------------------------------------------------------
>
>-----Original Message-----
>From: Koenig, Christian <[email protected]>
>Sent: Thursday, December 23, 2021 4:42 PM
>To: Grodzovsky, Andrey <[email protected]>; dri-
>[email protected]; [email protected]
>Cc: [email protected]; Liu, Monk <[email protected]>; Chen, Horace
><[email protected]>
>Subject: Re: [RFC v2 8/8] drm/amd/virt: Drop concurrent GPU reset protection
>for SRIOV
>
>Am 22.12.21 um 23:14 schrieb Andrey Grodzovsky:
>> Since FLR work is now serialized against GPU resets, there is no need
>> for this.
>>
>> Signed-off-by: Andrey Grodzovsky <[email protected]>
>
>Acked-by: Christian König <[email protected]>
>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 11 -----------
>>   drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 11 -----------
>>   2 files changed, 22 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>> index 487cd654b69e..7d59a66e3988 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
>> @@ -248,15 +248,7 @@ static void xgpu_ai_mailbox_flr_work(struct
>work_struct *work)
>>      struct amdgpu_device *adev = container_of(virt, struct
>amdgpu_device, virt);
>>      int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
>>
>> -    /* block amdgpu_gpu_recover till msg FLR COMPLETE received,
>> -     * otherwise the mailbox msg will be ruined/reseted by
>> -     * the VF FLR.
>> -     */
>> -    if (!down_write_trylock(&adev->reset_sem))
>> -            return;
>> -
>>      amdgpu_virt_fini_data_exchange(adev);
>> -    atomic_set(&adev->in_gpu_reset, 1);
>>
>>      xgpu_ai_mailbox_trans_msg(adev, IDH_READY_TO_RESET, 0, 0, 0);
>>
>> @@ -269,9 +261,6 @@ static void xgpu_ai_mailbox_flr_work(struct
>work_struct *work)
>>      } while (timeout > 1);
>>
>>   flr_done:
>> -    atomic_set(&adev->in_gpu_reset, 0);
>> -    up_write(&adev->reset_sem);
>> -
>>      /* Trigger recovery for world switch failure if no TDR */
>>      if (amdgpu_device_should_recover_gpu(adev)
>>              && (!amdgpu_device_has_job_running(adev) ||
>> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>> index e3869067a31d..f82c066c8e8d 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
>> @@ -277,15 +277,7 @@ static void xgpu_nv_mailbox_flr_work(struct
>work_struct *work)
>>      struct amdgpu_device *adev = container_of(virt, struct
>amdgpu_device, virt);
>>      int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
>>
>> -    /* block amdgpu_gpu_recover till msg FLR COMPLETE received,
>> -     * otherwise the mailbox msg will be ruined/reseted by
>> -     * the VF FLR.
>> -     */
>> -    if (!down_write_trylock(&adev->reset_sem))
>> -            return;
>> -
>>      amdgpu_virt_fini_data_exchange(adev);
>> -    atomic_set(&adev->in_gpu_reset, 1);
>>
>>      xgpu_nv_mailbox_trans_msg(adev, IDH_READY_TO_RESET, 0, 0, 0);
>>
>> @@ -298,9 +290,6 @@ static void xgpu_nv_mailbox_flr_work(struct
>work_struct *work)
>>      } while (timeout > 1);
>>
>>   flr_done:
>> -    atomic_set(&adev->in_gpu_reset, 0);
>> -    up_write(&adev->reset_sem);
>> -
>>      /* Trigger recovery for world switch failure if no TDR */
>>      if (amdgpu_device_should_recover_gpu(adev)
>>              && (!amdgpu_device_has_job_running(adev) ||

Reply via email to