[AMD Official Use Only - AMD Internal Distribution Only]

>-----Original Message-----
>From: Lazar, Lijo <lijo.la...@amd.com>
>Sent: Wednesday, June 4, 2025 2:54 PM
>To: Deng, Emily <emily.d...@amd.com>; amd-gfx@lists.freedesktop.org
>Subject: Re: [PATCH v2] drm/amdkfd: Move the process suspend and resume out of
>full access
>
>
>
>On 5/27/2025 4:19 PM, Emily Deng wrote:
>> For the suspend and resume process, exclusive access is not required.
>> Therefore, it can be moved out of the full access section to reduce
>> the duration of exclusive access.
>>
>> Signed-off-by: Emily Deng <emily.d...@amd.com>
>> ---
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 16 +++++++++
>> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 13 +++++++
>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12 +++++--
>>  drivers/gpu/drm/amd/amdkfd/kfd_device.c    | 40 +++++++++++++++++-----
>>  4 files changed, 70 insertions(+), 11 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>> index 4cec3a873995..ba07e9c6619d 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>> @@ -264,6 +264,22 @@ int amdgpu_amdkfd_resume(struct amdgpu_device
>*adev, bool run_pm)
>>      return r;
>>  }
>>
>> +void amdgpu_amdkfd_suspend_process(struct amdgpu_device *adev, bool
>> +run_pm) {
>> +    if (adev->kfd.dev)
>> +            kgd2kfd_suspend_process(adev->kfd.dev, run_pm); }
>> +
>> +int amdgpu_amdkfd_resume_process(struct amdgpu_device *adev, bool
>> +run_pm) {
>> +    int r = 0;
>> +
>> +    if (adev->kfd.dev)
>> +            r = kgd2kfd_resume_process(adev->kfd.dev, run_pm);
>> +
>> +    return r;
>> +}
>> +
>>  int amdgpu_amdkfd_pre_reset(struct amdgpu_device *adev,
>>                          struct amdgpu_reset_context *reset_context)  { diff 
>> --git
>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>> index b6ca41859b53..841ae8b75ab1 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>> @@ -156,6 +156,8 @@ void amdgpu_amdkfd_fini(void);
>>
>>  void amdgpu_amdkfd_suspend(struct amdgpu_device *adev, bool run_pm);
>> int amdgpu_amdkfd_resume(struct amdgpu_device *adev, bool run_pm);
>> +void amdgpu_amdkfd_suspend_process(struct amdgpu_device *adev, bool
>> +run_pm); int amdgpu_amdkfd_resume_process(struct amdgpu_device *adev,
>> +bool run_pm);
>>  void amdgpu_amdkfd_interrupt(struct amdgpu_device *adev,
>>                      const void *ih_ring_entry);
>>  void amdgpu_amdkfd_device_probe(struct amdgpu_device *adev); @@
>> -413,6 +415,8 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,  void
>> kgd2kfd_device_exit(struct kfd_dev *kfd);  void kgd2kfd_suspend(struct
>> kfd_dev *kfd, bool run_pm);  int kgd2kfd_resume(struct kfd_dev *kfd,
>> bool run_pm);
>> +void kgd2kfd_suspend_process(struct kfd_dev *kfd, bool run_pm); int
>> +kgd2kfd_resume_process(struct kfd_dev *kfd, bool run_pm);
>>  int kgd2kfd_pre_reset(struct kfd_dev *kfd,
>>                    struct amdgpu_reset_context *reset_context);  int
>> kgd2kfd_post_reset(struct kfd_dev *kfd); @@ -463,6 +467,15 @@ static
>> inline int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm)
>>      return 0;
>>  }
>>
>> +static inline void kgd2kfd_suspend_process(struct kfd_dev *kfd, bool
>> +run_pm) { }
>> +
>> +static inline int kgd2kfd_resume_process(struct kfd_dev *kfd, bool
>> +run_pm) {
>> +    return 0;
>> +}
>> +
>>  static inline int kgd2kfd_pre_reset(struct kfd_dev *kfd,
>>                                  struct amdgpu_reset_context *reset_context) 
>>  { diff --
>git
>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index 625c416c7d45..6e29f8bd54bb 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -5080,7 +5080,7 @@ int amdgpu_device_suspend(struct drm_device *dev,
>bool notify_clients)
>>      amdgpu_device_ip_suspend_phase1(adev);
>>
>>      if (!adev->in_s0ix) {
>> -            amdgpu_amdkfd_suspend(adev, adev->in_runpm);
>> +            amdgpu_amdkfd_suspend(adev, !amdgpu_sriov_runtime(adev) ||
>> +adev->in_runpm);
>>              amdgpu_userq_suspend(adev);
>>      }
>>
>> @@ -5097,6 +5097,9 @@ int amdgpu_device_suspend(struct drm_device *dev,
>bool notify_clients)
>>      if (amdgpu_sriov_vf(adev))
>>              amdgpu_virt_release_full_gpu(adev, false);
>>
>> +    if (!adev->in_s0ix)
>> +            amdgpu_amdkfd_suspend_process(adev, adev->in_runpm);
>> +
>>      r = amdgpu_dpm_notify_rlc_state(adev, false);
>>      if (r)
>>              return r;
>> @@ -5178,7 +5181,7 @@ int amdgpu_device_resume(struct drm_device *dev,
>bool notify_clients)
>>      }
>>
>>      if (!adev->in_s0ix) {
>> -            r = amdgpu_amdkfd_resume(adev, adev->in_runpm);
>> +            r = amdgpu_amdkfd_resume(adev, !amdgpu_sriov_runtime(adev) ||
>> +adev->in_runpm);
>>              if (r)
>>                      goto exit;
>>
>> @@ -5199,6 +5202,11 @@ int amdgpu_device_resume(struct drm_device *dev,
>bool notify_clients)
>>              amdgpu_virt_release_full_gpu(adev, true);
>>      }
>>
>> +    if (!adev->in_s0ix) {
>> +            r = amdgpu_amdkfd_resume_process(adev, adev->in_runpm);
>
>Generic logic is to suspend all processes which should also result in stopping 
>any
>submissions to hardware before proceeding with hardware uninit. How is this 
>different
>for a VF? Is there a different mechanism done for a gracious suspend/hardware
>access prevention? Or, does that result in abrupt errors in processes?
>
>That aside, the refactor causes the call to be made twice on bare metal.
>
>Thanks,
>Lijo
You are right, should suspend all processes before hardware fini. And will also 
repair the twice issue for bare metal.

Emily Deng
Best Wishes


>
>> +            if (r)
>> +                    goto exit;
>> +    }
>>      if (r)
>>              return r;
>>
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>> b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>> index bf0854bd5555..22c6ef7c42b6 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>> @@ -1027,15 +1027,7 @@ void kgd2kfd_suspend(struct kfd_dev *kfd, bool
>run_pm)
>>      if (!kfd->init_complete)
>>              return;
>>
>> -    /* for runtime suspend, skip locking kfd */
>> -    if (!run_pm) {
>> -            mutex_lock(&kfd_processes_mutex);
>> -            /* For first KFD device suspend all the KFD processes */
>> -            if (++kfd_locked == 1)
>> -                    kfd_suspend_all_processes();
>> -            mutex_unlock(&kfd_processes_mutex);
>> -    }
>> -
>> +    kgd2kfd_suspend_process(kfd, run_pm);
>>      for (i = 0; i < kfd->num_nodes; i++) {
>>              node = kfd->nodes[i];
>>              node->dqm->ops.stop(node->dqm);
>> @@ -1055,6 +1047,36 @@ int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm)
>>                      return ret;
>>      }
>>
>> +    ret = kgd2kfd_resume_process(kfd, run_pm);
>> +
>> +    return ret;
>> +}
>> +
>> +void kgd2kfd_suspend_process(struct kfd_dev *kfd, bool run_pm) {
>> +    struct kfd_node *node;
>> +    int i;
>> +
>> +    if (!kfd->init_complete)
>> +            return;
>> +
>> +    /* for runtime suspend, skip locking kfd */
>> +    if (!run_pm) {
>> +            mutex_lock(&kfd_processes_mutex);
>> +            /* For first KFD device suspend all the KFD processes */
>> +            if (++kfd_locked == 1)
>> +                    kfd_suspend_all_processes();
>> +            mutex_unlock(&kfd_processes_mutex);
>> +    }
>> +}
>> +
>> +int kgd2kfd_resume_process(struct kfd_dev *kfd, bool run_pm) {
>> +    int ret, i;
>> +
>> +    if (!kfd->init_complete)
>> +            return 0;
>> +
>>      /* for runtime resume, skip unlocking kfd */
>>      if (!run_pm) {
>>              mutex_lock(&kfd_processes_mutex);

Reply via email to