On 6/4/2025 1:18 PM, Emily Deng wrote:
> For the suspend and resume process, exclusive access is not required.
> Therefore, it can be moved out of the full access section to reduce the
> duration of exclusive access.
> 
> v3:
> Move suspend processes before hardware fini.
> Remove twice call for bare metal.
> 
> Signed-off-by: Emily Deng <emily.d...@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 16 +++++++++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 13 +++++++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  9 +++--
>  drivers/gpu/drm/amd/amdkfd/kfd_device.c    | 40 +++++++++++++++++-----
>  4 files changed, 67 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> index d8ac4b1051a8..0a8e7835d0fc 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> @@ -264,6 +264,22 @@ int amdgpu_amdkfd_resume(struct amdgpu_device *adev, 
> bool run_pm)
>       return r;
>  }
>  
> +void amdgpu_amdkfd_suspend_process(struct amdgpu_device *adev, bool run_pm)

The run_pm flag is not required in the *_suspend_process/*_resume_process
calls. It can be checked at the call site instead: call these functions
only when the flags require it.

> +{
> +     if (adev->kfd.dev)
> +             kgd2kfd_suspend_process(adev->kfd.dev, run_pm);
> +}
> +
> +int amdgpu_amdkfd_resume_process(struct amdgpu_device *adev, bool run_pm)
> +{
> +     int r = 0;
> +
> +     if (adev->kfd.dev)
> +             r = kgd2kfd_resume_process(adev->kfd.dev, run_pm);
> +
> +     return r;
> +}
> +
>  int amdgpu_amdkfd_pre_reset(struct amdgpu_device *adev,
>                           struct amdgpu_reset_context *reset_context)
>  {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> index b6ca41859b53..841ae8b75ab1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> @@ -156,6 +156,8 @@ void amdgpu_amdkfd_fini(void);
>  
>  void amdgpu_amdkfd_suspend(struct amdgpu_device *adev, bool run_pm);
>  int amdgpu_amdkfd_resume(struct amdgpu_device *adev, bool run_pm);
> +void amdgpu_amdkfd_suspend_process(struct amdgpu_device *adev, bool run_pm);
> +int amdgpu_amdkfd_resume_process(struct amdgpu_device *adev, bool run_pm);
>  void amdgpu_amdkfd_interrupt(struct amdgpu_device *adev,
>                       const void *ih_ring_entry);
>  void amdgpu_amdkfd_device_probe(struct amdgpu_device *adev);
> @@ -413,6 +415,8 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
>  void kgd2kfd_device_exit(struct kfd_dev *kfd);
>  void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm);
>  int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm);

The run_pm parameter in the above two calls may be renamed to a
suspend_process/resume_process flag. That aligns better with the generic
logic: process suspend/resume is handled based on the suspend/resume flags.

> +void kgd2kfd_suspend_process(struct kfd_dev *kfd, bool run_pm);
> +int kgd2kfd_resume_process(struct kfd_dev *kfd, bool run_pm);
>  int kgd2kfd_pre_reset(struct kfd_dev *kfd,
>                     struct amdgpu_reset_context *reset_context);
>  int kgd2kfd_post_reset(struct kfd_dev *kfd);
> @@ -463,6 +467,15 @@ static inline int kgd2kfd_resume(struct kfd_dev *kfd, 
> bool run_pm)
>       return 0;
>  }
>  
> +static inline void kgd2kfd_suspend_process(struct kfd_dev *kfd, bool run_pm)
> +{
> +}
> +
> +static inline int kgd2kfd_resume_process(struct kfd_dev *kfd, bool run_pm)
> +{
> +     return 0;
> +}
> +
>  static inline int kgd2kfd_pre_reset(struct kfd_dev *kfd,
>                                   struct amdgpu_reset_context *reset_context)
>  {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 5289400879ec..08ff9917c62f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -5061,6 +5061,8 @@ int amdgpu_device_suspend(struct drm_device *dev, bool 
> notify_clients)
>       adev->in_suspend = true;
>  
>       if (amdgpu_sriov_vf(adev)) {
> +             if (!adev->in_s0ix)
> +                     amdgpu_amdkfd_suspend_process(adev, adev->in_runpm);
>               amdgpu_virt_fini_data_exchange(adev);
>               r = amdgpu_virt_request_full_gpu(adev, false);
>               if (r)
> @@ -5080,7 +5082,7 @@ int amdgpu_device_suspend(struct drm_device *dev, bool 
> notify_clients)
>       amdgpu_device_ip_suspend_phase1(adev);
>  
>       if (!adev->in_s0ix) {
> -             amdgpu_amdkfd_suspend(adev, adev->in_runpm);
> +             amdgpu_amdkfd_suspend(adev, amdgpu_sriov_vf(adev) || 
> adev->in_runpm);
>               amdgpu_userq_suspend(adev);
>       }
>  
> @@ -5178,7 +5180,7 @@ int amdgpu_device_resume(struct drm_device *dev, bool 
> notify_clients)
>       }
>  
>       if (!adev->in_s0ix) {
> -             r = amdgpu_amdkfd_resume(adev, adev->in_runpm);
> +             r = amdgpu_amdkfd_resume(adev, amdgpu_sriov_vf(adev) || 
> adev->in_runpm);
>               if (r)
>                       goto exit;
>  
> @@ -5197,6 +5199,9 @@ int amdgpu_device_resume(struct drm_device *dev, bool 
> notify_clients)
>       if (amdgpu_sriov_vf(adev)) {
>               amdgpu_virt_init_data_exchange(adev);
>               amdgpu_virt_release_full_gpu(adev, true);
> +
> +             if (!adev->in_s0ix && !r)
> +                     r = amdgpu_amdkfd_resume_process(adev, adev->in_runpm);
>       }
>  
>       if (r)
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> index bf0854bd5555..22c6ef7c42b6 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> @@ -1027,15 +1027,7 @@ void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm)
>       if (!kfd->init_complete)
>               return;
>  
> -     /* for runtime suspend, skip locking kfd */
> -     if (!run_pm) {
> -             mutex_lock(&kfd_processes_mutex);
> -             /* For first KFD device suspend all the KFD processes */
> -             if (++kfd_locked == 1)
> -                     kfd_suspend_all_processes();
> -             mutex_unlock(&kfd_processes_mutex);
> -     }
> -
> +     kgd2kfd_suspend_process(kfd, run_pm);

To clarify the earlier comments further, this would become:

if (suspend_process)
        kgd2kfd_suspend_process(kfd);

Same for resume below.

Thanks,
Lijo

>       for (i = 0; i < kfd->num_nodes; i++) {
>               node = kfd->nodes[i];
>               node->dqm->ops.stop(node->dqm);
> @@ -1055,6 +1047,36 @@ int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm)
>                       return ret;
>       }
>  
> +     ret = kgd2kfd_resume_process(kfd, run_pm);
> +
> +     return ret;
> +}
> +
> +void kgd2kfd_suspend_process(struct kfd_dev *kfd, bool run_pm)
> +{
> +     struct kfd_node *node;
> +     int i;
> +
> +     if (!kfd->init_complete)
> +             return;
> +
> +     /* for runtime suspend, skip locking kfd */
> +     if (!run_pm) {
> +             mutex_lock(&kfd_processes_mutex);
> +             /* For first KFD device suspend all the KFD processes */
> +             if (++kfd_locked == 1)
> +                     kfd_suspend_all_processes();
> +             mutex_unlock(&kfd_processes_mutex);
> +     }
> +}
> +
> +int kgd2kfd_resume_process(struct kfd_dev *kfd, bool run_pm)
> +{
> +     int ret, i;
> +
> +     if (!kfd->init_complete)
> +             return 0;
> +
>       /* for runtime resume, skip unlocking kfd */
>       if (!run_pm) {
>               mutex_lock(&kfd_processes_mutex);

Reply via email to