On 6/6/2025 3:38 PM, Emily Deng wrote:
> Suspending and resuming the KFD processes does not require exclusive
> (full GPU) access. Therefore, under SR-IOV, move these steps out of the
> full-access section to reduce the duration of exclusive access.
> 
> v3:
> Move the suspension of processes before hardware fini.
> Remove the duplicate call on bare metal.
> 
> v4:
> Refine code
> 
> Signed-off-by: Emily Deng <emily.d...@amd.com>

Acked-by: Lijo Lazar <lijo.la...@amd.com>

Cc'ing Mukul and Harish to take a look.

Thanks,
Lijo


> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c    | 24 ++++++--
>  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h    | 25 ++++++--
>  .../drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c   |  4 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c    | 11 +++-
>  drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c      |  4 +-
>  drivers/gpu/drm/amd/amdkfd/kfd_device.c       | 57 ++++++++++++-------
>  6 files changed, 89 insertions(+), 36 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> index d8ac4b1051a8..fe282b855734 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> @@ -248,18 +248,34 @@ void amdgpu_amdkfd_interrupt(struct amdgpu_device *adev,
>               kgd2kfd_interrupt(adev->kfd.dev, ih_ring_entry);
>  }
>  
> -void amdgpu_amdkfd_suspend(struct amdgpu_device *adev, bool run_pm)
> +void amdgpu_amdkfd_suspend(struct amdgpu_device *adev, bool suspend_proc)
>  {
>       if (adev->kfd.dev)
> -             kgd2kfd_suspend(adev->kfd.dev, run_pm);
> +             kgd2kfd_suspend(adev->kfd.dev, suspend_proc);
>  }
>  
> -int amdgpu_amdkfd_resume(struct amdgpu_device *adev, bool run_pm)
> +int amdgpu_amdkfd_resume(struct amdgpu_device *adev, bool resume_proc)
>  {
>       int r = 0;
>  
>       if (adev->kfd.dev)
> -             r = kgd2kfd_resume(adev->kfd.dev, run_pm);
> +             r = kgd2kfd_resume(adev->kfd.dev, resume_proc);
> +
> +     return r;
> +}
> +
> +void amdgpu_amdkfd_suspend_process(struct amdgpu_device *adev)
> +{
> +     if (adev->kfd.dev)
> +             kgd2kfd_suspend_process(adev->kfd.dev);
> +}
> +
> +int amdgpu_amdkfd_resume_process(struct amdgpu_device *adev)
> +{
> +     int r = 0;
> +
> +     if (adev->kfd.dev)
> +             r = kgd2kfd_resume_process(adev->kfd.dev);
>  
>       return r;
>  }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> index b6ca41859b53..05ecba2a85eb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> @@ -154,8 +154,10 @@ struct amdkfd_process_info {
>  int amdgpu_amdkfd_init(void);
>  void amdgpu_amdkfd_fini(void);
>  
> -void amdgpu_amdkfd_suspend(struct amdgpu_device *adev, bool run_pm);
> -int amdgpu_amdkfd_resume(struct amdgpu_device *adev, bool run_pm);
> +void amdgpu_amdkfd_suspend(struct amdgpu_device *adev, bool suspend_proc);
> +int amdgpu_amdkfd_resume(struct amdgpu_device *adev, bool resume_proc);
> +void amdgpu_amdkfd_suspend_process(struct amdgpu_device *adev);
> +int amdgpu_amdkfd_resume_process(struct amdgpu_device *adev);
>  void amdgpu_amdkfd_interrupt(struct amdgpu_device *adev,
>                       const void *ih_ring_entry);
>  void amdgpu_amdkfd_device_probe(struct amdgpu_device *adev);
> @@ -411,8 +413,10 @@ struct kfd_dev *kgd2kfd_probe(struct amdgpu_device 
> *adev, bool vf);
>  bool kgd2kfd_device_init(struct kfd_dev *kfd,
>                        const struct kgd2kfd_shared_resources *gpu_resources);
>  void kgd2kfd_device_exit(struct kfd_dev *kfd);
> -void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm);
> -int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm);
> +void kgd2kfd_suspend(struct kfd_dev *kfd, bool suspend_proc);
> +int kgd2kfd_resume(struct kfd_dev *kfd, bool resume_proc);
> +void kgd2kfd_suspend_process(struct kfd_dev *kfd);
> +int kgd2kfd_resume_process(struct kfd_dev *kfd);
>  int kgd2kfd_pre_reset(struct kfd_dev *kfd,
>                     struct amdgpu_reset_context *reset_context);
>  int kgd2kfd_post_reset(struct kfd_dev *kfd);
> @@ -454,11 +458,20 @@ static inline void kgd2kfd_device_exit(struct kfd_dev 
> *kfd)
>  {
>  }
>  
> -static inline void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm)
> +static inline void kgd2kfd_suspend(struct kfd_dev *kfd, bool suspend_proc)
>  {
>  }
>  
> -static inline int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm)
> +static inline int kgd2kfd_resume(struct kfd_dev *kfd, bool resume_proc)
> +{
> +     return 0;
> +}
> +
> +static inline void kgd2kfd_suspend_process(struct kfd_dev *kfd, bool 
> suspend_proc
> +{
> +}
> +
> +static inline int kgd2kfd_resume_process(struct kfd_dev *kfd, bool 
> resume_proc)
>  {
>       return 0;
>  }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
> index ffbaa8bc5eea..1105a09e55dc 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
> @@ -320,7 +320,7 @@ static void set_barrier_auto_waitcnt(struct amdgpu_device 
> *adev, bool enable_wai
>       if (!down_read_trylock(&adev->reset_domain->sem))
>               return;
>  
> -     amdgpu_amdkfd_suspend(adev, false);
> +     amdgpu_amdkfd_suspend(adev, true);
>  
>       if (suspend_resume_compute_scheduler(adev, true))
>               goto out;
> @@ -333,7 +333,7 @@ static void set_barrier_auto_waitcnt(struct amdgpu_device 
> *adev, bool enable_wai
>  out:
>       suspend_resume_compute_scheduler(adev, false);
>  
> -     amdgpu_amdkfd_resume(adev, false);
> +     amdgpu_amdkfd_resume(adev, true);
>  
>       up_read(&adev->reset_domain->sem);
>  }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 5289400879ec..e8b500c266c1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -3524,7 +3524,7 @@ static int amdgpu_device_ip_fini_early(struct 
> amdgpu_device *adev)
>       amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
>       amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
>  
> -     amdgpu_amdkfd_suspend(adev, false);
> +     amdgpu_amdkfd_suspend(adev, true);
>       amdgpu_userq_suspend(adev);
>  
>       /* Workaround for ASICs need to disable SMC first */
> @@ -5061,6 +5061,8 @@ int amdgpu_device_suspend(struct drm_device *dev, bool 
> notify_clients)
>       adev->in_suspend = true;
>  
>       if (amdgpu_sriov_vf(adev)) {
> +             if (!adev->in_s0ix && !adev->in_runpm)
> +                     amdgpu_amdkfd_suspend_process(adev);
>               amdgpu_virt_fini_data_exchange(adev);
>               r = amdgpu_virt_request_full_gpu(adev, false);
>               if (r)
> @@ -5080,7 +5082,7 @@ int amdgpu_device_suspend(struct drm_device *dev, bool 
> notify_clients)
>       amdgpu_device_ip_suspend_phase1(adev);
>  
>       if (!adev->in_s0ix) {
> -             amdgpu_amdkfd_suspend(adev, adev->in_runpm);
> +             amdgpu_amdkfd_suspend(adev, !amdgpu_sriov_vf(adev) && 
> !adev->in_runpm);
>               amdgpu_userq_suspend(adev);
>       }
>  
> @@ -5178,7 +5180,7 @@ int amdgpu_device_resume(struct drm_device *dev, bool 
> notify_clients)
>       }
>  
>       if (!adev->in_s0ix) {
> -             r = amdgpu_amdkfd_resume(adev, adev->in_runpm);
> +             r = amdgpu_amdkfd_resume(adev, !amdgpu_sriov_vf(adev) && 
> !adev->in_runpm);
>               if (r)
>                       goto exit;
>  
> @@ -5197,6 +5199,9 @@ int amdgpu_device_resume(struct drm_device *dev, bool 
> notify_clients)
>       if (amdgpu_sriov_vf(adev)) {
>               amdgpu_virt_init_data_exchange(adev);
>               amdgpu_virt_release_full_gpu(adev, true);
> +
> +             if (!adev->in_s0ix && !r && !adev->in_runpm)
> +                     r = amdgpu_amdkfd_resume_process(adev);
>       }
>  
>       if (r)
> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c 
> b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
> index 9c169112a5e7..48e2d67273f7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
> @@ -1676,9 +1676,9 @@ static int sdma_v4_4_2_reset_queue(struct amdgpu_ring 
> *ring, unsigned int vmid)
>       if (!(adev->sdma.supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE))
>               return -EOPNOTSUPP;
>  
> -     amdgpu_amdkfd_suspend(adev, false);
> +     amdgpu_amdkfd_suspend(adev, true);
>       r = amdgpu_sdma_reset_engine(adev, id);
> -     amdgpu_amdkfd_resume(adev, false);
> +     amdgpu_amdkfd_resume(adev, true);
>  
>       return r;
>  }
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> index bf0854bd5555..6a832f3c2518 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> @@ -971,7 +971,7 @@ int kgd2kfd_pre_reset(struct kfd_dev *kfd,
>               kfd_smi_event_update_gpu_reset(node, false, reset_context);
>       }
>  
> -     kgd2kfd_suspend(kfd, false);
> +     kgd2kfd_suspend(kfd, true);
>  
>       for (i = 0; i < kfd->num_nodes; i++)
>               kfd_signal_reset_event(kfd->nodes[i]);
> @@ -1019,7 +1019,7 @@ bool kfd_is_locked(void)
>       return  (kfd_locked > 0);
>  }
>  
> -void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm)
> +void kgd2kfd_suspend(struct kfd_dev *kfd, bool suspend_proc)
>  {
>       struct kfd_node *node;
>       int i;
> @@ -1027,14 +1027,8 @@ void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm)
>       if (!kfd->init_complete)
>               return;
>  
> -     /* for runtime suspend, skip locking kfd */
> -     if (!run_pm) {
> -             mutex_lock(&kfd_processes_mutex);
> -             /* For first KFD device suspend all the KFD processes */
> -             if (++kfd_locked == 1)
> -                     kfd_suspend_all_processes();
> -             mutex_unlock(&kfd_processes_mutex);
> -     }
> +     if (suspend_proc)
> +             kgd2kfd_suspend_process(kfd);
>  
>       for (i = 0; i < kfd->num_nodes; i++) {
>               node = kfd->nodes[i];
> @@ -1042,7 +1036,7 @@ void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm)
>       }
>  }
>  
> -int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm)
> +int kgd2kfd_resume(struct kfd_dev *kfd, bool resume_proc)
>  {
>       int ret, i;
>  
> @@ -1055,14 +1049,39 @@ int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm)
>                       return ret;
>       }
>  
> -     /* for runtime resume, skip unlocking kfd */
> -     if (!run_pm) {
> -             mutex_lock(&kfd_processes_mutex);
> -             if (--kfd_locked == 0)
> -                     ret = kfd_resume_all_processes();
> -             WARN_ONCE(kfd_locked < 0, "KFD suspend / resume ref. error");
> -             mutex_unlock(&kfd_processes_mutex);
> -     }
> +     if (resume_proc)
> +             ret = kgd2kfd_resume_process(kfd);
> +
> +     return ret;
> +}
> +
> +void kgd2kfd_suspend_process(struct kfd_dev *kfd)
> +{
> +     struct kfd_node *node;
> +     int i;
> +
> +     if (!kfd->init_complete)
> +             return;
> +
> +     mutex_lock(&kfd_processes_mutex);
> +     /* For first KFD device suspend all the KFD processes */
> +     if (++kfd_locked == 1)
> +             kfd_suspend_all_processes();
> +     mutex_unlock(&kfd_processes_mutex);
> +}
> +
> +int kgd2kfd_resume_process(struct kfd_dev *kfd)
> +{
> +     int ret, i;
> +
> +     if (!kfd->init_complete)
> +             return 0;
> +
> +     mutex_lock(&kfd_processes_mutex);
> +     if (--kfd_locked == 0)
> +             ret = kfd_resume_all_processes();
> +     WARN_ONCE(kfd_locked < 0, "KFD suspend / resume ref. error");
> +     mutex_unlock(&kfd_processes_mutex);
>  
>       return ret;
>  }

Reply via email to