On 5/27/2025 4:19 PM, Emily Deng wrote:
> For the suspend and resume process, exclusive access is not required.
> Therefore, it can be moved out of the full access section to reduce the
> duration of exclusive access.
>
> Signed-off-by: Emily Deng <emily.d...@amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 16 +++++++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 13 +++++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12 +++++--
> drivers/gpu/drm/amd/amdkfd/kfd_device.c | 40 +++++++++++++++++-----
> 4 files changed, 70 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> index 4cec3a873995..ba07e9c6619d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> @@ -264,6 +264,22 @@ int amdgpu_amdkfd_resume(struct amdgpu_device *adev,
> bool run_pm)
> return r;
> }
>
> +void amdgpu_amdkfd_suspend_process(struct amdgpu_device *adev, bool run_pm)
> +{
> + if (adev->kfd.dev)
> + kgd2kfd_suspend_process(adev->kfd.dev, run_pm);
> +}
> +
> +int amdgpu_amdkfd_resume_process(struct amdgpu_device *adev, bool run_pm)
> +{
> + int r = 0;
> +
> + if (adev->kfd.dev)
> + r = kgd2kfd_resume_process(adev->kfd.dev, run_pm);
> +
> + return r;
> +}
> +
> int amdgpu_amdkfd_pre_reset(struct amdgpu_device *adev,
> struct amdgpu_reset_context *reset_context)
> {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> index b6ca41859b53..841ae8b75ab1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> @@ -156,6 +156,8 @@ void amdgpu_amdkfd_fini(void);
>
> void amdgpu_amdkfd_suspend(struct amdgpu_device *adev, bool run_pm);
> int amdgpu_amdkfd_resume(struct amdgpu_device *adev, bool run_pm);
> +void amdgpu_amdkfd_suspend_process(struct amdgpu_device *adev, bool run_pm);
> +int amdgpu_amdkfd_resume_process(struct amdgpu_device *adev, bool run_pm);
> void amdgpu_amdkfd_interrupt(struct amdgpu_device *adev,
> const void *ih_ring_entry);
> void amdgpu_amdkfd_device_probe(struct amdgpu_device *adev);
> @@ -413,6 +415,8 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
> void kgd2kfd_device_exit(struct kfd_dev *kfd);
> void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm);
> int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm);
> +void kgd2kfd_suspend_process(struct kfd_dev *kfd, bool run_pm);
> +int kgd2kfd_resume_process(struct kfd_dev *kfd, bool run_pm);
> int kgd2kfd_pre_reset(struct kfd_dev *kfd,
> struct amdgpu_reset_context *reset_context);
> int kgd2kfd_post_reset(struct kfd_dev *kfd);
> @@ -463,6 +467,15 @@ static inline int kgd2kfd_resume(struct kfd_dev *kfd,
> bool run_pm)
> return 0;
> }
>
> +static inline void kgd2kfd_suspend_process(struct kfd_dev *kfd, bool run_pm)
> +{
> +}
> +
> +static inline int kgd2kfd_resume_process(struct kfd_dev *kfd, bool run_pm)
> +{
> + return 0;
> +}
> +
> static inline int kgd2kfd_pre_reset(struct kfd_dev *kfd,
> struct amdgpu_reset_context *reset_context)
> {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 625c416c7d45..6e29f8bd54bb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -5080,7 +5080,7 @@ int amdgpu_device_suspend(struct drm_device *dev, bool
> notify_clients)
> amdgpu_device_ip_suspend_phase1(adev);
>
> if (!adev->in_s0ix) {
> - amdgpu_amdkfd_suspend(adev, adev->in_runpm);
> + amdgpu_amdkfd_suspend(adev, !amdgpu_sriov_runtime(adev) ||
> adev->in_runpm);
> amdgpu_userq_suspend(adev);
> }
>
> @@ -5097,6 +5097,9 @@ int amdgpu_device_suspend(struct drm_device *dev, bool
> notify_clients)
> if (amdgpu_sriov_vf(adev))
> amdgpu_virt_release_full_gpu(adev, false);
>
> + if (!adev->in_s0ix)
> + amdgpu_amdkfd_suspend_process(adev, adev->in_runpm);
> +
> r = amdgpu_dpm_notify_rlc_state(adev, false);
> if (r)
> return r;
> @@ -5178,7 +5181,7 @@ int amdgpu_device_resume(struct drm_device *dev, bool
> notify_clients)
> }
>
> if (!adev->in_s0ix) {
> - r = amdgpu_amdkfd_resume(adev, adev->in_runpm);
> + r = amdgpu_amdkfd_resume(adev, !amdgpu_sriov_runtime(adev) ||
> adev->in_runpm);
> if (r)
> goto exit;
>
> @@ -5199,6 +5202,11 @@ int amdgpu_device_resume(struct drm_device *dev, bool
> notify_clients)
> amdgpu_virt_release_full_gpu(adev, true);
> }
>
> + if (!adev->in_s0ix) {
> + r = amdgpu_amdkfd_resume_process(adev, adev->in_runpm);
Generic logic is to suspend all processes, which should also result in
stopping any submissions to hardware before proceeding with hardware
uninit. How is this different for a VF? Is there a different mechanism
for a graceful suspend / prevention of hardware access? Or, does that
result in abrupt errors in processes?
That aside, the refactor causes the call to be made twice on bare metal.
Thanks,
Lijo
> + if (r)
> + goto exit;
> + }
> if (r)
> return r;
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> index bf0854bd5555..22c6ef7c42b6 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> @@ -1027,15 +1027,7 @@ void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm)
> if (!kfd->init_complete)
> return;
>
> - /* for runtime suspend, skip locking kfd */
> - if (!run_pm) {
> - mutex_lock(&kfd_processes_mutex);
> - /* For first KFD device suspend all the KFD processes */
> - if (++kfd_locked == 1)
> - kfd_suspend_all_processes();
> - mutex_unlock(&kfd_processes_mutex);
> - }
> -
> + kgd2kfd_suspend_process(kfd, run_pm);
> for (i = 0; i < kfd->num_nodes; i++) {
> node = kfd->nodes[i];
> node->dqm->ops.stop(node->dqm);
> @@ -1055,6 +1047,36 @@ int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm)
> return ret;
> }
>
> + ret = kgd2kfd_resume_process(kfd, run_pm);
> +
> + return ret;
> +}
> +
> +void kgd2kfd_suspend_process(struct kfd_dev *kfd, bool run_pm)
> +{
> + struct kfd_node *node;
> + int i;
> +
> + if (!kfd->init_complete)
> + return;
> +
> + /* for runtime suspend, skip locking kfd */
> + if (!run_pm) {
> + mutex_lock(&kfd_processes_mutex);
> + /* For first KFD device suspend all the KFD processes */
> + if (++kfd_locked == 1)
> + kfd_suspend_all_processes();
> + mutex_unlock(&kfd_processes_mutex);
> + }
> +}
> +
> +int kgd2kfd_resume_process(struct kfd_dev *kfd, bool run_pm)
> +{
> + int ret, i;
> +
> + if (!kfd->init_complete)
> + return 0;
> +
> /* for runtime resume, skip unlocking kfd */
> if (!run_pm) {
> mutex_lock(&kfd_processes_mutex);