[AMD Official Use Only - AMD Internal Distribution Only] >-----Original Message----- >From: Lazar, Lijo <lijo.la...@amd.com> >Sent: Wednesday, June 4, 2025 2:54 PM >To: Deng, Emily <emily.d...@amd.com>; amd-gfx@lists.freedesktop.org >Subject: Re: [PATCH v2] drm/amdkfd: Move the process suspend and resume out of >full access > > > >On 5/27/2025 4:19 PM, Emily Deng wrote: >> For the suspend and resume process, exclusive access is not required. >> Therefore, it can be moved out of the full access section to reduce >> the duration of exclusive access. >> >> Signed-off-by: Emily Deng <emily.d...@amd.com> >> --- >> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 16 +++++++++ >> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 13 +++++++ >> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12 +++++-- >> drivers/gpu/drm/amd/amdkfd/kfd_device.c | 40 +++++++++++++++++----- >> 4 files changed, 70 insertions(+), 11 deletions(-) >> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c >> index 4cec3a873995..ba07e9c6619d 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c >> @@ -264,6 +264,22 @@ int amdgpu_amdkfd_resume(struct amdgpu_device >*adev, bool run_pm) >> return r; >> } >> >> +void amdgpu_amdkfd_suspend_process(struct amdgpu_device *adev, bool >> +run_pm) { >> + if (adev->kfd.dev) >> + kgd2kfd_suspend_process(adev->kfd.dev, run_pm); } >> + >> +int amdgpu_amdkfd_resume_process(struct amdgpu_device *adev, bool >> +run_pm) { >> + int r = 0; >> + >> + if (adev->kfd.dev) >> + r = kgd2kfd_resume_process(adev->kfd.dev, run_pm); >> + >> + return r; >> +} >> + >> int amdgpu_amdkfd_pre_reset(struct amdgpu_device *adev, >> struct amdgpu_reset_context *reset_context) { diff >> --git >> a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h >> index b6ca41859b53..841ae8b75ab1 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h >> @@ -156,6 +156,8 @@ void amdgpu_amdkfd_fini(void); >> >> void amdgpu_amdkfd_suspend(struct amdgpu_device *adev, bool run_pm); >> int amdgpu_amdkfd_resume(struct amdgpu_device *adev, bool run_pm); >> +void amdgpu_amdkfd_suspend_process(struct amdgpu_device *adev, bool >> +run_pm); int amdgpu_amdkfd_resume_process(struct amdgpu_device *adev, >> +bool run_pm); >> void amdgpu_amdkfd_interrupt(struct amdgpu_device *adev, >> const void *ih_ring_entry); >> void amdgpu_amdkfd_device_probe(struct amdgpu_device *adev); @@ >> -413,6 +415,8 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, void >> kgd2kfd_device_exit(struct kfd_dev *kfd); void kgd2kfd_suspend(struct >> kfd_dev *kfd, bool run_pm); int kgd2kfd_resume(struct kfd_dev *kfd, >> bool run_pm); >> +void kgd2kfd_suspend_process(struct kfd_dev *kfd, bool run_pm); int >> +kgd2kfd_resume_process(struct kfd_dev *kfd, bool run_pm); >> int kgd2kfd_pre_reset(struct kfd_dev *kfd, >> struct amdgpu_reset_context *reset_context); int >> kgd2kfd_post_reset(struct kfd_dev *kfd); @@ -463,6 +467,15 @@ static >> inline int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm) >> return 0; >> } >> >> +static inline void kgd2kfd_suspend_process(struct kfd_dev *kfd, bool >> +run_pm) { } >> + >> +static inline int kgd2kfd_resume_process(struct kfd_dev *kfd, bool >> +run_pm) { >> + return 0; >> +} >> + >> static inline int kgd2kfd_pre_reset(struct kfd_dev *kfd, >> struct amdgpu_reset_context *reset_context) >> { diff -- >git >> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> index 625c416c7d45..6e29f8bd54bb 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> @@ -5080,7 +5080,7 @@ int amdgpu_device_suspend(struct drm_device *dev, >bool notify_clients) >> amdgpu_device_ip_suspend_phase1(adev); >> >> if (!adev->in_s0ix) { >> - amdgpu_amdkfd_suspend(adev, adev->in_runpm); >> + amdgpu_amdkfd_suspend(adev, !amdgpu_sriov_runtime(adev) || >> +adev->in_runpm); >> amdgpu_userq_suspend(adev); >> } >> >> @@ -5097,6 +5097,9 @@ int amdgpu_device_suspend(struct drm_device *dev, >bool notify_clients) >> if (amdgpu_sriov_vf(adev)) >> amdgpu_virt_release_full_gpu(adev, false); >> >> + if (!adev->in_s0ix) >> + amdgpu_amdkfd_suspend_process(adev, adev->in_runpm); >> + >> r = amdgpu_dpm_notify_rlc_state(adev, false); >> if (r) >> return r; >> @@ -5178,7 +5181,7 @@ int amdgpu_device_resume(struct drm_device *dev, >bool notify_clients) >> } >> >> if (!adev->in_s0ix) { >> - r = amdgpu_amdkfd_resume(adev, adev->in_runpm); >> + r = amdgpu_amdkfd_resume(adev, !amdgpu_sriov_runtime(adev) || >> +adev->in_runpm); >> if (r) >> goto exit; >> >> @@ -5199,6 +5202,11 @@ int amdgpu_device_resume(struct drm_device *dev, >bool notify_clients) >> amdgpu_virt_release_full_gpu(adev, true); >> } >> >> + if (!adev->in_s0ix) { >> + r = amdgpu_amdkfd_resume_process(adev, adev->in_runpm); > >Generic logic is to suspend all processes which should also result in stopping >any >submissions to hardware before proceeding with hardware uninit. How is this >different >for a VF? Is there a different mechanism done for a gracious suspend/hardware >access prevention? Or, does that result in abrupt errors in processes? > >That aside, the refactor causes the call to be made twice on bare metal. > >Thanks, >Lijo You are right, should suspend all processes before hardware fini. And will also repair the twice issue for bare metal.
Emily Deng Best Wishes > >> + if (r) >> + goto exit; >> + } >> if (r) >> return r; >> >> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c >> b/drivers/gpu/drm/amd/amdkfd/kfd_device.c >> index bf0854bd5555..22c6ef7c42b6 100644 >> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c >> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c >> @@ -1027,15 +1027,7 @@ void kgd2kfd_suspend(struct kfd_dev *kfd, bool >run_pm) >> if (!kfd->init_complete) >> return; >> >> - /* for runtime suspend, skip locking kfd */ >> - if (!run_pm) { >> - mutex_lock(&kfd_processes_mutex); >> - /* For first KFD device suspend all the KFD processes */ >> - if (++kfd_locked == 1) >> - kfd_suspend_all_processes(); >> - mutex_unlock(&kfd_processes_mutex); >> - } >> - >> + kgd2kfd_suspend_process(kfd, run_pm); >> for (i = 0; i < kfd->num_nodes; i++) { >> node = kfd->nodes[i]; >> node->dqm->ops.stop(node->dqm); >> @@ -1055,6 +1047,36 @@ int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm) >> return ret; >> } >> >> + ret = kgd2kfd_resume_process(kfd, run_pm); >> + >> + return ret; >> +} >> + >> +void kgd2kfd_suspend_process(struct kfd_dev *kfd, bool run_pm) { >> + struct kfd_node *node; >> + int i; >> + >> + if (!kfd->init_complete) >> + return; >> + >> + /* for runtime suspend, skip locking kfd */ >> + if (!run_pm) { >> + mutex_lock(&kfd_processes_mutex); >> + /* For first KFD device suspend all the KFD processes */ >> + if (++kfd_locked == 1) >> + kfd_suspend_all_processes(); >> + mutex_unlock(&kfd_processes_mutex); >> + } >> +} >> + >> +int kgd2kfd_resume_process(struct kfd_dev *kfd, bool run_pm) { >> + int ret, i; >> + >> + if (!kfd->init_complete) >> + return 0; >> + >> /* for runtime resume, skip unlocking kfd */ >> if (!run_pm) { >> mutex_lock(&kfd_processes_mutex);