[AMD Official Use Only - AMD Internal Distribution Only] Hi Lijo, Could you help to review the patch again? Thanks.
Emily Deng Best Wishes >-----Original Message----- >From: Deng, Emily >Sent: Thursday, June 5, 2025 3:21 PM >To: Chen, Horace <horace.c...@amd.com>; Lazar, Lijo <lijo.la...@amd.com> >Cc: Emily Deng <emily.d...@amd.com>; amd-gfx@lists.freedesktop.org; Zhang, >Owen(SRDC) <owen.zha...@amd.com> >Subject: RE: [PATCH v3] drm/amdkfd: Move the process suspend and resume out of >full access > >@Lazar, Lijo and @Chen, Horace > Could you help to review again, thanks? > >Emily Deng >Best Wishes > > > >>-----Original Message----- >>From: Emily Deng <emily.d...@amd.com> >>Sent: Wednesday, June 4, 2025 3:48 PM >>To: amd-gfx@lists.freedesktop.org >>Cc: Deng, Emily <emily.d...@amd.com> >>Subject: [PATCH v3] drm/amdkfd: Move the process suspend and resume out >>of full access >> >>For the suspend and resume process, exclusive access is not required. >>Therefore, it can be moved out of the full access section to reduce the >>duration of exclusive access. >> >>v3: >>Move suspend processes before hardware fini. >>Remove twice call for bare metal. >> >>Signed-off-by: Emily Deng <emily.d...@amd.com> >>--- >> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 16 +++++++++ >>drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 13 +++++++ >>drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 9 +++-- >> drivers/gpu/drm/amd/amdkfd/kfd_device.c | 40 +++++++++++++++++----- >> 4 files changed, 67 insertions(+), 11 deletions(-) >> >>diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c >>b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c >>index d8ac4b1051a8..0a8e7835d0fc 100644 >>--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c >>+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c >>@@ -264,6 +264,22 @@ int amdgpu_amdkfd_resume(struct amdgpu_device >>*adev, bool run_pm) >> return r; >> } >> >>+void amdgpu_amdkfd_suspend_process(struct amdgpu_device *adev, bool >>+run_pm) { >>+ if (adev->kfd.dev) >>+ kgd2kfd_suspend_process(adev->kfd.dev, run_pm); } >>+ >>+int amdgpu_amdkfd_resume_process(struct amdgpu_device *adev, bool >>+run_pm) { >>+ int r = 0; >>+ >>+ if (adev->kfd.dev) >>+ r = kgd2kfd_resume_process(adev->kfd.dev, run_pm); >>+ >>+ return r; >>+} >>+ >> int amdgpu_amdkfd_pre_reset(struct amdgpu_device *adev, >> struct amdgpu_reset_context *reset_context) { diff >> --git >>a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h >>b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h >>index b6ca41859b53..841ae8b75ab1 100644 >>--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h >>+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h >>@@ -156,6 +156,8 @@ void amdgpu_amdkfd_fini(void); >> >> void amdgpu_amdkfd_suspend(struct amdgpu_device *adev, bool run_pm); >>int amdgpu_amdkfd_resume(struct amdgpu_device *adev, bool run_pm); >>+void amdgpu_amdkfd_suspend_process(struct amdgpu_device *adev, bool >>+run_pm); int amdgpu_amdkfd_resume_process(struct amdgpu_device *adev, >>+bool run_pm); >> void amdgpu_amdkfd_interrupt(struct amdgpu_device *adev, >> const void *ih_ring_entry); >> void amdgpu_amdkfd_device_probe(struct amdgpu_device *adev); @@ -413,6 >>+415,8 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, void >>kgd2kfd_device_exit(struct kfd_dev *kfd); void kgd2kfd_suspend(struct >>kfd_dev *kfd, bool run_pm); int kgd2kfd_resume(struct kfd_dev *kfd, >>bool run_pm); >>+void kgd2kfd_suspend_process(struct kfd_dev *kfd, bool run_pm); int >>+kgd2kfd_resume_process(struct kfd_dev *kfd, bool run_pm); >> int kgd2kfd_pre_reset(struct kfd_dev *kfd, >> struct amdgpu_reset_context *reset_context); int >>kgd2kfd_post_reset(struct kfd_dev *kfd); @@ -463,6 +467,15 @@ static >>inline int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm) >> return 0; >> } >> >>+static inline void kgd2kfd_suspend_process(struct kfd_dev *kfd, bool >>+run_pm) { } >>+ >>+static inline int kgd2kfd_resume_process(struct kfd_dev *kfd, bool >>+run_pm) { >>+ return 0; >>+} >>+ >> static inline int kgd2kfd_pre_reset(struct kfd_dev *kfd, >> struct amdgpu_reset_context *reset_context) >> { diff -- >git >>a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>index 5289400879ec..08ff9917c62f 100644 >>--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>@@ -5061,6 +5061,8 @@ int amdgpu_device_suspend(struct drm_device *dev, >>bool >>notify_clients) >> adev->in_suspend = true; >> >> if (amdgpu_sriov_vf(adev)) { >>+ if (!adev->in_s0ix) >>+ amdgpu_amdkfd_suspend_process(adev, adev->in_runpm); >> amdgpu_virt_fini_data_exchange(adev); >> r = amdgpu_virt_request_full_gpu(adev, false); >> if (r) >>@@ -5080,7 +5082,7 @@ int amdgpu_device_suspend(struct drm_device *dev, >>bool >>notify_clients) >> amdgpu_device_ip_suspend_phase1(adev); >> >> if (!adev->in_s0ix) { >>- amdgpu_amdkfd_suspend(adev, adev->in_runpm); >>+ amdgpu_amdkfd_suspend(adev, amdgpu_sriov_vf(adev) || adev- >>>in_runpm); >> amdgpu_userq_suspend(adev); >> } >> >>@@ -5178,7 +5180,7 @@ int amdgpu_device_resume(struct drm_device *dev, >>bool >>notify_clients) >> } >> >> if (!adev->in_s0ix) { >>- r = amdgpu_amdkfd_resume(adev, adev->in_runpm); >>+ r = amdgpu_amdkfd_resume(adev, amdgpu_sriov_vf(adev) || >>+adev->in_runpm); >> if (r) >> goto exit; >> >>@@ -5197,6 +5199,9 @@ int amdgpu_device_resume(struct drm_device *dev, >>bool >>notify_clients) >> if (amdgpu_sriov_vf(adev)) { >> amdgpu_virt_init_data_exchange(adev); >> amdgpu_virt_release_full_gpu(adev, true); >>+ >>+ if (!adev->in_s0ix && !r) >>+ r = amdgpu_amdkfd_resume_process(adev, adev->in_runpm); >> } >> >> if (r) >>diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c >>b/drivers/gpu/drm/amd/amdkfd/kfd_device.c >>index bf0854bd5555..22c6ef7c42b6 100644 >>--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c >>+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c >>@@ -1027,15 +1027,7 @@ void kgd2kfd_suspend(struct kfd_dev *kfd, bool >run_pm) >> if (!kfd->init_complete) >> return; >> >>- /* for runtime suspend, skip locking kfd */ >>- if (!run_pm) { >>- mutex_lock(&kfd_processes_mutex); >>- /* For first KFD device suspend all the KFD processes */ >>- if (++kfd_locked == 1) >>- kfd_suspend_all_processes(); >>- mutex_unlock(&kfd_processes_mutex); >>- } >>- >>+ kgd2kfd_suspend_process(kfd, run_pm); >> for (i = 0; i < kfd->num_nodes; i++) { >> node = kfd->nodes[i]; >> node->dqm->ops.stop(node->dqm); >>@@ -1055,6 +1047,36 @@ int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm) >> return ret; >> } >> >>+ ret = kgd2kfd_resume_process(kfd, run_pm); >>+ >>+ return ret; >>+} >>+ >>+void kgd2kfd_suspend_process(struct kfd_dev *kfd, bool run_pm) { >>+ struct kfd_node *node; >>+ int i; >>+ >>+ if (!kfd->init_complete) >>+ return; >>+ >>+ /* for runtime suspend, skip locking kfd */ >>+ if (!run_pm) { >>+ mutex_lock(&kfd_processes_mutex); >>+ /* For first KFD device suspend all the KFD processes */ >>+ if (++kfd_locked == 1) >>+ kfd_suspend_all_processes(); >>+ mutex_unlock(&kfd_processes_mutex); >>+ } >>+} >>+ >>+int kgd2kfd_resume_process(struct kfd_dev *kfd, bool run_pm) { >>+ int ret, i; >>+ >>+ if (!kfd->init_complete) >>+ return 0; >>+ >> /* for runtime resume, skip unlocking kfd */ >> if (!run_pm) { >> mutex_lock(&kfd_processes_mutex); >>-- >>2.34.1