Nice fix. Reviewed-by: Oak Zeng <oak.z...@amd.com>

Regards,
Oak

-----Original Message-----
From: amd-gfx <amd-gfx-boun...@lists.freedesktop.org> On Behalf Of Yang, Philip
Sent: Monday, October 21, 2019 5:05 PM
To: amd-gfx@lists.freedesktop.org
Cc: Yang, Philip <philip.y...@amd.com>
Subject: [PATCH] drm/amdkfd: don't use dqm lock during device 
reset/suspend/resume

If device reset/suspend/resume failed for some reason, dqm lock is hold forever 
and this causes deadlock. Below is a kernel backtrace when application open kfd 
after suspend/resume failed.

Instead of holding dqm lock in pre_reset and releasing dqm lock in post_reset, 
add dqm->device_stopped flag which is modified in
dqm->ops.start and dqm->ops.stop. The flag doesn't need lock protection
because write/read are all inside dqm lock.

For HWS case, map_queues_cpsch and unmap_queues_cpsch checks device_stopped 
flag before sending the updated runlist.

Backtrace of dqm lock deadlock:

[Thu Oct 17 16:43:37 2019] INFO: task rocminfo:3024 blocked for more than 120 
seconds.
[Thu Oct 17 16:43:37 2019]       Not tainted
5.0.0-rc1-kfd-compute-rocm-dkms-no-npi-1131 #1 [Thu Oct 17 16:43:37 2019] "echo 
0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[Thu Oct 17 16:43:37 2019] rocminfo        D    0  3024   2947
0x80000000
[Thu Oct 17 16:43:37 2019] Call Trace:
[Thu Oct 17 16:43:37 2019]  ? __schedule+0x3d9/0x8a0 [Thu Oct 17 16:43:37 2019] 
 schedule+0x32/0x70 [Thu Oct 17 16:43:37 2019]  
schedule_preempt_disabled+0xa/0x10
[Thu Oct 17 16:43:37 2019]  __mutex_lock.isra.9+0x1e3/0x4e0 [Thu Oct 17 
16:43:37 2019]  ? __call_srcu+0x264/0x3b0 [Thu Oct 17 16:43:37 2019]  ? 
process_termination_cpsch+0x24/0x2f0
[amdgpu]
[Thu Oct 17 16:43:37 2019]  process_termination_cpsch+0x24/0x2f0
[amdgpu]
[Thu Oct 17 16:43:37 2019]
kfd_process_dequeue_from_all_devices+0x42/0x60 [amdgpu] [Thu Oct 17 16:43:37 
2019]  kfd_process_notifier_release+0x1be/0x220
[amdgpu]
[Thu Oct 17 16:43:37 2019]  __mmu_notifier_release+0x3e/0xc0 [Thu Oct 17 
16:43:37 2019]  exit_mmap+0x160/0x1a0 [Thu Oct 17 16:43:37 2019]  ? 
__handle_mm_fault+0xba3/0x1200 [Thu Oct 17 16:43:37 2019]  ? 
exit_robust_list+0x5a/0x110 [Thu Oct 17 16:43:37 2019]  mmput+0x4a/0x120 [Thu 
Oct 17 16:43:37 2019]  do_exit+0x284/0xb20 [Thu Oct 17 16:43:37 2019]  ? 
handle_mm_fault+0xfa/0x200 [Thu Oct 17 16:43:37 2019]  do_group_exit+0x3a/0xa0 
[Thu Oct 17 16:43:37 2019]  __x64_sys_exit_group+0x14/0x20 [Thu Oct 17 16:43:37 
2019]  do_syscall_64+0x4f/0x100 [Thu Oct 17 16:43:37 2019]  
entry_SYSCALL_64_after_hwframe+0x44/0xa9

Suggested-by: Felix Kuehling <felix.kuehl...@amd.com>
Signed-off-by: Philip Yang <philip.y...@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c            |  6 +++---
 drivers/gpu/drm/amd/amdkfd/kfd_device.c             |  5 -----
 .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c   | 13 ++++++++++---
 .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.h   |  1 +
 4 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index d9e36dbf13d5..40d75c39f08e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -120,13 +120,13 @@ static int kfd_open(struct inode *inode, struct file 
*filep)
                return -EPERM;
        }
 
+       if (kfd_is_locked())
+               return -EAGAIN;
+
        process = kfd_create_process(filep);
        if (IS_ERR(process))
                return PTR_ERR(process);
 
-       if (kfd_is_locked())
-               return -EAGAIN;
-
        dev_dbg(kfd_device, "process %d opened, compat mode (32 bit) - %d\n",
                process->pasid, process->is_32bit_user_mode);
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index 8f4b24e84964..4fa8834ce7cb 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -730,9 +730,6 @@ int kgd2kfd_pre_reset(struct kfd_dev *kfd)
                return 0;
        kgd2kfd_suspend(kfd);
 
-       /* hold dqm->lock to prevent further execution*/
-       dqm_lock(kfd->dqm);
-
        kfd_signal_reset_event(kfd);
        return 0;
 }
@@ -750,8 +747,6 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd)
        if (!kfd->init_complete)
                return 0;
 
-       dqm_unlock(kfd->dqm);
-
        ret = kfd_resume(kfd);
        if (ret)
                return ret;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 81fb545cf42c..04a40fabe9d7 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -915,7 +915,8 @@ static int start_nocpsch(struct device_queue_manager *dqm)
        
        if (dqm->dev->device_info->asic_family == CHIP_HAWAII)
                return pm_init(&dqm->packets, dqm);
-       
+       dqm->device_stopped = false;
+
        return 0;
 }
 
@@ -923,7 +924,8 @@ static int stop_nocpsch(struct device_queue_manager *dqm)  {
        if (dqm->dev->device_info->asic_family == CHIP_HAWAII)
                pm_uninit(&dqm->packets);
-       
+       dqm->device_stopped = true;
+
        return 0;
 }
 
@@ -1074,6 +1076,7 @@ static int start_cpsch(struct device_queue_manager *dqm)
        dqm_lock(dqm);
        /* clear hang status when driver try to start the hw scheduler */
        dqm->is_hws_hang = false;
+       dqm->device_stopped = false;
        execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
        dqm_unlock(dqm);
 
@@ -1089,6 +1092,7 @@ static int stop_cpsch(struct device_queue_manager *dqm)  {
        dqm_lock(dqm);
        unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
+       dqm->device_stopped = true;
        dqm_unlock(dqm);
 
        kfd_gtt_sa_free(dqm->dev, dqm->fence_mem); @@ -1275,9 +1279,10 @@ 
static int map_queues_cpsch(struct device_queue_manager *dqm)  {
        int retval;
 
+       if (dqm->device_stopped)
+               return 0;
        if (dqm->queue_count <= 0 || dqm->processes_count <= 0)
                return 0;
-
        if (dqm->active_runlist)
                return 0;
 
@@ -1299,6 +1304,8 @@ static int unmap_queues_cpsch(struct device_queue_manager 
*dqm,  {
        int retval = 0;
 
+       if (dqm->device_stopped)
+               return 0;
        if (dqm->is_hws_hang)
                return -EIO;
        if (!dqm->active_runlist)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
index 2eaea6b04cbe..44ecdf999ca8 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
@@ -201,6 +201,7 @@ struct device_queue_manager {
        bool                    is_hws_hang;
        struct work_struct      hw_exception_work;
        struct kfd_mem_obj      hiq_sdma_mqd;
+       bool                    device_stopped;
 };
 
 void device_queue_manager_init_cik(
--
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Reply via email to