Differentiate the caller scenarios when evicting queues so that each
gets the proper behavior: a GPU reset, for example, does not need to
roll back partially evicted queues, while a regular eviction does.
Pass this distinction down the eviction path as a new 'force' flag.
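
A minimal sketch of the intended call semantics (all function names and
argument values below are taken from this patch, nothing is invented):

    /* GPU reset / device teardown: evict unconditionally and keep
     * going past per-device failures instead of rolling back.
     */
    kgd2kfd_suspend(kfd, false, true);          /* force = true  */

    /* Regular eviction, e.g. evict_process_worker(): a failure rolls
     * back the queues that were already evicted.
     */
    ret = kfd_process_evict_queues(p, false);   /* force = false */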

Signed-off-by: Eric Huang <jinhuieric.hu...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h |  4 ++--
 drivers/gpu/drm/amd/amdkfd/kfd_device.c    | 10 +++++-----
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h      |  4 ++--
 drivers/gpu/drm/amd/amdkfd/kfd_process.c   | 18 ++++++++++--------
 5 files changed, 20 insertions(+), 18 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 77044e8ba4e6..59ce5a17a834 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -190,7 +190,7 @@ void amdgpu_amdkfd_interrupt(struct amdgpu_device *adev,
 void amdgpu_amdkfd_suspend(struct amdgpu_device *adev, bool run_pm)
 {
        if (adev->kfd.dev)
-               kgd2kfd_suspend(adev->kfd.dev, run_pm);
+               kgd2kfd_suspend(adev->kfd.dev, run_pm, false);
 }
 
 int amdgpu_amdkfd_resume(struct amdgpu_device *adev, bool run_pm, bool sync)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 332ccba00e69..b7e46ad0507e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -372,7 +372,7 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
                         struct drm_device *ddev,
                         const struct kgd2kfd_shared_resources *gpu_resources);
 void kgd2kfd_device_exit(struct kfd_dev *kfd);
-void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm);
+void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm, bool force);
 int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm, bool sync);
 int kgd2kfd_pre_reset(struct kfd_dev *kfd);
 int kgd2kfd_post_reset(struct kfd_dev *kfd);
@@ -407,7 +407,7 @@ static inline void kgd2kfd_device_exit(struct kfd_dev *kfd)
 {
 }
 
-static inline void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm)
+static inline void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm, bool force)
 {
 }
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index 24b5e0aa1eac..48e51ee8de56 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -940,7 +940,7 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
 void kgd2kfd_device_exit(struct kfd_dev *kfd)
 {
        if (kfd->init_complete) {
-               kgd2kfd_suspend(kfd, false);
+               kgd2kfd_suspend(kfd, false, true);
                svm_migrate_fini((struct amdgpu_device *)kfd->kgd);
                device_queue_manager_uninit(kfd->dqm);
                kfd_interrupt_exit(kfd);
@@ -965,7 +965,7 @@ int kgd2kfd_pre_reset(struct kfd_dev *kfd)
 
        kfd->dqm->ops.pre_reset(kfd->dqm);
 
-       kgd2kfd_suspend(kfd, false);
+       kgd2kfd_suspend(kfd, false, true);
 
        kfd_signal_reset_event(kfd);
        return 0;
@@ -1001,7 +1001,7 @@ bool kfd_is_locked(void)
        return  (atomic_read(&kfd_locked) > 0);
 }
 
-void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm)
+void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm, bool force)
 {
        if (!kfd->init_complete)
                return;
@@ -1010,7 +1010,7 @@ void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm)
        if (!run_pm) {
                /* For first KFD device suspend all the KFD processes */
                if (atomic_inc_return(&kfd_locked) == 1)
-                       kfd_suspend_all_processes();
+                       kfd_suspend_all_processes(force);
        }
 
        kfd->dqm->ops.stop(kfd->dqm);
@@ -1122,7 +1122,7 @@ int kgd2kfd_quiesce_mm(struct mm_struct *mm)
                return -ESRCH;
 
        WARN(debug_evictions, "Evicting pid %d", p->lead_thread->pid);
-       r = kfd_process_evict_queues(p);
+       r = kfd_process_evict_queues(p, true);
 
        kfd_unref_process(p);
        return r;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 3d5d3994d8a4..e80fb64a6dcc 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -1042,9 +1042,9 @@ static inline struct kfd_process_device *kfd_process_device_from_gpuidx(
 }
 
 void kfd_unref_process(struct kfd_process *p);
-int kfd_process_evict_queues(struct kfd_process *p);
+int kfd_process_evict_queues(struct kfd_process *p, bool force);
 int kfd_process_restore_queues(struct kfd_process *p);
-void kfd_suspend_all_processes(void);
+void kfd_suspend_all_processes(bool force);
 /*
  * kfd_resume_all_processes:
  *     bool sync: If kfd_resume_all_processes() should wait for the
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 38a9dee40785..a41ece37bc3c 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -1875,20 +1875,22 @@ struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct *mm)
  * Eviction is reference-counted per process-device. This means multiple
  * evictions from different sources can be nested safely.
  */
-int kfd_process_evict_queues(struct kfd_process *p)
+int kfd_process_evict_queues(struct kfd_process *p, bool force)
 {
-       int r = 0;
+       int r = 0, r_tmp = 0;
        int i;
        unsigned int n_evicted = 0;
 
        for (i = 0; i < p->n_pdds; i++) {
                struct kfd_process_device *pdd = p->pdds[i];
 
-               r = pdd->dev->dqm->ops.evict_process_queues(pdd->dev->dqm,
+               r_tmp = pdd->dev->dqm->ops.evict_process_queues(pdd->dev->dqm,
                                                            &pdd->qpd);
-               if (r) {
+               if (r_tmp) {
                        pr_err("Failed to evict process queues\n");
-                       goto fail;
+                       r = r_tmp;
+                       if (!force)
+                               goto fail;
                }
                n_evicted++;
        }
@@ -2079,7 +2081,7 @@ static void evict_process_worker(struct work_struct *work)
        p->last_evict_timestamp = get_jiffies_64();
 
        pr_debug("Started evicting pasid 0x%x\n", p->pasid);
-       ret = kfd_process_evict_queues(p);
+       ret = kfd_process_evict_queues(p, false);
        if (!ret) {
                dma_fence_signal(p->ef);
                dma_fence_put(p->ef);
@@ -2147,7 +2149,7 @@ static void restore_process_worker(struct work_struct *work)
                pr_err("Failed to restore queues of pasid 0x%x\n", p->pasid);
 }
 
-void kfd_suspend_all_processes(void)
+void kfd_suspend_all_processes(bool force)
 {
        struct kfd_process *p;
        unsigned int temp;
@@ -2158,7 +2160,7 @@ void kfd_suspend_all_processes(void)
                cancel_delayed_work_sync(&p->eviction_work);
                cancel_delayed_work_sync(&p->restore_work);
 
-               if (kfd_process_evict_queues(p))
+               if (kfd_process_evict_queues(p, force))
                        pr_err("Failed to suspend process 0x%x\n", p->pasid);
                dma_fence_signal(p->ef);
                dma_fence_put(p->ef);
-- 
2.25.1
