[Public]

So, the code now loops two times over the list of queues. Couple of questions.

(1) Isn't it possible to call suspend_all_queues_mes(dqm) before the first 
list_for_each_entry()? The first loop only does some housekeeping. That way you 
can still do get away with single loop.
(2) Also remove_queue_mes() is called for inactive queues 
(q->properties.is_active). Is that intentional?

Best Regards,
Harish



-----Original Message-----
From: amd-gfx <amd-gfx-boun...@lists.freedesktop.org> On Behalf Of Jonathan Kim
Sent: Tuesday, June 3, 2025 12:30 PM
To: amd-gfx@lists.freedesktop.org
Cc: Kuehling, Felix <felix.kuehl...@amd.com>; Joshi, Mukul 
<mukul.jo...@amd.com>; Kim, Jonathan <jonathan....@amd.com>
Subject: [PATCH] drm/amdkfd: fix mes based process evictions

First, MES suspend/resume calls should be consistently held under the
KFD device lock (similar to queue invalidation) so consolidate
MES based eviction logic with F32 HWS based eviction logic.

Second, save the last eviction timestamp prior to current eviction to
align with F32 HWS timestamp logging behaviour.

Finally, bail early on failure to remove a single queue as something
has gone really wrong post-suspend and a GPU reset is going to occur
anyway so it's more efficient to just release the device lock.

Signed-off-by: Jonathan Kim <jonathan....@amd.com>
---
 .../drm/amd/amdkfd/kfd_device_queue_manager.c | 73 +++++--------------
 1 file changed, 20 insertions(+), 53 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 76359c6a3f3a..c1f0207eeb9e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -1219,25 +1219,36 @@ static int evict_process_queues_cpsch(struct 
device_queue_manager *dqm,

                q->properties.is_active = false;
                decrement_queue_count(dqm, qpd, q);
+       }

-               if (dqm->dev->kfd->shared_resources.enable_mes) {
-                       int err;
+       pdd->last_evict_timestamp = get_jiffies_64();
+
+       if (dqm->dev->kfd->shared_resources.enable_mes) {
+               retval = suspend_all_queues_mes(dqm);
+               if (retval) {
+                       dev_err(dev, "Suspending all queues failed");
+                       goto out;
+               }

-                       err = remove_queue_mes(dqm, q, qpd);
-                       if (err) {
+               list_for_each_entry(q, &qpd->queues_list, list) {
+                       retval = remove_queue_mes(dqm, q, qpd);
+                       if (retval) {
                                dev_err(dev, "Failed to evict queue %d\n",
                                        q->properties.queue_id);
-                               retval = err;
+                               goto out;
                        }
                }
-       }
-       pdd->last_evict_timestamp = get_jiffies_64();
-       if (!dqm->dev->kfd->shared_resources.enable_mes)
+
+               retval = resume_all_queues_mes(dqm);
+               if (retval)
+                       dev_err(dev, "Resuming all queues failed");
+       } else {
                retval = execute_queues_cpsch(dqm,
                                              qpd->is_debug ?
                                              
KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES :
                                              
KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0,
                                              USE_DEFAULT_GRACE_PERIOD);
+       }

 out:
        dqm_unlock(dqm);
@@ -3089,61 +3100,17 @@ int kfd_dqm_suspend_bad_queue_mes(struct kfd_node 
*knode, u32 pasid, u32 doorbel
        return ret;
 }

-static int kfd_dqm_evict_pasid_mes(struct device_queue_manager *dqm,
-                                  struct qcm_process_device *qpd)
-{
-       struct device *dev = dqm->dev->adev->dev;
-       int ret = 0;
-
-       /* Check if process is already evicted */
-       dqm_lock(dqm);
-       if (qpd->evicted) {
-               /* Increment the evicted count to make sure the
-                * process stays evicted before its terminated.
-                */
-               qpd->evicted++;
-               dqm_unlock(dqm);
-               goto out;
-       }
-       dqm_unlock(dqm);
-
-       ret = suspend_all_queues_mes(dqm);
-       if (ret) {
-               dev_err(dev, "Suspending all queues failed");
-               goto out;
-       }
-
-       ret = dqm->ops.evict_process_queues(dqm, qpd);
-       if (ret) {
-               dev_err(dev, "Evicting process queues failed");
-               goto out;
-       }
-
-       ret = resume_all_queues_mes(dqm);
-       if (ret)
-               dev_err(dev, "Resuming all queues failed");
-
-out:
-       return ret;
-}
-
 int kfd_evict_process_device(struct kfd_process_device *pdd)
 {
        struct device_queue_manager *dqm;
        struct kfd_process *p;
-       int ret = 0;

        p = pdd->process;
        dqm = pdd->dev->dqm;

        WARN(debug_evictions, "Evicting pid %d", p->lead_thread->pid);

-       if (dqm->dev->kfd->shared_resources.enable_mes)
-               ret = kfd_dqm_evict_pasid_mes(dqm, &pdd->qpd);
-       else
-               ret = dqm->ops.evict_process_queues(dqm, &pdd->qpd);
-
-       return ret;
+       return dqm->ops.evict_process_queues(dqm, &pdd->qpd);
 }

 int reserve_debug_trap_vmid(struct device_queue_manager *dqm,
--
2.34.1

Reply via email to