[Public] So, the code now loops two times over the list of queues. Couple of questions.
(1) Isn't it possible to call suspend_all_queues_mes(dqm) before the first list_for_each_entry()? The first loop only does some housekeeping. That way you can still do get away with single loop. (2) Also remove_queue_mes() is called for inactive queues (q->properties.is_active). Is that intentional? Best Regards, Harish -----Original Message----- From: amd-gfx <amd-gfx-boun...@lists.freedesktop.org> On Behalf Of Jonathan Kim Sent: Tuesday, June 3, 2025 12:30 PM To: amd-gfx@lists.freedesktop.org Cc: Kuehling, Felix <felix.kuehl...@amd.com>; Joshi, Mukul <mukul.jo...@amd.com>; Kim, Jonathan <jonathan....@amd.com> Subject: [PATCH] drm/amdkfd: fix mes based process evictions First, MES suspend/resume calls should be consistently held under the KFD device lock (similar to queue invalidation) so consolidate MES based eviction logic with F32 HWS based eviction logic. Second, save the last eviction timestamp prior to current eviction to align with F32 HWS timestamp logging behaviour. Finally, bail early on failure to remove a single queue as something has gone really wrong post-suspend and a GPU reset is going to occur anyway so it's more efficient to just release the device lock. Signed-off-by: Jonathan Kim <jonathan....@amd.com> --- .../drm/amd/amdkfd/kfd_device_queue_manager.c | 73 +++++-------------- 1 file changed, 20 insertions(+), 53 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index 76359c6a3f3a..c1f0207eeb9e 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -1219,25 +1219,36 @@ static int evict_process_queues_cpsch(struct device_queue_manager *dqm, q->properties.is_active = false; decrement_queue_count(dqm, qpd, q); + } - if (dqm->dev->kfd->shared_resources.enable_mes) { - int err; + pdd->last_evict_timestamp = get_jiffies_64(); + + if (dqm->dev->kfd->shared_resources.enable_mes) { + retval = suspend_all_queues_mes(dqm); + if (retval) { + dev_err(dev, "Suspending all queues failed"); + goto out; + } - err = remove_queue_mes(dqm, q, qpd); - if (err) { + list_for_each_entry(q, &qpd->queues_list, list) { + retval = remove_queue_mes(dqm, q, qpd); + if (retval) { dev_err(dev, "Failed to evict queue %d\n", q->properties.queue_id); - retval = err; + goto out; } } - } - pdd->last_evict_timestamp = get_jiffies_64(); - if (!dqm->dev->kfd->shared_resources.enable_mes) + + retval = resume_all_queues_mes(dqm); + if (retval) + dev_err(dev, "Resuming all queues failed"); + } else { retval = execute_queues_cpsch(dqm, qpd->is_debug ? KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES : KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0, USE_DEFAULT_GRACE_PERIOD); + } out: dqm_unlock(dqm); @@ -3089,61 +3100,17 @@ int kfd_dqm_suspend_bad_queue_mes(struct kfd_node *knode, u32 pasid, u32 doorbel return ret; } -static int kfd_dqm_evict_pasid_mes(struct device_queue_manager *dqm, - struct qcm_process_device *qpd) -{ - struct device *dev = dqm->dev->adev->dev; - int ret = 0; - - /* Check if process is already evicted */ - dqm_lock(dqm); - if (qpd->evicted) { - /* Increment the evicted count to make sure the - * process stays evicted before its terminated. - */ - qpd->evicted++; - dqm_unlock(dqm); - goto out; - } - dqm_unlock(dqm); - - ret = suspend_all_queues_mes(dqm); - if (ret) { - dev_err(dev, "Suspending all queues failed"); - goto out; - } - - ret = dqm->ops.evict_process_queues(dqm, qpd); - if (ret) { - dev_err(dev, "Evicting process queues failed"); - goto out; - } - - ret = resume_all_queues_mes(dqm); - if (ret) - dev_err(dev, "Resuming all queues failed"); - -out: - return ret; -} - int kfd_evict_process_device(struct kfd_process_device *pdd) { struct device_queue_manager *dqm; struct kfd_process *p; - int ret = 0; p = pdd->process; dqm = pdd->dev->dqm; WARN(debug_evictions, "Evicting pid %d", p->lead_thread->pid); - if (dqm->dev->kfd->shared_resources.enable_mes) - ret = kfd_dqm_evict_pasid_mes(dqm, &pdd->qpd); - else - ret = dqm->ops.evict_process_queues(dqm, &pdd->qpd); - - return ret; + return dqm->ops.evict_process_queues(dqm, &pdd->qpd); } int reserve_debug_trap_vmid(struct device_queue_manager *dqm, -- 2.34.1