[Public]

> -----Original Message-----
> From: Kasiviswanathan, Harish <harish.kasiviswanat...@amd.com>
> Sent: Tuesday, June 3, 2025 5:22 PM
> To: Kim, Jonathan <jonathan....@amd.com>; amd-gfx@lists.freedesktop.org
> Cc: Kuehling, Felix <felix.kuehl...@amd.com>; Joshi, Mukul
> <mukul.jo...@amd.com>; Kim, Jonathan <jonathan....@amd.com>
> Subject: RE: [PATCH] drm/amdkfd: fix mes based process evictions
>
> [Public]
>
> So, the code now loops two times over the list of queues. Couple of questions.
>
> (1) Isn't it possible to call suspend_all_queues_mes(dqm) before the first
> list_for_each_entry()? The first loop only does some housekeeping. That way 
> you
> can still do get away with single loop.

Yeah I guess there's no harm in grabbing the last eviction time stamp prior to 
house keeping + queue removal instead of just raw queue removal.

> (2) Also remove_queue_mes() is called for inactive queues 
> (q->properties.is_active).
> Is that intentional?

No. That was a lazy copy and paste mistake.  Good catch.

Jon

>
> Best Regards,
> Harish
>
>
>
> -----Original Message-----
> From: amd-gfx <amd-gfx-boun...@lists.freedesktop.org> On Behalf Of Jonathan
> Kim
> Sent: Tuesday, June 3, 2025 12:30 PM
> To: amd-gfx@lists.freedesktop.org
> Cc: Kuehling, Felix <felix.kuehl...@amd.com>; Joshi, Mukul
> <mukul.jo...@amd.com>; Kim, Jonathan <jonathan....@amd.com>
> Subject: [PATCH] drm/amdkfd: fix mes based process evictions
>
> First, MES suspend/resume calls should be consistently held under the
> KFD device lock (similar to queue invalidation) so consolidate
> MES based eviction logic with F32 HWS based eviction logic.
>
> Second, save the last eviction timestamp prior to current eviction to
> align with F32 HWS timestamp logging behaviour.
>
> Finally, bail early on failure to remove a single queue as something
> has gone really wrong post-suspend and a GPU reset is going to occur
> anyway so it's more efficient to just release the device lock.
>
> Signed-off-by: Jonathan Kim <jonathan....@amd.com>
> ---
>  .../drm/amd/amdkfd/kfd_device_queue_manager.c | 73 +++++--------------
>  1 file changed, 20 insertions(+), 53 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index 76359c6a3f3a..c1f0207eeb9e 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -1219,25 +1219,36 @@ static int evict_process_queues_cpsch(struct
> device_queue_manager *dqm,
>
>                 q->properties.is_active = false;
>                 decrement_queue_count(dqm, qpd, q);
> +       }
>
> -               if (dqm->dev->kfd->shared_resources.enable_mes) {
> -                       int err;
> +       pdd->last_evict_timestamp = get_jiffies_64();
> +
> +       if (dqm->dev->kfd->shared_resources.enable_mes) {
> +               retval = suspend_all_queues_mes(dqm);
> +               if (retval) {
> +                       dev_err(dev, "Suspending all queues failed");
> +                       goto out;
> +               }
>
> -                       err = remove_queue_mes(dqm, q, qpd);
> -                       if (err) {
> +               list_for_each_entry(q, &qpd->queues_list, list) {
> +                       retval = remove_queue_mes(dqm, q, qpd);
> +                       if (retval) {
>                                 dev_err(dev, "Failed to evict queue %d\n",
>                                         q->properties.queue_id);
> -                               retval = err;
> +                               goto out;
>                         }
>                 }
> -       }
> -       pdd->last_evict_timestamp = get_jiffies_64();
> -       if (!dqm->dev->kfd->shared_resources.enable_mes)
> +
> +               retval = resume_all_queues_mes(dqm);
> +               if (retval)
> +                       dev_err(dev, "Resuming all queues failed");
> +       } else {
>                 retval = execute_queues_cpsch(dqm,
>                                               qpd->is_debug ?
>                                               
> KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES :
>
> KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0,
>                                               USE_DEFAULT_GRACE_PERIOD);
> +       }
>
>  out:
>         dqm_unlock(dqm);
> @@ -3089,61 +3100,17 @@ int kfd_dqm_suspend_bad_queue_mes(struct
> kfd_node *knode, u32 pasid, u32 doorbel
>         return ret;
>  }
>
> -static int kfd_dqm_evict_pasid_mes(struct device_queue_manager *dqm,
> -                                  struct qcm_process_device *qpd)
> -{
> -       struct device *dev = dqm->dev->adev->dev;
> -       int ret = 0;
> -
> -       /* Check if process is already evicted */
> -       dqm_lock(dqm);
> -       if (qpd->evicted) {
> -               /* Increment the evicted count to make sure the
> -                * process stays evicted before its terminated.
> -                */
> -               qpd->evicted++;
> -               dqm_unlock(dqm);
> -               goto out;
> -       }
> -       dqm_unlock(dqm);
> -
> -       ret = suspend_all_queues_mes(dqm);
> -       if (ret) {
> -               dev_err(dev, "Suspending all queues failed");
> -               goto out;
> -       }
> -
> -       ret = dqm->ops.evict_process_queues(dqm, qpd);
> -       if (ret) {
> -               dev_err(dev, "Evicting process queues failed");
> -               goto out;
> -       }
> -
> -       ret = resume_all_queues_mes(dqm);
> -       if (ret)
> -               dev_err(dev, "Resuming all queues failed");
> -
> -out:
> -       return ret;
> -}
> -
>  int kfd_evict_process_device(struct kfd_process_device *pdd)
>  {
>         struct device_queue_manager *dqm;
>         struct kfd_process *p;
> -       int ret = 0;
>
>         p = pdd->process;
>         dqm = pdd->dev->dqm;
>
>         WARN(debug_evictions, "Evicting pid %d", p->lead_thread->pid);
>
> -       if (dqm->dev->kfd->shared_resources.enable_mes)
> -               ret = kfd_dqm_evict_pasid_mes(dqm, &pdd->qpd);
> -       else
> -               ret = dqm->ops.evict_process_queues(dqm, &pdd->qpd);
> -
> -       return ret;
> +       return dqm->ops.evict_process_queues(dqm, &pdd->qpd);
>  }
>
>  int reserve_debug_trap_vmid(struct device_queue_manager *dqm,
> --
> 2.34.1
>

Reply via email to