[Public]

Actually, on a second look, transient suspend/resume all seems kind of 
pointless for MES process eviction if we're scanning and removing all active 
queues one-by-one anyway.
>From the looks of it, MES has some trouble removing queues that have already 
>been suspended.
Not sure why this got called in the eviction path in the first place.
We should probably should just remove this transient suspend/resume call 
together from eviction all together.

Jon

> -----Original Message-----
> From: Kim, Jonathan
> Sent: Tuesday, June 3, 2025 5:53 PM
> To: Kasiviswanathan, Harish <harish.kasiviswanat...@amd.com>; amd-
> g...@lists.freedesktop.org
> Cc: Kuehling, Felix <felix.kuehl...@amd.com>; Joshi, Mukul
> <mukul.jo...@amd.com>
> Subject: RE: [PATCH] drm/amdkfd: fix mes based process evictions
>
>
>
> > -----Original Message-----
> > From: Kasiviswanathan, Harish <harish.kasiviswanat...@amd.com>
> > Sent: Tuesday, June 3, 2025 5:22 PM
> > To: Kim, Jonathan <jonathan....@amd.com>; amd-gfx@lists.freedesktop.org
> > Cc: Kuehling, Felix <felix.kuehl...@amd.com>; Joshi, Mukul
> > <mukul.jo...@amd.com>; Kim, Jonathan <jonathan....@amd.com>
> > Subject: RE: [PATCH] drm/amdkfd: fix mes based process evictions
> >
> > [Public]
> >
> > So, the code now loops two times over the list of queues. Couple of 
> > questions.
> >
> > (1) Isn't it possible to call suspend_all_queues_mes(dqm) before the first
> > list_for_each_entry()? The first loop only does some housekeeping. That way 
> > you
> > can still do get away with single loop.
>
> Yeah I guess there's no harm in grabbing the last eviction time stamp prior 
> to house
> keeping + queue removal instead of just raw queue removal.
>
> > (2) Also remove_queue_mes() is called for inactive queues (q-
> >properties.is_active).
> > Is that intentional?
>
> No. That was a lazy copy and paste mistake.  Good catch.
>
> Jon
>
> >
> > Best Regards,
> > Harish
> >
> >
> >
> > -----Original Message-----
> > From: amd-gfx <amd-gfx-boun...@lists.freedesktop.org> On Behalf Of Jonathan
> > Kim
> > Sent: Tuesday, June 3, 2025 12:30 PM
> > To: amd-gfx@lists.freedesktop.org
> > Cc: Kuehling, Felix <felix.kuehl...@amd.com>; Joshi, Mukul
> > <mukul.jo...@amd.com>; Kim, Jonathan <jonathan....@amd.com>
> > Subject: [PATCH] drm/amdkfd: fix mes based process evictions
> >
> > First, MES suspend/resume calls should be consistently held under the
> > KFD device lock (similar to queue invalidation) so consolidate
> > MES based eviction logic with F32 HWS based eviction logic.
> >
> > Second, save the last eviction timestamp prior to current eviction to
> > align with F32 HWS timestamp logging behaviour.
> >
> > Finally, bail early on failure to remove a single queue as something
> > has gone really wrong post-suspend and a GPU reset is going to occur
> > anyway so it's more efficient to just release the device lock.
> >
> > Signed-off-by: Jonathan Kim <jonathan....@amd.com>
> > ---
> >  .../drm/amd/amdkfd/kfd_device_queue_manager.c | 73 +++++--------------
> >  1 file changed, 20 insertions(+), 53 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> > b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> > index 76359c6a3f3a..c1f0207eeb9e 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> > @@ -1219,25 +1219,36 @@ static int evict_process_queues_cpsch(struct
> > device_queue_manager *dqm,
> >
> >                 q->properties.is_active = false;
> >                 decrement_queue_count(dqm, qpd, q);
> > +       }
> >
> > -               if (dqm->dev->kfd->shared_resources.enable_mes) {
> > -                       int err;
> > +       pdd->last_evict_timestamp = get_jiffies_64();
> > +
> > +       if (dqm->dev->kfd->shared_resources.enable_mes) {
> > +               retval = suspend_all_queues_mes(dqm);
> > +               if (retval) {
> > +                       dev_err(dev, "Suspending all queues failed");
> > +                       goto out;
> > +               }
> >
> > -                       err = remove_queue_mes(dqm, q, qpd);
> > -                       if (err) {
> > +               list_for_each_entry(q, &qpd->queues_list, list) {
> > +                       retval = remove_queue_mes(dqm, q, qpd);
> > +                       if (retval) {
> >                                 dev_err(dev, "Failed to evict queue %d\n",
> >                                         q->properties.queue_id);
> > -                               retval = err;
> > +                               goto out;
> >                         }
> >                 }
> > -       }
> > -       pdd->last_evict_timestamp = get_jiffies_64();
> > -       if (!dqm->dev->kfd->shared_resources.enable_mes)
> > +
> > +               retval = resume_all_queues_mes(dqm);
> > +               if (retval)
> > +                       dev_err(dev, "Resuming all queues failed");
> > +       } else {
> >                 retval = execute_queues_cpsch(dqm,
> >                                               qpd->is_debug ?
> >                                               
> > KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES :
> >
> > KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0,
> >                                               USE_DEFAULT_GRACE_PERIOD);
> > +       }
> >
> >  out:
> >         dqm_unlock(dqm);
> > @@ -3089,61 +3100,17 @@ int kfd_dqm_suspend_bad_queue_mes(struct
> > kfd_node *knode, u32 pasid, u32 doorbel
> >         return ret;
> >  }
> >
> > -static int kfd_dqm_evict_pasid_mes(struct device_queue_manager *dqm,
> > -                                  struct qcm_process_device *qpd)
> > -{
> > -       struct device *dev = dqm->dev->adev->dev;
> > -       int ret = 0;
> > -
> > -       /* Check if process is already evicted */
> > -       dqm_lock(dqm);
> > -       if (qpd->evicted) {
> > -               /* Increment the evicted count to make sure the
> > -                * process stays evicted before its terminated.
> > -                */
> > -               qpd->evicted++;
> > -               dqm_unlock(dqm);
> > -               goto out;
> > -       }
> > -       dqm_unlock(dqm);
> > -
> > -       ret = suspend_all_queues_mes(dqm);
> > -       if (ret) {
> > -               dev_err(dev, "Suspending all queues failed");
> > -               goto out;
> > -       }
> > -
> > -       ret = dqm->ops.evict_process_queues(dqm, qpd);
> > -       if (ret) {
> > -               dev_err(dev, "Evicting process queues failed");
> > -               goto out;
> > -       }
> > -
> > -       ret = resume_all_queues_mes(dqm);
> > -       if (ret)
> > -               dev_err(dev, "Resuming all queues failed");
> > -
> > -out:
> > -       return ret;
> > -}
> > -
> >  int kfd_evict_process_device(struct kfd_process_device *pdd)
> >  {
> >         struct device_queue_manager *dqm;
> >         struct kfd_process *p;
> > -       int ret = 0;
> >
> >         p = pdd->process;
> >         dqm = pdd->dev->dqm;
> >
> >         WARN(debug_evictions, "Evicting pid %d", p->lead_thread->pid);
> >
> > -       if (dqm->dev->kfd->shared_resources.enable_mes)
> > -               ret = kfd_dqm_evict_pasid_mes(dqm, &pdd->qpd);
> > -       else
> > -               ret = dqm->ops.evict_process_queues(dqm, &pdd->qpd);
> > -
> > -       return ret;
> > +       return dqm->ops.evict_process_queues(dqm, &pdd->qpd);
> >  }
> >
> >  int reserve_debug_trap_vmid(struct device_queue_manager *dqm,
> > --
> > 2.34.1
> >

Reply via email to