On Fri, Apr 11, 2025 at 1:26 PM Khatri, Sunil <sukha...@amd.com> wrote:
>
>
> On 4/11/2025 10:22 PM, Alex Deucher wrote:
> > On Fri, Apr 11, 2025 at 12:17 PM Khatri, Sunil <sukha...@amd.com> wrote:
> >>
> >> On 4/11/2025 7:42 PM, Alex Deucher wrote:
> >>> This will be used to stop/start user queue scheduling for
> >>> example when switching between kernel and user queues when
> >>> enforce isolation is enabled.
> >>>
> >>> Signed-off-by: Alex Deucher <alexander.deuc...@amd.com>
> >>> ---
> >>>    drivers/gpu/drm/amd/amdgpu/amdgpu.h           |  1 +
> >>>    drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c | 68 ++++++++++++++++---
> >>>    drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.h |  3 +
> >>>    3 files changed, 64 insertions(+), 8 deletions(-)
> >>>
> >>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
> >>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> >>> index b156e31ac86ac..30c485f529d17 100644
> >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> >>> @@ -1249,6 +1249,7 @@ struct amdgpu_device {
> >>>
> >>>        struct list_head                userq_mgr_list;
> >>>        struct mutex                    userq_mutex;
> >>> +     bool                            userq_halt;
> >>>    };
> >>>
> >>>    static inline uint32_t amdgpu_ip_version(const struct amdgpu_device 
> >>> *adev,
> >>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c 
> >>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c
> >>> index aa7222137c31d..e93e390f4e301 100644
> >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c
> >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c
> >>> @@ -335,6 +335,7 @@ amdgpu_userqueue_create(struct drm_file *filp, union 
> >>> drm_amdgpu_userq *args)
> >>>                goto unlock;
> >>>        }
> >>>
> >>> +
> >>>        qid = idr_alloc(&uq_mgr->userq_idr, queue, 1, 
> >>> AMDGPU_MAX_USERQ_COUNT, GFP_KERNEL);
> >>>        if (qid < 0) {
> >>>                DRM_ERROR("Failed to allocate a queue id\n");
> >>> @@ -345,15 +346,21 @@ amdgpu_userqueue_create(struct drm_file *filp, 
> >>> union drm_amdgpu_userq *args)
> >>>                goto unlock;
> >>>        }
> >>>
> >>> -     r = uq_funcs->map(uq_mgr, queue);
> >>> -     if (r) {
> >>> -             DRM_ERROR("Failed to map Queue\n");
> >>> -             idr_remove(&uq_mgr->userq_idr, qid);
> >>> -             amdgpu_userq_fence_driver_free(queue);
> >>> -             uq_funcs->mqd_destroy(uq_mgr, queue);
> >>> -             kfree(queue);
> >>> -             goto unlock;
> >>> +     /* don't map the queue if scheduling is halted */
> >>> +     mutex_lock(&adev->userq_mutex);
> >>> +     if (!adev->userq_halt) {
> >>> +             r = uq_funcs->map(uq_mgr, queue);
> >>> +             if (r) {
> >>> +                     DRM_ERROR("Failed to map Queue\n");
> >>> +                     mutex_unlock(&adev->userq_mutex);
> >>> +                     idr_remove(&uq_mgr->userq_idr, qid);
> >>> +                     amdgpu_userq_fence_driver_free(queue);
> >>> +                     uq_funcs->mqd_destroy(uq_mgr, queue);
> >>> +                     kfree(queue);
> >>> +                     goto unlock;
> >>> +             }
> >>>        }
> >>> +     mutex_unlock(&adev->userq_mutex);
> >>>
> >>>        args->out.queue_id = qid;
> >>>
> >>> @@ -720,3 +727,48 @@ int amdgpu_userq_resume(struct amdgpu_device *adev)
> >>>        mutex_unlock(&adev->userq_mutex);
> >>>        return ret;
> >>>    }
> >>> +
> >>> +int amdgpu_userq_stop_sched(struct amdgpu_device *adev, u32 idx)
> >> Did I miss something, or is the param "idx" not being used in this
> >> function? The same is applicable for start sched.
> > It's only applicable for chips with multiple XCDs,  gfx11 and 12 are
> > only single XCDs so it's unused for now.
> >
> > I don't see a way to handle multiple XCDs with this function, and it might need a 
> > redesign. I am assuming that with an IP with multiple XCDs we will have 
> > multiple MES hw too, and we need to choose which MES unmap we have to call, 
> > isn't it?  Also, the unused variable might be a warning too.
> >
> > So you want to use idx later when we have a target that needs that 
> > support, and until then keep a version without idx? Anyway, it looks 
> > functionally working. Reviewed-by:
> > Sunil Khatri <sunil.kha...@amd.com>
> >
> > But some explanation in the commit message for idx, or maybe just here, is good 
> > to have.

The KFD equivalent of this function uses the idx because it supports
MI parts which have multiple XCDs. We already track the xcd idx
throughout the rest of the stack.  I'll add the idx handling for
userqs and respin the patch with that included.

Alex

> >
> > Regards
> > Sunil Khatri
> >
> > Alex
> >
> >> Regards
> >> Sunil
> >>> +{
> >>> +     const struct amdgpu_userq_funcs *userq_funcs;
> >>> +     struct amdgpu_usermode_queue *queue;
> >>> +     struct amdgpu_userq_mgr *uqm, *tmp;
> >>> +     int queue_id;
> >>> +     int ret = 0;
> >>> +
> >>> +     mutex_lock(&adev->userq_mutex);
> >>> +     if (adev->userq_halt)
> >>> +             dev_warn(adev->dev, "userq scheduling already stopped!\n");
> >>> +     adev->userq_halt = true;
> >>> +     list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
> >>> +             cancel_delayed_work_sync(&uqm->resume_work);
> >>> +             idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
> >>> +                     userq_funcs = adev->userq_funcs[queue->queue_type];
> >>> +                     ret |= userq_funcs->unmap(uqm, queue);
> >>> +             }
> >>> +     }
> >>> +     mutex_unlock(&adev->userq_mutex);
> >>> +     return ret;
> >>> +}
> >>> +
> >>> +int amdgpu_userq_start_sched(struct amdgpu_device *adev, u32 idx)
> >>> +{
> >>> +     const struct amdgpu_userq_funcs *userq_funcs;
> >>> +     struct amdgpu_usermode_queue *queue;
> >>> +     struct amdgpu_userq_mgr *uqm, *tmp;
> >>> +     int queue_id;
> >>> +     int ret = 0;
> >>> +
> >>> +     mutex_lock(&adev->userq_mutex);
> >>> +     if (!adev->userq_halt)
> >>> +             dev_warn(adev->dev, "userq scheduling already started!\n");
> >>> +     adev->userq_halt = false;
> >>> +     list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
> >>> +             idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
> >>> +                     userq_funcs = adev->userq_funcs[queue->queue_type];
> >>> +                     ret |= userq_funcs->map(uqm, queue);
> >>> +             }
> >>> +     }
> >>> +     mutex_unlock(&adev->userq_mutex);
> >>> +     return ret;
> >>> +}
> >>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.h 
> >>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.h
> >>> index 381b9c6f0573d..fd0542f60433b 100644
> >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.h
> >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.h
> >>> @@ -114,4 +114,7 @@ uint64_t amdgpu_userqueue_get_doorbell_index(struct 
> >>> amdgpu_userq_mgr *uq_mgr,
> >>>    int amdgpu_userq_suspend(struct amdgpu_device *adev);
> >>>    int amdgpu_userq_resume(struct amdgpu_device *adev);
> >>>
> >>> +int amdgpu_userq_stop_sched(struct amdgpu_device *adev, u32 idx);
> >>> +int amdgpu_userq_start_sched(struct amdgpu_device *adev, u32 idx);
> >>> +
> >>>    #endif

Reply via email to