On 04/07/2025 12:27, Philipp Stanner wrote:
On Fri, 2025-07-04 at 11:53 +0200, Philipp Stanner wrote:
On Wed, 2025-07-02 at 12:25 +0100, Tvrtko Ursulin wrote:

On 02/07/2025 11:56, Philipp Stanner wrote:
On Wed, 2025-07-02 at 11:36 +0100, Tvrtko Ursulin wrote:

On 01/07/2025 14:21, Philipp Stanner wrote:
The GPU Scheduler now supports a new callback, cancel_job(), which
lets the scheduler cancel all jobs which might not yet be freed when
drm_sched_fini() runs. Using this callback allows for significantly
simplifying the mock scheduler teardown code.

Implement the cancel_job() callback and adjust the code where
necessary.

Cross referencing against my version I think you missed this
hunk:

--- a/drivers/gpu/drm/scheduler/tests/sched_tests.h
+++ b/drivers/gpu/drm/scheduler/tests/sched_tests.h
@@ -49,7 +49,6 @@ struct drm_mock_scheduler {

        spinlock_t              lock;
        struct list_head        job_list;
-       struct list_head        done_list;

        struct {
                u64             context;


Right, overlooked that one.


I also had this:

@@ -97,7 +96,8 @@ struct drm_mock_sched_job {
        struct completion       done;

    #define DRM_MOCK_SCHED_JOB_DONE             0x1
-#define DRM_MOCK_SCHED_JOB_TIMEDOUT    0x2
+#define DRM_MOCK_SCHED_JOB_CANCELED    0x2
+#define DRM_MOCK_SCHED_JOB_TIMEDOUT    0x4

And was setting it in the callback. And since we should add a
test to
explicitly cover the new callback, and just the callback, that
could
make it very easy to do it.

What do you imagine that to look like? The scheduler only invokes
the
callback on tear down.

We also don't have tests that only test free_job() and the like,
do
we?

You cannot test a callback for the scheduler, because the
callback
is
implemented in the driver.

Callbacks are tested by using the scheduler. In this case, it's
tested the intended way by the unit tests invoking
drm_sched_fini().

Something like (untested):

static void drm_sched_test_cleanup(struct kunit *test)
{
        struct drm_mock_sched_entity *entity;
        struct drm_mock_scheduler *sched;
        struct drm_mock_sched_job *job;
        bool done;

        /*
         * Check that the job cancel callback gets invoked by the
scheduler.
         */

        sched = drm_mock_sched_new(test, MAX_SCHEDULE_TIMEOUT);
        entity = drm_mock_sched_entity_new(test,
                                           DRM_SCHED_PRIORITY_NORMAL,
                                           sched);

        job = drm_mock_sched_job_new(test, entity);
        drm_mock_sched_job_submit(job);
        done = drm_mock_sched_job_wait_scheduled(job, HZ);
        KUNIT_ASSERT_TRUE(test, done);

        drm_mock_sched_entity_free(entity);
        drm_mock_sched_fini(sched);

        KUNIT_ASSERT_TRUE(test, job->flags &
DRM_MOCK_SCHED_JOB_CANCELED);
}

That could work – but it's racy.

I wonder if we want to introduce a mechanism into the mock scheduler
through which we can enforce a simulated GPU hang. Then it would
never
race, and that might be useful for more advanced tests for reset
recovery and those things.

Opinions?

Ah wait, we can do that with job_duration = 0

Yes, the above already wasn't racy.

If job duration is not explicitly set, and it isn't, the job will not complete until test would call drm_mock_sched_advance(sched, 1). And since it doesn't, the job is guaranteed to be sitting on the list forever. So drm_sched_fini() has a guaranteed chance to clean it up.

Very good, I'll include something like that in v2.

Ack.

Regards,

Tvrtko

Or via the hw fence status.

Regards,

Tvrtko

Signed-off-by: Philipp Stanner <[email protected]>
---
    .../gpu/drm/scheduler/tests/mock_scheduler.c  | 66 +++++++----------
    1 file changed, 23 insertions(+), 43 deletions(-)

diff --git a/drivers/gpu/drm/scheduler/tests/mock_scheduler.c
b/drivers/gpu/drm/scheduler/tests/mock_scheduler.c
index 49d067fecd67..2d3169d95200 100644
--- a/drivers/gpu/drm/scheduler/tests/mock_scheduler.c
+++ b/drivers/gpu/drm/scheduler/tests/mock_scheduler.c
@@ -63,7 +63,7 @@ static void drm_mock_sched_job_complete(struct drm_mock_sched_job *job)
        lockdep_assert_held(&sched->lock);
    job->flags |= DRM_MOCK_SCHED_JOB_DONE;
-       list_move_tail(&job->link, &sched->done_list);
+       list_del(&job->link);
        dma_fence_signal_locked(&job->hw_fence);
        complete(&job->done);
    }
@@ -236,26 +236,39 @@ mock_sched_timedout_job(struct
drm_sched_job
*sched_job)
   static void mock_sched_free_job(struct drm_sched_job
*sched_job)
    {
-       struct drm_mock_scheduler *sched =
-                       drm_sched_to_mock_sched(sched_job->sched);
        struct drm_mock_sched_job *job =
drm_sched_job_to_mock_job(sched_job);
-       unsigned long flags;
- /* Remove from the scheduler done list. */
-       spin_lock_irqsave(&sched->lock, flags);
-       list_del(&job->link);
-       spin_unlock_irqrestore(&sched->lock, flags);
        dma_fence_put(&job->hw_fence);
-
        drm_sched_job_cleanup(sched_job);
    /* Mock job itself is freed by the kunit framework.
*/
    }
+static void mock_sched_cancel_job(struct drm_sched_job *sched_job)
+{
+       struct drm_mock_scheduler *sched = drm_sched_to_mock_sched(sched_job->sched);
+       struct drm_mock_sched_job *job = drm_sched_job_to_mock_job(sched_job);
+       unsigned long flags;
+
+       hrtimer_cancel(&job->timer);
+
+       spin_lock_irqsave(&sched->lock, flags);
+       if (!dma_fence_is_signaled_locked(&job->hw_fence)) {
+               list_del(&job->link);
+               dma_fence_set_error(&job->hw_fence, -ECANCELED);
+               dma_fence_signal_locked(&job->hw_fence);
+       }
+       spin_unlock_irqrestore(&sched->lock, flags);
+
+       /* The GPU Scheduler will call drm_sched_backend_ops.free_job(), still.
+        * Mock job itself is freed by the kunit framework.
+        */

/*
    * Multiline comment style to stay consistent, at least in
this
file.
    */

The rest looks good, but I need to revisit the timeout/free
handling
since it has been a while and you changed it recently.

Regards,

Tvrtko

+}
+
    static const struct drm_sched_backend_ops
drm_mock_scheduler_ops
= {
        .run_job = mock_sched_run_job,
        .timedout_job = mock_sched_timedout_job,
-       .free_job = mock_sched_free_job
+       .free_job = mock_sched_free_job,
+       .cancel_job = mock_sched_cancel_job,
    };
   /**
@@ -289,7 +302,6 @@ struct drm_mock_scheduler
*drm_mock_sched_new(struct kunit *test, long timeout)
        sched->hw_timeline.context =
dma_fence_context_alloc(1);
        atomic_set(&sched->hw_timeline.next_seqno, 0);
        INIT_LIST_HEAD(&sched->job_list);
-       INIT_LIST_HEAD(&sched->done_list);
        spin_lock_init(&sched->lock);
    return sched;
@@ -304,38 +316,6 @@ struct drm_mock_scheduler
*drm_mock_sched_new(struct kunit *test, long timeout)
     */
    void drm_mock_sched_fini(struct drm_mock_scheduler *sched)
    {
-       struct drm_mock_sched_job *job, *next;
-       unsigned long flags;
-       LIST_HEAD(list);
-
-       drm_sched_wqueue_stop(&sched->base);
-
-       /* Force complete all unfinished jobs. */
-       spin_lock_irqsave(&sched->lock, flags);
-       list_for_each_entry_safe(job, next, &sched->job_list, link)
-               list_move_tail(&job->link, &list);
-       spin_unlock_irqrestore(&sched->lock, flags);
-
-       list_for_each_entry(job, &list, link)
-               hrtimer_cancel(&job->timer);
-
-       spin_lock_irqsave(&sched->lock, flags);
-       list_for_each_entry_safe(job, next, &list, link)
-               drm_mock_sched_job_complete(job);
-       spin_unlock_irqrestore(&sched->lock, flags);
-
-       /*
-        * Free completed jobs and jobs not yet processed by
the
DRM scheduler
-        * free worker.
-        */
-       spin_lock_irqsave(&sched->lock, flags);
-       list_for_each_entry_safe(job, next, &sched->done_list, link)
-               list_move_tail(&job->link, &list);
-       spin_unlock_irqrestore(&sched->lock, flags);
-
-       list_for_each_entry_safe(job, next, &list, link)
-               mock_sched_free_job(&job->base);
-
        drm_sched_fini(&sched->base);
    }






Reply via email to