On Thu, Oct 20, 2022 at 02:08:41PM +0100, Tvrtko Ursulin wrote:
> From: Tvrtko Ursulin <tvrtko.ursu...@intel.com>
> 
> Since a7c01fa93aeb ("signal: break out of wait loops on kthread_stop()")
> kthread_stop() asserts a pending signal, which wreaks havoc with a few of
> our selftests. Mainly because they are not expecting to handle signals,
> but also because it cuts the intended test runtimes short due to
> signal_pending() now returning true (via __igt_timeout), which breaks
> both of the following patterns:
> 
>   kthread_run()
>   ..sleep for igt_timeout_ms to allow test to exercise stuff..
>   kthread_stop()
> 
> followed by a check for errors recorded in the thread.
> 
> And also:
> 
>     Main thread  |   Test thread
>   ---------------+------------------------------
>   kthread_run()  |
>   kthread_stop() |  do stuff until __igt_timeout
>                  |  -- exits early due to signal --
> 
> Here kthread_stop() was assumed to have "join" semantics, which it would
> have had if not for the new signal assertion issue.
> 
> To recap, threads are now likely to catch a previously impossible
> ERESTARTSYS or EINTR, marking the test as failed, or have a pointlessly
> short run time.
> 
> To work around this, start using the kthread_work(er) API, which provides
> an explicit way of waiting for threads to exit. For cases where the
> parent controls the test duration, add explicit stop signaling which the
> threads will now use instead of relying on kthread_should_stop().
> 
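
For anyone not familiar with the kthread_work(er) API, the general shape of
the conversion is roughly the following. This is only a simplified sketch
with made-up names (test_thread, test_fn, run_test, "igt/example"), not the
exact selftest code: the parent owns a stop flag and a result slot, and
waits for the work item with kthread_flush_work() instead of kthread_stop().

  #include <linux/err.h>
  #include <linux/delay.h>
  #include <linux/kthread.h>

  struct test_thread {
          struct kthread_worker *worker;
          struct kthread_work work;
          bool stop;
          int result;
  };

  /* Runs on the worker's kthread; errors are reported via t->result. */
  static void test_fn(struct kthread_work *work)
  {
          struct test_thread *t = container_of(work, typeof(*t), work);

          t->result = 0;
          while (!READ_ONCE(t->stop)) {
                  /* exercise stuff here; record any error in t->result */
                  if (t->result)
                          break;
          }
  }

  static int run_test(struct test_thread *t)
  {
          t->worker = kthread_create_worker(0, "igt/example");
          if (IS_ERR(t->worker))
                  return PTR_ERR(t->worker);

          kthread_init_work(&t->work, test_fn);
          kthread_queue_work(t->worker, &t->work);

          msleep(1000);                   /* let the test run for a while */

          WRITE_ONCE(t->stop, true);      /* ask the work item to finish... */
          kthread_flush_work(&t->work);   /* ...and wait for it ("join") */

          kthread_destroy_worker(t->worker);
          return t->result;
  }
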
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursu...@intel.com>
> Cc: Ville Syrjälä <ville.syrj...@linux.intel.com>
> ---
>  .../drm/i915/gem/selftests/i915_gem_context.c | 118 ++++----
>  drivers/gpu/drm/i915/gt/selftest_execlists.c  |  48 ++--
>  drivers/gpu/drm/i915/gt/selftest_hangcheck.c  |  51 ++--
>  drivers/gpu/drm/i915/selftests/i915_request.c | 252 +++++++++++-------
>  4 files changed, 281 insertions(+), 188 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c b/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c
> index c6ad67b90e8a..d8864444432b 100644
> --- a/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c
> @@ -179,97 +179,108 @@ static int live_nop_switch(void *arg)
>  }
>  
>  struct parallel_switch {
> -     struct task_struct *tsk;
> +     struct kthread_worker *worker;
> +     struct kthread_work work;
>       struct intel_context *ce[2];
> +     int result;
>  };
>  
> -static int __live_parallel_switch1(void *data)
> +static void __live_parallel_switch1(struct kthread_work *work)
>  {
> -     struct parallel_switch *arg = data;
> +     struct parallel_switch *arg =
> +             container_of(work, typeof(*arg), work);
>       IGT_TIMEOUT(end_time);
>       unsigned long count;
>  
>       count = 0;
> +     arg->result = 0;
>       do {
>               struct i915_request *rq = NULL;
> -             int err, n;
> +             int n;
>  
> -             err = 0;
> -             for (n = 0; !err && n < ARRAY_SIZE(arg->ce); n++) {
> +             for (n = 0; !arg->result && n < ARRAY_SIZE(arg->ce); n++) {
>                       struct i915_request *prev = rq;
>  
>                       rq = i915_request_create(arg->ce[n]);
>                       if (IS_ERR(rq)) {
>                               i915_request_put(prev);
> -                             return PTR_ERR(rq);
> +                             arg->result = PTR_ERR(rq);
> +                             break;
>                       }
>  
>                       i915_request_get(rq);
>                       if (prev) {
> -                             err = i915_request_await_dma_fence(rq, &prev->fence);
> +                             arg->result =
> +                                     i915_request_await_dma_fence(rq,
> +                                                                  &prev->fence);
>                               i915_request_put(prev);
>                       }
>  
>                       i915_request_add(rq);
>               }
> +
> +             if (IS_ERR_OR_NULL(rq))
> +                     break;
> +
>               if (i915_request_wait(rq, 0, HZ) < 0)
> -                     err = -ETIME;
> +                     arg->result = -ETIME;
> +
>               i915_request_put(rq);
> -             if (err)
> -                     return err;
>  
>               count++;
> -     } while (!__igt_timeout(end_time, NULL));
> +     } while (!arg->result && !__igt_timeout(end_time, NULL));
>  
> -     pr_info("%s: %lu switches (sync)\n", arg->ce[0]->engine->name, count);
> -     return 0;
> +     pr_info("%s: %lu switches (sync) <%d>\n",
> +             arg->ce[0]->engine->name, count, arg->result);
>  }
>  
> -static int __live_parallel_switchN(void *data)
> +static void __live_parallel_switchN(struct kthread_work *work)
>  {
> -     struct parallel_switch *arg = data;
> +     struct parallel_switch *arg =
> +             container_of(work, typeof(*arg), work);
>       struct i915_request *rq = NULL;
>       IGT_TIMEOUT(end_time);
>       unsigned long count;
>       int n;
>  
>       count = 0;
> +     arg->result = 0;
>       do {
> -             for (n = 0; n < ARRAY_SIZE(arg->ce); n++) {
> +             for (n = 0; !arg->result && n < ARRAY_SIZE(arg->ce); n++) {
>                       struct i915_request *prev = rq;
> -                     int err = 0;
>  
>                       rq = i915_request_create(arg->ce[n]);
>                       if (IS_ERR(rq)) {
>                               i915_request_put(prev);
> -                             return PTR_ERR(rq);
> +                             arg->result = PTR_ERR(rq);
> +                             break;
>                       }
>  
>                       i915_request_get(rq);
>                       if (prev) {
> -                             err = i915_request_await_dma_fence(rq, &prev->fence);
> +                             arg->result =
> +                                     i915_request_await_dma_fence(rq,
> +                                                                  &prev->fence);
>                               i915_request_put(prev);
>                       }
>  
>                       i915_request_add(rq);
> -                     if (err) {
> -                             i915_request_put(rq);
> -                             return err;
> -                     }
>               }
>  
>               count++;
> -     } while (!__igt_timeout(end_time, NULL));
> -     i915_request_put(rq);
> +     } while (!arg->result && !__igt_timeout(end_time, NULL));
>  
> -     pr_info("%s: %lu switches (many)\n", arg->ce[0]->engine->name, count);
> -     return 0;
> +     if (!IS_ERR_OR_NULL(rq))
> +             i915_request_put(rq);
> +
> +     pr_info("%s: %lu switches (many) <%d>\n",
> +             arg->ce[0]->engine->name, count, arg->result);
>  }
>  
>  static int live_parallel_switch(void *arg)
>  {
>       struct drm_i915_private *i915 = arg;
> -     static int (* const func[])(void *arg) = {
> +     static void (* const func[])(struct kthread_work *) = {
>               __live_parallel_switch1,
>               __live_parallel_switchN,
>               NULL,
> @@ -277,7 +288,7 @@ static int live_parallel_switch(void *arg)
>       struct parallel_switch *data = NULL;
>       struct i915_gem_engines *engines;
>       struct i915_gem_engines_iter it;
> -     int (* const *fn)(void *arg);
> +     void (* const *fn)(struct kthread_work *);
>       struct i915_gem_context *ctx;
>       struct intel_context *ce;
>       struct file *file;
> @@ -348,9 +359,22 @@ static int live_parallel_switch(void *arg)
>               }
>       }
>  
> +     for (n = 0; n < count; n++) {
> +             struct kthread_worker *worker;
> +
> +             if (!data[n].ce[0])
> +                     continue;
> +
> +             worker = kthread_create_worker(0, "igt/parallel:%s",
> +                                            data[n].ce[0]->engine->name);
> +             if (IS_ERR(worker))
> +                     goto out;
> +
> +             data[n].worker = worker;
> +     }
> +
>       for (fn = func; !err && *fn; fn++) {
>               struct igt_live_test t;
> -             int n;
>  
>               err = igt_live_test_begin(&t, i915, __func__, "");
>               if (err)
> @@ -360,30 +384,17 @@ static int live_parallel_switch(void *arg)
>                       if (!data[n].ce[0])
>                               continue;
>  
> -                     data[n].tsk = kthread_run(*fn, &data[n],
> -                                               "igt/parallel:%s",
> -                                               data[n].ce[0]->engine->name);
> -                     if (IS_ERR(data[n].tsk)) {
> -                             err = PTR_ERR(data[n].tsk);
> -                             break;
> -                     }
> -                     get_task_struct(data[n].tsk);
> +                     data[n].result = 0;
> +                     kthread_init_work(&data[n].work, *fn);
> +                     kthread_queue_work(data[n].worker, &data[n].work);
>               }
>  
> -             yield(); /* start all threads before we kthread_stop() */
> -
>               for (n = 0; n < count; n++) {
> -                     int status;
> -
> -                     if (IS_ERR_OR_NULL(data[n].tsk))
> -                             continue;
> -
> -                     status = kthread_stop(data[n].tsk);
> -                     if (status && !err)
> -                             err = status;
> -
> -                     put_task_struct(data[n].tsk);
> -                     data[n].tsk = NULL;
> +                     if (data[n].ce[0]) {
> +                             kthread_flush_work(&data[n].work);
> +                             if (data[n].result && !err)
> +                                     err = data[n].result;
> +                     }
>               }
>  
>               if (igt_live_test_end(&t))
> @@ -399,6 +410,9 @@ static int live_parallel_switch(void *arg)
>                       intel_context_unpin(data[n].ce[m]);
>                       intel_context_put(data[n].ce[m]);
>               }
> +
> +             if (data[n].worker)
> +                     kthread_destroy_worker(data[n].worker);
>       }
>       kfree(data);
>  out_file:
> diff --git a/drivers/gpu/drm/i915/gt/selftest_execlists.c b/drivers/gpu/drm/i915/gt/selftest_execlists.c
> index 56b7d5b5fea0..2c7c053a8808 100644
> --- a/drivers/gpu/drm/i915/gt/selftest_execlists.c
> +++ b/drivers/gpu/drm/i915/gt/selftest_execlists.c
> @@ -3473,12 +3473,14 @@ static int random_priority(struct rnd_state *rnd)
>  
>  struct preempt_smoke {
>       struct intel_gt *gt;
> +     struct kthread_work work;
>       struct i915_gem_context **contexts;
>       struct intel_engine_cs *engine;
>       struct drm_i915_gem_object *batch;
>       unsigned int ncontext;
>       struct rnd_state prng;
>       unsigned long count;
> +     int result;
>  };
>  
>  static struct i915_gem_context *smoke_context(struct preempt_smoke *smoke)
> @@ -3538,34 +3540,31 @@ static int smoke_submit(struct preempt_smoke *smoke,
>       return err;
>  }
>  
> -static int smoke_crescendo_thread(void *arg)
> +static void smoke_crescendo_work(struct kthread_work *work)
>  {
> -     struct preempt_smoke *smoke = arg;
> +     struct preempt_smoke *smoke = container_of(work, typeof(*smoke), work);
>       IGT_TIMEOUT(end_time);
>       unsigned long count;
>  
>       count = 0;
>       do {
>               struct i915_gem_context *ctx = smoke_context(smoke);
> -             int err;
>  
> -             err = smoke_submit(smoke,
> -                                ctx, count % I915_PRIORITY_MAX,
> -                                smoke->batch);
> -             if (err)
> -                     return err;
> +             smoke->result = smoke_submit(smoke, ctx,
> +                                          count % I915_PRIORITY_MAX,
> +                                          smoke->batch);
>  
>               count++;
> -     } while (count < smoke->ncontext && !__igt_timeout(end_time, NULL));
> +     } while (!smoke->result && count < smoke->ncontext &&
> +              !__igt_timeout(end_time, NULL));
>  
>       smoke->count = count;
> -     return 0;
>  }
>  
>  static int smoke_crescendo(struct preempt_smoke *smoke, unsigned int flags)
>  #define BATCH BIT(0)
>  {
> -     struct task_struct *tsk[I915_NUM_ENGINES] = {};
> +     struct kthread_worker *worker[I915_NUM_ENGINES] = {};
>       struct preempt_smoke *arg;
>       struct intel_engine_cs *engine;
>       enum intel_engine_id id;
> @@ -3576,6 +3575,8 @@ static int smoke_crescendo(struct preempt_smoke *smoke, unsigned int flags)
>       if (!arg)
>               return -ENOMEM;
>  
> +     memset(arg, 0, I915_NUM_ENGINES * sizeof(*arg));

kcalloc()?
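
For reference, that would presumably look something along these lines,
collapsing the allocation (a kmalloc_array() above the hunk, if I read it
right) and the added memset() into a single call -- untested:

        arg = kcalloc(I915_NUM_ENGINES, sizeof(*arg), GFP_KERNEL);
        if (!arg)
                return -ENOMEM;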

> +
>       for_each_engine(engine, smoke->gt, id) {
>               arg[id] = *smoke;
>               arg[id].engine = engine;
> @@ -3583,31 +3584,28 @@ static int smoke_crescendo(struct preempt_smoke *smoke, unsigned int flags)
>                       arg[id].batch = NULL;
>               arg[id].count = 0;
>  
> -             tsk[id] = kthread_run(smoke_crescendo_thread, arg,
> -                                   "igt/smoke:%d", id);
> -             if (IS_ERR(tsk[id])) {
> -                     err = PTR_ERR(tsk[id]);
> +             worker[id] = kthread_create_worker(0, "igt/smoke:%d", id);
> +             if (IS_ERR(worker[id])) {
> +                     err = PTR_ERR(worker[id]);
>                       break;
>               }
> -             get_task_struct(tsk[id]);
> -     }
>  
> -     yield(); /* start all threads before we kthread_stop() */
> +             kthread_init_work(&arg[id].work, smoke_crescendo_work);
> +             kthread_queue_work(worker[id], &arg[id].work);
> +     }
>  
>       count = 0;
>       for_each_engine(engine, smoke->gt, id) {
> -             int status;
> -
> -             if (IS_ERR_OR_NULL(tsk[id]))
> +             if (IS_ERR_OR_NULL(worker[id]))
>                       continue;
>  
> -             status = kthread_stop(tsk[id]);
> -             if (status && !err)
> -                     err = status;
> +             kthread_flush_work(&arg[id].work);
> +             if (arg[id].result && !err)
> +                     err = arg[id].result;
>  
>               count += arg[id].count;
>  
> -             put_task_struct(tsk[id]);
> +             kthread_destroy_worker(worker[id]);
>       }
>  
>       pr_info("Submitted %lu crescendo:%x requests across %d engines and %d contexts\n",
> diff --git a/drivers/gpu/drm/i915/gt/selftest_hangcheck.c b/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
> index 7f3bb1d34dfb..71263058a7b0 100644
> --- a/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
> +++ b/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
> @@ -866,10 +866,13 @@ static int igt_reset_active_engine(void *arg)
>  }
>  
>  struct active_engine {
> -     struct task_struct *task;
> +     struct kthread_worker *worker;
> +     struct kthread_work work;
>       struct intel_engine_cs *engine;
>       unsigned long resets;
>       unsigned int flags;
> +     bool stop;
> +     int result;
>  };
>  
>  #define TEST_ACTIVE  BIT(0)
> @@ -900,10 +903,10 @@ static int active_request_put(struct i915_request *rq)
>       return err;
>  }
>  
> -static int active_engine(void *data)
> +static void active_engine(struct kthread_work *work)
>  {
>       I915_RND_STATE(prng);
> -     struct active_engine *arg = data;
> +     struct active_engine *arg = container_of(work, typeof(*arg), work);
>       struct intel_engine_cs *engine = arg->engine;
>       struct i915_request *rq[8] = {};
>       struct intel_context *ce[ARRAY_SIZE(rq)];
> @@ -913,16 +916,17 @@ static int active_engine(void *data)
>       for (count = 0; count < ARRAY_SIZE(ce); count++) {
>               ce[count] = intel_context_create(engine);
>               if (IS_ERR(ce[count])) {
> -                     err = PTR_ERR(ce[count]);
> -                     pr_err("[%s] Create context #%ld failed: %d!\n", engine->name, count, err);
> +                     arg->result = PTR_ERR(ce[count]);
> +                     pr_err("[%s] Create context #%ld failed: %d!\n",
> +                            engine->name, count, arg->result);
>                       while (--count)
>                               intel_context_put(ce[count]);
> -                     return err;
> +                     return;
>               }
>       }
>  
>       count = 0;
> -     while (!kthread_should_stop()) {
> +     while (!READ_ONCE(arg->stop)) {
>               unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1);
>               struct i915_request *old = rq[idx];
>               struct i915_request *new;
> @@ -967,7 +971,7 @@ static int active_engine(void *data)
>               intel_context_put(ce[count]);
>       }
>  
> -     return err;
> +     arg->result = err;
>  }
>  
>  static int __igt_reset_engines(struct intel_gt *gt,
> @@ -1022,7 +1026,7 @@ static int __igt_reset_engines(struct intel_gt *gt,
>  
>               memset(threads, 0, sizeof(*threads) * I915_NUM_ENGINES);
>               for_each_engine(other, gt, tmp) {
> -                     struct task_struct *tsk;
> +                     struct kthread_worker *worker;
>  
>                       threads[tmp].resets =
>                               i915_reset_engine_count(global, other);
> @@ -1036,19 +1040,21 @@ static int __igt_reset_engines(struct intel_gt *gt,
>                       threads[tmp].engine = other;
>                       threads[tmp].flags = flags;
>  
> -                     tsk = kthread_run(active_engine, &threads[tmp],
> -                                       "igt/%s", other->name);
> -                     if (IS_ERR(tsk)) {
> -                             err = PTR_ERR(tsk);
> -                             pr_err("[%s] Thread spawn failed: %d!\n", engine->name, err);
> +                     worker = kthread_create_worker(0, "igt/%s",
> +                                                    other->name);
> +                     if (IS_ERR(worker)) {
> +                             err = PTR_ERR(worker);
> +                             pr_err("[%s] Worker create failed: %d!\n",
> +                                    engine->name, err);
>                               goto unwind;
>                       }
>  
> -                     threads[tmp].task = tsk;
> -                     get_task_struct(tsk);
> -             }
> +                     threads[tmp].worker = worker;
>  
> -             yield(); /* start all threads before we begin */
> +                     kthread_init_work(&threads[tmp].work, active_engine);
> +                     kthread_queue_work(threads[tmp].worker,
> +                                        &threads[tmp].work);
> +             }
>  
>               st_engine_heartbeat_disable_no_pm(engine);
>               GEM_BUG_ON(test_and_set_bit(I915_RESET_ENGINE + id,
> @@ -1197,17 +1203,20 @@ static int __igt_reset_engines(struct intel_gt *gt,
>               for_each_engine(other, gt, tmp) {
>                       int ret;
>  
> -                     if (!threads[tmp].task)
> +                     if (!threads[tmp].worker)
>                               continue;
>  
> -                     ret = kthread_stop(threads[tmp].task);
> +                     WRITE_ONCE(threads[tmp].stop, true);
> +                     kthread_flush_work(&threads[tmp].work);

Could perhaps optimize things a bit by flagging all the threads
to stop before doing any flush_work()s. And maybe also do all
the init stuff before queuing any works. But dunno if that's
worth the hassle.
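
Something like this for the first part, just to illustrate -- an untested
sketch reusing the names from the patch, with the error collection kept
deliberately simple:

                /* Flag every worker to stop first... */
                for_each_engine(other, gt, tmp) {
                        if (threads[tmp].worker)
                                WRITE_ONCE(threads[tmp].stop, true);
                }

                /* ...then flush them, so the test threads wind down in
                 * parallel rather than one after another. */
                for_each_engine(other, gt, tmp) {
                        if (!threads[tmp].worker)
                                continue;

                        kthread_flush_work(&threads[tmp].work);
                        if (threads[tmp].result && !err)
                                err = threads[tmp].result;
                }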

Looks reasonable enough to me.
Reviewed-by: Ville Syrjälä <ville.syrj...@linux.intel.com>

-- 
Ville Syrjälä
Intel
