On Thu, Aug 26, 2021 at 12:27:31PM +0100, Tvrtko Ursulin wrote:
> 
> On 26/08/2021 04:49, Matthew Brost wrote:
> > On Wed, Aug 25, 2021 at 11:39:10AM +0100, Tvrtko Ursulin wrote:
> > > 
> > > On 27/07/2021 01:23, Matthew Brost wrote:
> > > > When using GuC submission, if a context gets banned disable scheduling
> > > > and mark all inflight requests as complete.
> > > > 
> > > > Cc: John Harrison <john.c.harri...@intel.com>
> > > > Signed-off-by: Matthew Brost <matthew.br...@intel.com>
> > > > Reviewed-by: John Harrison <john.c.harri...@intel.com>
> > > > ---
> > > >    drivers/gpu/drm/i915/gem/i915_gem_context.c   |   2 +-
> > > >    drivers/gpu/drm/i915/gt/intel_context.h       |  13 ++
> > > >    drivers/gpu/drm/i915/gt/intel_context_types.h |   2 +
> > > >    drivers/gpu/drm/i915/gt/intel_reset.c         |  32 +---
> > > >    .../gpu/drm/i915/gt/intel_ring_submission.c   |  20 +++
> > > >    drivers/gpu/drm/i915/gt/uc/intel_guc.h        |   2 +
> > > >    .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 151 
> > > > ++++++++++++++++--
> > > >    drivers/gpu/drm/i915/i915_trace.h             |  10 ++
> > > >    8 files changed, 195 insertions(+), 37 deletions(-)
> > > > 
> > > > diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c 
> > > > b/drivers/gpu/drm/i915/gem/i915_gem_context.c
> > > > index e3df01a201d7..05c3ee191710 100644
> > > > --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c
> > > > +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c
> > > > @@ -1084,7 +1084,7 @@ static void kill_engines(struct i915_gem_engines 
> > > > *engines, bool ban)
> > > >         for_each_gem_engine(ce, engines, it) {
> > > >                 struct intel_engine_cs *engine;
> > > > -               if (ban && intel_context_set_banned(ce))
> > > > +               if (ban && intel_context_ban(ce, NULL))
> > > >                         continue;
> > > >                 /*
> > > > diff --git a/drivers/gpu/drm/i915/gt/intel_context.h 
> > > > b/drivers/gpu/drm/i915/gt/intel_context.h
> > > > index 2ed9bf5f91a5..814d9277096a 100644
> > > > --- a/drivers/gpu/drm/i915/gt/intel_context.h
> > > > +++ b/drivers/gpu/drm/i915/gt/intel_context.h
> > > > @@ -16,6 +16,7 @@
> > > >    #include "intel_engine_types.h"
> > > >    #include "intel_ring_types.h"
> > > >    #include "intel_timeline_types.h"
> > > > +#include "i915_trace.h"
> > > >    #define CE_TRACE(ce, fmt, ...) do {                                  
> > > > \
> > > >         const struct intel_context *ce__ = (ce);                        
> > > > \
> > > > @@ -243,6 +244,18 @@ static inline bool intel_context_set_banned(struct 
> > > > intel_context *ce)
> > > >         return test_and_set_bit(CONTEXT_BANNED, &ce->flags);
> > > >    }
> > > > +static inline bool intel_context_ban(struct intel_context *ce,
> > > > +                                    struct i915_request *rq)
> > > > +{
> > > > +       bool ret = intel_context_set_banned(ce);
> > > > +
> > > > +       trace_intel_context_ban(ce);
> > > > +       if (ce->ops->ban)
> > > > +               ce->ops->ban(ce, rq);
> > > > +
> > > > +       return ret;
> > > > +}
> > > > +
> > > >    static inline bool
> > > >    intel_context_force_single_submission(const struct intel_context *ce)
> > > >    {
> > > > diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h 
> > > > b/drivers/gpu/drm/i915/gt/intel_context_types.h
> > > > index 035108c10b2c..57c19ee3e313 100644
> > > > --- a/drivers/gpu/drm/i915/gt/intel_context_types.h
> > > > +++ b/drivers/gpu/drm/i915/gt/intel_context_types.h
> > > > @@ -35,6 +35,8 @@ struct intel_context_ops {
> > > >         int (*alloc)(struct intel_context *ce);
> > > > +       void (*ban)(struct intel_context *ce, struct i915_request *rq);
> > > > +
> > > >         int (*pre_pin)(struct intel_context *ce, struct i915_gem_ww_ctx 
> > > > *ww, void **vaddr);
> > > >         int (*pin)(struct intel_context *ce, void *vaddr);
> > > >         void (*unpin)(struct intel_context *ce);
> > > > diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c 
> > > > b/drivers/gpu/drm/i915/gt/intel_reset.c
> > > > index 4d281bc8a38c..91200c43951f 100644
> > > > --- a/drivers/gpu/drm/i915/gt/intel_reset.c
> > > > +++ b/drivers/gpu/drm/i915/gt/intel_reset.c
> > > > @@ -22,7 +22,6 @@
> > > >    #include "intel_reset.h"
> > > >    #include "uc/intel_guc.h"
> > > > -#include "uc/intel_guc_submission.h"
> > > >    #define RESET_MAX_RETRIES 3
> > > > @@ -39,21 +38,6 @@ static void rmw_clear_fw(struct intel_uncore 
> > > > *uncore, i915_reg_t reg, u32 clr)
> > > >         intel_uncore_rmw_fw(uncore, reg, clr, 0);
> > > >    }
> > > > -static void skip_context(struct i915_request *rq)
> > > > -{
> > > > -       struct intel_context *hung_ctx = rq->context;
> > > > -
> > > > -       list_for_each_entry_from_rcu(rq, &hung_ctx->timeline->requests, 
> > > > link) {
> > > > -               if (!i915_request_is_active(rq))
> > > > -                       return;
> > > > -
> > > > -               if (rq->context == hung_ctx) {
> > > > -                       i915_request_set_error_once(rq, -EIO);
> > > > -                       __i915_request_skip(rq);
> > > > -               }
> > > > -       }
> > > > -}
> > > > -
> > > >    static void client_mark_guilty(struct i915_gem_context *ctx, bool 
> > > > banned)
> > > >    {
> > > >         struct drm_i915_file_private *file_priv = ctx->file_priv;
> > > > @@ -88,10 +72,8 @@ static bool mark_guilty(struct i915_request *rq)
> > > >         bool banned;
> > > >         int i;
> > > > -       if (intel_context_is_closed(rq->context)) {
> > > > -               intel_context_set_banned(rq->context);
> > > > +       if (intel_context_is_closed(rq->context))
> > > >                 return true;
> > > > -       }
> > > >         rcu_read_lock();
> > > >         ctx = rcu_dereference(rq->context->gem_context);
> > > > @@ -123,11 +105,9 @@ static bool mark_guilty(struct i915_request *rq)
> > > >         banned = !i915_gem_context_is_recoverable(ctx);
> > > >         if (time_before(jiffies, prev_hang + CONTEXT_FAST_HANG_JIFFIES))
> > > >                 banned = true;
> > > > -       if (banned) {
> > > > +       if (banned)
> > > >                 drm_dbg(&ctx->i915->drm, "context %s: guilty %d, 
> > > > banned\n",
> > > >                         ctx->name, atomic_read(&ctx->guilty_count));
> > > > -               intel_context_set_banned(rq->context);
> > > > -       }
> > > >         client_mark_guilty(ctx, banned);
> > > > @@ -149,6 +129,8 @@ static void mark_innocent(struct i915_request *rq)
> > > >    void __i915_request_reset(struct i915_request *rq, bool guilty)
> > > >    {
> > > > +       bool banned = false;
> > > > +
> > > >         RQ_TRACE(rq, "guilty? %s\n", yesno(guilty));
> > > >         GEM_BUG_ON(__i915_request_is_complete(rq));
> > > > @@ -156,13 +138,15 @@ void __i915_request_reset(struct i915_request 
> > > > *rq, bool guilty)
> > > >         if (guilty) {
> > > >                 i915_request_set_error_once(rq, -EIO);
> > > >                 __i915_request_skip(rq);
> > > > -               if (mark_guilty(rq) && 
> > > > !intel_engine_uses_guc(rq->engine))
> > > > -                       skip_context(rq);
> > > > +               banned = mark_guilty(rq);
> > > >         } else {
> > > >                 i915_request_set_error_once(rq, -EAGAIN);
> > > >                 mark_innocent(rq);
> > > >         }
> > > >         rcu_read_unlock();
> > > > +
> > > > +       if (banned)
> > > > +               intel_context_ban(rq->context, rq);
> > > >    }
> > > >    static bool i915_in_reset(struct pci_dev *pdev)
> > > > diff --git a/drivers/gpu/drm/i915/gt/intel_ring_submission.c 
> > > > b/drivers/gpu/drm/i915/gt/intel_ring_submission.c
> > > > index a5404c7b600f..05bb9f449df1 100644
> > > > --- a/drivers/gpu/drm/i915/gt/intel_ring_submission.c
> > > > +++ b/drivers/gpu/drm/i915/gt/intel_ring_submission.c
> > > > @@ -586,9 +586,29 @@ static void ring_context_reset(struct 
> > > > intel_context *ce)
> > > >         clear_bit(CONTEXT_VALID_BIT, &ce->flags);
> > > >    }
> > > > +static void ring_context_ban(struct intel_context *ce,
> > > > +                            struct i915_request *rq)
> > > > +{
> > > > +       struct intel_engine_cs *engine;
> > > > +
> > > > +       if (!rq || !i915_request_is_active(rq))
> > > > +               return;
> > > When this gets called from context close, via intel_context_ban, rq will 
> > > be
> > > always NULL - so the below loop which skips a line of executing requests
> > > never gets to run.
> > > 
> > > This appears a functional change versus previous state of the codebase,
> > > where skip_context() would run in any case.
> > > 
> > 
> > Hmm, not so sure about this being a functional change. When called from
> > __i915_request_reset, intel_context_ban (previously skip_context there)
> > always has a non-NULL request argument. When called from kill_engines
> > the request is NULL (short circuiting ring function), but we never
> > hooked into the ring backend before.
> 
> Yeah that's the area of confusion. You have defined the interface as
> "intel_context_ban(ce, rq)" and it is called two times during a common flow.
> It is true there is no functional change, but it is confusing what the
> purpose of intel_context_ban then is, given how absence of the rq parameter
> on the context close path makes it do almost nothing (apart in the case of
> GuC), plus on top, the flow actually relies on it being called 2nd time from
> deeper in the stack in case of ringbuf (other backend do not care).
> 

This was an attempt to pull backend-specific behavior (i.e.,
skip_context prior to my patch) into the backend.

> > 
> > > I had this observation from a patch I am working on 
> > > (https://intel-gfx-ci.01.org/tree/drm-tip/Trybot_7950/shard-snb6/igt@gem_ctx_e...@basic-nohangcheck.html
> > > - so a long line of executing requests which did not get zapped post 
> > > reset),
> > > but may be wrong. Maybe I am missing something since I don't yet 
> > > understand
> > > why would I be first to hit this issue. So take it with a grain of salt 
> > > for
> > > now.
> > > 
> > 
> > CI was green on my series but CI doesn't always catch everything...
> > 
> > Do you have link to your series that I can look at?
> 
> It was on trybot, thought you'll figure it out, but now I have sent it to
> intel-gfx as well so please have a look. To be clear I am not too happy with
> the current state of that patch..
> 

I found it. Almost certainly your problem is short-circuiting the
backend vfunc call based on the state of the ban bit.

> > 
> > > Ah.. maybe the key is that in my patch I made intel_context_ban not call
> > > ce->ops->"ban" (I renamed it to revoke) unconditionally. Hence there may 
> > > be
> > > a path there intel_context_ban is first called wo/ a rq, then from within
> > > __i915_request_reset it gets called with rq, which now fails to call the
> > > vfunc. Hm that's clunky and fragile which ever way I look at it. I'll 
> > > trybot
> > > one more experiment..
> > > 
> > 
> > I'm open to suggestions on how to change this, but to me it makes sense to
> > call into the function unconditionally with the correct arguments and have
> > the backend handle the rest (i.e. the way it is currently).
> 
> .. given what I wrote in the first paragraph. And I presently have no
> suggestions how to improve it, since it seems complicated enough to require
> quite a bit of thinking.
> 
> The issues as I see them now, intel_context_ban, when called from the
> context close path:
> 
> Guc:
>  * Does not use the rq parameter even if it was available.
>  * Deals with the list of queued requests.
>    (Although why isn't guc_cancel_context_requests doing any skipping,
> contrary to the comment inside?)
>

It calls i915_request_mark_eio, which skips a request without putting it
back on the HW. The context is banned, thus we can't / shouldn't put that
context on the HW ever again.

> Execlists:
>  * Does not use the vfunc at all, remains to be just a set_bit call.
>

Chris pulled the execlists vfunc out, but isn't the point of having a
vfunc that it can be present or not?

> Ringbuffer:
>  * Just sets the bit.
> 
> When called from the reset path:
> 
> Guc:
>  * Same as on context close path - redundant?
>

Not redundant, in both cases we need to ban context in the GuC.
Scheduling is likely disabled in this path so we can cancel the requests
straight away.

> Execlists:
>  * Nothing, sets the same already set bit potentially.
> 
> Ringbuf:
>  * Deals with cancelling queued requests - like GuC does in the 1st
> invocation already.
>  * Does not appear to use the rq argument meaningfully. Isn't the "is not
> active" check redundant to sched_engine->requests only having active requests
> on the list?

Not sure. This is why I added the request argument; perhaps it could be
removed? Whether it is present or NULL also has a meaning (post-reset vs.
pre-reset, respectively).

> 
> Most obvious open question is whether the rq parameter is even needed.
>

Maybe, see above.

> Then is the redundant call to the same func in case of GuC needed? If not,
> which one should remain - first or the second call?
>

In the case of the GuC we need both calls.

Matt

> Regards,
> 
> Tvrtko
> 
> > Matt
> > 
> > > Regards,
> > > 
> > > Tvrtko
> > > 
> > > > +
> > > > +       engine = rq->engine;
> > > > +       lockdep_assert_held(&engine->sched_engine->lock);
> > > > +       list_for_each_entry_continue(rq, 
> > > > &engine->sched_engine->requests,
> > > > +                                    sched.link)
> > > > +               if (rq->context == ce) {
> > > > +                       i915_request_set_error_once(rq, -EIO);
> > > > +                       __i915_request_skip(rq);
> > > > +               }
> > > > +}
> > > > +
> > > >    static const struct intel_context_ops ring_context_ops = {
> > > >         .alloc = ring_context_alloc,
> > > > +       .ban = ring_context_ban,
> > > > +
> > > >         .pre_pin = ring_context_pre_pin,
> > > >         .pin = ring_context_pin,
> > > >         .unpin = ring_context_unpin,
> > > > diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.h 
> > > > b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
> > > > index 1875303c3bca..8ab70a2223b0 100644
> > > > --- a/drivers/gpu/drm/i915/gt/uc/intel_guc.h
> > > > +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
> > > > @@ -281,6 +281,8 @@ void intel_guc_find_hung_context(struct 
> > > > intel_engine_cs *engine);
> > > >    int intel_guc_global_policies_update(struct intel_guc *guc);
> > > > +void intel_guc_context_ban(struct intel_context *ce, struct 
> > > > i915_request *rq);
> > > > +
> > > >    void intel_guc_submission_reset_prepare(struct intel_guc *guc);
> > > >    void intel_guc_submission_reset(struct intel_guc *guc, bool stalled);
> > > >    void intel_guc_submission_reset_finish(struct intel_guc *guc);
> > > > diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c 
> > > > b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> > > > index cea3e3073a71..ad9a38a861df 100644
> > > > --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> > > > +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> > > > @@ -125,6 +125,7 @@ static inline void 
> > > > clr_context_pending_enable(struct intel_context *ce)
> > > >    #define SCHED_STATE_WAIT_FOR_DEREGISTER_TO_REGISTER  BIT(0)
> > > >    #define SCHED_STATE_DESTROYED                                BIT(1)
> > > >    #define SCHED_STATE_PENDING_DISABLE                  BIT(2)
> > > > +#define SCHED_STATE_BANNED                             BIT(3)
> > > >    static inline void init_sched_state(struct intel_context *ce)
> > > >    {
> > > >         /* Only should be called from guc_lrc_desc_pin() */
> > > > @@ -185,6 +186,23 @@ static inline void 
> > > > clr_context_pending_disable(struct intel_context *ce)
> > > >         ce->guc_state.sched_state &= ~SCHED_STATE_PENDING_DISABLE;
> > > >    }
> > > > +static inline bool context_banned(struct intel_context *ce)
> > > > +{
> > > > +       return ce->guc_state.sched_state & SCHED_STATE_BANNED;
> > > > +}
> > > > +
> > > > +static inline void set_context_banned(struct intel_context *ce)
> > > > +{
> > > > +       lockdep_assert_held(&ce->guc_state.lock);
> > > > +       ce->guc_state.sched_state |= SCHED_STATE_BANNED;
> > > > +}
> > > > +
> > > > +static inline void clr_context_banned(struct intel_context *ce)
> > > > +{
> > > > +       lockdep_assert_held(&ce->guc_state.lock);
> > > > +       ce->guc_state.sched_state &= ~SCHED_STATE_BANNED;
> > > > +}
> > > > +
> > > >    static inline bool context_guc_id_invalid(struct intel_context *ce)
> > > >    {
> > > >         return ce->guc_id == GUC_INVALID_LRC_ID;
> > > > @@ -357,13 +375,23 @@ static int guc_lrc_desc_pin(struct intel_context 
> > > > *ce, bool loop);
> > > >    static int guc_add_request(struct intel_guc *guc, struct 
> > > > i915_request *rq)
> > > >    {
> > > > -       int err;
> > > > +       int err = 0;
> > > >         struct intel_context *ce = rq->context;
> > > >         u32 action[3];
> > > >         int len = 0;
> > > >         u32 g2h_len_dw = 0;
> > > >         bool enabled;
> > > > +       /*
> > > > +        * Corner case where requests were sitting in the priority list 
> > > > or a
> > > > +        * request resubmitted after the context was banned.
> > > > +        */
> > > > +       if (unlikely(intel_context_is_banned(ce))) {
> > > > +               i915_request_put(i915_request_mark_eio(rq));
> > > > +               intel_engine_signal_breadcrumbs(ce->engine);
> > > > +               goto out;
> > > > +       }
> > > > +
> > > >         GEM_BUG_ON(!atomic_read(&ce->guc_id_ref));
> > > >         GEM_BUG_ON(context_guc_id_invalid(ce));
> > > > @@ -399,6 +427,8 @@ static int guc_add_request(struct intel_guc *guc, 
> > > > struct i915_request *rq)
> > > >                 clr_context_pending_enable(ce);
> > > >                 intel_context_put(ce);
> > > >         }
> > > > +       if (likely(!err))
> > > > +               trace_i915_request_guc_submit(rq);
> > > >    out:
> > > >         return err;
> > > > @@ -463,7 +493,6 @@ static int guc_dequeue_one_context(struct intel_guc 
> > > > *guc)
> > > >                         guc->stalled_request = last;
> > > >                         return false;
> > > >                 }
> > > > -               trace_i915_request_guc_submit(last);
> > > >         }
> > > >         guc->stalled_request = NULL;
> > > > @@ -502,12 +531,13 @@ static void cs_irq_handler(struct intel_engine_cs 
> > > > *engine, u16 iir)
> > > >    static void __guc_context_destroy(struct intel_context *ce);
> > > >    static void release_guc_id(struct intel_guc *guc, struct 
> > > > intel_context *ce);
> > > >    static void guc_signal_context_fence(struct intel_context *ce);
> > > > +static void guc_cancel_context_requests(struct intel_context *ce);
> > > >    static void scrub_guc_desc_for_outstanding_g2h(struct intel_guc *guc)
> > > >    {
> > > >         struct intel_context *ce;
> > > >         unsigned long index, flags;
> > > > -       bool pending_disable, pending_enable, deregister, destroyed;
> > > > +       bool pending_disable, pending_enable, deregister, destroyed, 
> > > > banned;
> > > >         xa_for_each(&guc->context_lookup, index, ce) {
> > > >                 /* Flush context */
> > > > @@ -525,6 +555,7 @@ static void 
> > > > scrub_guc_desc_for_outstanding_g2h(struct intel_guc *guc)
> > > >                 pending_enable = context_pending_enable(ce);
> > > >                 pending_disable = context_pending_disable(ce);
> > > >                 deregister = 
> > > > context_wait_for_deregister_to_register(ce);
> > > > +               banned = context_banned(ce);
> > > >                 init_sched_state(ce);
> > > >                 if (pending_enable || destroyed || deregister) {
> > > > @@ -542,6 +573,10 @@ static void 
> > > > scrub_guc_desc_for_outstanding_g2h(struct intel_guc *guc)
> > > >                 /* Not mutualy exclusive with above if statement. */
> > > >                 if (pending_disable) {
> > > >                         guc_signal_context_fence(ce);
> > > > +                       if (banned) {
> > > > +                               guc_cancel_context_requests(ce);
> > > > +                               
> > > > intel_engine_signal_breadcrumbs(ce->engine);
> > > > +                       }
> > > >                         intel_context_sched_disable_unpin(ce);
> > > >                         atomic_dec(&guc->outstanding_submission_g2h);
> > > >                         intel_context_put(ce);
> > > > @@ -661,6 +696,9 @@ static void guc_reset_state(struct intel_context 
> > > > *ce, u32 head, bool scrub)
> > > >    {
> > > >         struct intel_engine_cs *engine = 
> > > > __context_to_physical_engine(ce);
> > > > +       if (intel_context_is_banned(ce))
> > > > +               return;
> > > > +
> > > >         GEM_BUG_ON(!intel_context_is_pinned(ce));
> > > >         /*
> > > > @@ -731,6 +769,8 @@ static void __guc_reset_context(struct 
> > > > intel_context *ce, bool stalled)
> > > >         struct i915_request *rq;
> > > >         u32 head;
> > > > +       intel_context_get(ce);
> > > > +
> > > >         /*
> > > >          * GuC will implicitly mark the context as non-schedulable
> > > >          * when it sends the reset notification. Make sure our state
> > > > @@ -756,6 +796,7 @@ static void __guc_reset_context(struct 
> > > > intel_context *ce, bool stalled)
> > > >    out_replay:
> > > >         guc_reset_state(ce, head, stalled);
> > > >         __unwind_incomplete_requests(ce);
> > > > +       intel_context_put(ce);
> > > >    }
> > > >    void intel_guc_submission_reset(struct intel_guc *guc, bool stalled)
> > > > @@ -940,8 +981,6 @@ static int guc_bypass_tasklet_submit(struct 
> > > > intel_guc *guc,
> > > >         ret = guc_add_request(guc, rq);
> > > >         if (ret == -EBUSY)
> > > >                 guc->stalled_request = rq;
> > > > -       else
> > > > -               trace_i915_request_guc_submit(rq);
> > > >         if (unlikely(ret == -EPIPE))
> > > >                 disable_submission(guc);
> > > > @@ -1344,13 +1383,77 @@ static u16 prep_context_pending_disable(struct 
> > > > intel_context *ce)
> > > >         return ce->guc_id;
> > > >    }
> > > > +static void __guc_context_set_preemption_timeout(struct intel_guc *guc,
> > > > +                                                u16 guc_id,
> > > > +                                                u32 preemption_timeout)
> > > > +{
> > > > +       u32 action[] = {
> > > > +               INTEL_GUC_ACTION_SET_CONTEXT_PREEMPTION_TIMEOUT,
> > > > +               guc_id,
> > > > +               preemption_timeout
> > > > +       };
> > > > +
> > > > +       intel_guc_send_busy_loop(guc, action, ARRAY_SIZE(action), 0, 
> > > > true);
> > > > +}
> > > > +
> > > > +static void guc_context_ban(struct intel_context *ce, struct 
> > > > i915_request *rq)
> > > > +{
> > > > +       struct intel_guc *guc = ce_to_guc(ce);
> > > > +       struct intel_runtime_pm *runtime_pm =
> > > > +               &ce->engine->gt->i915->runtime_pm;
> > > > +       intel_wakeref_t wakeref;
> > > > +       unsigned long flags;
> > > > +
> > > > +       guc_flush_submissions(guc);
> > > > +
> > > > +       spin_lock_irqsave(&ce->guc_state.lock, flags);
> > > > +       set_context_banned(ce);
> > > > +
> > > > +       if (submission_disabled(guc) ||
> > > > +           (!context_enabled(ce) && !context_pending_disable(ce))) {
> > > > +               spin_unlock_irqrestore(&ce->guc_state.lock, flags);
> > > > +
> > > > +               guc_cancel_context_requests(ce);
> > > > +               intel_engine_signal_breadcrumbs(ce->engine);
> > > > +       } else if (!context_pending_disable(ce)) {
> > > > +               u16 guc_id;
> > > > +
> > > > +               /*
> > > > +                * We add +2 here as the schedule disable complete CTB 
> > > > handler
> > > > +                * calls intel_context_sched_disable_unpin (-2 to 
> > > > pin_count).
> > > > +                */
> > > > +               atomic_add(2, &ce->pin_count);
> > > > +
> > > > +               guc_id = prep_context_pending_disable(ce);
> > > > +               spin_unlock_irqrestore(&ce->guc_state.lock, flags);
> > > > +
> > > > +               /*
> > > > +                * In addition to disabling scheduling, set the 
> > > > preemption
> > > > +                * timeout to the minimum value (1 us) so the banned 
> > > > context
> > > > +                * gets kicked off the HW ASAP.
> > > > +                */
> > > > +               with_intel_runtime_pm(runtime_pm, wakeref) {
> > > > +                       __guc_context_set_preemption_timeout(guc, 
> > > > guc_id, 1);
> > > > +                       __guc_context_sched_disable(guc, ce, guc_id);
> > > > +               }
> > > > +       } else {
> > > > +               if (!context_guc_id_invalid(ce))
> > > > +                       with_intel_runtime_pm(runtime_pm, wakeref)
> > > > +                               
> > > > __guc_context_set_preemption_timeout(guc,
> > > > +                                                                    
> > > > ce->guc_id,
> > > > +                                                                    1);
> > > > +               spin_unlock_irqrestore(&ce->guc_state.lock, flags);
> > > > +       }
> > > > +}
> > > > +
> > > >    static void guc_context_sched_disable(struct intel_context *ce)
> > > >    {
> > > >         struct intel_guc *guc = ce_to_guc(ce);
> > > > -       struct intel_runtime_pm *runtime_pm = 
> > > > &ce->engine->gt->i915->runtime_pm;
> > > >         unsigned long flags;
> > > > -       u16 guc_id;
> > > > +       struct intel_runtime_pm *runtime_pm = 
> > > > &ce->engine->gt->i915->runtime_pm;
> > > >         intel_wakeref_t wakeref;
> > > > +       u16 guc_id;
> > > > +       bool enabled;
> > > >         if (submission_disabled(guc) || context_guc_id_invalid(ce) ||
> > > >             !lrc_desc_registered(guc, ce->guc_id)) {
> > > > @@ -1364,14 +1467,22 @@ static void guc_context_sched_disable(struct 
> > > > intel_context *ce)
> > > >         spin_lock_irqsave(&ce->guc_state.lock, flags);
> > > >         /*
> > > > -        * We have to check if the context has been pinned again as 
> > > > another pin
> > > > -        * operation is allowed to pass this function. Checking the pin 
> > > > count,
> > > > -        * within ce->guc_state.lock, synchronizes this function with
> > > > +        * We have to check if the context has been disabled by another 
> > > > thread.
> > > > +        * We also have to check if the context has been pinned again 
> > > > as another
> > > > +        * pin operation is allowed to pass this function. Checking the 
> > > > pin
> > > > +        * count, within ce->guc_state.lock, synchronizes this function 
> > > > with
> > > >          * guc_request_alloc ensuring a request doesn't slip through the
> > > >          * 'context_pending_disable' fence. Checking within the spin 
> > > > lock (can't
> > > >          * sleep) ensures another process doesn't pin this context and 
> > > > generate
> > > >          * a request before we set the 'context_pending_disable' flag 
> > > > here.
> > > >          */
> > > > +       enabled = context_enabled(ce);
> > > > +       if (unlikely(!enabled || submission_disabled(guc))) {
> > > > +               if (enabled)
> > > > +                       clr_context_enabled(ce);
> > > > +               spin_unlock_irqrestore(&ce->guc_state.lock, flags);
> > > > +               goto unpin;
> > > > +       }
> > > >         if (unlikely(atomic_add_unless(&ce->pin_count, -2, 2))) {
> > > >                 spin_unlock_irqrestore(&ce->guc_state.lock, flags);
> > > >                 return;
> > > > @@ -1529,6 +1640,8 @@ static const struct intel_context_ops 
> > > > guc_context_ops = {
> > > >         .unpin = guc_context_unpin,
> > > >         .post_unpin = guc_context_post_unpin,
> > > > +       .ban = guc_context_ban,
> > > > +
> > > >         .enter = intel_context_enter_engine,
> > > >         .exit = intel_context_exit_engine,
> > > > @@ -1722,6 +1835,8 @@ static const struct intel_context_ops 
> > > > virtual_guc_context_ops = {
> > > >         .unpin = guc_context_unpin,
> > > >         .post_unpin = guc_context_post_unpin,
> > > > +       .ban = guc_context_ban,
> > > > +
> > > >         .enter = guc_virtual_context_enter,
> > > >         .exit = guc_virtual_context_exit,
> > > > @@ -2164,6 +2279,8 @@ int intel_guc_sched_done_process_msg(struct 
> > > > intel_guc *guc,
> > > >         if (context_pending_enable(ce)) {
> > > >                 clr_context_pending_enable(ce);
> > > >         } else if (context_pending_disable(ce)) {
> > > > +               bool banned;
> > > > +
> > > >                 /*
> > > >                  * Unpin must be done before __guc_signal_context_fence,
> > > >                  * otherwise a race exists between the requests getting
> > > > @@ -2174,9 +2291,16 @@ int intel_guc_sched_done_process_msg(struct 
> > > > intel_guc *guc,
> > > >                 intel_context_sched_disable_unpin(ce);
> > > >                 spin_lock_irqsave(&ce->guc_state.lock, flags);
> > > > +               banned = context_banned(ce);
> > > > +               clr_context_banned(ce);
> > > >                 clr_context_pending_disable(ce);
> > > >                 __guc_signal_context_fence(ce);
> > > >                 spin_unlock_irqrestore(&ce->guc_state.lock, flags);
> > > > +
> > > > +               if (banned) {
> > > > +                       guc_cancel_context_requests(ce);
> > > > +                       intel_engine_signal_breadcrumbs(ce->engine);
> > > > +               }
> > > >         }
> > > >         decr_outstanding_submission_g2h(guc);
> > > > @@ -2211,8 +2335,11 @@ static void guc_handle_context_reset(struct 
> > > > intel_guc *guc,
> > > >                                      struct intel_context *ce)
> > > >    {
> > > >         trace_intel_context_reset(ce);
> > > > -       capture_error_state(guc, ce);
> > > > -       guc_context_replay(ce);
> > > > +
> > > > +       if (likely(!intel_context_is_banned(ce))) {
> > > > +               capture_error_state(guc, ce);
> > > > +               guc_context_replay(ce);
> > > > +       }
> > > >    }
> > > >    int intel_guc_context_reset_process_msg(struct intel_guc *guc,
> > > > diff --git a/drivers/gpu/drm/i915/i915_trace.h 
> > > > b/drivers/gpu/drm/i915/i915_trace.h
> > > > index 3f43d904f043..9613a7c19661 100644
> > > > --- a/drivers/gpu/drm/i915/i915_trace.h
> > > > +++ b/drivers/gpu/drm/i915/i915_trace.h
> > > > @@ -925,6 +925,11 @@ DEFINE_EVENT(intel_context, intel_context_reset,
> > > >              TP_ARGS(ce)
> > > >    );
> > > > +DEFINE_EVENT(intel_context, intel_context_ban,
> > > > +            TP_PROTO(struct intel_context *ce),
> > > > +            TP_ARGS(ce)
> > > > +);
> > > > +
> > > >    DEFINE_EVENT(intel_context, intel_context_register,
> > > >              TP_PROTO(struct intel_context *ce),
> > > >              TP_ARGS(ce)
> > > > @@ -1017,6 +1022,11 @@ trace_intel_context_reset(struct intel_context 
> > > > *ce)
> > > >    {
> > > >    }
> > > > +static inline void
> > > > +trace_intel_context_ban(struct intel_context *ce)
> > > > +{
> > > > +}
> > > > +
> > > >    static inline void
> > > >    trace_intel_context_register(struct intel_context *ce)
> > > >    {
> > > > 

Reply via email to