Chris Wilson <ch...@chris-wilson.co.uk> writes:

> Some clients, such as mesa, may only emit minimal incremental batches
> that rely on the logical context state from previous batches. They know
> that recovery is impossible after a hang as their required GPU state is
> lost, and that each in flight and subsequent batch will hang (resetting
> the context image back to default, perpetuating the problem).
>
> To avoid getting into the state in the first place, we can allow clients
> to opt out of automatic recovery and elect to ban any guilty context
> following a hang. This prevents the continual stream of hangs and allows
> the client to recreate their context and rebuild the state from scratch.
>
> v2: Prefer calling it recoverable rather than unrecoverable.
>

OK, you went forward with this. I am fine with either flavour,

Reviewed-by: Mika Kuoppala <mika.kuopp...@linux.intel.com>

> Signed-off-by: Chris Wilson <ch...@chris-wilson.co.uk>
> Cc: Kenneth Graunke <kenn...@whitecape.org>
> Cc: Mika Kuoppala <mika.kuopp...@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_gem.c         |  3 ++-
>  drivers/gpu/drm/i915/i915_gem_context.c | 13 +++++++++++++
>  drivers/gpu/drm/i915/i915_gem_context.h | 16 ++++++++++++++++
>  include/uapi/drm/i915_drm.h             | 20 ++++++++++++++++++++
>  4 files changed, 51 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 7d45e71100bc..9b02d70fbabf 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -3018,7 +3018,8 @@ static void i915_gem_context_mark_guilty(struct 
> i915_gem_context *ctx)
>  
>       bannable = i915_gem_context_is_bannable(ctx);
>       score = atomic_add_return(CONTEXT_SCORE_GUILTY, &ctx->ban_score);
> -     banned = score >= CONTEXT_SCORE_BAN_THRESHOLD;
> +     banned = (!i915_gem_context_is_recoverable(ctx) ||
> +               score >= CONTEXT_SCORE_BAN_THRESHOLD);
>  
>       /* Cool contexts don't accumulate client ban score */
>       if (!bannable)
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c 
> b/drivers/gpu/drm/i915/i915_gem_context.c
> index 8cbe58070561..b31da0375c8c 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> @@ -338,6 +338,7 @@ __create_hw_context(struct drm_i915_private *dev_priv,
>       list_add_tail(&ctx->link, &dev_priv->contexts.list);
>       ctx->i915 = dev_priv;
>       ctx->sched.priority = I915_USER_PRIORITY(I915_PRIORITY_NORMAL);
> +     ctx->user_flags = BIT(UCONTEXT_RECOVERABLE);
>  
>       for (n = 0; n < ARRAY_SIZE(ctx->__engine); n++) {
>               struct intel_context *ce = &ctx->__engine[n];
> @@ -878,6 +879,9 @@ int i915_gem_context_getparam_ioctl(struct drm_device 
> *dev, void *data,
>       case I915_CONTEXT_PARAM_BANNABLE:
>               args->value = i915_gem_context_is_bannable(ctx);
>               break;
> +     case I915_CONTEXT_PARAM_RECOVERABLE:
> +             args->value = i915_gem_context_is_recoverable(ctx);
> +             break;
>       case I915_CONTEXT_PARAM_PRIORITY:
>               args->value = ctx->sched.priority >> I915_USER_PRIORITY_SHIFT;
>               break;
> @@ -933,6 +937,15 @@ int i915_gem_context_setparam_ioctl(struct drm_device 
> *dev, void *data,
>                       i915_gem_context_clear_bannable(ctx);
>               break;
>  
> +     case I915_CONTEXT_PARAM_RECOVERABLE:
> +             if (args->size)
> +                     ret = -EINVAL;
> +             else if (args->value)
> +                     i915_gem_context_set_recoverable(ctx);
> +             else
> +                     i915_gem_context_clear_recoverable(ctx);
> +             break;
> +
>       case I915_CONTEXT_PARAM_PRIORITY:
>               {
>                       s64 priority = args->value;
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.h 
> b/drivers/gpu/drm/i915/i915_gem_context.h
> index 08165f6a0a84..397d50b826d7 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.h
> +++ b/drivers/gpu/drm/i915/i915_gem_context.h
> @@ -123,6 +123,7 @@ struct i915_gem_context {
>  #define UCONTEXT_NO_ZEROMAP          0
>  #define UCONTEXT_NO_ERROR_CAPTURE    1
>  #define UCONTEXT_BANNABLE            2
> +#define UCONTEXT_RECOVERABLE         3
>  
>       /**
>        * @flags: small set of booleans
> @@ -247,6 +248,21 @@ static inline void 
> i915_gem_context_clear_bannable(struct i915_gem_context *ctx)
>       clear_bit(UCONTEXT_BANNABLE, &ctx->user_flags);
>  }
>  
> +static inline bool i915_gem_context_is_recoverable(const struct 
> i915_gem_context *ctx)
> +{
> +     return test_bit(UCONTEXT_RECOVERABLE, &ctx->user_flags);
> +}
> +
> +static inline void i915_gem_context_set_recoverable(struct i915_gem_context 
> *ctx)
> +{
> +     set_bit(UCONTEXT_RECOVERABLE, &ctx->user_flags);
> +}
> +
> +static inline void i915_gem_context_clear_recoverable(struct 
> i915_gem_context *ctx)
> +{
> +     clear_bit(UCONTEXT_RECOVERABLE, &ctx->user_flags);
> +}
> +
>  static inline bool i915_gem_context_is_banned(const struct i915_gem_context 
> *ctx)
>  {
>       return test_bit(CONTEXT_BANNED, &ctx->flags);
> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
> index 298b2e197744..eeb258a12b95 100644
> --- a/include/uapi/drm/i915_drm.h
> +++ b/include/uapi/drm/i915_drm.h
> @@ -1486,6 +1486,26 @@ struct drm_i915_gem_context_param {
>  #define   I915_CONTEXT_MAX_USER_PRIORITY     1023 /* inclusive */
>  #define   I915_CONTEXT_DEFAULT_PRIORITY              0
>  #define   I915_CONTEXT_MIN_USER_PRIORITY     -1023 /* inclusive */
> +
> +/*
> + * Not all clients may want to attempt automatic recovery of a context after
> + * a hang (for example, some clients may only submit very small incremental
> + * batches relying on known logical state of previous batches which will never
> + * recover correctly and each attempt will hang), and so would prefer that
> + * the context is forever banned instead.
> + *
> + * If set to false (0), after a reset, subsequent (and in flight) rendering
> + * from this context is discarded, and the client will need to create a new
> + * context to use instead.
> + *
> + * If set to true (1), the kernel will automatically attempt to recover the
> + * context by skipping the hanging batch and executing the next batch starting
> + * from the default context state (discarding the incomplete logical context
> + * state lost due to the reset).
> + *
> + * On creation, all new contexts are marked as recoverable.
> + */
> +#define I915_CONTEXT_PARAM_RECOVERABLE       0x7
>       __u64 value;
>  };
>  
> -- 
> 2.19.0
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

Reply via email to