On Wed, Jan 13, 2016 at 11:44:39AM +0100, Artur Harasimiuk wrote:
> Starting from Gen9 we can use IA-coherent caches. Coherency may not be
> required in certain cases and can easily be disabled by setting the
> HDC_FORCE_NON_COHERENT bit in the HDC_CHICKEN0 register. This register
> is part of the HW context, but it is privileged and cannot be
> programmed from a non-privileged batch buffer.
> 
> The new parameter overrides the default programming and allows the UMD
> to decide whether IA coherency is needed for the submitted batch
> buffer. When the flag is set, the KMD emits commands to disable
> coherency before batch buffer execution starts, and restores the
> previous state once execution has finished.

What's the use case for a per-batch flag vs a per-context flag?
 
> When WaForceEnableNonCoherent is programmed, it does not make sense to
> allow coherency, because that can lead to GPU hangs. In that situation
> the flag is ignored.
> 
> Signed-off-by: Artur Harasimiuk <artur.harasim...@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_dma.c            |  3 +++
>  drivers/gpu/drm/i915/i915_drv.h            |  4 ++++
>  drivers/gpu/drm/i915/i915_gem_execbuffer.c |  8 ++++++++
>  drivers/gpu/drm/i915/intel_lrc.c           | 28 ++++++++++++++++++++++++++++
>  drivers/gpu/drm/i915/intel_ringbuffer.c    |  7 +++++++
>  include/uapi/drm/i915_drm.h                |  8 +++++++-
>  6 files changed, 57 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
> index 44a896c..79ecf20 100644
> --- a/drivers/gpu/drm/i915/i915_dma.c
> +++ b/drivers/gpu/drm/i915/i915_dma.c
> @@ -172,6 +172,9 @@ static int i915_getparam(struct drm_device *dev, void *data,
>       case I915_PARAM_HAS_EXEC_SOFTPIN:
>               value = 1;
>               break;
> +     case I915_PARAM_HAS_EXEC_FORCE_NON_COHERENT:
> +             value = INTEL_INFO(dev)->gen >= 9;

This should actually report the w/a status, or else you need to provide
some other means for userspace to detect if it can successfully force
non-coherency.
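i.e. something along the lines of (sketch only; workarounds.force_non_coherent
is a hypothetical synopsis of the w/a status kept by the driver, see the
comment at the end of this mail):

	case I915_PARAM_HAS_EXEC_FORCE_NON_COHERENT:
		/* Let userspace see whether the flag can actually take
		 * effect, i.e. gen9+ and not already forced by
		 * WaForceEnableNonCoherent.
		 */
		value = INTEL_INFO(dev)->gen >= 9 &&
			!dev_priv->workarounds.force_non_coherent;
		break;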

> +             break;
>       default:
>               DRM_DEBUG("Unknown parameter %d\n", param->param);
>               return -EINVAL;
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 104bd18..71d739c 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -886,6 +886,10 @@ struct intel_context {
>       } engine[I915_NUM_RINGS];
>  
>       struct list_head link;
> +
> +     struct {
> +             unsigned int WaForceEnableNonCoherent:1;
> +     } wa;
>  };
>  
>  enum fb_op_origin {
> diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> index d469c47..2997a58 100644
> --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> @@ -1400,6 +1400,10 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
>       if (!i915_gem_check_execbuffer(args))
>               return -EINVAL;
>  
> +     if ((args->flags & I915_EXEC_FORCE_NON_COHERENT) &&
> +             INTEL_INFO(dev)->gen < 9)
> +             return -EINVAL;

It would seem easier to simply ignore this request in all situations
where it doesn't apply, rather than the current select few.
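i.e. rather than -EINVAL, something like (sketch):

	/* Silently drop the request whenever we cannot honour it */
	if (INTEL_INFO(dev)->gen < 9)
		args->flags &= ~I915_EXEC_FORCE_NON_COHERENT;

with the WaForceEnableNonCoherent case below then becoming just another
instance of the same pattern.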

>       ret = validate_exec_list(dev, exec, args->buffer_count);
>       if (ret)
>               return ret;
> @@ -1494,6 +1498,10 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
>  
>       i915_gem_context_reference(ctx);
>  
> +     /* Clear this flag when WA is programmed */
> +     if (ctx->wa.WaForceEnableNonCoherent)
> +             args->flags &= ~I915_EXEC_FORCE_NON_COHERENT;
> +
>       if (ctx->ppgtt)
>               vm = &ctx->ppgtt->base;
>       else
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index ab344e0..4482a6a 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -879,6 +879,24 @@ int intel_logical_ring_reserve_space(struct drm_i915_gem_request *request)
>       return intel_logical_ring_begin(request, 0);
>  }
>  
> +static inline void
> +intel_lr_emit_force_non_coherent(struct i915_execbuffer_params *params,
> +             struct drm_i915_gem_execbuffer2 *args, bool force)
> +{
> +     if (args->flags & I915_EXEC_FORCE_NON_COHERENT) {
> +             struct intel_ringbuffer *ringbuf =
> +                             params->ctx->engine[params->ring->id].ringbuf;

This should be using request->ringbuf.

> +

Missing ring_begin.
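Roughly, folding in the request->ringbuf comment above (untested sketch;
the helper then needs to return the error, with the flag check left to the
caller):

static int
intel_lr_emit_force_non_coherent(struct drm_i915_gem_request *req, bool force)
{
	struct intel_ringbuffer *ringbuf = req->ringbuf;
	int ret;

	/* 4 dwords: MI_NOOP + LRI(1) + register + value */
	ret = intel_logical_ring_begin(req, 4);
	if (ret)
		return ret;

	intel_logical_ring_emit(ringbuf, MI_NOOP);
	intel_logical_ring_emit(ringbuf, MI_LOAD_REGISTER_IMM(1));
	intel_logical_ring_emit_reg(ringbuf, HDC_CHICKEN0);
	intel_logical_ring_emit(ringbuf, force ?
			_MASKED_BIT_ENABLE(HDC_FORCE_NON_COHERENT) :
			_MASKED_BIT_DISABLE(HDC_FORCE_NON_COHERENT));
	intel_logical_ring_advance(ringbuf);

	return 0;
}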

> +             intel_logical_ring_emit(ringbuf, MI_NOOP);
> +             intel_logical_ring_emit(ringbuf, MI_LOAD_REGISTER_IMM(1));
> +             intel_logical_ring_emit(ringbuf, HDC_CHICKEN0.reg);

intel_logical_ring_emit_reg(ringbuf, HDC_CHICKEN0);

> +             intel_logical_ring_emit(ringbuf, force ?
> +                             _MASKED_BIT_ENABLE(HDC_FORCE_NON_COHERENT) :
> +                             _MASKED_BIT_DISABLE(HDC_FORCE_NON_COHERENT));
> +             intel_logical_ring_advance(ringbuf);
> +     }
> +}
> +
>  /**
>  * execlists_submission() - submit a batchbuffer for execution, Execlists style
>   * @dev: DRM device.
> @@ -959,6 +977,8 @@ int intel_execlists_submission(struct i915_execbuffer_params *params,
>               dev_priv->relative_constants_mode = instp_mode;
>       }
>  
> +     intel_lr_emit_force_non_coherent(params, args, true);

This is after cache invalidation. Do you need to send another cache
invalidate?

> +
>       exec_start = params->batch_obj_vm_offset +
>                    args->batch_start_offset;
>  
> @@ -966,6 +986,8 @@ int intel_execlists_submission(struct i915_execbuffer_params *params,
>       if (ret)
>               return ret;
>  
> +     intel_lr_emit_force_non_coherent(params, args, false);

You anticipate a context switching between modes on every batch? This
goes back to whether it is a context flag vs a batch flag, and whether
we should cache the register value.
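e.g. with a per-context cache of the last value written (sketch; the
hdc_force_non_coherent member would be a new field on the context):

	/* Skip the redundant LRI if the context is already in that state */
	if (ctx->hdc_force_non_coherent == force)
		return 0;

	/* ... emit the MI_LOAD_REGISTER_IMM as before ... */

	ctx->hdc_force_non_coherent = force;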

> +
>       trace_i915_gem_ring_dispatch(params->request, params->dispatch_flags);
>  
>       i915_gem_execbuffer_move_to_active(vmas, params->request);
> @@ -1112,6 +1134,9 @@ static int intel_logical_ring_workarounds_emit(struct drm_i915_gem_request *req)
>       struct drm_device *dev = ring->dev;
>       struct drm_i915_private *dev_priv = dev->dev_private;
>       struct i915_workarounds *w = &dev_priv->workarounds;
> +     struct intel_context *ctx = req->ctx;
> +
> +     ctx->wa.WaForceEnableNonCoherent = 0;
>  
>       if (w->count == 0)
>               return 0;
> @@ -1129,6 +1154,9 @@ static int intel_logical_ring_workarounds_emit(struct drm_i915_gem_request *req)
>       for (i = 0; i < w->count; i++) {
>               intel_logical_ring_emit_reg(ringbuf, w->reg[i].addr);
>               intel_logical_ring_emit(ringbuf, w->reg[i].value);
> +             ctx->wa.WaForceEnableNonCoherent |=
> +                             (w->reg[i].addr.reg == HDC_CHICKEN0.reg) &&
> +                             (w->reg[i].value & HDC_FORCE_NON_COHERENT);

I don't like this second-guessing of which w/a are being applied. We
could just as easily list the w/a explicitly on the engine (i.e. a
synopsis of the wa reg list, plus those set up globally) and reference
that instead of copying the flag to every context.
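Something like this, for example (sketch; struct layout from memory, the
new bit is what I mean by a synopsis):

	struct i915_workarounds {
		struct i915_wa_reg reg[I915_MAX_WA_REGS];
		u32 count;

		/* Synopsis of the w/a that the uapi needs to know about */
		u32 WaForceEnableNonCoherent:1;
	};

	/* in whichever *_init_workarounds() applies the w/a */
	WA_SET_BIT_MASKED(HDC_CHICKEN0, HDC_FORCE_NON_COHERENT);
	dev_priv->workarounds.WaForceEnableNonCoherent = 1;

and then the execbuffer/getparam paths check
dev_priv->workarounds.WaForceEnableNonCoherent instead of a per-context
copy.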
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre