On Fri, Aug 22, 2014 at 08:39:11PM +0100, Arun Siluvery wrote:
> For BDW workarounds are currently initialized in init_clock_gating() but
> they are lost during reset, suspend/resume etc; this patch moves the WAs
> that are part of register state context to render ring init fn otherwise
> default context ends up with incorrect values as they don't get initialized
> until init_clock_gating fn.
> 
> v2: Add workarounds to golden render state
> This method has its own issues, first of all this is different for
> each gen and it is generated using a tool so adding new workaround
> and mainitaining them across gens is not a straightforward process.
> 
> v3: Use LRIs to emit these workarounds (Ville)
> Instead of modifying the golden render state the same LRIs are
> emitted from within the driver.
> 
> For: VIZ-4092
> Signed-off-by: Arun Siluvery <arun.siluv...@linux.intel.com>
> ---
>  drivers/gpu/drm/i915/i915_gem_context.c |  6 +++
>  drivers/gpu/drm/i915/intel_pm.c         | 48 ----------------------
>  drivers/gpu/drm/i915/intel_ringbuffer.c | 70 
> +++++++++++++++++++++++++++++++++
>  drivers/gpu/drm/i915/intel_ringbuffer.h |  7 ++++
>  4 files changed, 83 insertions(+), 48 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c 
> b/drivers/gpu/drm/i915/i915_gem_context.c
> index 9683e62..2debce4 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> @@ -631,20 +631,26 @@ static int do_switch(struct intel_engine_cs *ring,
>       }
>  
>       uninitialized = !to->legacy_hw_ctx.initialized && from == NULL;
>       to->legacy_hw_ctx.initialized = true;
>  
>  done:
>       i915_gem_context_reference(to);
>       ring->last_context = to;
>  
>       if (uninitialized) {
> +             if (IS_BROADWELL(ring->dev)) {
> +                     ret = bdw_init_workarounds(ring);
> +                     if (ret)
> +                             DRM_ERROR("init workarounds: %d\n", ret);
> +             }
> +
>               ret = i915_gem_render_state_init(ring);
>               if (ret)
>                       DRM_ERROR("init render state: %d\n", ret);
>       }
>  
>       return 0;
>  
>  unpin_out:
>       if (ring->id == RCS)
>               i915_gem_object_ggtt_unpin(to->legacy_hw_ctx.rcs_state);
> diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c
> index c8f744c..668acd9 100644
> --- a/drivers/gpu/drm/i915/intel_pm.c
> +++ b/drivers/gpu/drm/i915/intel_pm.c
> @@ -5507,101 +5507,53 @@ static void gen8_init_clock_gating(struct drm_device 
> *dev)
>       struct drm_i915_private *dev_priv = dev->dev_private;
>       enum pipe pipe;
>  
>       I915_WRITE(WM3_LP_ILK, 0);
>       I915_WRITE(WM2_LP_ILK, 0);
>       I915_WRITE(WM1_LP_ILK, 0);
>  
>       /* FIXME(BDW): Check all the w/a, some might only apply to
>        * pre-production hw. */
>  
> -     /* WaDisablePartialInstShootdown:bdw */
> -     I915_WRITE(GEN8_ROW_CHICKEN,
> -                _MASKED_BIT_ENABLE(PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE));
> -
> -     /* WaDisableThreadStallDopClockGating:bdw */
> -     /* FIXME: Unclear whether we really need this on production bdw. */
> -     I915_WRITE(GEN8_ROW_CHICKEN,
> -                _MASKED_BIT_ENABLE(STALL_DOP_GATING_DISABLE));
>  
> -     /*
> -      * This GEN8_CENTROID_PIXEL_OPT_DIS W/A is only needed for
> -      * pre-production hardware
> -      */
> -     I915_WRITE(HALF_SLICE_CHICKEN3,
> -                _MASKED_BIT_ENABLE(GEN8_CENTROID_PIXEL_OPT_DIS));
> -     I915_WRITE(HALF_SLICE_CHICKEN3,
> -                _MASKED_BIT_ENABLE(GEN8_SAMPLER_POWER_BYPASS_DIS));
>       I915_WRITE(GAMTARBMODE, _MASKED_BIT_ENABLE(ARB_MODE_BWGTLB_DISABLE));
>  
>       I915_WRITE(_3D_CHICKEN3,
>                  
> _MASKED_BIT_ENABLE(_3D_CHICKEN_SDE_LIMIT_FIFO_POLY_DEPTH(2)));
>  
> -     I915_WRITE(COMMON_SLICE_CHICKEN2,
> -                _MASKED_BIT_ENABLE(GEN8_CSC2_SBE_VUE_CACHE_CONSERVATIVE));
> -
> -     I915_WRITE(GEN7_HALF_SLICE_CHICKEN1,
> -                _MASKED_BIT_ENABLE(GEN7_SINGLE_SUBSCAN_DISPATCH_ENABLE));
> -
> -     /* WaDisableDopClockGating:bdw May not be needed for production */
> -     I915_WRITE(GEN7_ROW_CHICKEN2,
> -                _MASKED_BIT_ENABLE(DOP_CLOCK_GATING_DISABLE));
>  
>       /* WaSwitchSolVfFArbitrationPriority:bdw */
>       I915_WRITE(GAM_ECOCHK, I915_READ(GAM_ECOCHK) | HSW_ECOCHK_ARB_PRIO_SOL);
>  
>       /* WaPsrDPAMaskVBlankInSRD:bdw */
>       I915_WRITE(CHICKEN_PAR1_1,
>                  I915_READ(CHICKEN_PAR1_1) | DPA_MASK_VBLANK_SRD);
>  
>       /* WaPsrDPRSUnmaskVBlankInSRD:bdw */
>       for_each_pipe(pipe) {
>               I915_WRITE(CHICKEN_PIPESL_1(pipe),
>                          I915_READ(CHICKEN_PIPESL_1(pipe)) |
>                          BDW_DPRS_MASK_VBLANK_SRD);
>       }
>  
> -     /* Use Force Non-Coherent whenever executing a 3D context. This is a
> -      * workaround for for a possible hang in the unlikely event a TLB
> -      * invalidation occurs during a PSD flush.
> -      */
> -     I915_WRITE(HDC_CHICKEN0,
> -                I915_READ(HDC_CHICKEN0) |
> -                _MASKED_BIT_ENABLE(HDC_FORCE_NON_COHERENT));
> -
>       /* WaVSRefCountFullforceMissDisable:bdw */
>       /* WaDSRefCountFullforceMissDisable:bdw */
>       I915_WRITE(GEN7_FF_THREAD_MODE,
>                  I915_READ(GEN7_FF_THREAD_MODE) &
>                  ~(GEN8_FF_DS_REF_CNT_FFME | GEN7_FF_VS_REF_CNT_FFME));
>  
> -     /*
> -      * BSpec recommends 8x4 when MSAA is used,
> -      * however in practice 16x4 seems fastest.
> -      *
> -      * Note that PS/WM thread counts depend on the WIZ hashing
> -      * disable bit, which we don't touch here, but it's good
> -      * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
> -      */
> -     I915_WRITE(GEN7_GT_MODE,
> -                GEN6_WIZ_HASHING_MASK | GEN6_WIZ_HASHING_16x4);
> -
>       I915_WRITE(GEN6_RC_SLEEP_PSMI_CONTROL,
>                  _MASKED_BIT_ENABLE(GEN8_RC_SEMA_IDLE_MSG_DISABLE));
>  
>       /* WaDisableSDEUnitClockGating:bdw */
>       I915_WRITE(GEN8_UCGCTL6, I915_READ(GEN8_UCGCTL6) |
>                  GEN8_SDEUNIT_CLOCK_GATE_DISABLE);
> -
> -     /* Wa4x4STCOptimizationDisable:bdw */
> -     I915_WRITE(CACHE_MODE_1,
> -                _MASKED_BIT_ENABLE(GEN8_4x4_STC_OPTIMIZATION_DISABLE));
>  }
>  
>  static void haswell_init_clock_gating(struct drm_device *dev)
>  {
>       struct drm_i915_private *dev_priv = dev->dev_private;
>  
>       ilk_init_lp_watermarks(dev);
>  
>       /* L3 caching of data atomics doesn't work -- disable it. */
>       I915_WRITE(HSW_SCRATCH1, HSW_SCRATCH1_L3_DATA_ATOMICS_DISABLE);
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c 
> b/drivers/gpu/drm/i915/intel_ringbuffer.c
> index 13543f8..9e24073 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> @@ -643,20 +643,90 @@ intel_init_pipe_control(struct intel_engine_cs *ring)
>       return 0;
>  
>  err_unpin:
>       i915_gem_object_ggtt_unpin(ring->scratch.obj);
>  err_unref:
>       drm_gem_object_unreference(&ring->scratch.obj->base);
>  err:
>       return ret;
>  }
>  
> +int bdw_init_workarounds(struct intel_engine_cs *ring)
> +{
> +     int ret;
> +
> +     /*
> +      * workarounds applied in this fn are part of register state context,
> +      * they need to be re-initialized followed by gpu reset, suspend/resume,
> +      * module reload.
> +      */
> +
> +     /*
> +      * update the number of dwords required based on the
> +      * actual number of workarounds applied
> +      */
> +     ret = intel_ring_begin(ring, 24);
> +     if (ret)
> +             return ret;
> +
> +     /* WaDisablePartialInstShootdown:bdw */
> +     /* WaDisableThreadStallDopClockGating:bdw */
> +     /* FIXME: Unclear whether we really need this on production bdw. */
> +     INTEL_RING_EMIT_WA(ring, GEN8_ROW_CHICKEN,
> +                        
> _MASKED_BIT_ENABLE(PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE
> +                                          | STALL_DOP_GATING_DISABLE));
> +
> +     /* WaDisableDopClockGating:bdw May not be needed for production */
> +     INTEL_RING_EMIT_WA(ring, GEN7_ROW_CHICKEN2,
> +                        _MASKED_BIT_ENABLE(DOP_CLOCK_GATING_DISABLE));
> +
> +     /*
> +      * This GEN8_CENTROID_PIXEL_OPT_DIS W/A is only needed for
> +      * pre-production hardware
> +      */
> +     INTEL_RING_EMIT_WA(ring, HALF_SLICE_CHICKEN3,
> +                        _MASKED_BIT_ENABLE(GEN8_CENTROID_PIXEL_OPT_DIS
> +                                           | GEN8_SAMPLER_POWER_BYPASS_DIS));
> +
> +     INTEL_RING_EMIT_WA(ring, GEN7_HALF_SLICE_CHICKEN1,
> +                        
> _MASKED_BIT_ENABLE(GEN7_SINGLE_SUBSCAN_DISPATCH_ENABLE));
> +
> +     INTEL_RING_EMIT_WA(ring, COMMON_SLICE_CHICKEN2,
> +                        
> _MASKED_BIT_ENABLE(GEN8_CSC2_SBE_VUE_CACHE_CONSERVATIVE));
> +
> +     /* Use Force Non-Coherent whenever executing a 3D context. This is a
> +      * workaround for for a possible hang in the unlikely event a TLB
> +      * invalidation occurs during a PSD flush.
> +      */
> +     INTEL_RING_EMIT_WA(ring, HDC_CHICKEN0,
> +                        _MASKED_BIT_ENABLE(HDC_FORCE_NON_COHERENT));
> +
> +     /* Wa4x4STCOptimizationDisable:bdw */
> +     INTEL_RING_EMIT_WA(ring, CACHE_MODE_1,
> +                        
> _MASKED_BIT_ENABLE(GEN8_4x4_STC_OPTIMIZATION_DISABLE));
> +
> +     /*
> +      * BSpec recommends 8x4 when MSAA is used,
> +      * however in practice 16x4 seems fastest.
> +      *
> +      * Note that PS/WM thread counts depend on the WIZ hashing
> +      * disable bit, which we don't touch here, but it's good
> +      * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
> +      */
> +     INTEL_RING_EMIT_WA(ring, GEN7_GT_MODE,
> +                        GEN6_WIZ_HASHING_MASK | GEN6_WIZ_HASHING_16x4);
> +
> +     intel_ring_advance(ring);
> +
> +     return 0;
> +}
> +
>  static int init_render_ring(struct intel_engine_cs *ring)
>  {
>       struct drm_device *dev = ring->dev;
>       struct drm_i915_private *dev_priv = dev->dev_private;
>       int ret = init_ring_common(ring);
>       if (ret)
>               return ret;
>  
>       /* WaTimedSingleVertexDispatch:cl,bw,ctg,elk,ilk,snb */
>       if (INTEL_INFO(dev)->gen >= 4 && INTEL_INFO(dev)->gen < 7)
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h 
> b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index 9cbf7b0..77fe667 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -66,20 +66,26 @@ struct  intel_hw_status_page {
>               break; \
>       } \
>       ring->semaphore.signal_ggtt[RCS] = GEN8_SIGNAL_OFFSET(ring, RCS); \
>       ring->semaphore.signal_ggtt[VCS] = GEN8_SIGNAL_OFFSET(ring, VCS); \
>       ring->semaphore.signal_ggtt[BCS] = GEN8_SIGNAL_OFFSET(ring, BCS); \
>       ring->semaphore.signal_ggtt[VECS] = GEN8_SIGNAL_OFFSET(ring, VECS); \
>       ring->semaphore.signal_ggtt[VCS2] = GEN8_SIGNAL_OFFSET(ring, VCS2); \
>       ring->semaphore.signal_ggtt[ring->id] = MI_SEMAPHORE_SYNC_INVALID; \
>       } while(0)
>  
> +#define INTEL_RING_EMIT_WA(_ring, _wa_reg, _wa_val) ({                       
> \
> +                     intel_ring_emit(_ring, MI_LOAD_REGISTER_IMM(1)); \
> +                     intel_ring_emit(_ring, _wa_reg);                \
> +                     intel_ring_emit(_ring, _wa_val);                \
> +             })

do {} while (0) would suffice since the macro doesn't return anything.
Not sure why it's a macro actually. A static function in intel_ringbuffer.c
would do just as well. If you want to keep it a macro, the argument
evaluations should have () around them.

I was also going to say that you could just call it ring_emit_lri() or
something, but I guess you plan to add the debugfs w/a stuff there which
makes the current name more appropriate. I'll let others bikeshed the
debugfs stuff since I have no real opinion on it.

Apart from that the patch seems good to me.

> +
>  enum intel_ring_hangcheck_action {
>       HANGCHECK_IDLE = 0,
>       HANGCHECK_WAIT,
>       HANGCHECK_ACTIVE,
>       HANGCHECK_ACTIVE_LOOP,
>       HANGCHECK_KICK,
>       HANGCHECK_HUNG,
>  };
>  
>  #define HANGCHECK_SCORE_RING_HUNG 31
> @@ -411,20 +417,21 @@ int intel_ring_flush_all_caches(struct intel_engine_cs 
> *ring);
>  int intel_ring_invalidate_all_caches(struct intel_engine_cs *ring);
>  
>  void intel_fini_pipe_control(struct intel_engine_cs *ring);
>  int intel_init_pipe_control(struct intel_engine_cs *ring);
>  
>  int intel_init_render_ring_buffer(struct drm_device *dev);
>  int intel_init_bsd_ring_buffer(struct drm_device *dev);
>  int intel_init_bsd2_ring_buffer(struct drm_device *dev);
>  int intel_init_blt_ring_buffer(struct drm_device *dev);
>  int intel_init_vebox_ring_buffer(struct drm_device *dev);
> +int bdw_init_workarounds(struct intel_engine_cs *ring);
>  
>  u64 intel_ring_get_active_head(struct intel_engine_cs *ring);
>  void intel_ring_setup_status_page(struct intel_engine_cs *ring);
>  
>  static inline u32 intel_ring_get_tail(struct intel_ringbuffer *ringbuf)
>  {
>       return ringbuf->tail;
>  }
>  
>  static inline u32 intel_ring_get_seqno(struct intel_engine_cs *ring)
> -- 
> 2.0.4
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx

-- 
Ville Syrjälä
Intel OTC
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

Reply via email to