On Thu, 09 Jan 2014, akash.g...@intel.com wrote:
> From: akashgoe <akash.g...@intel.com>
>
> The following changes leads to stable behavior, especially
> when playing 3D Apps, benchmarks.
>
> Added 4 new rendering specific Workarounds
> 1. WaTlbInvalidateStoreDataBefore :-
>      Before pipecontrol with TLB invalidate set,
>      need 2 store data commands
> 2. WaReadAfterWriteHazard :-
>      Send 8 store dword commands after flush
>      for read after write hazard
>      (HSD Gen6 bug_de 3047871)
> 3. WaVSThreadDispatchOverride
>      Performance optimization - Hw will
>      decide which half slice the thread
>      will dispatch, May not be really needed
>      for VLV, as its single slice
> 4. WaDisable_RenderCache_OperationalFlush
>      Operational flush cannot be enabled on
>      BWG A0 [Errata BWT006]
>
> Removed 3 workarounds as not needed for VLV+(B0 onwards)
> 1. WaDisableRHWOOptimizationForRenderHang
> 2. WaDisableL3CacheAging
> 3. WaDisableDopClockGating
>
> Modified the implementation of 1 workaround
> 1. WaDisableL3Bank2xClockGate
>     Disabling L3 clock gating- MMIO 940c[25] = 1
>
> Modified the programming of 2 registers in render ring init function
> 1. GFX_MODE_GEN7 (Enabling TLB invalidate)
> 2. MI_MODE (Enabling MI Flush)


Frankly, I don't know much about these particular workarounds, but for
bisecting any regressions it would probably be a good idea to split the
patch per workaround touched.

BR,
Jani.

>
> Change-Id: Ib60f8dc7d2213552e498d588e1bba7eaa1a921ed
> Signed-off-by: akashgoe <akash.g...@intel.com>
> Signed-off-by: Akash Goel <akash.g...@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_reg.h         |  3 ++
>  drivers/gpu/drm/i915/intel_pm.c         | 32 +++++++++++-------
>  drivers/gpu/drm/i915/intel_ringbuffer.c | 58 
> ++++++++++++++++++++++++++++++---
>  3 files changed, 77 insertions(+), 16 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
> index a699efd..d829754 100644
> --- a/drivers/gpu/drm/i915/i915_reg.h
> +++ b/drivers/gpu/drm/i915/i915_reg.h
> @@ -934,6 +934,9 @@
>  #define   ECO_GATING_CX_ONLY (1<<3)
>  #define   ECO_FLIP_DONE              (1<<0)
>  
> +#define GEN7_CACHE_MODE_0    0x07000 /* IVB+ only */
> +#define GEN7_RC_OP_FLUSH_ENABLE (1<<0)
> +
>  #define CACHE_MODE_1         0x7004 /* IVB+ */
>  #define   PIXEL_SUBSPAN_COLLECT_OPT_DISABLE (1<<6)
>  
> diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c
> index 469170c..e4d220c 100644
> --- a/drivers/gpu/drm/i915/intel_pm.c
> +++ b/drivers/gpu/drm/i915/intel_pm.c
> @@ -4947,22 +4947,18 @@ static void valleyview_init_clock_gating(struct 
> drm_device *dev)
>                  _MASKED_BIT_ENABLE(GEN7_MAX_PS_THREAD_DEP |
>                                     GEN7_PSD_SINGLE_PORT_DISPATCH_ENABLE));
>  
> -     /* Apply the WaDisableRHWOOptimizationForRenderHang:vlv workaround. */
> -     I915_WRITE(GEN7_COMMON_SLICE_CHICKEN1,
> -                GEN7_CSC1_RHWO_OPT_DISABLE_IN_RCC);
> -
> -     /* WaApplyL3ControlAndL3ChickenMode:vlv */
> -     I915_WRITE(GEN7_L3CNTLREG1, I915_READ(GEN7_L3CNTLREG1) | GEN7_L3AGDIS);
>       I915_WRITE(GEN7_L3_CHICKEN_MODE_REGISTER, GEN7_WA_L3_CHICKEN_MODE);
>  
> +     /* WaDisable_RenderCache_OperationalFlush
> +      * Clear bit 0, so we do a AND with the mask
> +      * to keep other bits the same */
> +     I915_WRITE(GEN7_CACHE_MODE_0,  (I915_READ(GEN7_CACHE_MODE_0) |
> +                       _MASKED_BIT_DISABLE(GEN7_RC_OP_FLUSH_ENABLE)));
> +
>       /* WaForceL3Serialization:vlv */
>       I915_WRITE(GEN7_L3SQCREG4, I915_READ(GEN7_L3SQCREG4) &
>                  ~L3SQ_URB_READ_CAM_MATCH_DISABLE);
>  
> -     /* WaDisableDopClockGating:vlv */
> -     I915_WRITE(GEN7_ROW_CHICKEN2,
> -                _MASKED_BIT_ENABLE(DOP_CLOCK_GATING_DISABLE));
> -
>       /* This is required by WaCatErrorRejectionIssue:vlv */
>       I915_WRITE(GEN7_SQ_CHICKEN_MBCUNIT_CONFIG,
>                  I915_READ(GEN7_SQ_CHICKEN_MBCUNIT_CONFIG) |
> @@ -4991,10 +4987,24 @@ static void valleyview_init_clock_gating(struct 
> drm_device *dev)
>                  GEN6_RCPBUNIT_CLOCK_GATE_DISABLE |
>                  GEN6_RCCUNIT_CLOCK_GATE_DISABLE);
>  
> -     I915_WRITE(GEN7_UCGCTL4, GEN7_L3BANK2X_CLOCK_GATE_DISABLE);
> +     /* WaDisableL3Bank2xClockGate
> +      * Disabling L3 clock gating- MMIO 940c[25] = 1
> +      * Set bit 25, to disable L3_BANK_2x_CLK_GATING */
> +     I915_WRITE(GEN7_UCGCTL4,
> +             I915_READ(GEN7_UCGCTL4) | GEN7_L3BANK2X_CLOCK_GATE_DISABLE);
>  
>       I915_WRITE(MI_ARB_VLV, MI_ARB_DISPLAY_TRICKLE_FEED_DISABLE);
>  
> +     /* WaVSThreadDispatchOverride
> +      * Hw will decide which half slice the thread will dispatch.
> +      * May not be needed for VLV, as its a single slice */
> +     I915_WRITE(GEN7_CACHE_MODE_0,
> +             I915_READ(GEN7_FF_THREAD_MODE) &
> +             (~GEN7_FF_VS_SCHED_LOAD_BALANCE));
> +
> +     /* WaDisable4x2SubspanOptimization,
> +      * Disable combining of two 2x2 subspans into a 4x2 subspan
> +      * Set chicken bit to disable subspan optimization */
>       I915_WRITE(CACHE_MODE_1,
>                  _MASKED_BIT_ENABLE(PIXEL_SUBSPAN_COLLECT_OPT_DISABLE));
>  
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c 
> b/drivers/gpu/drm/i915/intel_ringbuffer.c
> index 442c9a6..c9350a1 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> @@ -563,7 +563,9 @@ static int init_render_ring(struct intel_ring_buffer 
> *ring)
>       int ret = init_ring_common(ring);
>  
>       if (INTEL_INFO(dev)->gen > 3)
> -             I915_WRITE(MI_MODE, _MASKED_BIT_ENABLE(VS_TIMER_DISPATCH));
> +             if (!IS_VALLEYVIEW(dev))
> +                     I915_WRITE(MI_MODE,
> +                             _MASKED_BIT_ENABLE(VS_TIMER_DISPATCH));
>  
>       /* We need to disable the AsyncFlip performance optimisations in order
>        * to use MI_WAIT_FOR_EVENT within the CS. It should already be
> @@ -579,10 +581,17 @@ static int init_render_ring(struct intel_ring_buffer 
> *ring)
>               I915_WRITE(GFX_MODE,
>                          _MASKED_BIT_ENABLE(GFX_TLB_INVALIDATE_ALWAYS));
>  
> -     if (IS_GEN7(dev))
> -             I915_WRITE(GFX_MODE_GEN7,
> -                        _MASKED_BIT_DISABLE(GFX_TLB_INVALIDATE_ALWAYS) |
> -                        _MASKED_BIT_ENABLE(GFX_REPLAY_MODE));
> +     if (IS_GEN7(dev)) {
> +             if (IS_VALLEYVIEW(dev)) {
> +                     I915_WRITE(GFX_MODE_GEN7,
> +                             _MASKED_BIT_ENABLE(GFX_REPLAY_MODE));
> +                     I915_WRITE(MI_MODE, I915_READ(MI_MODE) |
> +                             _MASKED_BIT_ENABLE(MI_FLUSH_ENABLE));
> +             } else
> +                     I915_WRITE(GFX_MODE_GEN7,
> +                             _MASKED_BIT_DISABLE(GFX_TLB_INVALIDATE_ALWAYS) |
> +                             _MASKED_BIT_ENABLE(GFX_REPLAY_MODE));
> +     }
>  
>       if (INTEL_INFO(dev)->gen >= 5) {
>               ret = init_pipe_control(ring);
> @@ -2157,6 +2166,7 @@ int
>  intel_ring_flush_all_caches(struct intel_ring_buffer *ring)
>  {
>       int ret;
> +     int i;
>  
>       if (!ring->gpu_caches_dirty)
>               return 0;
> @@ -2165,6 +2175,26 @@ intel_ring_flush_all_caches(struct intel_ring_buffer 
> *ring)
>       if (ret)
>               return ret;
>  
> +     /* WaReadAfterWriteHazard*/
> +     /* Send a number of Store Data commands here to finish
> +        flushing hardware pipeline.This is needed in the case
> +        where the next workload tries reading from the same
> +        surface that this batch writes to. Without these StoreDWs,
> +        not all of the data will actually be flushd to the surface
> +        by the time the next batch starts reading it, possibly
> +        causing a small amount of corruption.*/
> +     ret = intel_ring_begin(ring, 4 * 12);
> +     if (ret)
> +             return ret;
> +     for (i = 0; i < 12; i++) {
> +             intel_ring_emit(ring, MI_STORE_DWORD_INDEX);
> +             intel_ring_emit(ring, I915_GEM_HWS_SCRATCH_INDEX <<
> +                                             MI_STORE_DWORD_INDEX_SHIFT);
> +             intel_ring_emit(ring, 0);
> +             intel_ring_emit(ring, MI_NOOP);
> +     }
> +     intel_ring_advance(ring);
> +
>       trace_i915_gem_ring_flush(ring, 0, I915_GEM_GPU_DOMAINS);
>  
>       ring->gpu_caches_dirty = false;
> @@ -2176,6 +2206,24 @@ intel_ring_invalidate_all_caches(struct 
> intel_ring_buffer *ring)
>  {
>       uint32_t flush_domains;
>       int ret;
> +     int i;
> +
> +     /* WaTlbInvalidateStoreDataBefore*/
> +     /* Before pipecontrol with TLB invalidate set, need 2 store
> +        data commands (such as MI_STORE_DATA_IMM or MI_STORE_DATA_INDEX)
> +        Without this, hardware cannot guarantee the command after the
> +        PIPE_CONTROL with TLB inv will not use the old TLB values.*/
> +     ret = intel_ring_begin(ring, 4 * 2);
> +     if (ret)
> +             return ret;
> +     for (i = 0; i < 2; i++) {
> +             intel_ring_emit(ring, MI_STORE_DWORD_INDEX);
> +             intel_ring_emit(ring, I915_GEM_HWS_SCRATCH_INDEX <<
> +                                             MI_STORE_DWORD_INDEX_SHIFT);
> +             intel_ring_emit(ring, 0);
> +             intel_ring_emit(ring, MI_NOOP);
> +     }
> +     intel_ring_advance(ring);
>  
>       flush_domains = 0;
>       if (ring->gpu_caches_dirty)
> -- 
> 1.8.5.2
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx

-- 
Jani Nikula, Intel Open Source Technology Center
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

Reply via email to