On Fri, Mar 01, 2013 at 08:32:57PM +0200, ville.syrj...@linux.intel.com wrote:
> From: Ville Syrjälä <ville.syrj...@linux.intel.com>
> 
> Currently all scanout buffers must be uncached because the
> display controller doesn't snoop the LLC. SNB introduced another
> method to guarantee coherency for the display controller. It's
> called the GFDT or graphics data type.
> 
> Pages that have the GFDT bit enabled in their PTEs get flushed
> all the way to memory when a MI_FLUSH_DW or PIPE_CONTROL is
> issued with the "synchronize GFDT" bit set.
> 
> So rather than making all scanout buffers uncached, set the GFDT
> bit in their PTEs, and modify the ring flush functions to enable
> the "synchronize GFDT" bit.
> 
> On HSW the GFDT bit was removed from the PTE, and it's only present in
> surface state, so we can't really set it from the kernel. Also the docs
> state that the hardware isn't actually guaranteed to respect the GFDT
> bit. So it looks like GFDT isn't all that useful on HSW.
> 
> So far I've tried this very quickly on an IVB machine, and
> it seems to be working as advertised. No idea if it does any
> good though.
> 
> TODO:
> - make sure there are no missing flushes (CPU access doesn't
>   respect GFDT after all).
> - measure it and see if there's some real benefit
> - maybe we can track whether "synchronize GFDT" needs to be
>   issued, and skip it when possible. needs some numbers to
> >   determine if it's worthwhile.
> 
> Signed-off-by: Ville Syrjälä <ville.syrj...@linux.intel.com>

Iirc when I've tried this out a while back it regressed a few benchmarks.
Chris and I suspected cache thrashing, but it's hard to tell without proper
instrumentation support. Chris played around with a few tricks to mark
other giant bos as uncacheable, but he couldn't find any improved
workloads.

In short I think this needs to come with decent amounts of numbers
attached, like the TODO says ;-)

The other thing was that I didn't manage to get things to work properly,
leaving some random cachline dirt on the screen. Looking at your code, you
add the gfdt flush to every ring_flush, whereas I've tried to be clever
and only flushed after batches rendering to the frontbuffer. So probably a
bug in my code, or a flush on a given ring doesn't flush out caches for
one of the other engines.

And finally a bikeshed: I'd vote for a slightly more generic unsigned
long cache_flags instead of bool gfdt.

Cheers, Daniel

> ---
>  drivers/gpu/drm/i915/i915_drv.h            |  18 +++--
>  drivers/gpu/drm/i915/i915_gem.c            | 109 
> ++++++++++++++++++++++++-----
>  drivers/gpu/drm/i915/i915_gem_context.c    |   3 +-
>  drivers/gpu/drm/i915/i915_gem_execbuffer.c |  10 +--
>  drivers/gpu/drm/i915/i915_gem_gtt.c        |  38 ++++++----
>  drivers/gpu/drm/i915/i915_reg.h            |   2 +
>  drivers/gpu/drm/i915/intel_display.c       |   4 +-
>  drivers/gpu/drm/i915/intel_overlay.c       |   2 +-
>  drivers/gpu/drm/i915/intel_ringbuffer.c    |  25 +++++++
>  9 files changed, 167 insertions(+), 44 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index e95337c..537b344 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -408,7 +408,8 @@ struct i915_gtt {
>       void (*gtt_insert_entries)(struct drm_device *dev,
>                                  struct sg_table *st,
>                                  unsigned int pg_start,
> -                                enum i915_cache_level cache_level);
> +                                enum i915_cache_level cache_level,
> +                                bool gfdt);
>  };
>  #define gtt_total_entries(gtt) ((gtt).total >> PAGE_SHIFT)
>  
> @@ -429,7 +430,8 @@ struct i915_hw_ppgtt {
>       void (*insert_entries)(struct i915_hw_ppgtt *ppgtt,
>                              struct sg_table *st,
>                              unsigned int pg_start,
> -                            enum i915_cache_level cache_level);
> +                            enum i915_cache_level cache_level,
> +                            bool gfdt);
>       void (*cleanup)(struct i915_hw_ppgtt *ppgtt);
>  };
>  
> @@ -1169,6 +1171,7 @@ struct drm_i915_gem_object {
>       unsigned int fenced_gpu_access:1;
>  
>       unsigned int cache_level:2;
> +     unsigned int gfdt:1;
>  
>       unsigned int has_aliasing_ppgtt_mapping:1;
>       unsigned int has_global_gtt_mapping:1;
> @@ -1310,6 +1313,10 @@ struct drm_i915_file_private {
>  #define HAS_LLC(dev)            (INTEL_INFO(dev)->has_llc)
>  #define I915_NEED_GFX_HWS(dev)       (INTEL_INFO(dev)->need_gfx_hws)
>  
> +/* Only SNB and IVB have GFDT in PTEs */
> +#define HAS_GFDT(dev)                (HAS_LLC(dev) && !IS_HASWELL(dev) && \
> +                              (IS_GEN6(dev) || IS_GEN7(dev)))
> +
>  #define HAS_HW_CONTEXTS(dev) (INTEL_INFO(dev)->gen >= 6)
>  #define HAS_ALIASING_PPGTT(dev)      (INTEL_INFO(dev)->gen >=6 && 
> !IS_VALLEYVIEW(dev))
>  
> @@ -1639,7 +1646,8 @@ i915_gem_object_set_to_gtt_domain(struct 
> drm_i915_gem_object *obj,
>  int __must_check
>  i915_gem_object_set_to_cpu_domain(struct drm_i915_gem_object *obj, bool 
> write);
>  int __must_check
> -i915_gem_object_pin_to_display_plane(struct drm_i915_gem_object *obj,
> +i915_gem_object_pin_to_display_plane(struct drm_device *dev,
> +                                  struct drm_i915_gem_object *obj,
>                                    u32 alignment,
>                                    struct intel_ring_buffer *pipelined);
>  int i915_gem_attach_phys_object(struct drm_device *dev,
> @@ -1681,14 +1689,14 @@ int i915_gem_context_destroy_ioctl(struct drm_device 
> *dev, void *data,
>  void i915_gem_cleanup_aliasing_ppgtt(struct drm_device *dev);
>  void i915_ppgtt_bind_object(struct i915_hw_ppgtt *ppgtt,
>                           struct drm_i915_gem_object *obj,
> -                         enum i915_cache_level cache_level);
> +                         enum i915_cache_level cache_level, bool gfdt);
>  void i915_ppgtt_unbind_object(struct i915_hw_ppgtt *ppgtt,
>                             struct drm_i915_gem_object *obj);
>  
>  void i915_gem_restore_gtt_mappings(struct drm_device *dev);
>  int __must_check i915_gem_gtt_prepare_object(struct drm_i915_gem_object 
> *obj);
>  void i915_gem_gtt_bind_object(struct drm_i915_gem_object *obj,
> -                             enum i915_cache_level cache_level);
> +                           enum i915_cache_level cache_level, bool gfdt);
>  void i915_gem_gtt_unbind_object(struct drm_i915_gem_object *obj);
>  void i915_gem_gtt_finish_object(struct drm_i915_gem_object *obj);
>  void i915_gem_init_global_gtt(struct drm_device *dev);
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 8413ffc..c635430 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -3035,8 +3035,13 @@ i915_gem_clflush_object(struct drm_i915_gem_object 
> *obj)
>        * and flushes when moving into and out of the RENDER domain, correct
>        * snooping behaviour occurs naturally as the result of our domain
>        * tracking.
> +      *
> +      * Also flush all pinned objects with GFDT enabled. Such objects
> +      * may be used for scanout, and the CPU doesn't know anything
> +      * about GFDT.
>        */
> -     if (obj->cache_level != I915_CACHE_NONE)
> +     if (obj->cache_level != I915_CACHE_NONE &&
> +         !(obj->gfdt && obj->pin_count))
>               return;
>  
>       trace_i915_gem_object_clflush(obj);
> @@ -3154,6 +3159,11 @@ int i915_gem_object_set_cache_level(struct 
> drm_i915_gem_object *obj,
>       struct drm_device *dev = obj->base.dev;
>       drm_i915_private_t *dev_priv = dev->dev_private;
>       int ret;
> +     bool gfdt = obj->gfdt;
> +
> +     /* Clear GFDT when moving to uncached */
> +     if (cache_level == I915_CACHE_NONE)
> +             gfdt = false;
>  
>       if (obj->cache_level == cache_level)
>               return 0;
> @@ -3187,10 +3197,10 @@ int i915_gem_object_set_cache_level(struct 
> drm_i915_gem_object *obj,
>               }
>  
>               if (obj->has_global_gtt_mapping)
> -                     i915_gem_gtt_bind_object(obj, cache_level);
> +                     i915_gem_gtt_bind_object(obj, cache_level, gfdt);
>               if (obj->has_aliasing_ppgtt_mapping)
>                       i915_ppgtt_bind_object(dev_priv->mm.aliasing_ppgtt,
> -                                            obj, cache_level);
> +                                            obj, cache_level, gfdt);
>  
>               obj->gtt_space->color = cache_level;
>       }
> @@ -3219,6 +3229,65 @@ int i915_gem_object_set_cache_level(struct 
> drm_i915_gem_object *obj,
>       }
>  
>       obj->cache_level = cache_level;
> +     obj->gfdt = gfdt;
> +     i915_gem_verify_gtt(dev);
> +     return 0;
> +}
> +
> +static int i915_gem_object_set_gfdt(struct drm_i915_gem_object *obj,
> +                                 bool gfdt)
> +{
> +     struct drm_device *dev = obj->base.dev;
> +     drm_i915_private_t *dev_priv = dev->dev_private;
> +     int ret;
> +
> +     if (!HAS_GFDT(dev))
> +             return -ENODEV;
> +
> +     if (obj->gfdt == gfdt)
> +             return 0;
> +
> +     /* no point in setting GFDT on uncached object */
> +     if (obj->cache_level == I915_CACHE_NONE)
> +             return -EINVAL;
> +
> +     if (obj->gtt_space) {
> +             ret = i915_gem_object_finish_gpu(obj);
> +             if (ret)
> +                     return ret;
> +
> +             i915_gem_object_finish_gtt(obj);
> +
> +             if (obj->has_global_gtt_mapping)
> +                     i915_gem_gtt_bind_object(obj, obj->cache_level, gfdt);
> +             if (obj->has_aliasing_ppgtt_mapping)
> +                     i915_ppgtt_bind_object(dev_priv->mm.aliasing_ppgtt,
> +                                            obj, obj->cache_level, gfdt);
> +     }
> +
> +     if (gfdt) {
> +             u32 old_read_domains, old_write_domain;
> +
> +             /*
> +              * Since we're just now enabling GFDT there's most
> +              * likely dirty data in the LLC. It must be flushed
> +              * to memory the old fashined way.
> +              */
> +             WARN_ON(obj->base.write_domain & ~I915_GEM_DOMAIN_CPU);
> +             WARN_ON(obj->base.read_domains & ~I915_GEM_DOMAIN_CPU);
> +
> +             old_read_domains = obj->base.read_domains;
> +             old_write_domain = obj->base.write_domain;
> +
> +             obj->base.read_domains = I915_GEM_DOMAIN_CPU;
> +             obj->base.write_domain = I915_GEM_DOMAIN_CPU;
> +
> +             trace_i915_gem_object_change_domain(obj,
> +                                                 old_read_domains,
> +                                                 old_write_domain);
> +     }
> +
> +     obj->gfdt = gfdt;
>       i915_gem_verify_gtt(dev);
>       return 0;
>  }
> @@ -3291,7 +3360,8 @@ unlock:
>   * any flushes to be pipelined (for pageflips).
>   */
>  int
> -i915_gem_object_pin_to_display_plane(struct drm_i915_gem_object *obj,
> +i915_gem_object_pin_to_display_plane(struct drm_device *dev,
> +                                  struct drm_i915_gem_object *obj,
>                                    u32 alignment,
>                                    struct intel_ring_buffer *pipelined)
>  {
> @@ -3304,18 +3374,23 @@ i915_gem_object_pin_to_display_plane(struct 
> drm_i915_gem_object *obj,
>                       return ret;
>       }
>  
> -     /* The display engine is not coherent with the LLC cache on gen6.  As
> -      * a result, we make sure that the pinning that is about to occur is
> -      * done with uncached PTEs. This is lowest common denominator for all
> -      * chipsets.
> -      *
> -      * However for gen6+, we could do better by using the GFDT bit instead
> -      * of uncaching, which would allow us to flush all the LLC-cached data
> -      * with that bit in the PTE to main memory with just one PIPE_CONTROL.
> +     /*
> +      * Try to set the GFDT bit instead of uncaching. This allow us to flush
> +      * all the LLC-cached data with that bit in the PTE to main memory with
> +      * just one PIPE_CONTROL.
>        */
> -     ret = i915_gem_object_set_cache_level(obj, I915_CACHE_NONE);
> -     if (ret)
> -             return ret;
> +     ret = i915_gem_object_set_gfdt(obj, true);
> +     if (ret) {
> +             /*
> +              * The display engine is not coherent with the LLC cache on 
> gen6.  As
> +              * a result, we make sure that the pinning that is about to 
> occur is
> +              * done with uncached PTEs. This is lowest common denominator 
> for all
> +              * chipsets.
> +              */
> +             ret = i915_gem_object_set_cache_level(obj, I915_CACHE_NONE);
> +             if (ret)
> +                     return ret;
> +     }
>  
>       /* As the user may map the buffer once pinned in the display plane
>        * (e.g. libkms for the bootup splash), we have to ensure that we
> @@ -3499,11 +3574,11 @@ i915_gem_object_pin(struct drm_i915_gem_object *obj,
>                       return ret;
>  
>               if (!dev_priv->mm.aliasing_ppgtt)
> -                     i915_gem_gtt_bind_object(obj, obj->cache_level);
> +                     i915_gem_gtt_bind_object(obj, obj->cache_level, 
> obj->gfdt);
>       }
>  
>       if (!obj->has_global_gtt_mapping && map_and_fenceable)
> -             i915_gem_gtt_bind_object(obj, obj->cache_level);
> +             i915_gem_gtt_bind_object(obj, obj->cache_level, obj->gfdt);
>  
>       obj->pin_count++;
>       obj->pin_mappable |= map_and_fenceable;
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c 
> b/drivers/gpu/drm/i915/i915_gem_context.c
> index 21177d9..7244c0c 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> @@ -382,7 +382,8 @@ static int do_switch(struct i915_hw_context *to)
>       }
>  
>       if (!to->obj->has_global_gtt_mapping)
> -             i915_gem_gtt_bind_object(to->obj, to->obj->cache_level);
> +             i915_gem_gtt_bind_object(to->obj, to->obj->cache_level,
> +                                      to->obj->gfdt);
>  
>       if (!to->is_initialized || is_default_context(to))
>               hw_flags |= MI_RESTORE_INHIBIT;
> diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c 
> b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> index 2f2daeb..28a3c42 100644
> --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> @@ -197,7 +197,8 @@ i915_gem_execbuffer_relocate_entry(struct 
> drm_i915_gem_object *obj,
>           reloc->write_domain == I915_GEM_DOMAIN_INSTRUCTION &&
>           !target_i915_obj->has_global_gtt_mapping)) {
>               i915_gem_gtt_bind_object(target_i915_obj,
> -                                      target_i915_obj->cache_level);
> +                                      target_i915_obj->cache_level,
> +                                      target_i915_obj->gfdt);
>       }
>  
>       /* Validate that the target is in a valid r/w GPU domain */
> @@ -432,7 +433,7 @@ i915_gem_execbuffer_reserve_object(struct 
> drm_i915_gem_object *obj,
>       /* Ensure ppgtt mapping exists if needed */
>       if (dev_priv->mm.aliasing_ppgtt && !obj->has_aliasing_ppgtt_mapping) {
>               i915_ppgtt_bind_object(dev_priv->mm.aliasing_ppgtt,
> -                                    obj, obj->cache_level);
> +                                    obj, obj->cache_level, obj->gfdt);
>  
>               obj->has_aliasing_ppgtt_mapping = 1;
>       }
> @@ -449,7 +450,7 @@ i915_gem_execbuffer_reserve_object(struct 
> drm_i915_gem_object *obj,
>  
>       if (entry->flags & EXEC_OBJECT_NEEDS_GTT &&
>           !obj->has_global_gtt_mapping)
> -             i915_gem_gtt_bind_object(obj, obj->cache_level);
> +             i915_gem_gtt_bind_object(obj, obj->cache_level, obj->gfdt);
>  
>       return 0;
>  }
> @@ -1011,7 +1012,8 @@ i915_gem_do_execbuffer(struct drm_device *dev, void 
> *data,
>        * hsw should have this fixed, but let's be paranoid and do it
>        * unconditionally for now. */
>       if (flags & I915_DISPATCH_SECURE && !batch_obj->has_global_gtt_mapping)
> -             i915_gem_gtt_bind_object(batch_obj, batch_obj->cache_level);
> +             i915_gem_gtt_bind_object(batch_obj, batch_obj->cache_level,
> +                                      batch_obj->gfdt);
>  
>       ret = i915_gem_execbuffer_move_to_gpu(ring, &eb->objects);
>       if (ret)
> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c 
> b/drivers/gpu/drm/i915/i915_gem_gtt.c
> index 926a1e2..3c7e48b 100644
> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
> @@ -42,15 +42,20 @@ typedef uint32_t gtt_pte_t;
>  #define HSW_PTE_UNCACHED             (0)
>  #define GEN6_PTE_CACHE_LLC           (2 << 1)
>  #define GEN6_PTE_CACHE_LLC_MLC               (3 << 1)
> +#define GEN6_PTE_GFDT                        (1 << 3)
>  #define GEN6_PTE_ADDR_ENCODE(addr)   GEN6_GTT_ADDR_ENCODE(addr)
>  
>  static inline gtt_pte_t gen6_pte_encode(struct drm_device *dev,
>                                       dma_addr_t addr,
> -                                     enum i915_cache_level level)
> +                                     enum i915_cache_level level,
> +                                     bool gfdt)
>  {
>       gtt_pte_t pte = GEN6_PTE_VALID;
>       pte |= GEN6_PTE_ADDR_ENCODE(addr);
>  
> +     if (gfdt && HAS_GFDT(dev))
> +             pte |= GEN6_PTE_GFDT;
> +
>       switch (level) {
>       case I915_CACHE_LLC_MLC:
>               /* Haswell doesn't set L3 this way */
> @@ -89,7 +94,7 @@ static void gen6_ppgtt_clear_range(struct i915_hw_ppgtt 
> *ppgtt,
>  
>       scratch_pte = gen6_pte_encode(ppgtt->dev,
>                                     ppgtt->scratch_page_dma_addr,
> -                                   I915_CACHE_LLC);
> +                                   I915_CACHE_LLC, false);
>  
>       while (num_entries) {
>               last_pte = first_pte + num_entries;
> @@ -112,7 +117,8 @@ static void gen6_ppgtt_clear_range(struct i915_hw_ppgtt 
> *ppgtt,
>  static void gen6_ppgtt_insert_entries(struct i915_hw_ppgtt *ppgtt,
>                                     struct sg_table *pages,
>                                     unsigned first_entry,
> -                                   enum i915_cache_level cache_level)
> +                                   enum i915_cache_level cache_level,
> +                                   bool gfdt)
>  {
>       gtt_pte_t *pt_vaddr;
>       unsigned act_pd = first_entry / I915_PPGTT_PT_ENTRIES;
> @@ -133,7 +139,7 @@ static void gen6_ppgtt_insert_entries(struct 
> i915_hw_ppgtt *ppgtt,
>               for (j = first_pte; j < I915_PPGTT_PT_ENTRIES; j++) {
>                       page_addr = sg_dma_address(sg) + (m << PAGE_SHIFT);
>                       pt_vaddr[j] = gen6_pte_encode(ppgtt->dev, page_addr,
> -                                                   cache_level);
> +                                                   cache_level, gfdt);
>  
>                       /* grab the next page */
>                       if (++m == segment_len) {
> @@ -279,11 +285,12 @@ void i915_gem_cleanup_aliasing_ppgtt(struct drm_device 
> *dev)
>  
>  void i915_ppgtt_bind_object(struct i915_hw_ppgtt *ppgtt,
>                           struct drm_i915_gem_object *obj,
> -                         enum i915_cache_level cache_level)
> +                         enum i915_cache_level cache_level,
> +                         bool gfdt)
>  {
>       ppgtt->insert_entries(ppgtt, obj->pages,
>                             obj->gtt_space->start >> PAGE_SHIFT,
> -                           cache_level);
> +                           cache_level, gfdt);
>  }
>  
>  void i915_ppgtt_unbind_object(struct i915_hw_ppgtt *ppgtt,
> @@ -401,7 +408,7 @@ void i915_gem_restore_gtt_mappings(struct drm_device *dev)
>  
>       list_for_each_entry(obj, &dev_priv->mm.bound_list, gtt_list) {
>               i915_gem_clflush_object(obj);
> -             i915_gem_gtt_bind_object(obj, obj->cache_level);
> +             i915_gem_gtt_bind_object(obj, obj->cache_level, obj->gfdt);
>       }
>  
>       i915_gem_chipset_flush(dev);
> @@ -429,7 +436,8 @@ int i915_gem_gtt_prepare_object(struct 
> drm_i915_gem_object *obj)
>  static void gen6_ggtt_insert_entries(struct drm_device *dev,
>                                    struct sg_table *st,
>                                    unsigned int first_entry,
> -                                  enum i915_cache_level level)
> +                                  enum i915_cache_level level,
> +                                  bool gfdt)
>  {
>       struct drm_i915_private *dev_priv = dev->dev_private;
>       struct scatterlist *sg = st->sgl;
> @@ -443,7 +451,7 @@ static void gen6_ggtt_insert_entries(struct drm_device 
> *dev,
>               len = sg_dma_len(sg) >> PAGE_SHIFT;
>               for (m = 0; m < len; m++) {
>                       addr = sg_dma_address(sg) + (m << PAGE_SHIFT);
> -                     iowrite32(gen6_pte_encode(dev, addr, level),
> +                     iowrite32(gen6_pte_encode(dev, addr, level, gfdt),
>                                 &gtt_entries[i]);
>                       i++;
>               }
> @@ -457,7 +465,7 @@ static void gen6_ggtt_insert_entries(struct drm_device 
> *dev,
>        */
>       if (i != 0)
>               WARN_ON(readl(&gtt_entries[i-1])
> -                     != gen6_pte_encode(dev, addr, level));
> +                     != gen6_pte_encode(dev, addr, level, gfdt));
>  
>       /* This next bit makes the above posting read even more important. We
>        * want to flush the TLBs only after we're certain all the PTE updates
> @@ -483,7 +491,7 @@ static void gen6_ggtt_clear_range(struct drm_device *dev,
>               num_entries = max_entries;
>  
>       scratch_pte = gen6_pte_encode(dev, dev_priv->gtt.scratch_page_dma,
> -                                   I915_CACHE_LLC);
> +                                   I915_CACHE_LLC, false);
>       for (i = 0; i < num_entries; i++)
>               iowrite32(scratch_pte, &gtt_base[i]);
>       readl(gtt_base);
> @@ -493,7 +501,8 @@ static void gen6_ggtt_clear_range(struct drm_device *dev,
>  static void i915_ggtt_insert_entries(struct drm_device *dev,
>                                    struct sg_table *st,
>                                    unsigned int pg_start,
> -                                  enum i915_cache_level cache_level)
> +                                  enum i915_cache_level cache_level,
> +                                  bool gfdt)
>  {
>       unsigned int flags = (cache_level == I915_CACHE_NONE) ?
>               AGP_USER_MEMORY : AGP_USER_CACHED_MEMORY;
> @@ -511,14 +520,15 @@ static void i915_ggtt_clear_range(struct drm_device 
> *dev,
>  
>  
>  void i915_gem_gtt_bind_object(struct drm_i915_gem_object *obj,
> -                           enum i915_cache_level cache_level)
> +                           enum i915_cache_level cache_level,
> +                           bool gfdt)
>  {
>       struct drm_device *dev = obj->base.dev;
>       struct drm_i915_private *dev_priv = dev->dev_private;
>  
>       dev_priv->gtt.gtt_insert_entries(dev, obj->pages,
>                                        obj->gtt_space->start >> PAGE_SHIFT,
> -                                      cache_level);
> +                                      cache_level, gfdt);
>  
>       obj->has_global_gtt_mapping = 1;
>  }
> diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
> index cd226c2..6aa29d5 100644
> --- a/drivers/gpu/drm/i915/i915_reg.h
> +++ b/drivers/gpu/drm/i915/i915_reg.h
> @@ -242,6 +242,7 @@
>  #define MI_FLUSH_DW          MI_INSTR(0x26, 1) /* for GEN6 */
>  #define   MI_FLUSH_DW_STORE_INDEX    (1<<21)
>  #define   MI_INVALIDATE_TLB          (1<<18)
> +#define   MI_SYNCHRONIZE_GFDT                (1<<17)
>  #define   MI_FLUSH_DW_OP_STOREDW     (1<<14)
>  #define   MI_INVALIDATE_BSD          (1<<7)
>  #define   MI_FLUSH_DW_USE_GTT                (1<<2)
> @@ -311,6 +312,7 @@
>  #define   PIPE_CONTROL_GLOBAL_GTT_IVB                        (1<<24) /* 
> gen7+ */
>  #define   PIPE_CONTROL_CS_STALL                              (1<<20)
>  #define   PIPE_CONTROL_TLB_INVALIDATE                        (1<<18)
> +#define   PIPE_CONTROL_SYNCHRONIZE_GFDT                      (1<<17)
>  #define   PIPE_CONTROL_QW_WRITE                              (1<<14)
>  #define   PIPE_CONTROL_DEPTH_STALL                   (1<<13)
>  #define   PIPE_CONTROL_WRITE_FLUSH                   (1<<12)
> diff --git a/drivers/gpu/drm/i915/intel_display.c 
> b/drivers/gpu/drm/i915/intel_display.c
> index d912306..fc99d55 100644
> --- a/drivers/gpu/drm/i915/intel_display.c
> +++ b/drivers/gpu/drm/i915/intel_display.c
> @@ -1968,7 +1968,7 @@ intel_pin_and_fence_fb_obj(struct drm_device *dev,
>       }
>  
>       dev_priv->mm.interruptible = false;
> -     ret = i915_gem_object_pin_to_display_plane(obj, alignment, pipelined);
> +     ret = i915_gem_object_pin_to_display_plane(dev, obj, alignment, 
> pipelined);
>       if (ret)
>               goto err_interruptible;
>  
> @@ -6340,7 +6340,7 @@ static int intel_crtc_cursor_set(struct drm_crtc *crtc,
>                       goto fail_locked;
>               }
>  
> -             ret = i915_gem_object_pin_to_display_plane(obj, 0, NULL);
> +             ret = i915_gem_object_pin_to_display_plane(dev, obj, 0, NULL);
>               if (ret) {
>                       DRM_ERROR("failed to move cursor bo into the GTT\n");
>                       goto fail_locked;
> diff --git a/drivers/gpu/drm/i915/intel_overlay.c 
> b/drivers/gpu/drm/i915/intel_overlay.c
> index 67a2501..b4110a2 100644
> --- a/drivers/gpu/drm/i915/intel_overlay.c
> +++ b/drivers/gpu/drm/i915/intel_overlay.c
> @@ -695,7 +695,7 @@ static int intel_overlay_do_put_image(struct 
> intel_overlay *overlay,
>       if (ret != 0)
>               return ret;
>  
> -     ret = i915_gem_object_pin_to_display_plane(new_bo, 0, NULL);
> +     ret = i915_gem_object_pin_to_display_plane(dev, new_bo, 0, NULL);
>       if (ret != 0)
>               return ret;
>  
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c 
> b/drivers/gpu/drm/i915/intel_ringbuffer.c
> index 1d5d613..8406247 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> @@ -234,6 +234,12 @@ gen6_render_ring_flush(struct intel_ring_buffer *ring,
>                * when the render cache is indeed flushed.
>                */
>               flags |= PIPE_CONTROL_CS_STALL;
> +
> +             /* Flush GFDT out to memory */
> +             if (HAS_GFDT(ring->dev)) {
> +                     flags |= PIPE_CONTROL_QW_WRITE;
> +                     flags |= PIPE_CONTROL_SYNCHRONIZE_GFDT;
> +             }
>       }
>       if (invalidate_domains) {
>               flags |= PIPE_CONTROL_TLB_INVALIDATE;
> @@ -306,6 +312,13 @@ gen7_render_ring_flush(struct intel_ring_buffer *ring,
>       if (flush_domains) {
>               flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
>               flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
> +
> +             /* Flush GFDT out to memory */
> +             if (HAS_GFDT(ring->dev)) {
> +                     flags |= PIPE_CONTROL_QW_WRITE;
> +                     flags |= PIPE_CONTROL_GLOBAL_GTT_IVB;
> +                     flags |= PIPE_CONTROL_SYNCHRONIZE_GFDT;
> +             }
>       }
>       if (invalidate_domains) {
>               flags |= PIPE_CONTROL_TLB_INVALIDATE;
> @@ -1557,6 +1570,12 @@ static int gen6_ring_flush(struct intel_ring_buffer 
> *ring,
>               return ret;
>  
>       cmd = MI_FLUSH_DW;
> +
> +     /* Flush GFDT out to memory */
> +     if (flush & I915_GEM_GPU_DOMAINS && HAS_GFDT(ring->dev))
> +             cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW |
> +                     MI_SYNCHRONIZE_GFDT;
> +
>       /*
>        * Bspec vol 1c.5 - video engine command streamer:
>        * "If ENABLED, all TLBs will be invalidated once the flush
> @@ -1629,6 +1648,12 @@ static int blt_ring_flush(struct intel_ring_buffer 
> *ring,
>               return ret;
>  
>       cmd = MI_FLUSH_DW;
> +
> +     /* Flush GFDT out to memory */
> +     if (flush & I915_GEM_DOMAIN_RENDER && HAS_GFDT(ring->dev))
> +             cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW |
> +                     MI_SYNCHRONIZE_GFDT;
> +
>       /*
>        * Bspec vol 1c.3 - blitter engine command streamer:
>        * "If ENABLED, all TLBs will be invalidated once the flush
> -- 
> 1.7.12.4
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx

-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

Reply via email to