If we're moving a bunch of buffers from the CPU domain to the GPU domain, and
we've already blown out the entire cache via a wbinvd, there is nothing more to
do.

With this and the previous patches, I am seeing a 3x FPS increase on a certain
benchmark which uses a giant 2d array texture. Unless I missed something in the
code, it should only affect non-LLC i915 platforms.

I haven't yet run any numbers for other benchmarks, nor have I attempted to
check if various conformance tests still pass.

v2: Rewrite the patch to be i915 only.
Determine up front whether or not we need to wbinvd.

Signed-off-by: Ben Widawsky <b...@bwidawsk.net>
---
 drivers/gpu/drm/i915/i915_drv.h            |  8 ++++++++
 drivers/gpu/drm/i915/i915_gem.c            | 11 +++++------
 drivers/gpu/drm/i915/i915_gem_execbuffer.c | 20 ++++++++++++++++----
 drivers/gpu/drm/i915/intel_lrc.c           | 10 ++++++++--
 4 files changed, 37 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 90ff6aa..5d2f62d 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1643,6 +1643,7 @@ struct i915_workarounds {
 
 struct eb_vmas {
        struct list_head vmas;
+       bool do_wbinvd;
        int and;
        union {
                struct i915_vma *lut[0];
@@ -1913,6 +1914,8 @@ struct drm_i915_private {
                void (*stop_ring)(struct intel_engine_cs *ring);
        } gt;
 
+       size_t wbinvd_threshold;
+
        uint32_t request_uniq;
 
        /*
@@ -2810,6 +2813,11 @@ static inline bool i915_stop_ring_allow_warn(struct 
drm_i915_private *dev_priv)
 
 void i915_gem_reset(struct drm_device *dev);
 bool i915_gem_clflush_object(struct drm_i915_gem_object *obj, bool force);
+static inline bool cpu_cache_is_coherent(struct drm_device *dev,
+                                        enum i915_cache_level level)
+{
+       return HAS_LLC(dev) || level != I915_CACHE_NONE;
+}
 int __must_check i915_gem_object_finish_gpu(struct drm_i915_gem_object *obj);
 int __must_check i915_gem_init(struct drm_device *dev);
 int i915_gem_init_rings(struct drm_device *dev);
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index fc81889..5bfb332 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -61,12 +61,6 @@ static int i915_gem_shrinker_oom(struct notifier_block *nb,
                                 void *ptr);
 static unsigned long i915_gem_shrink_all(struct drm_i915_private *dev_priv);
 
-static bool cpu_cache_is_coherent(struct drm_device *dev,
-                                 enum i915_cache_level level)
-{
-       return HAS_LLC(dev) || level != I915_CACHE_NONE;
-}
-
 static bool cpu_write_needs_clflush(struct drm_i915_gem_object *obj)
 {
        if (!cpu_cache_is_coherent(obj->base.dev, obj->cache_level))
@@ -4878,6 +4872,11 @@ int i915_gem_init(struct drm_device *dev)
                dev_priv->gt.stop_ring = intel_logical_ring_stop;
        }
 
+       dev_priv->wbinvd_threshold = boot_cpu_data.x86_cache_size << 10;
+       /* Pick a high default in the unlikely case we got nothing */
+       if (!dev_priv->wbinvd_threshold)
+               dev_priv->wbinvd_threshold = (8 << 20);
+
        ret = i915_gem_init_userptr(dev);
        if (ret)
                goto out_unlock;
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c 
b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 13ed13e..56f9268 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -50,7 +50,7 @@ eb_create(struct drm_i915_gem_execbuffer2 *args)
                unsigned size = args->buffer_count;
                size *= sizeof(struct i915_vma *);
                size += sizeof(struct eb_vmas);
-               eb = kmalloc(size, GFP_TEMPORARY | __GFP_NOWARN | 
__GFP_NORETRY);
+               eb = kzalloc(size, GFP_TEMPORARY | __GFP_NOWARN | 
__GFP_NORETRY);
        }
 
        if (eb == NULL) {
@@ -78,6 +78,7 @@ eb_reset(struct eb_vmas *eb)
 {
        if (eb->and >= 0)
                memset(eb->buckets, 0, (eb->and+1)*sizeof(struct hlist_head));
+       eb->do_wbinvd = false;
 }
 
 static int
@@ -154,6 +155,11 @@ eb_lookup_vmas(struct eb_vmas *eb,
                        hlist_add_head(&vma->exec_node,
                                       &eb->buckets[handle & eb->and]);
                }
+
+               if (vma->node.size >= to_i915(obj->base.dev)->wbinvd_threshold 
&&
+                   obj->base.write_domain & I915_GEM_DOMAIN_CPU &&
+                   !cpu_cache_is_coherent(obj->base.dev, obj->cache_level))
+                       eb->do_wbinvd = true;
                ++i;
        }
 
@@ -826,7 +832,7 @@ i915_gem_execbuffer_move_to_gpu(struct intel_engine_cs 
*ring,
        struct list_head *vmas = &eb->vmas;
        struct i915_vma *vma;
        uint32_t flush_domains = 0;
-       bool flush_chipset = false;
+       bool flush_chipset = eb->do_wbinvd;
        int ret;
 
        list_for_each_entry(vma, vmas, exec_list) {
@@ -835,12 +841,18 @@ i915_gem_execbuffer_move_to_gpu(struct intel_engine_cs 
*ring,
                if (ret)
                        return ret;
 
+               flush_domains |= obj->base.write_domain;
+
+               if (eb->do_wbinvd)
+                       continue;
+
                if (obj->base.write_domain & I915_GEM_DOMAIN_CPU)
                        flush_chipset |= i915_gem_clflush_object(obj, false);
-
-               flush_domains |= obj->base.write_domain;
        }
 
+       if (eb->do_wbinvd)
+               wbinvd();
+
        if (flush_chipset)
                i915_gem_chipset_flush(ring->dev);
 
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 03741f9..16ca4a2 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -586,12 +586,18 @@ static int execlists_move_to_gpu(struct intel_ringbuffer 
*ringbuf,
                if (ret)
                        return ret;
 
+               flush_domains |= obj->base.write_domain;
+
+               if (eb->do_wbinvd)
+                       continue;
+
                if (obj->base.write_domain & I915_GEM_DOMAIN_CPU)
                        i915_gem_clflush_object(obj, false);
-
-               flush_domains |= obj->base.write_domain;
        }
 
+       if (eb->do_wbinvd)
+               wbinvd();
+
        if (flush_domains & I915_GEM_DOMAIN_GTT)
                wmb();
 
-- 
2.3.0

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

Reply via email to