On Thu, Mar 03, 2016 at 08:50:46PM +0000, Chris Wilson wrote:
> Yes, patch is sane, I'm just messing around with my Braswell at the
> moment and then I'll try again at getting some numbers. First glance
> said 10% in reducing latency (with a 100% throughput improvement in one
> particular small copy scenario, that I want to reproduce and do some back
> of the envelope calculations to check that it is sane), but the machine
> (thanks to execlists) just dies as soon as I try some more interesting
> benchmarks.

I rebased the patch ontop of the execlists thread so that I could
actually use the machine...

diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 4d005dd..5c0a4e0 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -549,13 +549,10 @@ static int intel_execlists_submit(void *arg)
 
        intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
        do {
-               u32 status;
-               u32 status_id;
-               u32 submit_contexts;
-               u32 status_pointer;
                unsigned read_pointer, write_pointer;
-
-               spin_lock(&ring->execlist_lock);
+               u32 csb[GEN8_CSB_ENTRIES][2];
+               u32 status_pointer;
+               unsigned i, read, submit_contexts;
 
                set_current_state(TASK_INTERRUPTIBLE);
                status_pointer = I915_READ_FW(RING_CONTEXT_STATUS_PTR(ring));
@@ -563,8 +560,6 @@ static int intel_execlists_submit(void *arg)
                write_pointer = GEN8_CSB_WRITE_PTR(status_pointer);
                if (read_pointer == write_pointer) {
                        intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
-                       spin_unlock(&ring->execlist_lock);
-
                        if (kthread_should_stop())
                                return 0;
 
@@ -577,37 +572,41 @@ static int intel_execlists_submit(void *arg)
                if (read_pointer > write_pointer)
                        write_pointer += GEN8_CSB_ENTRIES;
 
-               submit_contexts = 0;
+               read = 0;
                while (read_pointer < write_pointer) {
-                       status = get_context_status(ring, ++read_pointer, 
&status_id);
+                       csb[read][0] = get_context_status(ring, ++read_pointer,
+                                                         &csb[read][1]);
+                       read++;
+               }
 
-                       if (unlikely(status & GEN8_CTX_STATUS_PREEMPTED)) {
-                               if (status & GEN8_CTX_STATUS_LITE_RESTORE) {
-                                       if 
(execlists_check_remove_request(ring, status_id))
+               I915_WRITE_FW(RING_CONTEXT_STATUS_PTR(ring),
+                             _MASKED_FIELD(GEN8_CSB_READ_PTR_MASK,
+                                           (write_pointer % GEN8_CSB_ENTRIES) 
<< 8));
+
+               spin_lock(&ring->execlist_lock);
+
+               submit_contexts = 0;
+               for (i = 0; i < read; i++) {
+                       if (unlikely(csb[i][0] & GEN8_CTX_STATUS_PREEMPTED)) {
+                               if (csb[i][0] & GEN8_CTX_STATUS_LITE_RESTORE) {
+                                       if 
(execlists_check_remove_request(ring, csb[i][1]))
                                                WARN(1, "Lite Restored request 
removed from queue\n");
                                } else
                                        WARN(1, "Preemption without Lite 
Restore\n");
                        }
 
-                       if (status & (GEN8_CTX_STATUS_ACTIVE_IDLE |
-                                     GEN8_CTX_STATUS_ELEMENT_SWITCH))
+                       if (csb[i][0] & (GEN8_CTX_STATUS_ACTIVE_IDLE |
+                                        GEN8_CTX_STATUS_ELEMENT_SWITCH))
                                submit_contexts +=
-                                       execlists_check_remove_request(ring, 
status_id);
+                                       execlists_check_remove_request(ring, 
csb[i][1]);
                }
 
                if (submit_contexts) {
                        if (!ring->disable_lite_restore_wa ||
-                           (status & GEN8_CTX_STATUS_ACTIVE_IDLE))
+                           (csb[i][0] & GEN8_CTX_STATUS_ACTIVE_IDLE))
                                execlists_context_unqueue__locked(ring);
                }
 
-
-               /* Update the read pointer to the old write pointer. Manual 
ringbuffer
-                * management ftw </sarcasm> */
-               I915_WRITE_FW(RING_CONTEXT_STATUS_PTR(ring),
-                             _MASKED_FIELD(GEN8_CSB_READ_PTR_MASK,
-                                           (write_pointer % GEN8_CSB_ENTRIES) 
<< 8));
-
                spin_unlock(&ring->execlist_lock);
 
                if (unlikely(submit_contexts > 2))

On braswell that improves the nop dispatch latency by 20%

          gem:exec:latency:0: -0.35%
          gem:exec:latency:1: +4.57%
          gem:exec:latency:2: +0.07%
          gem:exec:latency:4: +18.05%
          gem:exec:latency:8: +26.97%
         gem:exec:latency:16: +20.37%
         gem:exec:latency:32: +19.91%
         gem:exec:latency:64: +24.06%
        gem:exec:latency:128: +23.75%
        gem:exec:latency:256: +24.54%
        gem:exec:latency:512: +24.30%
       gem:exec:latency:1024: +24.43%

Outside of that scenario, the changes are more or less in the noise.
Even if we look at the full round-trip latency for synchronous execution.

     gem:exec:nop:rcs:single: -2.68%
 gem:exec:nop:rcs:continuous: -2.28%
     gem:exec:nop:bcs:single: -2.31%
 gem:exec:nop:bcs:continuous: +16.64%
     gem:exec:nop:vcs:single: -6.24%
 gem:exec:nop:vcs:continuous: +3.76%
    gem:exec:nop:vecs:single: +2.56%
gem:exec:nop:vecs:continuous: +1.83%

And with any busywork on top, we lose the effect:

     gem:exec:trace:Atlantis: -0.12%
       gem:exec:trace:glamor: +2.08%
     gem:exec:trace:glxgears: +0.79%
    gem:exec:trace:OglBatch7: +0.45%
          gem:exec:trace:sna: +0.57%
gem:exec:trace:unigine-valley:+0.01%
          gem:exec:trace:uxa: +5.63%

I am hesitant to r-b something that cannot be tested since the relevant
igt simply explode on my machine both before and after the patch.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

Reply via email to