Besides dramatically simplifying the submission code (requests ftw), we
can reduce the time the execlist spinlock is held and, importantly, avoid
having to hold it across the context-switch register reads.

Signed-off-by: Chris Wilson <ch...@chris-wilson.co.uk>
---
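[Illustrative note, not part of the patch: a condensed sketch of the locking
pattern the submission kthread below ends up with. The CSB walk is collapsed
into csb_last_completed_seqno(), a hypothetical stand-in; the other names are
the ones introduced by this patch. The point is that the register reads happen
with only forcewake held, and the execlist_lock covers just the list surgery.]

	do {
		u32 seqno;

		/* Read the context-status buffer entries without taking the
		 * execlist spinlock (only forcewake is held here).
		 */
		seqno = csb_last_completed_seqno(ring);	/* hypothetical helper */

		if (seqno) {
			/* The spinlock now guards only the queue/port updates. */
			spin_lock(&ring->execlist_lock);
			if (execlists_complete_requests(ring, seqno))
				execlists_context_unqueue(ring);
			spin_unlock(&ring->execlist_lock);
		}
	} while (1);
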
 drivers/gpu/drm/i915/i915_debugfs.c        |  20 +-
 drivers/gpu/drm/i915/i915_gem.c            |   8 +-
 drivers/gpu/drm/i915/i915_gem_request.h    |  21 +-
 drivers/gpu/drm/i915/i915_guc_submission.c |  31 +-
 drivers/gpu/drm/i915/intel_lrc.c           | 505 +++++++++++------------------
 drivers/gpu/drm/i915/intel_lrc.h           |   3 -
 drivers/gpu/drm/i915/intel_ringbuffer.h    |   8 +-
 7 files changed, 209 insertions(+), 387 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 15a6fddfb79b..a5ea90944bbb 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -2005,8 +2005,7 @@ static void i915_dump_lrc_obj(struct seq_file *m,
                return;
        }
 
-       seq_printf(m, "CONTEXT: %s %u\n", ring->name,
-                  intel_execlists_ctx_id(ctx_obj));
+       seq_printf(m, "CONTEXT: %s\n", ring->name);
 
        if (!i915_gem_obj_ggtt_bound(ctx_obj))
                seq_puts(m, "\tNot bound in GGTT\n");
@@ -2092,7 +2091,6 @@ static int i915_execlists(struct seq_file *m, void *data)
        intel_runtime_pm_get(dev_priv);
 
        for_each_ring(ring, dev_priv, ring_id) {
-               struct drm_i915_gem_request *head_req = NULL;
                int count = 0;
 
                seq_printf(m, "%s\n", ring->name);
@@ -2105,8 +2103,8 @@ static int i915_execlists(struct seq_file *m, void *data)
                status_pointer = I915_READ(RING_CONTEXT_STATUS_PTR(ring));
                seq_printf(m, "\tStatus pointer: 0x%08X\n", status_pointer);
 
-               read_pointer = ring->next_context_status_buffer;
-               write_pointer = GEN8_CSB_WRITE_PTR(status_pointer);
+               read_pointer = (status_pointer >> 8) & GEN8_CSB_PTR_MASK;
+               write_pointer = status_pointer & GEN8_CSB_PTR_MASK;
                if (read_pointer > write_pointer)
                        write_pointer += GEN8_CSB_ENTRIES;
                seq_printf(m, "\tRead pointer: 0x%08X, write pointer 0x%08X\n",
@@ -2123,21 +2121,9 @@ static int i915_execlists(struct seq_file *m, void *data)
                spin_lock(&ring->execlist_lock);
                list_for_each(cursor, &ring->execlist_queue)
                        count++;
-               head_req = list_first_entry_or_null(&ring->execlist_queue,
-                               struct drm_i915_gem_request, execlist_link);
                spin_unlock(&ring->execlist_lock);
 
                seq_printf(m, "\t%d requests in queue\n", count);
-               if (head_req) {
-                       struct drm_i915_gem_object *ctx_obj;
-
-                       ctx_obj = head_req->ctx->engine[ring_id].state;
-                       seq_printf(m, "\tHead request id: %u\n",
-                                  intel_execlists_ctx_id(ctx_obj));
-                       seq_printf(m, "\tHead request tail: %u\n",
-                                  head_req->tail);
-               }
-
                seq_putc(m, '\n');
        }
 
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index eb875ecd7907..054e11cff00f 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2193,12 +2193,12 @@ static void i915_gem_reset_ring_cleanup(struct intel_engine_cs *engine)
 
        if (i915.enable_execlists) {
                spin_lock(&engine->execlist_lock);
-
-               /* list_splice_tail_init checks for empty lists */
                list_splice_tail_init(&engine->execlist_queue,
-                                     &engine->execlist_retired_req_list);
-
+                                     &engine->execlist_completed);
+               memset(&engine->execlist_port, 0,
+                      sizeof(engine->execlist_port));
                spin_unlock(&engine->execlist_lock);
+
                intel_execlists_retire_requests(engine);
        }
 
diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h
index 59957d5edfdb..c2e83584f8a2 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.h
+++ b/drivers/gpu/drm/i915/i915_gem_request.h
@@ -63,10 +63,11 @@ struct drm_i915_gem_request {
         * This is required to calculate the maximum available ringbuffer
         * space without overwriting the postfix.
         */
-        u32 postfix;
+       u32 postfix;
 
        /** Position in the ringbuffer of the end of the whole request */
        u32 tail;
+       u32 wa_tail;
 
        /**
         * Context and ring buffer related to this request
@@ -99,24 +100,8 @@ struct drm_i915_gem_request {
        /** process identifier submitting this request */
        struct pid *pid;
 
-       /**
-        * The ELSP only accepts two elements at a time, so we queue
-        * context/tail pairs on a given queue (ring->execlist_queue) until the
-        * hardware is available. The queue serves a double purpose: we also use
-        * it to keep track of the up to 2 contexts currently in the hardware
-        * (usually one in execution and the other queued up by the GPU): We
-        * only remove elements from the head of the queue when the hardware
-        * informs us that an element has been completed.
-        *
-        * All accesses to the queue are mediated by a spinlock
-        * (ring->execlist_lock).
-        */
-
        /** Execlist link in the submission queue.*/
-       struct list_head execlist_link;
-
-       /** Execlists no. of times this request has been sent to the ELSP */
-       int elsp_submitted;
+       struct list_head execlist_link; /* guarded by engine->execlist_lock */
 };
 
 struct drm_i915_gem_request *
diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
index 5a6251926367..f4e09952d52c 100644
--- a/drivers/gpu/drm/i915/i915_guc_submission.c
+++ b/drivers/gpu/drm/i915/i915_guc_submission.c
@@ -393,7 +393,6 @@ static void guc_init_ctx_desc(struct intel_guc *guc,
                struct intel_ring *ring = ctx->engine[i].ring;
                struct intel_engine_cs *engine;
                struct drm_i915_gem_object *obj;
-               uint64_t ctx_desc;
 
                /* TODO: We have a design issue to be solved here. Only when we
                 * receive the first batch, we know which engine is used by the
@@ -407,8 +406,7 @@ static void guc_init_ctx_desc(struct intel_guc *guc,
                        break;  /* XXX: continue? */
 
                engine = ring->engine;
-               ctx_desc = intel_lr_context_descriptor(ctx, engine);
-               lrc->context_desc = (u32)ctx_desc;
+               lrc->context_desc = engine->execlist_context_descriptor;
 
                /* The state page is after PPHWSP */
                lrc->ring_lcra = i915_gem_obj_ggtt_offset(obj) +
@@ -548,7 +546,7 @@ static int guc_add_workqueue_item(struct i915_guc_client *gc,
                        WQ_NO_WCFLUSH_WAIT;
 
        /* The GuC wants only the low-order word of the context descriptor */
-       wqi->context_desc = (u32)intel_lr_context_descriptor(rq->ctx, rq->engine);
+       wqi->context_desc = rq->engine->execlist_context_descriptor;
 
        /* The GuC firmware wants the tail index in QWords, not bytes */
        tail = rq->ring->tail >> 3;
@@ -562,27 +560,6 @@ static int guc_add_workqueue_item(struct i915_guc_client *gc,
 
 #define CTX_RING_BUFFER_START          0x08
 
-/* Update the ringbuffer pointer in a saved context image */
-static void lr_context_update(struct drm_i915_gem_request *rq)
-{
-       enum intel_engine_id ring_id = rq->engine->id;
-       struct drm_i915_gem_object *ctx_obj = rq->ctx->engine[ring_id].state;
-       struct drm_i915_gem_object *rb_obj = rq->ring->obj;
-       struct page *page;
-       uint32_t *reg_state;
-
-       BUG_ON(!ctx_obj);
-       WARN_ON(!i915_gem_obj_is_pinned(ctx_obj));
-       WARN_ON(!i915_gem_obj_is_pinned(rb_obj));
-
-       page = i915_gem_object_get_dirty_page(ctx_obj, LRC_STATE_PN);
-       reg_state = kmap_atomic(page);
-
-       reg_state[CTX_RING_BUFFER_START+1] = i915_gem_obj_ggtt_offset(rb_obj);
-
-       kunmap_atomic(reg_state);
-}
-
 /**
  * i915_guc_submit() - Submit commands through GuC
  * @client:    the guc client where commands will go through
@@ -597,10 +574,6 @@ int i915_guc_submit(struct i915_guc_client *client,
        enum intel_engine_id ring_id = rq->engine->id;
        int q_ret, b_ret;
 
-       /* Need this because of the deferred pin ctx and ring */
-       /* Shall we move this right after ring is pinned? */
-       lr_context_update(rq);
-
        q_ret = guc_add_workqueue_item(client, rq);
        if (q_ret == 0)
                b_ret = guc_ring_doorbell(client);
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index de5889e95d6d..80b346a3fd8a 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -265,233 +265,133 @@ int intel_sanitize_enable_execlists(struct drm_device *dev, int enable_execlists
        return 0;
 }
 
-/**
- * intel_execlists_ctx_id() - get the Execlists Context ID
- * @ctx_obj: Logical Ring Context backing object.
- *
- * Do not confuse with ctx->id! Unfortunately we have a name overload
- * here: the old context ID we pass to userspace as a handler so that
- * they can refer to a context, and the new context ID we pass to the
- * ELSP so that the GPU can inform us of the context status via
- * interrupts.
- *
- * Return: 20-bits globally unique context ID.
- */
-u32 intel_execlists_ctx_id(struct drm_i915_gem_object *ctx_obj)
-{
-       u32 lrca = i915_gem_obj_ggtt_offset(ctx_obj) +
-                       LRC_PPHWSP_PN * PAGE_SIZE;
-
-       /* LRCA is required to be 4K aligned so the more significant 20 bits
-        * are globally unique */
-       return lrca >> 12;
-}
-
-static bool disable_lite_restore_wa(struct intel_engine_cs *ring)
-{
-       return (IS_SKL_REVID(ring->dev, 0, SKL_REVID_B0) ||
-               IS_BXT_REVID(ring->dev, 0, BXT_REVID_A1)) &&
-               (ring->id == VCS || ring->id == VCS2);
-}
-
-uint64_t intel_lr_context_descriptor(struct intel_context *ctx,
-                                    struct intel_engine_cs *ring)
+static u32 execlists_request_write_tail(struct drm_i915_gem_request *req)
 {
-       struct drm_i915_gem_object *ctx_obj = ctx->engine[ring->id].state;
-       uint64_t desc;
-       uint64_t lrca = i915_gem_obj_ggtt_offset(ctx_obj) +
-                       LRC_PPHWSP_PN * PAGE_SIZE;
-
-       WARN_ON(lrca & 0xFFFFFFFF00000FFFULL);
-
-       desc = GEN8_CTX_VALID;
-       desc |= GEN8_CTX_ADDRESSING_MODE(ring->i915) << GEN8_CTX_ADDRESSING_MODE_SHIFT;
-       if (IS_GEN8(ring->i915))
-               desc |= GEN8_CTX_L3LLC_COHERENT;
-       desc |= GEN8_CTX_PRIVILEGE;
-       desc |= lrca;
-       desc |= (u64)intel_execlists_ctx_id(ctx_obj) << GEN8_CTX_ID_SHIFT;
-
-       /* TODO: WaDisableLiteRestore when we start using semaphore
-        * signalling between Command Streamers */
-       /* desc |= GEN8_CTX_FORCE_RESTORE; */
+       struct intel_ring *ring = req->ring;
+       struct i915_hw_ppgtt *ppgtt = req->ctx->ppgtt;
 
-       /* WaEnableForceRestoreInCtxtDescForVCS:skl */
-       /* WaEnableForceRestoreInCtxtDescForVCS:bxt */
-       if (disable_lite_restore_wa(ring))
-               desc |= GEN8_CTX_FORCE_RESTORE;
+       if (ppgtt && !USES_FULL_48BIT_PPGTT(req->i915)) {
+               /* True 32b PPGTT with dynamic page allocation: update PDP
+                * registers and point the unallocated PDPs to scratch page.
+                * PML4 is allocated during ppgtt init, so this is not needed
+                * in 48-bit mode.
+                */
+               if (ppgtt->pd_dirty_rings & intel_engine_flag(req->engine)) {
+                       ASSIGN_CTX_PDP(ppgtt, ring->registers, 3);
+                       ASSIGN_CTX_PDP(ppgtt, ring->registers, 2);
+                       ASSIGN_CTX_PDP(ppgtt, ring->registers, 1);
+                       ASSIGN_CTX_PDP(ppgtt, ring->registers, 0);
+                       ppgtt->pd_dirty_rings &= ~intel_engine_flag(req->engine);
+               }
+       }
 
-       return desc;
+       ring->registers[CTX_RING_TAIL+1] = req->tail;
+       return ring->context_descriptor;
 }
 
-static void execlists_elsp_write(struct drm_i915_gem_request *rq0,
-                                struct drm_i915_gem_request *rq1)
+static void execlists_submit_pair(struct intel_engine_cs *ring)
 {
+       struct drm_i915_private *dev_priv = ring->i915;
+       uint32_t desc[4];
 
-       struct intel_engine_cs *engine = rq0->engine;
-       struct drm_i915_private *dev_priv = rq0->i915;
-       uint64_t desc[2];
-
-       if (rq1) {
-               desc[1] = intel_lr_context_descriptor(rq1->ctx, rq1->engine);
-               rq1->elsp_submitted++;
-       } else {
-               desc[1] = 0;
-       }
+       if (ring->execlist_port[1]) {
+               desc[0] = execlists_request_write_tail(ring->execlist_port[1]);
+               desc[1] = ring->execlist_port[1]->fence.seqno;
+       } else
+               desc[1] = desc[0] = 0;
 
-       desc[0] = intel_lr_context_descriptor(rq0->ctx, rq0->engine);
-       rq0->elsp_submitted++;
+       desc[2] = execlists_request_write_tail(ring->execlist_port[0]);
+       desc[3] = ring->execlist_port[0]->fence.seqno;
 
-       /* You must always write both descriptors in the order below. */
-       spin_lock_irq(&dev_priv->uncore.lock);
-       intel_uncore_forcewake_get__locked(dev_priv, FORCEWAKE_ALL);
-       I915_WRITE_FW(RING_ELSP(engine), upper_32_bits(desc[1]));
-       I915_WRITE_FW(RING_ELSP(engine), lower_32_bits(desc[1]));
+       /* Note: You must always write both descriptors in the order below. */
+       I915_WRITE_FW(RING_ELSP(ring), desc[1]);
+       I915_WRITE_FW(RING_ELSP(ring), desc[0]);
+       I915_WRITE_FW(RING_ELSP(ring), desc[3]);
 
-       I915_WRITE_FW(RING_ELSP(engine), upper_32_bits(desc[0]));
        /* The context is automatically loaded after the following */
-       I915_WRITE_FW(RING_ELSP(engine), lower_32_bits(desc[0]));
-
-       /* ELSP is a wo register, use another nearby reg for posting */
-       POSTING_READ_FW(RING_EXECLIST_STATUS_LO(engine));
-       intel_uncore_forcewake_put__locked(dev_priv, FORCEWAKE_ALL);
-       spin_unlock_irq(&dev_priv->uncore.lock);
+       I915_WRITE_FW(RING_ELSP(ring), desc[2]);
 }
 
-static int execlists_update_context(struct drm_i915_gem_request *rq)
+static void execlists_context_unqueue(struct intel_engine_cs *engine)
 {
-       struct i915_hw_ppgtt *ppgtt = rq->ctx->ppgtt;
-       struct drm_i915_gem_object *ctx_obj = rq->ctx->engine[rq->engine->id].state;
-       struct drm_i915_gem_object *rb_obj = rq->ring->obj;
-       struct page *page;
-       uint32_t *reg_state;
-
-       BUG_ON(!ctx_obj);
-       WARN_ON(!i915_gem_obj_is_pinned(ctx_obj));
-       WARN_ON(!i915_gem_obj_is_pinned(rb_obj));
-
-       page = i915_gem_object_get_dirty_page(ctx_obj, LRC_STATE_PN);
-       reg_state = kmap_atomic(page);
+       struct drm_i915_gem_request *cursor;
+       bool submit = false;
+       int port = 0;
 
-       reg_state[CTX_RING_TAIL+1] = rq->tail;
-       reg_state[CTX_RING_BUFFER_START+1] = i915_gem_obj_ggtt_offset(rb_obj);
+       assert_spin_locked(&engine->execlist_lock);
 
-       if (ppgtt && !USES_FULL_48BIT_PPGTT(rq->i915)) {
-               /* True 32b PPGTT with dynamic page allocation: update PDP
-                * registers and point the unallocated PDPs to scratch page.
-                * PML4 is allocated during ppgtt init, so this is not needed
-                * in 48-bit mode.
+       /* Try to read in pairs and fill both submission ports */
+       cursor = engine->execlist_port[port];
+       if (cursor != NULL) {
+               /* WaIdleLiteRestore:bdw,skl
+                * Apply the wa NOOPs to prevent ring:HEAD == req:TAIL
+                * as we resubmit the request. See gen8_emit_request()
+                * for where we prepare the padding after the end of the
+                * request.
                 */
-               ASSIGN_CTX_PDP(ppgtt, reg_state, 3);
-               ASSIGN_CTX_PDP(ppgtt, reg_state, 2);
-               ASSIGN_CTX_PDP(ppgtt, reg_state, 1);
-               ASSIGN_CTX_PDP(ppgtt, reg_state, 0);
-       }
-
-       kunmap_atomic(reg_state);
-
-       return 0;
-}
+               cursor->tail = cursor->wa_tail;
+               cursor = list_next_entry(cursor, execlist_link);
+       } else
+               cursor = list_first_entry(&engine->execlist_queue,
+                                         typeof(*cursor),
+                                         execlist_link);
+       while (&cursor->execlist_link != &engine->execlist_queue) {
+               /* Same ctx: ignore earlier request, as the
+                * second request extends the first.
+                */
+               if (engine->execlist_port[port] &&
+                   cursor->ctx != engine->execlist_port[port]->ctx) {
+                       if (++port == ARRAY_SIZE(engine->execlist_port))
+                               break;
+               }
 
-static void execlists_submit_requests(struct drm_i915_gem_request *rq0,
-                                     struct drm_i915_gem_request *rq1)
-{
-       execlists_update_context(rq0);
+               engine->execlist_port[port] = cursor;
+               submit = true;
 
-       if (rq1)
-               execlists_update_context(rq1);
+               cursor = list_next_entry(cursor, execlist_link);
+       }
 
-       execlists_elsp_write(rq0, rq1);
+       if (submit)
+               execlists_submit_pair(engine);
 }
 
-static void execlists_context_unqueue(struct intel_engine_cs *engine)
+static bool execlists_complete_requests(struct intel_engine_cs *engine,
+                                       u32 seqno)
 {
-       struct drm_i915_gem_request *req0 = NULL, *req1 = NULL;
-       struct drm_i915_gem_request *cursor = NULL, *tmp = NULL;
-
        assert_spin_locked(&engine->execlist_lock);
 
-       /*
-        * If irqs are not active generate a warning as batches that finish
-        * without the irqs may get lost and a GPU Hang may occur.
-        */
-       WARN_ON(!intel_irqs_enabled(engine->dev->dev_private));
+       do {
+               struct drm_i915_gem_request *req;
 
-       if (list_empty(&engine->execlist_queue))
-               return;
+               req = engine->execlist_port[0];
+               if (req == NULL)
+                       break;
 
-       /* Try to read in pairs */
-       list_for_each_entry_safe(cursor, tmp, &engine->execlist_queue,
-                                execlist_link) {
-               if (!req0) {
-                       req0 = cursor;
-               } else if (req0->ctx == cursor->ctx) {
-                       /* Same ctx: ignore first request, as second request
-                        * will update tail past first request's workload */
-                       cursor->elsp_submitted = req0->elsp_submitted;
-                       list_del(&req0->execlist_link);
-                       list_add_tail(&req0->execlist_link,
-                               &engine->execlist_retired_req_list);
-                       req0 = cursor;
-               } else {
-                       req1 = cursor;
+               if (!i915_seqno_passed(seqno, req->fence.seqno))
                        break;
-               }
-       }
 
-       if (IS_GEN8(engine->dev) || IS_GEN9(engine->dev)) {
-               /*
-                * WaIdleLiteRestore: make sure we never cause a lite
-                * restore with HEAD==TAIL
+               /* Move the completed set of requests from the start of the
+                * execlist_queue over to the tail of the execlist_completed.
                 */
-               if (req0->elsp_submitted) {
-                       /*
-                        * Apply the wa NOOPS to prevent ring:HEAD == req:TAIL
-                        * as we resubmit the request. See gen8_add_request()
-                        * for where we prepare the padding after the end of the
-                        * request.
-                        */
-                       struct intel_ring *ring;
-
-                       ring = req0->ctx->engine[engine->id].ring;
-                       req0->tail += 8;
-                       req0->tail &= ring->size - 1;
-               }
-       }
-
-       WARN_ON(req1 && req1->elsp_submitted);
+               engine->execlist_completed.prev->next = engine->execlist_queue.next;
+               engine->execlist_completed.prev = &req->execlist_link;
 
-       execlists_submit_requests(req0, req1);
-}
-
-static bool execlists_check_remove_request(struct intel_engine_cs *ring,
-                                          u32 request_id)
-{
-       struct drm_i915_gem_request *head_req;
+               engine->execlist_queue.next = req->execlist_link.next;
+               req->execlist_link.next->prev = &engine->execlist_queue;
 
-       assert_spin_locked(&ring->execlist_lock);
+               req->execlist_link.next = &engine->execlist_completed;
 
-       head_req = list_first_entry_or_null(&ring->execlist_queue,
-                                           struct drm_i915_gem_request,
-                                           execlist_link);
-
-       if (head_req != NULL) {
-               struct drm_i915_gem_object *ctx_obj =
-                               head_req->ctx->engine[ring->id].state;
-               if (intel_execlists_ctx_id(ctx_obj) == request_id) {
-                       WARN(head_req->elsp_submitted == 0,
-                            "Never submitted head request\n");
-
-                       if (--head_req->elsp_submitted <= 0) {
-                               list_del(&head_req->execlist_link);
-                               list_add_tail(&head_req->execlist_link,
-                                       &ring->execlist_retired_req_list);
-                               return true;
-                       }
-               }
-       }
+               /* The hardware has completed the request on this port, it
+                * will switch to the next.
+                */
+               engine->execlist_port[0] = engine->execlist_port[1];
+               engine->execlist_port[1] = NULL;
+       } while (1);
 
-       return false;
+       if (engine->execlist_context_descriptor & GEN8_CTX_FORCE_RESTORE)
+               return engine->execlist_port[0] == NULL;
+       else
+               return engine->execlist_port[1] == NULL;
 }
 
 static void set_rtpriority(void)
@@ -504,23 +404,29 @@ static int intel_execlists_submit(void *arg)
 {
        struct intel_engine_cs *ring = arg;
        struct drm_i915_private *dev_priv = ring->i915;
+       const i915_reg_t ptrs = RING_CONTEXT_STATUS_PTR(ring);
 
        set_rtpriority();
 
+       intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
        do {
-               u32 status;
-               u32 status_id;
-               u32 submit_contexts;
                u8 head, tail;
+               u32 seqno;
 
                set_current_state(TASK_INTERRUPTIBLE);
-               head = ring->next_context_status_buffer;
-               tail = I915_READ(RING_CONTEXT_STATUS_PTR(ring)) & GEN8_CSB_PTR_MASK;
+               head = tail = 0;
+               if (READ_ONCE(ring->execlist_port[0])) {
+                       u32 x = I915_READ_FW(ptrs);
+                       head = x >> 8;
+                       tail = x;
+               }
                if (head == tail) {
+                       intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
                        if (kthread_should_stop())
                                return 0;
 
                        schedule();
+                       intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
                        continue;
                }
                __set_current_state(TASK_RUNNING);
@@ -528,86 +434,46 @@ static int intel_execlists_submit(void *arg)
                if (head > tail)
                        tail += GEN8_CSB_ENTRIES;
 
-               status = 0;
-               submit_contexts = 0;
-
-               spin_lock(&ring->execlist_lock);
-
+               seqno = 0;
                while (head++ < tail) {
-                       status = I915_READ(RING_CONTEXT_STATUS_BUF_LO(ring, head % GEN8_CSB_ENTRIES));
-                       status_id = I915_READ(RING_CONTEXT_STATUS_BUF_HI(ring, head % GEN8_CSB_ENTRIES));
-
-                       if (status & GEN8_CTX_STATUS_IDLE_ACTIVE)
-                               continue;
-
-                       if (status & GEN8_CTX_STATUS_PREEMPTED) {
-                               if (status & GEN8_CTX_STATUS_LITE_RESTORE) {
-                                       if (execlists_check_remove_request(ring, status_id))
-                                               WARN(1, "Lite Restored request removed from queue\n");
-                               } else
-                                       WARN(1, "Preemption without Lite Restore\n");
-                       }
-
-                       if ((status & GEN8_CTX_STATUS_ACTIVE_IDLE) ||
-                           (status & GEN8_CTX_STATUS_ELEMENT_SWITCH)) {
-                               if (execlists_check_remove_request(ring, status_id))
-                                       submit_contexts++;
+                       u32 status = I915_READ_FW(RING_CONTEXT_STATUS_BUF_LO(ring,
+                                                                            head % GEN8_CSB_ENTRIES));
+                       if (unlikely(status & GEN8_CTX_STATUS_PREEMPTED && 0)) {
+                               DRM_ERROR("Pre-empted request %x %s Lite Restore\n",
+                                         I915_READ_FW(RING_CONTEXT_STATUS_BUF_HI(ring, head % GEN8_CSB_ENTRIES)),
+                                         status & GEN8_CTX_STATUS_LITE_RESTORE ? "with" : "without");
                        }
+                       if (status & (GEN8_CTX_STATUS_ACTIVE_IDLE |
+                                     GEN8_CTX_STATUS_ELEMENT_SWITCH))
+                               seqno = I915_READ_FW(RING_CONTEXT_STATUS_BUF_HI(ring,
+                                                                               head % GEN8_CSB_ENTRIES));
                }
 
-               if (disable_lite_restore_wa(ring)) {
-                       /* Prevent a ctx to preempt itself */
-                       if ((status & GEN8_CTX_STATUS_ACTIVE_IDLE) &&
-                                       (submit_contexts != 0))
+               I915_WRITE_FW(ptrs,
+                             _MASKED_FIELD(GEN8_CSB_PTR_MASK<<8,
+                                           (tail % GEN8_CSB_ENTRIES) << 8));
+
+               if (seqno) {
+                       spin_lock(&ring->execlist_lock);
+                       if (execlists_complete_requests(ring, seqno))
                                execlists_context_unqueue(ring);
-               } else if (submit_contexts != 0) {
-                       execlists_context_unqueue(ring);
+                       spin_unlock(&ring->execlist_lock);
                }
-
-               spin_unlock(&ring->execlist_lock);
-
-               WARN(submit_contexts > 2, "More than two context complete events?\n");
-               ring->next_context_status_buffer = tail % GEN8_CSB_ENTRIES;
-               I915_WRITE(RING_CONTEXT_STATUS_PTR(ring),
-                          _MASKED_FIELD(GEN8_CSB_PTR_MASK << 8,
-                                        ring->next_context_status_buffer<<8));
        } while (1);
 }
 
 static int execlists_context_queue(struct drm_i915_gem_request *request)
 {
        struct intel_engine_cs *engine = request->engine;
-       struct drm_i915_gem_request *cursor;
-       int num_elements = 0;
 
        i915_gem_request_get(request);
 
        spin_lock(&engine->execlist_lock);
-
-       list_for_each_entry(cursor, &engine->execlist_queue, execlist_link)
-               if (++num_elements > 2)
-                       break;
-
-       if (num_elements > 2) {
-               struct drm_i915_gem_request *tail_req;
-
-               tail_req = list_last_entry(&engine->execlist_queue,
-                                          struct drm_i915_gem_request,
-                                          execlist_link);
-
-               if (request->ctx == tail_req->ctx) {
-                       WARN(tail_req->elsp_submitted != 0,
-                               "More than 2 already-submitted reqs queued\n");
-                       list_del(&tail_req->execlist_link);
-                       list_add_tail(&tail_req->execlist_link,
-                               &engine->execlist_retired_req_list);
-               }
-       }
-
        list_add_tail(&request->execlist_link, &engine->execlist_queue);
-       if (num_elements == 0)
-               execlists_context_unqueue(engine);
-
+       if (engine->execlist_port[0] == NULL) {
+               engine->execlist_port[0] = request;
+               execlists_submit_pair(engine);
+       }
        spin_unlock(&engine->execlist_lock);
 
        return 0;
@@ -641,56 +507,32 @@ int intel_logical_ring_alloc_request_extras(struct drm_i915_gem_request *request
        return 0;
 }
 
-/*
- * intel_logical_ring_advance_and_submit() - advance the tail and submit the workload
- * @request: Request to advance the logical ringbuffer of.
- *
- * The tail is updated in our logical ringbuffer struct, not in the actual context. What
- * really happens during submission is that the context and current tail will be placed
- * on a queue waiting for the ELSP to be ready to accept a new context submission. At that
- * point, the tail *inside* the context is updated and the ELSP written to.
- */
-static void
-intel_logical_ring_advance_and_submit(struct drm_i915_gem_request *request)
-{
-       struct drm_i915_private *dev_priv = request->i915;
-
-       intel_ring_advance(request->ring);
-       request->tail = request->ring->tail;
-
-       if (dev_priv->guc.execbuf_client)
-               i915_guc_submit(dev_priv->guc.execbuf_client, request);
-       else
-               execlists_context_queue(request);
-}
-
 bool intel_execlists_retire_requests(struct intel_engine_cs *ring)
 {
        struct drm_i915_gem_request *req, *tmp;
-       struct list_head retired_list;
+       struct list_head list;
 
-       WARN_ON(!mutex_is_locked(&ring->dev->struct_mutex));
-       if (list_empty(&ring->execlist_retired_req_list))
+       lockdep_assert_held(&ring->dev->struct_mutex);
+       if (list_empty(&ring->execlist_completed))
                goto out;
 
-       INIT_LIST_HEAD(&retired_list);
        spin_lock(&ring->execlist_lock);
-       list_replace_init(&ring->execlist_retired_req_list, &retired_list);
+       list_replace_init(&ring->execlist_completed, &list);
        spin_unlock(&ring->execlist_lock);
 
-       list_for_each_entry_safe(req, tmp, &retired_list, execlist_link) {
+       list_for_each_entry_safe(req, tmp, &list, execlist_link) {
                struct intel_context *ctx = req->ctx;
                struct drm_i915_gem_object *ctx_obj =
                                ctx->engine[ring->id].state;
 
                if (ctx_obj && (ctx != ring->default_context))
                        intel_lr_context_unpin(req);
-               list_del(&req->execlist_link);
+
                i915_gem_request_put(req);
        }
 
 out:
-       return list_empty(&ring->execlist_queue);
+       return READ_ONCE(ring->execlist_port[0]) == NULL;
 }
 
 void intel_logical_ring_stop(struct intel_engine_cs *ring)
@@ -720,6 +562,7 @@ static int intel_lr_context_do_pin(struct intel_engine_cs *ring,
                struct intel_ring *ringbuf)
 {
        struct drm_i915_private *dev_priv = ring->i915;
+       u32 ggtt_offset;
        int ret = 0;
 
        WARN_ON(!mutex_is_locked(&ring->dev->struct_mutex));
@@ -734,6 +577,16 @@ static int intel_lr_context_do_pin(struct intel_engine_cs *ring,
 
        ctx_obj->dirty = true;
 
+       ggtt_offset =
+               i915_gem_obj_ggtt_offset(ctx_obj) + LRC_PPHWSP_PN * PAGE_SIZE;
+       ringbuf->context_descriptor =
+               ggtt_offset | ring->execlist_context_descriptor;
+
+       ringbuf->registers =
+               kmap(i915_gem_object_get_dirty_page(ctx_obj, LRC_STATE_PN));
+       ringbuf->registers[CTX_RING_BUFFER_START+1] =
+               i915_gem_obj_ggtt_offset(ringbuf->obj);
+
        /* Invalidate GuC TLB. */
        if (i915.enable_guc_submission)
                I915_WRITE(GEN8_GTCR, GEN8_GTCR_INVALIDATE);
@@ -768,6 +621,7 @@ static int intel_lr_context_pin(struct drm_i915_gem_request *rq)
 
 void intel_lr_context_unpin(struct drm_i915_gem_request *rq)
 {
+       struct drm_i915_gem_object *ctx_obj;
        int engine = rq->engine->id;
 
        WARN_ON(!mutex_is_locked(&rq->i915->dev->struct_mutex));
@@ -775,7 +629,10 @@ void intel_lr_context_unpin(struct drm_i915_gem_request *rq)
                return;
 
        intel_ring_unmap(rq->ring);
-       i915_gem_object_ggtt_unpin(rq->ctx->engine[engine].state);
+
+       ctx_obj = rq->ctx->engine[engine].state;
+       kunmap(i915_gem_object_get_page(ctx_obj, LRC_STATE_PN));
+       i915_gem_object_ggtt_unpin(ctx_obj);
        i915_gem_context_unreference(rq->ctx);
 }
 
@@ -1168,12 +1025,39 @@ out:
        return ret;
 }
 
+static bool disable_lite_restore_wa(struct intel_engine_cs *ring)
+{
+       return (IS_SKL_REVID(ring->i915, 0, SKL_REVID_B0) ||
+               IS_BXT_REVID(ring->i915, 0, BXT_REVID_A1)) &&
+               (ring->id == VCS || ring->id == VCS2);
+}
+
+static uint64_t lr_context_descriptor(struct intel_engine_cs *ring)
+{
+       uint64_t desc;
+
+       desc = GEN8_CTX_VALID;
+       desc |= GEN8_CTX_ADDRESSING_MODE(ring->i915) << GEN8_CTX_ADDRESSING_MODE_SHIFT;
+       if (IS_GEN8(ring->i915))
+               desc |= GEN8_CTX_L3LLC_COHERENT;
+       desc |= GEN8_CTX_PRIVILEGE;
+
+       /* TODO: WaDisableLiteRestore when we start using semaphore
+        * signalling between Command Streamers */
+       /* desc |= GEN8_CTX_FORCE_RESTORE; */
+
+       /* WaEnableForceRestoreInCtxtDescForVCS:skl */
+       /* WaEnableForceRestoreInCtxtDescForVCS:bxt */
+       if (disable_lite_restore_wa(ring))
+               desc |= GEN8_CTX_FORCE_RESTORE;
+
+       return desc;
+}
+
 static int gen8_init_common_ring(struct intel_engine_cs *ring)
 {
        struct drm_device *dev = ring->dev;
        struct drm_i915_private *dev_priv = dev->dev_private;
-       u8 next_context_status_buffer_hw;
-
        lrc_setup_hardware_status_page(ring,
                                ring->default_context->engine[ring->id].state);
 
@@ -1197,18 +1081,6 @@ static int gen8_init_common_ring(struct intel_engine_cs *ring)
         * SKL  |         ?                |         ?            |
         * BXT  |         ?                |         ?            |
         */
-       next_context_status_buffer_hw =
-               GEN8_CSB_WRITE_PTR(I915_READ(RING_CONTEXT_STATUS_PTR(ring)));
-
-       /*
-        * When the CSB registers are reset (also after power-up / gpu reset),
-        * CSB write pointer is set to all 1's, which is not valid, use '5' in
-        * this special case, so the first element read is CSB[0].
-        */
-       if (next_context_status_buffer_hw == GEN8_CSB_PTR_MASK)
-               next_context_status_buffer_hw = (GEN8_CSB_ENTRIES - 1);
-
-       ring->next_context_status_buffer = next_context_status_buffer_hw;
        DRM_DEBUG_DRIVER("Execlists enabled for %s\n", ring->name);
 
        memset(&ring->hangcheck, 0, sizeof(ring->hangcheck));
@@ -1482,7 +1354,8 @@ static int gen8_add_request(struct drm_i915_gem_request *request)
        intel_ring_emit(ring, request->fence.seqno);
        intel_ring_emit(ring, MI_USER_INTERRUPT);
        intel_ring_emit(ring, MI_NOOP);
-       intel_logical_ring_advance_and_submit(request);
+       intel_ring_advance(ring);
+       request->tail = ring->tail;
 
        /*
         * Here we add two extra NOOPs as padding to avoid
@@ -1491,6 +1364,12 @@ static int gen8_add_request(struct drm_i915_gem_request *request)
        intel_ring_emit(ring, MI_NOOP);
        intel_ring_emit(ring, MI_NOOP);
        intel_ring_advance(ring);
+       request->wa_tail = ring->tail;
+
+       if (request->i915->guc.execbuf_client)
+               i915_guc_submit(request->i915->guc.execbuf_client, request);
+       else
+               execlists_context_queue(request);
 
        return 0;
 }
@@ -1569,9 +1448,11 @@ static int logical_ring_init(struct drm_device *dev, struct intel_engine_cs *rin
 
        INIT_LIST_HEAD(&ring->buffers);
        INIT_LIST_HEAD(&ring->execlist_queue);
-       INIT_LIST_HEAD(&ring->execlist_retired_req_list);
+       INIT_LIST_HEAD(&ring->execlist_completed);
        spin_lock_init(&ring->execlist_lock);
 
+       ring->execlist_context_descriptor = lr_context_descriptor(ring);
+
        ret = i915_cmd_parser_init_ring(ring);
        if (ret)
                goto error;
@@ -1592,8 +1473,6 @@ static int logical_ring_init(struct drm_device *dev, struct intel_engine_cs *rin
                goto error;
        }
 
-       ring->next_context_status_buffer =
-                       I915_READ(RING_CONTEXT_STATUS_PTR(ring)) & GEN8_CSB_PTR_MASK;
        task = kthread_run(intel_execlists_submit, ring,
                           "irq/i915:%de", ring->id);
        if (IS_ERR(task))
@@ -1904,9 +1783,7 @@ populate_lr_context(struct intel_context *ctx, struct drm_i915_gem_object *ctx_o
                                          CTX_CTRL_RS_CTX_ENABLE));
        ASSIGN_CTX_REG(reg_state, CTX_RING_HEAD, RING_HEAD(ring->mmio_base), 0);
        ASSIGN_CTX_REG(reg_state, CTX_RING_TAIL, RING_TAIL(ring->mmio_base), 0);
-       /* Ring buffer start address is not known until the buffer is pinned.
-        * It is written to the context image in execlists_update_context()
-        */
+       /* Ring buffer start address is not known until the buffer is pinned. */
        ASSIGN_CTX_REG(reg_state, CTX_RING_BUFFER_START, RING_START(ring->mmio_base), 0);
        ASSIGN_CTX_REG(reg_state, CTX_RING_BUFFER_CONTROL, RING_CTL(ring->mmio_base),
                       ((ringbuf->size - PAGE_SIZE) & RING_NR_PAGES) | RING_VALID);
diff --git a/drivers/gpu/drm/i915/intel_lrc.h b/drivers/gpu/drm/i915/intel_lrc.h
index 33f82a84065a..37601a35d5fc 100644
--- a/drivers/gpu/drm/i915/intel_lrc.h
+++ b/drivers/gpu/drm/i915/intel_lrc.h
@@ -74,12 +74,9 @@ int intel_lr_context_deferred_alloc(struct intel_context *ctx,
 void intel_lr_context_unpin(struct drm_i915_gem_request *req);
 void intel_lr_context_reset(struct drm_device *dev,
                        struct intel_context *ctx);
-uint64_t intel_lr_context_descriptor(struct intel_context *ctx,
-                                    struct intel_engine_cs *ring);
 
 /* Execlists */
 int intel_sanitize_enable_execlists(struct drm_device *dev, int enable_execlists);
-u32 intel_execlists_ctx_id(struct drm_i915_gem_object *ctx_obj);
 
 bool intel_execlists_retire_requests(struct intel_engine_cs *ring);
 
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index edaf07b2292e..3d4d5711aea9 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -122,6 +122,9 @@ struct intel_ring {
         * we can detect new retirements.
         */
        u32 last_retired_head;
+
+       u32 context_descriptor;
+       u32 *registers;
 };
 
 struct intel_context;
@@ -293,9 +296,10 @@ struct intel_engine_cs {
        /* Execlists */
        struct task_struct *execlists_submit;
        spinlock_t execlist_lock;
+       struct drm_i915_gem_request *execlist_port[2];
        struct list_head execlist_queue;
-       struct list_head execlist_retired_req_list;
-       u8 next_context_status_buffer;
+       struct list_head execlist_completed;
+       u32 execlist_context_descriptor;
        u32             irq_keep_mask; /* bitmask for interrupts that should not be masked */
 
        /**
-- 
2.7.0.rc3
