Chris Wilson <ch...@chris-wilson.co.uk> writes:

> Emulate the HW to track and manage the ELSP queue. A set of SW ports are
> defined and requests are assigned to these ports before submitting them to
> HW. This makes cleaning up incomplete requests during reset recovery easier,
> especially after an engine reset, by decoupling ELSP queue management. This
> will become clearer in the next patch.
>
> In the engine reset case we want to resume where we left off after skipping
> the incomplete batch, which requires checking the ELSP queue, removing
> elements and fixing elsp_submitted counts in some cases. Instead of directly
> manipulating the ELSP queue from the reset path, we can examine these ports,
> fix up the ringbuffer pointers using the incomplete request and restart
> submissions again after the reset.
>
> Cc: Tvrtko Ursulin <tvrtko.ursu...@intel.com>
> Cc: Mika Kuoppala <mika.kuopp...@intel.com>
> Cc: Arun Siluvery <arun.siluv...@linux.intel.com>
> Signed-off-by: Chris Wilson <ch...@chris-wilson.co.uk>
> Link: http://patchwork.freedesktop.org/patch/msgid/1470414607-32453-3-git-send-email-arun.siluv...@linux.intel.com
> ---
>  drivers/gpu/drm/i915/i915_debugfs.c     |   2 +-
>  drivers/gpu/drm/i915/i915_gem.c         |   6 +-
>  drivers/gpu/drm/i915/i915_gem_request.c |  11 +-
>  drivers/gpu/drm/i915/i915_gem_request.h |  24 +-
>  drivers/gpu/drm/i915/intel_lrc.c        | 417 +++++++++++---------------------
>  drivers/gpu/drm/i915/intel_lrc.h        |   2 -
>  drivers/gpu/drm/i915/intel_ringbuffer.h |   7 +-
>  7 files changed, 160 insertions(+), 309 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> index 9bd41581b592..f1f937a64c27 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -2200,7 +2200,7 @@ static int i915_execlists(struct seq_file *m, void *data)
>               status_pointer = I915_READ(RING_CONTEXT_STATUS_PTR(engine));
>               seq_printf(m, "\tStatus pointer: 0x%08X\n", status_pointer);
>  
> -             read_pointer = engine->next_context_status_buffer;
> +             read_pointer = GEN8_CSB_READ_PTR(status_pointer);
>               write_pointer = GEN8_CSB_WRITE_PTR(status_pointer);
>               if (read_pointer > write_pointer)
>                       write_pointer += GEN8_CSB_ENTRIES;
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index f4f8eaa90f2a..19715f5e698f 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -2443,7 +2443,11 @@ static void i915_gem_reset_engine_cleanup(struct intel_engine_cs *engine)
>               /* Ensure irq handler finishes or is cancelled. */
>               tasklet_kill(&engine->irq_tasklet);
>  
> -             intel_execlists_cancel_requests(engine);
> +             INIT_LIST_HEAD(&engine->execlist_queue);
> +             i915_gem_request_assign(&engine->execlist_port[0].request,
> +                                     NULL);
> +             i915_gem_request_assign(&engine->execlist_port[1].request,
> +                                     NULL);

How about engine->reset() which would contain these?
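
Something along these lines perhaps (untested sketch; the reset vfunc
and its hookup are only a suggestion, nothing like it exists yet):

        static void execlists_reset(struct intel_engine_cs *engine)
        {
                /* Throw away the SW queue and both ELSP ports */
                INIT_LIST_HEAD(&engine->execlist_queue);
                i915_gem_request_assign(&engine->execlist_port[0].request, NULL);
                i915_gem_request_assign(&engine->execlist_port[1].request, NULL);
        }

and i915_gem_reset_engine_cleanup() would then only need to do

        if (engine->reset)
                engine->reset(engine);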

>       }
>  
>       /*
> diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
> index 6c2553715263..49cca4233a8e 100644
> --- a/drivers/gpu/drm/i915/i915_gem_request.c
> +++ b/drivers/gpu/drm/i915/i915_gem_request.c
> @@ -736,16 +736,18 @@ complete:
>       return ret;
>  }
>  
> -static void engine_retire_requests(struct intel_engine_cs *engine)
> +static bool engine_retire_requests(struct intel_engine_cs *engine)
>  {
>       struct drm_i915_gem_request *request, *next;
>  
>       list_for_each_entry_safe(request, next, &engine->request_list, link) {
>               if (!i915_gem_request_completed(request))
> -                     break;
> +                     return false;
>  
>               i915_gem_request_retire(request);
>       }
> +
> +     return true;
>  }
>  
>  void i915_gem_retire_requests(struct drm_i915_private *dev_priv)
> @@ -759,9 +761,8 @@ void i915_gem_retire_requests(struct drm_i915_private *dev_priv)
>  
>       GEM_BUG_ON(!dev_priv->gt.awake);
>  
> -     for_each_engine(engine, dev_priv) {
> -             engine_retire_requests(engine);
> -             if (!intel_engine_is_active(engine))
> +     for_each_engine_masked(engine, dev_priv, dev_priv->gt.active_engines) {
> +             if (engine_retire_requests(engine))
>                       dev_priv->gt.active_engines &= ~intel_engine_flag(engine);
>       }
>

These were already split out to another patch. Good.

> diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h
> index 3496e28785e7..32123e38ef4b 100644
> --- a/drivers/gpu/drm/i915/i915_gem_request.h
> +++ b/drivers/gpu/drm/i915/i915_gem_request.h
> @@ -94,6 +94,9 @@ struct drm_i915_gem_request {
>       /** Position in the ringbuffer of the end of the whole request */
>       u32 tail;
>  
> +     /** Position in the ringbuffer after the end of the whole request */
> +     u32 wa_tail;
> +
>       /** Preallocate space in the ringbuffer for the emitting the request */
>       u32 reserved_space;
>  
> @@ -130,27 +133,8 @@ struct drm_i915_gem_request {
>       /** process identifier submitting this request */
>       struct pid *pid;
>  
> -     /**
> -      * The ELSP only accepts two elements at a time, so we queue
> -      * context/tail pairs on a given queue (ring->execlist_queue) until the
> -      * hardware is available. The queue serves a double purpose: we also use
> -      * it to keep track of the up to 2 contexts currently in the hardware
> -      * (usually one in execution and the other queued up by the GPU): We
> -      * only remove elements from the head of the queue when the hardware
> -      * informs us that an element has been completed.
> -      *
> -      * All accesses to the queue are mediated by a spinlock
> -      * (ring->execlist_lock).
> -      */
> -
> -     /** Execlist link in the submission queue.*/
> +     /** Link in the execlist submission queue, guarded by execlist_lock. */
>       struct list_head execlist_link;
> -
> -     /** Execlists no. of times this request has been sent to the ELSP */
> -     int elsp_submitted;
> -
> -     /** Execlists context hardware id. */
> -     unsigned int ctx_hw_id;
>  };
>  
>  extern const struct fence_ops i915_fence_ops;
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 309c5d9b1c57..56b5aa8a35c4 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -156,6 +156,11 @@
>  #define GEN8_CTX_STATUS_COMPLETE     (1 << 4)
>  #define GEN8_CTX_STATUS_LITE_RESTORE (1 << 15)
>  
> +#define GEN8_CTX_STATUS_COMPLETED_MASK \
> +      (GEN8_CTX_STATUS_ACTIVE_IDLE | \
> +       GEN8_CTX_STATUS_PREEMPTED | \
> +       GEN8_CTX_STATUS_ELEMENT_SWITCH)
> +
>  #define CTX_LRI_HEADER_0             0x01
>  #define CTX_CONTEXT_CONTROL          0x02
>  #define CTX_RING_HEAD                        0x04
> @@ -263,12 +268,10 @@ logical_ring_init_platform_invariants(struct intel_engine_cs *engine)
>  {
>       struct drm_i915_private *dev_priv = engine->i915;
>  
> -     if (IS_GEN8(dev_priv) || IS_GEN9(dev_priv))
> -             engine->idle_lite_restore_wa = ~0;
> -
> -     engine->disable_lite_restore_wa = (IS_SKL_REVID(dev_priv, 0, SKL_REVID_B0) ||
> -                                     IS_BXT_REVID(dev_priv, 0, BXT_REVID_A1)) &&
> -                                     (engine->id == VCS || engine->id == VCS2);
> +     engine->disable_lite_restore_wa =
> +             (IS_SKL_REVID(dev_priv, 0, SKL_REVID_B0) ||
> +              IS_BXT_REVID(dev_priv, 0, BXT_REVID_A1)) &&
> +             (engine->id == VCS || engine->id == VCS2);

Candidate for another cleanup patch afterwards, as both revids smell obsolete.

>  
>       engine->ctx_desc_template = GEN8_CTX_VALID;
>       if (IS_GEN8(dev_priv))
> @@ -328,36 +331,6 @@ uint64_t intel_lr_context_descriptor(struct i915_gem_context *ctx,
>       return ctx->engine[engine->id].lrc_desc;
>  }
>  
> -static void execlists_elsp_write(struct drm_i915_gem_request *rq0,
> -                              struct drm_i915_gem_request *rq1)
> -{
> -
> -     struct intel_engine_cs *engine = rq0->engine;
> -     struct drm_i915_private *dev_priv = rq0->i915;
> -     uint64_t desc[2];
> -
> -     if (rq1) {
> -             desc[1] = intel_lr_context_descriptor(rq1->ctx, rq1->engine);
> -             rq1->elsp_submitted++;
> -     } else {
> -             desc[1] = 0;
> -     }
> -
> -     desc[0] = intel_lr_context_descriptor(rq0->ctx, rq0->engine);
> -     rq0->elsp_submitted++;
> -
> -     /* You must always write both descriptors in the order below. */
> -     I915_WRITE_FW(RING_ELSP(engine), upper_32_bits(desc[1]));
> -     I915_WRITE_FW(RING_ELSP(engine), lower_32_bits(desc[1]));
> -
> -     I915_WRITE_FW(RING_ELSP(engine), upper_32_bits(desc[0]));
> -     /* The context is automatically loaded after the following */
> -     I915_WRITE_FW(RING_ELSP(engine), lower_32_bits(desc[0]));
> -
> -     /* ELSP is a wo register, use another nearby reg for posting */
> -     POSTING_READ_FW(RING_EXECLIST_STATUS_LO(engine));
> -}
> -
>  static void
>  execlists_update_context_pdps(struct i915_hw_ppgtt *ppgtt, u32 *reg_state)
>  {
> @@ -367,13 +340,12 @@ execlists_update_context_pdps(struct i915_hw_ppgtt *ppgtt, u32 *reg_state)
>       ASSIGN_CTX_PDP(ppgtt, reg_state, 0);
>  }
>  
> -static void execlists_update_context(struct drm_i915_gem_request *rq)
> +static u64 execlists_update_context(struct drm_i915_gem_request *rq)
>  {
> -     struct intel_engine_cs *engine = rq->engine;
> +     struct intel_context *ce = &rq->ctx->engine[rq->engine->id];
>       struct i915_hw_ppgtt *ppgtt = rq->ctx->ppgtt;
> -     uint32_t *reg_state = rq->ctx->engine[engine->id].lrc_reg_state;
>  
> -     reg_state[CTX_RING_TAIL+1] = intel_ring_offset(rq->ring, rq->tail);
> +     ce->lrc_reg_state[CTX_RING_TAIL+1] = intel_ring_offset(rq->ring, rq->tail);
>  
>       /* True 32b PPGTT with dynamic page allocation: update PDP
>        * registers and point the unallocated PDPs to scratch page.
> @@ -381,32 +353,14 @@ static void execlists_update_context(struct drm_i915_gem_request *rq)
>        * in 48-bit mode.
>        */
>       if (ppgtt && !USES_FULL_48BIT_PPGTT(ppgtt->base.dev))
> -             execlists_update_context_pdps(ppgtt, reg_state);
> -}
> -
> -static void execlists_elsp_submit_contexts(struct drm_i915_gem_request *rq0,
> -                                        struct drm_i915_gem_request *rq1)
> -{
> -     struct drm_i915_private *dev_priv = rq0->i915;
> -     unsigned int fw_domains = rq0->engine->fw_domains;
> -
> -     execlists_update_context(rq0);
> -
> -     if (rq1)
> -             execlists_update_context(rq1);
> +             execlists_update_context_pdps(ppgtt, ce->lrc_reg_state);
>  
> -     spin_lock_irq(&dev_priv->uncore.lock);
> -     intel_uncore_forcewake_get__locked(dev_priv, fw_domains);
> -
> -     execlists_elsp_write(rq0, rq1);
> -
> -     intel_uncore_forcewake_put__locked(dev_priv, fw_domains);
> -     spin_unlock_irq(&dev_priv->uncore.lock);
> +     return ce->lrc_desc;
>  }
>  
> -static inline void execlists_context_status_change(
> -             struct drm_i915_gem_request *rq,
> -             unsigned long status)
> +static inline void
> +execlists_context_status_change(struct drm_i915_gem_request *rq,
> +                             unsigned long status)
>  {
>       /*
>        * Only used when GVT-g is enabled now. When GVT-g is disabled,
> @@ -418,122 +372,104 @@ static inline void execlists_context_status_change(
>       atomic_notifier_call_chain(&rq->ctx->status_notifier, status, rq);
>  }
>  
> -static void execlists_unqueue(struct intel_engine_cs *engine)
> +static void execlists_submit_ports(struct intel_engine_cs *engine)
>  {
> -     struct drm_i915_gem_request *req0 = NULL, *req1 = NULL;
> -     struct drm_i915_gem_request *cursor, *tmp;
> -
> -     assert_spin_locked(&engine->execlist_lock);
> -
> -     /*
> -      * If irqs are not active generate a warning as batches that finish
> -      * without the irqs may get lost and a GPU Hang may occur.
> -      */
> -     WARN_ON(!intel_irqs_enabled(engine->i915));
> -
> -     /* Try to read in pairs */
> -     list_for_each_entry_safe(cursor, tmp, &engine->execlist_queue,
> -                              execlist_link) {
> -             if (!req0) {
> -                     req0 = cursor;
> -             } else if (req0->ctx == cursor->ctx) {
> -                     /* Same ctx: ignore first request, as second request
> -                      * will update tail past first request's workload */
> -                     cursor->elsp_submitted = req0->elsp_submitted;
> -                     list_del(&req0->execlist_link);
> -                     i915_gem_request_put(req0);
> -                     req0 = cursor;
> -             } else {
> -                     if (IS_ENABLED(CONFIG_DRM_I915_GVT)) {
> -                             /*
> -                              * req0 (after merged) ctx requires single
> -                              * submission, stop picking
> -                              */
> -                             if (req0->ctx->execlists_force_single_submission)
> -                                     break;
> -                             /*
> -                              * req0 ctx doesn't require single submission,
> -                              * but next req ctx requires, stop picking
> -                              */
> -                             if (cursor->ctx->execlists_force_single_submission)
> -                                     break;
> -                     }
> -                     req1 = cursor;
> -                     WARN_ON(req1->elsp_submitted);
> -                     break;
> -             }
> -     }
> -
> -     if (unlikely(!req0))
> -             return;
> -
> -     execlists_context_status_change(req0, INTEL_CONTEXT_SCHEDULE_IN);
> +     struct drm_i915_private *dev_priv = engine->i915;
> +     u32 *elsp = dev_priv->regs + i915_mmio_reg_offset(RING_ELSP(engine));
> +     u64 desc[2];
>  
> -     if (req1)
> -             execlists_context_status_change(req1,
> +     if (!engine->execlist_port[0].count)
> +             execlists_context_status_change(engine->execlist_port[0].request,
>                                               INTEL_CONTEXT_SCHEDULE_IN);
> +     desc[0] = execlists_update_context(engine->execlist_port[0].request);
> +     engine->preempt_wa = engine->execlist_port[0].count++;
>  
> -     if (req0->elsp_submitted & engine->idle_lite_restore_wa) {
> -             /*
> -              * WaIdleLiteRestore: make sure we never cause a lite restore
> -              * with HEAD==TAIL.
> -              *
> -              * Apply the wa NOOPS to prevent ring:HEAD == req:TAIL as we
> -              * resubmit the request. See gen8_emit_request() for where we
> -              * prepare the padding after the end of the request.
> -              */
> -             req0->tail += 8;
> -             req0->tail &= req0->ring->size - 1;
> +     if (engine->execlist_port[1].request) {
> +             GEM_BUG_ON(engine->execlist_port[1].count);
> +             execlists_context_status_change(engine->execlist_port[1].request,
> +                                             INTEL_CONTEXT_SCHEDULE_IN);
> +             desc[1] = execlists_update_context(engine->execlist_port[1].request);
> +             engine->execlist_port[1].count = 1;
> +     } else {
> +             desc[1] = 0;
>       }
>  
> -     execlists_elsp_submit_contexts(req0, req1);
> +     /* You must always write both descriptors in the order below. */
> +     writel(upper_32_bits(desc[1]), elsp);
> +     writel(lower_32_bits(desc[1]), elsp);
> +
> +     writel(upper_32_bits(desc[0]), elsp);
> +     /* The context is automatically loaded after the following */
> +     writel(lower_32_bits(desc[0]), elsp);
>  }
>  
> -static unsigned int
> -execlists_check_remove_request(struct intel_engine_cs *engine, u32 ctx_id)
> +static bool merge_ctx(struct i915_gem_context *prev,
> +                   struct i915_gem_context *next)
>  {
> -     struct drm_i915_gem_request *head_req;
> +     if (prev != next)
> +             return false;
>  
> -     assert_spin_locked(&engine->execlist_lock);
> +     if (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
> +         prev->execlists_force_single_submission)
> +             return false;
>  
> -     head_req = list_first_entry_or_null(&engine->execlist_queue,
> -                                         struct drm_i915_gem_request,
> -                                         execlist_link);
> -
> -     if (WARN_ON(!head_req || (head_req->ctx_hw_id != ctx_id)))
> -               return 0;
> +     return true;
> +}
>  
> -     WARN(head_req->elsp_submitted == 0, "Never submitted head request\n");
> +static void execlists_context_unqueue(struct intel_engine_cs *engine)
> +{
> +     struct drm_i915_gem_request *cursor, *last;
> +     struct execlist_port *port = engine->execlist_port;
> +     bool submit = false;
> +
> +     last = port->request;
> +     if (last)
> +             /* WaIdleLiteRestore:bdw,skl
> +              * Apply the wa NOOPs to prevent ring:HEAD == req:TAIL
> +              * as we resubmit the request. See gen8_emit_request()
> +              * for where we prepare the padding after the end of the
> +              * request.
> +              */
> +             last->tail = last->wa_tail;
>  
> -     if (--head_req->elsp_submitted > 0)
> -             return 0;
> +     /* Try to read in pairs and fill both submission ports */
> +     spin_lock(&engine->execlist_lock);
> +     list_for_each_entry(cursor, &engine->execlist_queue, execlist_link) {
> +             if (last && !merge_ctx(cursor->ctx, last->ctx)) {
> +                     if (port != engine->execlist_port)
> +                             break;

Here you will merge ctx also into the second port?

The previous version of the code was careful to only merge into
the first port/request.
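
If the merging is meant to stay limited to the first port, maybe
something like this in the fill loop (untested, and the GVT
single-submission checks are left out):

        list_for_each_entry(cursor, &engine->execlist_queue, execlist_link) {
                /* Once port[0] is assigned, stop at the next request so
                 * that port[1] never has anything merged into it. */
                if (last && (port != engine->execlist_port ||
                             !merge_ctx(cursor->ctx, last->ctx))) {
                        if (port != engine->execlist_port)
                                break;

                        i915_gem_request_assign(&port->request, last);
                        port++;
                }
                last = cursor;
                submit = true;
        }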

>  
> -     execlists_context_status_change(head_req, INTEL_CONTEXT_SCHEDULE_OUT);
> +                     if (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
> +                         cursor->ctx->execlists_force_single_submission)
> +                             break;
>  
> -     list_del(&head_req->execlist_link);
> -     i915_gem_request_put(head_req);
> +                     i915_gem_request_assign(&port->request, last);
> +                     port++;
> +             }
> +             last = cursor;
> +             submit = true;
> +     }
> +     if (submit) {
> +             i915_gem_request_assign(&port->request, last);
> +             engine->execlist_queue.next = &cursor->execlist_link;
> +             cursor->execlist_link.prev = &engine->execlist_queue;

So we keep the cursor pointing to the request that will end up in
port[0].request?

Would a while (!list_empty(&engine->execlist_queue)) loop, removing
each request from the list as it is consumed, have made the fill loop
more complex?
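
For reference, the variant I had in mind (untested, GVT check and
WaIdleLiteRestore handling omitted):

        while (!list_empty(&engine->execlist_queue)) {
                cursor = list_first_entry(&engine->execlist_queue,
                                          struct drm_i915_gem_request,
                                          execlist_link);

                if (last && !merge_ctx(cursor->ctx, last->ctx)) {
                        /* Both ports filled: leave the rest queued */
                        if (port != engine->execlist_port)
                                break;

                        i915_gem_request_assign(&port->request, last);
                        port++;
                }

                list_del(&cursor->execlist_link);
                last = cursor;
                submit = true;
        }
        if (submit)
                i915_gem_request_assign(&port->request, last);

It trades the manual list splicing for a list_del() per request.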

> +     }
> +     spin_unlock(&engine->execlist_lock);
>  
> -     return 1;
> +     if (submit)
> +             execlists_submit_ports(engine);
>  }
>  
> -static u32
> -get_context_status(struct intel_engine_cs *engine, unsigned int read_pointer,
> -                u32 *context_id)
> +static bool execlists_elsp_idle(struct intel_engine_cs *engine)
>  {
> -     struct drm_i915_private *dev_priv = engine->i915;
> -     u32 status;
> -
> -     read_pointer %= GEN8_CSB_ENTRIES;
> -
> -     status = I915_READ_FW(RING_CONTEXT_STATUS_BUF_LO(engine, read_pointer));
> -
> -     if (status & GEN8_CTX_STATUS_IDLE_ACTIVE)
> -             return 0;
> +     return !engine->execlist_port[0].request;
> +}
>  
> -     *context_id = I915_READ_FW(RING_CONTEXT_STATUS_BUF_HI(engine,
> -                                                           read_pointer));
> +static bool execlists_elsp_ready(struct intel_engine_cs *engine)
> +{
> +     int idx = !engine->disable_lite_restore_wa && !engine->preempt_wa;
>  
> -     return status;
> +     return !engine->execlist_port[idx].request;
>  }
>  
>  /*
> @@ -544,100 +480,63 @@ static void intel_lrc_irq_handler(unsigned long data)
>  {
>       struct intel_engine_cs *engine = (struct intel_engine_cs *)data;
>       struct drm_i915_private *dev_priv = engine->i915;
> -     u32 status_pointer;
> -     unsigned int read_pointer, write_pointer;
> -     u32 csb[GEN8_CSB_ENTRIES][2];
> -     unsigned int csb_read = 0, i;
> -     unsigned int submit_contexts = 0;
>  
>       intel_uncore_forcewake_get(dev_priv, engine->fw_domains);
>  
> -     status_pointer = I915_READ_FW(RING_CONTEXT_STATUS_PTR(engine));
> -
> -     read_pointer = engine->next_context_status_buffer;
> -     write_pointer = GEN8_CSB_WRITE_PTR(status_pointer);
> -     if (read_pointer > write_pointer)
> -             write_pointer += GEN8_CSB_ENTRIES;
> -
> -     while (read_pointer < write_pointer) {
> -             if (WARN_ON_ONCE(csb_read == GEN8_CSB_ENTRIES))
> -                     break;
> -             csb[csb_read][0] = get_context_status(engine, ++read_pointer,
> -                                                   &csb[csb_read][1]);
> -             csb_read++;
> -     }
> -
> -     engine->next_context_status_buffer = write_pointer % GEN8_CSB_ENTRIES;
> -
> -     /* Update the read pointer to the old write pointer. Manual ringbuffer
> -      * management ftw </sarcasm> */
> -     I915_WRITE_FW(RING_CONTEXT_STATUS_PTR(engine),
> -                   _MASKED_FIELD(GEN8_CSB_READ_PTR_MASK,
> -                                 engine->next_context_status_buffer << 8));
> -
> -     intel_uncore_forcewake_put(dev_priv, engine->fw_domains);
> -
> -     spin_lock(&engine->execlist_lock);
> -
> -     for (i = 0; i < csb_read; i++) {
> -             if (unlikely(csb[i][0] & GEN8_CTX_STATUS_PREEMPTED)) {
> -                     if (csb[i][0] & GEN8_CTX_STATUS_LITE_RESTORE) {
> -                             if (execlists_check_remove_request(engine, csb[i][1]))
> -                                     WARN(1, "Lite Restored request removed from queue\n");
> -                     } else
> -                             WARN(1, "Preemption without Lite Restore\n");
> +     if (!execlists_elsp_idle(engine)) {
> +             u32 *ring_mmio =
> +                     dev_priv->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine));
> +             u32 *csb_mmio =
> +                     dev_priv->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_BUF_LO(engine, 0));
> +             unsigned ring, head, tail;
> +
> +             ring = readl(ring_mmio);
> +             head = GEN8_CSB_READ_PTR(ring);
> +             tail = GEN8_CSB_WRITE_PTR(ring);
> +             if (tail < head)
> +                     tail += GEN8_CSB_ENTRIES;
> +
> +             while (head < tail) {
> +                     unsigned idx = ++head % GEN8_CSB_ENTRIES;
> +                     unsigned status = readl(&csb_mmio[2*idx]);
> +
> +                     if (status & GEN8_CTX_STATUS_COMPLETED_MASK) {
> +                             GEM_BUG_ON(engine->execlist_port[0].count == 0);

Would this be the safety valve if we go out of sync wrt submissions vs the CSB head?

> +                             if (--engine->execlist_port[0].count == 0) {
> +                                     GEM_BUG_ON(status & GEN8_CTX_STATUS_PREEMPTED);
> +                                     execlists_context_status_change(engine->execlist_port[0].request,
> +                                                                     INTEL_CONTEXT_SCHEDULE_OUT);
> +                                     i915_gem_request_put(engine->execlist_port[0].request);
> +                                     engine->execlist_port[0] = engine->execlist_port[1];

The intention here is to always retire through port zero, so that the
refcounts match the submissions?


> +                                     memset(&engine->execlist_port[1], 0,
> +                                            sizeof(engine->execlist_port[1]));
> +                                     engine->preempt_wa = false;
> +                             }
> +                     }
> +                     GEM_BUG_ON(engine->execlist_port[0].count == 0 &&
> +                                !(status & GEN8_CTX_STATUS_ACTIVE_IDLE));
>               }
>  
> -             if (csb[i][0] & (GEN8_CTX_STATUS_ACTIVE_IDLE |
> -                 GEN8_CTX_STATUS_ELEMENT_SWITCH))
> -                     submit_contexts +=
> -                             execlists_check_remove_request(engine, csb[i][1]);
> +             writel(_MASKED_FIELD(GEN8_CSB_READ_PTR_MASK, GEN8_CSB_WRITE_PTR(ring) << 8),
> +                    ring_mmio);
>       }
>  
> -     if (submit_contexts) {
> -             if (!engine->disable_lite_restore_wa ||
> -                 (csb[i][0] & GEN8_CTX_STATUS_ACTIVE_IDLE))
> -                     execlists_unqueue(engine);
> -     }
> +     if (execlists_elsp_ready(engine))
> +             execlists_context_unqueue(engine);
>  
> -     spin_unlock(&engine->execlist_lock);
> -
> -     if (unlikely(submit_contexts > 2))
> -             DRM_ERROR("More than two context complete events?\n");
> +     intel_uncore_forcewake_put(dev_priv, engine->fw_domains);
>  }
>  
>  static void execlists_submit_request(struct drm_i915_gem_request *request)
>  {
>       struct intel_engine_cs *engine = request->engine;
> -     struct drm_i915_gem_request *cursor;
> -     int num_elements = 0;
>  
>       spin_lock_bh(&engine->execlist_lock);
>  
> -     list_for_each_entry(cursor, &engine->execlist_queue, execlist_link)
> -             if (++num_elements > 2)
> -                     break;
> -
> -     if (num_elements > 2) {
> -             struct drm_i915_gem_request *tail_req;
> -
> -             tail_req = list_last_entry(&engine->execlist_queue,
> -                                        struct drm_i915_gem_request,
> -                                        execlist_link);
> -
> -             if (request->ctx == tail_req->ctx) {
> -                     WARN(tail_req->elsp_submitted != 0,
> -                             "More than 2 already-submitted reqs queued\n");
> -                     list_del(&tail_req->execlist_link);
> -                     i915_gem_request_put(tail_req);
> -             }
> -     }
> -
> -     i915_gem_request_get(request);
>       list_add_tail(&request->execlist_link, &engine->execlist_queue);
> -     request->ctx_hw_id = request->ctx->hw_id;
> -     if (num_elements == 0)
> -             execlists_unqueue(engine);
> +
> +     if (execlists_elsp_idle(engine))
> +             tasklet_hi_schedule(&engine->irq_tasklet);
>  
>       spin_unlock_bh(&engine->execlist_lock);
>  }
> @@ -731,6 +630,7 @@ intel_logical_ring_advance(struct drm_i915_gem_request *request)
>       intel_ring_emit(ring, MI_NOOP);
>       intel_ring_emit(ring, MI_NOOP);
>       intel_ring_advance(ring);
> +     request->wa_tail = request->ring->tail;
>  
>       /* We keep the previous context alive until we retire the following
>        * request. This ensures that any the context object is still pinned
> @@ -743,23 +643,6 @@ intel_logical_ring_advance(struct drm_i915_gem_request *request)
>       return 0;
>  }
>  
> -void intel_execlists_cancel_requests(struct intel_engine_cs *engine)
> -{
> -     struct drm_i915_gem_request *req, *tmp;
> -     LIST_HEAD(cancel_list);
> -
> -     WARN_ON(!mutex_is_locked(&engine->i915->drm.struct_mutex));
> -
> -     spin_lock_bh(&engine->execlist_lock);
> -     list_replace_init(&engine->execlist_queue, &cancel_list);
> -     spin_unlock_bh(&engine->execlist_lock);
> -
> -     list_for_each_entry_safe(req, tmp, &cancel_list, execlist_link) {
> -             list_del(&req->execlist_link);
> -             i915_gem_request_put(req);
> -     }
> -}
> -
>  static int intel_lr_context_pin(struct i915_gem_context *ctx,
>                               struct intel_engine_cs *engine)
>  {
> @@ -1285,7 +1168,6 @@ static void lrc_init_hws(struct intel_engine_cs *engine)
>  static int gen8_init_common_ring(struct intel_engine_cs *engine)
>  {
>       struct drm_i915_private *dev_priv = engine->i915;
> -     unsigned int next_context_status_buffer_hw;
>  
>       lrc_init_hws(engine);
>  
> @@ -1296,32 +1178,12 @@ static int gen8_init_common_ring(struct intel_engine_cs *engine)
>       I915_WRITE(RING_MODE_GEN7(engine),
>                  _MASKED_BIT_DISABLE(GFX_REPLAY_MODE) |
>                  _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE));
> -     POSTING_READ(RING_MODE_GEN7(engine));
>  
> -     /*
> -      * Instead of resetting the Context Status Buffer (CSB) read pointer to
> -      * zero, we need to read the write pointer from hardware and use its
> -      * value because "this register is power context save restored".
> -      * Effectively, these states have been observed:
> -      *
> -      *      | Suspend-to-idle (freeze) | Suspend-to-RAM (mem) |
> -      * BDW  | CSB regs not reset       | CSB regs reset       |
> -      * CHT  | CSB regs not reset       | CSB regs not reset   |
> -      * SKL  |         ?                |         ?            |
> -      * BXT  |         ?                |         ?            |
> -      */
> -     next_context_status_buffer_hw =
> -             GEN8_CSB_WRITE_PTR(I915_READ(RING_CONTEXT_STATUS_PTR(engine)));
> -
> -     /*
> -      * When the CSB registers are reset (also after power-up / gpu reset),
> -      * CSB write pointer is set to all 1's, which is not valid, use '5' in
> -      * this special case, so the first element read is CSB[0].
> -      */
> -     if (next_context_status_buffer_hw == GEN8_CSB_PTR_MASK)
> -             next_context_status_buffer_hw = (GEN8_CSB_ENTRIES - 1);
> +     I915_WRITE(RING_CONTEXT_STATUS_PTR(engine),
> +                _MASKED_FIELD(GEN8_CSB_READ_PTR_MASK |
> +                              GEN8_CSB_WRITE_PTR_MASK,
> +                              0));
>  
> -     engine->next_context_status_buffer = next_context_status_buffer_hw;
>       DRM_DEBUG_DRIVER("Execlists enabled for %s\n", engine->name);
>  
>       intel_engine_init_hangcheck(engine);
> @@ -1706,7 +1568,6 @@ void intel_logical_ring_cleanup(struct intel_engine_cs *engine)
>       }
>       intel_lr_context_unpin(dev_priv->kernel_context, engine);
>  
> -     engine->idle_lite_restore_wa = 0;
>       engine->disable_lite_restore_wa = false;
>       engine->ctx_desc_template = 0;
>  
> diff --git a/drivers/gpu/drm/i915/intel_lrc.h b/drivers/gpu/drm/i915/intel_lrc.h
> index a52cf57dbd40..4d70346500c2 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.h
> +++ b/drivers/gpu/drm/i915/intel_lrc.h
> @@ -97,6 +97,4 @@ int intel_sanitize_enable_execlists(struct drm_i915_private *dev_priv,
>                                   int enable_execlists);
>  void intel_execlists_enable_submission(struct drm_i915_private *dev_priv);
>  
> -void intel_execlists_cancel_requests(struct intel_engine_cs *engine);
> -
>  #endif /* _INTEL_LRC_H_ */
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index 43e545e44352..ed8a8482a7fb 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -288,11 +288,14 @@ struct intel_engine_cs {
>       /* Execlists */
>       struct tasklet_struct irq_tasklet;
>       spinlock_t execlist_lock; /* used inside tasklet, use spin_lock_bh */
> +     struct execlist_port {
> +             struct drm_i915_gem_request *request;
> +             unsigned int count;
> +     } execlist_port[2];
>       struct list_head execlist_queue;
>       unsigned int fw_domains;
> -     unsigned int next_context_status_buffer;
> -     unsigned int idle_lite_restore_wa;
>       bool disable_lite_restore_wa;
> +     bool preempt_wa;

This needs to be reset also in logical_ring_cleanup().
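
i.e. next to the existing disable_lite_restore_wa reset, something like:

        engine->disable_lite_restore_wa = false;
        engine->preempt_wa = false;
        engine->ctx_desc_template = 0;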

Overall this patch really makes the submission/retirement path much more
compact and simpler. The exception is the unqueuing part, which is hard
to follow due to the merging logic.

Some bikesheds I made during review can be found in:
https://cgit.freedesktop.org/~miku/drm-intel/commit/?h=review&id=3c806a9f6665db3a08f949224560bb039af1be1a

-Mika

>       u32 ctx_desc_template;
>  
>       /**
> -- 
> 2.8.1
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
