Sometimes we need to boost the priority of an in-flight request, which
may lead to the situation where the second submission port then contains
a higher priority context than the first, and so we need to inject a
preemption event. To do so we must always check inside
execlists_dequeue() whether there is a priority inversion between the
ports themselves as well as against the head of the priority-sorted
queue; we can no longer simply skip the dequeue when the queue is empty.

Signed-off-by: Chris Wilson <ch...@chris-wilson.co.uk>
Cc: Michał Winiarski <michal.winiar...@intel.com>
Cc: Michel Thierry <michel.thie...@intel.com>
Cc: Mika Kuoppala <mika.kuopp...@linux.intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursu...@intel.com>
---
Rebase for conflicts
-Chris
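
For illustration only, a minimal user-space sketch of the check this patch
centralises; queue_priority and need_preempt() mirror the patch, but the
surrounding types are simplified stand-ins, not the driver's:

#include <limits.h>
#include <stdbool.h>
#include <stdio.h>

/* Illustrative stand-ins only -- not the driver's structures. */
struct request {
	int priority;
};

struct execlists {
	struct request *port0;	/* request currently in ELSP[0], if any */
	int queue_priority;	/* highest pending priority, INT_MIN when the queue is empty */
};

/*
 * Mirrors the patch's need_preempt(): preempt when the best pending
 * priority beats both the in-flight request and the default (0) floor.
 * (The real check also requires engine->i915->preempt_context.)
 */
static bool need_preempt(const struct execlists *el)
{
	int last = el->port0 ? el->port0->priority : 0;

	return el->queue_priority > (last > 0 ? last : 0);
}

int main(void)
{
	struct request in_flight = { .priority = 0 };
	struct execlists el = { .port0 = &in_flight, .queue_priority = INT_MIN };

	printf("empty queue:   preempt=%d\n", need_preempt(&el));	/* 0 */

	el.queue_priority = 1024;	/* a pending request was boosted above port[0] */
	printf("boosted queue: preempt=%d\n", need_preempt(&el));	/* 1 */

	return 0;
}

The same cached queue_priority is what lets submit_queue() below avoid
kicking the tasklet when a newly queued request cannot change the current
ELSP ordering.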
---
 drivers/gpu/drm/i915/intel_engine_cs.c      |   2 +
 drivers/gpu/drm/i915/intel_guc_submission.c |  17 +--
 drivers/gpu/drm/i915/intel_lrc.c            | 161 ++++++++++++++++------------
 drivers/gpu/drm/i915/intel_ringbuffer.h     |   5 +
 4 files changed, 107 insertions(+), 78 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
index c31544406974..ce7fcf55ba18 100644
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -423,6 +423,7 @@ static void intel_engine_init_execlist(struct intel_engine_cs *engine)
        BUILD_BUG_ON_NOT_POWER_OF_2(execlists_num_ports(execlists));
        GEM_BUG_ON(execlists_num_ports(execlists) > EXECLIST_MAX_PORTS);
 
+       execlists->queue_priority = INT_MIN;
        execlists->queue = RB_ROOT;
        execlists->first = NULL;
 }
@@ -1903,6 +1904,7 @@ void intel_engine_dump(struct intel_engine_cs *engine,
        spin_lock_irq(&engine->timeline->lock);
        list_for_each_entry(rq, &engine->timeline->requests, link)
                print_request(m, rq, "\t\tE ");
+       drm_printf(m, "\t\tQueue priority: %d\n", execlists->queue_priority);
        for (rb = execlists->first; rb; rb = rb_next(rb)) {
                struct i915_priolist *p =
                        rb_entry(rb, typeof(*p), node);
diff --git a/drivers/gpu/drm/i915/intel_guc_submission.c b/drivers/gpu/drm/i915/intel_guc_submission.c
index 649113c7a3c2..586dde579903 100644
--- a/drivers/gpu/drm/i915/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/intel_guc_submission.c
@@ -75,6 +75,11 @@
  *
  */
 
+static inline struct i915_priolist *to_priolist(struct rb_node *rb)
+{
+       return rb_entry(rb, struct i915_priolist, node);
+}
+
 static inline bool is_high_priority(struct intel_guc_client *client)
 {
        return (client->priority == GUC_CLIENT_PRIORITY_KMD_HIGH ||
@@ -682,15 +687,12 @@ static void guc_dequeue(struct intel_engine_cs *engine)
        rb = execlists->first;
        GEM_BUG_ON(rb_first(&execlists->queue) != rb);
 
-       if (!rb)
-               goto unlock;
-
        if (port_isset(port)) {
                if (engine->i915->preempt_context) {
                        struct guc_preempt_work *preempt_work =
                                &engine->i915->guc.preempt_work[engine->id];
 
-                       if (rb_entry(rb, struct i915_priolist, node)->priority >
+                       if (execlists->queue_priority >
                            max(port_request(port)->priotree.priority, 0)) {
                                execlists_set_active(execlists,
                                                     EXECLISTS_ACTIVE_PREEMPT);
@@ -706,8 +708,8 @@ static void guc_dequeue(struct intel_engine_cs *engine)
        }
        GEM_BUG_ON(port_isset(port));
 
-       do {
-               struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
+       while (rb) {
+               struct i915_priolist *p = to_priolist(rb);
                struct i915_request *rq, *rn;
 
                list_for_each_entry_safe(rq, rn, &p->requests, priotree.link) {
@@ -736,8 +738,9 @@ static void guc_dequeue(struct intel_engine_cs *engine)
                INIT_LIST_HEAD(&p->requests);
                if (p->priority != I915_PRIORITY_NORMAL)
                        kmem_cache_free(engine->i915->priorities, p);
-       } while (rb);
+       }
 done:
+       execlists->queue_priority = rb ? to_priolist(rb)->priority : INT_MIN;
        execlists->first = rb;
        if (submit) {
                port_assign(port, last);
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 964885b5d7cb..4bc72fbaf793 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -169,6 +169,23 @@ static void execlists_init_reg_state(u32 *reg_state,
                                     struct intel_engine_cs *engine,
                                     struct intel_ring *ring);
 
+static inline struct i915_priolist *to_priolist(struct rb_node *rb)
+{
+       return rb_entry(rb, struct i915_priolist, node);
+}
+
+static inline int rq_prio(const struct i915_request *rq)
+{
+       return rq->priotree.priority;
+}
+
+static inline bool need_preempt(const struct intel_engine_cs *engine,
+                                const struct i915_request *last,
+                                int prio)
+{
+       return engine->i915->preempt_context && prio > max(rq_prio(last), 0);
+}
+
 /**
  * intel_lr_context_descriptor_update() - calculate & cache the descriptor
  *                                       descriptor for a pinned context
@@ -224,7 +241,7 @@ lookup_priolist(struct intel_engine_cs *engine,
        parent = &execlists->queue.rb_node;
        while (*parent) {
                rb = *parent;
-               p = rb_entry(rb, typeof(*p), node);
+               p = to_priolist(rb);
                if (prio > p->priority) {
                        parent = &rb->rb_left;
                } else if (prio < p->priority) {
@@ -264,7 +281,7 @@ lookup_priolist(struct intel_engine_cs *engine,
        if (first)
                execlists->first = &p->node;
 
-       return ptr_pack_bits(p, first, 1);
+       return p;
 }
 
 static void unwind_wa_tail(struct i915_request *rq)
@@ -290,14 +307,10 @@ static void __unwind_incomplete_requests(struct intel_engine_cs *engine)
                __i915_request_unsubmit(rq);
                unwind_wa_tail(rq);
 
-               GEM_BUG_ON(rq->priotree.priority == I915_PRIORITY_INVALID);
-               if (rq->priotree.priority != last_prio) {
-                       p = lookup_priolist(engine,
-                                           &rq->priotree,
-                                           rq->priotree.priority);
-                       p = ptr_mask_bits(p, 1);
-
-                       last_prio = rq->priotree.priority;
+               GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
+               if (rq_prio(rq) != last_prio) {
+                       last_prio = rq_prio(rq);
+                       p = lookup_priolist(engine, &rq->priotree, last_prio);
                }
 
                list_add(&rq->priotree.link, &p->requests);
@@ -397,10 +410,11 @@ static void execlists_submit_ports(struct intel_engine_cs *engine)
                        desc = execlists_update_context(rq);
                        GEM_DEBUG_EXEC(port[n].context_id = upper_32_bits(desc));
 
-                       GEM_TRACE("%s in[%d]:  ctx=%d.%d, seqno=%x\n",
+                       GEM_TRACE("%s in[%d]:  ctx=%d.%d, seqno=%x, prio=%d\n",
                                  engine->name, n,
                                  port[n].context_id, count,
-                                 rq->global_seqno);
+                                 rq->global_seqno,
+                                 rq_prio(rq));
                } else {
                        GEM_BUG_ON(!n);
                        desc = 0;
@@ -453,12 +467,17 @@ static void inject_preempt_context(struct intel_engine_cs *engine)
                   _MASKED_BIT_ENABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT |
                                      CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT));
 
+       /*
+        * Switch to our empty preempt context so
+        * the state of the GPU is known (idle).
+        */
        GEM_TRACE("%s\n", engine->name);
        for (n = execlists_num_ports(&engine->execlists); --n; )
                elsp_write(0, engine->execlists.elsp);
 
        elsp_write(ce->lrc_desc, engine->execlists.elsp);
        execlists_clear_active(&engine->execlists, EXECLISTS_ACTIVE_HWACK);
+       execlists_set_active(&engine->execlists, EXECLISTS_ACTIVE_PREEMPT);
 }
 
 static void execlists_dequeue(struct intel_engine_cs *engine)
@@ -495,8 +514,6 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
        spin_lock_irq(&engine->timeline->lock);
        rb = execlists->first;
        GEM_BUG_ON(rb_first(&execlists->queue) != rb);
-       if (!rb)
-               goto unlock;
 
        if (last) {
                /*
@@ -519,54 +536,48 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
                if (!execlists_is_active(execlists, EXECLISTS_ACTIVE_HWACK))
                        goto unlock;
 
-               if (engine->i915->preempt_context &&
-                   rb_entry(rb, struct i915_priolist, node)->priority >
-                   max(last->priotree.priority, 0)) {
-                       /*
-                        * Switch to our empty preempt context so
-                        * the state of the GPU is known (idle).
-                        */
+               if (need_preempt(engine, last, execlists->queue_priority)) {
                        inject_preempt_context(engine);
-                       execlists_set_active(execlists,
-                                            EXECLISTS_ACTIVE_PREEMPT);
                        goto unlock;
-               } else {
-                       /*
-                        * In theory, we could coalesce more requests onto
-                        * the second port (the first port is active, with
-                        * no preemptions pending). However, that means we
-                        * then have to deal with the possible lite-restore
-                        * of the second port (as we submit the ELSP, there
-                        * may be a context-switch) but also we may complete
-                        * the resubmission before the context-switch. Ergo,
-                        * coalescing onto the second port will cause a
-                        * preemption event, but we cannot predict whether
-                        * that will affect port[0] or port[1].
-                        *
-                        * If the second port is already active, we can wait
-                        * until the next context-switch before contemplating
-                        * new requests. The GPU will be busy and we should be
-                        * able to resubmit the new ELSP before it idles,
-                        * avoiding pipeline bubbles (momentary pauses where
-                        * the driver is unable to keep up the supply of new
-                        * work).
-                        */
-                       if (port_count(&port[1]))
-                               goto unlock;
-
-                       /* WaIdleLiteRestore:bdw,skl
-                        * Apply the wa NOOPs to prevent
-                        * ring:HEAD == rq:TAIL as we resubmit the
-                        * request. See gen8_emit_breadcrumb() for
-                        * where we prepare the padding after the
-                        * end of the request.
-                        */
-                       last->tail = last->wa_tail;
                }
+
+               /*
+                * In theory, we could coalesce more requests onto
+                * the second port (the first port is active, with
+                * no preemptions pending). However, that means we
+                * then have to deal with the possible lite-restore
+                * of the second port (as we submit the ELSP, there
+                * may be a context-switch) but also we may complete
+                * the resubmission before the context-switch. Ergo,
+                * coalescing onto the second port will cause a
+                * preemption event, but we cannot predict whether
+                * that will affect port[0] or port[1].
+                *
+                * If the second port is already active, we can wait
+                * until the next context-switch before contemplating
+                * new requests. The GPU will be busy and we should be
+                * able to resubmit the new ELSP before it idles,
+                * avoiding pipeline bubbles (momentary pauses where
+                * the driver is unable to keep up the supply of new
+                * work). However, we have to double-check that the
+                * priorities of the ports haven't been switched.
+                */
+               if (port_count(&port[1]))
+                       goto unlock;
+
+               /*
+                * WaIdleLiteRestore:bdw,skl
+                * Apply the wa NOOPs to prevent
+                * ring:HEAD == rq:TAIL as we resubmit the
+                * request. See gen8_emit_breadcrumb() for
+                * where we prepare the padding after the
+                * end of the request.
+                */
+               last->tail = last->wa_tail;
        }
 
-       do {
-               struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
+       while (rb) {
+               struct i915_priolist *p = to_priolist(rb);
                struct i915_request *rq, *rn;
 
                list_for_each_entry_safe(rq, rn, &p->requests, priotree.link) {
@@ -628,8 +639,9 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
                INIT_LIST_HEAD(&p->requests);
                if (p->priority != I915_PRIORITY_NORMAL)
                        kmem_cache_free(engine->i915->priorities, p);
-       } while (rb);
+       }
 done:
+       execlists->queue_priority = rb ? to_priolist(rb)->priority : INT_MIN;
        execlists->first = rb;
        if (submit)
                port_assign(port, last);
@@ -690,7 +702,7 @@ static void execlists_cancel_requests(struct intel_engine_cs *engine)
        /* Flush the queued requests to the timeline list (for retiring). */
        rb = execlists->first;
        while (rb) {
-               struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
+               struct i915_priolist *p = to_priolist(rb);
 
                list_for_each_entry_safe(rq, rn, &p->requests, priotree.link) {
                        INIT_LIST_HEAD(&rq->priotree.link);
@@ -708,7 +720,7 @@ static void execlists_cancel_requests(struct intel_engine_cs *engine)
 
        /* Remaining _unready_ requests will be nop'ed when submitted */
 
-
+       execlists->queue_priority = INT_MIN;
        execlists->queue = RB_ROOT;
        execlists->first = NULL;
        GEM_BUG_ON(port_isset(execlists->port));
@@ -864,10 +876,11 @@ static void execlists_submission_tasklet(unsigned long data)
                                                        EXECLISTS_ACTIVE_USER));
 
                        rq = port_unpack(port, &count);
-                       GEM_TRACE("%s out[0]: ctx=%d.%d, seqno=%x\n",
+                       GEM_TRACE("%s out[0]: ctx=%d.%d, seqno=%x, prio=%d\n",
                                  engine->name,
                                  port->context_id, count,
-                                 rq ? rq->global_seqno : 0);
+                                 rq ? rq->global_seqno : 0,
+                                 rq ? rq_prio(rq) : 0);
 
                        /* Check the context/desc id for this event matches */
                        GEM_DEBUG_BUG_ON(buf[2 * head + 1] != port->context_id);
@@ -912,15 +925,19 @@ static void execlists_submission_tasklet(unsigned long data)
                intel_uncore_forcewake_put(dev_priv, execlists->fw_domains);
 }
 
-static void insert_request(struct intel_engine_cs *engine,
-                          struct i915_priotree *pt,
-                          int prio)
+static void queue_request(struct intel_engine_cs *engine,
+                         struct i915_priotree *pt,
+                         int prio)
 {
-       struct i915_priolist *p = lookup_priolist(engine, pt, prio);
+       list_add_tail(&pt->link, &lookup_priolist(engine, pt, prio)->requests);
+}
 
-       list_add_tail(&pt->link, &ptr_mask_bits(p, 1)->requests);
-       if (ptr_unmask_bits(p, 1))
+static void submit_queue(struct intel_engine_cs *engine, int prio)
+{
+       if (prio > engine->execlists.queue_priority) {
+               engine->execlists.queue_priority = prio;
                tasklet_hi_schedule(&engine->execlists.tasklet);
+       }
 }
 
 static void execlists_submit_request(struct i915_request *request)
@@ -931,7 +948,8 @@ static void execlists_submit_request(struct i915_request *request)
        /* Will be called from irq-context when using foreign fences. */
        spin_lock_irqsave(&engine->timeline->lock, flags);
 
-       insert_request(engine, &request->priotree, request->priotree.priority);
+       queue_request(engine, &request->priotree, rq_prio(request));
+       submit_queue(engine, rq_prio(request));
 
        GEM_BUG_ON(!engine->execlists.first);
        GEM_BUG_ON(list_empty(&request->priotree.link));
@@ -987,7 +1005,7 @@ static void execlists_schedule(struct i915_request *request, int prio)
         * static void update_priorities(struct i915_priotree *pt, prio) {
         *      list_for_each_entry(dep, &pt->signalers_list, signal_link)
         *              update_priorities(dep->signal, prio)
-        *      insert_request(pt);
+        *      queue_request(pt);
         * }
         * but that may have unlimited recursion depth and so runs a very
         * real risk of overunning the kernel stack. Instead, we build
@@ -1050,8 +1068,9 @@ static void execlists_schedule(struct i915_request *request, int prio)
                pt->priority = prio;
                if (!list_empty(&pt->link)) {
                        __list_del_entry(&pt->link);
-                       insert_request(engine, pt, prio);
+                       queue_request(engine, pt, prio);
                }
+               submit_queue(engine, prio);
        }
 
        spin_unlock_irq(&engine->timeline->lock);
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index a9b83bf7e837..c4e9022b34e3 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -257,6 +257,11 @@ struct intel_engine_execlists {
         */
        unsigned int port_mask;
 
+       /**
+        * @queue_priority: Highest pending priority.
+        */
+       int queue_priority;
+
        /**
         * @queue: queue of requests, in priority lists
         */
-- 
2.16.1
