Doesn't work yet, but shows the direction I'm going.

Instead of relying on struct_mutex, we rely on the object locks to
parallelize submission.

Currently there are still some issues because eb_lookup_vmas() pins the
vma, so parallel submission does not work completely yet. I'm hoping
to find a good way to handle this. Ideally we would drop the pinning
and rely only on the object lock, but the driver doesn't support that
yet.
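
For reference, a minimal sketch of the ww locking pattern this series
moves execbuf towards, assuming the i915_gem_ww_ctx_*() helpers and the
i915_gem_object_lock(&ww, obj) signature used in the diff below;
do_execbuf_work() is a hypothetical placeholder, and the loop itself is
illustrative only, not actual driver code:

	struct i915_gem_ww_ctx ww;
	int err;

	i915_gem_ww_ctx_init(&ww, true);	/* interruptible waits */
retry:
	err = i915_gem_object_lock(&ww, obj);	/* may fail with -EDEADLK */
	if (!err)
		err = do_execbuf_work(obj);	/* placeholder: reserve/relocate/parse */
	if (err == -EDEADLK) {
		/* drop all locks tracked by ww, then sleep on the contended one */
		err = i915_gem_ww_ctx_backoff(&ww);
		if (!err)
			goto retry;
	}
	i915_gem_ww_ctx_fini(&ww);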

Posting this as an RFC to get some feedback on the direction we should be going.

Signed-off-by: Maarten Lankhorst <maarten.lankho...@linux.intel.com>
---
 .../gpu/drm/i915/gem/i915_gem_execbuffer.c    | 295 ++++++++++--------
 drivers/gpu/drm/i915/i915_cmd_parser.c        |  15 +-
 drivers/gpu/drm/i915/i915_drv.h               |   6 -
 3 files changed, 171 insertions(+), 145 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
index 02fdf081c1be..ee753684f0ba 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
@@ -234,6 +234,8 @@ struct i915_execbuffer {
        struct drm_i915_gem_exec_object2 *exec; /** ioctl execobj[] */
        struct eb_vma *vma;
 
+       struct i915_gem_ww_ctx ww;
+
        struct intel_engine_cs *engine; /** engine to queue the request to */
        struct intel_context *context; /* logical state for the request */
        struct i915_gem_context *gem_context; /** caller's context */
@@ -286,6 +288,9 @@ struct i915_execbuffer {
        struct hlist_head *buckets; /** ht for relocation handles */
 };
 
+static int eb_parse(struct i915_execbuffer *eb,
+                   struct intel_engine_pool_node *pool);
+
 /*
  * Used to convert any address to canonical form.
  * Starting from gen8, some commands (e.g. STATE_BASE_ADDRESS,
@@ -504,8 +509,10 @@ eb_add_vma(struct i915_execbuffer *eb,
 
        if (!(eb->args->flags & __EXEC_VALIDATED)) {
                err = eb_validate_vma(eb, entry, vma);
-               if (unlikely(err))
+               if (unlikely(err)) {
+                       DRM_DEBUG_DRIVER("eb_validate_vma => %i\n", err);
                        return err;
+               }
        }
 
        ev->vma = vma;
@@ -551,8 +558,10 @@ eb_add_vma(struct i915_execbuffer *eb,
                eb_unreserve_vma(ev);
 
                list_add_tail(&ev->bind_link, &eb->unbound);
-               if (drm_mm_node_allocated(&vma->node))
+               if (drm_mm_node_allocated(&vma->node)) {
                        err = i915_vma_unbind(vma);
+                       WARN(err, "i915_vma_unbind => %i\n", err);
+               }
        }
        return err;
 }
@@ -574,7 +583,7 @@ static inline int use_cpu_reloc(const struct reloc_cache *cache,
                obj->cache_level != I915_CACHE_NONE);
 }
 
-static int eb_reserve_vma(const struct i915_execbuffer *eb, struct eb_vma *ev)
+static int eb_reserve_vma(struct i915_execbuffer *eb, struct eb_vma *ev)
 {
        struct drm_i915_gem_exec_object2 *entry = ev->exec;
        unsigned int exec_flags = ev->flags;
@@ -582,6 +591,10 @@ static int eb_reserve_vma(const struct i915_execbuffer *eb, struct eb_vma *ev)
        u64 pin_flags;
        int err;
 
+       err = i915_gem_object_lock(&eb->ww, vma->obj);
+       if (err)
+               return err;
+
        pin_flags = PIN_USER | PIN_NONBLOCK;
        if (exec_flags & EXEC_OBJECT_NEEDS_GTT)
                pin_flags |= PIN_GLOBAL;
@@ -797,9 +810,7 @@ static int eb_lookup_vmas(struct i915_execbuffer *eb)
                lut->handle = handle;
                lut->ctx = eb->gem_context;
 
-               i915_gem_object_lock(NULL, obj);
                list_add(&lut->obj_link, &obj->lut_list);
-               i915_gem_object_unlock(obj);
 
 add_vma:
                err = eb_add_vma(eb, i, batch, vma);
@@ -812,15 +823,33 @@ static int eb_lookup_vmas(struct i915_execbuffer *eb)
 
        mutex_unlock(&eb->gem_context->mutex);
 
+       if (unlikely(eb->batch->flags & EXEC_OBJECT_WRITE)) {
+               DRM_DEBUG("Attempting to use self-modifying batch buffer\n");
+               return -EINVAL;
+       }
+
+       if (range_overflows_t(u64,
+                             eb->batch_start_offset, eb->batch_len,
+                             eb->batch->vma->size)) {
+               DRM_DEBUG("Attempting to use out-of-bounds batch\n");
+               return -EINVAL;
+       }
+
+       if (eb->batch_len == 0)
+               eb->batch_len = eb->batch->vma->size - eb->batch_start_offset;
+
        eb->args->flags |= __EXEC_VALIDATED;
        return eb_reserve(eb);
 
 err_obj:
        i915_gem_object_put(obj);
+       DRM_DEBUG_DRIVER("err_obj() => %i\n", err);
 err_vma:
        eb->vma[i].vma = NULL;
+       DRM_DEBUG_DRIVER("err_vma() => %i\n", err);
 err_ctx:
        mutex_unlock(&eb->gem_context->mutex);
+       DRM_DEBUG_DRIVER("err_ctx() => %i\n", err);
        return err;
 }
 
@@ -866,6 +895,15 @@ static void eb_release_vmas(const struct i915_execbuffer *eb)
        }
 }
 
+static void eb_unreserve_vmas(struct i915_execbuffer *eb)
+{
+       const unsigned int count = eb->buffer_count;
+       unsigned int i;
+
+       for (i = 0; i < count; i++)
+               eb_unreserve_vma(&eb->vma[i]);
+}
+
 static void eb_reset_vmas(const struct i915_execbuffer *eb)
 {
        eb_release_vmas(eb);
@@ -1121,7 +1159,7 @@ static int reloc_move_to_gpu(struct i915_request *rq, struct i915_vma *vma)
        struct drm_i915_gem_object *obj = vma->obj;
        int err;
 
-       i915_vma_lock(vma);
+       assert_vma_held(vma);
 
        if (obj->cache_dirty & ~obj->cache_coherent)
                i915_gem_clflush_object(obj, 0);
@@ -1131,8 +1169,6 @@ static int reloc_move_to_gpu(struct i915_request *rq, struct i915_vma *vma)
        if (err == 0)
                err = i915_vma_move_to_active(vma, rq, EXEC_OBJECT_WRITE);
 
-       i915_vma_unlock(vma);
-
        return err;
 }
 
@@ -1190,11 +1226,10 @@ static int __reloc_gpu_alloc(struct i915_execbuffer *eb,
        if (err)
                goto skip_request;
 
-       i915_vma_lock(batch);
+       assert_vma_held(batch);
        err = i915_request_await_object(rq, batch->obj, false);
        if (err == 0)
                err = i915_vma_move_to_active(batch, rq, 0);
-       i915_vma_unlock(batch);
        if (err)
                goto skip_request;
 
@@ -1445,6 +1480,11 @@ static int eb_relocate_vma(struct i915_execbuffer *eb, struct eb_vma *ev)
        struct drm_i915_gem_relocation_entry __user *urelocs;
        const struct drm_i915_gem_exec_object2 *entry = ev->exec;
        unsigned int remain;
+       int err;
+
+       err = i915_gem_object_lock(&eb->ww, ev->vma->obj);
+       if (err)
+               return err;
 
        urelocs = u64_to_user_ptr(entry->relocs_ptr);
        remain = entry->relocation_count;
@@ -1675,14 +1715,35 @@ static int eb_prefault_relocations(const struct i915_execbuffer *eb)
        return 0;
 }
 
+static int eb_lock_and_parse_cmdbuffer(struct i915_execbuffer *eb)
+{
+       struct intel_engine_pool_node *pool;
+       int err;
+
+       if (!eb_use_cmdparser(eb))
+               return 0;
+
+       pool = intel_engine_get_pool(eb->engine, eb->batch_len);
+       if (IS_ERR(pool))
+               return PTR_ERR(pool);
+
+       err = i915_gem_object_lock(&eb->ww, pool->obj);
+       if (!err)
+               err = eb_parse(eb, pool);
+
+       if (err)
+               intel_engine_pool_put(pool);
+
+       return err;
+}
+
 static noinline int eb_relocate_slow(struct i915_execbuffer *eb)
 {
-       struct drm_device *dev = &eb->i915->drm;
        bool have_copy = false;
        struct eb_vma *ev;
        int err = 0;
 
-repeat:
+repeat_pass:
        if (signal_pending(current)) {
                err = -ERESTARTSYS;
                goto out;
@@ -1690,10 +1751,10 @@ static noinline int eb_relocate_slow(struct i915_execbuffer *eb)
 
        /* We may process another execbuffer during the unlock... */
        eb_reset_vmas(eb);
-       mutex_unlock(&dev->struct_mutex);
+       i915_gem_ww_ctx_fini(&eb->ww);
 
        /*
-        * We take 3 passes through the slowpatch.
+        * We take 3 passes through the slowpath.
         *
         * 1 - we try to just prefault all the user relocation entries and
         * then attempt to reuse the atomic pagefault disabled fast path again.
@@ -1714,20 +1775,14 @@ static noinline int eb_relocate_slow(struct i915_execbuffer *eb)
                cond_resched();
                err = 0;
        }
-       if (err) {
-               mutex_lock(&dev->struct_mutex);
+       i915_gem_ww_ctx_init(&eb->ww, true);
+       if (err)
                goto out;
-       }
 
        /* A frequent cause for EAGAIN are currently unavailable client pages */
        flush_workqueue(eb->i915->mm.userptr_wq);
 
-       err = i915_mutex_lock_interruptible(dev);
-       if (err) {
-               mutex_lock(&dev->struct_mutex);
-               goto out;
-       }
-
+repeat_reloc:
        /* reacquire the objects */
        err = eb_lookup_vmas(eb);
        if (err)
@@ -1740,8 +1795,10 @@ static noinline int eb_relocate_slow(struct i915_execbuffer *eb)
                        pagefault_disable();
                        err = eb_relocate_vma(eb, ev);
                        pagefault_enable();
-                       if (err)
-                               goto repeat;
+                       if (err == -EDEADLK)
+                               goto err;
+                       else if (err)
+                               goto repeat_pass;
                } else {
                        err = eb_relocate_vma_slow(eb, ev);
                        if (err)
@@ -1749,6 +1806,9 @@ static noinline int eb_relocate_slow(struct i915_execbuffer *eb)
                }
        }
 
+       if (!err)
+               err = eb_lock_and_parse_cmdbuffer(eb);
+
        /*
         * Leave the user relocations as are, this is the painfully slow path,
         * and we want to avoid the complication of dropping the lock whilst
@@ -1757,8 +1817,15 @@ static noinline int eb_relocate_slow(struct i915_execbuffer *eb)
         */
 
 err:
+       if (err == -EDEADLK) {
+               eb_reset_vmas(eb);
+
+               err = i915_gem_ww_ctx_backoff(&eb->ww);
+               if (!err)
+                       goto repeat_reloc;
+       }
        if (err == -EAGAIN)
-               goto repeat;
+               goto repeat_pass;
 
 out:
        if (have_copy) {
@@ -1781,60 +1848,79 @@ static noinline int eb_relocate_slow(struct i915_execbuffer *eb)
        return err;
 }
 
-static int eb_relocate(struct i915_execbuffer *eb)
+static int eb_relocate_fast(struct i915_execbuffer *eb)
 {
-       if (eb_lookup_vmas(eb))
-               goto slow;
+       int err;
 
        /* The objects are in their final locations, apply the relocations. */
        if (eb->args->flags & __EXEC_HAS_RELOC) {
                struct eb_vma *ev;
 
                list_for_each_entry(ev, &eb->relocs, reloc_link) {
-                       if (eb_relocate_vma(eb, ev))
-                               goto slow;
+                       err = eb_relocate_vma(eb, ev);
+                       if (err)
+                               return err;
                }
        }
 
        return 0;
-
-slow:
-       return eb_relocate_slow(eb);
 }
 
-static int eb_move_to_gpu(struct i915_execbuffer *eb)
+static int eb_relocate_and_parse_cmdbuf_backoff(struct i915_execbuffer *eb)
 {
-       const unsigned int count = eb->buffer_count;
-       struct ww_acquire_ctx acquire;
-       unsigned int i;
-       int err = 0;
+       int err;
 
-       ww_acquire_init(&acquire, &reservation_ww_class);
+       err = eb_lookup_vmas(eb);
+       if (err) {
+               DRM_DEBUG_DRIVER("eb_lookup_vmas() => %i\n", err);
+               return err;
+       }
 
-       for (i = 0; i < count; i++) {
-               struct eb_vma *ev = &eb->vma[i];
-               struct i915_vma *vma = ev->vma;
+       DRM_DEBUG_DRIVER("eb_reserve() => %i\n", err);
 
-               err = ww_mutex_lock_interruptible(&vma->resv->lock, &acquire);
-               if (err == -EDEADLK) {
-                       GEM_BUG_ON(i == 0);
-                       do {
-                               int j = i - 1;
+retry_fast:
+       if (!err)
+               err = eb_relocate_fast(eb);
+       if (!err) {
+               err = eb_lock_and_parse_cmdbuffer(eb);
+               DRM_DEBUG_DRIVER("Parsing cmdbuf returns %i\n", err);
 
-                               ww_mutex_unlock(&eb->vma[j].vma->resv->lock);
+               /* return on success, or any error except -EDEADLK */
+               if (err != -EDEADLK)
+                       return err;
+       }
 
-                               swap(eb->vma[i],  eb->vma[j]);
-                       } while (--i);
+       if (err == -EDEADLK) {
+               eb_unreserve_vmas(eb);
 
-                       err = ww_mutex_lock_slow_interruptible(&vma->resv->lock,
-                                                              &acquire);
-               }
-               if (err == -EALREADY)
-                       err = 0;
+               err = i915_gem_ww_ctx_backoff(&eb->ww);
                if (err)
-                       break;
+                       return err;
+
+               goto retry_fast;
        }
-       ww_acquire_done(&acquire);
+
+       err = eb_relocate_slow(eb);
+       if (err)
+               /*
+                * If the user expects the execobject.offset and
+                * reloc.presumed_offset to be an exact match,
+                * as for using NO_RELOC, then we cannot update
+                * the execobject.offset until we have completed
+                * relocation.
+                */
+               eb->args->flags &= ~__EXEC_HAS_RELOC;
+
+       DRM_DEBUG_DRIVER("Slow relocation returns %i\n", err);
+       return err;
+}
+
+static int eb_move_to_gpu(struct i915_execbuffer *eb)
+{
+       unsigned int i;
+       int err = 0;
+
+       ww_acquire_done(&eb->ww.ctx);
 
        while (i--) {
                struct eb_vma *ev = &eb->vma[i];
@@ -1880,15 +1966,12 @@ static int eb_move_to_gpu(struct i915_execbuffer *eb)
                if (err == 0)
                        err = i915_vma_move_to_active(vma, eb->request, flags);
 
-               i915_vma_unlock(vma);
-
                __eb_unreserve_vma(vma, flags);
                if (unlikely(flags & __EXEC_OBJECT_HAS_REF))
                        i915_vma_put(vma);
 
                ev->vma = NULL;
        }
-       ww_acquire_fini(&acquire);
 
        if (unlikely(err))
                goto err_skip;
@@ -1990,21 +2073,17 @@ shadow_batch_pin(struct i915_execbuffer *eb, struct drm_i915_gem_object *obj)
        return vma;
 }
 
-static struct i915_vma *eb_parse(struct i915_execbuffer *eb)
+static int eb_parse(struct i915_execbuffer *eb,
+                   struct intel_engine_pool_node *pool)
 {
-       struct intel_engine_pool_node *pool;
        struct i915_vma *vma;
        u64 batch_start;
        u64 shadow_batch_start;
        int err;
 
-       pool = intel_engine_get_pool(eb->engine, eb->batch_len);
-       if (IS_ERR(pool))
-               return ERR_CAST(pool);
-
        vma = shadow_batch_pin(eb, pool->obj);
        if (IS_ERR(vma))
-               goto err;
+               return PTR_ERR(vma);
 
        batch_start = gen8_canonical_addr(eb->batch->vma->node.start) +
                      eb->batch_start_offset;
@@ -2028,12 +2107,13 @@ static struct i915_vma *eb_parse(struct i915_execbuffer *eb)
                 * For PPGTT backing however, we have no choice but to forcibly
                 * reject unsafe buffers
                 */
-               if (i915_vma_is_ggtt(vma) && err == -EACCES)
+               if (i915_vma_is_ggtt(vma) && err == -EACCES) {
                        /* Execute original buffer non-secure */
-                       vma = NULL;
-               else
-                       vma = ERR_PTR(err);
-               goto err;
+                       intel_engine_pool_put(pool);
+                       return 0;
+               }
+
+               return err;
        }
 
        eb->vma[eb->buffer_count].vma = i915_vma_get(vma);
@@ -2049,11 +2129,7 @@ static struct i915_vma *eb_parse(struct i915_execbuffer *eb)
        /* eb->batch_len unchanged */
 
        vma->private = pool;
-       return vma;
-
-err:
-       intel_engine_pool_put(pool);
-       return vma;
+       return 0;
 }
 
 static void
@@ -2485,6 +2561,7 @@ i915_gem_do_execbuffer(struct drm_device *dev,
        eb.batch_len = args->batch_len;
 
        eb.batch_flags = 0;
+
        if (args->flags & I915_EXEC_SECURE) {
                if (INTEL_GEN(i915) >= 11)
                        return -ENODEV;
@@ -2529,68 +2606,36 @@ i915_gem_do_execbuffer(struct drm_device *dev,
        }
 
        err = eb_create(&eb);
-       if (err)
+       if (err) {
+               DRM_DEBUG_DRIVER("eb_create() => %i\n", err);
                goto err_out_fence;
+       }
 
        GEM_BUG_ON(!eb.lut_size);
 
        err = eb_select_context(&eb);
-       if (unlikely(err))
+       if (unlikely(err)) {
+               DRM_DEBUG_DRIVER("eb_select_context() => %i\n", err);
                goto err_destroy;
+       }
 
        err = eb_pin_engine(&eb, file, args);
-       if (unlikely(err))
+       if (unlikely(err)) {
+               DRM_DEBUG_DRIVER("eb_pin_engine() => %i\n", err);
                goto err_context;
-
-       err = i915_mutex_lock_interruptible(dev);
-       if (err)
-               goto err_engine;
-
-       err = eb_relocate(&eb);
-       if (err) {
-               /*
-                * If the user expects the execobject.offset and
-                * reloc.presumed_offset to be an exact match,
-                * as for using NO_RELOC, then we cannot update
-                * the execobject.offset until we have completed
-                * relocation.
-                */
-               args->flags &= ~__EXEC_HAS_RELOC;
-               goto err_vma;
        }
 
-       if (unlikely(eb.batch->flags & EXEC_OBJECT_WRITE)) {
-               DRM_DEBUG("Attempting to use self-modifying batch buffer\n");
-               err = -EINVAL;
-               goto err_vma;
-       }
+       i915_gem_ww_ctx_init(&eb.ww, true);
 
-       batch = eb.batch->vma;
-       if (range_overflows_t(u64,
-                             eb.batch_start_offset, eb.batch_len,
-                             batch->size)) {
-               DRM_DEBUG("Attempting to use out-of-bounds batch\n");
-               err = -EINVAL;
+       err = eb_relocate_and_parse_cmdbuf_backoff(&eb);
+       if (err)
                goto err_vma;
-       }
-
-       if (eb.batch_len == 0)
-               eb.batch_len = eb.batch->vma->size - eb.batch_start_offset;
-
-       if (eb_use_cmdparser(&eb)) {
-               struct i915_vma *vma;
-
-               vma = eb_parse(&eb);
-               if (IS_ERR(vma)) {
-                       err = PTR_ERR(vma);
-                       goto err_vma;
-               }
-       }
 
        /*
         * snb/ivb/vlv conflate the "batch in ppgtt" bit with the "non-secure
         * batch" bit. Hence we need to pin secure batches into the global gtt.
         * hsw should have this fixed, but bdw mucks it up again. */
+       batch = eb.batch->vma;
        if (eb.batch_flags & I915_DISPATCH_SECURE) {
                struct i915_vma *vma;
 
@@ -2607,7 +2652,7 @@ i915_gem_do_execbuffer(struct drm_device *dev,
                vma = i915_gem_object_ggtt_pin(batch->obj, NULL, 0, 0, 0);
                if (IS_ERR(vma)) {
                        err = PTR_ERR(vma);
-                       goto err_vma;
+                       goto err_pool;
                }
 
                batch = vma;
@@ -2684,13 +2729,13 @@ i915_gem_do_execbuffer(struct drm_device *dev,
 err_batch_unpin:
        if (eb.batch_flags & I915_DISPATCH_SECURE)
                i915_vma_unpin(batch);
+err_pool:
        if (batch->private)
                intel_engine_pool_put(batch->private);
 err_vma:
        if (eb.exec)
                eb_release_vmas(&eb);
-       mutex_unlock(&dev->struct_mutex);
-err_engine:
+       i915_gem_ww_ctx_fini(&eb.ww);
        eb_unpin_engine(&eb);
 err_context:
        i915_gem_context_put(eb.gem_context);
diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c
index c5b9ec5b3d25..f5e821bf5d59 100644
--- a/drivers/gpu/drm/i915/i915_cmd_parser.c
+++ b/drivers/gpu/drm/i915/i915_cmd_parser.c
@@ -1136,31 +1136,19 @@ static u32 *copy_batch(struct drm_i915_gem_object *dst_obj,
        void *dst, *src;
        int ret;
 
-       ret = i915_gem_object_lock_interruptible(NULL, dst_obj);
+       ret = i915_gem_object_prepare_write(dst_obj, &dst_needs_clflush);
        if (ret)
                return ERR_PTR(ret);
 
-       ret = i915_gem_object_prepare_write(dst_obj, &dst_needs_clflush);
-       if (ret) {
-               i915_gem_object_unlock(dst_obj);
-               return ERR_PTR(ret);
-       }
-
        dst = i915_gem_object_pin_map(dst_obj, I915_MAP_FORCE_WB);
        i915_gem_object_finish_access(dst_obj);
-       i915_gem_object_unlock(dst_obj);
 
        if (IS_ERR(dst))
                return dst;
 
-       ret = i915_gem_object_lock_interruptible(NULL, src_obj);
-       if (ret)
-               return ERR_PTR(ret);
-
        ret = i915_gem_object_prepare_read(src_obj, &src_needs_clflush);
        if (ret) {
                i915_gem_object_unpin_map(dst_obj);
-               i915_gem_object_unlock(src_obj);
                return ERR_PTR(ret);
        }
 
@@ -1209,7 +1197,6 @@ static u32 *copy_batch(struct drm_i915_gem_object *dst_obj,
        }
 
        i915_gem_object_finish_access(src_obj);
-       i915_gem_object_unlock(src_obj);
 
        /* dst_obj is returned with vmap pinned */
        *needs_clflush_after = dst_needs_clflush & CLFLUSH_AFTER;
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index c3d8af28bfc1..ab8cb0c851f6 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1848,12 +1848,6 @@ int i915_gem_object_unbind(struct drm_i915_gem_object *obj,
 
 void i915_gem_runtime_suspend(struct drm_i915_private *dev_priv);
 
-static inline int __must_check
-i915_mutex_lock_interruptible(struct drm_device *dev)
-{
-       return mutex_lock_interruptible(&dev->struct_mutex);
-}
-
 int i915_gem_dumb_create(struct drm_file *file_priv,
                         struct drm_device *dev,
                         struct drm_mode_create_dumb *args);
-- 
2.24.0
