With the conversion to drm_gpuvm, we lost the lazy VMA cleanup, which
means that fb cleanup/unpin when pageflipping to new scanout buffers
immediately unmaps the scanout buffer.  This is costly (with tlbinv,
it can be 4-6ms for a 1080p scanout buffer, and more for higher
resolutions)!

To avoid this, introduce a vma_ref, which is incremented for scanout
and whenever userspace has a GEM handle or dma-buf fd.  When unpinning,
if the vm is the kms->vm, we defer tearing down the VMA until the
vma_ref drops to zero.  If the buffer is still part of a flip-chain,
then userspace will be holding some sort of reference to the BO, via
a GEM handle and/or dma-buf fd.  So this avoids unmapping the VMA
when there is a strong possibility that it will be needed again.

Signed-off-by: Rob Clark <robin.cl...@oss.qualcomm.com>
---
 drivers/gpu/drm/msm/msm_gem.c        | 77 +++++++++++++++++++---------
 drivers/gpu/drm/msm/msm_gem.h        | 29 +++++++++++
 drivers/gpu/drm/msm/msm_gem_prime.c  | 35 ++++++++++++-
 drivers/gpu/drm/msm/msm_gem_submit.c |  8 +++
 4 files changed, 124 insertions(+), 25 deletions(-)

diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c
index b882647144bb..55a409ac72f5 100644
--- a/drivers/gpu/drm/msm/msm_gem.c
+++ b/drivers/gpu/drm/msm/msm_gem.c
@@ -19,11 +19,11 @@
 #include "msm_drv.h"
 #include "msm_gem.h"
 #include "msm_gpu.h"
+#include "msm_kms.h"
 
 static int pgprot = 0;
 module_param(pgprot, int, 0600);
 
-
 static void update_device_mem(struct msm_drm_private *priv, ssize_t size)
 {
        uint64_t total_mem = atomic64_add_return(size, &priv->total_mem);
@@ -43,6 +43,7 @@ static void update_ctx_mem(struct drm_file *file, ssize_t size)
 
 static int msm_gem_open(struct drm_gem_object *obj, struct drm_file *file)
 {
+       msm_gem_vma_get(obj);
        update_ctx_mem(file, obj->size);
        return 0;
 }
@@ -50,33 +51,13 @@ static int msm_gem_open(struct drm_gem_object *obj, struct drm_file *file)
 static void put_iova_spaces(struct drm_gem_object *obj, struct drm_gpuvm *vm,
                            bool close, const char *reason);
 
-static void detach_vm(struct drm_gem_object *obj, struct drm_gpuvm *vm)
-{
-       msm_gem_assert_locked(obj);
-       drm_gpuvm_resv_assert_held(vm);
-
-       struct drm_gpuvm_bo *vm_bo = drm_gpuvm_bo_find(vm, obj);
-       if (vm_bo) {
-               struct drm_gpuva *vma;
-
-               drm_gpuvm_bo_for_each_va (vma, vm_bo) {
-                       if (vma->vm != vm)
-                               continue;
-                       msm_gem_vma_unmap(vma, "detach");
-                       msm_gem_vma_close(vma);
-                       break;
-               }
-
-               drm_gpuvm_bo_put(vm_bo);
-       }
-}
-
 static void msm_gem_close(struct drm_gem_object *obj, struct drm_file *file)
 {
        struct msm_context *ctx = file->driver_priv;
        struct drm_exec exec;
 
        update_ctx_mem(file, -obj->size);
+       msm_gem_vma_put(obj);
 
        /*
         * If VM isn't created yet, nothing to cleanup.  And in fact calling
@@ -103,10 +84,47 @@ static void msm_gem_close(struct drm_gem_object *obj, struct drm_file *file)
 
        msm_gem_lock_vm_and_obj(&exec, obj, ctx->vm);
        put_iova_spaces(obj, ctx->vm, true, "close");
-       detach_vm(obj, ctx->vm);
        drm_exec_fini(&exec);     /* drop locks */
 }
 
+/*
+ * Get/put for kms->vm VMA
+ */
+
+void msm_gem_vma_get(struct drm_gem_object *obj)
+{
+       atomic_inc(&to_msm_bo(obj)->vma_ref);
+}
+
+void msm_gem_vma_put(struct drm_gem_object *obj)
+{
+       struct msm_drm_private *priv = obj->dev->dev_private;
+       struct drm_exec exec;
+
+       if (atomic_dec_return(&to_msm_bo(obj)->vma_ref))
+               return;
+
+       if (!priv->kms)
+               return;
+
+       msm_gem_lock_vm_and_obj(&exec, obj, priv->kms->vm);
+       put_iova_spaces(obj, priv->kms->vm, true, "vma_put");
+       drm_exec_fini(&exec);     /* drop locks */
+}
+
+static void msm_gem_vma_put_locked(struct drm_gem_object *obj)
+{
+       struct msm_drm_private *priv = obj->dev->dev_private;
+
+       if (atomic_dec_return(&to_msm_bo(obj)->vma_ref))
+               return;
+
+       if (!priv->kms)
+               return;
+
+       put_iova_spaces(obj, priv->kms->vm, true, "vma_put");
+}
+
 /*
  * Cache sync.. this is a bit over-complicated, to fit dma-mapping
  * API.  Really GPU cache is out of scope here (handled on cmdstream)
@@ -281,6 +299,7 @@ void msm_gem_pin_obj_locked(struct drm_gem_object *obj)
        msm_gem_assert_locked(obj);
 
        to_msm_bo(obj)->pin_count++;
+       msm_gem_vma_get(obj);
        drm_gem_lru_move_tail_locked(&priv->lru.pinned, obj);
 }
 
@@ -518,6 +537,8 @@ void msm_gem_unpin_locked(struct drm_gem_object *obj)
 
        msm_gem_assert_locked(obj);
 
+       msm_gem_vma_put_locked(obj);
+
        mutex_lock(&priv->lru.lock);
        msm_obj->pin_count--;
        GEM_WARN_ON(msm_obj->pin_count < 0);
@@ -664,6 +685,13 @@ int msm_gem_set_iova(struct drm_gem_object *obj,
        return ret;
 }
 
+static bool is_kms_vm(struct drm_gpuvm *vm)
+{
+       struct msm_drm_private *priv = vm->drm->dev_private;
+
+       return priv->kms && (priv->kms->vm == vm);
+}
+
 /*
  * Unpin a iova by updating the reference counts. The memory isn't actually
  * purged until something else (shrinker, mm_notifier, destroy, etc) decides
@@ -679,7 +707,8 @@ void msm_gem_unpin_iova(struct drm_gem_object *obj, struct drm_gpuvm *vm)
        if (vma) {
                msm_gem_unpin_locked(obj);
        }
-       detach_vm(obj, vm);
+       if (!is_kms_vm(vm))
+               put_iova_spaces(obj, vm, true, "close");
        drm_exec_fini(&exec);     /* drop locks */
 }
 
diff --git a/drivers/gpu/drm/msm/msm_gem.h b/drivers/gpu/drm/msm/msm_gem.h
index 9671c4299cf8..fafb221e173b 100644
--- a/drivers/gpu/drm/msm/msm_gem.h
+++ b/drivers/gpu/drm/msm/msm_gem.h
@@ -211,9 +211,38 @@ struct msm_gem_object {
         * Protected by LRU lock.
         */
        int pin_count;
+
+       /**
+        * @vma_ref: Reference count of VMA users.
+        *
+        * With the vm_bo/vma holding a reference to the GEM object, we'd
+        * otherwise have to actively tear down a VMA when, for example,
+        * a buffer is unpinned for scanout, vs. the pre-drm_gpuvm approach
+        * where a VMA did not hold a reference to the BO, but instead was
+        * implicitly torn down when the BO was freed.
+        *
+        * To regain the lazy VMA teardown, we use the @vma_ref.  It is
+        * incremented for any of the following:
+        *
+        * 1) the BO is pinned for scanout/etc
+        * 2) the BO is exported as a dma_buf
+        * 3) the BO has an open userspace handle
+        *
+        * All of those conditions will hold a reference to the BO,
+        * preventing it from being freed.  So lazily keeping around the
+        * VMA will not prevent the BO from being freed.  (Or rather, the
+        * reference loop is harmless in this case.)
+        *
+        * When the @vma_ref drops to zero, then kms->vm VMA will be
+        * torn down.
+        */
+       atomic_t vma_ref;
 };
 #define to_msm_bo(x) container_of(x, struct msm_gem_object, base)
 
+void msm_gem_vma_get(struct drm_gem_object *obj);
+void msm_gem_vma_put(struct drm_gem_object *obj);
+
 uint64_t msm_gem_mmap_offset(struct drm_gem_object *obj);
 int msm_gem_prot(struct drm_gem_object *obj);
 int msm_gem_pin_vma_locked(struct drm_gem_object *obj, struct drm_gpuva *vma);
diff --git a/drivers/gpu/drm/msm/msm_gem_prime.c b/drivers/gpu/drm/msm/msm_gem_prime.c
index 1a6d8099196a..43f264d3cfa9 100644
--- a/drivers/gpu/drm/msm/msm_gem_prime.c
+++ b/drivers/gpu/drm/msm/msm_gem_prime.c
@@ -6,6 +6,7 @@
 
 #include <linux/dma-buf.h>
 
+#include <drm/drm_drv.h>
 #include <drm/drm_prime.h>
 
 #include "msm_drv.h"
@@ -48,13 +49,45 @@ struct drm_gem_object *msm_gem_prime_import_sg_table(struct drm_device *dev,
        return msm_gem_import(dev, attach->dmabuf, sg);
 }
 
+static void msm_gem_dmabuf_release(struct dma_buf *dma_buf)
+{
+       struct drm_gem_object *obj = dma_buf->priv;
+
+       msm_gem_vma_put(obj);
+       drm_gem_dmabuf_release(dma_buf);
+}
+
+static const struct dma_buf_ops msm_gem_prime_dmabuf_ops =  {
+       .cache_sgt_mapping = true,
+       .attach = drm_gem_map_attach,
+       .detach = drm_gem_map_detach,
+       .map_dma_buf = drm_gem_map_dma_buf,
+       .unmap_dma_buf = drm_gem_unmap_dma_buf,
+       .release = msm_gem_dmabuf_release,
+       .mmap = drm_gem_dmabuf_mmap,
+       .vmap = drm_gem_dmabuf_vmap,
+       .vunmap = drm_gem_dmabuf_vunmap,
+};
 
 struct dma_buf *msm_gem_prime_export(struct drm_gem_object *obj, int flags)
 {
        if (to_msm_bo(obj)->flags & MSM_BO_NO_SHARE)
                return ERR_PTR(-EPERM);
 
-       return drm_gem_prime_export(obj, flags);
+       msm_gem_vma_get(obj);
+
+       struct drm_device *dev = obj->dev;
+       struct dma_buf_export_info exp_info = {
+               .exp_name = KBUILD_MODNAME, /* white lie for debug */
+               .owner = dev->driver->fops->owner,
+               .ops = &msm_gem_prime_dmabuf_ops,
+               .size = obj->size,
+               .flags = flags,
+               .priv = obj,
+               .resv = obj->resv,
+       };
+
+       return drm_gem_dmabuf_export(dev, &exp_info);
 }
 
 int msm_gem_prime_pin(struct drm_gem_object *obj)
diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c b/drivers/gpu/drm/msm/msm_gem_submit.c
index 8a0f5b5eda30..bf9010da7e58 100644
--- a/drivers/gpu/drm/msm/msm_gem_submit.c
+++ b/drivers/gpu/drm/msm/msm_gem_submit.c
@@ -527,6 +527,14 @@ void msm_submit_retire(struct msm_gem_submit *submit)
                struct drm_gem_object *obj = submit->bos[i].obj;
                struct drm_gpuvm_bo *vm_bo = submit->bos[i].vm_bo;
 
+               /*
+                * msm_gem_unpin_active() doesn't drop the vma ref, because
+                * that requires grabbing locks which we cannot grab in the
+                * fence signaling path.  So we have to do that here.
+                */
+               if (submit->bos_pinned)
+                       msm_gem_vma_put(obj);
+
                drm_gem_object_put(obj);
                drm_gpuvm_bo_put(vm_bo);
        }
-- 
2.49.0

Reply via email to