Add the necessary infrastructure to expose GPU counters to userspace.
This takes the form of 4 new ioctls to:

- query the available counters
- create/destroy a performance monitor
- retrieve its values

The drm_panfrost_submit struct is extended to pass a list of perfmons
to attach to a job, so that each perfmon only tracks changes caused
by the jobs it is attached to.

Signed-off-by: Boris Brezillon <boris.brezil...@collabora.com>
---
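Note for reviewers: here is a minimal userspace sketch of the intended
ioctl flow. It is illustrative only: monitor_job_counters() is a made-up
helper, error/cleanup handling is trimmed, the submit struct is assumed
to be otherwise filled in by the caller, and the JM instance/counter
selection is just an example.

#include <stdint.h>
#include <sys/ioctl.h>
#include <drm/panfrost_drm.h>

int monitor_job_counters(int fd, struct drm_panfrost_submit *submit)
{
	struct drm_panfrost_get_perfcnt_layout layout = {0};
	struct drm_panfrost_create_perfmon create = {0};
	struct drm_panfrost_get_perfmon_values get = {0};
	struct drm_panfrost_destroy_perfmon destroy = {0};
	uint32_t handle, values[64] = {0};

	/* 1. Discover the available instances/counters per block. */
	if (ioctl(fd, DRM_IOCTL_PANFROST_GET_PERFCNT_LAYOUT, &layout))
		return -1;

	/* 2. Create a perfmon tracking every JM counter on instance 0. */
	create.counters[PANFROST_JM_BLOCK].instances =
		layout.counters[PANFROST_JM_BLOCK].instances & 1;
	create.counters[PANFROST_JM_BLOCK].counters =
		layout.counters[PANFROST_JM_BLOCK].counters;
	if (ioctl(fd, DRM_IOCTL_PANFROST_CREATE_PERFMON, &create))
		return -1;
	handle = create.id;

	/* 3. Attach the perfmon to the job at submit time. */
	submit->perfmon_handles = (uintptr_t)&handle;
	submit->perfmon_handle_count = 1;
	if (ioctl(fd, DRM_IOCTL_PANFROST_SUBMIT, submit))
		return -1;

	/*
	 * 4. Collect the values. Without the DONT_WAIT flag this blocks
	 * until the attached job is done. Each block buffer needs room
	 * for hweight64(instances) * hweight64(counters) u32 entries.
	 */
	get.id = handle;
	get.values_ptrs[PANFROST_JM_BLOCK] = (uintptr_t)values;
	if (ioctl(fd, DRM_IOCTL_PANFROST_GET_PERFMON_VALUES, &get))
		return -1;

	destroy.id = handle;
	return ioctl(fd, DRM_IOCTL_PANFROST_DESTROY_PERFMON, &destroy);
}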
 drivers/gpu/drm/panfrost/Makefile           |   3 +-
 drivers/gpu/drm/panfrost/panfrost_device.c  |   8 +
 drivers/gpu/drm/panfrost/panfrost_device.h  |  11 +
 drivers/gpu/drm/panfrost/panfrost_drv.c     |  22 +-
 drivers/gpu/drm/panfrost/panfrost_gpu.c     |  43 +-
 drivers/gpu/drm/panfrost/panfrost_job.c     |  24 +
 drivers/gpu/drm/panfrost/panfrost_job.h     |   4 +
 drivers/gpu/drm/panfrost/panfrost_perfcnt.c | 954 ++++++++++++++++++++
 drivers/gpu/drm/panfrost/panfrost_perfcnt.h |  54 ++
 drivers/gpu/drm/panfrost/panfrost_regs.h    |  19 +
 include/uapi/drm/panfrost_drm.h             | 122 +++
 11 files changed, 1260 insertions(+), 4 deletions(-)
 create mode 100644 drivers/gpu/drm/panfrost/panfrost_perfcnt.c
 create mode 100644 drivers/gpu/drm/panfrost/panfrost_perfcnt.h

diff --git a/drivers/gpu/drm/panfrost/Makefile b/drivers/gpu/drm/panfrost/Makefile
index d07e0971b687..31cfb9d25682 100644
--- a/drivers/gpu/drm/panfrost/Makefile
+++ b/drivers/gpu/drm/panfrost/Makefile
@@ -6,6 +6,7 @@ panfrost-y := \
        panfrost_gem.o \
        panfrost_gpu.o \
        panfrost_job.o \
-       panfrost_mmu.o
+       panfrost_mmu.o \
+       panfrost_perfcnt.o
 
 obj-$(CONFIG_DRM_PANFROST) += panfrost.o
diff --git a/drivers/gpu/drm/panfrost/panfrost_device.c b/drivers/gpu/drm/panfrost/panfrost_device.c
index 148b5caa2322..f6a87bfa486b 100644
--- a/drivers/gpu/drm/panfrost/panfrost_device.c
+++ b/drivers/gpu/drm/panfrost/panfrost_device.c
@@ -13,6 +13,7 @@
 #include "panfrost_gpu.h"
 #include "panfrost_job.h"
 #include "panfrost_mmu.h"
+#include "panfrost_perfcnt.h"
 
 static int panfrost_reset_init(struct panfrost_device *pfdev)
 {
@@ -147,7 +148,13 @@ int panfrost_device_init(struct panfrost_device *pfdev)
        pm_runtime_mark_last_busy(pfdev->dev);
        pm_runtime_put_autosuspend(pfdev->dev);
 
+       err = panfrost_perfcnt_init(pfdev);
+       if (err)
+               goto err_out5;
+
        return 0;
+err_out5:
+       panfrost_job_fini(pfdev);
 err_out4:
        panfrost_mmu_fini(pfdev);
 err_out3:
@@ -163,6 +170,7 @@ int panfrost_device_init(struct panfrost_device *pfdev)
 
 void panfrost_device_fini(struct panfrost_device *pfdev)
 {
+       panfrost_perfcnt_fini(pfdev);
        panfrost_job_fini(pfdev);
        panfrost_mmu_fini(pfdev);
        panfrost_gpu_fini(pfdev);
diff --git a/drivers/gpu/drm/panfrost/panfrost_device.h b/drivers/gpu/drm/panfrost/panfrost_device.h
index a821b50a14c3..f7c4e9e55f1b 100644
--- a/drivers/gpu/drm/panfrost/panfrost_device.h
+++ b/drivers/gpu/drm/panfrost/panfrost_device.h
@@ -9,11 +9,13 @@
 #include <drm/drm_device.h>
 #include <drm/drm_mm.h>
 #include <drm/gpu_scheduler.h>
+#include <drm/panfrost_drm.h>
 
 struct panfrost_device;
 struct panfrost_mmu;
 struct panfrost_job_slot;
 struct panfrost_job;
+struct panfrost_perfcnt;
 
 #define NUM_JOB_SLOTS 3
 
@@ -45,6 +47,8 @@ struct panfrost_features {
 
        unsigned long hw_features[64 / BITS_PER_LONG];
        unsigned long hw_issues[64 / BITS_PER_LONG];
+
+       struct drm_panfrost_block_perfcounters perfcnt_layout[PANFROST_NUM_BLOCKS];
 };
 
 struct panfrost_device {
@@ -70,6 +74,8 @@ struct panfrost_device {
        struct panfrost_job *jobs[NUM_JOB_SLOTS];
        struct list_head scheduled_jobs;
 
+       struct panfrost_perfcnt *perfcnt;
+
        struct mutex sched_lock;
 };
 
@@ -77,6 +83,11 @@ struct panfrost_file_priv {
        struct panfrost_device *pfdev;
 
        struct drm_sched_entity sched_entity[NUM_JOB_SLOTS];
+
+       struct {
+               struct idr idr;
+               struct mutex lock;
+       } perfmon;
 };
 
 static inline struct panfrost_device *to_panfrost_device(struct drm_device *ddev)
diff --git a/drivers/gpu/drm/panfrost/panfrost_drv.c b/drivers/gpu/drm/panfrost/panfrost_drv.c
index 8cffb70a3548..e5375b31627f 100644
--- a/drivers/gpu/drm/panfrost/panfrost_drv.c
+++ b/drivers/gpu/drm/panfrost/panfrost_drv.c
@@ -19,6 +19,7 @@
 #include "panfrost_mmu.h"
 #include "panfrost_job.h"
 #include "panfrost_gpu.h"
+#include "panfrost_perfcnt.h"
 
 static int panfrost_ioctl_get_param(struct drm_device *ddev, void *data, struct drm_file *file)
 {
@@ -219,6 +220,10 @@ static int panfrost_ioctl_submit(struct drm_device *dev, void *data,
        if (ret)
                goto fail;
 
+       ret = panfrost_perfcnt_create_job_ctx(job, file, args);
+       if (ret)
+               goto fail;
+
        ret = panfrost_job_push(job);
        if (ret)
                goto fail;
@@ -313,6 +318,7 @@ panfrost_open(struct drm_device *dev, struct drm_file *file)
 {
        struct panfrost_device *pfdev = dev->dev_private;
        struct panfrost_file_priv *panfrost_priv;
+       int ret;
 
        panfrost_priv = kzalloc(sizeof(*panfrost_priv), GFP_KERNEL);
        if (!panfrost_priv)
@@ -321,7 +327,16 @@ panfrost_open(struct drm_device *dev, struct drm_file *file)
        panfrost_priv->pfdev = pfdev;
        file->driver_priv = panfrost_priv;
 
-       return panfrost_job_open(panfrost_priv);
+       ret = panfrost_job_open(panfrost_priv);
+       if (ret)
+               goto err_free_priv;
+
+       panfrost_perfcnt_open(panfrost_priv);
+       return 0;
+
+err_free_priv:
+       kfree(panfrost_priv);
+       return ret;
 }
 
 static void
@@ -329,6 +344,7 @@ panfrost_postclose(struct drm_device *dev, struct drm_file *file)
 {
        struct panfrost_file_priv *panfrost_priv = file->driver_priv;
 
+       panfrost_perfcnt_close(panfrost_priv);
        panfrost_job_close(panfrost_priv);
 
        kfree(panfrost_priv);
@@ -348,6 +364,10 @@ static const struct drm_ioctl_desc panfrost_drm_driver_ioctls[] = {
        PANFROST_IOCTL(MMAP_BO,         mmap_bo,        DRM_RENDER_ALLOW),
        PANFROST_IOCTL(GET_PARAM,       get_param,      DRM_RENDER_ALLOW),
        PANFROST_IOCTL(GET_BO_OFFSET,   get_bo_offset,  DRM_RENDER_ALLOW),
+       PANFROST_IOCTL(GET_PERFCNT_LAYOUT, get_perfcnt_layout, DRM_RENDER_ALLOW),
+       PANFROST_IOCTL(CREATE_PERFMON,  create_perfmon, DRM_RENDER_ALLOW),
+       PANFROST_IOCTL(DESTROY_PERFMON, destroy_perfmon, DRM_RENDER_ALLOW),
+       PANFROST_IOCTL(GET_PERFMON_VALUES, get_perfmon_values, DRM_RENDER_ALLOW),
 };
 
 DEFINE_DRM_GEM_SHMEM_FOPS(panfrost_drm_driver_fops);
diff --git a/drivers/gpu/drm/panfrost/panfrost_gpu.c b/drivers/gpu/drm/panfrost/panfrost_gpu.c
index d46d36170e18..c28a31c547cc 100644
--- a/drivers/gpu/drm/panfrost/panfrost_gpu.c
+++ b/drivers/gpu/drm/panfrost/panfrost_gpu.c
@@ -13,6 +13,7 @@
 #include "panfrost_features.h"
 #include "panfrost_issues.h"
 #include "panfrost_gpu.h"
+#include "panfrost_perfcnt.h"
 #include "panfrost_regs.h"
 
 static irqreturn_t panfrost_gpu_irq_handler(int irq, void *data)
@@ -42,6 +43,12 @@ static irqreturn_t panfrost_gpu_irq_handler(int irq, void *data)
                done = true;
        }
 
+       if (state & GPU_IRQ_PERFCNT_SAMPLE_COMPLETED)
+               panfrost_perfcnt_sample_done(pfdev);
+
+       if (state & GPU_IRQ_CLEAN_CACHES_COMPLETED)
+               panfrost_perfcnt_clean_cache_done(pfdev);
+
        gpu_write(pfdev, GPU_INT_CLEAR, state);
 
        return IRQ_HANDLED;
@@ -152,14 +159,16 @@ struct panfrost_model {
                u32 revision;
                u64 issues;
        } revs[MAX_HW_REVS];
+       u64 perfcnt[PANFROST_NUM_BLOCKS];
 };
 
 #define GPU_MODEL(_name, _id, ...) \
-{\
+{                                                              \
        .name = __stringify(_name),                             \
        .id = _id,                                              \
        .features = hw_features_##_name,                        \
        .issues = hw_issues_##_name,                            \
+       .perfcnt = hw_perfcnt_##_name,                          \
        .revs = { __VA_ARGS__ },                                \
 }
 
@@ -198,13 +207,17 @@ static const struct panfrost_model gpu_models[] = {
 
 static void panfrost_gpu_init_features(struct panfrost_device *pfdev)
 {
+       struct drm_panfrost_block_perfcounters *perfcnt_layout;
        u32 gpu_id, num_js, major, minor, status, rev;
        const char *name = "unknown";
        u64 hw_feat = 0;
-       u64 hw_issues = hw_issues_all;
+       u64 hw_issues = hw_issues_all, mask;
        const struct panfrost_model *model;
+       unsigned int num;
        int i;
 
+       perfcnt_layout = pfdev->features.perfcnt_layout;
+
        pfdev->features.l2_features = gpu_read(pfdev, GPU_L2_FEATURES);
        pfdev->features.core_features = gpu_read(pfdev, GPU_CORE_FEATURES);
        pfdev->features.tiler_features = gpu_read(pfdev, GPU_TILER_FEATURES);
@@ -272,9 +285,35 @@ static void panfrost_gpu_init_features(struct panfrost_device *pfdev)
                if (best >= 0)
                        hw_issues |= model->revs[best].issues;
 
+               for (i = 0; i < PANFROST_NUM_BLOCKS; i++)
+                       perfcnt_layout[i].counters = model->perfcnt[i];
+
                break;
        }
 
+       /* Only one Job Manager. */
+       perfcnt_layout[PANFROST_JM_BLOCK].instances = BIT(0);
+       perfcnt_layout[PANFROST_SHADER_BLOCK].instances =
+                                               pfdev->features.shader_present;
+
+       /*
+        * In v4 HW we have one tiler per core group, with the number
+        * of core groups being equal to the number of L2 caches. Other
+        * HW versions just have one tiler and the number of L2 caches
+        * can be extracted from the mem_features field.
+        */
+       if (hw_feat & HW_FEATURE_V4) {
+               num = hweight64(pfdev->features.l2_present);
+               mask = GENMASK(num - 1, 0);
+               perfcnt_layout[PANFROST_MMU_L2_BLOCK].instances = mask;
+               perfcnt_layout[PANFROST_TILER_BLOCK].instances = mask;
+       } else {
+               perfcnt_layout[PANFROST_TILER_BLOCK].instances = BIT(0);
+               num = ((pfdev->features.mem_features >> 8) & GENMASK(3, 0)) + 1;
+               mask = GENMASK(num - 1, 0);
+               perfcnt_layout[PANFROST_MMU_L2_BLOCK].instances = mask;
+       }
+
        bitmap_from_u64(pfdev->features.hw_features, hw_feat);
        bitmap_from_u64(pfdev->features.hw_issues, hw_issues);
 
diff --git a/drivers/gpu/drm/panfrost/panfrost_job.c b/drivers/gpu/drm/panfrost/panfrost_job.c
index 8d570c3f15d0..c2be61a9ebff 100644
--- a/drivers/gpu/drm/panfrost/panfrost_job.c
+++ b/drivers/gpu/drm/panfrost/panfrost_job.c
@@ -15,6 +15,7 @@
 #include "panfrost_features.h"
 #include "panfrost_issues.h"
 #include "panfrost_gem.h"
+#include "panfrost_perfcnt.h"
 #include "panfrost_regs.h"
 #include "panfrost_gpu.h"
 #include "panfrost_mmu.h"
@@ -153,6 +154,7 @@ static void panfrost_job_hw_submit(struct panfrost_job *job, int js)
                goto end;
 
        spin_lock_irqsave(&pfdev->hwaccess_lock, flags);
+       panfrost_perfcnt_run_job(job);
 
        job_write(pfdev, JS_HEAD_NEXT_LO(js), jc_head & 0xFFFFFFFF);
        job_write(pfdev, JS_HEAD_NEXT_HI(js), jc_head >> 32);
@@ -233,6 +235,12 @@ int panfrost_job_push(struct panfrost_job *job)
                goto unlock;
        }
 
+       ret = panfrost_perfcnt_push_job(job);
+       if (ret) {
+               mutex_unlock(&pfdev->sched_lock);
+               goto unlock;
+       }
+
        job->render_done_fence = dma_fence_get(&job->base.s_fence->finished);
 
        kref_get(&job->refcount); /* put by scheduler job completion */
@@ -272,6 +280,9 @@ static void panfrost_job_cleanup(struct kref *ref)
 
        for (i = 0; i < job->bo_count; i++)
                drm_gem_object_put_unlocked(job->bos[i]);
+
+       panfrost_perfcnt_clean_job_ctx(job);
+
        kvfree(job->bos);
 
        kfree(job);
@@ -316,6 +327,13 @@ static struct dma_fence *panfrost_job_dependency(struct drm_sched_job *sched_job
                }
        }
 
+       /* Return the perfmon wait fence if any. */
+       if (job->perfcnt_fence) {
+               fence = job->perfcnt_fence;
+               job->perfcnt_fence = NULL;
+               return fence;
+       }
+
        return NULL;
 }
 
@@ -399,6 +417,11 @@ static void panfrost_job_timedout(struct drm_sched_job *sched_job)
        /* restart scheduler after GPU is usable again */
        for (i = 0; i < NUM_JOB_SLOTS; i++)
                drm_sched_start(&pfdev->js->queue[i].sched, true);
+
+       /* For now, just say we're done. No reset and retry. */
+//     job_write(pfdev, JS_COMMAND(js), JS_COMMAND_HARD_STOP);
+       dma_fence_signal(job->done_fence);
+       panfrost_perfcnt_finish_job(job, true);
 }
 
 static const struct drm_sched_backend_ops panfrost_sched_ops = {
@@ -442,6 +465,7 @@ static irqreturn_t panfrost_job_irq_handler(int irq, void *data)
 
                if (status & JOB_INT_MASK_DONE(j)) {
                        dma_fence_signal(pfdev->jobs[j]->done_fence);
+                       panfrost_perfcnt_finish_job(pfdev->jobs[j], false);
                }
 
                status &= ~mask;
diff --git a/drivers/gpu/drm/panfrost/panfrost_job.h b/drivers/gpu/drm/panfrost/panfrost_job.h
index 62454128a792..18646cc5eebb 100644
--- a/drivers/gpu/drm/panfrost/panfrost_job.h
+++ b/drivers/gpu/drm/panfrost/panfrost_job.h
@@ -37,6 +37,10 @@ struct panfrost_job {
 
        /* Fence to be signaled by drm-sched once its done with the job */
        struct dma_fence *render_done_fence;
+
+       /* Perfcnt context */
+       struct panfrost_perfcnt_job_ctx *perfcnt_ctx;
+       struct dma_fence *perfcnt_fence;
 };
 
 int panfrost_job_init(struct panfrost_device *pfdev);
diff --git a/drivers/gpu/drm/panfrost/panfrost_perfcnt.c b/drivers/gpu/drm/panfrost/panfrost_perfcnt.c
new file mode 100644
index 000000000000..4491f153ad48
--- /dev/null
+++ b/drivers/gpu/drm/panfrost/panfrost_perfcnt.c
@@ -0,0 +1,954 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright 2019 Collabora Ltd */
+
+#include <drm/drm_file.h>
+#include <drm/drm_gem_shmem_helper.h>
+#include <drm/panfrost_drm.h>
+#include <linux/iopoll.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+
+#include "panfrost_device.h"
+#include "panfrost_features.h"
+#include "panfrost_gem.h"
+#include "panfrost_issues.h"
+#include "panfrost_job.h"
+#include "panfrost_mmu.h"
+#include "panfrost_regs.h"
+
+#define COUNTERS_PER_BLOCK             64
+#define BYTES_PER_COUNTER              4
+#define BLOCKS_PER_COREGROUP           8
+#define V4_SHADERS_PER_COREGROUP       4
+
+struct panfrost_perfcnt_job_ctx {
+       refcount_t refcount;
+       struct panfrost_device *pfdev;
+       struct dma_fence *wait_fence;
+       struct dma_fence *done_fence;
+       struct panfrost_perfmon **perfmons;
+       u32 perfmon_count;
+};
+
+struct panfrost_perfcnt {
+       struct work_struct dumpwork;
+       u64 fence_context;
+       u64 emit_seqno;
+       spinlock_t fence_lock;
+       struct mutex cfg_lock;
+       u32 cur_cfg[PANFROST_NUM_BLOCKS];
+       struct panfrost_gem_object *bo;
+       void *buf;
+       spinlock_t ctx_lock;
+       struct panfrost_perfcnt_job_ctx *last_ctx;
+       struct panfrost_perfcnt_job_ctx *dump_ctx;
+};
+
+struct panfrost_perfcnt_fence {
+       struct dma_fence base;
+       struct drm_device *dev;
+       u64 seqno;
+};
+
+struct panfrost_perfmon {
+       refcount_t refcnt;
+       atomic_t busycnt;
+       struct wait_queue_head wq;
+       struct drm_panfrost_block_perfcounters counters[PANFROST_NUM_BLOCKS];
+       u32 *values[PANFROST_NUM_BLOCKS];
+};
+
+static inline struct panfrost_perfcnt_fence *
+to_panfrost_perfcnt_fence(struct dma_fence *fence)
+{
+       return container_of(fence, struct panfrost_perfcnt_fence, base);
+}
+
+static const char *
+panfrost_perfcnt_fence_get_driver_name(struct dma_fence *fence)
+{
+       return "panfrost";
+}
+
+static const char *
+panfrost_perfcnt_fence_get_timeline_name(struct dma_fence *fence)
+{
+       return "panfrost-perfcnt";
+}
+
+static const struct dma_fence_ops panfrost_perfcnt_fence_ops = {
+       .get_driver_name = panfrost_perfcnt_fence_get_driver_name,
+       .get_timeline_name = panfrost_perfcnt_fence_get_timeline_name,
+};
+
+static struct dma_fence *
+panfrost_perfcnt_fence_create(struct panfrost_device *pfdev)
+{
+       struct panfrost_perfcnt_fence *fence;
+
+       fence = kzalloc(sizeof(*fence), GFP_KERNEL);
+       if (!fence)
+               return ERR_PTR(-ENOMEM);
+
+       fence->dev = pfdev->ddev;
+       fence->seqno = ++pfdev->perfcnt->emit_seqno;
+       dma_fence_init(&fence->base, &panfrost_perfcnt_fence_ops,
+                      &pfdev->perfcnt->fence_lock,
+                      pfdev->perfcnt->fence_context, fence->seqno);
+
+       return &fence->base;
+}
+
+static void panfrost_perfmon_get(struct panfrost_perfmon *perfmon)
+{
+       if (perfmon)
+               refcount_inc(&perfmon->refcnt);
+}
+
+static void panfrost_perfmon_put(struct panfrost_perfmon *perfmon)
+{
+       if (perfmon && refcount_dec_and_test(&perfmon->refcnt)) {
+               unsigned int i;
+
+               for (i = 0; i < PANFROST_NUM_BLOCKS; i++)
+                       kfree(perfmon->values[i]);
+
+               kfree(perfmon);
+       }
+}
+
+static struct panfrost_perfmon *
+panfrost_perfcnt_find_perfmon(struct panfrost_file_priv *pfile, int id)
+{
+       struct panfrost_perfmon *perfmon;
+
+       mutex_lock(&pfile->perfmon.lock);
+       perfmon = idr_find(&pfile->perfmon.idr, id);
+       panfrost_perfmon_get(perfmon);
+       mutex_unlock(&pfile->perfmon.lock);
+
+       return perfmon;
+}
+
+void panfrost_perfcnt_open(struct panfrost_file_priv *pfile)
+{
+       mutex_init(&pfile->perfmon.lock);
+       idr_init(&pfile->perfmon.idr);
+}
+
+static int panfrost_perfcnt_idr_del(int id, void *elem, void *data)
+{
+       struct panfrost_perfmon *perfmon = elem;
+
+       panfrost_perfmon_put(perfmon);
+
+       return 0;
+}
+
+void panfrost_perfcnt_close(struct panfrost_file_priv *pfile)
+{
+       mutex_lock(&pfile->perfmon.lock);
+       idr_for_each(&pfile->perfmon.idr, panfrost_perfcnt_idr_del, NULL);
+       idr_destroy(&pfile->perfmon.idr);
+       mutex_unlock(&pfile->perfmon.lock);
+}
+
+int panfrost_ioctl_get_perfcnt_layout(struct drm_device *dev, void *data,
+                                     struct drm_file *file_priv)
+{
+       struct panfrost_file_priv *pfile = file_priv->driver_priv;
+       struct panfrost_device *pfdev = pfile->pfdev;
+       struct drm_panfrost_get_perfcnt_layout *layout = data;
+
+       memcpy(layout->counters, pfdev->features.perfcnt_layout,
+              sizeof(layout->counters));
+
+       return 0;
+}
+
+int panfrost_ioctl_create_perfmon(struct drm_device *dev, void *data,
+                                 struct drm_file *file_priv)
+{
+       struct panfrost_file_priv *pfile = file_priv->driver_priv;
+       struct panfrost_device *pfdev = pfile->pfdev;
+       struct drm_panfrost_create_perfmon *req = data;
+       struct drm_panfrost_block_perfcounters *layout;
+       struct panfrost_perfmon *perfmon;
+       unsigned int i;
+       int ret;
+
+       if (req->padding)
+               return -EINVAL;
+
+       perfmon = kzalloc(sizeof(*perfmon), GFP_KERNEL);
+       if (!perfmon)
+               return -ENOMEM;
+
+       ret = -ENOMEM;
+       layout = pfdev->features.perfcnt_layout;
+       for (i = 0; i < PANFROST_NUM_BLOCKS; i++) {
+               unsigned int ncounters;
+
+               /* Make sure the request matches the available counters. */
+               if (~layout[i].instances & req->counters[i].instances ||
+                   ~layout[i].counters & req->counters[i].counters) {
+                       ret = -EINVAL;
+                       goto err_free_perfmon;
+               }
+
+               ncounters = hweight64(req->counters[i].instances) *
+                           hweight64(req->counters[i].counters);
+               if (!ncounters)
+                       continue;
+
+               perfmon->counters[i] = req->counters[i];
+               perfmon->values[i] = kcalloc(ncounters, sizeof(u32), GFP_KERNEL);
+               if (!perfmon->values[i])
+                       goto err_free_perfmon;
+       }
+
+       refcount_set(&perfmon->refcnt, 1);
+       init_waitqueue_head(&perfmon->wq);
+
+       mutex_lock(&pfile->perfmon.lock);
+       ret = idr_alloc(&pfile->perfmon.idr, perfmon, 1, U32_MAX, GFP_KERNEL);
+       mutex_unlock(&pfile->perfmon.lock);
+
+       if (ret < 0)
+               goto err_free_perfmon;
+
+       req->id = ret;
+       return 0;
+
+err_free_perfmon:
+       for (i = 0; i < PANFROST_NUM_BLOCKS; i++)
+               kfree(perfmon->values[i]);
+
+       kfree(perfmon);
+       return ret;
+}
+
+int panfrost_ioctl_destroy_perfmon(struct drm_device *dev, void *data,
+                                  struct drm_file *file_priv)
+{
+       struct panfrost_file_priv *pfile = file_priv->driver_priv;
+       struct drm_panfrost_destroy_perfmon *req = data;
+       struct panfrost_perfmon *perfmon;
+
+       mutex_lock(&pfile->perfmon.lock);
+       perfmon = idr_remove(&pfile->perfmon.idr, req->id);
+       mutex_unlock(&pfile->perfmon.lock);
+
+       if (!perfmon)
+               return -EINVAL;
+
+       panfrost_perfmon_put(perfmon);
+       return 0;
+}
+
+int panfrost_ioctl_get_perfmon_values(struct drm_device *dev, void *data,
+                                     struct drm_file *file_priv)
+{
+       struct panfrost_file_priv *pfile = file_priv->driver_priv;
+       struct drm_panfrost_get_perfmon_values *req = data;
+       struct panfrost_perfmon *perfmon;
+       unsigned int i;
+       int ret = 0;
+
+       mutex_lock(&pfile->perfmon.lock);
+       perfmon = idr_find(&pfile->perfmon.idr, req->id);
+       panfrost_perfmon_get(perfmon);
+       mutex_unlock(&pfile->perfmon.lock);
+
+       if (!perfmon)
+               return -EINVAL;
+
+       if (!(req->flags & DRM_PANFROST_GET_PERFMON_VALS_DONT_WAIT))
+               ret = wait_event_interruptible(perfmon->wq,
+                                              !atomic_read(&perfmon->busycnt));
+       else if (atomic_read(&perfmon->busycnt))
+               ret = -EBUSY;
+
+       if (ret)
+               goto out;
+
+       for (i = 0; i < PANFROST_NUM_BLOCKS; i++) {
+               unsigned int ncounters;
+
+               ncounters = hweight64(perfmon->counters[i].instances) *
+                           hweight64(perfmon->counters[i].counters);
+               if (!ncounters)
+                       continue;
+
+               if (copy_to_user(u64_to_user_ptr(req->values_ptrs[i]),
+                                perfmon->values[i],
+                                ncounters * sizeof(u32))) {
+                       ret = -EFAULT;
+                       break;
+               }
+
+               if (req->flags & DRM_PANFROST_GET_PERFMON_VALS_RESET)
+                       memset(perfmon->values[i], 0, ncounters * sizeof(u32));
+       }
+
+out:
+       panfrost_perfmon_put(perfmon);
+       return ret;
+}
+
+/*
+ * Returns true if the two contexts track exactly the same set of perfmons,
+ * false otherwise.
+ */
+static bool panfrost_perfcnt_job_ctx_cmp(struct panfrost_perfcnt_job_ctx *a,
+                                        struct panfrost_perfcnt_job_ctx *b)
+{
+       unsigned int i, j;
+
+       if (a->perfmon_count != b->perfmon_count)
+               return false;
+
+       for (i = 0; i < a->perfmon_count; i++) {
+               for (j = 0; j < b->perfmon_count; j++) {
+                       if (a->perfmons[i] == b->perfmons[j])
+                               break;
+               }
+
+               if (j == b->perfmon_count)
+                       return false;
+       }
+
+       return true;
+}
+
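+/*
+ * The PRFCNT_*_EN registers expose one enable bit per group of four
+ * counters: collapse the 64-bit per-counter selection mask into that
+ * format (bits 0-15 of the result are used).
+ */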
+static u32 counters_u64_to_u32(u64 in)
+{
+       unsigned int i;
+       u32 out = 0;
+
+       for (i = 0; i < 64; i += 4) {
+               if (GENMASK(i + 3, i) & in)
+                       out |= BIT(i / 4);
+       }
+
+       return out;
+}
+
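+/*
+ * Called from the job submission path: merge the enable masks of all
+ * perfmons attached to the job and reprogram the counter setup, but only
+ * when it differs from the configuration currently programmed.
+ */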
+void panfrost_perfcnt_run_job(struct panfrost_job *job)
+{
+       struct panfrost_perfcnt_job_ctx *ctx = job->perfcnt_ctx;
+       struct panfrost_device *pfdev = job->pfdev;
+       u32 perfcnt_en[PANFROST_NUM_BLOCKS] = { };
+       bool disable_perfcnt = true, config_changed = false;
+       unsigned int i, j;
+       u64 gpuva;
+       u32 cfg;
+
+       mutex_lock(&pfdev->perfcnt->cfg_lock);
+       for (i = 0; i < PANFROST_NUM_BLOCKS; i++) {
+               for (j = 0; j < ctx->perfmon_count; j++) {
+                       u64 counters = ctx->perfmons[j]->counters[i].counters;
+
+                       perfcnt_en[i] |= counters_u64_to_u32(counters);
+               }
+
+               if (perfcnt_en[i])
+                       disable_perfcnt = false;
+
+               if (perfcnt_en[i] != pfdev->perfcnt->cur_cfg[i]) {
+                       pfdev->perfcnt->cur_cfg[i] = perfcnt_en[i];
+                       config_changed = true;
+               }
+       }
+       mutex_unlock(&pfdev->perfcnt->cfg_lock);
+
+       if (!config_changed)
+               return;
+
+       /*
+        * Always use address space 0 for now.
+        * FIXME: this needs to be updated when we start using different
+        * address spaces.
+        */
+       cfg = GPU_PERFCNT_CFG_AS(0);
+       if (panfrost_model_cmp(pfdev, 0x1000) >= 0)
+               cfg |= GPU_PERFCNT_CFG_SETSEL(1);
+
+       gpu_write(pfdev, GPU_PERFCNT_CFG,
+                 cfg | GPU_PERFCNT_CFG_MODE(GPU_PERFCNT_CFG_MODE_OFF));
+
+       if (disable_perfcnt)
+               return;
+
+       gpu_write(pfdev, GPU_PRFCNT_JM_EN, perfcnt_en[PANFROST_JM_BLOCK]);
+       gpu_write(pfdev, GPU_PRFCNT_SHADER_EN,
+                 perfcnt_en[PANFROST_SHADER_BLOCK]);
+       gpu_write(pfdev, GPU_PRFCNT_MMU_L2_EN,
+                 perfcnt_en[PANFROST_MMU_L2_BLOCK]);
+       gpuva = pfdev->perfcnt->bo->node.start << PAGE_SHIFT;
+       gpu_write(pfdev, GPU_PERFCNT_BASE_LO, gpuva);
+       gpu_write(pfdev, GPU_PERFCNT_BASE_HI, gpuva >> 32);
+
+       /*
+        * Due to PRLAM-8186 we need to disable the Tiler before we enable HW
+        * counters.
+        */
+       if (panfrost_has_hw_issue(pfdev, HW_ISSUE_8186))
+               gpu_write(pfdev, GPU_PRFCNT_TILER_EN, 0);
+       else
+               gpu_write(pfdev, GPU_PRFCNT_TILER_EN,
+                         perfcnt_en[PANFROST_TILER_BLOCK]);
+
+       gpu_write(pfdev, GPU_PERFCNT_CFG,
+                 cfg | GPU_PERFCNT_CFG_MODE(GPU_PERFCNT_CFG_MODE_MANUAL));
+
+       if (panfrost_has_hw_issue(pfdev, HW_ISSUE_8186))
+               gpu_write(pfdev, GPU_PRFCNT_TILER_EN,
+                         perfcnt_en[PANFROST_TILER_BLOCK]);
+}
+
+static void
+panfrost_perfcnt_release_job_ctx(struct panfrost_perfcnt_job_ctx *ctx)
+{
+       unsigned int i;
+
+       WARN_ON(refcount_read(&ctx->refcount));
+       for (i = 0; i < ctx->perfmon_count; i++) {
+               if (atomic_dec_and_test(&ctx->perfmons[i]->busycnt))
+                       wake_up(&ctx->perfmons[i]->wq);
+               panfrost_perfmon_put(ctx->perfmons[i]);
+       }
+
+       dma_fence_put(ctx->wait_fence);
+       dma_fence_put(ctx->done_fence);
+       kfree(ctx->perfmons);
+       kfree(ctx);
+}
+
+static void panfrost_perfcnt_put_job_ctx(struct panfrost_perfcnt_job_ctx *ctx)
+{
+       if (!IS_ERR_OR_NULL(ctx) && refcount_dec_and_test(&ctx->refcount))
+               panfrost_perfcnt_release_job_ctx(ctx);
+}
+
+struct panfrost_perfcnt_job_ctx *
+panfrost_perfcnt_get_job_ctx(struct panfrost_perfcnt_job_ctx *ctx)
+{
+       if (ctx)
+               refcount_inc(&ctx->refcount);
+
+       return ctx;
+}
+
+static void panfrost_perfcnt_dump_done(struct panfrost_perfcnt_job_ctx *ctx)
+{
+       struct panfrost_device *pfdev;
+       unsigned long flags;
+
+       pfdev = ctx->pfdev;
+       spin_lock_irqsave(&pfdev->perfcnt->ctx_lock, flags);
+       pfdev->perfcnt->dump_ctx = NULL;
+       if (pfdev->perfcnt->last_ctx == ctx)
+               pfdev->perfcnt->last_ctx = NULL;
+       spin_unlock_irqrestore(&pfdev->perfcnt->ctx_lock, flags);
+
+       dma_fence_signal(ctx->done_fence);
+       panfrost_perfcnt_release_job_ctx(ctx);
+}
+
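+/*
+ * Find the 256-byte slot (64 counters * 4 bytes) holding the dump of a
+ * given block instance. The dump buffer layout differs between v4 and
+ * more recent GPUs.
+ */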
+static void
+panfrost_perfcnt_get_counter_vals(struct panfrost_device *pfdev,
+                                 enum drm_panfrost_block_id block,
+                                 unsigned int instance, u32 *vals)
+{
+       u64 shader_present = pfdev->features.shader_present;
+       unsigned int bufoffs, shaderid, shadernum;
+
+       if (panfrost_has_hw_feature(pfdev, HW_FEATURE_V4)) {
+               unsigned int ncoregroups;
+
+               ncoregroups = hweight64(pfdev->features.l2_present);
+
+               switch (block) {
+               case PANFROST_SHADER_BLOCK:
+                       for (shaderid = 0, shadernum = 0; shaderid < 64;
+                            shaderid++) {
+                               if (!(BIT_ULL(shaderid) & shader_present))
+                                       continue;
+
+                               if (shadernum == instance)
+                                       break;
+
+                               shadernum++;
+                       }
+
+                       if (WARN_ON(shaderid == 64))
+                               return;
+
+                       /* 4 shaders per core group. */
+                       bufoffs = ((shaderid / V4_SHADERS_PER_COREGROUP) *
+                                  2048) +
+                                 ((shaderid % V4_SHADERS_PER_COREGROUP) *
+                                  256);
+                       break;
+
+               case PANFROST_TILER_BLOCK:
+                       if (WARN_ON(instance >= ncoregroups))
+                               return;
+
+                       bufoffs = (instance * 2048) + 1024;
+                       break;
+               case PANFROST_MMU_L2_BLOCK:
+                       if (WARN_ON(instance >= ncoregroups))
+                               return;
+
+                       bufoffs = (instance * 2048) + 1280;
+                       break;
+               case PANFROST_JM_BLOCK:
+                       if (WARN_ON(instance))
+                               return;
+                       bufoffs = 1792;
+                       break;
+               default:
+                       WARN_ON(1);
+                       return;
+               }
+       } else {
+               unsigned int nl2c, ncores;
+
+               /*
+                * TODO: define a macro to extract the number of l2 caches from
+                * mem_features.
+                */
+               nl2c = ((pfdev->features.mem_features >> 8) & GENMASK(3, 0)) + 1;
+
+               /*
+                * The ARM driver groups cores per core group and then only
+                * uses the number of cores in group 0 to calculate the size.
+                * It is unclear why it is done that way, but presumably
+                * shader_present only exposes cores in the first group
+                * anyway.
+                */
+               ncores = hweight64(pfdev->features.shader_present);
+
+               switch (block) {
+               case PANFROST_SHADER_BLOCK:
+                       for (shaderid = 0, shadernum = 0; shaderid < 64;
+                            shaderid++) {
+                               if (!(BIT_ULL(shaderid) & shader_present))
+                                       continue;
+
+                               if (shadernum == instance)
+                                       break;
+
+                               shadernum++;
+                       }
+
+                       if (WARN_ON(shaderid == 64))
+                               return;
+
+                       /* 4 shaders per core group. */
+                       bufoffs = 512 + ((nl2c + shaderid) * 256);
+                       break;
+
+               case PANFROST_TILER_BLOCK:
+                       if (WARN_ON(instance))
+                               return;
+
+                       bufoffs = 256;
+                       break;
+               case PANFROST_MMU_L2_BLOCK:
+                       if (WARN_ON(instance >= nl2c))
+                               return;
+
+                       bufoffs = 512 + (instance * 256);
+                       break;
+               case PANFROST_JM_BLOCK:
+                       if (WARN_ON(instance))
+                               return;
+                       bufoffs = 0;
+                       break;
+               default:
+                       WARN_ON(1);
+                       return;
+               }
+       }
+
+       memcpy(vals, pfdev->perfcnt->buf + bufoffs, 256);
+}
+
+static void
+panfrost_perfmon_upd_counter_vals(struct panfrost_perfmon *perfmon,
+                                 enum drm_panfrost_block_id block,
+                                 unsigned int instance, u32 *invals)
+{
+       u32 *outvals = perfmon->values[block];
+       unsigned int inidx, outidx;
+
+       if (WARN_ON(instance >= hweight64(perfmon->counters[block].instances)))
+               return;
+
+       if (!(perfmon->counters[block].instances & BIT_ULL(instance)))
+               return;
+
+       outvals += instance * hweight64(perfmon->counters[block].counters);
+       for (inidx = 0, outidx = 0; inidx < 64; inidx++) {
+               if (!(perfmon->counters[block].counters & BIT_ULL(inidx)))
+                       continue;
+
+               if (U32_MAX - outvals[outidx] < invals[inidx])
+                       outvals[outidx] = U32_MAX;
+               else
+                       outvals[outidx] += invals[inidx];
+               outidx++;
+       }
+}
+
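+/*
+ * Runs after the cache clean triggered by a counter dump: accumulate the
+ * freshly dumped values into every perfmon attached to the context being
+ * dumped.
+ */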
+static void panfrost_perfcnt_dump_work(struct work_struct *w)
+{
+       struct panfrost_perfcnt *perfcnt = container_of(w,
+                                               struct panfrost_perfcnt,
+                                               dumpwork);
+       struct panfrost_perfcnt_job_ctx *ctx = perfcnt->dump_ctx;
+       unsigned int block, instance, pmonidx, num;
+
+       if (!ctx)
+               return;
+
+       for (block = 0; block < PANFROST_NUM_BLOCKS; block++) {
+               struct panfrost_perfmon *perfmon;
+               u32 vals[COUNTERS_PER_BLOCK];
+               u64 instances = 0;
+
+               for (pmonidx = 0; pmonidx < ctx->perfmon_count; pmonidx++) {
+                       perfmon = ctx->perfmons[pmonidx];
+                       instances |= perfmon->counters[block].instances;
+               }
+
+               for (instance = 0, num = 0; instance < 64; instance++) {
+                       if (!(instances & BIT_ULL(instance)))
+                               continue;
+
+                       panfrost_perfcnt_get_counter_vals(ctx->pfdev, block,
+                                                         instance, vals);
+
+                       for (pmonidx = 0; pmonidx < ctx->perfmon_count;
+                            pmonidx++) {
+                               perfmon = ctx->perfmons[pmonidx];
+                               panfrost_perfmon_upd_counter_vals(perfmon,
+                                                                 block,
+                                                                 num,
+                                                                 vals);
+                       }
+                       num++;
+               }
+       }
+
+       panfrost_perfcnt_dump_done(ctx);
+}
+
+void panfrost_perfcnt_clean_cache_done(struct panfrost_device *pfdev)
+{
+       schedule_work(&pfdev->perfcnt->dumpwork);
+}
+
+void panfrost_perfcnt_sample_done(struct panfrost_device *pfdev)
+{
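+       /*
+        * The sampled values have been written to the dump buffer; clean
+        * the GPU caches so the CPU sees coherent values. The actual dump
+        * is scheduled from the CLEAN_CACHES_COMPLETED IRQ handler.
+        */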
+       gpu_write(pfdev, GPU_CMD, GPU_CMD_CLEAN_CACHES);
+}
+
+void panfrost_perfcnt_clean_job_ctx(struct panfrost_job *job)
+{
+       return panfrost_perfcnt_put_job_ctx(job->perfcnt_ctx);
+}
+
+int panfrost_perfcnt_create_job_ctx(struct panfrost_job *job,
+                                   struct drm_file *file_priv,
+                                   struct drm_panfrost_submit *args)
+{
+       struct panfrost_device *pfdev = job->pfdev;
+       struct panfrost_file_priv *pfile = file_priv->driver_priv;
+       struct panfrost_perfcnt_job_ctx *ctx;
+       unsigned int i, j;
+       u32 *handles;
+       int ret;
+
+       ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+       if (!ctx)
+               return -ENOMEM;
+
+       ctx->pfdev = pfdev;
+       refcount_set(&ctx->refcount, 1);
+
+       ctx->perfmon_count = args->perfmon_handle_count;
+       if (!ctx->perfmon_count) {
+               job->perfcnt_ctx = ctx;
+               return 0;
+       }
+
+       handles = kcalloc(ctx->perfmon_count, sizeof(u32), GFP_KERNEL);
+       if (!handles) {
+               ret = -ENOMEM;
+               goto err_put_ctx;
+       }
+
+       if (copy_from_user(handles,
+                          u64_to_user_ptr(args->perfmon_handles),
+                          ctx->perfmon_count * sizeof(u32))) {
+               ret = -EFAULT;
+               DRM_DEBUG("Failed to copy in perfmon handles\n");
+               goto err_free_handles;
+       }
+
+       /* Make sure each perfmon only appears once. */
+       for (i = 0; i < ctx->perfmon_count - 1; i++) {
+               for (j = i + 1; j < ctx->perfmon_count; j++) {
+                       if (handles[i] == handles[j]) {
+                               ret = -EINVAL;
+                               goto err_free_handles;
+                       }
+               }
+       }
+
+       ctx->perfmons = kcalloc(ctx->perfmon_count, sizeof(*ctx->perfmons),
+                               GFP_KERNEL);
+       if (!ctx->perfmons) {
+               ret = -ENOMEM;
+               goto err_free_handles;
+       }
+
+       for (i = 0; i < ctx->perfmon_count; i++) {
+               ctx->perfmons[i] = panfrost_perfcnt_find_perfmon(pfile,
+                                                                handles[i]);
+               if (!ctx->perfmons[i]) {
+                       ret = -EINVAL;
+                       goto err_free_handles;
+               }
+               atomic_inc(&ctx->perfmons[i]->busycnt);
+       }
+
+       job->perfcnt_ctx = ctx;
+       kfree(handles);
+       return 0;
+
+err_free_handles:
+       kfree(handles);
+
+err_put_ctx:
+       panfrost_perfcnt_put_job_ctx(ctx);
+       return ret;
+}
+
+void panfrost_perfcnt_finish_job(struct panfrost_job *job, bool skip_dump)
+{
+       struct panfrost_perfcnt_job_ctx *ctx = job->perfcnt_ctx;
+
+       if (WARN_ON(!ctx))
+               return;
+
+       job->perfcnt_ctx = NULL;
+       if (!refcount_dec_and_test(&ctx->refcount))
+               return;
+
+       if (!ctx->perfmon_count || skip_dump) {
+               panfrost_perfcnt_dump_done(ctx);
+               return;
+       }
+
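+       /*
+        * Publish the context to the dump path before triggering the
+        * sample: the SAMPLE_COMPLETED/CLEAN_CACHES_COMPLETED IRQ chain
+        * hands dump_ctx to the dump worker.
+        */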
+       ctx->pfdev->perfcnt->dump_ctx = ctx;
+       gpu_write(ctx->pfdev, GPU_CMD, GPU_CMD_PERFCNT_SAMPLE);
+}
+
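+/*
+ * If the previously queued job tracks exactly the same set of perfmons,
+ * share its context instead of inserting a counter dump point between
+ * the two jobs.
+ */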
+static bool panfrost_perfcnt_try_reuse_last_job_ctx(struct panfrost_job *job)
+{
+       struct panfrost_perfcnt_job_ctx *prev_ctx, *new_ctx;
+       struct panfrost_device *pfdev = job->pfdev;
+       unsigned int i;
+
+       new_ctx = job->perfcnt_ctx;
+       prev_ctx = pfdev->perfcnt->last_ctx;
+       if (!prev_ctx)
+               return false;
+
+       if (!refcount_inc_not_zero(&prev_ctx->refcount))
+               return false;
+
+       if (!panfrost_perfcnt_job_ctx_cmp(prev_ctx, new_ctx)) {
+               refcount_dec(&prev_ctx->refcount);
+               return false;
+       }
+
+       /*
+        * Make sure we increment busycnt, as panfrost_perfcnt_put_job_ctx()
+        * will decrement it.
+        */
+       for (i = 0; i < prev_ctx->perfmon_count; i++)
+               atomic_inc(&prev_ctx->perfmons[i]->busycnt);
+
+       panfrost_perfcnt_put_job_ctx(new_ctx);
+       job->perfcnt_ctx = prev_ctx;
+       job->perfcnt_fence = dma_fence_get(prev_ctx->wait_fence);
+       return true;
+}
+
+int panfrost_perfcnt_push_job(struct panfrost_job *job)
+{
+       struct panfrost_perfcnt_job_ctx *prev_ctx, *new_ctx;
+       struct panfrost_device *pfdev = job->pfdev;
+       unsigned long flags;
+       int ret = 0;
+
+       spin_lock_irqsave(&pfdev->perfcnt->ctx_lock, flags);
+       new_ctx = job->perfcnt_ctx;
+       prev_ctx = pfdev->perfcnt->last_ctx;
+       /*
+        * In order to keep things relatively fast even when HW counters are
+        * enabled we try to avoid having to dump perfcounters at the end of
+        * each job (which implies making other jobs wait for this dump to
+        * finish) when that's possible.
+        * This is only acceptable if all queued jobs share the same perfctx,
+        * that is, they have the same list of perfmons attached to them. In
+        * this case we are guaranteed that nothing will increment the
+        * counters behind our back.
+        */
+       if (panfrost_perfcnt_try_reuse_last_job_ctx(job))
+               goto out;
+
+       new_ctx->done_fence = panfrost_perfcnt_fence_create(pfdev);
+       if (IS_ERR(new_ctx->done_fence)) {
+               ret = PTR_ERR(new_ctx->done_fence);
+               goto out;
+       }
+
+       /*
+        * The previous job has a different perfmon ctx, so we must wait for it
+        * to be done dumping the counters before we can schedule this new job,
+        * otherwise we might corrupt the counter values.
+        */
+       if (prev_ctx)
+               new_ctx->wait_fence = dma_fence_get(prev_ctx->done_fence);
+
+       job->perfcnt_fence = dma_fence_get(new_ctx->wait_fence);
+       pfdev->perfcnt->last_ctx = new_ctx;
+
+out:
+       spin_unlock_irqrestore(&pfdev->perfcnt->ctx_lock, flags);
+       return ret;
+}
+
+int panfrost_perfcnt_init(struct panfrost_device *pfdev)
+{
+       struct panfrost_perfcnt *perfcnt;
+       struct drm_gem_shmem_object *bo;
+       size_t size;
+       u32 status;
+       int ret;
+
+       if (panfrost_has_hw_feature(pfdev, HW_FEATURE_V4)) {
+               unsigned int ncoregroups;
+
+               ncoregroups = hweight64(pfdev->features.l2_present);
+               size = ncoregroups * BLOCKS_PER_COREGROUP *
+                      COUNTERS_PER_BLOCK * BYTES_PER_COUNTER;
+       } else {
+               unsigned int nl2c, ncores;
+
+               /*
+                * TODO: define a macro to extract the number of l2 caches from
+                * mem_features.
+                */
+               nl2c = ((pfdev->features.mem_features >> 8) & GENMASK(3, 0)) + 1;
+
+               /*
+                * The ARM driver groups cores per core group and then only
+                * uses the number of cores in group 0 to calculate the size.
+                * It is unclear why it is done that way, but presumably
+                * shader_present only exposes cores in the first group
+                * anyway.
+                */
+               ncores = hweight64(pfdev->features.shader_present);
+
+               /*
+                * There's always one JM and one Tiler block, hence the '+ 2'
+                * here.
+                */
+               size = (nl2c + ncores + 2) *
+                      COUNTERS_PER_BLOCK * BYTES_PER_COUNTER;
+       }
+
+       perfcnt = devm_kzalloc(pfdev->dev, sizeof(*perfcnt), GFP_KERNEL);
+       if (!perfcnt)
+               return -ENOMEM;
+
+       bo = drm_gem_shmem_create(pfdev->ddev, size);
+       if (IS_ERR(bo))
+               return PTR_ERR(bo);
+
+       perfcnt->bo = to_panfrost_bo(&bo->base);
+
+       /*
+        * We always use the same buffer, so let's map it once and keep it
+        * mapped until the driver is unloaded. This might be a problem if
+        * we start using different AS and the perfcnt BO is not mapped at
+        * the same GPU virtual address.
+        */
+       ret = panfrost_mmu_map(perfcnt->bo);
+       if (ret)
+               goto err_put_bo;
+
+       /* Disable everything. */
+       gpu_write(pfdev, GPU_PERFCNT_CFG,
+                 GPU_PERFCNT_CFG_AS(0) |
+                 GPU_PERFCNT_CFG_MODE(GPU_PERFCNT_CFG_MODE_OFF) |
+                 (panfrost_model_cmp(pfdev, 0x1000) >= 0 ?
+                  GPU_PERFCNT_CFG_SETSEL(1) : 0));
+       gpu_write(pfdev, GPU_PRFCNT_JM_EN, 0);
+       gpu_write(pfdev, GPU_PRFCNT_SHADER_EN, 0);
+       gpu_write(pfdev, GPU_PRFCNT_MMU_L2_EN, 0);
+       gpu_write(pfdev, GPU_PRFCNT_TILER_EN, 0);
+
+       perfcnt->buf = drm_gem_vmap(&bo->base);
+       if (IS_ERR(perfcnt->buf)) {
+               ret = PTR_ERR(perfcnt->buf);
+               goto err_put_bo;
+       }
+
+       INIT_WORK(&perfcnt->dumpwork, panfrost_perfcnt_dump_work);
+       mutex_init(&perfcnt->cfg_lock);
+       spin_lock_init(&perfcnt->fence_lock);
+       spin_lock_init(&perfcnt->ctx_lock);
+       perfcnt->fence_context = dma_fence_context_alloc(1);
+       pfdev->perfcnt = perfcnt;
+
+       /*
+        * Invalidate the cache and clear the counters to start from a fresh
+        * state.
+        */
+       gpu_write(pfdev, GPU_INT_MASK, 0);
+       gpu_write(pfdev, GPU_INT_CLEAR, GPU_IRQ_CLEAN_CACHES_COMPLETED);
+
+       gpu_write(pfdev, GPU_CMD, GPU_CMD_PERFCNT_CLEAR);
+       gpu_write(pfdev, GPU_CMD, GPU_CMD_CLEAN_INV_CACHES);
+       ret = readl_relaxed_poll_timeout(pfdev->iomem + GPU_INT_RAWSTAT,
+                                        status,
+                                        status &
+                                        GPU_IRQ_CLEAN_CACHES_COMPLETED,
+                                        100, 10000);
+       if (ret)
+               goto err_gem_vunmap;
+
+       gpu_write(pfdev, GPU_INT_MASK, GPU_IRQ_MASK_ALL);
+
+       return 0;
+
+err_gem_vunmap:
+       drm_gem_vunmap(&pfdev->perfcnt->bo->base.base, pfdev->perfcnt->buf);
+
+err_put_bo:
+       drm_gem_object_put_unlocked(&bo->base);
+       return ret;
+}
+
+void panfrost_perfcnt_fini(struct panfrost_device *pfdev)
+{
+       drm_gem_vunmap(&pfdev->perfcnt->bo->base.base, pfdev->perfcnt->buf);
+       drm_gem_object_put_unlocked(&pfdev->perfcnt->bo->base.base);
+}
diff --git a/drivers/gpu/drm/panfrost/panfrost_perfcnt.h b/drivers/gpu/drm/panfrost/panfrost_perfcnt.h
new file mode 100644
index 000000000000..7cbfeb072aa1
--- /dev/null
+++ b/drivers/gpu/drm/panfrost/panfrost_perfcnt.h
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright 2019 Collabora Ltd */
+#ifndef __PANFROST_PERFCNT_H__
+#define __PANFROST_PERFCNT_H__
+
+#include <linux/bitops.h>
+
+struct panfrost_perfcnt_job_ctx;
+
+#define PERFCNT(_shader, _tiler, _mmu_l2, _jm)         \
+       { _shader, _tiler, _mmu_l2, _jm }
+#define NO_PERFCNT      PERFCNT(0, 0, 0, 0)
+
+/* FIXME: Declare counters for all models */
+#define hw_perfcnt_t600        NO_PERFCNT
+#define hw_perfcnt_t620        NO_PERFCNT
+#define hw_perfcnt_t720        NO_PERFCNT
+#define hw_perfcnt_t760        NO_PERFCNT
+#define hw_perfcnt_t820        NO_PERFCNT
+#define hw_perfcnt_t830        NO_PERFCNT
+#define hw_perfcnt_t860        NO_PERFCNT
+#define hw_perfcnt_t880        NO_PERFCNT
+#define hw_perfcnt_g76 NO_PERFCNT
+#define hw_perfcnt_g71 NO_PERFCNT
+#define hw_perfcnt_g72 NO_PERFCNT
+#define hw_perfcnt_g51 NO_PERFCNT
+#define hw_perfcnt_g52 NO_PERFCNT
+#define hw_perfcnt_g31 NO_PERFCNT
+
+void panfrost_perfcnt_sample_done(struct panfrost_device *pfdev);
+void panfrost_perfcnt_clean_cache_done(struct panfrost_device *pfdev);
+int panfrost_perfcnt_push_job(struct panfrost_job *job);
+void panfrost_perfcnt_run_job(struct panfrost_job *job);
+void panfrost_perfcnt_finish_job(struct panfrost_job *job,
+                                bool skip_dump);
+void panfrost_perfcnt_clean_job_ctx(struct panfrost_job *job);
+int panfrost_perfcnt_create_job_ctx(struct panfrost_job *job,
+                                   struct drm_file *file_priv,
+                                   struct drm_panfrost_submit *args);
+void panfrost_perfcnt_open(struct panfrost_file_priv *pfile);
+void panfrost_perfcnt_close(struct panfrost_file_priv *pfile);
+int panfrost_perfcnt_init(struct panfrost_device *pfdev);
+void panfrost_perfcnt_fini(struct panfrost_device *pfdev);
+
+int panfrost_ioctl_get_perfcnt_layout(struct drm_device *dev, void *data,
+                                     struct drm_file *file_priv);
+int panfrost_ioctl_create_perfmon(struct drm_device *dev, void *data,
+                                 struct drm_file *file_priv);
+int panfrost_ioctl_destroy_perfmon(struct drm_device *dev, void *data,
+                                  struct drm_file *file_priv);
+int panfrost_ioctl_get_perfmon_values(struct drm_device *dev, void *data,
+                                     struct drm_file *file_priv);
+
+#endif
diff --git a/drivers/gpu/drm/panfrost/panfrost_regs.h b/drivers/gpu/drm/panfrost/panfrost_regs.h
index 42d08860fd76..ea38ac60581c 100644
--- a/drivers/gpu/drm/panfrost/panfrost_regs.h
+++ b/drivers/gpu/drm/panfrost/panfrost_regs.h
@@ -44,12 +44,31 @@
         GPU_IRQ_MULTIPLE_FAULT)
 #define GPU_CMD                                0x30
 #define   GPU_CMD_SOFT_RESET           0x01
+#define   GPU_CMD_PERFCNT_CLEAR                0x03
+#define   GPU_CMD_PERFCNT_SAMPLE       0x04
+#define   GPU_CMD_CLEAN_CACHES         0x07
+#define   GPU_CMD_CLEAN_INV_CACHES     0x08
 #define GPU_STATUS                     0x34
+#define   GPU_STATUS_PRFCNT_ACTIVE     BIT(2)
 #define GPU_LATEST_FLUSH_ID            0x38
 #define GPU_FAULT_STATUS               0x3C
 #define GPU_FAULT_ADDRESS_LO           0x40
 #define GPU_FAULT_ADDRESS_HI           0x44
 
+#define GPU_PERFCNT_BASE_LO            0x60
+#define GPU_PERFCNT_BASE_HI            0x64
+#define GPU_PERFCNT_CFG                        0x68
+#define   GPU_PERFCNT_CFG_MODE(x)      (x)
+#define   GPU_PERFCNT_CFG_MODE_OFF     0
+#define   GPU_PERFCNT_CFG_MODE_MANUAL  1
+#define   GPU_PERFCNT_CFG_MODE_TILE    2
+#define   GPU_PERFCNT_CFG_AS(x)                ((x) << 4)
+#define   GPU_PERFCNT_CFG_SETSEL(x)    ((x) << 8)
+#define GPU_PRFCNT_JM_EN               0x6c
+#define GPU_PRFCNT_SHADER_EN           0x70
+#define GPU_PRFCNT_TILER_EN            0x74
+#define GPU_PRFCNT_MMU_L2_EN           0x7c
+
 #define GPU_THREAD_MAX_THREADS         0x0A0   /* (RO) Maximum number of threads per core */
 #define GPU_THREAD_MAX_WORKGROUP_SIZE  0x0A4   /* (RO) Maximum workgroup size */
 #define GPU_THREAD_MAX_BARRIER_SIZE    0x0A8   /* (RO) Maximum threads waiting at a barrier */
diff --git a/include/uapi/drm/panfrost_drm.h b/include/uapi/drm/panfrost_drm.h
index 508b9621d9db..e09b35bf6035 100644
--- a/include/uapi/drm/panfrost_drm.h
+++ b/include/uapi/drm/panfrost_drm.h
@@ -18,6 +18,10 @@ extern "C" {
 #define DRM_PANFROST_MMAP_BO                   0x03
 #define DRM_PANFROST_GET_PARAM                 0x04
 #define DRM_PANFROST_GET_BO_OFFSET             0x05
+#define DRM_PANFROST_GET_PERFCNT_LAYOUT                0x06
+#define DRM_PANFROST_CREATE_PERFMON            0x07
+#define DRM_PANFROST_DESTROY_PERFMON           0x08
+#define DRM_PANFROST_GET_PERFMON_VALUES                0x09
 
 #define DRM_IOCTL_PANFROST_SUBMIT              DRM_IOW(DRM_COMMAND_BASE + DRM_PANFROST_SUBMIT, struct drm_panfrost_submit)
 #define DRM_IOCTL_PANFROST_WAIT_BO             DRM_IOW(DRM_COMMAND_BASE + DRM_PANFROST_WAIT_BO, struct drm_panfrost_wait_bo)
@@ -25,6 +29,10 @@ extern "C" {
 #define DRM_IOCTL_PANFROST_MMAP_BO             DRM_IOWR(DRM_COMMAND_BASE + DRM_PANFROST_MMAP_BO, struct drm_panfrost_mmap_bo)
 #define DRM_IOCTL_PANFROST_GET_PARAM           DRM_IOWR(DRM_COMMAND_BASE + DRM_PANFROST_GET_PARAM, struct drm_panfrost_get_param)
 #define DRM_IOCTL_PANFROST_GET_BO_OFFSET       DRM_IOWR(DRM_COMMAND_BASE + DRM_PANFROST_GET_BO_OFFSET, struct drm_panfrost_get_bo_offset)
+#define DRM_IOCTL_PANFROST_GET_PERFCNT_LAYOUT  DRM_IOWR(DRM_COMMAND_BASE + DRM_PANFROST_GET_PERFCNT_LAYOUT, struct drm_panfrost_get_perfcnt_layout)
+#define DRM_IOCTL_PANFROST_CREATE_PERFMON      DRM_IOWR(DRM_COMMAND_BASE + DRM_PANFROST_CREATE_PERFMON, struct drm_panfrost_create_perfmon)
+#define DRM_IOCTL_PANFROST_DESTROY_PERFMON     DRM_IOWR(DRM_COMMAND_BASE + DRM_PANFROST_DESTROY_PERFMON, struct drm_panfrost_destroy_perfmon)
+#define DRM_IOCTL_PANFROST_GET_PERFMON_VALUES  DRM_IOWR(DRM_COMMAND_BASE + DRM_PANFROST_GET_PERFMON_VALUES, struct drm_panfrost_get_perfmon_values)
 
 #define PANFROST_JD_REQ_FS (1 << 0)
 /**
@@ -55,6 +63,15 @@ struct drm_panfrost_submit {
 
        /** A combination of PANFROST_JD_REQ_* */
        __u32 requirements;
+
+       /** Pointer to an array of u32 perfmon handles to attach to the job. */
+       __u64 perfmon_handles;
+
+       /** Number of perfmon handles passed in (size is that times 4). */
+       __u32 perfmon_handle_count;
+
+       /** Unused field, should be set to 0. */
+       __u32 padding;
 };
 
 /**
@@ -133,6 +150,111 @@ struct drm_panfrost_get_bo_offset {
        __u64 offset;
 };
 
+/**
+ * Panfrost HW block ids used to group HW counters. There might be several
+ * shader, tiler and MMU/L2 blocks in a given GPU. How many of them are
+ * available is exposed through the instances field of
+ * drm_panfrost_block_perfcounters.
+ */
+enum drm_panfrost_block_id {
+       PANFROST_SHADER_BLOCK,
+       PANFROST_TILER_BLOCK,
+       PANFROST_MMU_L2_BLOCK,
+       PANFROST_JM_BLOCK,
+       PANFROST_NUM_BLOCKS,
+};
+
+struct drm_panfrost_block_perfcounters {
+       /*
+        * For DRM_IOCTL_PANFROST_GET_PERFCNT_LAYOUT, encodes the available
+        * instances for a specific given block type.
+        * For DRM_IOCTL_PANFROST_CREATE_PERFMON, encodes the instances the
+        * user wants to monitor.
+        * Note: the bitmap might be sparse.
+        */
+       __u64 instances;
+
+       /*
+        * For DRM_IOCTL_PANFROST_GET_PERFCNT_LAYOUT, encodes the available
+        * counters attached to a specific block type.
+        * For DRM_IOCTL_PANFROST_CREATE_PERFMON, encodes the counters the user
+        * wants to monitor.
+        * Note: the bitmap might be sparse.
+        */
+       __u64 counters;
+};
+
+/**
+ * Used to retrieve available HW counters.
+ */
+struct drm_panfrost_get_perfcnt_layout {
+       struct drm_panfrost_block_perfcounters counters[PANFROST_NUM_BLOCKS];
+};
+
+/**
+ * Used to create a performance monitor. Each performance monitor is
+ * assigned an ID that can later be passed when submitting a job to capture
+ * hardware counter values (and thus count things related to this specific
+ * job).
+ * Performance monitors are attached to the GPU file descriptor and IDs are
+ * unique within this context, not across all GPU users.
+ * This implies that
+ * - perfmons are automatically released when the FD is closed
+ * - perfmons can't be shared across GPU contexts
+ */
+struct drm_panfrost_create_perfmon {
+       /* Input Fields. */
+       /* List all HW counters this performance monitor should track. */
+       struct drm_panfrost_block_perfcounters counters[PANFROST_NUM_BLOCKS];
+
+       /* Output fields. */
+       /* ID of the newly created perfmon. */
+       __u32 id;
+
+       /* Padding: must be set to 0. */
+       __u32 padding;
+};
+
+/**
+ * Destroy an existing performance monitor.
+ */
+struct drm_panfrost_destroy_perfmon {
+       /*
+        * ID of the perfmon to destroy (the one returned by
+        * DRM_IOCTL_PANFROST_CREATE_PERFMON)
+        */
+       __u32 id;
+};
+
+/*
+ * Don't wait when trying to get perfmon values. If the perfmon is still active
+ * (still attached to a queued or running job), EBUSY is returned.
+ */
+#define DRM_PANFROST_GET_PERFMON_VALS_DONT_WAIT                0x1
+
+/* Reset all perfmon values to zero after reading them. */
+#define DRM_PANFROST_GET_PERFMON_VALS_RESET            0x2
+
+/**
+ * Used to query values collected by a performance monitor.
+ */
+struct drm_panfrost_get_perfmon_values {
+       /* ID of the perfmon to query values from. */
+       __u32 id;
+
+       /* See DRM_PANFROST_GET_PERFMON_VALS_XXX flags */
+       __u32 flags;
+
+       /*
+        * An array of userspace pointers to u32 buffers that counter values
+        * will be copied to.
+        * The array sizes depend on the counters/instances activated at
+        * perfmon creation time: hweight64(instances) * hweight64(counters).
+        * Note that some entries in values_ptrs[] might be NULL if no counters
+        * on a specific block were activated.
+        */
+       __u64 values_ptrs[PANFROST_NUM_BLOCKS];
+};
+
 #if defined(__cplusplus)
 }
 #endif
-- 
2.20.1
