On 6/12/2024 7:53 AM, Tomeu Vizoso wrote:
Using the DRM GPU scheduler infrastructure, with a scheduler for each
core.

Userspace can decide for a series of tasks to be executed sequentially
in the same core, so SRAM locality can be taken advantage of.

The job submission code was intially based on Panfrost.

intially -> initially


Signed-off-by: Tomeu Vizoso <to...@tomeuvizoso.net>
---
  drivers/accel/rocket/Makefile        |   3 +-
  drivers/accel/rocket/rocket_core.c   |   6 +
  drivers/accel/rocket/rocket_core.h   |  16 +
  drivers/accel/rocket/rocket_device.c |   2 +
  drivers/accel/rocket/rocket_device.h |   2 +
  drivers/accel/rocket/rocket_drv.c    |  15 +
  drivers/accel/rocket/rocket_drv.h    |   3 +
  drivers/accel/rocket/rocket_job.c    | 708 +++++++++++++++++++++++++++++++++++
  drivers/accel/rocket/rocket_job.h    |  49 +++
  include/uapi/drm/rocket_accel.h      |  55 +++
  10 files changed, 858 insertions(+), 1 deletion(-)

diff --git a/drivers/accel/rocket/Makefile b/drivers/accel/rocket/Makefile
index 875cac2243d9..4d59036af8d9 100644
--- a/drivers/accel/rocket/Makefile
+++ b/drivers/accel/rocket/Makefile
@@ -6,4 +6,5 @@ rocket-y := \
        rocket_core.o \
        rocket_device.o \
        rocket_drv.o \
-       rocket_gem.o
+       rocket_gem.o \
+       rocket_job.o
diff --git a/drivers/accel/rocket/rocket_core.c 
b/drivers/accel/rocket/rocket_core.c
index d6680b00fb2f..2b2d8be38f0a 100644
--- a/drivers/accel/rocket/rocket_core.c
+++ b/drivers/accel/rocket/rocket_core.c
@@ -11,6 +11,7 @@
#include "rocket_core.h"
  #include "rocket_device.h"
+#include "rocket_job.h"
  #include "rocket_registers.h"
static int rocket_clk_init(struct rocket_core *core)
@@ -122,6 +123,10 @@ int rocket_core_init(struct rocket_core *core)
                goto out_pm_domain;
        }
+ err = rocket_job_init(core);
+       if (err)
+               goto out_pm_domain;
+
        version = rocket_read(core, REG_PC_VERSION) + (rocket_read(core, 
REG_PC_VERSION_NUM) & 0xffff);
        dev_info(rdev->dev, "Rockchip NPU core %d version: %d\n", core->index, 
version);
@@ -134,6 +139,7 @@ int rocket_core_init(struct rocket_core *core) void rocket_core_fini(struct rocket_core *core)
  {
+       rocket_job_fini(core);
        rocket_pmdomain_fini(core);
  }
diff --git a/drivers/accel/rocket/rocket_core.h b/drivers/accel/rocket/rocket_core.h
index e5d4c848c9f4..e6401960a9b2 100644
--- a/drivers/accel/rocket/rocket_core.h
+++ b/drivers/accel/rocket/rocket_core.h
@@ -8,6 +8,8 @@
  #include <asm/io.h>
  #include <asm-generic/io.h>
+#include <drm/gpu_scheduler.h>

What about includes for workqueue or atomic?


+static struct dma_fence *rocket_job_run(struct drm_sched_job *sched_job)
+{
+       struct rocket_job *job = to_rocket_job(sched_job);
+       struct rocket_device *rdev = job->rdev;
+       struct rocket_core *core = sched_to_core(rdev, sched_job->sched);
+       struct dma_fence *fence = NULL;
+       int ret;
+
+       if (unlikely(job->base.s_fence->finished.error))
+               return NULL;
+
+       /* Nothing to execute: can happen if the job has finished while
+        * we were resetting the GPU.
+        */

Not the correct comment style

+       if (job->next_task_idx == job->task_count)
+               return NULL;
+
+       fence = rocket_fence_create(core);
+       if (IS_ERR(fence))
+               return fence;
+
+       if (job->done_fence)
+               dma_fence_put(job->done_fence);
+       job->done_fence = dma_fence_get(fence);
+
+       ret = pm_runtime_get_sync(rdev->dev);
+       if (ret < 0)
+               return fence;
+
+       spin_lock(&core->job_lock);
+
+       core->in_flight_job = job;
+       rocket_job_hw_submit(core, job);
+
+       spin_unlock(&core->job_lock);
+
+       return fence;
+}
+
+static void rocket_job_handle_done(struct rocket_core *core,
+                                  struct rocket_job *job)
+{
+       if (job->next_task_idx < job->task_count) {
+               rocket_job_hw_submit(core, job);
+               return;
+       }
+
+       core->in_flight_job = NULL;
+       dma_fence_signal_locked(job->done_fence);
+       pm_runtime_put_autosuspend(core->dev->dev);
+}
+
+static void rocket_job_handle_irq(struct rocket_core *core)
+{
+       uint32_t status, raw_status;
+
+       pm_runtime_mark_last_busy(core->dev->dev);
+
+       status = rocket_read(core, REG_PC_INTERRUPT_STATUS);
+       raw_status = rocket_read(core, REG_PC_INTERRUPT_RAW_STATUS);
+
+       rocket_write(core, REG_PC_OPERATION_ENABLE, 0x0);
+       rocket_write(core, REG_PC_INTERRUPT_CLEAR, 0x1ffff);
+
+       spin_lock(&core->job_lock);
+
+       if (core->in_flight_job)
+               rocket_job_handle_done(core, core->in_flight_job);
+
+       spin_unlock(&core->job_lock);
+}
+
+static void
+rocket_reset(struct rocket_core *core, struct drm_sched_job *bad)
+{
+       struct rocket_device *rdev = core->dev;
+       bool cookie;
+
+       if (!atomic_read(&core->reset.pending))
+               return;
+
+       /* Stop the scheduler.

Not the correct comment style

+        *
+        * FIXME: We temporarily get out of the dma_fence_signalling section
+        * because the cleanup path generate lockdep splats when taking locks
+        * to release job resources. We should rework the code to follow this
+        * pattern:
+        *
+        *      try_lock
+        *      if (locked)
+        *              release
+        *      else
+        *              schedule_work_to_release_later
+        */
+       drm_sched_stop(&core->sched, bad);
+
+       cookie = dma_fence_begin_signalling();
+
+       if (bad)
+               drm_sched_increase_karma(bad);
+
+       /* Mask job interrupts and synchronize to make sure we won't be
+        * interrupted during our reset.
+        */

Not the correct comment style, again. This is the last time I'm going to mention it.


diff --git a/drivers/accel/rocket/rocket_job.h 
b/drivers/accel/rocket/rocket_job.h
new file mode 100644
index 000000000000..0c3c90e47d39
--- /dev/null
+++ b/drivers/accel/rocket/rocket_job.h
@@ -0,0 +1,49 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright 2024 Tomeu Vizoso <to...@tomeuvizoso.net> */
+
+#ifndef __ROCKET_JOB_H__
+#define __ROCKET_JOB_H__
+
+#include <drm/gpu_scheduler.h>
+#include <drm/drm_drv.h>

Alphabetical order


diff --git a/include/uapi/drm/rocket_accel.h b/include/uapi/drm/rocket_accel.h
index 8338726a83c3..888c9413e4cd 100644
--- a/include/uapi/drm/rocket_accel.h
+++ b/include/uapi/drm/rocket_accel.h
@@ -12,8 +12,10 @@ extern "C" {
  #endif
#define DRM_ROCKET_CREATE_BO 0x00
+#define DRM_ROCKET_SUBMIT                      0x01
#define DRM_IOCTL_ROCKET_CREATE_BO DRM_IOWR(DRM_COMMAND_BASE + DRM_ROCKET_CREATE_BO, struct drm_rocket_create_bo)
+#define DRM_IOCTL_ROCKET_SUBMIT                        
DRM_IOW(DRM_COMMAND_BASE + DRM_ROCKET_SUBMIT, struct drm_rocket_submit)
/**
   * struct drm_rocket_create_bo - ioctl argument for creating Rocket BOs.
@@ -36,6 +38,59 @@ struct drm_rocket_create_bo {
        __u64 offset;
  };
+/**
+ * struct drm_rocket_task - A task to be run on the NPU
+ *
+ * A task is the smallest unit of work that can be run on the NPU.
+ */
+struct drm_rocket_task {
+       /** DMA address to NPU mapping of register command buffer */
+       __u64 regcmd;
+
+       /** Number of commands in the register command buffer */
+       __u32 regcmd_count;
+};
+
+/**
+ * struct drm_rocket_job - A job to be run on the NPU
+ *
+ * The kernel will schedule the execution of this job taking into account its
+ * dependencies with other jobs. All tasks in the same job will be executed
+ * sequentially on the same core, to benefit from memory residency in SRAM.
+ */
+struct drm_rocket_job {
+       /** Pointer to an array of struct drm_rocket_task. */
+       __u64 tasks;
+
+       /** Number of tasks passed in. */
+       __u32 task_count;
+
+       /** Pointer to a u32 array of the BOs that are read by the job. */
+       __u64 in_bo_handles;
+
+       /** Number of input BO handles passed in (size is that times 4). */
+       __u32 in_bo_handle_count;
+
+       /** Pointer to a u32 array of the BOs that are written to by the job. */
+       __u64 out_bo_handles;
+
+       /** Number of output BO handles passed in (size is that times 4). */
+       __u32 out_bo_handle_count;
+};

I feels like the mixing of 32-bit and 64-bit fields violates the guidelines on defining ioctls due to implicit padding that might or might not be present.

+
+/**
+ * struct drm_rocket_submit - ioctl argument for submitting commands to the 
NPU.
+ *
+ * The kernel will schedule the execution of these jobs in dependency order.
+ */
+struct drm_rocket_submit {
+       /** Pointer to an array of struct drm_rocket_job. */
+       __u64 jobs;
+
+       /** Number of jobs passed in. */
+       __u32 job_count;
+};
+
  #if defined(__cplusplus)
  }
  #endif


Reply via email to