On Fri, Sep 26, 2025 at 03:00:49PM -0500, Rob Herring (Arm) wrote:
> Add a driver for Arm Ethos-U65/U85 NPUs. The Ethos-U NPU has a
> relatively simple interface with single command stream to describe
> buffers, operation settings, and network operations. It supports up to 8
> memory regions (though no h/w bounds on a region). The Ethos NPUs
> are designed to use an SRAM for scratch memory. Region 2 is reserved
> for SRAM (like the downstream driver stack and compiler). Userspace
> doesn't need access to the SRAM.
>
> The h/w has no MMU nor external IOMMU and is a DMA engine which can
> read and write anywhere in memory without h/w bounds checks. The user
> submitted command streams must be validated against the bounds of the
> GEM BOs. This is similar to the VC4 design which validates shaders.
>
> The job submit is based on the rocket driver for the Rockchip NPU
> utilizing the GPU scheduler. It is simpler as there's only 1 core rather
> than 3.
>
> Tested on i.MX93 platform (U65) and FVP (U85) with WIP Mesa Teflon
> support.
>
> Acked-by: Thomas Zimmermann <[email protected]>
> Signed-off-by: Rob Herring (Arm) <[email protected]>
> ---
> v3:
>  - Rework and improve job submit validation
>  - Rename ethos to ethosu. There was an Ethos-Nxx that's unrelated.
>  - Add missing init for sched_lock mutex
>  - Drop some prints to debug level
>  - Fix i.MX93 SRAM accesses (AXI config)
>  - Add U85 AXI configuration
>  - Print the current cmd value on timeout
>
> v2:
>  - Rebase on v6.17-rc1 adapting to scheduler changes
>  - scheduler: Drop the reset workqueue. According to the scheduler docs,
>    we don't need it since we have a single h/w queue.
>  - scheduler: Rework the timeout handling to continue running if we are
>    making progress.
>  - Reset the NPU on resume so it's in a known state
>  - Add error handling on clk_get() calls
>  - Fix drm_mm splat on module unload. We were missing a put on the
>    cmdstream BO in the scheduler clean-up.
>  - Fix 0-day report needing explicit bitfield.h include
> ---
>  MAINTAINERS                          |   9 +
>  drivers/accel/Kconfig                |   1 +
>  drivers/accel/Makefile               |   1 +
>  drivers/accel/ethosu/Kconfig         |  10 +
>  drivers/accel/ethosu/Makefile        |   4 +
>  drivers/accel/ethosu/ethosu_device.h | 187 +++++++++
>  drivers/accel/ethosu/ethosu_drv.c    | 430 +++++++++++++++++++++
>  drivers/accel/ethosu/ethosu_drv.h    |  15 +
>  drivers/accel/ethosu/ethosu_gem.c    | 709 
> +++++++++++++++++++++++++++++++++++
>  drivers/accel/ethosu/ethosu_gem.h    |  46 +++
>  drivers/accel/ethosu/ethosu_job.c    | 543 +++++++++++++++++++++++++++
>  drivers/accel/ethosu/ethosu_job.h    |  41 ++
>  include/uapi/drm/ethosu_accel.h      | 262 +++++++++++++
>  13 files changed, 2258 insertions(+)
>
> diff --git a/MAINTAINERS b/MAINTAINERS
> index fe168477caa4..b091badec8e4 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -1991,6 +1991,15 @@ F:     arch/arm/include/asm/arch_timer.h
>  F:   arch/arm64/include/asm/arch_timer.h
>  F:   drivers/clocksource/arm_arch_timer.c
>
> +ARM ETHOS-U NPU DRIVER
> +M:   Rob Herring (Arm) <[email protected]>
> +M:   Tomeu Vizoso <[email protected]>
> +L:   [email protected]
> +S:   Supported
> +T:   git https://gitlab.freedesktop.org/drm/misc/kernel.git
> +F:   drivers/accel/ethosu/
> +F:   include/uapi/drm/ethosu_accel.h
> +
>  ARM GENERIC INTERRUPT CONTROLLER DRIVERS
>  M:   Marc Zyngier <[email protected]>
>  L:   [email protected] (moderated for non-subscribers)
> diff --git a/drivers/accel/Kconfig b/drivers/accel/Kconfig
> index 5b9490367a39..a5a4faab221b 100644
> --- a/drivers/accel/Kconfig
> +++ b/drivers/accel/Kconfig
> @@ -25,6 +25,7 @@ menuconfig DRM_ACCEL
>         and debugfs).
>
>  source "drivers/accel/amdxdna/Kconfig"
> +source "drivers/accel/ethosu/Kconfig"
>  source "drivers/accel/habanalabs/Kconfig"
>  source "drivers/accel/ivpu/Kconfig"
>  source "drivers/accel/qaic/Kconfig"
> diff --git a/drivers/accel/Makefile b/drivers/accel/Makefile
> index a301fb6089d4..c759857d64cb 100644
> --- a/drivers/accel/Makefile
> +++ b/drivers/accel/Makefile
> @@ -1,6 +1,7 @@
>  # SPDX-License-Identifier: GPL-2.0-only
>
>  obj-$(CONFIG_DRM_ACCEL_AMDXDNA)              += amdxdna/
> +obj-$(CONFIG_DRM_ACCEL_ARM_ETHOSU)   += ethosu/
>  obj-$(CONFIG_DRM_ACCEL_HABANALABS)   += habanalabs/
>  obj-$(CONFIG_DRM_ACCEL_IVPU)         += ivpu/
>  obj-$(CONFIG_DRM_ACCEL_QAIC)         += qaic/
...
> +#include <drm/ethosu_accel.h>
> +
> +struct clk;
> +struct gen_pool;
> +
> +#define NPU_REG_ID           0x0000
> +#define NPU_REG_STATUS               0x0004
> +#define NPU_REG_CMD          0x0008
> +#define NPU_REG_RESET                0x000C
> +#define NPU_REG_QBASE                0x0010
> +#define NPU_REG_QBASE_HI     0x0014
> +#define NPU_REG_QREAD                0x0018
> +#define NPU_REG_QCONFIG              0x001C
> +#define NPU_REG_QSIZE                0x0020
> +#define NPU_REG_PROT         0x0024
> +#define NPU_REG_CONFIG               0x0028
> +#define NPU_REG_REGIONCFG    0x003C
> +#define NPU_REG_AXILIMIT0    0x0040          // U65
> +#define NPU_REG_AXILIMIT1    0x0044          // U65
> +#define NPU_REG_AXILIMIT2    0x0048          // U65
> +#define NPU_REG_AXILIMIT3    0x004c          // U65

Nit: can you keep low-case/up-case consistent for Hex value

> +#define NPU_REG_MEM_ATTR0    0x0040          // U85
> +#define NPU_REG_MEM_ATTR1    0x0044          // U85
> +#define NPU_REG_MEM_ATTR2    0x0048          // U85
> +#define NPU_REG_MEM_ATTR3    0x004c          // U85
> +#define NPU_REG_AXI_SRAM     0x0050          // U85
> +#define NPU_REG_AXI_EXT              0x0054          // U85
> +
...
> +
> +#define ETHOSU_SRAM_REGION   2       /* Matching Vela compiler */
> +
> +/**
> + * struct ethosu_device - Ethosu device
> + */
> +struct ethosu_device {
> +     /** @base: Base drm_device. */
> +     struct drm_device base;
> +
> +     /** @iomem: CPU mapping of the registers. */
> +     void __iomem *regs;
> +
> +     void __iomem *sram;
> +     struct gen_pool *srampool;
> +     dma_addr_t sramphys;
> +
> +     struct clk *core_clk;
> +     struct clk *apb_clk;
> +
> +     int irq;
> +
> +     bool coherent;
> +
> +     struct drm_ethosu_npu_info npu_info;
> +
> +     struct ethosu_job *in_flight_job;
> +     struct mutex job_lock;

I remember checkpatch.py require all locks need comments.

> +     spinlock_t fence_lock;
> +
> +     struct drm_gpu_scheduler sched;
> +     struct mutex sched_lock;
> +     u64 fence_context;
> +     u64 emit_seqno;
> +};
> +
> +#define to_ethosu_device(drm_dev) \
> +     ((struct ethosu_device *)container_of(drm_dev, struct ethosu_device, 
> base))
> +
> +#endif
> diff --git a/drivers/accel/ethosu/ethosu_drv.c 
> b/drivers/accel/ethosu/ethosu_drv.c
> new file mode 100644
> index 000000000000..b74d803756cf
> --- /dev/null
> +++ b/drivers/accel/ethosu/ethosu_drv.c

...

> +
> +static int ethosu_device_resume(struct device *dev)
> +{
> +     struct ethosu_device *ethosudev = dev_get_drvdata(dev);
> +     int ret;
> +
> +     ret = clk_prepare_enable(ethosudev->core_clk);
> +     if (ret)
> +             return ret;
> +
> +     ret = clk_prepare_enable(ethosudev->apb_clk);
> +     if (ret)
> +             goto err_disable_core_clk;

use clk bulk api should be simple.

> +
> +     ret = ethosu_reset(ethosudev);
> +     if (!ret)
> +             return 0;
> +
> +err_disable_core_clk:
> +     clk_disable_unprepare(ethosudev->core_clk);
> +     return ret;
> +}
> +
...
> +
> +static int ethosu_init(struct ethosu_device *ethosudev)
> +{
> +     int ret;
> +     u32 id, config;
> +
> +     ret = devm_pm_runtime_enable(ethosudev->base.dev);
> +     if (ret)
> +             return ret;
> +
> +     ret = pm_runtime_resume_and_get(ethosudev->base.dev);
> +     if (ret)
> +             return ret;
> +
> +     pm_runtime_set_autosuspend_delay(ethosudev->base.dev, 50);
> +     pm_runtime_use_autosuspend(ethosudev->base.dev);

pm_runtime_use_autosuspend() should be after last register read
readl_relaxed(ethosudev->regs + NPU_REG_CONFIG);

incase schedule happen between pm_runtime_use_autosuspend(ethosudev->base.dev);
and readl().

> +
> +     /* If PM is disabled, we need to call ethosu_device_resume() manually. 
> */
> +     if (!IS_ENABLED(CONFIG_PM)) {
> +             ret = ethosu_device_resume(ethosudev->base.dev);
> +             if (ret)
> +                     return ret;
> +     }
> +
> +     ethosudev->npu_info.id = id = readl_relaxed(ethosudev->regs + 
> NPU_REG_ID);
> +     ethosudev->npu_info.config = config = readl_relaxed(ethosudev->regs + 
> NPU_REG_CONFIG);
> +
> +     ethosu_sram_init(ethosudev);
> +
> +     dev_info(ethosudev->base.dev,
> +             "Ethos-U NPU, arch v%ld.%ld.%ld, rev r%ldp%ld, cmd stream 
> ver%ld, %d MACs, %dKB SRAM\n",
> +             FIELD_GET(ID_ARCH_MAJOR_MASK, id),
> +             FIELD_GET(ID_ARCH_MINOR_MASK, id),
> +             FIELD_GET(ID_ARCH_PATCH_MASK, id),
> +             FIELD_GET(ID_VER_MAJOR_MASK, id),
> +             FIELD_GET(ID_VER_MINOR_MASK, id),
> +             FIELD_GET(CONFIG_CMD_STREAM_VER_MASK, config),
> +             1 << FIELD_GET(CONFIG_MACS_PER_CC_MASK, config),
> +             ethosudev->npu_info.sram_size / 1024);
> +
> +     return 0;
> +}
> +
> +static int ethosu_probe(struct platform_device *pdev)
> +{
> +     int ret;
> +     struct ethosu_device *ethosudev;
> +
> +     ethosudev = devm_drm_dev_alloc(&pdev->dev, &ethosu_drm_driver,
> +                                   struct ethosu_device, base);
> +     if (IS_ERR(ethosudev))
> +             return -ENOMEM;
> +     platform_set_drvdata(pdev, ethosudev);
> +
> +     ret = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(40));
> +     if (ret)
> +             return ret;

bigger than 32, dma_set_mask_and_coherent() always return true. Needn't
check return value.

> +
> +     ethosudev->regs = devm_platform_ioremap_resource(pdev, 0);
> +
...
> +
> +/**
> + * ethosu_gem_create_with_handle() - Create a GEM object and attach it to a 
> handle.
> + * @file: DRM file.
> + * @ddev: DRM device.
> + * @size: Size of the GEM object to allocate.
> + * @flags: Combination of drm_ethosu_bo_flags flags.
> + * @handle: Pointer holding the handle pointing to the new GEM object.
> + *
> + * Return: Zero on success
> + */
> +int ethosu_gem_create_with_handle(struct drm_file *file,
> +                              struct drm_device *ddev,
> +                              u64 *size, u32 flags, u32 *handle)
> +{
> +     int ret;
> +     struct drm_gem_dma_object *mem;
> +     struct ethosu_gem_object *bo;

move 'ret' here to keep reverise christmas tree order.

> +
> +     mem = drm_gem_dma_create(ddev, *size);
> +     if (IS_ERR(mem))
> +             return PTR_ERR(mem);
> +
> +     bo = to_ethosu_bo(&mem->base);
> +     bo->flags = flags;
> +
> +     /*
> +      * Allocate an id of idr table where the obj is registered
> +      * and handle has the id what user can see.
> +      */
> +     ret = drm_gem_handle_create(file, &mem->base, handle);
> +     if (!ret)
> +             *size = bo->base.base.size;
> +
> +     /* drop reference from allocate - handle holds it now. */
> +     drm_gem_object_put(&mem->base);
> +
> +     return ret;
> +}
> +
...
> +
> +static void cmd_state_init(struct cmd_state *st)
> +{
> +     /* Initialize to all 1s to detect missing setup */
> +     memset(st, 0xff, sizeof(*st));
> +}
> +
> +static u64 cmd_to_addr(u32 *cmd)
> +{
> +     return ((u64)((cmd[0] & 0xff0000) << 16)) | cmd[1];

will FIELD_PREP helpful?

> +}
> +
> +static u64 dma_length(struct ethosu_validated_cmdstream_info *info,
> +                   struct dma_state *dma_st, struct dma *dma)
> +{
> +     s8 mode = dma_st->mode;
> +     u64 len = dma->len;
> +
> +     if (mode >= 1) {
> +             len += dma->stride[0];
> +             len *= dma_st->size0;
> +     }
> +     if (mode == 2) {
> +             len += dma->stride[1];
> +             len *= dma_st->size1;
> +     }
> +     if (dma->region >= 0)
> +             info->region_size[dma->region] = 
> max(info->region_size[dma->region],
> +                                                  len + dma->offset);
> +
> +     return len;
> +}
> +
...

> +
> +static void ethosu_job_handle_irq(struct ethosu_device *dev)
> +{
> +     u32 status;
> +
> +     pm_runtime_mark_last_busy(dev->base.dev);

I think don't need pm_runtime_mark_last_busy() here because
pm_runtime_put_autosuspend() already call pm_runtime_mark_last_busy().

only mark last busy without pm_runtime_put() can't affect run time pm
state, still in active state.

> +
> +     status = readl_relaxed(dev->regs + NPU_REG_STATUS);
> +
> +     if (status & (STATUS_BUS_STATUS | STATUS_CMD_PARSE_ERR)) {
> +             dev_err(dev->base.dev, "Error IRQ - %x\n", status);
> +             drm_sched_fault(&dev->sched);
> +             return;
> +     }
> +
> +     scoped_guard(mutex, &dev->job_lock) {
> +             if (dev->in_flight_job) {
> +                     dma_fence_signal(dev->in_flight_job->done_fence);
> +                     pm_runtime_put_autosuspend(dev->base.dev);
> +                     dev->in_flight_job = NULL;
> +             }
> +     }
> +}
> +
...
> +
> +int ethosu_job_init(struct ethosu_device *dev)
> +{
> +     struct drm_sched_init_args args = {
> +             .ops = &ethosu_sched_ops,
> +             .num_rqs = DRM_SCHED_PRIORITY_COUNT,
> +             .credit_limit = 1,
> +             .timeout = msecs_to_jiffies(JOB_TIMEOUT_MS),
> +             .name = dev_name(dev->base.dev),
> +             .dev = dev->base.dev,
> +     };
> +     int ret;
> +
> +     spin_lock_init(&dev->fence_lock);
> +     mutex_init(&dev->job_lock);
> +     mutex_init(&dev->sched_lock);

now perfer use dev_mutex_init().

Frank
> +
> +     dev->irq = platform_get_irq(to_platform_device(dev->base.dev), 0);
> +     if (dev->irq < 0)
> +             return dev->irq;
> +
> +     ret = devm_request_threaded_irq(dev->base.dev, dev->irq,
> +                                     ethosu_job_irq_handler,
> +                                     ethosu_job_irq_handler_thread,
> +                                     IRQF_SHARED, KBUILD_MODNAME,
> +                                     dev);
> +     if (ret) {
> +             dev_err(dev->base.dev, "failed to request irq");
> +             return ret;
> +     }
> +
> +     dev->fence_context = dma_fence_context_alloc(1);
> +
> +     ret = drm_sched_init(&dev->sched, &args);
> +     if (ret) {
> +             dev_err(dev->base.dev, "Failed to create scheduler: %d.", ret);
> +             goto err_sched;
> +     }
> +
> +     return 0;
> +
> +err_sched:
> +     drm_sched_fini(&dev->sched);
> +     return ret;
> +}
> +
...
> +     DRM_IOCTL_ETHOSU_BO_WAIT =
> +             DRM_IOCTL_ETHOSU(WR, BO_WAIT, bo_wait),
> +     DRM_IOCTL_ETHOSU_BO_MMAP_OFFSET =
> +             DRM_IOCTL_ETHOSU(WR, BO_MMAP_OFFSET, bo_mmap_offset),
> +     DRM_IOCTL_ETHOSU_CMDSTREAM_BO_CREATE =
> +             DRM_IOCTL_ETHOSU(WR, CMDSTREAM_BO_CREATE, cmdstream_bo_create),
> +     DRM_IOCTL_ETHOSU_SUBMIT =
> +             DRM_IOCTL_ETHOSU(WR, SUBMIT, submit),
> +};
> +
> +#if defined(__cplusplus)
> +}
> +#endif
> +
> +#endif /* _ETHOSU_DRM_H_ */
>
> --
> 2.51.0
>

Reply via email to