Hi Tomeu,
On 23/05/26 05:37, Tomeu Vizoso wrote:
From: "Rob Herring (Arm)" <[email protected]>
The Arm Ethos-U NPUs have a PMU with performance counters. The PMU h/w
supports up to 4 (U65) or 8 (U85) counters which can be programmed for
different events. There is also a dedicated cycle counter.
The ABI and implementation are copied from the V3D driver. The main
difference in the ABI is there is no query API for the the event list.
The events differ between the U65 and U85, so the events lists are
maintained in userspace along with other differences between the U65 and
U85.
The cycle counter is always enabled when the PMU is enabled. When the
user requests N events, reading the counters will return the N events
plus the cycle counter.
Signed-off-by: Rob Herring (Arm) <[email protected]>
Signed-off-by: Tomeu Vizoso <[email protected]>
Reviewed-by: Maíra Canal <[email protected]>
Just small nits below. Feel free to fix it when applying it.
---
v2:
- Use XArray instead of idr
- Rework locking to use per device spinlock to protect modifying active
perfmon. Based on pending V3D changes:
https://lore.kernel.org/all/[email protected]/
- Add missing perfmon puts in ethosu_ioctl_perfmon_set_global() and
ethosu_ioctl_perfmon_get_values() error paths.
- Fix reading number of counters on U85.
- Add defines NPU_REG_PMCCNTR_CFG
v3:
- Add explicit padding to drm_ethosu_perfmon_destroy
- Fix SPDX license expression
- Fix comment typos
- Convert perfmon lock from spinlock to mutex
- Simplify switch_perfmon condition check
- Remove unused ethosu_perfmon_init
- Add lockdep_assert_held to ethosu_perfmon_stop_locked
---
drivers/accel/ethosu/Makefile | 2 +-
drivers/accel/ethosu/ethosu_device.h | 33 +++
drivers/accel/ethosu/ethosu_drv.c | 23 +-
drivers/accel/ethosu/ethosu_drv.h | 61 +++++-
drivers/accel/ethosu/ethosu_job.c | 39 +++-
drivers/accel/ethosu/ethosu_job.h | 2 +
drivers/accel/ethosu/ethosu_perfmon.c | 298 ++++++++++++++++++++++++++
include/uapi/drm/ethosu_accel.h | 60 +++++-
8 files changed, 504 insertions(+), 14 deletions(-)
create mode 100644 drivers/accel/ethosu/ethosu_perfmon.c
[...]
@@ -312,11 +318,16 @@ static int ethosu_init(struct ethosu_device *ethosudev)
ethosudev->npu_info.id = id = readl_relaxed(ethosudev->regs + NPU_REG_ID);
ethosudev->npu_info.config = config = readl_relaxed(ethosudev->regs +
NPU_REG_CONFIG);
-
I believe this doesn't belong to this patch.
ethosu_sram_init(ethosudev);
+ if (!ethosu_is_u65(ethosudev))
+ ethosudev->pmu_regs += 0x1000;
+
+ ethosudev->npu_info.pmu_counters = FIELD_GET(PMCR_NUM_EVENT_CNT_MASK,
+ readl_relaxed(ethosudev->pmu_regs + NPU_REG_PMCR));
+
dev_info(ethosudev->base.dev,
- "Ethos-U NPU, arch v%ld.%ld.%ld, rev r%ldp%ld, cmd stream ver%ld,
%d MACs, %dKB SRAM\n",
+ "Ethos-U NPU, arch v%ld.%ld.%ld, rev r%ldp%ld, cmd stream ver%ld,
%d MACs, %dKB SRAM, %d PMU cntrs\n",
FIELD_GET(ID_ARCH_MAJOR_MASK, id),
FIELD_GET(ID_ARCH_MINOR_MASK, id),
FIELD_GET(ID_ARCH_PATCH_MASK, id),
@@ -324,7 +335,8 @@ static int ethosu_init(struct ethosu_device *ethosudev)
FIELD_GET(ID_VER_MINOR_MASK, id),
FIELD_GET(CONFIG_CMD_STREAM_VER_MASK, config),
1 << FIELD_GET(CONFIG_MACS_PER_CC_MASK, config),
- ethosudev->npu_info.sram_size / 1024);
+ ethosudev->npu_info.sram_size / 1024,
+ ethosudev->npu_info.pmu_counters);
return 0;
}
@@ -343,11 +355,16 @@ static int ethosu_probe(struct platform_device *pdev)
dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(40));
ethosudev->regs = devm_platform_ioremap_resource(pdev, 0);
+ ethosudev->pmu_regs = ethosudev->regs;
ethosudev->num_clks = devm_clk_bulk_get_all(&pdev->dev, ðosudev->clks);
if (ethosudev->num_clks < 0)
return ethosudev->num_clks;
+ ret = devm_mutex_init(&pdev->dev, ðosudev->perfmon_state.lock);
I believe this should be drmm_mutex_init() as it's bounded by the DRM
device.
+ if (ret)
+ return ret;
+
ret = ethosu_job_init(ethosudev);
if (ret)
return ret;
[...]
+
+void ethosu_perfmon_put(struct ethosu_perfmon *perfmon)
+{
+ if (perfmon && refcount_dec_and_test(&perfmon->refcnt)) {
+ kfree(perfmon);
+ }
No brackets needed.
+}
+
+void ethosu_perfmon_start(struct ethosu_device *ethosu, struct ethosu_perfmon
*perfmon)
+{
+ unsigned int i;
+ u8 ncounters;
+ u32 mask;
lockdep_assert_held?
+
+ if (WARN_ON_ONCE(!perfmon || ethosu->perfmon_state.active))
+ return;
+
+ writel_relaxed(PMCR_CNT_EN, ethosu->pmu_regs + NPU_REG_PMCR);
+ writel_relaxed(PMU_EV_TYPE_CYCLES, ethosu->pmu_regs +
NPU_REG_PMCCNTR_CFG);
+
+ mask = 0x80000000;
+ ncounters = perfmon->ncounters - 1;
+ if (ncounters)
+ mask |= GENMASK(ncounters - 1, 0);
+
+ for (i = 0; i < ncounters; i++)
+ writel_relaxed(perfmon->counters[i], ethosu->pmu_regs +
NPU_REG_PMU_EVTYPER(i));
+
+ writel_relaxed(mask, ethosu->pmu_regs + NPU_REG_PMCNTENSET);
+ writel_relaxed(PMCR_CNT_EN | PMCR_EVENT_CNT_RST | PMCR_CYCLE_CNT_RST,
+ ethosu->pmu_regs + NPU_REG_PMCR);
+ ethosu->perfmon_state.active = perfmon;
+}
+
+void ethosu_perfmon_stop_locked(struct ethosu_device *ethosu, struct
ethosu_perfmon *perfmon,
+ bool capture)
+{
+ unsigned int i;
+ u8 ncounters;
+ u32 mask;
+
+ lockdep_assert_held(ðosu->perfmon_state.lock);
+
+ if (!perfmon || perfmon != ethosu->perfmon_state.active)
+ return;
+
+ ncounters = perfmon->ncounters - 1;
+
+ if (!pm_runtime_get_if_active(ethosu->base.dev)) {
+ ethosu->perfmon_state.active = NULL;
+ return;
+ }
+
+ if (capture) {
+ for (i = 0; i < ncounters; i++)
+ perfmon->values[i] += readl_relaxed(ethosu->pmu_regs +
NPU_REG_PMU_EVCNTR(i));
A new line here would make the code a bit more readable.
+ perfmon->values[ncounters] +=
+ readl_relaxed(ethosu->pmu_regs + NPU_REG_PMCCNTR_LO) |
+ (u64)readl_relaxed(ethosu->pmu_regs + NPU_REG_PMCCNTR_HI)
<< 32;
+ }
+
+ mask = 0x80000000;
+ if (ncounters)
+ mask |= GENMASK(ncounters - 1, 0);
+ writel_relaxed(mask, ethosu->pmu_regs + NPU_REG_PMCNTENCLR);
+
+ writel_relaxed(0, ethosu->pmu_regs + NPU_REG_PMCR);
+ ethosu->perfmon_state.active = NULL;
+
+ pm_runtime_put(ethosu->base.dev);
+}
+
+void ethosu_perfmon_stop(struct ethosu_device *ethosu, struct ethosu_perfmon
*perfmon,
+ bool capture)
+{
+ if (!perfmon)
+ return;
+
+ guard(mutex)(ðosu->perfmon_state.lock);
+ ethosu_perfmon_stop_locked(ethosu, perfmon, capture);
+}
+
+struct ethosu_perfmon *ethosu_perfmon_find(struct ethosu_file_priv
*ethosu_priv, int id)
+{
+ struct ethosu_perfmon *perfmon;
+
+ xa_lock(ðosu_priv->perfmons);
+ perfmon = xa_load(ðosu_priv->perfmons, id);
+ ethosu_perfmon_get(perfmon);
+ xa_unlock(ðosu_priv->perfmons);
+
+ return perfmon;
+}
+
+void ethosu_perfmon_open_file(struct ethosu_file_priv *ethosu_priv)
+{
+ xa_init_flags(ðosu_priv->perfmons, XA_FLAGS_ALLOC1);
+}
+
+static void ethosu_perfmon_delete(struct ethosu_file_priv *ethosu_priv,
+ struct ethosu_perfmon *perfmon)
+{
+ struct ethosu_device *ethosu = ethosu_priv->edev;
+
+ /* If the active perfmon is being destroyed, stop it first */
+ scoped_guard(mutex, ðosu->perfmon_state.lock) {
+ /* If the global perfmon is being destroyed, set it to NULL */
+ if (ethosu->global_perfmon == perfmon) {
+ ethosu->global_perfmon = NULL;
+ ethosu_perfmon_put(perfmon);
+ }
+
+ ethosu_perfmon_stop_locked(ethosu, perfmon, false);
+ }
+
+ ethosu_perfmon_put(perfmon);
+}
+
+void ethosu_perfmon_close_file(struct ethosu_file_priv *ethosu_priv)
+{
+ struct ethosu_perfmon *perfmon;
+ unsigned long id;
+
+ xa_for_each(ðosu_priv->perfmons, id, perfmon)
+ ethosu_perfmon_delete(ethosu_priv, perfmon);
+
+ xa_destroy(ðosu_priv->perfmons);
+}
+
+int ethosu_ioctl_perfmon_create(struct drm_device *dev, void *data,
+ struct drm_file *file_priv)
+{
+ struct ethosu_file_priv *ethosu_priv = file_priv->driver_priv;
+ struct drm_ethosu_perfmon_create *req = data;
+ struct ethosu_device *ethosu = to_ethosu_device(dev);
+ struct ethosu_perfmon *perfmon;
+ unsigned int i, event_max;
+ int ret;
+ u32 id;
+
+ /* Number of monitored counters cannot exceed HW limits. */
+ if (req->ncounters > ethosu->npu_info.pmu_counters)
Feel free to ignore this comment if I'm mistaken, but IIUC, pmu_counters
is the maximum number of counters *including* cycle counter. If
pmu_counters indeed includes cycle counter, I believe this should be a >=.
Best regards,
- Maíra