When there is a hardware error, the NPU firmware notifies the host through
a mailbox message. The message includes details of the error, such as the
tile and column indexes where the error occurred.

The driver starts a thread to handle the NPU error message. The thread
stops the clients which are using the column where the error occurred.
Then the driver resets that column.

Co-developed-by: Min Ma <min...@amd.com>
Signed-off-by: Min Ma <min...@amd.com>
Signed-off-by: Lizhi Hou <lizhi....@amd.com>
---
 drivers/accel/amdxdna/Makefile       |   1 +
 drivers/accel/amdxdna/aie2_error.c   | 349 +++++++++++++++++++++++++++
 drivers/accel/amdxdna/aie2_message.c |  19 ++
 drivers/accel/amdxdna/aie2_pci.c     |  32 +++
 drivers/accel/amdxdna/aie2_pci.h     |   9 +
 5 files changed, 410 insertions(+)
 create mode 100644 drivers/accel/amdxdna/aie2_error.c

diff --git a/drivers/accel/amdxdna/Makefile b/drivers/accel/amdxdna/Makefile
index 7040e23d0ec6..69d97f678a09 100644
--- a/drivers/accel/amdxdna/Makefile
+++ b/drivers/accel/amdxdna/Makefile
@@ -2,6 +2,7 @@
 
 amdxdna-y := \
        aie2_ctx.o \
+       aie2_error.o \
        aie2_message.o \
        aie2_pci.o \
        aie2_psp.o \
diff --git a/drivers/accel/amdxdna/aie2_error.c 
b/drivers/accel/amdxdna/aie2_error.c
new file mode 100644
index 000000000000..6ad8449bd1d3
--- /dev/null
+++ b/drivers/accel/amdxdna/aie2_error.c
@@ -0,0 +1,349 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2023-2024, Advanced Micro Devices, Inc.
+ */
+
+#include <linux/kthread.h>
+#include <linux/kernel.h>
+#include <drm/drm_cache.h>
+#include "aie2_msg_priv.h"
+#include "aie2_pci.h"
+
+/*
+ * One async event slot. Each slot describes a slice of the shared DMA buffer
+ * that is registered with the firmware, plus the work item that processes
+ * the firmware's response for that slice.
+ *
+ * ndev: device handle the slot belongs to
+ * resp: type and status copied from the mailbox response; status equal to
+ *       MAX_AIE2_STATUS_CODE means no response is pending
+ * wq:   workqueue the work item is queued on (shared by all slots)
+ * work: work item, handled by aie2_error_worker()
+ * buf:  CPU address of this slot's slice of the shared buffer
+ * addr: DMA address of the same slice
+ * size: size of the slice in bytes
+ */
+struct async_event {
+       struct amdxdna_dev_hdl          *ndev;
+       struct async_event_msg_resp     resp;
+       struct workqueue_struct         *wq;
+       struct work_struct              work;
+       u8                              *buf;
+       dma_addr_t                      addr;
+       u32                             size;
+};
+
+/*
+ * Container for all async event slots of a device.
+ *
+ * wq:        ordered workqueue shared by every slot
+ * buf:       CPU address of the DMA buffer that is carved into per-slot slices
+ * addr:      DMA address of that buffer
+ * size:      total size of the buffer in bytes
+ * event_cnt: number of entries in event[]
+ * event:     the slots themselves
+ */
+struct async_events {
+       struct workqueue_struct         *wq;
+       u8                              *buf;
+       dma_addr_t                      addr;
+       u32                             size;
+       u32                             event_cnt;
+       struct async_event              event[] __counted_by(event_cnt);
+};
+
+/*
+ * The enums, structs and lookup tables below are ported from the XAIE util
+ * header file.
+ *
+ * This data is defined by the AIE device and is used to decode error messages
+ * from the device.
+ */
+
+/*
+ * Module type carried in each error record. aie_get_error_category() uses it
+ * to choose which event lookup table to search.
+ */
+enum aie_module_type {
+       AIE_MEM_MOD = 0,
+       AIE_CORE_MOD,
+       AIE_PL_MOD,
+};
+
+/*
+ * Error categories that the event lookup tables in this file map hardware
+ * event IDs to.
+ */
+enum aie_error_category {
+       AIE_ERROR_SATURATION = 0,
+       AIE_ERROR_FP,
+       AIE_ERROR_STREAM,
+       AIE_ERROR_ACCESS,
+       AIE_ERROR_BUS,
+       AIE_ERROR_INSTRUCTION,
+       AIE_ERROR_ECC,
+       AIE_ERROR_LOCK,
+       AIE_ERROR_DMA,
+       AIE_ERROR_MEM_PARITY,
+       /* Unknown is not from XAIE, added for better category */
+       AIE_ERROR_UNKNOWN,
+};
+
+/*
+ * One error record in the payload of an async event buffer.
+ *
+ * row, col: position of the tile that raised the event
+ * mod_type: module within the tile that raised the event
+ * event_id: hardware event ID, decoded through the lookup tables below
+ *
+ * Don't pack, unless XAIE side changed
+ */
+struct aie_error {
+       u8                      row;
+       u8                      col;
+       enum aie_module_type    mod_type;
+       u8                      event_id;
+};
+
+/*
+ * Header found at the start of an async event buffer, followed by the error
+ * records.
+ *
+ * err_cnt:  number of records in payload[]; checked against the buffer size
+ *           before the records are read
+ * ret_code: return code reported alongside the errors (only logged here)
+ * rsvd:     not referenced by this file
+ * payload:  the error records
+ */
+struct aie_err_info {
+       u32                     err_cnt;
+       u32                     ret_code;
+       u32                     rsvd;
+       struct aie_error        payload[] __counted_by(err_cnt);
+};
+
+/* One lookup table entry: a hardware event ID and the category it maps to. */
+struct aie_event_category {
+       u8                      event_id;
+       enum aie_error_category category;
+};
+
+/* Initializer for one struct aie_event_category entry: { event_id, category } */
+#define EVENT_CATEGORY(id, cat) { id, cat }
+/* Events for AIE_MEM_MOD errors reported from any row other than row 1 */
+static const struct aie_event_category aie_ml_mem_event_cat[] = {
+       EVENT_CATEGORY(88U,  AIE_ERROR_ECC),
+       EVENT_CATEGORY(90U,  AIE_ERROR_ECC),
+       EVENT_CATEGORY(91U,  AIE_ERROR_MEM_PARITY),
+       EVENT_CATEGORY(92U,  AIE_ERROR_MEM_PARITY),
+       EVENT_CATEGORY(93U,  AIE_ERROR_MEM_PARITY),
+       EVENT_CATEGORY(94U,  AIE_ERROR_MEM_PARITY),
+       EVENT_CATEGORY(95U,  AIE_ERROR_MEM_PARITY),
+       EVENT_CATEGORY(96U,  AIE_ERROR_MEM_PARITY),
+       EVENT_CATEGORY(97U,  AIE_ERROR_DMA),
+       EVENT_CATEGORY(98U,  AIE_ERROR_DMA),
+       EVENT_CATEGORY(99U,  AIE_ERROR_DMA),
+       EVENT_CATEGORY(100U, AIE_ERROR_DMA),
+       EVENT_CATEGORY(101U, AIE_ERROR_LOCK),
+};
+
+/* Events for AIE_CORE_MOD errors */
+static const struct aie_event_category aie_ml_core_event_cat[] = {
+       EVENT_CATEGORY(55U, AIE_ERROR_ACCESS),
+       EVENT_CATEGORY(56U, AIE_ERROR_STREAM),
+       EVENT_CATEGORY(57U, AIE_ERROR_STREAM),
+       EVENT_CATEGORY(58U, AIE_ERROR_BUS),
+       EVENT_CATEGORY(59U, AIE_ERROR_INSTRUCTION),
+       EVENT_CATEGORY(60U, AIE_ERROR_ACCESS),
+       EVENT_CATEGORY(62U, AIE_ERROR_ECC),
+       EVENT_CATEGORY(64U, AIE_ERROR_ECC),
+       EVENT_CATEGORY(65U, AIE_ERROR_ACCESS),
+       EVENT_CATEGORY(66U, AIE_ERROR_ACCESS),
+       EVENT_CATEGORY(67U, AIE_ERROR_LOCK),
+       EVENT_CATEGORY(70U, AIE_ERROR_INSTRUCTION),
+       EVENT_CATEGORY(71U, AIE_ERROR_STREAM),
+       EVENT_CATEGORY(72U, AIE_ERROR_BUS),
+};
+
+/* Events for AIE_MEM_MOD errors reported from row 1 */
+static const struct aie_event_category aie_ml_mem_tile_event_cat[] = {
+       EVENT_CATEGORY(130U, AIE_ERROR_ECC),
+       EVENT_CATEGORY(132U, AIE_ERROR_ECC),
+       EVENT_CATEGORY(133U, AIE_ERROR_DMA),
+       EVENT_CATEGORY(134U, AIE_ERROR_DMA),
+       EVENT_CATEGORY(135U, AIE_ERROR_STREAM),
+       EVENT_CATEGORY(136U, AIE_ERROR_STREAM),
+       EVENT_CATEGORY(137U, AIE_ERROR_STREAM),
+       EVENT_CATEGORY(138U, AIE_ERROR_BUS),
+       EVENT_CATEGORY(139U, AIE_ERROR_LOCK),
+};
+
+/* Events for AIE_PL_MOD errors */
+static const struct aie_event_category aie_ml_shim_tile_event_cat[] = {
+       EVENT_CATEGORY(64U, AIE_ERROR_BUS),
+       EVENT_CATEGORY(65U, AIE_ERROR_STREAM),
+       EVENT_CATEGORY(66U, AIE_ERROR_STREAM),
+       EVENT_CATEGORY(67U, AIE_ERROR_BUS),
+       EVENT_CATEGORY(68U, AIE_ERROR_BUS),
+       EVENT_CATEGORY(69U, AIE_ERROR_BUS),
+       EVENT_CATEGORY(70U, AIE_ERROR_BUS),
+       EVENT_CATEGORY(71U, AIE_ERROR_BUS),
+       EVENT_CATEGORY(72U, AIE_ERROR_DMA),
+       EVENT_CATEGORY(73U, AIE_ERROR_DMA),
+       EVENT_CATEGORY(74U, AIE_ERROR_LOCK),
+};
+
+/*
+ * Map a hardware event ID to an error category.
+ *
+ * row:      tile row the event came from; only consulted for AIE_MEM_MOD,
+ *           where row 1 selects the mem-tile table
+ * event_id: hardware event ID to look up
+ * mod_type: module that raised the event; selects the lookup table
+ *
+ * Returns the matching category, or AIE_ERROR_UNKNOWN when the module type is
+ * not recognized or the event ID is not present in the selected table.
+ */
+static enum aie_error_category
+aie_get_error_category(u8 row, u8 event_id, enum aie_module_type mod_type)
+{
+       const struct aie_event_category *table;
+       int table_len;
+       int idx;
+
+       if (mod_type == AIE_PL_MOD) {
+               table = aie_ml_shim_tile_event_cat;
+               table_len = ARRAY_SIZE(aie_ml_shim_tile_event_cat);
+       } else if (mod_type == AIE_CORE_MOD) {
+               table = aie_ml_core_event_cat;
+               table_len = ARRAY_SIZE(aie_ml_core_event_cat);
+       } else if (mod_type == AIE_MEM_MOD && row == 1) {
+               table = aie_ml_mem_tile_event_cat;
+               table_len = ARRAY_SIZE(aie_ml_mem_tile_event_cat);
+       } else if (mod_type == AIE_MEM_MOD) {
+               table = aie_ml_mem_event_cat;
+               table_len = ARRAY_SIZE(aie_ml_mem_event_cat);
+       } else {
+               return AIE_ERROR_UNKNOWN;
+       }
+
+       /* Linear scan: the tables are tiny */
+       for (idx = 0; idx < table_len; idx++) {
+               if (table[idx].event_id == event_id)
+                       return table[idx].category;
+       }
+
+       return AIE_ERROR_UNKNOWN;
+}
+
+/*
+ * Log every error record and collect the columns the records refer to.
+ *
+ * ndev:     device handle, used for logging
+ * err_info: array of struct aie_error records
+ * num_err:  number of records in err_info
+ *
+ * Returns a bitmap with bit N set for every column N that reported an error.
+ * Scanning stops at the first record whose column does not fit in the 32-bit
+ * bitmap; columns collected before that record are still returned.
+ */
+static u32 aie2_error_backtrack(struct amdxdna_dev_hdl *ndev, void *err_info, u32 num_err)
+{
+       struct aie_error *errs = err_info;
+       u32 err_col = 0; /* bitmap holds columns 0-31 only */
+       u32 i;
+
+       /* Get err column bitmap */
+       for (i = 0; i < num_err; i++) {
+               struct aie_error *err = &errs[i];
+               enum aie_error_category cat;
+
+               cat = aie_get_error_category(err->row, err->event_id, err->mod_type);
+               XDNA_ERR(ndev->xdna, "Row: %d, Col: %d, module %d, event ID %d, category %d",
+                        err->row, err->col, err->mod_type,
+                        err->event_id, cat);
+
+               if (err->col >= 32) {
+                       XDNA_WARN(ndev->xdna, "Invalid column number");
+                       break;
+               }
+
+               /* Unsigned constant: shifting a signed 1 into bit 31 is undefined */
+               err_col |= (1U << err->col);
+       }
+
+       return err_col;
+}
+
+/*
+ * Mailbox notify callback for an async event registration.
+ *
+ * handle: the struct async_event that was passed as the message handle
+ * data:   response payload, read as struct async_event_msg_resp; may be NULL,
+ *         in which case the stored response is left untouched
+ * size:   not used by this callback
+ *
+ * Copies the response type and status into the event, then queues the event's
+ * work item on its workqueue. Always returns 0.
+ */
+static int aie2_error_async_cb(void *handle, const u32 *data, size_t size)
+{
+       struct async_event_msg_resp *resp;
+       struct async_event *e = handle;
+
+       if (data) {
+               resp = (struct async_event_msg_resp *)data;
+               e->resp.type = resp->type;
+               wmb(); /* Store status last so that no lock is needed here */
+               e->resp.status = resp->status;
+       }
+       /* Queued even without data; the worker checks the stored status itself */
+       queue_work(e->wq, &e->work);
+       return 0;
+}
+
+/*
+ * Register one async event slot with the firmware.
+ *
+ * Flushes the CPU cache for the slot's buffer, then sends the registration
+ * message with the slot as handle and aie2_error_async_cb() as callback.
+ * Returns the result of aie2_register_asyn_event_msg().
+ */
+static int aie2_error_event_send(struct async_event *e)
+{
+       drm_clflush_virt_range(e->buf, e->size); /* device can access */
+       return aie2_register_asyn_event_msg(e->ndev, e->addr, e->size, e,
+                                           aie2_error_async_cb);
+}
+
+/*
+ * Work handler for one async event slot: decode the error report found in the
+ * slot's buffer, log it, and register the slot with the firmware again.
+ *
+ * err_work: work item embedded in the struct async_event being processed
+ *
+ * Does nothing when no response is pending, i.e. when the stored status is
+ * still MAX_AIE2_STATUS_CODE.
+ */
+static void aie2_error_worker(struct work_struct *err_work)
+{
+       struct aie_err_info *info;
+       struct amdxdna_dev *xdna;
+       struct async_event *e;
+       u32 max_err;
+       u32 err_col;
+
+       e = container_of(err_work, struct async_event, work);
+
+       xdna = e->ndev->xdna;
+
+       if (e->resp.status == MAX_AIE2_STATUS_CODE)
+               return;
+
+       /* Mark the response as consumed before processing it */
+       e->resp.status = MAX_AIE2_STATUS_CODE;
+
+       /* Never dump past the end of this slot's buffer */
+       print_hex_dump_debug("AIE error: ", DUMP_PREFIX_OFFSET, 16, 4,
+                            e->buf, min_t(size_t, e->size, 0x100), false);
+
+       info = (struct aie_err_info *)e->buf;
+       XDNA_DBG(xdna, "Error count %u return code %u", info->err_cnt, info->ret_code);
+
+       /* err_cnt is read from the buffer: make sure the records fit inside it */
+       max_err = (e->size - sizeof(*info)) / sizeof(struct aie_error);
+       if (unlikely(info->err_cnt > max_err)) {
+               WARN_ONCE(1, "Error count too large %u\n", info->err_cnt);
+               return;
+       }
+       err_col = aie2_error_backtrack(e->ndev, info->payload, info->err_cnt);
+       if (!err_col) {
+               XDNA_WARN(xdna, "Did not get error column");
+               return;
+       }
+
+       /*
+        * NOTE(review): the two early returns above skip the re-registration
+        * below, so the slot is not handed back to the firmware on those
+        * paths - confirm this is intended.
+        */
+       mutex_lock(&xdna->dev_lock);
+       /* Re-send this event to firmware */
+       if (aie2_error_event_send(e))
+               XDNA_WARN(xdna, "Unable to register async event");
+       mutex_unlock(&xdna->dev_lock);
+}
+
+/*
+ * Register every async event slot of the device with the firmware.
+ *
+ * A warning is raised if xdna->dev_lock is not held by the time of the call.
+ *
+ * Returns 0 when all slots were registered, otherwise the error of the first
+ * slot that failed; the remaining slots are not attempted.
+ */
+int aie2_error_async_events_send(struct amdxdna_dev_hdl *ndev)
+{
+       struct async_events *events = ndev->async_events;
+       struct amdxdna_dev *xdna = ndev->xdna;
+       int idx;
+
+       drm_WARN_ON(&xdna->ddev, !mutex_is_locked(&xdna->dev_lock));
+       for (idx = 0; idx < events->event_cnt; idx++) {
+               int err = aie2_error_event_send(&events->event[idx]);
+
+               if (err)
+                       return err;
+       }
+
+       return 0;
+}
+
+/*
+ * Release everything set up by aie2_error_async_events_alloc(): the
+ * workqueue, the shared DMA buffer and the slot array.
+ *
+ * Safe to call when nothing is allocated. On return ndev->async_events is
+ * NULL, which is the state other code tests for before using the events.
+ */
+void aie2_error_async_events_free(struct amdxdna_dev_hdl *ndev)
+{
+       struct amdxdna_dev *xdna = ndev->xdna;
+       struct async_events *events;
+
+       events = ndev->async_events;
+       if (!events)
+               return;
+
+       /* Clear the handle first so no later caller sees a dangling pointer */
+       ndev->async_events = NULL;
+
+       destroy_workqueue(events->wq);
+       dma_free_noncoherent(xdna->ddev.dev, events->size, events->buf,
+                            events->addr, DMA_FROM_DEVICE);
+       kfree(events);
+}
+
+/*
+ * Allocate the async event slots for a device: one slot per column, each
+ * backed by an ASYNC_BUF_SIZE slice of a single DMA buffer, all sharing one
+ * ordered workqueue.
+ *
+ * On success the result is stored in ndev->async_events and 0 is returned.
+ * Returns -ENOMEM if any allocation fails; nothing is left allocated then.
+ */
+int aie2_error_async_events_alloc(struct amdxdna_dev_hdl *ndev)
+{
+       struct amdxdna_dev *xdna = ndev->xdna;
+       u32 nr_slots = ndev->total_col;
+       u32 buf_bytes = ASYNC_BUF_SIZE * nr_slots;
+       struct async_events *evts;
+       u32 offset = 0;
+       int idx;
+
+       evts = kzalloc(struct_size(evts, event, nr_slots), GFP_KERNEL);
+       if (!evts)
+               return -ENOMEM;
+
+       evts->buf = dma_alloc_noncoherent(xdna->ddev.dev, buf_bytes, &evts->addr,
+                                         DMA_FROM_DEVICE, GFP_KERNEL);
+       if (!evts->buf)
+               goto free_events;
+
+       evts->size = buf_bytes;
+       /* Set the count before touching event[], which is bounded by it */
+       evts->event_cnt = nr_slots;
+
+       evts->wq = alloc_ordered_workqueue("async_wq", 0);
+       if (!evts->wq)
+               goto free_buf;
+
+       /* Hand each slot its own slice of the shared buffer */
+       for (idx = 0; idx < evts->event_cnt; idx++, offset += ASYNC_BUF_SIZE) {
+               struct async_event *slot = &evts->event[idx];
+
+               slot->ndev = ndev;
+               slot->wq = evts->wq;
+               slot->buf = &evts->buf[offset];
+               slot->addr = evts->addr + offset;
+               slot->size = ASYNC_BUF_SIZE;
+               slot->resp.status = MAX_AIE2_STATUS_CODE;
+               INIT_WORK(&slot->work, aie2_error_worker);
+       }
+
+       ndev->async_events = evts;
+
+       XDNA_DBG(xdna, "Async event count %d, buf total size 0x%x",
+                evts->event_cnt, evts->size);
+       return 0;
+
+free_buf:
+       dma_free_noncoherent(xdna->ddev.dev, evts->size, evts->buf,
+                            evts->addr, DMA_FROM_DEVICE);
+free_events:
+       kfree(evts);
+       return -ENOMEM;
+}
diff --git a/drivers/accel/amdxdna/aie2_message.c 
b/drivers/accel/amdxdna/aie2_message.c
index 549d33c85afe..3f1e15a0c622 100644
--- a/drivers/accel/amdxdna/aie2_message.c
+++ b/drivers/accel/amdxdna/aie2_message.c
@@ -292,6 +292,25 @@ int aie2_map_host_buf(struct amdxdna_dev_hdl *ndev, u32 
context_id, u64 addr, u6
        return 0;
 }
 
+/*
+ * Send a MSG_OP_REGISTER_ASYNC_EVENT_MSG request that registers a host buffer
+ * with the firmware.
+ *
+ * ndev:   device handle; the request is sent on its management channel
+ * addr:   DMA address of the buffer
+ * size:   size of the buffer in bytes
+ * handle: opaque cookie stored in the mailbox message alongside cb
+ * cb:     notify callback stored in the mailbox message
+ *
+ * Returns the result of xdna_mailbox_send_msg().
+ */
+int aie2_register_asyn_event_msg(struct amdxdna_dev_hdl *ndev, dma_addr_t addr, u32 size,
+                                void *handle, int (*cb)(void*, const u32 *, size_t))
+{
+       struct async_event_msg_req req = { 0 };
+       struct xdna_mailbox_msg msg = {
+               .send_data = (u8 *)&req,
+               .send_size = sizeof(req),
+               .handle = handle,
+               .opcode = MSG_OP_REGISTER_ASYNC_EVENT_MSG,
+               .notify_cb = cb,
+       };
+
+       req.buf_addr = addr;
+       req.buf_size = size;
+
+       /* dma_addr_t width depends on the config: print with %pad, by reference */
+       XDNA_DBG(ndev->xdna, "Register addr %pad size 0x%x", &addr, size);
+       return xdna_mailbox_send_msg(ndev->mgmt_chann, &msg, TX_TIMEOUT);
+}
+
 int aie2_config_cu(struct amdxdna_hwctx *hwctx)
 {
        struct mailbox_channel *chann = hwctx->priv->mbox_chann;
diff --git a/drivers/accel/amdxdna/aie2_pci.c b/drivers/accel/amdxdna/aie2_pci.c
index e52428d2c092..7e20b3e13063 100644
--- a/drivers/accel/amdxdna/aie2_pci.c
+++ b/drivers/accel/amdxdna/aie2_pci.c
@@ -169,6 +169,15 @@ static int aie2_mgmt_fw_init(struct amdxdna_dev_hdl *ndev)
                return ret;
        }
 
+       if (!ndev->async_events)
+               return 0;
+
+       ret = aie2_error_async_events_send(ndev);
+       if (ret) {
+               XDNA_ERR(ndev->xdna, "Send async events failed");
+               return ret;
+       }
+
        return 0;
 }
 
@@ -463,9 +472,30 @@ static int aie2_init(struct amdxdna_dev *xdna)
                goto stop_hw;
        }
 
+       ret = aie2_error_async_events_alloc(ndev);
+       if (ret) {
+               XDNA_ERR(xdna, "Allocate async events failed, ret %d", ret);
+               goto stop_hw;
+       }
+
+       ret = aie2_error_async_events_send(ndev);
+       if (ret) {
+               XDNA_ERR(xdna, "Send async events failed, ret %d", ret);
+               goto async_event_free;
+       }
+
+       /* Issue a command to make sure firmware handled async events */
+       ret = aie2_query_firmware_version(ndev, &ndev->xdna->fw_ver);
+       if (ret) {
+               XDNA_ERR(xdna, "Re-query firmware version failed");
+               goto async_event_free;
+       }
+
        release_firmware(fw);
        return 0;
 
+async_event_free:
+       aie2_error_async_events_free(ndev);
 stop_hw:
        aie2_hw_stop(xdna);
 disable_sva:
@@ -481,8 +511,10 @@ static int aie2_init(struct amdxdna_dev *xdna)
 static void aie2_fini(struct amdxdna_dev *xdna)
 {
        struct pci_dev *pdev = to_pci_dev(xdna->ddev.dev);
+       struct amdxdna_dev_hdl *ndev = xdna->dev_handle;
 
        aie2_hw_stop(xdna);
+       aie2_error_async_events_free(ndev);
        iommu_dev_disable_feature(&pdev->dev, IOMMU_DEV_FEAT_SVA);
        pci_free_irq_vectors(pdev);
 }
diff --git a/drivers/accel/amdxdna/aie2_pci.h b/drivers/accel/amdxdna/aie2_pci.h
index 113395ab5e33..30dde8376504 100644
--- a/drivers/accel/amdxdna/aie2_pci.h
+++ b/drivers/accel/amdxdna/aie2_pci.h
@@ -173,6 +173,7 @@ struct amdxdna_dev_hdl {
        /* Mailbox and the management channel */
        struct mailbox                  *mbox;
        struct mailbox_channel          *mgmt_chann;
+       struct async_events             *async_events;
 };
 
 #define DEFINE_BAR_OFFSET(reg_name, bar, reg_addr) \
@@ -213,6 +214,12 @@ struct psp_device *aie2m_psp_create(struct device *dev, 
struct psp_config *conf)
 int aie2_psp_start(struct psp_device *psp);
 void aie2_psp_stop(struct psp_device *psp);
 
+/* aie2_error.c */
+int aie2_error_async_events_alloc(struct amdxdna_dev_hdl *ndev);
+void aie2_error_async_events_free(struct amdxdna_dev_hdl *ndev);
+int aie2_error_async_events_send(struct amdxdna_dev_hdl *ndev);
+/* NOTE(review): no definition of this function is visible in this patch - confirm it is still needed */
+int aie2_error_async_msg_thread(void *data);
+
 /* aie2_message.c */
 int aie2_suspend_fw(struct amdxdna_dev_hdl *ndev);
 int aie2_resume_fw(struct amdxdna_dev_hdl *ndev);
@@ -227,6 +234,8 @@ int aie2_query_firmware_version(struct amdxdna_dev_hdl 
*ndev,
 int aie2_create_context(struct amdxdna_dev_hdl *ndev, struct amdxdna_hwctx 
*hwctx);
 int aie2_destroy_context(struct amdxdna_dev_hdl *ndev, struct amdxdna_hwctx 
*hwctx);
 int aie2_map_host_buf(struct amdxdna_dev_hdl *ndev, u32 context_id, u64 addr, 
u64 size);
+int aie2_register_asyn_event_msg(struct amdxdna_dev_hdl *ndev, dma_addr_t 
addr, u32 size,
+                                void *handle, int (*cb)(void*, const u32 *, 
size_t));
 int aie2_config_cu(struct amdxdna_hwctx *hwctx);
 int aie2_execbuf(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job,
                 int (*notify_cb)(void *, const u32 *, size_t));
-- 
2.34.1

Reply via email to