The firmware implements the GET_APP_HEALTH command to collect debug
information for a specific hardware context.

When a command times out, the driver issues this command to collect the
relevant debug information. User space tools can also retrieve this
information through the hardware context query IOCTL.

Signed-off-by: Lizhi Hou <[email protected]>
---
 drivers/accel/amdxdna/aie2_ctx.c      | 85 ++++++++++++++++++++++++---
 drivers/accel/amdxdna/aie2_message.c  | 41 +++++++++++++
 drivers/accel/amdxdna/aie2_msg_priv.h | 52 ++++++++++++++++
 drivers/accel/amdxdna/aie2_pci.c      | 14 +++++
 drivers/accel/amdxdna/aie2_pci.h      |  4 ++
 drivers/accel/amdxdna/amdxdna_ctx.c   |  6 +-
 drivers/accel/amdxdna/amdxdna_ctx.h   | 11 +++-
 drivers/accel/amdxdna/npu4_regs.c     |  3 +-
 8 files changed, 205 insertions(+), 11 deletions(-)

diff --git a/drivers/accel/amdxdna/aie2_ctx.c b/drivers/accel/amdxdna/aie2_ctx.c
index 779ac70d62d7..8b7375d13e28 100644
--- a/drivers/accel/amdxdna/aie2_ctx.c
+++ b/drivers/accel/amdxdna/aie2_ctx.c
@@ -29,6 +29,16 @@ MODULE_PARM_DESC(force_cmdlist, "Force use command list 
(Default true)");
 
 #define HWCTX_MAX_TIMEOUT      60000 /* milliseconds */
 
+struct aie2_ctx_health { /* AIE2 health data passed as err_data to amdxdna_cmd_set_error() on timeout */
+       struct amdxdna_ctx_health header; /* version and NPU generation tag */
+       u32 txn_op_idx; /* copied from app_health_report.txn_op_id */
+       u32 ctx_pc; /* copied from app_health_report.ctx_pc */
+       u32 fatal_error_type; /* copied from fatal_info.fatal_type */
+       u32 fatal_error_exception_type; /* copied from fatal_info.exception_type */
+       u32 fatal_error_exception_pc; /* copied from fatal_info.exception_pc */
+       u32 fatal_error_app_module; /* copied from fatal_info.app_module */
+};
+
 static void aie2_job_release(struct kref *ref)
 {
        struct amdxdna_sched_job *job;
@@ -39,6 +49,7 @@ static void aie2_job_release(struct kref *ref)
        wake_up(&job->hwctx->priv->job_free_wq);
        if (job->out_fence)
                dma_fence_put(job->out_fence);
+       kfree(job->priv);
        kfree(job);
 }
 
@@ -176,6 +187,50 @@ aie2_sched_notify(struct amdxdna_sched_job *job)
        aie2_job_put(job);
 }
 
+static void aie2_set_cmd_timeout(struct amdxdna_sched_job *job) /* set TIMEOUT state on the job's command; adds health data when job->priv holds a report */
+{
+       struct aie2_ctx_health *aie2_health __free(kfree) = NULL; /* released through __free(kfree) at scope exit */
+       struct amdxdna_dev *xdna = job->hwctx->client->xdna;
+       struct amdxdna_gem_obj *cmd_abo = job->cmd_bo;
+       struct app_health_report *report = job->priv; /* NULL when no report was attached to the job */
+       u32 fail_cmd_idx = 0;
+
+       if (!report) /* no report: plain timeout with index 0 and no health data */
+               goto set_timeout;
+
+       XDNA_ERR(xdna, "Firmware timeout state capture:"); /* dump the report fields to the log */
+       XDNA_ERR(xdna, "\tVersion: %d.%d", report->major, report->minor);
+       XDNA_ERR(xdna, "\tReport size: 0x%x", report->size);
+       XDNA_ERR(xdna, "\tContext ID: %d", report->context_id);
+       XDNA_ERR(xdna, "\tDPU PC: 0x%x", report->dpu_pc);
+       XDNA_ERR(xdna, "\tTXN OP ID: 0x%x", report->txn_op_id);
+       XDNA_ERR(xdna, "\tContext PC: 0x%x", report->ctx_pc);
+       XDNA_ERR(xdna, "\tFatal error type: 0x%x", 
report->fatal_info.fatal_type);
+       XDNA_ERR(xdna, "\tFatal error exception type: 0x%x", 
report->fatal_info.exception_type);
+       XDNA_ERR(xdna, "\tFatal error exception PC: 0x%x", 
report->fatal_info.exception_pc);
+       XDNA_ERR(xdna, "\tFatal error app module: 0x%x", 
report->fatal_info.app_module);
+       XDNA_ERR(xdna, "\tFatal error task ID: %d", 
report->fatal_info.task_index);
+       XDNA_ERR(xdna, "\tTimed out sub command ID: %d", report->run_list_id);
+
+       fail_cmd_idx = report->run_list_id; /* reported as the failing command index below */
+       aie2_health = kzalloc_obj(*aie2_health);
+       if (!aie2_health) /* allocation failed: still set TIMEOUT, just without health data */
+               goto set_timeout;
+
+       aie2_health->header.version = AMDXDNA_CMD_CTX_HEALTH_V1;
+       aie2_health->header.npu_gen = AMDXDNA_CMD_CTX_HEALTH_AIE2;
+       aie2_health->txn_op_idx = report->txn_op_id;
+       aie2_health->ctx_pc = report->ctx_pc;
+       aie2_health->fatal_error_type = report->fatal_info.fatal_type;
+       aie2_health->fatal_error_exception_type = 
report->fatal_info.exception_type;
+       aie2_health->fatal_error_exception_pc = report->fatal_info.exception_pc;
+       aie2_health->fatal_error_app_module = report->fatal_info.app_module;
+
+set_timeout: /* aie2_health may still be NULL when reached via goto */
+       amdxdna_cmd_set_error(cmd_abo, job, fail_cmd_idx, ERT_CMD_STATE_TIMEOUT,
+                             aie2_health, sizeof(*aie2_health)); /* sizeof() does not evaluate the pointer */
+}
+
 static int
 aie2_sched_resp_handler(void *handle, void __iomem *data, size_t size)
 {
@@ -187,13 +242,13 @@ aie2_sched_resp_handler(void *handle, void __iomem *data, 
size_t size)
        cmd_abo = job->cmd_bo;
 
        if (unlikely(job->job_timeout)) {
-               amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_TIMEOUT);
+               aie2_set_cmd_timeout(job);
                ret = -EINVAL;
                goto out;
        }
 
        if (unlikely(!data) || unlikely(size != sizeof(u32))) {
-               amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_ABORT);
+               amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_ABORT, 
NULL, 0);
                ret = -EINVAL;
                goto out;
        }
@@ -203,7 +258,7 @@ aie2_sched_resp_handler(void *handle, void __iomem *data, 
size_t size)
        if (status == AIE2_STATUS_SUCCESS)
                amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_COMPLETED);
        else
-               amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_ERROR);
+               amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_ERROR, 
NULL, 0);
 
 out:
        aie2_sched_notify(job);
@@ -237,21 +292,21 @@ aie2_sched_cmdlist_resp_handler(void *handle, void 
__iomem *data, size_t size)
        struct amdxdna_sched_job *job = handle;
        struct amdxdna_gem_obj *cmd_abo;
        struct amdxdna_dev *xdna;
+       u32 fail_cmd_idx = 0;
        u32 fail_cmd_status;
-       u32 fail_cmd_idx;
        u32 cmd_status;
        int ret = 0;
 
        cmd_abo = job->cmd_bo;
 
        if (unlikely(job->job_timeout)) {
-               amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_TIMEOUT);
+               aie2_set_cmd_timeout(job);
                ret = -EINVAL;
                goto out;
        }
 
        if (unlikely(!data) || unlikely(size != sizeof(u32) * 3)) {
-               amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_ABORT);
+               amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_ABORT, 
NULL, 0);
                ret = -EINVAL;
                goto out;
        }
@@ -271,10 +326,10 @@ aie2_sched_cmdlist_resp_handler(void *handle, void 
__iomem *data, size_t size)
                 fail_cmd_idx, fail_cmd_status);
 
        if (fail_cmd_status == AIE2_STATUS_SUCCESS) {
-               amdxdna_cmd_set_error(cmd_abo, job, fail_cmd_idx, 
ERT_CMD_STATE_ABORT);
+               amdxdna_cmd_set_error(cmd_abo, job, fail_cmd_idx, 
ERT_CMD_STATE_ABORT, NULL, 0);
                ret = -EINVAL;
        } else {
-               amdxdna_cmd_set_error(cmd_abo, job, fail_cmd_idx, 
ERT_CMD_STATE_ERROR);
+               amdxdna_cmd_set_error(cmd_abo, job, fail_cmd_idx, 
ERT_CMD_STATE_ERROR, NULL, 0);
        }
 
 out:
@@ -363,12 +418,26 @@ aie2_sched_job_timedout(struct drm_sched_job *sched_job)
 {
        struct amdxdna_sched_job *job = drm_job_to_xdna_job(sched_job);
        struct amdxdna_hwctx *hwctx = job->hwctx;
+       struct app_health_report *report;
        struct amdxdna_dev *xdna;
+       int ret;
 
        xdna = hwctx->client->xdna;
        trace_xdna_job(sched_job, hwctx->name, "job timedout", job->seq);
        job->job_timeout = true;
+
        mutex_lock(&xdna->dev_lock);
+       report = kzalloc_obj(*report);
+       if (!report)
+               goto reset_hwctx;
+
+       ret = aie2_query_app_health(xdna->dev_handle, hwctx->fw_ctx_id, report);
+       if (ret)
+               kfree(report);
+       else
+               job->priv = report;
+
+reset_hwctx:
        aie2_hwctx_stop(xdna, hwctx, sched_job);
 
        aie2_hwctx_restart(xdna, hwctx);
diff --git a/drivers/accel/amdxdna/aie2_message.c 
b/drivers/accel/amdxdna/aie2_message.c
index fa2f33c322d4..b764c7e8816a 100644
--- a/drivers/accel/amdxdna/aie2_message.c
+++ b/drivers/accel/amdxdna/aie2_message.c
@@ -1161,3 +1161,44 @@ int aie2_config_debug_bo(struct amdxdna_hwctx *hwctx, 
struct amdxdna_sched_job *
 
        return xdna_mailbox_send_msg(chann, &msg, TX_TIMEOUT);
 }
+
+int aie2_query_app_health(struct amdxdna_dev_hdl *ndev, u32 context_id,
+                         struct app_health_report *report) /* send GET_APP_HEALTH for context_id; fills *report and returns 0 on success */
+{
+       DECLARE_AIE2_MSG(get_app_health, MSG_OP_GET_APP_HEALTH);
+       struct amdxdna_dev *xdna = ndev->xdna;
+       struct app_health_report *buf; /* message buffer whose DMA address is sent in the request */
+       dma_addr_t dma_addr;
+       u32 buf_size;
+       int ret;
+
+       if (!AIE2_FEATURE_ON(ndev, AIE2_APP_HEALTH)) { /* feature bit off: bail out before allocating */
+               XDNA_DBG(xdna, "App health feature not supported");
+               return -EOPNOTSUPP;
+       }
+
+       buf_size = sizeof(*report);
+       buf = aie2_alloc_msg_buffer(ndev, &buf_size, &dma_addr); /* NOTE(review): buf_size is passed by pointer, presumably so the allocator can adjust it - confirm */
+       if (IS_ERR(buf)) {
+               XDNA_ERR(xdna, "Failed to allocate buffer for app health");
+               return PTR_ERR(buf);
+       }
+
+       req.buf_addr = dma_addr;
+       req.context_id = context_id;
+       req.buf_size = buf_size; /* size after allocation, not necessarily sizeof(*report) */
+
+       drm_clflush_virt_range(buf, sizeof(*report));
+       ret = aie2_send_mgmt_msg_wait(ndev, &msg);
+       if (ret) { /* on failure *report is left untouched */
+               XDNA_ERR(xdna, "Get app health failed, ret %d status 0x%x", 
ret, resp.status);
+               goto free_buf;
+       }
+
+       /* Copy the report to caller's buffer */
+       memcpy(report, buf, sizeof(*report));
+
+free_buf: /* reached on both the success and the failure path */
+       aie2_free_msg_buffer(ndev, buf_size, buf, dma_addr);
+       return ret;
+}
diff --git a/drivers/accel/amdxdna/aie2_msg_priv.h 
b/drivers/accel/amdxdna/aie2_msg_priv.h
index 728ef56f7f0a..f18e89a39e35 100644
--- a/drivers/accel/amdxdna/aie2_msg_priv.h
+++ b/drivers/accel/amdxdna/aie2_msg_priv.h
@@ -31,6 +31,7 @@ enum aie2_msg_opcode {
        MSG_OP_SET_RUNTIME_CONFIG          = 0x10A,
        MSG_OP_GET_RUNTIME_CONFIG          = 0x10B,
        MSG_OP_REGISTER_ASYNC_EVENT_MSG    = 0x10C,
+       MSG_OP_GET_APP_HEALTH              = 0x114,
        MSG_OP_MAX_DRV_OPCODE,
        MSG_OP_GET_PROTOCOL_VERSION        = 0x301,
        MSG_OP_MAX_OPCODE
@@ -451,4 +452,55 @@ struct config_debug_bo_req {
 struct config_debug_bo_resp {
        enum aie2_msg_status    status;
 } __packed;
+
+struct fatal_error_info { /* fatal error details embedded in struct app_health_report */
+       __u32 fatal_type;         /* Fatal error type */
+       __u32 exception_type;     /* Only valid if fatal_type is a specific value */
+       __u32 exception_argument; /* Argument based on exception type */
+       __u32 exception_pc;       /* Program Counter at the time of the exception */
+       __u32 app_module;         /* Error module name */
+       __u32 task_index;         /* Index of the task in which the error occurred */
+       __u32 reserved[128];
+};
+
+struct app_health_report { /* report layout read back from the GET_APP_HEALTH message buffer */
+       __u16 major; /* report version, major part */
+       __u16 minor; /* report version, minor part */
+       __u32 size; /* report size */
+       __u32 context_id; /* context the report describes */
+       /*
+        * Program Counter (PC) of the last initiated DPU opcode, as reported by the ERT
+        * application. Before execution begins or after successful completion, the value is set
+        * to UINT_MAX. If execution halts prematurely due to an error, this field retains the
+        * opcode's PC value.
+        * Note: To optimize performance, the ERT may simplify certain aspects of reporting.
+        * Proper interpretation requires familiarity with the implementation details.
+        */
+       __u32 dpu_pc;
+       /*
+        * Index of the last initiated TXN opcode.
+        * Before execution starts or after successful completion, the value is set to UINT_MAX.
+        * If execution halts prematurely due to an error, this field retains the opcode's ID.
+        * Note: To optimize performance, the ERT may simplify certain aspects of reporting.
+        * Proper interpretation requires familiarity with the implementation details.
+        */
+       __u32 txn_op_id;
+       /* The PC of the context at the time of the report */
+       __u32 ctx_pc;
+       struct fatal_error_info         fatal_info;
+       /* Index of the most recently executed run list entry. */
+       __u32 run_list_id;
+};
+
+struct get_app_health_req { /* request payload for MSG_OP_GET_APP_HEALTH */
+       __u32 context_id; /* context to query */
+       __u32 buf_size; /* size of the report buffer */
+       __u64 buf_addr; /* DMA address of the report buffer */
+} __packed;
+
+struct get_app_health_resp { /* response payload for MSG_OP_GET_APP_HEALTH */
+       enum aie2_msg_status status; /* logged by the caller when the request fails */
+       __u32 required_buffer_size; /* NOTE(review): presumably the size firmware needs for the report; not read by the driver code shown - confirm */
+       __u32 reserved[7];
+} __packed;
 #endif /* _AIE2_MSG_PRIV_H_ */
diff --git a/drivers/accel/amdxdna/aie2_pci.c b/drivers/accel/amdxdna/aie2_pci.c
index ddd3d82f3426..9e39bfe75971 100644
--- a/drivers/accel/amdxdna/aie2_pci.c
+++ b/drivers/accel/amdxdna/aie2_pci.c
@@ -846,7 +846,10 @@ static int aie2_hwctx_status_cb(struct amdxdna_hwctx 
*hwctx, void *arg)
        struct amdxdna_drm_hwctx_entry *tmp __free(kfree) = NULL;
        struct amdxdna_drm_get_array *array_args = arg;
        struct amdxdna_drm_hwctx_entry __user *buf;
+       struct app_health_report report;
+       struct amdxdna_dev_hdl *ndev;
        u32 size;
+       int ret;
 
        if (!array_args->num_element)
                return -EINVAL;
@@ -869,6 +872,17 @@ static int aie2_hwctx_status_cb(struct amdxdna_hwctx 
*hwctx, void *arg)
        tmp->latency = hwctx->qos.latency;
        tmp->frame_exec_time = hwctx->qos.frame_exec_time;
        tmp->state = AMDXDNA_HWCTX_STATE_ACTIVE;
+       ndev = hwctx->client->xdna->dev_handle;
+       ret = aie2_query_app_health(ndev, hwctx->fw_ctx_id, &report);
+       if (!ret) {
+               /* Fill in app health report fields */
+               tmp->txn_op_idx = report.txn_op_id;
+               tmp->ctx_pc = report.ctx_pc;
+               tmp->fatal_error_type = report.fatal_info.fatal_type;
+               tmp->fatal_error_exception_type = 
report.fatal_info.exception_type;
+               tmp->fatal_error_exception_pc = report.fatal_info.exception_pc;
+               tmp->fatal_error_app_module = report.fatal_info.app_module;
+       }
 
        buf = u64_to_user_ptr(array_args->buffer);
        size = min(sizeof(*tmp), array_args->element_size);
diff --git a/drivers/accel/amdxdna/aie2_pci.h b/drivers/accel/amdxdna/aie2_pci.h
index 885ae7e6bfc7..6cced8ab936b 100644
--- a/drivers/accel/amdxdna/aie2_pci.h
+++ b/drivers/accel/amdxdna/aie2_pci.h
@@ -10,6 +10,7 @@
 #include <linux/limits.h>
 #include <linux/semaphore.h>
 
+#include "aie2_msg_priv.h"
 #include "amdxdna_mailbox.h"
 
 #define AIE2_INTERVAL  20000   /* us */
@@ -261,6 +262,7 @@ enum aie2_fw_feature {
        AIE2_NPU_COMMAND,
        AIE2_PREEMPT,
        AIE2_TEMPORAL_ONLY,
+       AIE2_APP_HEALTH,
        AIE2_FEATURE_MAX
 };
 
@@ -341,6 +343,8 @@ int aie2_query_aie_version(struct amdxdna_dev_hdl *ndev, 
struct aie_version *ver
 int aie2_query_aie_metadata(struct amdxdna_dev_hdl *ndev, struct aie_metadata 
*metadata);
 int aie2_query_firmware_version(struct amdxdna_dev_hdl *ndev,
                                struct amdxdna_fw_ver *fw_ver);
+int aie2_query_app_health(struct amdxdna_dev_hdl *ndev, u32 context_id,
+                         struct app_health_report *report);
 int aie2_create_context(struct amdxdna_dev_hdl *ndev, struct amdxdna_hwctx 
*hwctx);
 int aie2_destroy_context(struct amdxdna_dev_hdl *ndev, struct amdxdna_hwctx 
*hwctx);
 int aie2_map_host_buf(struct amdxdna_dev_hdl *ndev, u32 context_id, u64 addr, 
u64 size);
diff --git a/drivers/accel/amdxdna/amdxdna_ctx.c 
b/drivers/accel/amdxdna/amdxdna_ctx.c
index 666dfd7b2a80..4b921715176d 100644
--- a/drivers/accel/amdxdna/amdxdna_ctx.c
+++ b/drivers/accel/amdxdna/amdxdna_ctx.c
@@ -137,7 +137,8 @@ u32 amdxdna_cmd_get_cu_idx(struct amdxdna_gem_obj *abo)
 
 int amdxdna_cmd_set_error(struct amdxdna_gem_obj *abo,
                          struct amdxdna_sched_job *job, u32 cmd_idx,
-                         enum ert_cmd_state error_state)
+                         enum ert_cmd_state error_state,
+                         void *err_data, size_t size)
 {
        struct amdxdna_client *client = job->hwctx->client;
        struct amdxdna_cmd *cmd = abo->mem.kva;
@@ -156,6 +157,9 @@ int amdxdna_cmd_set_error(struct amdxdna_gem_obj *abo,
        }
 
        memset(cmd->data, 0xff, abo->mem.size - sizeof(*cmd));
+       if (err_data)
+               memcpy(cmd->data, err_data, min(size, abo->mem.size - 
sizeof(*cmd)));
+
        if (cc)
                amdxdna_gem_put_obj(abo);
 
diff --git a/drivers/accel/amdxdna/amdxdna_ctx.h 
b/drivers/accel/amdxdna/amdxdna_ctx.h
index fbdf9d000871..c067688755af 100644
--- a/drivers/accel/amdxdna/amdxdna_ctx.h
+++ b/drivers/accel/amdxdna/amdxdna_ctx.h
@@ -72,6 +72,13 @@ struct amdxdna_cmd_preempt_data {
        u32 prop_args[];    /* properties and regular kernel arguments */
 };
 
+#define AMDXDNA_CMD_CTX_HEALTH_V1      1 /* value for amdxdna_ctx_health.version */
+#define AMDXDNA_CMD_CTX_HEALTH_AIE2    0 /* value for amdxdna_ctx_health.npu_gen */
+struct amdxdna_ctx_health { /* common header placed in front of device-specific health data */
+       u32 version; /* AMDXDNA_CMD_CTX_HEALTH_* version value */
+       u32 npu_gen; /* AMDXDNA_CMD_CTX_HEALTH_* generation value */
+};
+
 /* Exec buffer command header format */
 #define AMDXDNA_CMD_STATE              GENMASK(3, 0)
 #define AMDXDNA_CMD_EXTRA_CU_MASK      GENMASK(11, 10)
@@ -136,6 +143,7 @@ struct amdxdna_sched_job {
        u64                     seq;
        struct amdxdna_drv_cmd  *drv_cmd;
        struct amdxdna_gem_obj  *cmd_bo;
+       void                    *priv;
        size_t                  bo_cnt;
        struct drm_gem_object   *bos[] __counted_by(bo_cnt);
 };
@@ -169,7 +177,8 @@ void *amdxdna_cmd_get_payload(struct amdxdna_gem_obj *abo, 
u32 *size);
 u32 amdxdna_cmd_get_cu_idx(struct amdxdna_gem_obj *abo);
 int amdxdna_cmd_set_error(struct amdxdna_gem_obj *abo,
                          struct amdxdna_sched_job *job, u32 cmd_idx,
-                         enum ert_cmd_state error_state);
+                         enum ert_cmd_state error_state,
+                         void *err_data, size_t size);
 
 void amdxdna_sched_job_cleanup(struct amdxdna_sched_job *job);
 void amdxdna_hwctx_remove_all(struct amdxdna_client *client);
diff --git a/drivers/accel/amdxdna/npu4_regs.c 
b/drivers/accel/amdxdna/npu4_regs.c
index ce25eef5fc34..d44fe8fd6cb0 100644
--- a/drivers/accel/amdxdna/npu4_regs.c
+++ b/drivers/accel/amdxdna/npu4_regs.c
@@ -93,7 +93,8 @@ const struct aie2_fw_feature_tbl npu4_fw_feature_table[] = {
        { .features = BIT_U64(AIE2_NPU_COMMAND), .major = 6, .min_minor = 15 },
        { .features = BIT_U64(AIE2_PREEMPT), .major = 6, .min_minor = 12 },
        { .features = BIT_U64(AIE2_TEMPORAL_ONLY), .major = 6, .min_minor = 12 
},
-       { .features = GENMASK_ULL(AIE2_TEMPORAL_ONLY, AIE2_NPU_COMMAND), .major 
= 7 },
+       { .features = BIT_U64(AIE2_APP_HEALTH), .major = 6, .min_minor = 18 },
+       { .features = GENMASK_ULL(AIE2_APP_HEALTH, AIE2_NPU_COMMAND), .major = 
7 },
        { 0 }
 };
 
-- 
2.34.1

Reply via email to