The firmware implements the GET_APP_HEALTH command to collect debug information for a specific hardware context.
When a command times out, the driver issues this command to collect the relevant debug information. User space tools can also retrieve this information through the hardware context query IOCTL. Signed-off-by: Lizhi Hou <[email protected]> --- drivers/accel/amdxdna/aie2_ctx.c | 85 ++++++++++++++++++++++++--- drivers/accel/amdxdna/aie2_message.c | 41 +++++++++++++ drivers/accel/amdxdna/aie2_msg_priv.h | 52 ++++++++++++++++ drivers/accel/amdxdna/aie2_pci.c | 14 +++++ drivers/accel/amdxdna/aie2_pci.h | 4 ++ drivers/accel/amdxdna/amdxdna_ctx.c | 6 +- drivers/accel/amdxdna/amdxdna_ctx.h | 11 +++- drivers/accel/amdxdna/npu4_regs.c | 3 +- 8 files changed, 205 insertions(+), 11 deletions(-) diff --git a/drivers/accel/amdxdna/aie2_ctx.c b/drivers/accel/amdxdna/aie2_ctx.c index 779ac70d62d7..8b7375d13e28 100644 --- a/drivers/accel/amdxdna/aie2_ctx.c +++ b/drivers/accel/amdxdna/aie2_ctx.c @@ -29,6 +29,16 @@ MODULE_PARM_DESC(force_cmdlist, "Force use command list (Default true)"); #define HWCTX_MAX_TIMEOUT 60000 /* milliseconds */ +struct aie2_ctx_health { + struct amdxdna_ctx_health header; + u32 txn_op_idx; + u32 ctx_pc; + u32 fatal_error_type; + u32 fatal_error_exception_type; + u32 fatal_error_exception_pc; + u32 fatal_error_app_module; +}; + static void aie2_job_release(struct kref *ref) { struct amdxdna_sched_job *job; @@ -39,6 +49,7 @@ static void aie2_job_release(struct kref *ref) wake_up(&job->hwctx->priv->job_free_wq); if (job->out_fence) dma_fence_put(job->out_fence); + kfree(job->priv); kfree(job); } @@ -176,6 +187,50 @@ aie2_sched_notify(struct amdxdna_sched_job *job) aie2_job_put(job); } +static void aie2_set_cmd_timeout(struct amdxdna_sched_job *job) +{ + struct aie2_ctx_health *aie2_health __free(kfree) = NULL; + struct amdxdna_dev *xdna = job->hwctx->client->xdna; + struct amdxdna_gem_obj *cmd_abo = job->cmd_bo; + struct app_health_report *report = job->priv; + u32 fail_cmd_idx = 0; + + if (!report) + goto set_timeout; + + XDNA_ERR(xdna, "Firmware timeout state capture:"); + XDNA_ERR(xdna, "\tVersion: %d.%d", report->major, report->minor); + XDNA_ERR(xdna, "\tReport size: 0x%x", report->size); + XDNA_ERR(xdna, "\tContext ID: %d", report->context_id); + XDNA_ERR(xdna, "\tDPU PC: 0x%x", report->dpu_pc); + XDNA_ERR(xdna, "\tTXN OP ID: 0x%x", report->txn_op_id); + XDNA_ERR(xdna, "\tContext PC: 0x%x", report->ctx_pc); + XDNA_ERR(xdna, "\tFatal error type: 0x%x", report->fatal_info.fatal_type); + XDNA_ERR(xdna, "\tFatal error exception type: 0x%x", report->fatal_info.exception_type); + XDNA_ERR(xdna, "\tFatal error exception PC: 0x%x", report->fatal_info.exception_pc); + XDNA_ERR(xdna, "\tFatal error app module: 0x%x", report->fatal_info.app_module); + XDNA_ERR(xdna, "\tFatal error task ID: %d", report->fatal_info.task_index); + XDNA_ERR(xdna, "\tTimed out sub command ID: %d", report->run_list_id); + + fail_cmd_idx = report->run_list_id; + aie2_health = kzalloc_obj(*aie2_health); + if (!aie2_health) + goto set_timeout; + + aie2_health->header.version = AMDXDNA_CMD_CTX_HEALTH_V1; + aie2_health->header.npu_gen = AMDXDNA_CMD_CTX_HEALTH_AIE2; + aie2_health->txn_op_idx = report->txn_op_id; + aie2_health->ctx_pc = report->ctx_pc; + aie2_health->fatal_error_type = report->fatal_info.fatal_type; + aie2_health->fatal_error_exception_type = report->fatal_info.exception_type; + aie2_health->fatal_error_exception_pc = report->fatal_info.exception_pc; + aie2_health->fatal_error_app_module = report->fatal_info.app_module; + +set_timeout: + amdxdna_cmd_set_error(cmd_abo, job, fail_cmd_idx, ERT_CMD_STATE_TIMEOUT, + aie2_health, sizeof(*aie2_health)); +} + static int aie2_sched_resp_handler(void *handle, void __iomem *data, size_t size) { @@ -187,13 +242,13 @@ aie2_sched_resp_handler(void *handle, void __iomem *data, size_t size) cmd_abo = job->cmd_bo; if (unlikely(job->job_timeout)) { - amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_TIMEOUT); + aie2_set_cmd_timeout(job); ret = -EINVAL; goto out; } if (unlikely(!data) || unlikely(size != sizeof(u32))) { - amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_ABORT); + amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_ABORT, NULL, 0); ret = -EINVAL; goto out; } @@ -203,7 +258,7 @@ aie2_sched_resp_handler(void *handle, void __iomem *data, size_t size) if (status == AIE2_STATUS_SUCCESS) amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_COMPLETED); else - amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_ERROR); + amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_ERROR, NULL, 0); out: aie2_sched_notify(job); @@ -237,21 +292,21 @@ aie2_sched_cmdlist_resp_handler(void *handle, void __iomem *data, size_t size) struct amdxdna_sched_job *job = handle; struct amdxdna_gem_obj *cmd_abo; struct amdxdna_dev *xdna; + u32 fail_cmd_idx = 0; u32 fail_cmd_status; - u32 fail_cmd_idx; u32 cmd_status; int ret = 0; cmd_abo = job->cmd_bo; if (unlikely(job->job_timeout)) { - amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_TIMEOUT); + aie2_set_cmd_timeout(job); ret = -EINVAL; goto out; } if (unlikely(!data) || unlikely(size != sizeof(u32) * 3)) { - amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_ABORT); + amdxdna_cmd_set_error(cmd_abo, job, 0, ERT_CMD_STATE_ABORT, NULL, 0); ret = -EINVAL; goto out; } @@ -271,10 +326,10 @@ aie2_sched_cmdlist_resp_handler(void *handle, void __iomem *data, size_t size) fail_cmd_idx, fail_cmd_status); if (fail_cmd_status == AIE2_STATUS_SUCCESS) { - amdxdna_cmd_set_error(cmd_abo, job, fail_cmd_idx, ERT_CMD_STATE_ABORT); + amdxdna_cmd_set_error(cmd_abo, job, fail_cmd_idx, ERT_CMD_STATE_ABORT, NULL, 0); ret = -EINVAL; } else { - amdxdna_cmd_set_error(cmd_abo, job, fail_cmd_idx, ERT_CMD_STATE_ERROR); + amdxdna_cmd_set_error(cmd_abo, job, fail_cmd_idx, ERT_CMD_STATE_ERROR, NULL, 0); } out: @@ -363,12 +418,26 @@ aie2_sched_job_timedout(struct drm_sched_job *sched_job) { struct amdxdna_sched_job *job = drm_job_to_xdna_job(sched_job); struct amdxdna_hwctx *hwctx = job->hwctx; + struct app_health_report *report; struct amdxdna_dev *xdna; + int ret; xdna = hwctx->client->xdna; trace_xdna_job(sched_job, hwctx->name, "job timedout", job->seq); job->job_timeout = true; + mutex_lock(&xdna->dev_lock); + report = kzalloc_obj(*report); + if (!report) + goto reset_hwctx; + + ret = aie2_query_app_health(xdna->dev_handle, hwctx->fw_ctx_id, report); + if (ret) + kfree(report); + else + job->priv = report; + +reset_hwctx: aie2_hwctx_stop(xdna, hwctx, sched_job); aie2_hwctx_restart(xdna, hwctx); diff --git a/drivers/accel/amdxdna/aie2_message.c b/drivers/accel/amdxdna/aie2_message.c index fa2f33c322d4..b764c7e8816a 100644 --- a/drivers/accel/amdxdna/aie2_message.c +++ b/drivers/accel/amdxdna/aie2_message.c @@ -1161,3 +1161,44 @@ int aie2_config_debug_bo(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job * return xdna_mailbox_send_msg(chann, &msg, TX_TIMEOUT); } + +int aie2_query_app_health(struct amdxdna_dev_hdl *ndev, u32 context_id, + struct app_health_report *report) +{ + DECLARE_AIE2_MSG(get_app_health, MSG_OP_GET_APP_HEALTH); + struct amdxdna_dev *xdna = ndev->xdna; + struct app_health_report *buf; + dma_addr_t dma_addr; + u32 buf_size; + int ret; + + if (!AIE2_FEATURE_ON(ndev, AIE2_APP_HEALTH)) { + XDNA_DBG(xdna, "App health feature not supported"); + return -EOPNOTSUPP; + } + + buf_size = sizeof(*report); + buf = aie2_alloc_msg_buffer(ndev, &buf_size, &dma_addr); + if (IS_ERR(buf)) { + XDNA_ERR(xdna, "Failed to allocate buffer for app health"); + return PTR_ERR(buf); + } + + req.buf_addr = dma_addr; + req.context_id = context_id; + req.buf_size = buf_size; + + drm_clflush_virt_range(buf, sizeof(*report)); + ret = aie2_send_mgmt_msg_wait(ndev, &msg); + if (ret) { + XDNA_ERR(xdna, "Get app health failed, ret %d status 0x%x", ret, resp.status); + goto free_buf; + } + + /* Copy the report to caller's buffer */ + memcpy(report, buf, sizeof(*report)); + +free_buf: + aie2_free_msg_buffer(ndev, buf_size, buf, dma_addr); + return ret; +} diff --git a/drivers/accel/amdxdna/aie2_msg_priv.h b/drivers/accel/amdxdna/aie2_msg_priv.h index 728ef56f7f0a..f18e89a39e35 100644 --- a/drivers/accel/amdxdna/aie2_msg_priv.h +++ b/drivers/accel/amdxdna/aie2_msg_priv.h @@ -31,6 +31,7 @@ enum aie2_msg_opcode { MSG_OP_SET_RUNTIME_CONFIG = 0x10A, MSG_OP_GET_RUNTIME_CONFIG = 0x10B, MSG_OP_REGISTER_ASYNC_EVENT_MSG = 0x10C, + MSG_OP_GET_APP_HEALTH = 0x114, MSG_OP_MAX_DRV_OPCODE, MSG_OP_GET_PROTOCOL_VERSION = 0x301, MSG_OP_MAX_OPCODE @@ -451,4 +452,55 @@ struct config_debug_bo_req { struct config_debug_bo_resp { enum aie2_msg_status status; } __packed; + +struct fatal_error_info { + __u32 fatal_type; /* Fatal error type */ + __u32 exception_type; /* Only valid if fatal_type is a specific value */ + __u32 exception_argument; /* Argument based on exception type */ + __u32 exception_pc; /* Program Counter at the time of the exception */ + __u32 app_module; /* Error module name */ + __u32 task_index; /* Index of the task in which the error occurred */ + __u32 reserved[128]; +}; + +struct app_health_report { + __u16 major; + __u16 minor; + __u32 size; + __u32 context_id; + /* + * Program Counter (PC) of the last initiated DPU opcode, as reported by the ERT + * application. Before execution begins or after successful completion, the value is set + * to UINT_MAX. If execution halts prematurely due to an error, this field retains the + * opcode's PC value. + * Note: To optimize performance, the ERT may simplify certain aspects of reporting. + * Proper interpretation requires familiarity with the implementation details. + */ + __u32 dpu_pc; + /* + * Index of the last initiated TXN opcode. + * Before execution starts or after successful completion, the value is set to UINT_MAX. + * If execution halts prematurely due to an error, this field retains the opcode's ID. + * Note: To optimize performance, the ERT may simplify certain aspects of reporting. + * Proper interpretation requires familiarity with the implementation details. + */ + __u32 txn_op_id; + /* The PC of the context at the time of the report */ + __u32 ctx_pc; + struct fatal_error_info fatal_info; + /* Index of the most recently executed run list entry. */ + __u32 run_list_id; +}; + +struct get_app_health_req { + __u32 context_id; + __u32 buf_size; + __u64 buf_addr; +} __packed; + +struct get_app_health_resp { + enum aie2_msg_status status; + __u32 required_buffer_size; + __u32 reserved[7]; +} __packed; #endif /* _AIE2_MSG_PRIV_H_ */ diff --git a/drivers/accel/amdxdna/aie2_pci.c b/drivers/accel/amdxdna/aie2_pci.c index ddd3d82f3426..9e39bfe75971 100644 --- a/drivers/accel/amdxdna/aie2_pci.c +++ b/drivers/accel/amdxdna/aie2_pci.c @@ -846,7 +846,10 @@ static int aie2_hwctx_status_cb(struct amdxdna_hwctx *hwctx, void *arg) struct amdxdna_drm_hwctx_entry *tmp __free(kfree) = NULL; struct amdxdna_drm_get_array *array_args = arg; struct amdxdna_drm_hwctx_entry __user *buf; + struct app_health_report report; + struct amdxdna_dev_hdl *ndev; u32 size; + int ret; if (!array_args->num_element) return -EINVAL; @@ -869,6 +872,17 @@ static int aie2_hwctx_status_cb(struct amdxdna_hwctx *hwctx, void *arg) tmp->latency = hwctx->qos.latency; tmp->frame_exec_time = hwctx->qos.frame_exec_time; tmp->state = AMDXDNA_HWCTX_STATE_ACTIVE; + ndev = hwctx->client->xdna->dev_handle; + ret = aie2_query_app_health(ndev, hwctx->fw_ctx_id, &report); + if (!ret) { + /* Fill in app health report fields */ + tmp->txn_op_idx = report.txn_op_id; + tmp->ctx_pc = report.ctx_pc; + tmp->fatal_error_type = report.fatal_info.fatal_type; + tmp->fatal_error_exception_type = report.fatal_info.exception_type; + tmp->fatal_error_exception_pc = report.fatal_info.exception_pc; + tmp->fatal_error_app_module = report.fatal_info.app_module; + } buf = u64_to_user_ptr(array_args->buffer); size = min(sizeof(*tmp), array_args->element_size); diff --git a/drivers/accel/amdxdna/aie2_pci.h b/drivers/accel/amdxdna/aie2_pci.h index 885ae7e6bfc7..6cced8ab936b 100644 --- a/drivers/accel/amdxdna/aie2_pci.h +++ b/drivers/accel/amdxdna/aie2_pci.h @@ -10,6 +10,7 @@ #include <linux/limits.h> #include <linux/semaphore.h> +#include "aie2_msg_priv.h" #include "amdxdna_mailbox.h" #define AIE2_INTERVAL 20000 /* us */ @@ -261,6 +262,7 @@ enum aie2_fw_feature { AIE2_NPU_COMMAND, AIE2_PREEMPT, AIE2_TEMPORAL_ONLY, + AIE2_APP_HEALTH, AIE2_FEATURE_MAX }; @@ -341,6 +343,8 @@ int aie2_query_aie_version(struct amdxdna_dev_hdl *ndev, struct aie_version *ver int aie2_query_aie_metadata(struct amdxdna_dev_hdl *ndev, struct aie_metadata *metadata); int aie2_query_firmware_version(struct amdxdna_dev_hdl *ndev, struct amdxdna_fw_ver *fw_ver); +int aie2_query_app_health(struct amdxdna_dev_hdl *ndev, u32 context_id, + struct app_health_report *report); int aie2_create_context(struct amdxdna_dev_hdl *ndev, struct amdxdna_hwctx *hwctx); int aie2_destroy_context(struct amdxdna_dev_hdl *ndev, struct amdxdna_hwctx *hwctx); int aie2_map_host_buf(struct amdxdna_dev_hdl *ndev, u32 context_id, u64 addr, u64 size); diff --git a/drivers/accel/amdxdna/amdxdna_ctx.c b/drivers/accel/amdxdna/amdxdna_ctx.c index 666dfd7b2a80..4b921715176d 100644 --- a/drivers/accel/amdxdna/amdxdna_ctx.c +++ b/drivers/accel/amdxdna/amdxdna_ctx.c @@ -137,7 +137,8 @@ u32 amdxdna_cmd_get_cu_idx(struct amdxdna_gem_obj *abo) int amdxdna_cmd_set_error(struct amdxdna_gem_obj *abo, struct amdxdna_sched_job *job, u32 cmd_idx, - enum ert_cmd_state error_state) + enum ert_cmd_state error_state, + void *err_data, size_t size) { struct amdxdna_client *client = job->hwctx->client; struct amdxdna_cmd *cmd = abo->mem.kva; @@ -156,6 +157,9 @@ int amdxdna_cmd_set_error(struct amdxdna_gem_obj *abo, } memset(cmd->data, 0xff, abo->mem.size - sizeof(*cmd)); + if (err_data) + memcpy(cmd->data, err_data, min(size, abo->mem.size - sizeof(*cmd))); + if (cc) amdxdna_gem_put_obj(abo); diff --git a/drivers/accel/amdxdna/amdxdna_ctx.h b/drivers/accel/amdxdna/amdxdna_ctx.h index fbdf9d000871..c067688755af 100644 --- a/drivers/accel/amdxdna/amdxdna_ctx.h +++ b/drivers/accel/amdxdna/amdxdna_ctx.h @@ -72,6 +72,13 @@ struct amdxdna_cmd_preempt_data { u32 prop_args[]; /* properties and regular kernel arguments */ }; +#define AMDXDNA_CMD_CTX_HEALTH_V1 1 +#define AMDXDNA_CMD_CTX_HEALTH_AIE2 0 +struct amdxdna_ctx_health { + u32 version; + u32 npu_gen; +}; + /* Exec buffer command header format */ #define AMDXDNA_CMD_STATE GENMASK(3, 0) #define AMDXDNA_CMD_EXTRA_CU_MASK GENMASK(11, 10) @@ -136,6 +143,7 @@ struct amdxdna_sched_job { u64 seq; struct amdxdna_drv_cmd *drv_cmd; struct amdxdna_gem_obj *cmd_bo; + void *priv; size_t bo_cnt; struct drm_gem_object *bos[] __counted_by(bo_cnt); }; @@ -169,7 +177,8 @@ void *amdxdna_cmd_get_payload(struct amdxdna_gem_obj *abo, u32 *size); u32 amdxdna_cmd_get_cu_idx(struct amdxdna_gem_obj *abo); int amdxdna_cmd_set_error(struct amdxdna_gem_obj *abo, struct amdxdna_sched_job *job, u32 cmd_idx, - enum ert_cmd_state error_state); + enum ert_cmd_state error_state, + void *err_data, size_t size); void amdxdna_sched_job_cleanup(struct amdxdna_sched_job *job); void amdxdna_hwctx_remove_all(struct amdxdna_client *client); diff --git a/drivers/accel/amdxdna/npu4_regs.c b/drivers/accel/amdxdna/npu4_regs.c index ce25eef5fc34..d44fe8fd6cb0 100644 --- a/drivers/accel/amdxdna/npu4_regs.c +++ b/drivers/accel/amdxdna/npu4_regs.c @@ -93,7 +93,8 @@ const struct aie2_fw_feature_tbl npu4_fw_feature_table[] = { { .features = BIT_U64(AIE2_NPU_COMMAND), .major = 6, .min_minor = 15 }, { .features = BIT_U64(AIE2_PREEMPT), .major = 6, .min_minor = 12 }, { .features = BIT_U64(AIE2_TEMPORAL_ONLY), .major = 6, .min_minor = 12 }, - { .features = GENMASK_ULL(AIE2_TEMPORAL_ONLY, AIE2_NPU_COMMAND), .major = 7 }, + { .features = BIT_U64(AIE2_APP_HEALTH), .major = 6, .min_minor = 18 }, + { .features = GENMASK_ULL(AIE2_APP_HEALTH, AIE2_NPU_COMMAND), .major = 7 }, { 0 } }; -- 2.34.1
