Enabled support for error handling. Added the error types and subtypes
reported by the ML firmware. Added support for retrieving the
device-specific error code and message for a completed ML request.

Signed-off-by: Srikanth Yalavarthi <syalavar...@marvell.com>
---
 drivers/ml/cnxk/cn10k_ml_dev.c |   4 +-
 drivers/ml/cnxk/cn10k_ml_dev.h |  50 +++++++++++++-
 drivers/ml/cnxk/cn10k_ml_ops.c | 117 ++++++++++++++++++++++++++++++---
 drivers/ml/cnxk/cn10k_ml_ops.h |   2 +
 4 files changed, 160 insertions(+), 13 deletions(-)

diff --git a/drivers/ml/cnxk/cn10k_ml_dev.c b/drivers/ml/cnxk/cn10k_ml_dev.c
index 837f006bf0..76ed853a3c 100644
--- a/drivers/ml/cnxk/cn10k_ml_dev.c
+++ b/drivers/ml/cnxk/cn10k_ml_dev.c
@@ -261,7 +261,7 @@ cn10k_ml_fw_load_asim(struct cn10k_ml_fw *fw)
        } while (plt_tsc_cycles() < timeout_cycle);
 
        /* Check firmware load status, clean-up and exit on failure. */
-       if ((!timeout) && (fw->req->result.error_code == 0)) {
+       if ((!timeout) && (fw->req->result.error_code.u64 == 0)) {
                cn10k_ml_fw_print_info(fw);
        } else {
                /* Set ML to disable new jobs */
@@ -452,7 +452,7 @@ cn10k_ml_fw_load_cn10ka(struct cn10k_ml_fw *fw, void 
*buffer, uint64_t size)
        } while (plt_tsc_cycles() < timeout_cycle);
 
        /* Check firmware load status, clean-up and exit on failure. */
-       if ((!timeout) && (fw->req->result.error_code == 0)) {
+       if ((!timeout) && (fw->req->result.error_code.u64 == 0)) {
                cn10k_ml_fw_print_info(fw);
        } else {
                /* Set ML to disable new jobs */
diff --git a/drivers/ml/cnxk/cn10k_ml_dev.h b/drivers/ml/cnxk/cn10k_ml_dev.h
index 8f6bc24370..604a200e26 100644
--- a/drivers/ml/cnxk/cn10k_ml_dev.h
+++ b/drivers/ml/cnxk/cn10k_ml_dev.h
@@ -64,6 +64,54 @@ enum cn10k_ml_dev_state {
        ML_CN10K_DEV_STATE_CLOSED
 };
 
+/* Error types enumeration.
+ *
+ * The value is carried in the 4-bit etype field of union cn10k_ml_error_code.
+ */
+enum cn10k_ml_error_etype {
+       /* 0x0 */ ML_ETYPE_NO_ERROR = 0, /* No error */
+       /* 0x1 */ ML_ETYPE_FW_NONFATAL,  /* Firmware non-fatal error */
+       /* 0x2 */ ML_ETYPE_HW_NONFATAL,  /* Hardware non-fatal error */
+       /* 0x3 */ ML_ETYPE_HW_FATAL,     /* Hardware fatal error */
+       /* 0x4 */ ML_ETYPE_HW_WARNING,   /* Hardware warning */
+       /* 0x5 */ ML_ETYPE_DRIVER,       /* Driver specific error, set when a dequeued job timed out */
+       /* 0x6 */ ML_ETYPE_UNKNOWN,      /* Unknown error, preset on a request at enqueue time */
+};
+
+/* Firmware non-fatal error sub-type.
+ *
+ * Values are used directly as indices into the sub-type message table in
+ * cn10k_ml_ops.c, so the order here must match the order of that table.
+ */
+enum cn10k_ml_error_stype_fw_nf {
+       /* 0x0 */ ML_FW_ERR_NOERR = 0,           /* No error */
+       /* 0x1 */ ML_FW_ERR_UNLOAD_ID_NOT_FOUND, /* Model ID not found during unload */
+       /* 0x2 */ ML_FW_ERR_LOAD_LUT_OVERFLOW,   /* Lookup table overflow at load */
+       /* 0x3 */ ML_FW_ERR_ID_IN_USE,           /* Model ID already in use */
+       /* 0x4 */ ML_FW_ERR_INVALID_TILEMASK,    /* Invalid OCM tilemask */
+       /* 0x5 */ ML_FW_ERR_RUN_LUT_OVERFLOW,    /* Lookup table overflow at run */
+       /* 0x6 */ ML_FW_ERR_RUN_ID_NOT_FOUND,    /* Model ID not found during run */
+       /* 0x7 */ ML_FW_ERR_COMMAND_NOTSUP,      /* Unsupported command */
+       /* 0x8 */ ML_FW_ERR_DDR_ADDR_RANGE,      /* DDR address out of range */
+       /* 0x9 */ ML_FW_ERR_NUM_BATCHES_INVALID, /* Invalid number of batches */
+       /* 0xA */ ML_FW_ERR_INSSYNC_TIMEOUT,     /* INS sync timeout */
+};
+
+/* Driver error sub-type.
+ *
+ * Filled in by cn10k_ml_result_update() when the error type is
+ * ML_ETYPE_DRIVER: EXCEPTION if either ML_SCRATCH_EXCEPTION_SP_C0/C1 register
+ * reads non-zero, otherwise FW_ERROR if either ML_CORE_INT_LO/HI register
+ * reads non-zero, otherwise UNKNOWN.
+ */
+enum cn10k_ml_error_stype_driver {
+       /* 0x0 */ ML_DRIVER_ERR_NOERR = 0, /* No error */
+       /* 0x1 */ ML_DRIVER_ERR_UNKNOWN,   /* Unable to determine error sub-type */
+       /* 0x2 */ ML_DRIVER_ERR_EXCEPTION, /* Firmware exception */
+       /* 0x3 */ ML_DRIVER_ERR_FW_ERROR,  /* Unknown firmware error */
+};
+
+/* Error structure.
+ *
+ * A 64-bit error code that can be read either as a whole word (u64) or as
+ * its type / sub-type bit-fields (s).
+ */
+union cn10k_ml_error_code {
+       struct {
+               /* Error type, one of enum cn10k_ml_error_etype */
+               uint64_t etype : 4;

+               /* Error sub-type, interpretation depends on etype */
+               uint64_t stype : 60;
+       } s;

+       /* WORD 0: the complete error code, zero is treated as success */
+       uint64_t u64;
+};
+
 /* Firmware stats */
 struct cn10k_ml_fw_stats {
        /* Firmware start cycle */
@@ -82,7 +130,7 @@ struct cn10k_ml_fw_stats {
 /* Result structure */
 struct cn10k_ml_result {
        /* Job error code */
-       uint64_t error_code;
+       union cn10k_ml_error_code error_code;
 
        /* Firmware stats */
        struct cn10k_ml_fw_stats stats;
diff --git a/drivers/ml/cnxk/cn10k_ml_ops.c b/drivers/ml/cnxk/cn10k_ml_ops.c
index 87778c37bb..23a9ca4ff2 100644
--- a/drivers/ml/cnxk/cn10k_ml_ops.c
+++ b/drivers/ml/cnxk/cn10k_ml_ops.c
@@ -23,6 +23,49 @@
 #define ML_FLAGS_POLL_COMPL BIT(0)
 #define ML_FLAGS_SSO_COMPL  BIT(1)
 
+/* Error message length: size in bytes of the name/msg buffers in the error databases */
+#define ERRMSG_LEN 32
+
+/* Error type database.
+ *
+ * Maps each error type to a printable name. cn10k_ml_op_error_get() indexes
+ * this table directly with the etype value, so entries must stay in enum
+ * order.
+ */
+static const struct cn10k_ml_etype_db {
+       enum cn10k_ml_error_etype etype;
+       char name[ERRMSG_LEN];
+} ml_etype_db[] = {
+       {ML_ETYPE_NO_ERROR, "NO_ERROR"},        {ML_ETYPE_FW_NONFATAL, 
"FW_NON_FATAL"},
+       {ML_ETYPE_HW_NONFATAL, "HW_NON_FATAL"}, {ML_ETYPE_HW_FATAL, "HW_FATAL"},
+       {ML_ETYPE_HW_WARNING, "HW_WARNING"},    {ML_ETYPE_DRIVER, 
"DRIVER_ERROR"},
+       {ML_ETYPE_UNKNOWN, "UNKNOWN_ERROR"},
+};
+
+/* Hardware non-fatal error subtype database.
+ *
+ * Indexed directly by the stype value in cn10k_ml_op_error_get(), so entries
+ * must stay in enum order.
+ *
+ * NOTE(review): the table is named *_hw_nf and is consulted for
+ * ML_ETYPE_HW_NONFATAL, but its entries are the firmware non-fatal
+ * (ML_FW_ERR_*) sub-types - confirm which error type this is meant to serve.
+ */
+static const struct cn10k_ml_stype_db_hw_nf {
+       enum cn10k_ml_error_stype_fw_nf stype;
+       char msg[ERRMSG_LEN];
+} ml_stype_db_hw_nf[] = {
+       {ML_FW_ERR_NOERR, "NO ERROR"},
+       {ML_FW_ERR_UNLOAD_ID_NOT_FOUND, "UNLOAD MODEL ID NOT FOUND"},
+       {ML_FW_ERR_LOAD_LUT_OVERFLOW, "LOAD LUT OVERFLOW"},
+       {ML_FW_ERR_ID_IN_USE, "MODEL ID IN USE"},
+       {ML_FW_ERR_INVALID_TILEMASK, "INVALID TILEMASK"},
+       {ML_FW_ERR_RUN_LUT_OVERFLOW, "RUN LUT OVERFLOW"},
+       {ML_FW_ERR_RUN_ID_NOT_FOUND, "RUN MODEL ID NOT FOUND"},
+       {ML_FW_ERR_COMMAND_NOTSUP, "COMMAND NOT SUPPORTED"},
+       {ML_FW_ERR_DDR_ADDR_RANGE, "DDR ADDRESS OUT OF RANGE"},
+       {ML_FW_ERR_NUM_BATCHES_INVALID, "INVALID BATCHES"},
+       {ML_FW_ERR_INSSYNC_TIMEOUT, "INSSYNC TIMEOUT"},
+};
+
+/* Driver error subtype database.
+ *
+ * Maps each driver error sub-type to a printable message. Indexed directly
+ * by the stype value in cn10k_ml_op_error_get(), so entries must stay in
+ * enum order.
+ */
+static const struct cn10k_ml_stype_db_driver {
+       enum cn10k_ml_error_stype_driver stype;
+       char msg[ERRMSG_LEN];
+} ml_stype_db_driver[] = {
+       {ML_DRIVER_ERR_NOERR, "NO ERROR"},
+       {ML_DRIVER_ERR_UNKNOWN, "UNKNOWN ERROR"},
+       {ML_DRIVER_ERR_EXCEPTION, "FW EXCEPTION"},
+       {ML_DRIVER_ERR_FW_ERROR, "UNKNOWN FIRMWARE ERROR"},
+};
+
 static void
 print_line(FILE *fp, int len)
 {
@@ -474,6 +517,7 @@ cn10k_ml_dev_configure(struct rte_ml_dev *dev, const struct 
rte_ml_dev_config *c
 
        dev->enqueue_burst = cn10k_ml_enqueue_burst;
        dev->dequeue_burst = cn10k_ml_dequeue_burst;
+       dev->op_error_get = cn10k_ml_op_error_get;
 
        mldev->nb_models_loaded = 0;
        mldev->state = ML_CN10K_DEV_STATE_CONFIGURED;
@@ -758,7 +802,7 @@ cn10k_ml_dev_selftest(struct rte_ml_dev *dev)
        if (timeout) {
                ret = -ETIME;
        } else {
-               if (req->result.error_code != 0)
+               if (req->result.error_code.u64 != 0)
                        ret = -1;
        }
 
@@ -936,7 +980,7 @@ cn10k_ml_model_start(struct rte_ml_dev *dev, uint16_t 
model_id)
        /* Prepare JD */
        req = model->req;
        cn10k_ml_prep_sp_job_descriptor(mldev, model, req, 
ML_CN10K_JOB_TYPE_MODEL_START);
-       req->result.error_code = 0x0;
+       req->result.error_code.u64 = 0x0;
        req->result.user_ptr = NULL;
 
        plt_write64(ML_CN10K_POLL_JOB_START, &req->status);
@@ -1017,7 +1061,7 @@ cn10k_ml_model_start(struct rte_ml_dev *dev, uint16_t 
model_id)
 
        if (job_dequeued) {
                if (plt_read64(&req->status) == ML_CN10K_POLL_JOB_FINISH) {
-                       if (req->result.error_code == 0)
+                       if (req->result.error_code.u64 == 0)
                                ret = 0;
                        else
                                ret = -1;
@@ -1079,7 +1123,7 @@ cn10k_ml_model_stop(struct rte_ml_dev *dev, uint16_t 
model_id)
        /* Prepare JD */
        req = model->req;
        cn10k_ml_prep_sp_job_descriptor(mldev, model, req, 
ML_CN10K_JOB_TYPE_MODEL_STOP);
-       req->result.error_code = 0x0;
+       req->result.error_code.u64 = 0x0;
        req->result.user_ptr = NULL;
 
        plt_write64(ML_CN10K_POLL_JOB_START, &req->status);
@@ -1134,7 +1178,7 @@ cn10k_ml_model_stop(struct rte_ml_dev *dev, uint16_t 
model_id)
 
        if (job_dequeued) {
                if (plt_read64(&req->status) == ML_CN10K_POLL_JOB_FINISH) {
-                       if (req->result.error_code == 0x0)
+                       if (req->result.error_code.u64 == 0x0)
                                ret = 0;
                        else
                                ret = -1;
@@ -1426,12 +1470,30 @@ cn10k_ml_result_update(struct rte_ml_dev *dev, int 
qp_id, struct cn10k_ml_result
        PLT_SET_USED(dev);
        PLT_SET_USED(qp_id);
 
-       op->impl_opaque = result->error_code;
+       struct cn10k_ml_dev *mldev;
 
-       if (likely(result->error_code == 0))
+       if (likely(result->error_code.u64 == 0)) {
+               op->impl_opaque = result->error_code.u64;
                op->status = RTE_ML_OP_STATUS_SUCCESS;
-       else
+       } else {
+               /* Handle driver error */
+               if (result->error_code.s.etype == ML_ETYPE_DRIVER) {
+                       mldev = dev->data->dev_private;
+
+                       /* Check for exception */
+                       if ((roc_ml_reg_read64(&mldev->roc, 
ML_SCRATCH_EXCEPTION_SP_C0) != 0) ||
+                           (roc_ml_reg_read64(&mldev->roc, 
ML_SCRATCH_EXCEPTION_SP_C1) != 0))
+                               result->error_code.s.stype = 
ML_DRIVER_ERR_EXCEPTION;
+                       else if ((roc_ml_reg_read64(&mldev->roc, 
ML_CORE_INT_LO) != 0) ||
+                                (roc_ml_reg_read64(&mldev->roc, 
ML_CORE_INT_HI) != 0))
+                               result->error_code.s.stype = 
ML_DRIVER_ERR_FW_ERROR;
+                       else
+                               result->error_code.s.stype = 
ML_DRIVER_ERR_UNKNOWN;
+               }
+
+               op->impl_opaque = result->error_code.u64;
                op->status = RTE_ML_OP_STATUS_ERROR;
+       }
 
        op->user_ptr = result->user_ptr;
 }
@@ -1468,6 +1530,7 @@ cn10k_ml_enqueue_burst(struct rte_ml_dev *dev, uint16_t 
qp_id, struct rte_ml_op
        cn10k_ml_prep_fp_job_descriptor(dev, req, op);
 
        memset(&req->result, 0, sizeof(struct cn10k_ml_result));
+       req->result.error_code.s.etype = ML_ETYPE_UNKNOWN;
        req->result.user_ptr = op->user_ptr;
 
        plt_write64(ML_CN10K_POLL_JOB_START, &req->status);
@@ -1515,8 +1578,12 @@ cn10k_ml_dequeue_burst(struct rte_ml_dev *dev, uint16_t 
qp_id, struct rte_ml_op
 dequeue_req:
        req = &queue->reqs[tail];
        status = plt_read64(&req->status);
-       if (unlikely(status != ML_CN10K_POLL_JOB_FINISH))
-               goto empty_or_active;
+       if (unlikely(status != ML_CN10K_POLL_JOB_FINISH)) {
+               if (plt_tsc_cycles() < req->timeout)
+                       goto empty_or_active;
+               else /* Timeout, set indication of driver error */
+                       req->result.error_code.s.etype = ML_ETYPE_DRIVER;
+       }
 
        cn10k_ml_result_update(dev, qp_id, &req->result, req->op);
        ops[count] = req->op;
@@ -1533,6 +1600,35 @@ cn10k_ml_dequeue_burst(struct rte_ml_dev *dev, uint16_t 
qp_id, struct rte_ml_op
        return count;
 }
 
+/*
+ * Report the device specific error code and message of a completed op.
+ *
+ * dev   - ML device, unused.
+ * op    - completed op; op->impl_opaque holds the 64-bit error code word
+ *         stored by cn10k_ml_result_update().
+ * error - output; message receives "<type>" or "<type> : <sub-type>" and
+ *         errcode receives the raw error code word.
+ *
+ * Error types and sub-types that have no entry in the lookup tables are
+ * reported as unknown instead of indexing past the end of a table.
+ *
+ * Returns 0.
+ */
+__rte_hot int
+cn10k_ml_op_error_get(struct rte_ml_dev *dev, struct rte_ml_op *op, struct rte_ml_op_error *error)
+{
+       union cn10k_ml_error_code error_code;
+       const char *stype_msg = NULL;
+       uint64_t etype;
+       uint64_t stype;
+
+       PLT_SET_USED(dev);
+
+       error_code.u64 = op->impl_opaque;
+       etype = error_code.s.etype;
+       stype = error_code.s.stype;
+
+       /* etype is a 4-bit field, so it can hold values past the last table entry */
+       if (etype >= sizeof(ml_etype_db) / sizeof(ml_etype_db[0]))
+               etype = ML_ETYPE_UNKNOWN;
+
+       /* Select sub error message, only these two types carry a sub-type */
+       if (etype == ML_ETYPE_HW_NONFATAL) {
+               if (stype < sizeof(ml_stype_db_hw_nf) / sizeof(ml_stype_db_hw_nf[0]))
+                       stype_msg = ml_stype_db_hw_nf[stype].msg;
+               else
+                       stype_msg = "UNKNOWN ERROR";
+       } else if (etype == ML_ETYPE_DRIVER) {
+               if (stype >= sizeof(ml_stype_db_driver) / sizeof(ml_stype_db_driver[0]))
+                       stype = ML_DRIVER_ERR_UNKNOWN;
+               stype_msg = ml_stype_db_driver[stype].msg;
+       }
+
+       /* snprintf() bounds the write to the destination and always NUL-terminates */
+       if (stype_msg != NULL)
+               snprintf(error->message, sizeof(error->message), "%s : %s",
+                        ml_etype_db[etype].name, stype_msg);
+       else
+               snprintf(error->message, sizeof(error->message), "%s", ml_etype_db[etype].name);
+
+       /* Hand back the raw device error code alongside the message */
+       error->errcode = error_code.u64;
+
+       return 0;
+}
+
 __rte_hot int
 cn10k_ml_inference_sync(struct rte_ml_dev *dev, struct rte_ml_op *op)
 {
@@ -1549,6 +1645,7 @@ cn10k_ml_inference_sync(struct rte_ml_dev *dev, struct 
rte_ml_op *op)
        cn10k_ml_prep_fp_job_descriptor(dev, req, op);
 
        memset(&req->result, 0, sizeof(struct cn10k_ml_result));
+       req->result.error_code.s.etype = ML_ETYPE_UNKNOWN;
        req->result.user_ptr = op->user_ptr;
 
        plt_write64(ML_CN10K_POLL_JOB_START, &req->status);
diff --git a/drivers/ml/cnxk/cn10k_ml_ops.h b/drivers/ml/cnxk/cn10k_ml_ops.h
index 7c35bf7539..1784900cff 100644
--- a/drivers/ml/cnxk/cn10k_ml_ops.h
+++ b/drivers/ml/cnxk/cn10k_ml_ops.h
@@ -75,6 +75,8 @@ __rte_hot uint16_t cn10k_ml_enqueue_burst(struct rte_ml_dev 
*dev, uint16_t qp_id
                                          struct rte_ml_op **ops, uint16_t 
nb_ops);
 __rte_hot uint16_t cn10k_ml_dequeue_burst(struct rte_ml_dev *dev, uint16_t 
qp_id,
                                          struct rte_ml_op **ops, uint16_t 
nb_ops);
+__rte_hot int cn10k_ml_op_error_get(struct rte_ml_dev *dev, struct rte_ml_op 
*op,
+                                   struct rte_ml_op_error *error);
 __rte_hot int cn10k_ml_inference_sync(struct rte_ml_dev *dev, struct rte_ml_op 
*op);
 
 #endif /* _CN10K_ML_OPS_H_ */
-- 
2.17.1

Reply via email to