Add a device argument "hw_queue_lock" to select the JCMDQ enqueue ROC function used in the fast path.
hw_queue_lock: 0: Disable; use the lock-free version of the JCMDQ enqueue ROC function for job queuing. To avoid a race condition when queuing requests to hardware, disabling hw_queue_lock restricts the number of queue-pairs supported by the cnxk driver to 1. 1: Enable (default); use the spinlock version of the JCMDQ enqueue ROC function for job queuing. Enabling the spinlock version lifts the single queue-pair restriction, allowing up to 16 queue-pairs to be created per device. Signed-off-by: Srikanth Yalavarthi <syalavar...@marvell.com> --- drivers/ml/cnxk/cn10k_ml_dev.c | 31 ++++++++++++++++++++++++++++++- drivers/ml/cnxk/cn10k_ml_dev.h | 13 +++++++++++-- drivers/ml/cnxk/cn10k_ml_ops.c | 20 +++++++++++++++++--- 3 files changed, 58 insertions(+), 6 deletions(-) diff --git a/drivers/ml/cnxk/cn10k_ml_dev.c b/drivers/ml/cnxk/cn10k_ml_dev.c index 5c02d67c8e..aa503b2691 100644 --- a/drivers/ml/cnxk/cn10k_ml_dev.c +++ b/drivers/ml/cnxk/cn10k_ml_dev.c @@ -22,12 +22,14 @@ #define CN10K_ML_FW_REPORT_DPE_WARNINGS "report_dpe_warnings" #define CN10K_ML_DEV_CACHE_MODEL_DATA "cache_model_data" #define CN10K_ML_OCM_ALLOC_MODE "ocm_alloc_mode" +#define CN10K_ML_DEV_HW_QUEUE_LOCK "hw_queue_lock" #define CN10K_ML_FW_PATH_DEFAULT "/lib/firmware/mlip-fw.bin" #define CN10K_ML_FW_ENABLE_DPE_WARNINGS_DEFAULT 1 #define CN10K_ML_FW_REPORT_DPE_WARNINGS_DEFAULT 0 #define CN10K_ML_DEV_CACHE_MODEL_DATA_DEFAULT 1 #define CN10K_ML_OCM_ALLOC_MODE_DEFAULT "lowest" +#define CN10K_ML_DEV_HW_QUEUE_LOCK_DEFAULT 1 /* ML firmware macros */ #define FW_MEMZONE_NAME "ml_cn10k_fw_mz" @@ -46,6 +48,7 @@ static const char *const valid_args[] = {CN10K_ML_FW_PATH, CN10K_ML_FW_REPORT_DPE_WARNINGS, CN10K_ML_DEV_CACHE_MODEL_DATA, CN10K_ML_OCM_ALLOC_MODE, + CN10K_ML_DEV_HW_QUEUE_LOCK, NULL}; /* Dummy operations for ML device */ @@ -87,6 +90,7 @@ cn10k_mldev_parse_devargs(struct rte_devargs *devargs, struct cn10k_ml_dev *mlde bool cache_model_data_set = false; struct rte_kvargs *kvlist = NULL; bool ocm_alloc_mode_set = false; + bool hw_queue_lock_set = false; 
char *ocm_alloc_mode = NULL; bool fw_path_set = false; char *fw_path = NULL; @@ -158,6 +162,18 @@ cn10k_mldev_parse_devargs(struct rte_devargs *devargs, struct cn10k_ml_dev *mlde ocm_alloc_mode_set = true; } + if (rte_kvargs_count(kvlist, CN10K_ML_DEV_HW_QUEUE_LOCK) == 1) { + ret = rte_kvargs_process(kvlist, CN10K_ML_DEV_HW_QUEUE_LOCK, &parse_integer_arg, + &mldev->hw_queue_lock); + if (ret < 0) { + plt_err("Error processing arguments, key = %s\n", + CN10K_ML_DEV_HW_QUEUE_LOCK); + ret = -EINVAL; + goto exit; + } + hw_queue_lock_set = true; + } + check_args: if (!fw_path_set) mldev->fw.path = CN10K_ML_FW_PATH_DEFAULT; @@ -215,6 +231,18 @@ cn10k_mldev_parse_devargs(struct rte_devargs *devargs, struct cn10k_ml_dev *mlde } plt_info("ML: %s = %s", CN10K_ML_OCM_ALLOC_MODE, mldev->ocm.alloc_mode); + if (!hw_queue_lock_set) { + mldev->hw_queue_lock = CN10K_ML_DEV_HW_QUEUE_LOCK_DEFAULT; + } else { + if ((mldev->hw_queue_lock < 0) || (mldev->hw_queue_lock > 1)) { + plt_err("Invalid argument, %s = %d\n", CN10K_ML_DEV_HW_QUEUE_LOCK, + mldev->hw_queue_lock); + ret = -EINVAL; + goto exit; + } + } + plt_info("ML: %s = %d", CN10K_ML_DEV_HW_QUEUE_LOCK, mldev->hw_queue_lock); + exit: if (kvlist) rte_kvargs_free(kvlist); @@ -756,4 +784,5 @@ RTE_PMD_REGISTER_PARAM_STRING(MLDEV_NAME_CN10K_PMD, CN10K_ML_FW_PATH "=<path>" CN10K_ML_FW_ENABLE_DPE_WARNINGS "=<0|1>" CN10K_ML_FW_REPORT_DPE_WARNINGS "=<0|1>" CN10K_ML_DEV_CACHE_MODEL_DATA - "=<0|1>" CN10K_ML_OCM_ALLOC_MODE "=<lowest|largest>"); + "=<0|1>" CN10K_ML_OCM_ALLOC_MODE + "=<lowest|largest>" CN10K_ML_DEV_HW_QUEUE_LOCK "=<0|1>"); diff --git a/drivers/ml/cnxk/cn10k_ml_dev.h b/drivers/ml/cnxk/cn10k_ml_dev.h index 718edadde7..49676ac9e7 100644 --- a/drivers/ml/cnxk/cn10k_ml_dev.h +++ b/drivers/ml/cnxk/cn10k_ml_dev.h @@ -21,8 +21,11 @@ /* Maximum number of models per device */ #define ML_CN10K_MAX_MODELS 16 -/* Maximum number of queue-pairs per device */ -#define ML_CN10K_MAX_QP_PER_DEVICE 1 +/* Maximum number of queue-pairs per device, 
spinlock version */ +#define ML_CN10K_MAX_QP_PER_DEVICE_SL 16 + +/* Maximum number of queue-pairs per device, lock-free version */ +#define ML_CN10K_MAX_QP_PER_DEVICE_LF 1 /* Maximum number of descriptors per queue-pair */ #define ML_CN10K_MAX_DESC_PER_QP 1024 @@ -384,6 +387,12 @@ struct cn10k_ml_dev { /* Enable / disable model data caching */ int cache_model_data; + + /* Use spinlock version of ROC enqueue */ + int hw_queue_lock; + + /* JCMD enqueue function handler */ + bool (*ml_jcmdq_enqueue)(struct roc_ml *roc_ml, struct ml_job_cmd_s *job_cmd); }; uint64_t cn10k_ml_fw_flags_get(struct cn10k_ml_fw *fw); diff --git a/drivers/ml/cnxk/cn10k_ml_ops.c b/drivers/ml/cnxk/cn10k_ml_ops.c index d69df42b27..f92f778e23 100644 --- a/drivers/ml/cnxk/cn10k_ml_ops.c +++ b/drivers/ml/cnxk/cn10k_ml_ops.c @@ -534,13 +534,21 @@ cn10k_ml_cache_model_data(struct rte_ml_dev *dev, uint16_t model_id) static int cn10k_ml_dev_info_get(struct rte_ml_dev *dev, struct rte_ml_dev_info *dev_info) { + struct cn10k_ml_dev *mldev; + if (dev_info == NULL) return -EINVAL; + mldev = dev->data->dev_private; + memset(dev_info, 0, sizeof(struct rte_ml_dev_info)); dev_info->driver_name = dev->device->driver->name; dev_info->max_models = ML_CN10K_MAX_MODELS; - dev_info->max_queue_pairs = ML_CN10K_MAX_QP_PER_DEVICE; + if (mldev->hw_queue_lock) + dev_info->max_queue_pairs = ML_CN10K_MAX_QP_PER_DEVICE_SL; + else + dev_info->max_queue_pairs = ML_CN10K_MAX_QP_PER_DEVICE_LF; + dev_info->max_desc = ML_CN10K_MAX_DESC_PER_QP; dev_info->max_segments = ML_CN10K_MAX_SEGMENTS; dev_info->min_align_size = ML_CN10K_ALIGN_SIZE; @@ -703,6 +711,12 @@ cn10k_ml_dev_configure(struct rte_ml_dev *dev, const struct rte_ml_dev_config *c else mldev->xstats_enabled = false; + /* Set JCMDQ enqueue function */ + if (mldev->hw_queue_lock == 1) + mldev->ml_jcmdq_enqueue = roc_ml_jcmdq_enqueue_sl; + else + mldev->ml_jcmdq_enqueue = roc_ml_jcmdq_enqueue_lf; + dev->enqueue_burst = cn10k_ml_enqueue_burst; dev->dequeue_burst = 
cn10k_ml_dequeue_burst; dev->op_error_get = cn10k_ml_op_error_get; @@ -1993,7 +2007,7 @@ cn10k_ml_enqueue_burst(struct rte_ml_dev *dev, uint16_t qp_id, struct rte_ml_op req->result.user_ptr = op->user_ptr; plt_write64(ML_CN10K_POLL_JOB_START, &req->status); - enqueued = roc_ml_jcmdq_enqueue_lf(&mldev->roc, &req->jcmd); + enqueued = mldev->ml_jcmdq_enqueue(&mldev->roc, &req->jcmd); if (unlikely(!enqueued)) goto jcmdq_full; @@ -2114,7 +2128,7 @@ cn10k_ml_inference_sync(struct rte_ml_dev *dev, struct rte_ml_op *op) timeout = true; req->timeout = plt_tsc_cycles() + ML_CN10K_CMD_TIMEOUT * plt_tsc_hz(); do { - if (roc_ml_jcmdq_enqueue_lf(&mldev->roc, &req->jcmd)) { + if (mldev->ml_jcmdq_enqueue(&mldev->roc, &req->jcmd)) { req->op = op; timeout = false; break; -- 2.17.1