Introduced data type conversion functions with support for
user-defined scale factor and zero-point. Updated the library
functions to support asymmetric / affine conversion for
integer types.

Signed-off-by: Srikanth Yalavarthi <syalavar...@marvell.com>
---
 drivers/ml/cnxk/cnxk_ml_io.c            | 134 +++----
 lib/mldev/mldev_utils.h                 | 453 -----------------------
 lib/mldev/mldev_utils_neon.c            | 452 ++++++++++++++---------
 lib/mldev/mldev_utils_neon_bfloat16.c   |  20 +-
 lib/mldev/mldev_utils_scalar.c          | 156 ++++----
 lib/mldev/mldev_utils_scalar_bfloat16.c |  12 +-
 lib/mldev/rte_mldev.h                   | 462 ++++++++++++++++++++++++
 lib/mldev/version.map                   |  40 +-
 8 files changed, 936 insertions(+), 793 deletions(-)

diff --git a/drivers/ml/cnxk/cnxk_ml_io.c b/drivers/ml/cnxk/cnxk_ml_io.c
index 4b0adc2ae47..a418b7e684d 100644
--- a/drivers/ml/cnxk/cnxk_ml_io.c
+++ b/drivers/ml/cnxk/cnxk_ml_io.c
@@ -26,39 +26,40 @@ cnxk_ml_io_quantize_single(struct cnxk_ml_io *input, 
uint8_t *dbuffer, uint8_t *
 
        if (dtype == qtype) {
                rte_memcpy(qbuffer, dbuffer, input->sz_d);
-       } else {
-               switch (qtype) {
-               case RTE_ML_IO_TYPE_INT8:
-                       ret = rte_ml_io_float32_to_int8(qscale, nb_elements, 
dbuffer, qbuffer);
-                       break;
-               case RTE_ML_IO_TYPE_UINT8:
-                       ret = rte_ml_io_float32_to_uint8(qscale, nb_elements, 
dbuffer, qbuffer);
-                       break;
-               case RTE_ML_IO_TYPE_INT16:
-                       ret = rte_ml_io_float32_to_int16(qscale, nb_elements, 
dbuffer, qbuffer);
-                       break;
-               case RTE_ML_IO_TYPE_UINT16:
-                       ret = rte_ml_io_float32_to_uint16(qscale, nb_elements, 
dbuffer, qbuffer);
-                       break;
-               case RTE_ML_IO_TYPE_INT32:
-                       ret = rte_ml_io_float32_to_int32(qscale, nb_elements, 
dbuffer, qbuffer);
-                       break;
-               case RTE_ML_IO_TYPE_UINT32:
-                       ret = rte_ml_io_float32_to_uint32(qscale, nb_elements, 
dbuffer, qbuffer);
-                       break;
-               case RTE_ML_IO_TYPE_INT64:
-                       ret = rte_ml_io_float32_to_int64(qscale, nb_elements, 
dbuffer, qbuffer);
-                       break;
-               case RTE_ML_IO_TYPE_UINT64:
-                       ret = rte_ml_io_float32_to_uint64(qscale, nb_elements, 
dbuffer, qbuffer);
-                       break;
-               case RTE_ML_IO_TYPE_FP16:
-                       ret = rte_ml_io_float32_to_float16(nb_elements, 
dbuffer, qbuffer);
-                       break;
-               default:
-                       plt_err("Unsupported qtype : %u", qtype);
-                       ret = -ENOTSUP;
-               }
+               return ret;
+       }
+
+       switch (qtype) {
+       case RTE_ML_IO_TYPE_INT8:
+               ret = rte_ml_io_float32_to_int8(dbuffer, qbuffer, nb_elements, 
1.0 / qscale, 0);
+               break;
+       case RTE_ML_IO_TYPE_UINT8:
+               ret = rte_ml_io_float32_to_uint8(dbuffer, qbuffer, nb_elements, 
1.0 / qscale, 0);
+               break;
+       case RTE_ML_IO_TYPE_INT16:
+               ret = rte_ml_io_float32_to_int16(dbuffer, qbuffer, nb_elements, 
1.0 / qscale, 0);
+               break;
+       case RTE_ML_IO_TYPE_UINT16:
+               ret = rte_ml_io_float32_to_uint16(dbuffer, qbuffer, 
nb_elements, 1.0 / qscale, 0);
+               break;
+       case RTE_ML_IO_TYPE_INT32:
+               ret = rte_ml_io_float32_to_int32(dbuffer, qbuffer, nb_elements, 
1.0 / qscale, 0);
+               break;
+       case RTE_ML_IO_TYPE_UINT32:
+               ret = rte_ml_io_float32_to_uint32(dbuffer, qbuffer, 
nb_elements, 1.0 / qscale, 0);
+               break;
+       case RTE_ML_IO_TYPE_INT64:
+               ret = rte_ml_io_float32_to_int64(dbuffer, qbuffer, nb_elements, 
1.0 / qscale, 0);
+               break;
+       case RTE_ML_IO_TYPE_UINT64:
+               ret = rte_ml_io_float32_to_uint64(dbuffer, qbuffer, 
nb_elements, 1.0 / qscale, 0);
+               break;
+       case RTE_ML_IO_TYPE_FP16:
+               ret = rte_ml_io_float32_to_float16(dbuffer, qbuffer, 
nb_elements);
+               break;
+       default:
+               plt_err("Unsupported qtype : %u", qtype);
+               ret = -ENOTSUP;
        }
 
        return ret;
@@ -80,39 +81,40 @@ cnxk_ml_io_dequantize_single(struct cnxk_ml_io *output, 
uint8_t *qbuffer, uint8_
 
        if (dtype == qtype) {
                rte_memcpy(dbuffer, qbuffer, output->sz_q);
-       } else {
-               switch (qtype) {
-               case RTE_ML_IO_TYPE_INT8:
-                       ret = rte_ml_io_int8_to_float32(dscale, nb_elements, 
qbuffer, dbuffer);
-                       break;
-               case RTE_ML_IO_TYPE_UINT8:
-                       ret = rte_ml_io_uint8_to_float32(dscale, nb_elements, 
qbuffer, dbuffer);
-                       break;
-               case RTE_ML_IO_TYPE_INT16:
-                       ret = rte_ml_io_int16_to_float32(dscale, nb_elements, 
qbuffer, dbuffer);
-                       break;
-               case RTE_ML_IO_TYPE_UINT16:
-                       ret = rte_ml_io_uint16_to_float32(dscale, nb_elements, 
qbuffer, dbuffer);
-                       break;
-               case RTE_ML_IO_TYPE_INT32:
-                       ret = rte_ml_io_int32_to_float32(dscale, nb_elements, 
qbuffer, dbuffer);
-                       break;
-               case RTE_ML_IO_TYPE_UINT32:
-                       ret = rte_ml_io_uint32_to_float32(dscale, nb_elements, 
qbuffer, dbuffer);
-                       break;
-               case RTE_ML_IO_TYPE_INT64:
-                       ret = rte_ml_io_int64_to_float32(dscale, nb_elements, 
qbuffer, dbuffer);
-                       break;
-               case RTE_ML_IO_TYPE_UINT64:
-                       ret = rte_ml_io_uint64_to_float32(dscale, nb_elements, 
qbuffer, dbuffer);
-                       break;
-               case RTE_ML_IO_TYPE_FP16:
-                       ret = rte_ml_io_float16_to_float32(nb_elements, 
qbuffer, dbuffer);
-                       break;
-               default:
-                       plt_err("Unsupported qtype: %u", qtype);
-                       ret = -ENOTSUP;
-               }
+               return 0;
+       }
+
+       switch (qtype) {
+       case RTE_ML_IO_TYPE_INT8:
+               ret = rte_ml_io_int8_to_float32(qbuffer, dbuffer, nb_elements, 
dscale, 0);
+               break;
+       case RTE_ML_IO_TYPE_UINT8:
+               ret = rte_ml_io_uint8_to_float32(qbuffer, dbuffer, nb_elements, 
dscale, 0);
+               break;
+       case RTE_ML_IO_TYPE_INT16:
+               ret = rte_ml_io_int16_to_float32(qbuffer, dbuffer, nb_elements, 
dscale, 0);
+               break;
+       case RTE_ML_IO_TYPE_UINT16:
+               ret = rte_ml_io_uint16_to_float32(qbuffer, dbuffer, 
nb_elements, dscale, 0);
+               break;
+       case RTE_ML_IO_TYPE_INT32:
+               ret = rte_ml_io_int32_to_float32(qbuffer, dbuffer, nb_elements, 
dscale, 0);
+               break;
+       case RTE_ML_IO_TYPE_UINT32:
+               ret = rte_ml_io_uint32_to_float32(qbuffer, dbuffer, 
nb_elements, dscale, 0);
+               break;
+       case RTE_ML_IO_TYPE_INT64:
+               ret = rte_ml_io_int64_to_float32(qbuffer, dbuffer, nb_elements, 
dscale, 0);
+               break;
+       case RTE_ML_IO_TYPE_UINT64:
+               ret = rte_ml_io_uint64_to_float32(qbuffer, dbuffer, 
nb_elements, dscale, 0);
+               break;
+       case RTE_ML_IO_TYPE_FP16:
+               ret = rte_ml_io_float16_to_float32(qbuffer, dbuffer, 
nb_elements);
+               break;
+       default:
+               plt_err("Unsupported qtype: %u", qtype);
+               ret = -ENOTSUP;
        }
 
        return ret;
diff --git a/lib/mldev/mldev_utils.h b/lib/mldev/mldev_utils.h
index 5e2a180adce..37c90b44a8e 100644
--- a/lib/mldev/mldev_utils.h
+++ b/lib/mldev/mldev_utils.h
@@ -52,459 +52,6 @@ __rte_internal
 void
 rte_ml_io_type_to_str(enum rte_ml_io_type type, char *str, int len);
 
-/**
- * @internal
- *
- * Convert a buffer containing numbers in single precision floating format 
(float32) to signed 8-bit
- * integer format (INT8).
- *
- * @param[in] scale
- *      Scale factor for conversion.
- * @param[in] nb_elements
- *     Number of elements in the buffer.
- * @param[in] input
- *     Input buffer containing float32 numbers. Size of buffer is equal to 
(nb_elements * 4) bytes.
- * @param[out] output
- *     Output buffer to store INT8 numbers. Size of buffer is equal to 
(nb_elements * 1) bytes.
- *
- * @return
- *     - 0, Success.
- *     - < 0, Error code on failure.
- */
-__rte_internal
-int
-rte_ml_io_float32_to_int8(float scale, uint64_t nb_elements, void *input, void 
*output);
-
-/**
- * @internal
- *
- * Convert a buffer containing numbers in signed 8-bit integer format (INT8) 
to single precision
- * floating format (float32).
- *
- * @param[in] scale
- *      Scale factor for conversion.
- * @param[in] nb_elements
- *     Number of elements in the buffer.
- * @param[in] input
- *     Input buffer containing INT8 numbers. Size of buffer is equal to 
(nb_elements * 1) bytes.
- * @param[out] output
- *     Output buffer to store float32 numbers. Size of buffer is equal to 
(nb_elements * 4) bytes.
- *
- * @return
- *     - 0, Success.
- *     - < 0, Error code on failure.
- */
-__rte_internal
-int
-rte_ml_io_int8_to_float32(float scale, uint64_t nb_elements, void *input, void 
*output);
-
-/**
- * @internal
- *
- * Convert a buffer containing numbers in single precision floating format 
(float32) to unsigned
- * 8-bit integer format (UINT8).
- *
- * @param[in] scale
- *      Scale factor for conversion.
- * @param[in] nb_elements
- *     Number of elements in the buffer.
- * @param[in] input
- *     Input buffer containing float32 numbers. Size of buffer is equal to 
(nb_elements * 4) bytes.
- * @param[out] output
- *     Output buffer to store UINT8 numbers. Size of buffer is equal to 
(nb_elements * 1) bytes.
- *
- * @return
- *     - 0, Success.
- *     - < 0, Error code on failure.
- */
-__rte_internal
-int
-rte_ml_io_float32_to_uint8(float scale, uint64_t nb_elements, void *input, 
void *output);
-
-/**
- * @internal
- *
- * Convert a buffer containing numbers in unsigned 8-bit integer format 
(UINT8) to single precision
- * floating format (float32).
- *
- * @param[in] scale
- *      Scale factor for conversion.
- * @param[in] nb_elements
- *     Number of elements in the buffer.
- * @param[in] input
- *     Input buffer containing UINT8 numbers. Size of buffer is equal to 
(nb_elements * 1) bytes.
- * @param[out] output
- *     Output buffer to store float32 numbers. Size of buffer is equal to 
(nb_elements * 4) bytes.
- *
- * @return
- *     - 0, Success.
- *     - < 0, Error code on failure.
- */
-__rte_internal
-int
-rte_ml_io_uint8_to_float32(float scale, uint64_t nb_elements, void *input, 
void *output);
-
-/**
- * @internal
- *
- * Convert a buffer containing numbers in single precision floating format 
(float32) to signed
- * 16-bit integer format (INT16).
- *
- * @param[in] scale
- *      Scale factor for conversion.
- * @param[in] nb_elements
- *     Number of elements in the buffer.
- * @param[in] input
- *     Input buffer containing float32 numbers. Size of buffer is equal to 
(nb_elements * 4) bytes.
- * @param[out] output
- *     Output buffer to store INT16 numbers. Size of buffer is equal to 
(nb_elements * 2) bytes.
- *
- * @return
- *     - 0, Success.
- *     - < 0, Error code on failure.
- */
-__rte_internal
-int
-rte_ml_io_float32_to_int16(float scale, uint64_t nb_elements, void *input, 
void *output);
-
-/**
- * @internal
- *
- * Convert a buffer containing numbers in signed 16-bit integer format (INT16) 
to single precision
- * floating format (float32).
- *
- * @param[in] scale
- *      Scale factor for conversion.
- * @param[in] nb_elements
- *     Number of elements in the buffer.
- * @param[in] input
- *     Input buffer containing INT16 numbers. Size of buffer is equal to 
(nb_elements * 2) bytes.
- * @param[out] output
- *     Output buffer to store float32 numbers. Size of buffer is equal to 
(nb_elements * 4) bytes.
- *
- * @return
- *     - 0, Success.
- *     - < 0, Error code on failure.
- */
-__rte_internal
-int
-rte_ml_io_int16_to_float32(float scale, uint64_t nb_elements, void *input, 
void *output);
-
-/**
- * @internal
- *
- * Convert a buffer containing numbers in single precision floating format 
(float32) to unsigned
- * 16-bit integer format (UINT16).
- *
- * @param[in] scale
- *      Scale factor for conversion.
- * @param[in] nb_elements
- *     Number of elements in the buffer.
- * @param[in] input
- *     Input buffer containing float32 numbers. Size of buffer is equal to 
(nb_elements * 4) bytes.
- * @param[out] output
- *     Output buffer to store UINT16 numbers. Size of buffer is equal to 
(nb_elements * 2) bytes.
- *
- * @return
- *     - 0, Success.
- *     - < 0, Error code on failure.
- */
-__rte_internal
-int
-rte_ml_io_float32_to_uint16(float scale, uint64_t nb_elements, void *input, 
void *output);
-
-/**
- * @internal
- *
- * Convert a buffer containing numbers in unsigned 16-bit integer format 
(UINT16) to single
- * precision floating format (float32).
- *
- * @param[in] scale
- *      Scale factor for conversion.
- * @param[in] nb_elements
- *     Number of elements in the buffer.
- * @param[in] input
- *     Input buffer containing UINT16 numbers. Size of buffer is equal to 
(nb_elements * 2) bytes.
- * @param[out] output
- *     Output buffer to store float32 numbers. Size of buffer is equal to 
(nb_elements * 4) bytes.
- *
- * @return
- *     - 0, Success.
- *     - < 0, Error code on failure.
- */
-__rte_internal
-int
-rte_ml_io_uint16_to_float32(float scale, uint64_t nb_elements, void *input, 
void *output);
-
-/**
- * @internal
- *
- * Convert a buffer containing numbers in single precision floating format 
(float32)
- * to signed 32-bit integer format (INT32).
- *
- * @param[in] scale
- *     Scale factor for conversion.
- * @param[in] nb_elements
- *     Number of elements in the buffer.
- * @param[in] input
- *     Input buffer containing float32 numbers. Size of buffer is equal to 
(nb_elements * 4) bytes.
- * @param[out] output
- *     Output buffer to store INT32 numbers. Size of buffer is equal to 
(nb_elements * 4) bytes.
- *
- * @return
- *     - 0, Success.
- *     - < 0, Error code on failure.
- */
-__rte_internal
-int
-rte_ml_io_float32_to_int32(float scale, uint64_t nb_elements, void *input, 
void *output);
-
-/**
- * @internal
- *
- * Convert a buffer containing numbers in signed 32-bit integer format (INT32)
- * to single precision floating format (float32).
- *
- * @param[in] scale
- *     Scale factor for conversion.
- * @param[in] nb_elements
- *     Number of elements in the buffer.
- * @param[in] input
- *     Input buffer containing INT32 numbers. Size of buffer is equal to 
(nb_elements * 4) bytes.
- * @param[out] output
- *     Output buffer to store float32 numbers. Size of buffer is equal to 
(nb_elements * 4) bytes.
- *
- * @return
- *     - 0, Success.
- *     - < 0, Error code on failure.
- */
-__rte_internal
-int
-rte_ml_io_int32_to_float32(float scale, uint64_t nb_elements, void *input, 
void *output);
-
-/**
- * @internal
- *
- * Convert a buffer containing numbers in single precision floating format 
(float32)
- * to unsigned 32-bit integer format (UINT32).
- *
- * @param[in] scale
- *     Scale factor for conversion.
- * @param[in] nb_elements
- *     Number of elements in the buffer.
- * @param[in] input
- *     Input buffer containing float32 numbers. Size of buffer is equal to 
(nb_elements * 4) bytes.
- * @param[out] output
- *     Output buffer to store UINT32 numbers. Size of buffer is equal to 
(nb_elements * 4) bytes.
- *
- * @return
- *     - 0, Success.
- *     - < 0, Error code on failure.
- */
-__rte_internal
-int
-rte_ml_io_float32_to_uint32(float scale, uint64_t nb_elements, void *input, 
void *output);
-
-/**
- * @internal
- *
- * Convert a buffer containing numbers in unsigned 32-bit integer format 
(UINT32)
- * to single precision floating format (float32).
- *
- * @param[in] scale
- *     Scale factor for conversion.
- * @param[in] nb_elements
- *     Number of elements in the buffer.
- * @param[in] input
- *     Input buffer containing UINT32 numbers. Size of buffer is equal to 
(nb_elements * 4) bytes.
- * @param[out] output
- *     Output buffer to store float32 numbers. Size of buffer is equal to 
(nb_elements * 4) bytes.
- *
- * @return
- *     - 0, Success.
- *     - < 0, Error code on failure.
- */
-__rte_internal
-int
-rte_ml_io_uint32_to_float32(float scale, uint64_t nb_elements, void *input, 
void *output);
-
-/**
- * @internal
- *
- * Convert a buffer containing numbers in single precision floating format 
(float32)
- * to signed 64-bit integer format (INT64).
- *
- * @param[in] scale
- *      Scale factor for conversion.
- * @param[in] nb_elements
- *     Number of elements in the buffer.
- * @param[in] input
- *     Input buffer containing float32 numbers. Size of buffer is equal to 
(nb_elements * 4) bytes.
- * @param[out] output
- *     Output buffer to store INT64 numbers. Size of buffer is equal to 
(nb_elements * 8) bytes.
- *
- * @return
- *     - 0, Success.
- *     - < 0, Error code on failure.
- */
-__rte_internal
-int
-rte_ml_io_float32_to_int64(float scale, uint64_t nb_elements, void *input, 
void *output);
-
-/**
- * @internal
- *
- * Convert a buffer containing numbers in signed 64-bit integer format (INT64)
- * to single precision floating format (float32).
- *
- * @param[in] scale
- *      Scale factor for conversion.
- * @param[in] nb_elements
- *     Number of elements in the buffer.
- * @param[in] input
- *     Input buffer containing INT64 numbers. Size of buffer is equal to 
(nb_elements * 8) bytes.
- * @param[out] output
- *     Output buffer to store float32 numbers. Size of buffer is equal to 
(nb_elements * 4) bytes.
- *
- * @return
- *     - 0, Success.
- *     - < 0, Error code on failure.
- */
-__rte_internal
-int
-rte_ml_io_int64_to_float32(float scale, uint64_t nb_elements, void *input, 
void *output);
-
-/**
- * @internal
- *
- * Convert a buffer containing numbers in single precision floating format 
(float32)
- * to unsigned 64-bit integer format (UINT64).
- *
- * @param[in] scale
- *      Scale factor for conversion.
- * @param[in] nb_elements
- *     Number of elements in the buffer.
- * @param[in] input
- *     Input buffer containing float32 numbers. Size of buffer is equal to 
(nb_elements * 4) bytes.
- * @param[out] output
- *     Output buffer to store UINT64 numbers. Size of buffer is equal to 
(nb_elements * 8) bytes.
- *
- * @return
- *     - 0, Success.
- *     - < 0, Error code on failure.
- */
-__rte_internal
-int
-rte_ml_io_float32_to_uint64(float scale, uint64_t nb_elements, void *input, 
void *output);
-
-/**
- * @internal
- *
- * Convert a buffer containing numbers in unsigned 64-bit integer format 
(UINT64)
- * to single precision floating format (float32).
- *
- * @param[in] scale
- *      Scale factor for conversion.
- * @param[in] nb_elements
- *     Number of elements in the buffer.
- * @param[in] input
- *     Input buffer containing UINT64 numbers. Size of buffer is equal to 
(nb_elements * 8) bytes.
- * @param[out] output
- *     Output buffer to store float32 numbers. Size of buffer is equal to 
(nb_elements * 4) bytes.
- *
- * @return
- *     - 0, Success.
- *     - < 0, Error code on failure.
- */
-__rte_internal
-int
-rte_ml_io_uint64_to_float32(float scale, uint64_t nb_elements, void *input, 
void *output);
-
-/**
- * @internal
- *
- * Convert a buffer containing numbers in single precision floating format 
(float32) to half
- * precision floating point format (FP16).
- *
- * @param[in] nb_elements
- *     Number of elements in the buffer.
- * @param[in] input
- *     Input buffer containing float32 numbers. Size of buffer is equal to 
(nb_elements *4) bytes.
- * @param[out] output
- *     Output buffer to store float16 numbers. Size of buffer is equal to 
(nb_elements * 2) bytes.
- *
- * @return
- *     - 0, Success.
- *     - < 0, Error code on failure.
- */
-__rte_internal
-int
-rte_ml_io_float32_to_float16(uint64_t nb_elements, void *input, void *output);
-
-/**
- * @internal
- *
- * Convert a buffer containing numbers in half precision floating format 
(FP16) to single precision
- * floating point format (float32).
- *
- * @param[in] nb_elements
- *     Number of elements in the buffer.
- * @param[in] input
- *     Input buffer containing float16 numbers. Size of buffer is equal to 
(nb_elements * 2) bytes.
- * @param[out] output
- *     Output buffer to store float32 numbers. Size of buffer is equal to 
(nb_elements * 4) bytes.
- *
- * @return
- *     - 0, Success.
- *     - < 0, Error code on failure.
- */
-__rte_internal
-int
-rte_ml_io_float16_to_float32(uint64_t nb_elements, void *input, void *output);
-
-/**
- * @internal
- *
- * Convert a buffer containing numbers in single precision floating format 
(float32) to brain
- * floating point format (bfloat16).
- *
- * @param[in] nb_elements
- *     Number of elements in the buffer.
- * @param[in] input
- *     Input buffer containing float32 numbers. Size of buffer is equal to 
(nb_elements *4) bytes.
- * @param[out] output
- *     Output buffer to store bfloat16 numbers. Size of buffer is equal to 
(nb_elements * 2) bytes.
- *
- * @return
- *     - 0, Success.
- *     - < 0, Error code on failure.
- */
-__rte_internal
-int
-rte_ml_io_float32_to_bfloat16(uint64_t nb_elements, void *input, void *output);
-
-/**
- * @internal
- *
- * Convert a buffer containing numbers in brain floating point format 
(bfloat16) to single precision
- * floating point format (float32).
- *
- * @param[in] nb_elements
- *     Number of elements in the buffer.
- * @param[in] input
- *     Input buffer containing bfloat16 numbers. Size of buffer is equal to 
(nb_elements * 2)
- * bytes.
- * @param[out] output
- *     Output buffer to store float32 numbers. Size of buffer is equal to 
(nb_elements * 4) bytes.
- *
- * @return
- *     - 0, Success.
- *     - < 0, Error code on failure.
- */
-__rte_internal
-int
-rte_ml_io_bfloat16_to_float32(uint64_t nb_elements, void *input, void *output);
-
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/mldev/mldev_utils_neon.c b/lib/mldev/mldev_utils_neon.c
index 4cde2ebabd3..8751a40863e 100644
--- a/lib/mldev/mldev_utils_neon.c
+++ b/lib/mldev/mldev_utils_neon.c
@@ -17,7 +17,7 @@
  */
 
 static inline void
-__float32_to_int8_neon_s8x8(float scale, float *input, int8_t *output)
+__float32_to_int8_neon_s8x8(const float *input, int8_t *output, float scale, 
int8_t zero_point)
 {
        int16x4_t s16x4_l;
        int16x4_t s16x4_h;
@@ -30,7 +30,8 @@ __float32_to_int8_neon_s8x8(float scale, float *input, int8_t 
*output)
         * Use round to nearest with ties away rounding mode.
         */
        f32x4 = vld1q_f32(input);
-       f32x4 = vmulq_n_f32(f32x4, scale);
+       f32x4 = vdivq_f32(f32x4, vdupq_n_f32(scale));
+       f32x4 = vaddq_f32(f32x4, vdupq_n_f32((float)zero_point));
        s32x4 = vcvtaq_s32_f32(f32x4);
        s16x4_l = vqmovn_s32(s32x4);
 
@@ -38,7 +39,8 @@ __float32_to_int8_neon_s8x8(float scale, float *input, int8_t 
*output)
         * Use round to nearest with ties away rounding mode.
         */
        f32x4 = vld1q_f32(input + 4);
-       f32x4 = vmulq_n_f32(f32x4, scale);
+       f32x4 = vdivq_f32(f32x4, vdupq_n_f32(scale));
+       f32x4 = vaddq_f32(f32x4, vdupq_n_f32((float)zero_point));
        s32x4 = vcvtaq_s32_f32(f32x4);
        s16x4_h = vqmovn_s32(s32x4);
 
@@ -47,31 +49,37 @@ __float32_to_int8_neon_s8x8(float scale, float *input, 
int8_t *output)
 
        /* narrow to int8_t */
        s8x8 = vqmovn_s16(s16x8);
+       s8x8 = vmax_s8(s8x8, vdup_n_s8(INT8_MIN + 1));
 
        /* store 8 elements */
        vst1_s8(output, s8x8);
 }
 
 static inline void
-__float32_to_int8_neon_s8x1(float scale, float *input, int8_t *output)
+__float32_to_int8_neon_s8x1(const float *input, int8_t *output, float scale, 
int8_t zero_point)
 {
-       int32_t s32;
+       float32x2_t f32x2;
+       int32x2_t s32x2;
        int16_t s16;
 
        /* scale and convert, round to nearest with ties away rounding mode */
-       s32 = vcvtas_s32_f32(scale * (*input));
+       f32x2 = vdiv_f32(vdup_n_f32(*input), vdup_n_f32(scale));
+       f32x2 = vadd_f32(f32x2, vdup_n_f32((float)zero_point));
+       s32x2 = vcvta_s32_f32(f32x2);
+       s32x2 = vmax_s32(s32x2, vdup_n_s32(INT8_MIN + 1));
 
        /* saturate narrow */
-       s16 = vqmovns_s32(s32);
+       s16 = vqmovns_s32(vget_lane_s32(s32x2, 0));
 
        /* convert to int8_t */
        *output = vqmovnh_s16(s16);
 }
 
 int
-rte_ml_io_float32_to_int8(float scale, uint64_t nb_elements, void *input, void 
*output)
+rte_ml_io_float32_to_int8(const void *input, void *output, uint64_t 
nb_elements, float scale,
+                         int8_t zero_point)
 {
-       float *input_buffer;
+       const float *input_buffer;
        int8_t *output_buffer;
        uint64_t nb_iterations;
        uint32_t vlen;
@@ -80,14 +88,14 @@ rte_ml_io_float32_to_int8(float scale, uint64_t 
nb_elements, void *input, void *
        if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == 
NULL))
                return -EINVAL;
 
-       input_buffer = (float *)input;
+       input_buffer = (const float *)input;
        output_buffer = (int8_t *)output;
        vlen = 2 * sizeof(float) / sizeof(int8_t);
        nb_iterations = nb_elements / vlen;
 
        /* convert vlen elements in each iteration */
        for (i = 0; i < nb_iterations; i++) {
-               __float32_to_int8_neon_s8x8(scale, input_buffer, output_buffer);
+               __float32_to_int8_neon_s8x8(input_buffer, output_buffer, scale, 
zero_point);
                input_buffer += vlen;
                output_buffer += vlen;
        }
@@ -95,7 +103,7 @@ rte_ml_io_float32_to_int8(float scale, uint64_t nb_elements, 
void *input, void *
        /* convert leftover elements */
        i = i * vlen;
        for (; i < nb_elements; i++) {
-               __float32_to_int8_neon_s8x1(scale, input_buffer, output_buffer);
+               __float32_to_int8_neon_s8x1(input_buffer, output_buffer, scale, 
zero_point);
                input_buffer++;
                output_buffer++;
        }
@@ -104,7 +112,7 @@ rte_ml_io_float32_to_int8(float scale, uint64_t 
nb_elements, void *input, void *
 }
 
 static inline void
-__int8_to_float32_neon_f32x8(float scale, int8_t *input, float *output)
+__int8_to_float32_neon_f32x8(const int8_t *input, float *output, float scale, 
int8_t zero_point)
 {
        float32x4_t f32x4;
        int16x8_t s16x8;
@@ -122,6 +130,7 @@ __int8_to_float32_neon_f32x8(float scale, int8_t *input, 
float *output)
        s16x4 = vget_low_s16(s16x8);
        s32x4 = vmovl_s16(s16x4);
        f32x4 = vcvtq_f32_s32(s32x4);
+       f32x4 = vsubq_f32(f32x4, vdupq_n_f32((float)zero_point));
        f32x4 = vmulq_n_f32(f32x4, scale);
        vst1q_f32(output, f32x4);
 
@@ -129,20 +138,22 @@ __int8_to_float32_neon_f32x8(float scale, int8_t *input, 
float *output)
        s16x4 = vget_high_s16(s16x8);
        s32x4 = vmovl_s16(s16x4);
        f32x4 = vcvtq_f32_s32(s32x4);
+       f32x4 = vsubq_f32(f32x4, vdupq_n_f32((float)zero_point));
        f32x4 = vmulq_n_f32(f32x4, scale);
        vst1q_f32(output + 4, f32x4);
 }
 
 static inline void
-__int8_to_float32_neon_f32x1(float scale, int8_t *input, float *output)
+__int8_to_float32_neon_f32x1(const int8_t *input, float *output, float scale, 
int8_t zero_point)
 {
-       *output = scale * vcvts_f32_s32((int32_t)*input);
+       *output = scale * (vcvts_f32_s32((int32_t)*input) - (float)zero_point);
 }
 
 int
-rte_ml_io_int8_to_float32(float scale, uint64_t nb_elements, void *input, void 
*output)
+rte_ml_io_int8_to_float32(const void *input, void *output, uint64_t 
nb_elements, float scale,
+                         int8_t zero_point)
 {
-       int8_t *input_buffer;
+       const int8_t *input_buffer;
        float *output_buffer;
        uint64_t nb_iterations;
        uint32_t vlen;
@@ -151,14 +162,14 @@ rte_ml_io_int8_to_float32(float scale, uint64_t 
nb_elements, void *input, void *
        if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == 
NULL))
                return -EINVAL;
 
-       input_buffer = (int8_t *)input;
+       input_buffer = (const int8_t *)input;
        output_buffer = (float *)output;
        vlen = 2 * sizeof(float) / sizeof(int8_t);
        nb_iterations = nb_elements / vlen;
 
        /* convert vlen elements in each iteration */
        for (i = 0; i < nb_iterations; i++) {
-               __int8_to_float32_neon_f32x8(scale, input_buffer, 
output_buffer);
+               __int8_to_float32_neon_f32x8(input_buffer, output_buffer, 
scale, zero_point);
                input_buffer += vlen;
                output_buffer += vlen;
        }
@@ -166,7 +177,7 @@ rte_ml_io_int8_to_float32(float scale, uint64_t 
nb_elements, void *input, void *
        /* convert leftover elements */
        i = i * vlen;
        for (; i < nb_elements; i++) {
-               __int8_to_float32_neon_f32x1(scale, input_buffer, 
output_buffer);
+               __int8_to_float32_neon_f32x1(input_buffer, output_buffer, 
scale, zero_point);
                input_buffer++;
                output_buffer++;
        }
@@ -175,7 +186,7 @@ rte_ml_io_int8_to_float32(float scale, uint64_t 
nb_elements, void *input, void *
 }
 
 static inline void
-__float32_to_uint8_neon_u8x8(float scale, float *input, uint8_t *output)
+__float32_to_uint8_neon_u8x8(const float *input, uint8_t *output, float scale, 
uint8_t zero_point)
 {
        uint16x4_t u16x4_l;
        uint16x4_t u16x4_h;
@@ -188,7 +199,8 @@ __float32_to_uint8_neon_u8x8(float scale, float *input, 
uint8_t *output)
         * use round to nearest with ties away rounding mode.
         */
        f32x4 = vld1q_f32(input);
-       f32x4 = vmulq_n_f32(f32x4, scale);
+       f32x4 = vdivq_f32(f32x4, vdupq_n_f32(scale));
+       f32x4 = vaddq_f32(f32x4, vdupq_n_f32((float)zero_point));
        u32x4 = vcvtaq_u32_f32(f32x4);
        u16x4_l = vqmovn_u32(u32x4);
 
@@ -196,7 +208,8 @@ __float32_to_uint8_neon_u8x8(float scale, float *input, 
uint8_t *output)
         * use round to nearest with ties away rounding mode.
         */
        f32x4 = vld1q_f32(input + 4);
-       f32x4 = vmulq_n_f32(f32x4, scale);
+       f32x4 = vdivq_f32(f32x4, vdupq_n_f32(scale));
+       f32x4 = vaddq_f32(f32x4, vdupq_n_f32((float)zero_point));
        u32x4 = vcvtaq_u32_f32(f32x4);
        u16x4_h = vqmovn_u32(u32x4);
 
@@ -211,25 +224,29 @@ __float32_to_uint8_neon_u8x8(float scale, float *input, 
uint8_t *output)
 }
 
 static inline void
-__float32_to_uint8_neon_u8x1(float scale, float *input, uint8_t *output)
+__float32_to_uint8_neon_u8x1(const float *input, uint8_t *output, float scale, 
uint8_t zero_point)
 {
-       uint32_t u32;
+       float32x2_t f32x2;
+       uint32x2_t u32x2;
        uint16_t u16;
 
        /* scale and convert, round to nearest with ties away rounding mode */
-       u32 = vcvtas_u32_f32(scale * (*input));
+       f32x2 = vdiv_f32(vdup_n_f32(*input), vdup_n_f32(scale));
+       f32x2 = vadd_f32(f32x2, vdup_n_f32((float)zero_point));
+       u32x2 = vcvta_u32_f32(f32x2);
 
        /* saturate narrow */
-       u16 = vqmovns_u32(u32);
+       u16 = vqmovns_u32(vget_lane_u32(u32x2, 0));
 
        /* convert to uint8_t */
        *output = vqmovnh_u16(u16);
 }
 
 int
-rte_ml_io_float32_to_uint8(float scale, uint64_t nb_elements, void *input, 
void *output)
+rte_ml_io_float32_to_uint8(const void *input, void *output, uint64_t 
nb_elements, float scale,
+                          uint8_t zero_point)
 {
-       float *input_buffer;
+       const float *input_buffer;
        uint8_t *output_buffer;
        uint64_t nb_iterations;
        uint32_t vlen;
@@ -238,14 +255,14 @@ rte_ml_io_float32_to_uint8(float scale, uint64_t 
nb_elements, void *input, void
        if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == 
NULL))
                return -EINVAL;
 
-       input_buffer = (float *)input;
+       input_buffer = (const float *)input;
        output_buffer = (uint8_t *)output;
        vlen = 2 * sizeof(float) / sizeof(uint8_t);
        nb_iterations = nb_elements / vlen;
 
        /* convert vlen elements in each iteration */
        for (i = 0; i < nb_iterations; i++) {
-               __float32_to_uint8_neon_u8x8(scale, input_buffer, 
output_buffer);
+               __float32_to_uint8_neon_u8x8(input_buffer, output_buffer, 
scale, zero_point);
                input_buffer += vlen;
                output_buffer += vlen;
        }
@@ -253,7 +270,7 @@ rte_ml_io_float32_to_uint8(float scale, uint64_t 
nb_elements, void *input, void
        /* convert leftover elements */
        i = i * vlen;
        for (; i < nb_elements; i++) {
-               __float32_to_uint8_neon_u8x1(scale, input_buffer, 
output_buffer);
+               __float32_to_uint8_neon_u8x1(input_buffer, output_buffer, 
scale, zero_point);
                input_buffer++;
                output_buffer++;
        }
@@ -262,45 +279,48 @@ rte_ml_io_float32_to_uint8(float scale, uint64_t 
nb_elements, void *input, void
 }
 
 static inline void
-__uint8_to_float32_neon_f32x8(float scale, uint8_t *input, float *output)
+__uint8_to_float32_neon_f32x8(const uint8_t *input, float *output, float 
scale, uint8_t zero_point)
 {
        float32x4_t f32x4;
        uint16x8_t u16x8;
-       uint16x4_t u16x4;
-       uint32x4_t u32x4;
+       int16x8_t s16x8;
+       int16x4_t s16x4;
+       int32x4_t s32x4;
        uint8x8_t u8x8;
 
        /* load 8 x uint8_t elements */
        u8x8 = vld1_u8(input);
-
-       /* widen uint8_t to uint16_t */
        u16x8 = vmovl_u8(u8x8);
+       s16x8 = vreinterpretq_s16_u16(u16x8);
 
-       /* convert lower 4 elements: widen to uint32_t, convert to float, scale and store */
+       /* convert lower 4 elements: widen to int32_t, convert to float, dequantize and store */
-       u16x4 = vget_low_u16(u16x8);
-       u32x4 = vmovl_u16(u16x4);
-       f32x4 = vcvtq_f32_u32(u32x4);
+       s16x4 = vget_low_s16(s16x8);
+       s32x4 = vmovl_s16(s16x4);
+       f32x4 = vcvtq_f32_s32(s32x4);
+       f32x4 = vsubq_f32(f32x4, vdupq_n_f32((float)zero_point));
        f32x4 = vmulq_n_f32(f32x4, scale);
        vst1q_f32(output, f32x4);
 
-       /* convert higher 4 elements: widen to uint32_t, convert to float, scale and store */
+       /* convert higher 4 elements: widen to int32_t, convert to float, dequantize and store */
-       u16x4 = vget_high_u16(u16x8);
-       u32x4 = vmovl_u16(u16x4);
-       f32x4 = vcvtq_f32_u32(u32x4);
+       s16x4 = vget_high_s16(s16x8);
+       s32x4 = vmovl_s16(s16x4);
+       f32x4 = vcvtq_f32_s32(s32x4);
+       f32x4 = vsubq_f32(f32x4, vdupq_n_f32((float)zero_point));
        f32x4 = vmulq_n_f32(f32x4, scale);
        vst1q_f32(output + 4, f32x4);
 }
 
 static inline void
-__uint8_to_float32_neon_f32x1(float scale, uint8_t *input, float *output)
+__uint8_to_float32_neon_f32x1(const uint8_t *input, float *output, float 
scale, uint8_t zero_point)
 {
-       *output = scale * vcvts_f32_u32((uint32_t)*input);
+       *output = scale * (vcvts_f32_u32((uint32_t)*input) - (float)zero_point);
 }
 
 int
-rte_ml_io_uint8_to_float32(float scale, uint64_t nb_elements, void *input, 
void *output)
+rte_ml_io_uint8_to_float32(const void *input, void *output, uint64_t 
nb_elements, float scale,
+                          uint8_t zero_point)
 {
-       uint8_t *input_buffer;
+       const uint8_t *input_buffer;
        float *output_buffer;
        uint64_t nb_iterations;
        uint64_t vlen;
@@ -309,14 +329,14 @@ rte_ml_io_uint8_to_float32(float scale, uint64_t 
nb_elements, void *input, void
        if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == 
NULL))
                return -EINVAL;
 
-       input_buffer = (uint8_t *)input;
+       input_buffer = (const uint8_t *)input;
        output_buffer = (float *)output;
        vlen = 2 * sizeof(float) / sizeof(uint8_t);
        nb_iterations = nb_elements / vlen;
 
        /* convert vlen elements in each iteration */
        for (i = 0; i < nb_iterations; i++) {
-               __uint8_to_float32_neon_f32x8(scale, input_buffer, 
output_buffer);
+               __uint8_to_float32_neon_f32x8(input_buffer, output_buffer, 
scale, zero_point);
                input_buffer += vlen;
                output_buffer += vlen;
        }
@@ -324,7 +344,7 @@ rte_ml_io_uint8_to_float32(float scale, uint64_t 
nb_elements, void *input, void
        /* convert leftover elements */
        i = i * vlen;
        for (; i < nb_elements; i++) {
-               __uint8_to_float32_neon_f32x1(scale, input_buffer, 
output_buffer);
+               __uint8_to_float32_neon_f32x1(input_buffer, output_buffer, 
scale, zero_point);
                input_buffer++;
                output_buffer++;
        }
@@ -333,7 +353,7 @@ rte_ml_io_uint8_to_float32(float scale, uint64_t 
nb_elements, void *input, void
 }
 
 static inline void
-__float32_to_int16_neon_s16x4(float scale, float *input, int16_t *output)
+__float32_to_int16_neon_s16x4(const float *input, int16_t *output, float 
scale, int16_t zero_point)
 {
        float32x4_t f32x4;
        int16x4_t s16x4;
@@ -343,34 +363,43 @@ __float32_to_int16_neon_s16x4(float scale, float *input, 
int16_t *output)
        f32x4 = vld1q_f32(input);
 
        /* scale */
-       f32x4 = vmulq_n_f32(f32x4, scale);
+       f32x4 = vdivq_f32(f32x4, vdupq_n_f32(scale));
+
+       /* add zero point */
+       f32x4 = vaddq_f32(f32x4, vdupq_n_f32((float)zero_point));
 
        /* convert to int32x4_t using round to nearest with ties away rounding 
mode */
        s32x4 = vcvtaq_s32_f32(f32x4);
 
        /* saturate narrow to int16x4_t */
        s16x4 = vqmovn_s32(s32x4);
+       s16x4 = vmax_s16(s16x4, vdup_n_s16(INT16_MIN + 1));
 
        /* store 4 elements */
        vst1_s16(output, s16x4);
 }
 
 static inline void
-__float32_to_int16_neon_s16x1(float scale, float *input, int16_t *output)
+__float32_to_int16_neon_s16x1(const float *input, int16_t *output, float 
scale, int16_t zero_point)
 {
-       int32_t s32;
+       float32x2_t f32x2;
+       int32x2_t s32x2;
 
        /* scale and convert, round to nearest with ties away rounding mode */
-       s32 = vcvtas_s32_f32(scale * (*input));
+       f32x2 = vdiv_f32(vdup_n_f32(*input), vdup_n_f32(scale));
+       f32x2 = vadd_f32(f32x2, vdup_n_f32((float)zero_point));
+       s32x2 = vcvta_s32_f32(f32x2);
+       s32x2 = vmax_s32(s32x2, vdup_n_s32(INT16_MIN + 1));
 
        /* saturate narrow */
-       *output = vqmovns_s32(s32);
+       *output = vqmovns_s32(vget_lane_s32(s32x2, 0));
 }
 
 int
-rte_ml_io_float32_to_int16(float scale, uint64_t nb_elements, void *input, 
void *output)
+rte_ml_io_float32_to_int16(const void *input, void *output, uint64_t 
nb_elements, float scale,
+                          int16_t zero_point)
 {
-       float *input_buffer;
+       const float *input_buffer;
        int16_t *output_buffer;
        uint64_t nb_iterations;
        uint32_t vlen;
@@ -379,14 +408,14 @@ rte_ml_io_float32_to_int16(float scale, uint64_t 
nb_elements, void *input, void
        if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == 
NULL))
                return -EINVAL;
 
-       input_buffer = (float *)input;
+       input_buffer = (const float *)input;
        output_buffer = (int16_t *)output;
        vlen = 2 * sizeof(float) / sizeof(int16_t);
        nb_iterations = nb_elements / vlen;
 
        /* convert vlen elements in each iteration */
        for (i = 0; i < nb_iterations; i++) {
-               __float32_to_int16_neon_s16x4(scale, input_buffer, 
output_buffer);
+               __float32_to_int16_neon_s16x4(input_buffer, output_buffer, 
scale, zero_point);
                input_buffer += vlen;
                output_buffer += vlen;
        }
@@ -394,7 +423,7 @@ rte_ml_io_float32_to_int16(float scale, uint64_t 
nb_elements, void *input, void
        /* convert leftover elements */
        i = i * vlen;
        for (; i < nb_elements; i++) {
-               __float32_to_int16_neon_s16x1(scale, input_buffer, 
output_buffer);
+               __float32_to_int16_neon_s16x1(input_buffer, output_buffer, 
scale, zero_point);
                input_buffer++;
                output_buffer++;
        }
@@ -403,7 +432,7 @@ rte_ml_io_float32_to_int16(float scale, uint64_t 
nb_elements, void *input, void
 }
 
 static inline void
-__int16_to_float32_neon_f32x4(float scale, int16_t *input, float *output)
+__int16_to_float32_neon_f32x4(const int16_t *input, float *output, float 
scale, int16_t zero_point)
 {
        float32x4_t f32x4;
        int16x4_t s16x4;
@@ -418,6 +447,9 @@ __int16_to_float32_neon_f32x4(float scale, int16_t *input, 
float *output)
        /* convert int32_t to float */
        f32x4 = vcvtq_f32_s32(s32x4);
 
+       /* subtract zero point */
+       f32x4 = vsubq_f32(f32x4, vdupq_n_f32((float)zero_point));
+
        /* scale */
        f32x4 = vmulq_n_f32(f32x4, scale);
 
@@ -426,15 +458,16 @@ __int16_to_float32_neon_f32x4(float scale, int16_t 
*input, float *output)
 }
 
 static inline void
-__int16_to_float32_neon_f32x1(float scale, int16_t *input, float *output)
+__int16_to_float32_neon_f32x1(const int16_t *input, float *output, float 
scale, int16_t zero_point)
 {
-       *output = scale * vcvts_f32_s32((int32_t)*input);
+       *output = scale * (vcvts_f32_s32((int32_t)*input) - (float)zero_point);
 }
 
 int
-rte_ml_io_int16_to_float32(float scale, uint64_t nb_elements, void *input, 
void *output)
+rte_ml_io_int16_to_float32(const void *input, void *output, uint64_t 
nb_elements, float scale,
+                          int16_t zero_point)
 {
-       int16_t *input_buffer;
+       const int16_t *input_buffer;
        float *output_buffer;
        uint64_t nb_iterations;
        uint32_t vlen;
@@ -443,14 +476,14 @@ rte_ml_io_int16_to_float32(float scale, uint64_t 
nb_elements, void *input, void
        if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == 
NULL))
                return -EINVAL;
 
-       input_buffer = (int16_t *)input;
+       input_buffer = (const int16_t *)input;
        output_buffer = (float *)output;
        vlen = 2 * sizeof(float) / sizeof(int16_t);
        nb_iterations = nb_elements / vlen;
 
        /* convert vlen elements in each iteration */
        for (i = 0; i < nb_iterations; i++) {
-               __int16_to_float32_neon_f32x4(scale, input_buffer, 
output_buffer);
+               __int16_to_float32_neon_f32x4(input_buffer, output_buffer, 
scale, zero_point);
                input_buffer += vlen;
                output_buffer += vlen;
        }
@@ -458,7 +491,7 @@ rte_ml_io_int16_to_float32(float scale, uint64_t 
nb_elements, void *input, void
        /* convert leftover elements */
        i = i * vlen;
        for (; i < nb_elements; i++) {
-               __int16_to_float32_neon_f32x1(scale, input_buffer, 
output_buffer);
+               __int16_to_float32_neon_f32x1(input_buffer, output_buffer, 
scale, zero_point);
                input_buffer++;
                output_buffer++;
        }
@@ -467,7 +500,8 @@ rte_ml_io_int16_to_float32(float scale, uint64_t 
nb_elements, void *input, void
 }
 
 static inline void
-__float32_to_uint16_neon_u16x4(float scale, float *input, uint16_t *output)
+__float32_to_uint16_neon_u16x4(const float *input, uint16_t *output, float 
scale,
+                              uint16_t zero_point)
 {
        float32x4_t f32x4;
        uint16x4_t u16x4;
@@ -477,7 +511,10 @@ __float32_to_uint16_neon_u16x4(float scale, float *input, 
uint16_t *output)
        f32x4 = vld1q_f32(input);
 
        /* scale */
-       f32x4 = vmulq_n_f32(f32x4, scale);
+       f32x4 = vdivq_f32(f32x4, vdupq_n_f32(scale));
+
+       /* add zero point */
+       f32x4 = vaddq_f32(f32x4, vdupq_n_f32((float)zero_point));
 
        /* convert using round to nearest with ties to away rounding mode */
        u32x4 = vcvtaq_u32_f32(f32x4);
@@ -490,21 +527,23 @@ __float32_to_uint16_neon_u16x4(float scale, float *input, 
uint16_t *output)
 }
 
 static inline void
-__float32_to_uint16_neon_u16x1(float scale, float *input, uint16_t *output)
+__float32_to_uint16_neon_u16x1(const float *input, uint16_t *output, float scale,
+                              uint16_t zero_point)
 {
        uint32_t u32;
 
        /* scale and convert, round to nearest with ties away rounding mode */
-       u32 = vcvtas_u32_f32(scale * (*input));
+       u32 = vcvtas_u32_f32((*input) / scale + (float)zero_point);
 
        /* saturate narrow */
        *output = vqmovns_u32(u32);
 }
 
 int
-rte_ml_io_float32_to_uint16(float scale, uint64_t nb_elements, void *input, 
void *output)
+rte_ml_io_float32_to_uint16(const void *input, void *output, uint64_t 
nb_elements, float scale,
+                          uint16_t zero_point)
 {
-       float *input_buffer;
+       const float *input_buffer;
        uint16_t *output_buffer;
        uint64_t nb_iterations;
        uint64_t vlen;
@@ -513,14 +552,14 @@ rte_ml_io_float32_to_uint16(float scale, uint64_t 
nb_elements, void *input, void
        if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == 
NULL))
                return -EINVAL;
 
-       input_buffer = (float *)input;
+       input_buffer = (const float *)input;
        output_buffer = (uint16_t *)output;
        vlen = 2 * sizeof(float) / sizeof(uint16_t);
        nb_iterations = nb_elements / vlen;
 
        /* convert vlen elements in each iteration */
        for (i = 0; i < nb_iterations; i++) {
-               __float32_to_uint16_neon_u16x4(scale, input_buffer, 
output_buffer);
+               __float32_to_uint16_neon_u16x4(input_buffer, output_buffer, 
scale, zero_point);
                input_buffer += vlen;
                output_buffer += vlen;
        }
@@ -528,7 +567,7 @@ rte_ml_io_float32_to_uint16(float scale, uint64_t 
nb_elements, void *input, void
        /* convert leftover elements */
        i = i * vlen;
        for (; i < nb_elements; i++) {
-               __float32_to_uint16_neon_u16x1(scale, input_buffer, 
output_buffer);
+               __float32_to_uint16_neon_u16x1(input_buffer, output_buffer, 
scale, zero_point);
                input_buffer++;
                output_buffer++;
        }
@@ -537,7 +576,8 @@ rte_ml_io_float32_to_uint16(float scale, uint64_t 
nb_elements, void *input, void
 }
 
 static inline void
-__uint16_to_float32_neon_f32x4(float scale, uint16_t *input, float *output)
+__uint16_to_float32_neon_f32x4(const uint16_t *input, float *output, float 
scale,
+                              uint16_t zero_point)
 {
        float32x4_t f32x4;
        uint16x4_t u16x4;
@@ -552,6 +592,9 @@ __uint16_to_float32_neon_f32x4(float scale, uint16_t 
*input, float *output)
        /* convert uint32_t to float */
        f32x4 = vcvtq_f32_u32(u32x4);
 
+       /* subtract zero point */
+       f32x4 = vsubq_f32(f32x4, vdupq_n_f32((float)zero_point));
+
        /* scale */
        f32x4 = vmulq_n_f32(f32x4, scale);
 
@@ -560,15 +603,17 @@ __uint16_to_float32_neon_f32x4(float scale, uint16_t 
*input, float *output)
 }
 
 static inline void
-__uint16_to_float32_neon_f32x1(float scale, uint16_t *input, float *output)
+__uint16_to_float32_neon_f32x1(const uint16_t *input, float *output, float 
scale,
+                              uint16_t zero_point)
 {
-       *output = scale * vcvts_f32_u32((uint32_t)*input);
+       *output = scale * (vcvts_f32_u32((uint32_t)*input) - (float)zero_point);
 }
 
 int
-rte_ml_io_uint16_to_float32(float scale, uint64_t nb_elements, void *input, 
void *output)
+rte_ml_io_uint16_to_float32(const void *input, void *output, uint64_t 
nb_elements, float scale,
+                          uint16_t zero_point)
 {
-       uint16_t *input_buffer;
+       const uint16_t *input_buffer;
        float *output_buffer;
        uint64_t nb_iterations;
        uint32_t vlen;
@@ -577,14 +622,14 @@ rte_ml_io_uint16_to_float32(float scale, uint64_t 
nb_elements, void *input, void
        if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == 
NULL))
                return -EINVAL;
 
-       input_buffer = (uint16_t *)input;
+       input_buffer = (const uint16_t *)input;
        output_buffer = (float *)output;
        vlen = 2 * sizeof(float) / sizeof(uint16_t);
        nb_iterations = nb_elements / vlen;
 
        /* convert vlen elements in each iteration */
        for (i = 0; i < nb_iterations; i++) {
-               __uint16_to_float32_neon_f32x4(scale, input_buffer, 
output_buffer);
+               __uint16_to_float32_neon_f32x4(input_buffer, output_buffer, 
scale, zero_point);
                input_buffer += vlen;
                output_buffer += vlen;
        }
@@ -592,7 +637,7 @@ rte_ml_io_uint16_to_float32(float scale, uint64_t 
nb_elements, void *input, void
        /* convert leftover elements */
        i = i * vlen;
        for (; i < nb_elements; i++) {
-               __uint16_to_float32_neon_f32x1(scale, input_buffer, 
output_buffer);
+               __uint16_to_float32_neon_f32x1(input_buffer, output_buffer, 
scale, zero_point);
                input_buffer++;
                output_buffer++;
        }
@@ -601,7 +646,7 @@ rte_ml_io_uint16_to_float32(float scale, uint64_t 
nb_elements, void *input, void
 }
 
 static inline void
-__float32_to_int32_neon_s32x4(float scale, float *input, int32_t *output)
+__float32_to_int32_neon_s32x4(const float *input, int32_t *output, float scale, int32_t zero_point)
 {
        float32x4_t f32x4;
        int32x4_t s32x4;
@@ -610,26 +655,42 @@ __float32_to_int32_neon_s32x4(float scale, float *input, int32_t *output)
        f32x4 = vld1q_f32(input);
 
        /* scale */
-       f32x4 = vmulq_n_f32(f32x4, scale);
+       f32x4 = vdivq_f32(f32x4, vdupq_n_f32(scale));
+
+       /* add zero point */
+       f32x4 = vaddq_f32(f32x4, vdupq_n_f32((float)zero_point));
 
        /* convert to int32x4_t using round to nearest with ties away rounding mode */
        s32x4 = vcvtaq_s32_f32(f32x4);
 
+       /* clamp to INT32_MIN + 1; zero point is already applied before the conversion */
+       s32x4 = vmaxq_s32(s32x4, vdupq_n_s32(INT32_MIN + 1));
+
        /* store 4 elements */
        vst1q_s32(output, s32x4);
 }
 
 static inline void
-__float32_to_int32_neon_s32x1(float scale, float *input, int32_t *output)
+__float32_to_int32_neon_s32x1(const float *input, int32_t *output, float scale, int32_t zero_point)
 {
+       float32x2_t f32x2;
+       int32x2_t s32x2;
+
        /* scale and convert, round to nearest with ties away rounding mode */
-       *output = vcvtas_s32_f32(scale * (*input));
+       f32x2 = vdiv_f32(vdup_n_f32(*input), vdup_n_f32(scale));
+       f32x2 = vadd_f32(f32x2, vdup_n_f32((float)zero_point));
+       s32x2 = vcvta_s32_f32(f32x2);
+       s32x2 = vmax_s32(s32x2, vdup_n_s32(INT32_MIN + 1));
+
+       /* store lane 0 of int32x2_t */
+       vst1_lane_s32(output, s32x2, 0);
 }
 
 int
-rte_ml_io_float32_to_int32(float scale, uint64_t nb_elements, void *input, 
void *output)
+rte_ml_io_float32_to_int32(const void *input, void *output, uint64_t 
nb_elements, float scale,
+                          int32_t zero_point)
 {
-       float *input_buffer;
+       const float *input_buffer;
        int32_t *output_buffer;
        uint64_t nb_iterations;
        uint32_t vlen;
@@ -638,14 +700,14 @@ rte_ml_io_float32_to_int32(float scale, uint64_t 
nb_elements, void *input, void
        if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == 
NULL))
                return -EINVAL;
 
-       input_buffer = (float *)input;
+       input_buffer = (const float *)input;
        output_buffer = (int32_t *)output;
        vlen = 2 * sizeof(float) / sizeof(int32_t);
        nb_iterations = nb_elements / vlen;
 
        /* convert vlen elements in each iteration */
        for (i = 0; i < nb_iterations; i++) {
-               __float32_to_int32_neon_s32x4(scale, input_buffer, 
output_buffer);
+               __float32_to_int32_neon_s32x4(input_buffer, output_buffer, 
scale, zero_point);
                input_buffer += vlen;
                output_buffer += vlen;
        }
@@ -653,7 +715,7 @@ rte_ml_io_float32_to_int32(float scale, uint64_t 
nb_elements, void *input, void
        /* convert leftover elements */
        i = i * vlen;
        for (; i < nb_elements; i++) {
-               __float32_to_int32_neon_s32x1(scale, input_buffer, 
output_buffer);
+               __float32_to_int32_neon_s32x1(input_buffer, output_buffer, 
scale, zero_point);
                input_buffer++;
                output_buffer++;
        }
@@ -662,7 +724,7 @@ rte_ml_io_float32_to_int32(float scale, uint64_t 
nb_elements, void *input, void
 }
 
 static inline void
-__int32_to_float32_neon_f32x4(float scale, int32_t *input, float *output)
+__int32_to_float32_neon_f32x4(const int32_t *input, float *output, float 
scale, int32_t zero_point)
 {
        float32x4_t f32x4;
        int32x4_t s32x4;
@@ -673,6 +735,9 @@ __int32_to_float32_neon_f32x4(float scale, int32_t *input, 
float *output)
        /* convert int32_t to float */
        f32x4 = vcvtq_f32_s32(s32x4);
 
+       /* subtract zero point */
+       f32x4 = vsubq_f32(f32x4, vdupq_n_f32((float)zero_point));
+
        /* scale */
        f32x4 = vmulq_n_f32(f32x4, scale);
 
@@ -681,15 +746,16 @@ __int32_to_float32_neon_f32x4(float scale, int32_t 
*input, float *output)
 }
 
 static inline void
-__int32_to_float32_neon_f32x1(float scale, int32_t *input, float *output)
+__int32_to_float32_neon_f32x1(const int32_t *input, float *output, float 
scale, int32_t zero_point)
 {
-       *output = scale * vcvts_f32_s32(*input);
+       *output = scale * (vcvts_f32_s32(*input) - (float)zero_point);
 }
 
 int
-rte_ml_io_int32_to_float32(float scale, uint64_t nb_elements, void *input, 
void *output)
+rte_ml_io_int32_to_float32(const void *input, void *output, uint64_t 
nb_elements, float scale,
+                          int32_t zero_point)
 {
-       int32_t *input_buffer;
+       const int32_t *input_buffer;
        float *output_buffer;
        uint64_t nb_iterations;
        uint32_t vlen;
@@ -698,14 +764,14 @@ rte_ml_io_int32_to_float32(float scale, uint64_t 
nb_elements, void *input, void
        if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == 
NULL))
                return -EINVAL;
 
-       input_buffer = (int32_t *)input;
+       input_buffer = (const int32_t *)input;
        output_buffer = (float *)output;
        vlen = 2 * sizeof(float) / sizeof(int32_t);
        nb_iterations = nb_elements / vlen;
 
        /* convert vlen elements in each iteration */
        for (i = 0; i < nb_iterations; i++) {
-               __int32_to_float32_neon_f32x4(scale, input_buffer, 
output_buffer);
+               __int32_to_float32_neon_f32x4(input_buffer, output_buffer, 
scale, zero_point);
                input_buffer += vlen;
                output_buffer += vlen;
        }
@@ -713,7 +779,7 @@ rte_ml_io_int32_to_float32(float scale, uint64_t 
nb_elements, void *input, void
        /* convert leftover elements */
        i = i * vlen;
        for (; i < nb_elements; i++) {
-               __int32_to_float32_neon_f32x1(scale, input_buffer, 
output_buffer);
+               __int32_to_float32_neon_f32x1(input_buffer, output_buffer, 
scale, zero_point);
                input_buffer++;
                output_buffer++;
        }
@@ -722,7 +788,8 @@ rte_ml_io_int32_to_float32(float scale, uint64_t 
nb_elements, void *input, void
 }
 
 static inline void
-__float32_to_uint32_neon_u32x4(float scale, float *input, uint32_t *output)
+__float32_to_uint32_neon_u32x4(const float *input, uint32_t *output, float 
scale,
+                              uint32_t zero_point)
 {
        float32x4_t f32x4;
        uint32x4_t u32x4;
@@ -731,7 +798,10 @@ __float32_to_uint32_neon_u32x4(float scale, float *input, 
uint32_t *output)
        f32x4 = vld1q_f32(input);
 
        /* scale */
-       f32x4 = vmulq_n_f32(f32x4, scale);
+       f32x4 = vdivq_f32(f32x4, vdupq_n_f32(scale));
+
+       /* add zero point */
+       f32x4 = vaddq_f32(f32x4, vdupq_n_f32((float)zero_point));
 
        /* convert using round to nearest with ties to away rounding mode */
        u32x4 = vcvtaq_u32_f32(f32x4);
@@ -741,16 +811,18 @@ __float32_to_uint32_neon_u32x4(float scale, float *input, 
uint32_t *output)
 }
 
 static inline void
-__float32_to_uint32_neon_u32x1(float scale, float *input, uint32_t *output)
+__float32_to_uint32_neon_u32x1(const float *input, uint32_t *output, float 
scale,
+                              uint32_t zero_point)
 {
        /* scale and convert, round to nearest with ties away rounding mode */
-       *output = vcvtas_u32_f32(scale * (*input));
+       *output = vcvtas_u32_f32((*input) / scale + (float)zero_point);
 }
 
 int
-rte_ml_io_float32_to_uint32(float scale, uint64_t nb_elements, void *input, 
void *output)
+rte_ml_io_float32_to_uint32(const void *input, void *output, uint64_t 
nb_elements, float scale,
+                          uint32_t zero_point)
 {
-       float *input_buffer;
+       const float *input_buffer;
        uint32_t *output_buffer;
        uint64_t nb_iterations;
        uint64_t vlen;
@@ -759,14 +831,14 @@ rte_ml_io_float32_to_uint32(float scale, uint64_t 
nb_elements, void *input, void
        if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == 
NULL))
                return -EINVAL;
 
-       input_buffer = (float *)input;
+       input_buffer = (const float *)input;
        output_buffer = (uint32_t *)output;
        vlen = 2 * sizeof(float) / sizeof(uint32_t);
        nb_iterations = nb_elements / vlen;
 
        /* convert vlen elements in each iteration */
        for (i = 0; i < nb_iterations; i++) {
-               __float32_to_uint32_neon_u32x4(scale, input_buffer, 
output_buffer);
+               __float32_to_uint32_neon_u32x4(input_buffer, output_buffer, 
scale, zero_point);
                input_buffer += vlen;
                output_buffer += vlen;
        }
@@ -774,7 +846,7 @@ rte_ml_io_float32_to_uint32(float scale, uint64_t 
nb_elements, void *input, void
        /* convert leftover elements */
        i = i * vlen;
        for (; i < nb_elements; i++) {
-               __float32_to_uint32_neon_u32x1(scale, input_buffer, 
output_buffer);
+               __float32_to_uint32_neon_u32x1(input_buffer, output_buffer, 
scale, zero_point);
                input_buffer++;
                output_buffer++;
        }
@@ -783,7 +855,8 @@ rte_ml_io_float32_to_uint32(float scale, uint64_t 
nb_elements, void *input, void
 }
 
 static inline void
-__uint32_to_float32_neon_f32x4(float scale, uint32_t *input, float *output)
+__uint32_to_float32_neon_f32x4(const uint32_t *input, float *output, float 
scale,
+                              uint32_t zero_point)
 {
        float32x4_t f32x4;
        uint32x4_t u32x4;
@@ -794,6 +867,9 @@ __uint32_to_float32_neon_f32x4(float scale, uint32_t 
*input, float *output)
        /* convert uint32_t to float */
        f32x4 = vcvtq_f32_u32(u32x4);
 
+       /* subtract zero point */
+       f32x4 = vsubq_f32(f32x4, vdupq_n_f32((float)zero_point));
+
        /* scale */
        f32x4 = vmulq_n_f32(f32x4, scale);
 
@@ -802,15 +878,17 @@ __uint32_to_float32_neon_f32x4(float scale, uint32_t 
*input, float *output)
 }
 
 static inline void
-__uint32_to_float32_neon_f32x1(float scale, uint32_t *input, float *output)
+__uint32_to_float32_neon_f32x1(const uint32_t *input, float *output, float 
scale,
+                              uint32_t zero_point)
 {
-       *output = scale * vcvts_f32_u32(*input);
+       *output = scale * (vcvts_f32_u32(*input) - (float)zero_point);
 }
 
 int
-rte_ml_io_uint32_to_float32(float scale, uint64_t nb_elements, void *input, 
void *output)
+rte_ml_io_uint32_to_float32(const void *input, void *output, uint64_t 
nb_elements, float scale,
+                          uint32_t zero_point)
 {
-       uint32_t *input_buffer;
+       const uint32_t *input_buffer;
        float *output_buffer;
        uint64_t nb_iterations;
        uint32_t vlen;
@@ -819,14 +897,14 @@ rte_ml_io_uint32_to_float32(float scale, uint64_t 
nb_elements, void *input, void
        if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == 
NULL))
                return -EINVAL;
 
-       input_buffer = (uint32_t *)input;
+       input_buffer = (const uint32_t *)input;
        output_buffer = (float *)output;
        vlen = 2 * sizeof(float) / sizeof(uint32_t);
        nb_iterations = nb_elements / vlen;
 
        /* convert vlen elements in each iteration */
        for (i = 0; i < nb_iterations; i++) {
-               __uint32_to_float32_neon_f32x4(scale, input_buffer, 
output_buffer);
+               __uint32_to_float32_neon_f32x4(input_buffer, output_buffer, 
scale, zero_point);
                input_buffer += vlen;
                output_buffer += vlen;
        }
@@ -834,7 +912,7 @@ rte_ml_io_uint32_to_float32(float scale, uint64_t 
nb_elements, void *input, void
        /* convert leftover elements */
        i = i * vlen;
        for (; i < nb_elements; i++) {
-               __uint32_to_float32_neon_f32x1(scale, input_buffer, 
output_buffer);
+               __uint32_to_float32_neon_f32x1(input_buffer, output_buffer, 
scale, zero_point);
                input_buffer++;
                output_buffer++;
        }
@@ -843,55 +921,68 @@ rte_ml_io_uint32_to_float32(float scale, uint64_t 
nb_elements, void *input, void
 }
 
 static inline void
-__float32_to_int64_neon_s64x2(float scale, float *input, int64_t *output)
+__float32_to_int64_neon_s64x2(const float *input, int64_t *output, float scale, int64_t zero_point)
 {
        float32x2_t f32x2;
        float64x2_t f64x2;
        int64x2_t s64x2;
+       uint64x2_t is_min;
 
        /* load 2 x float elements */
        f32x2 = vld1_f32(input);
 
        /* scale */
-       f32x2 = vmul_n_f32(f32x2, scale);
+       f32x2 = vdiv_f32(f32x2, vdup_n_f32(scale));
+
+       /* add zero point */
+       f32x2 = vadd_f32(f32x2, vdup_n_f32((float)zero_point));
 
        /* convert to float64x2_t */
        f64x2 = vcvt_f64_f32(f32x2);
 
        /* convert to int64x2_t */
        s64x2 = vcvtaq_s64_f64(f64x2);
+       is_min = vceqq_s64(s64x2, vdupq_n_s64(INT64_MIN));
+       s64x2 = vbslq_s64(is_min, vdupq_n_s64(INT64_MIN + 1), s64x2);
 
        /* store 2 elements */
        vst1q_s64(output, s64x2);
 }
 
 static inline void
-__float32_to_int64_neon_s64x1(float scale, float *input, int64_t *output)
+__float32_to_int64_neon_s64x1(const float *input, int64_t *output, float 
scale, int64_t zero_point)
 {
        float32x2_t f32x2;
        float64x2_t f64x2;
        int64x2_t s64x2;
+       int64_t s64;
 
        /* load 1 x float element */
        f32x2 = vdup_n_f32(*input);
 
        /* scale */
-       f32x2 = vmul_n_f32(f32x2, scale);
+       f32x2 = vdiv_f32(f32x2, vdup_n_f32(scale));
+
+       /* add zero point */
+       f32x2 = vadd_f32(f32x2, vdup_n_f32((float)zero_point));
 
        /* convert to float64x2_t */
        f64x2 = vcvt_f64_f32(f32x2);
 
        /* convert to int64x2_t */
        s64x2 = vcvtaq_s64_f64(f64x2);
+       s64 = vgetq_lane_s64(s64x2, 0);
+       s64 = (s64 == INT64_MIN) ? INT64_MIN + 1 : s64;
 
        /* store lane 0 of int64x2_t */
-       vst1q_lane_s64(output, s64x2, 0);
+       *output = s64;
 }
 
 int
-rte_ml_io_float32_to_int64(float scale, uint64_t nb_elements, void *input, void *output)
+rte_ml_io_float32_to_int64(const void *input, void *output, uint64_t nb_elements, float scale,
+                          int64_t zero_point)
 {
-       float *input_buffer;
+       const float *input_buffer;
        int64_t *output_buffer;
        uint64_t nb_iterations;
        uint32_t vlen;
@@ -900,14 +991,14 @@ rte_ml_io_float32_to_int64(float scale, uint64_t nb_elements, void *input, void
        if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
                return -EINVAL;
 
-       input_buffer = (float *)input;
+       input_buffer = (const float *)input;
        output_buffer = (int64_t *)output;
        vlen = 4 * sizeof(float) / sizeof(int64_t);
        nb_iterations = nb_elements / vlen;
 
        /* convert vlen elements in each iteration */
        for (i = 0; i < nb_iterations; i++) {
-               __float32_to_int64_neon_s64x2(scale, input_buffer, output_buffer);
+               __float32_to_int64_neon_s64x2(input_buffer, output_buffer, scale, zero_point);
                input_buffer += vlen;
                output_buffer += vlen;
        }
@@ -915,7 +1006,7 @@ rte_ml_io_float32_to_int64(float scale, uint64_t nb_elements, void *input, void
        /* convert leftover elements */
        i = i * vlen;
        for (; i < nb_elements; i++) {
-               __float32_to_int64_neon_s64x1(scale, input_buffer, output_buffer);
+               __float32_to_int64_neon_s64x1(input_buffer, output_buffer, scale, zero_point);
                input_buffer++;
                output_buffer++;
        }
@@ -924,7 +1015,7 @@ rte_ml_io_float32_to_int64(float scale, uint64_t nb_elements, void *input, void
 }
 
 static inline void
-__int64_to_float32_neon_f32x2(float scale, int64_t *input, float *output)
+__int64_to_float32_neon_f32x2(const int64_t *input, float *output, float scale, int64_t zero_point)
 {
        int64x2_t s64x2;
        float64x2_t f64x2;
@@ -939,6 +1030,9 @@ __int64_to_float32_neon_f32x2(float scale, int64_t *input, float *output)
        /* convert float64x2_t to float32x2_t */
        f32x2 = vcvt_f32_f64(f64x2);
 
+       /* subtract zero_point */
+       f32x2 = vsub_f32(f32x2, vdup_n_f32(zero_point));
+
        /* scale */
        f32x2 = vmul_n_f32(f32x2, scale);
 
@@ -947,7 +1041,7 @@ __int64_to_float32_neon_f32x2(float scale, int64_t *input, float *output)
 }
 
 static inline void
-__int64_to_float32_neon_f32x1(float scale, int64_t *input, float *output)
+__int64_to_float32_neon_f32x1(const int64_t *input, float *output, float scale, int64_t zero_point)
 {
        int64x2_t s64x2;
        float64x2_t f64x2;
@@ -962,17 +1056,21 @@ __int64_to_float32_neon_f32x1(float scale, int64_t *input, float *output)
        /* convert float64x2_t to float32x2_t */
        f32x2 = vcvt_f32_f64(f64x2);
 
+       /* subtract zero_point */
+       f32x2 = vsub_f32(f32x2, vdup_n_f32(zero_point));
+
        /* scale */
        f32x2 = vmul_n_f32(f32x2, scale);
 
-       /* store float32x2_t */
+       /* store float32x2_t lane 0 */
        vst1_lane_f32(output, f32x2, 0);
 }
 
 int
-rte_ml_io_int64_to_float32(float scale, uint64_t nb_elements, void *input, void *output)
+rte_ml_io_int64_to_float32(const void *input, void *output, uint64_t nb_elements, float scale,
+                          int64_t zero_point)
 {
-       int64_t *input_buffer;
+       const int64_t *input_buffer;
        float *output_buffer;
        uint64_t nb_iterations;
        uint32_t vlen;
@@ -981,14 +1079,14 @@ rte_ml_io_int64_to_float32(float scale, uint64_t nb_elements, void *input, void
        if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
                return -EINVAL;
 
-       input_buffer = (int64_t *)input;
+       input_buffer = (const int64_t *)input;
        output_buffer = (float *)output;
        vlen = 4 * sizeof(float) / sizeof(int64_t);
        nb_iterations = nb_elements / vlen;
 
        /* convert vlen elements in each iteration */
        for (i = 0; i < nb_iterations; i++) {
-               __int64_to_float32_neon_f32x2(scale, input_buffer, output_buffer);
+               __int64_to_float32_neon_f32x2(input_buffer, output_buffer, scale, zero_point);
                input_buffer += vlen;
                output_buffer += vlen;
        }
@@ -996,7 +1094,7 @@ rte_ml_io_int64_to_float32(float scale, uint64_t nb_elements, void *input, void
        /* convert leftover elements */
        i = i * vlen;
        for (; i < nb_elements; i++) {
-               __int64_to_float32_neon_f32x1(scale, input_buffer, output_buffer);
+               __int64_to_float32_neon_f32x1(input_buffer, output_buffer, scale, zero_point);
                input_buffer++;
                output_buffer++;
        }
@@ -1005,7 +1103,8 @@ rte_ml_io_int64_to_float32(float scale, uint64_t nb_elements, void *input, void
 }
 
 static inline void
-__float32_to_uint64_neon_u64x2(float scale, float *input, uint64_t *output)
+__float32_to_uint64_neon_u64x2(const float *input, uint64_t *output, float scale,
+                              uint64_t zero_point)
 {
        float32x2_t f32x2;
        float64x2_t f64x2;
@@ -1015,7 +1114,10 @@ __float32_to_uint64_neon_u64x2(float scale, float *input, uint64_t *output)
        f32x2 = vld1_f32(input);
 
        /* scale */
-       f32x2 = vmul_n_f32(f32x2, scale);
+       f32x2 = vdiv_f32(f32x2, vdup_n_f32(scale));
+
+       /* add zero point */
+       f32x2 = vadd_f32(f32x2, vdup_n_f32((float)zero_point));
 
        /* convert to float64x2_t */
        f64x2 = vcvt_f64_f32(f32x2);
@@ -1028,7 +1130,8 @@ __float32_to_uint64_neon_u64x2(float scale, float *input, uint64_t *output)
 }
 
 static inline void
-__float32_to_uint64_neon_u64x1(float scale, float *input, uint64_t *output)
+__float32_to_uint64_neon_u64x1(const float *input, uint64_t *output, float scale,
+                              uint64_t zero_point)
 {
        float32x2_t f32x2;
        float64x2_t f64x2;
@@ -1038,7 +1141,10 @@ __float32_to_uint64_neon_u64x1(float scale, float *input, uint64_t *output)
        f32x2 = vld1_lane_f32(input, vdup_n_f32(0), 0);
 
        /* scale */
-       f32x2 = vmul_n_f32(f32x2, scale);
+       f32x2 = vdiv_f32(f32x2, vdup_n_f32(scale));
+
+       /* add zero_point */
+       f32x2 = vadd_f32(f32x2, vdup_n_f32((float)zero_point));
 
        /* convert to float64x2_t */
        f64x2 = vcvt_f64_f32(f32x2);
@@ -1051,9 +1157,10 @@ __float32_to_uint64_neon_u64x1(float scale, float *input, uint64_t *output)
 }
 
 int
-rte_ml_io_float32_to_uint64(float scale, uint64_t nb_elements, void *input, void *output)
+rte_ml_io_float32_to_uint64(const void *input, void *output, uint64_t nb_elements, float scale,
+                          uint64_t zero_point)
 {
-       float *input_buffer;
+       const float *input_buffer;
        uint64_t *output_buffer;
        uint64_t nb_iterations;
        uint32_t vlen;
@@ -1062,14 +1169,14 @@ rte_ml_io_float32_to_uint64(float scale, uint64_t nb_elements, void *input, void
        if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
                return -EINVAL;
 
-       input_buffer = (float *)input;
+       input_buffer = (const float *)input;
        output_buffer = (uint64_t *)output;
        vlen = 4 * sizeof(float) / sizeof(uint64_t);
        nb_iterations = nb_elements / vlen;
 
        /* convert vlen elements in each iteration */
        for (i = 0; i < nb_iterations; i++) {
-               __float32_to_uint64_neon_u64x2(scale, input_buffer, output_buffer);
+               __float32_to_uint64_neon_u64x2(input_buffer, output_buffer, scale, zero_point);
                input_buffer += vlen;
                output_buffer += vlen;
        }
@@ -1077,7 +1184,7 @@ rte_ml_io_float32_to_uint64(float scale, uint64_t nb_elements, void *input, void
        /* convert leftover elements */
        i = i * vlen;
        for (; i < nb_elements; i++) {
-               __float32_to_uint64_neon_u64x1(scale, input_buffer, output_buffer);
+               __float32_to_uint64_neon_u64x1(input_buffer, output_buffer, scale, zero_point);
                input_buffer++;
                output_buffer++;
        }
@@ -1086,7 +1193,8 @@ rte_ml_io_float32_to_uint64(float scale, uint64_t nb_elements, void *input, void
 }
 
 static inline void
-__uint64_to_float32_neon_f32x2(float scale, uint64_t *input, float *output)
+__uint64_to_float32_neon_f32x2(const uint64_t *input, float *output, float scale,
+                              uint64_t zero_point)
 {
        uint64x2_t u64x2;
        float64x2_t f64x2;
@@ -1101,6 +1209,9 @@ __uint64_to_float32_neon_f32x2(float scale, uint64_t *input, float *output)
        /* convert float64x2_t to float32x2_t */
        f32x2 = vcvt_f32_f64(f64x2);
 
+       /* subtract zero_point */
+       f32x2 = vsub_f32(f32x2, vdup_n_f32((float)zero_point));
+
        /* scale */
        f32x2 = vmul_n_f32(f32x2, scale);
 
@@ -1109,7 +1220,8 @@ __uint64_to_float32_neon_f32x2(float scale, uint64_t *input, float *output)
 }
 
 static inline void
-__uint64_to_float32_neon_f32x1(float scale, uint64_t *input, float *output)
+__uint64_to_float32_neon_f32x1(const uint64_t *input, float *output, float scale,
+                              uint64_t zero_point)
 {
        uint64x2_t u64x2;
        float64x2_t f64x2;
@@ -1124,17 +1236,21 @@ __uint64_to_float32_neon_f32x1(float scale, uint64_t *input, float *output)
        /* convert float64x2_t to float32x2_t */
        f32x2 = vcvt_f32_f64(f64x2);
 
+       /* subtract zero_point */
+       f32x2 = vsub_f32(f32x2, vdup_n_f32((float)zero_point));
+
        /* scale */
        f32x2 = vmul_n_f32(f32x2, scale);
 
-       /* store float32x2_t */
+       /* store float32x2_t lane 0 */
        vst1_lane_f32(output, f32x2, 0);
 }
 
 int
-rte_ml_io_uint64_to_float32(float scale, uint64_t nb_elements, void *input, void *output)
+rte_ml_io_uint64_to_float32(const void *input, void *output, uint64_t nb_elements, float scale,
+                          uint64_t zero_point)
 {
-       uint64_t *input_buffer;
+       const uint64_t *input_buffer;
        float *output_buffer;
        uint64_t nb_iterations;
        uint32_t vlen;
@@ -1143,14 +1259,14 @@ rte_ml_io_uint64_to_float32(float scale, uint64_t nb_elements, void *input, void
        if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
                return -EINVAL;
 
-       input_buffer = (uint64_t *)input;
+       input_buffer = (const uint64_t *)input;
        output_buffer = (float *)output;
        vlen = 4 * sizeof(float) / sizeof(uint64_t);
        nb_iterations = nb_elements / vlen;
 
        /* convert vlen elements in each iteration */
        for (i = 0; i < nb_iterations; i++) {
-               __uint64_to_float32_neon_f32x2(scale, input_buffer, output_buffer);
+               __uint64_to_float32_neon_f32x2(input_buffer, output_buffer, scale, zero_point);
                input_buffer += vlen;
                output_buffer += vlen;
        }
@@ -1158,7 +1274,7 @@ rte_ml_io_uint64_to_float32(float scale, uint64_t nb_elements, void *input, void
        /* convert leftover elements */
        i = i * vlen;
        for (; i < nb_elements; i++) {
-               __uint64_to_float32_neon_f32x1(scale, input_buffer, output_buffer);
+               __uint64_to_float32_neon_f32x1(input_buffer, output_buffer, scale, zero_point);
                input_buffer++;
                output_buffer++;
        }
@@ -1167,7 +1283,7 @@ rte_ml_io_uint64_to_float32(float scale, uint64_t nb_elements, void *input, void
 }
 
 static inline void
-__float32_to_float16_neon_f16x4(float32_t *input, float16_t *output)
+__float32_to_float16_neon_f16x4(const float32_t *input, float16_t *output)
 {
        float32x4_t f32x4;
        float16x4_t f16x4;
@@ -1183,7 +1299,7 @@ __float32_to_float16_neon_f16x4(float32_t *input, float16_t *output)
 }
 
 static inline void
-__float32_to_float16_neon_f16x1(float32_t *input, float16_t *output)
+__float32_to_float16_neon_f16x1(const float32_t *input, float16_t *output)
 {
        float32x4_t f32x4;
        float16x4_t f16x4;
@@ -1199,9 +1315,9 @@ __float32_to_float16_neon_f16x1(float32_t *input, float16_t *output)
 }
 
 int
-rte_ml_io_float32_to_float16(uint64_t nb_elements, void *input, void *output)
+rte_ml_io_float32_to_float16(const void *input, void *output, uint64_t nb_elements)
 {
-       float32_t *input_buffer;
+       const float32_t *input_buffer;
        float16_t *output_buffer;
        uint64_t nb_iterations;
        uint32_t vlen;
@@ -1210,7 +1326,7 @@ rte_ml_io_float32_to_float16(uint64_t nb_elements, void *input, void *output)
        if ((nb_elements == 0) || (input == NULL) || (output == NULL))
                return -EINVAL;
 
-       input_buffer = (float32_t *)input;
+       input_buffer = (const float32_t *)input;
        output_buffer = (float16_t *)output;
        vlen = 2 * sizeof(float32_t) / sizeof(float16_t);
        nb_iterations = nb_elements / vlen;
@@ -1234,7 +1350,7 @@ rte_ml_io_float32_to_float16(uint64_t nb_elements, void *input, void *output)
 }
 
 static inline void
-__float16_to_float32_neon_f32x4(float16_t *input, float32_t *output)
+__float16_to_float32_neon_f32x4(const float16_t *input, float32_t *output)
 {
        float16x4_t f16x4;
        float32x4_t f32x4;
@@ -1250,7 +1366,7 @@ __float16_to_float32_neon_f32x4(float16_t *input, float32_t *output)
 }
 
 static inline void
-__float16_to_float32_neon_f32x1(float16_t *input, float32_t *output)
+__float16_to_float32_neon_f32x1(const float16_t *input, float32_t *output)
 {
        float16x4_t f16x4;
        float32x4_t f32x4;
@@ -1266,9 +1382,9 @@ __float16_to_float32_neon_f32x1(float16_t *input, float32_t *output)
 }
 
 int
-rte_ml_io_float16_to_float32(uint64_t nb_elements, void *input, void *output)
+rte_ml_io_float16_to_float32(const void *input, void *output, uint64_t nb_elements)
 {
-       float16_t *input_buffer;
+       const float16_t *input_buffer;
        float32_t *output_buffer;
        uint64_t nb_iterations;
        uint32_t vlen;
@@ -1277,7 +1393,7 @@ rte_ml_io_float16_to_float32(uint64_t nb_elements, void *input, void *output)
        if ((nb_elements == 0) || (input == NULL) || (output == NULL))
                return -EINVAL;
 
-       input_buffer = (float16_t *)input;
+       input_buffer = (const float16_t *)input;
        output_buffer = (float32_t *)output;
        vlen = 2 * sizeof(float32_t) / sizeof(float16_t);
        nb_iterations = nb_elements / vlen;
diff --git a/lib/mldev/mldev_utils_neon_bfloat16.c b/lib/mldev/mldev_utils_neon_bfloat16.c
index 8dec3fd8343..b8e68532669 100644
--- a/lib/mldev/mldev_utils_neon_bfloat16.c
+++ b/lib/mldev/mldev_utils_neon_bfloat16.c
@@ -18,7 +18,7 @@
 #ifdef __ARM_FEATURE_BF16
 
 static inline void
-__float32_to_bfloat16_neon_f16x4(float32_t *input, bfloat16_t *output)
+__float32_to_bfloat16_neon_f16x4(const float32_t *input, bfloat16_t *output)
 {
        float32x4_t f32x4;
        bfloat16x4_t bf16x4;
@@ -34,7 +34,7 @@ __float32_to_bfloat16_neon_f16x4(float32_t *input, bfloat16_t *output)
 }
 
 static inline void
-__float32_to_bfloat16_neon_f16x1(float32_t *input, bfloat16_t *output)
+__float32_to_bfloat16_neon_f16x1(const float32_t *input, bfloat16_t *output)
 {
        float32x4_t f32x4;
        bfloat16x4_t bf16x4;
@@ -50,9 +50,9 @@ __float32_to_bfloat16_neon_f16x1(float32_t *input, bfloat16_t *output)
 }
 
 int
-rte_ml_io_float32_to_bfloat16(uint64_t nb_elements, void *input, void *output)
+rte_ml_io_float32_to_bfloat16(const void *input, void *output, uint64_t nb_elements)
 {
-       float32_t *input_buffer;
+       const float32_t *input_buffer;
        bfloat16_t *output_buffer;
        uint64_t nb_iterations;
        uint32_t vlen;
@@ -61,7 +61,7 @@ rte_ml_io_float32_to_bfloat16(uint64_t nb_elements, void *input, void *output)
        if ((nb_elements == 0) || (input == NULL) || (output == NULL))
                return -EINVAL;
 
-       input_buffer = (float32_t *)input;
+       input_buffer = (const float32_t *)input;
        output_buffer = (bfloat16_t *)output;
        vlen = 2 * sizeof(float32_t) / sizeof(bfloat16_t);
        nb_iterations = nb_elements / vlen;
@@ -85,7 +85,7 @@ rte_ml_io_float32_to_bfloat16(uint64_t nb_elements, void *input, void *output)
 }
 
 static inline void
-__bfloat16_to_float32_neon_f32x4(bfloat16_t *input, float32_t *output)
+__bfloat16_to_float32_neon_f32x4(const bfloat16_t *input, float32_t *output)
 {
        bfloat16x4_t bf16x4;
        float32x4_t f32x4;
@@ -101,7 +101,7 @@ __bfloat16_to_float32_neon_f32x4(bfloat16_t *input, float32_t *output)
 }
 
 static inline void
-__bfloat16_to_float32_neon_f32x1(bfloat16_t *input, float32_t *output)
+__bfloat16_to_float32_neon_f32x1(const bfloat16_t *input, float32_t *output)
 {
        bfloat16x4_t bf16x4;
        float32x4_t f32x4;
@@ -117,9 +117,9 @@ __bfloat16_to_float32_neon_f32x1(bfloat16_t *input, float32_t *output)
 }
 
 int
-rte_ml_io_bfloat16_to_float32(uint64_t nb_elements, void *input, void *output)
+rte_ml_io_bfloat16_to_float32(const void *input, void *output, uint64_t nb_elements)
 {
-       bfloat16_t *input_buffer;
+       const bfloat16_t *input_buffer;
        float32_t *output_buffer;
        uint64_t nb_iterations;
        uint32_t vlen;
@@ -128,7 +128,7 @@ rte_ml_io_bfloat16_to_float32(uint64_t nb_elements, void *input, void *output)
        if ((nb_elements == 0) || (input == NULL) || (output == NULL))
                return -EINVAL;
 
-       input_buffer = (bfloat16_t *)input;
+       input_buffer = (const bfloat16_t *)input;
        output_buffer = (float32_t *)output;
        vlen = 2 * sizeof(float32_t) / sizeof(bfloat16_t);
        nb_iterations = nb_elements / vlen;
diff --git a/lib/mldev/mldev_utils_scalar.c b/lib/mldev/mldev_utils_scalar.c
index 63a9900cc8c..e1fefdec3b4 100644
--- a/lib/mldev/mldev_utils_scalar.c
+++ b/lib/mldev/mldev_utils_scalar.c
@@ -10,9 +10,10 @@
  */
 
 int
-rte_ml_io_float32_to_int8(float scale, uint64_t nb_elements, void *input, void *output)
+rte_ml_io_float32_to_int8(const void *input, void *output, uint64_t nb_elements, float scale,
+                         int8_t zero_point)
 {
-       float *input_buffer;
+       const float *input_buffer;
        int8_t *output_buffer;
        uint64_t i;
        int i32;
@@ -20,11 +21,11 @@ rte_ml_io_float32_to_int8(float scale, uint64_t nb_elements, void *input, void *
        if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
                return -EINVAL;
 
-       input_buffer = (float *)input;
+       input_buffer = (const float *)input;
        output_buffer = (int8_t *)output;
 
        for (i = 0; i < nb_elements; i++) {
-               i32 = (int32_t)round((*input_buffer) * scale);
+               i32 = (int32_t)(round(*input_buffer / scale) + zero_point);
 
                if (i32 < INT8_MIN)
                        i32 = INT8_MIN;
@@ -42,20 +43,21 @@ rte_ml_io_float32_to_int8(float scale, uint64_t nb_elements, void *input, void *
 }
 
 int
-rte_ml_io_int8_to_float32(float scale, uint64_t nb_elements, void *input, void *output)
+rte_ml_io_int8_to_float32(const void *input, void *output, uint64_t nb_elements, float scale,
+                         int8_t zero_point)
 {
-       int8_t *input_buffer;
+       const int8_t *input_buffer;
        float *output_buffer;
        uint64_t i;
 
        if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
                return -EINVAL;
 
-       input_buffer = (int8_t *)input;
+       input_buffer = (const int8_t *)input;
        output_buffer = (float *)output;
 
        for (i = 0; i < nb_elements; i++) {
-               *output_buffer = scale * (float)(*input_buffer);
+               *output_buffer = scale * (float)(*input_buffer - zero_point);
 
                input_buffer++;
                output_buffer++;
@@ -65,9 +67,10 @@ rte_ml_io_int8_to_float32(float scale, uint64_t nb_elements, void *input, void *
 }
 
 int
-rte_ml_io_float32_to_uint8(float scale, uint64_t nb_elements, void *input, void *output)
+rte_ml_io_float32_to_uint8(const void *input, void *output, uint64_t nb_elements, float scale,
+                          uint8_t zero_point)
 {
-       float *input_buffer;
+       const float *input_buffer;
        uint8_t *output_buffer;
        int32_t i32;
        uint64_t i;
@@ -75,11 +78,11 @@ rte_ml_io_float32_to_uint8(float scale, uint64_t nb_elements, void *input, void
        if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
                return -EINVAL;
 
-       input_buffer = (float *)input;
+       input_buffer = (const float *)input;
        output_buffer = (uint8_t *)output;
 
        for (i = 0; i < nb_elements; i++) {
-               i32 = (int32_t)round((*input_buffer) * scale);
+               i32 = (int32_t)(round(*input_buffer / scale) + zero_point);
 
                if (i32 < 0)
                        i32 = 0;
@@ -97,20 +100,21 @@ rte_ml_io_float32_to_uint8(float scale, uint64_t nb_elements, void *input, void
 }
 
 int
-rte_ml_io_uint8_to_float32(float scale, uint64_t nb_elements, void *input, void *output)
+rte_ml_io_uint8_to_float32(const void *input, void *output, uint64_t nb_elements, float scale,
+                          uint8_t zero_point)
 {
-       uint8_t *input_buffer;
+       const uint8_t *input_buffer;
        float *output_buffer;
        uint64_t i;
 
        if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
                return -EINVAL;
 
-       input_buffer = (uint8_t *)input;
+       input_buffer = (const uint8_t *)input;
        output_buffer = (float *)output;
 
        for (i = 0; i < nb_elements; i++) {
-               *output_buffer = scale * (float)(*input_buffer);
+               *output_buffer = scale * (float)(*input_buffer - zero_point);
 
                input_buffer++;
                output_buffer++;
@@ -120,9 +124,10 @@ rte_ml_io_uint8_to_float32(float scale, uint64_t nb_elements, void *input, void
 }
 
 int
-rte_ml_io_float32_to_int16(float scale, uint64_t nb_elements, void *input, void *output)
+rte_ml_io_float32_to_int16(const void *input, void *output, uint64_t nb_elements, float scale,
+                          int16_t zero_point)
 {
-       float *input_buffer;
+       const float *input_buffer;
        int16_t *output_buffer;
        int32_t i32;
        uint64_t i;
@@ -130,11 +135,11 @@ rte_ml_io_float32_to_int16(float scale, uint64_t nb_elements, void *input, void
        if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
                return -EINVAL;
 
-       input_buffer = (float *)input;
+       input_buffer = (const float *)input;
        output_buffer = (int16_t *)output;
 
        for (i = 0; i < nb_elements; i++) {
-               i32 = (int32_t)round((*input_buffer) * scale);
+               i32 = (int32_t)(round(*input_buffer / scale) + zero_point);
 
                if (i32 < INT16_MIN)
                        i32 = INT16_MIN;
@@ -152,20 +157,21 @@ rte_ml_io_float32_to_int16(float scale, uint64_t nb_elements, void *input, void
 }
 
 int
-rte_ml_io_int16_to_float32(float scale, uint64_t nb_elements, void *input, void *output)
+rte_ml_io_int16_to_float32(const void *input, void *output, uint64_t nb_elements, float scale,
+                          int16_t zero_point)
 {
-       int16_t *input_buffer;
+       const int16_t *input_buffer;
        float *output_buffer;
        uint64_t i;
 
        if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
                return -EINVAL;
 
-       input_buffer = (int16_t *)input;
+       input_buffer = (const int16_t *)input;
        output_buffer = (float *)output;
 
        for (i = 0; i < nb_elements; i++) {
-               *output_buffer = scale * (float)(*input_buffer);
+               *output_buffer = scale * (float)(*input_buffer - zero_point);
 
                input_buffer++;
                output_buffer++;
@@ -175,9 +181,10 @@ rte_ml_io_int16_to_float32(float scale, uint64_t nb_elements, void *input, void
 }
 
 int
-rte_ml_io_float32_to_uint16(float scale, uint64_t nb_elements, void *input, void *output)
+rte_ml_io_float32_to_uint16(const void *input, void *output, uint64_t nb_elements, float scale,
+                           uint16_t zero_point)
 {
-       float *input_buffer;
+       const float *input_buffer;
        uint16_t *output_buffer;
        int32_t i32;
        uint64_t i;
@@ -185,11 +192,11 @@ rte_ml_io_float32_to_uint16(float scale, uint64_t nb_elements, void *input, void
        if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
                return -EINVAL;
 
-       input_buffer = (float *)input;
+       input_buffer = (const float *)input;
        output_buffer = (uint16_t *)output;
 
        for (i = 0; i < nb_elements; i++) {
-               i32 = (int32_t)round((*input_buffer) * scale);
+               i32 = (int32_t)(round(*input_buffer / scale) + zero_point);
 
                if (i32 < 0)
                        i32 = 0;
@@ -207,20 +214,21 @@ rte_ml_io_float32_to_uint16(float scale, uint64_t nb_elements, void *input, void
 }
 
 int
-rte_ml_io_uint16_to_float32(float scale, uint64_t nb_elements, void *input, void *output)
+rte_ml_io_uint16_to_float32(const void *input, void *output, uint64_t nb_elements, float scale,
+                           uint16_t zero_point)
 {
-       uint16_t *input_buffer;
+       const uint16_t *input_buffer;
        float *output_buffer;
        uint64_t i;
 
        if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
                return -EINVAL;
 
-       input_buffer = (uint16_t *)input;
+       input_buffer = (const uint16_t *)input;
        output_buffer = (float *)output;
 
        for (i = 0; i < nb_elements; i++) {
-               *output_buffer = scale * (float)(*input_buffer);
+               *output_buffer = scale * (float)(*input_buffer - zero_point);
 
                input_buffer++;
                output_buffer++;
@@ -230,20 +238,21 @@ rte_ml_io_uint16_to_float32(float scale, uint64_t nb_elements, void *input, void
 }
 
 int
-rte_ml_io_float32_to_int32(float scale, uint64_t nb_elements, void *input, void *output)
+rte_ml_io_float32_to_int32(const void *input, void *output, uint64_t nb_elements, float scale,
+                          int32_t zero_point)
 {
-       float *input_buffer;
+       const float *input_buffer;
        int32_t *output_buffer;
        uint64_t i;
 
        if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
                return -EINVAL;
 
-       input_buffer = (float *)input;
+       input_buffer = (const float *)input;
        output_buffer = (int32_t *)output;
 
        for (i = 0; i < nb_elements; i++) {
-               *output_buffer = (int32_t)round((*input_buffer) * scale);
+               *output_buffer = (int32_t)(round(*input_buffer / scale) + zero_point);
 
                input_buffer++;
                output_buffer++;
@@ -253,20 +262,21 @@ rte_ml_io_float32_to_int32(float scale, uint64_t nb_elements, void *input, void
 }
 
 int
-rte_ml_io_int32_to_float32(float scale, uint64_t nb_elements, void *input, void *output)
+rte_ml_io_int32_to_float32(const void *input, void *output, uint64_t nb_elements, float scale,
+                          int32_t zero_point)
 {
-       int32_t *input_buffer;
+       const int32_t *input_buffer;
        float *output_buffer;
        uint64_t i;
 
        if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
                return -EINVAL;
 
-       input_buffer = (int32_t *)input;
+       input_buffer = (const int32_t *)input;
        output_buffer = (float *)output;
 
        for (i = 0; i < nb_elements; i++) {
-               *output_buffer = scale * (float)(*input_buffer);
+               *output_buffer = scale * (float)(*input_buffer - zero_point);
 
                input_buffer++;
                output_buffer++;
@@ -276,9 +286,10 @@ rte_ml_io_int32_to_float32(float scale, uint64_t nb_elements, void *input, void
 }
 
 int
-rte_ml_io_float32_to_uint32(float scale, uint64_t nb_elements, void *input, void *output)
+rte_ml_io_float32_to_uint32(const void *input, void *output, uint64_t nb_elements, float scale,
+                           uint32_t zero_point)
 {
-       float *input_buffer;
+       const float *input_buffer;
        uint32_t *output_buffer;
        int32_t i32;
        uint64_t i;
@@ -286,11 +297,11 @@ rte_ml_io_float32_to_uint32(float scale, uint64_t nb_elements, void *input, void
        if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
                return -EINVAL;
 
-       input_buffer = (float *)input;
+       input_buffer = (const float *)input;
        output_buffer = (uint32_t *)output;
 
        for (i = 0; i < nb_elements; i++) {
-               i32 = (int32_t)round((*input_buffer) * scale);
+               i32 = (int32_t)(round(*input_buffer / scale) + zero_point);
 
                if (i32 < 0)
                        i32 = 0;
@@ -305,20 +316,21 @@ rte_ml_io_float32_to_uint32(float scale, uint64_t nb_elements, void *input, void
 }
 
 int
-rte_ml_io_uint32_to_float32(float scale, uint64_t nb_elements, void *input, void *output)
+rte_ml_io_uint32_to_float32(const void *input, void *output, uint64_t nb_elements, float scale,
+                           uint32_t zero_point)
 {
-       uint32_t *input_buffer;
+       const uint32_t *input_buffer;
        float *output_buffer;
        uint64_t i;
 
        if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
                return -EINVAL;
 
-       input_buffer = (uint32_t *)input;
+       input_buffer = (const uint32_t *)input;
        output_buffer = (float *)output;
 
        for (i = 0; i < nb_elements; i++) {
-               *output_buffer = scale * (float)(*input_buffer);
+               *output_buffer = scale * (float)(*input_buffer - zero_point);
 
                input_buffer++;
                output_buffer++;
@@ -328,20 +340,21 @@ rte_ml_io_uint32_to_float32(float scale, uint64_t nb_elements, void *input, void
 }
 
 int
-rte_ml_io_float32_to_int64(float scale, uint64_t nb_elements, void *input, void *output)
+rte_ml_io_float32_to_int64(const void *input, void *output, uint64_t nb_elements, float scale,
+                          int64_t zero_point)
 {
-       float *input_buffer;
+       const float *input_buffer;
        int64_t *output_buffer;
        uint64_t i;
 
        if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
                return -EINVAL;
 
-       input_buffer = (float *)input;
+       input_buffer = (const float *)input;
        output_buffer = (int64_t *)output;
 
        for (i = 0; i < nb_elements; i++) {
-               *output_buffer = (int64_t)round((*input_buffer) * scale);
+               *output_buffer = (int64_t)(round(*input_buffer / scale) + zero_point);
 
                input_buffer++;
                output_buffer++;
@@ -351,20 +364,21 @@ rte_ml_io_float32_to_int64(float scale, uint64_t nb_elements, void *input, void
 }
 
 int
-rte_ml_io_int64_to_float32(float scale, uint64_t nb_elements, void *input, void *output)
+rte_ml_io_int64_to_float32(const void *input, void *output, uint64_t nb_elements, float scale,
+                          int64_t zero_point)
 {
-       int64_t *input_buffer;
+       const int64_t *input_buffer;
        float *output_buffer;
        uint64_t i;
 
        if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == NULL))
                return -EINVAL;
 
-       input_buffer = (int64_t *)input;
+       input_buffer = (const int64_t *)input;
        output_buffer = (float *)output;
 
        for (i = 0; i < nb_elements; i++) {
-               *output_buffer = scale * (float)(*input_buffer);
+               *output_buffer = scale * (float)(*input_buffer - zero_point);
 
                input_buffer++;
                output_buffer++;
@@ -374,9 +388,10 @@ rte_ml_io_int64_to_float32(float scale, uint64_t 
nb_elements, void *input, void
 }
 
 int
-rte_ml_io_float32_to_uint64(float scale, uint64_t nb_elements, void *input, 
void *output)
+rte_ml_io_float32_to_uint64(const void *input, void *output, uint64_t 
nb_elements, float scale,
+                           uint64_t zero_point)
 {
-       float *input_buffer;
+       const float *input_buffer;
        uint64_t *output_buffer;
        int64_t i64;
        uint64_t i;
@@ -384,11 +399,11 @@ rte_ml_io_float32_to_uint64(float scale, uint64_t 
nb_elements, void *input, void
        if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == 
NULL))
                return -EINVAL;
 
-       input_buffer = (float *)input;
+       input_buffer = (const float *)input;
        output_buffer = (uint64_t *)output;
 
        for (i = 0; i < nb_elements; i++) {
-               i64 = (int64_t)round((*input_buffer) * scale);
+               i64 = (int64_t)(round(*input_buffer / scale) + zero_point);
 
                if (i64 < 0)
                        i64 = 0;
@@ -403,20 +418,21 @@ rte_ml_io_float32_to_uint64(float scale, uint64_t 
nb_elements, void *input, void
 }
 
 int
-rte_ml_io_uint64_to_float32(float scale, uint64_t nb_elements, void *input, 
void *output)
+rte_ml_io_uint64_to_float32(const void *input, void *output, uint64_t 
nb_elements, float scale,
+                           uint64_t zero_point)
 {
-       uint64_t *input_buffer;
+       const uint64_t *input_buffer;
        float *output_buffer;
        uint64_t i;
 
        if ((scale == 0) || (nb_elements == 0) || (input == NULL) || (output == 
NULL))
                return -EINVAL;
 
-       input_buffer = (uint64_t *)input;
+       input_buffer = (const uint64_t *)input;
        output_buffer = (float *)output;
 
        for (i = 0; i < nb_elements; i++) {
-               *output_buffer = scale * (float)(*input_buffer);
+               *output_buffer = scale * (float)(*input_buffer - zero_point);
 
                input_buffer++;
                output_buffer++;
@@ -548,16 +564,16 @@ __float32_to_float16_scalar_rtn(float x)
 }
 
 int
-rte_ml_io_float32_to_float16(uint64_t nb_elements, void *input, void *output)
+rte_ml_io_float32_to_float16(const void *input, void *output, uint64_t 
nb_elements)
 {
-       float *input_buffer;
+       const float *input_buffer;
        uint16_t *output_buffer;
        uint64_t i;
 
        if ((nb_elements == 0) || (input == NULL) || (output == NULL))
                return -EINVAL;
 
-       input_buffer = (float *)input;
+       input_buffer = (const float *)input;
        output_buffer = (uint16_t *)output;
 
        for (i = 0; i < nb_elements; i++) {
@@ -632,16 +648,16 @@ __float16_to_float32_scalar_rtx(uint16_t f16)
 }
 
 int
-rte_ml_io_float16_to_float32(uint64_t nb_elements, void *input, void *output)
+rte_ml_io_float16_to_float32(const void *input, void *output, uint64_t 
nb_elements)
 {
-       uint16_t *input_buffer;
+       const uint16_t *input_buffer;
        float *output_buffer;
        uint64_t i;
 
        if ((nb_elements == 0) || (input == NULL) || (output == NULL))
                return -EINVAL;
 
-       input_buffer = (uint16_t *)input;
+       input_buffer = (const uint16_t *)input;
        output_buffer = (float *)output;
 
        for (i = 0; i < nb_elements; i++) {
diff --git a/lib/mldev/mldev_utils_scalar_bfloat16.c 
b/lib/mldev/mldev_utils_scalar_bfloat16.c
index 14374163131..3f93272518f 100644
--- a/lib/mldev/mldev_utils_scalar_bfloat16.c
+++ b/lib/mldev/mldev_utils_scalar_bfloat16.c
@@ -92,16 +92,16 @@ __float32_to_bfloat16_scalar_rtn(float x)
 }
 
 int
-rte_ml_io_float32_to_bfloat16(uint64_t nb_elements, void *input, void *output)
+rte_ml_io_float32_to_bfloat16(const void *input, void *output, uint64_t 
nb_elements)
 {
-       float *input_buffer;
+       const float *input_buffer;
        uint16_t *output_buffer;
        uint64_t i;
 
        if ((nb_elements == 0) || (input == NULL) || (output == NULL))
                return -EINVAL;
 
-       input_buffer = (float *)input;
+       input_buffer = (const float *)input;
        output_buffer = (uint16_t *)output;
 
        for (i = 0; i < nb_elements; i++) {
@@ -174,16 +174,16 @@ __bfloat16_to_float32_scalar_rtx(uint16_t f16)
 }
 
 int
-rte_ml_io_bfloat16_to_float32(uint64_t nb_elements, void *input, void *output)
+rte_ml_io_bfloat16_to_float32(const void *input, void *output, uint64_t 
nb_elements)
 {
-       uint16_t *input_buffer;
+       const uint16_t *input_buffer;
        float *output_buffer;
        uint64_t i;
 
        if ((nb_elements == 0) || (input == NULL) || (output == NULL))
                return -EINVAL;
 
-       input_buffer = (uint16_t *)input;
+       input_buffer = (const uint16_t *)input;
        output_buffer = (float *)output;
 
        for (i = 0; i < nb_elements; i++) {
diff --git a/lib/mldev/rte_mldev.h b/lib/mldev/rte_mldev.h
index 634af3d5e1a..8b595839056 100644
--- a/lib/mldev/rte_mldev.h
+++ b/lib/mldev/rte_mldev.h
@@ -1013,6 +1013,468 @@ rte_ml_model_params_update(int16_t dev_id, uint16_t 
model_id, void *buffer);
 
 /* IO operations */
 
+/**
+ * Convert a buffer containing numbers in single precision floating format 
(float32) to signed 8-bit
+ * integer format (INT8).
+ *
+ * @param[in] fp32
+ *      Input buffer containing float32 numbers. Size of buffer is equal to 
(nb_elements * 4) bytes.
+ * @param[out] i8
+ *      Output buffer to store INT8 numbers. Size of buffer is equal to 
(nb_elements * 1) bytes.
+ * @param[in] nb_elements
+ *      Number of elements in the buffer.
+ * @param[in] scale
+ *      Scale factor for conversion.
+ * @param[in] zero_point
+ *      Zero point for conversion.
+ *
+ * @return
+ *      - 0, Success.
+ *      - < 0, Error code on failure.
+ */
+__rte_experimental
+int
+rte_ml_io_float32_to_int8(const void *fp32, void *i8, uint64_t nb_elements, 
float scale,
+                         int8_t zero_point);
+
+/**
+ * Convert a buffer containing numbers in signed 8-bit integer format (INT8) 
to single precision
+ * floating format (float32).
+ *
+ * @param[in] i8
+ *      Input buffer containing INT8 numbers. Size of buffer is equal to 
(nb_elements * 1) bytes.
+ * @param[out] fp32
+ *      Output buffer to store float32 numbers. Size of buffer is equal to 
(nb_elements * 4) bytes.
+ * @param[in] nb_elements
+ *      Number of elements in the buffer.
+ * @param[in] scale
+ *      Scale factor for conversion.
+ * @param[in] zero_point
+ *      Zero point for conversion.
+ *
+ * @return
+ *      - 0, Success.
+ *      - < 0, Error code on failure.
+ */
+__rte_experimental
+int
+rte_ml_io_int8_to_float32(const void *i8, void *fp32, uint64_t nb_elements, 
float scale,
+                         int8_t zero_point);
+
+/**
+ * Convert a buffer containing numbers in single precision floating format 
(float32) to unsigned
+ * 8-bit integer format (UINT8).
+ *
+ * @param[in] fp32
+ *      Input buffer containing float32 numbers. Size of buffer is equal to 
(nb_elements * 4) bytes.
+ * @param[out] ui8
+ *      Output buffer to store UINT8 numbers. Size of buffer is equal to 
(nb_elements * 1) bytes.
+ * @param[in] nb_elements
+ *      Number of elements in the buffer.
+ * @param[in] scale
+ *      Scale factor for conversion.
+ * @param[in] zero_point
+ *      Zero point for conversion.
+ *
+ * @return
+ *      - 0, Success.
+ *      - < 0, Error code on failure.
+ */
+__rte_experimental
+int
+rte_ml_io_float32_to_uint8(const void *fp32, void *ui8, uint64_t nb_elements, 
float scale,
+                          uint8_t zero_point);
+
+/**
+ * Convert a buffer containing numbers in unsigned 8-bit integer format 
(UINT8) to single precision
+ * floating format (float32).
+ *
+ * @param[in] ui8
+ *      Input buffer containing UINT8 numbers. Size of buffer is equal to 
(nb_elements * 1) bytes.
+ * @param[out] fp32
+ *      Output buffer to store float32 numbers. Size of buffer is equal to 
(nb_elements * 4) bytes.
+ * @param[in] nb_elements
+ *      Number of elements in the buffer.
+ * @param[in] scale
+ *      Scale factor for conversion.
+ * @param[in] zero_point
+ *      Zero point for conversion.
+ *
+ * @return
+ *      - 0, Success.
+ *      - < 0, Error code on failure.
+ */
+__rte_experimental
+int
+rte_ml_io_uint8_to_float32(const void *ui8, void *fp32, uint64_t nb_elements, 
float scale,
+                          uint8_t zero_point);
+
+/**
+ * Convert a buffer containing numbers in single precision floating format 
(float32) to signed
+ * 16-bit integer format (INT16).
+ *
+ * @param[in] fp32
+ *      Input buffer containing float32 numbers. Size of buffer is equal to 
(nb_elements * 4) bytes.
+ * @param[out] i16
+ *      Output buffer to store INT16 numbers. Size of buffer is equal to 
(nb_elements * 2) bytes.
+ * @param[in] nb_elements
+ *      Number of elements in the buffer.
+ * @param[in] scale
+ *      Scale factor for conversion.
+ * @param[in] zero_point
+ *      Zero point for conversion.
+ *
+ * @return
+ *      - 0, Success.
+ *      - < 0, Error code on failure.
+ */
+__rte_experimental
+int
+rte_ml_io_float32_to_int16(const void *fp32, void *i16, uint64_t nb_elements, 
float scale,
+                          int16_t zero_point);
+
+/**
+ * Convert a buffer containing numbers in signed 16-bit integer format (INT16) 
to single precision
+ * floating format (float32).
+ *
+ * @param[in] i16
+ *      Input buffer containing INT16 numbers. Size of buffer is equal to 
(nb_elements * 2) bytes.
+ * @param[out] fp32
+ *      Output buffer to store float32 numbers. Size of buffer is equal to 
(nb_elements * 4) bytes.
+ * @param[in] nb_elements
+ *      Number of elements in the buffer.
+ * @param[in] scale
+ *      Scale factor for conversion.
+ * @param[in] zero_point
+ *      Zero point for conversion.
+ *
+ * @return
+ *      - 0, Success.
+ *      - < 0, Error code on failure.
+ */
+__rte_experimental
+int
+rte_ml_io_int16_to_float32(const void *i16, void *fp32, uint64_t nb_elements, 
float scale,
+                          int16_t zero_point);
+
+/**
+ * Convert a buffer containing numbers in single precision floating format 
(float32) to unsigned
+ * 16-bit integer format (UINT16).
+ *
+ * @param[in] fp32
+ *      Input buffer containing float32 numbers. Size of buffer is equal to 
(nb_elements * 4) bytes.
+ * @param[out] ui16
+ *      Output buffer to store UINT16 numbers. Size of buffer is equal to 
(nb_elements * 2) bytes.
+ * @param[in] nb_elements
+ *      Number of elements in the buffer.
+ * @param[in] scale
+ *      Scale factor for conversion.
+ * @param[in] zero_point
+ *      Zero point for conversion.
+ *
+ * @return
+ *      - 0, Success.
+ *      - < 0, Error code on failure.
+ */
+__rte_experimental
+int
+rte_ml_io_float32_to_uint16(const void *fp32, void *ui16, uint64_t 
nb_elements, float scale,
+                           uint16_t zero_point);
+
+/**
+ * Convert a buffer containing numbers in unsigned 16-bit integer format 
(UINT16) to single
+ * precision floating format (float32).
+ *
+ * @param[in] ui16
+ *      Input buffer containing UINT16 numbers. Size of buffer is equal to 
(nb_elements * 2) bytes.
+ * @param[out] fp32
+ *      Output buffer to store float32 numbers. Size of buffer is equal to 
(nb_elements * 4) bytes.
+ * @param[in] nb_elements
+ *      Number of elements in the buffer.
+ * @param[in] scale
+ *      Scale factor for conversion.
+ * @param[in] zero_point
+ *      Zero point for conversion.
+ *
+ * @return
+ *      - 0, Success.
+ *      - < 0, Error code on failure.
+ */
+__rte_experimental
+int
+rte_ml_io_uint16_to_float32(const void *ui16, void *fp32, uint64_t 
nb_elements, float scale,
+                           uint16_t zero_point);
+
+/**
+ * Convert a buffer containing numbers in single precision floating format 
(float32) to signed
+ * 32-bit integer format (INT32).
+ *
+ * @param[in] fp32
+ *      Input buffer containing float32 numbers. Size of buffer is equal to 
(nb_elements * 4) bytes.
+ * @param[out] i32
+ *      Output buffer to store INT32 numbers. Size of buffer is equal to 
(nb_elements * 4) bytes.
+ * @param[in] nb_elements
+ *      Number of elements in the buffer.
+ * @param[in] scale
+ *      Scale factor for conversion.
+ * @param[in] zero_point
+ *      Zero point for conversion.
+ *
+ * @return
+ *      - 0, Success.
+ *      - < 0, Error code on failure.
+ */
+__rte_experimental
+int
+rte_ml_io_float32_to_int32(const void *fp32, void *i32, uint64_t nb_elements, 
float scale,
+                          int32_t zero_point);
+
+/**
+ * Convert a buffer containing numbers in signed 32-bit integer format (INT32) 
to single precision
+ * floating format (float32).
+ *
+ * @param[in] i32
+ *      Input buffer containing INT32 numbers. Size of buffer is equal to 
(nb_elements * 4) bytes.
+ * @param[out] fp32
+ *      Output buffer to store float32 numbers. Size of buffer is equal to 
(nb_elements * 4) bytes.
+ * @param[in] nb_elements
+ *      Number of elements in the buffer.
+ * @param[in] scale
+ *      Scale factor for conversion.
+ * @param[in] zero_point
+ *      Zero point for conversion.
+ *
+ * @return
+ *      - 0, Success.
+ *      - < 0, Error code on failure.
+ */
+
+__rte_experimental
+int
+rte_ml_io_int32_to_float32(const void *i32, void *fp32, uint64_t nb_elements, 
float scale,
+                          int32_t zero_point);
+
+/**
+ * Convert a buffer containing numbers in single precision floating format 
(float32) to unsigned
+ * 32-bit integer format (UINT32).
+ *
+ * @param[in] fp32
+ *      Input buffer containing float32 numbers. Size of buffer is equal to 
(nb_elements * 4) bytes.
+ * @param[out] ui32
+ *      Output buffer to store UINT32 numbers. Size of buffer is equal to 
(nb_elements * 4) bytes.
+ * @param[in] nb_elements
+ *      Number of elements in the buffer.
+ * @param[in] scale
+ *      Scale factor for conversion.
+ * @param[in] zero_point
+ *      Zero point for conversion.
+ *
+ * @return
+ *      - 0, Success.
+ *      - < 0, Error code on failure.
+ */
+__rte_experimental
+int
+rte_ml_io_float32_to_uint32(const void *fp32, void *ui32, uint64_t 
nb_elements, float scale,
+                           uint32_t zero_point);
+
+/**
+ * Convert a buffer containing numbers in unsigned 32-bit integer format 
(UINT32) to single
+ * precision floating format (float32).
+ *
+ * @param[in] ui32
+ *      Input buffer containing UINT32 numbers. Size of buffer is equal to 
(nb_elements * 4) bytes.
+ * @param[out] fp32
+ *      Output buffer to store float32 numbers. Size of buffer is equal to 
(nb_elements * 4) bytes.
+ * @param[in] nb_elements
+ *      Number of elements in the buffer.
+ * @param[in] scale
+ *      Scale factor for conversion.
+ * @param[in] zero_point
+ *      Zero point for conversion.
+ *
+ * @return
+ *      - 0, Success.
+ *      - < 0, Error code on failure.
+ */
+__rte_experimental
+int
+rte_ml_io_uint32_to_float32(const void *ui32, void *fp32, uint64_t 
nb_elements, float scale,
+                           uint32_t zero_point);
+
+/**
+ * Convert a buffer containing numbers in single precision floating format 
(float32) to signed
+ * 64-bit integer format (INT64).
+ *
+ * @param[in] fp32
+ *      Input buffer containing float32 numbers. Size of buffer is equal to 
(nb_elements * 4) bytes.
+ * @param[out] i64
+ *      Output buffer to store INT64 numbers. Size of buffer is equal to 
(nb_elements * 8) bytes.
+ * @param[in] nb_elements
+ *      Number of elements in the buffer.
+ * @param[in] scale
+ *      Scale factor for conversion.
+ * @param[in] zero_point
+ *      Zero point for conversion.
+ *
+ * @return
+ *      - 0, Success.
+ *      - < 0, Error code on failure.
+ */
+__rte_experimental
+int
+rte_ml_io_float32_to_int64(const void *fp32, void *i64, uint64_t nb_elements, 
float scale,
+                          int64_t zero_point);
+
+/**
+ * Convert a buffer containing numbers in signed 64-bit integer format (INT64) 
to single precision
+ * floating format (float32).
+ *
+ * @param[in] i64
+ *      Input buffer containing INT64 numbers. Size of buffer is equal to 
(nb_elements * 8) bytes.
+ * @param[out] fp32
+ *      Output buffer to store float32 numbers. Size of buffer is equal to 
(nb_elements * 4) bytes.
+ * @param[in] nb_elements
+ *      Number of elements in the buffer.
+ * @param[in] scale
+ *      Scale factor for conversion.
+ * @param[in] zero_point
+ *      Zero point for conversion.
+ *
+ * @return
+ *      - 0, Success.
+ *      - < 0, Error code on failure.
+ */
+__rte_experimental
+int
+rte_ml_io_int64_to_float32(const void *i64, void *fp32, uint64_t nb_elements, 
float scale,
+                          int64_t zero_point);
+
+/**
+ * Convert a buffer containing numbers in single precision floating format 
(float32) to unsigned
+ * 64-bit integer format (UINT64).
+ *
+ * @param[in] fp32
+ *      Input buffer containing float32 numbers. Size of buffer is equal to 
(nb_elements * 4) bytes.
+ * @param[out] ui64
+ *      Output buffer to store UINT64 numbers. Size of buffer is equal to 
(nb_elements * 8) bytes.
+ * @param[in] nb_elements
+ *      Number of elements in the buffer.
+ * @param[in] scale
+ *      Scale factor for conversion.
+ * @param[in] zero_point
+ *      Zero point for conversion.
+ *
+ * @return
+ *      - 0, Success.
+ *      - < 0, Error code on failure.
+ */
+__rte_experimental
+int
+rte_ml_io_float32_to_uint64(const void *fp32, void *ui64, uint64_t 
nb_elements, float scale,
+                           uint64_t zero_point);
+
+/**
+ * Convert a buffer containing numbers in unsigned 64-bit integer format 
(UINT64) to single
+ * precision floating format (float32).
+ *
+ * @param[in] ui64
+ *      Input buffer containing UINT64 numbers. Size of buffer is equal to 
(nb_elements * 8) bytes.
+ * @param[out] fp32
+ *      Output buffer to store float32 numbers. Size of buffer is equal to 
(nb_elements * 4) bytes.
+ * @param[in] nb_elements
+ *      Number of elements in the buffer.
+ * @param[in] scale
+ *      Scale factor for conversion.
+ * @param[in] zero_point
+ *      Zero point for conversion.
+ *
+ * @return
+ *      - 0, Success.
+ *      - < 0, Error code on failure.
+ */
+__rte_experimental
+int
+rte_ml_io_uint64_to_float32(const void *ui64, void *fp32, uint64_t 
nb_elements, float scale,
+                           uint64_t zero_point);
+
+/**
+ * Convert a buffer containing numbers in single precision floating format 
(float32) to half
+ * precision floating point format (FP16).
+ *
+ * @param[in] fp32
+ *      Input buffer containing float32 numbers. Size of buffer is equal to 
(nb_elements * 4) bytes.
+ * @param[out] fp16
+ *      Output buffer to store float16 numbers. Size of buffer is equal to 
(nb_elements * 2) bytes.
+ * @param[in] nb_elements
+ *      Number of elements in the buffer.
+ *
+ * @return
+ *      - 0, Success.
+ *      - < 0, Error code on failure.
+ */
+__rte_experimental
+int
+rte_ml_io_float32_to_float16(const void *fp32, void *fp16, uint64_t 
nb_elements);
+
+/**
+ * Convert a buffer containing numbers in half precision floating format 
(FP16) to single precision
+ * floating point format (float32).
+ *
+ * @param[in] fp16
+ *      Input buffer containing float16 numbers. Size of buffer is equal to 
(nb_elements * 2) bytes.
+ * @param[out] fp32
+ *      Output buffer to store float32 numbers. Size of buffer is equal to 
(nb_elements * 4) bytes.
+ * @param[in] nb_elements
+ *      Number of elements in the buffer.
+ *
+ * @return
+ *      - 0, Success.
+ *      - < 0, Error code on failure.
+ */
+__rte_experimental
+int
+rte_ml_io_float16_to_float32(const void *fp16, void *fp32, uint64_t 
nb_elements);
+
+/**
+ * Convert a buffer containing numbers in single precision floating format 
(float32) to brain
+ * floating point format (bfloat16).
+ *
+ * @param[in] fp32
+ *      Input buffer containing float32 numbers. Size of buffer is equal to 
(nb_elements * 4) bytes.
+ * @param[out] bf16
+ *      Output buffer to store bfloat16 numbers. Size of buffer is equal to 
(nb_elements * 2) bytes.
+ * @param[in] nb_elements
+ *      Number of elements in the buffer.
+ *
+ * @return
+ *      - 0, Success.
+ *      - < 0, Error code on failure.
+ */
+__rte_experimental
+int
+rte_ml_io_float32_to_bfloat16(const void *fp32, void *bf16, uint64_t 
nb_elements);
+
+/**
+ * Convert a buffer containing numbers in brain floating point format 
(bfloat16) to single precision
+ * floating point format (float32).
+ *
+ * @param[in] bf16
+ *      Input buffer containing bfloat16 numbers. Size of buffer is equal to 
(nb_elements * 2)
+ * bytes.
+ * @param[out] fp32
+ *      Output buffer to store float32 numbers. Size of buffer is equal to 
(nb_elements * 4) bytes.
+ * @param[in] nb_elements
+ *      Number of elements in the buffer.
+ *
+ * @return
+ *      - 0, Success.
+ *      - < 0, Error code on failure.
+ */
+__rte_experimental
+int
+rte_ml_io_bfloat16_to_float32(const void *bf16, void *fp32, uint64_t 
nb_elements);
+
 /**
  * Quantize input data.
  *
diff --git a/lib/mldev/version.map b/lib/mldev/version.map
index 84bdd6c3004..6f0a70b4bd1 100644
--- a/lib/mldev/version.map
+++ b/lib/mldev/version.map
@@ -23,6 +23,26 @@ EXPERIMENTAL {
        rte_ml_dev_xstats_names_get;
        rte_ml_dev_xstats_reset;
        rte_ml_enqueue_burst;
+       rte_ml_io_float32_to_int8;
+       rte_ml_io_int8_to_float32;
+       rte_ml_io_float32_to_uint8;
+       rte_ml_io_uint8_to_float32;
+       rte_ml_io_float32_to_int16;
+       rte_ml_io_int16_to_float32;
+       rte_ml_io_float32_to_uint16;
+       rte_ml_io_uint16_to_float32;
+       rte_ml_io_float32_to_int32;
+       rte_ml_io_int32_to_float32;
+       rte_ml_io_float32_to_uint32;
+       rte_ml_io_uint32_to_float32;
+       rte_ml_io_float32_to_int64;
+       rte_ml_io_int64_to_float32;
+       rte_ml_io_float32_to_uint64;
+       rte_ml_io_uint64_to_float32;
+       rte_ml_io_float32_to_float16;
+       rte_ml_io_float16_to_float32;
+       rte_ml_io_float32_to_bfloat16;
+       rte_ml_io_bfloat16_to_float32;
        rte_ml_io_dequantize;
        rte_ml_io_quantize;
        rte_ml_model_info_get;
@@ -50,24 +70,4 @@ INTERNAL {
 
        rte_ml_io_type_size_get;
        rte_ml_io_type_to_str;
-       rte_ml_io_float32_to_int8;
-       rte_ml_io_int8_to_float32;
-       rte_ml_io_float32_to_uint8;
-       rte_ml_io_uint8_to_float32;
-       rte_ml_io_float32_to_int16;
-       rte_ml_io_int16_to_float32;
-       rte_ml_io_float32_to_uint16;
-       rte_ml_io_uint16_to_float32;
-       rte_ml_io_float32_to_int32;
-       rte_ml_io_int32_to_float32;
-       rte_ml_io_float32_to_uint32;
-       rte_ml_io_uint32_to_float32;
-       rte_ml_io_float32_to_int64;
-       rte_ml_io_int64_to_float32;
-       rte_ml_io_float32_to_uint64;
-       rte_ml_io_uint64_to_float32;
-       rte_ml_io_float32_to_float16;
-       rte_ml_io_float16_to_float32;
-       rte_ml_io_float32_to_bfloat16;
-       rte_ml_io_bfloat16_to_float32;
 };
-- 
2.45.1

Reply via email to