This is an automated email from the ASF dual-hosted git repository.

xiaoxiang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nuttx-apps.git

commit a37ca36a8a9afc418a830e7dab8469c1f49c34de
Author: xinhaiteng <xinhait...@xiaomi.com>
AuthorDate: Sat Mar 2 21:46:44 2024 +0800

    Neon optimized Conv operator
    
    Based on CMSIS-NN, the Conv operator was optimized. Using Neon 
acceleration, multiply 8 input data and 8 filter data in a single loop; Using 
Im2col technology, convert the output data into a matrix, calculate 2 rows of 
input data and 4 rows of filter data in a single large loop, and obtain 2x4 
output data.
    
    Signed-off-by: xinhaiteng <xinhait...@xiaomi.com>
---
 .../tflite-micro/operators/neon/arm_convolve_s8.c  | 204 ++++++++++++
 .../operators/neon/arm_nn_mat_mult_kernel_s8_s16.c | 363 +++++++++++++++++++++
 .../operators/neon/arm_q7_to_q15_with_offset.c     |  60 ++++
 3 files changed, 627 insertions(+)

diff --git a/mlearning/tflite-micro/operators/neon/arm_convolve_s8.c 
b/mlearning/tflite-micro/operators/neon/arm_convolve_s8.c
new file mode 100644
index 000000000..454c467a1
--- /dev/null
+++ b/mlearning/tflite-micro/operators/neon/arm_convolve_s8.c
@@ -0,0 +1,204 @@
+/*
+ * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its 
affiliates <open-source-off...@arm.com>
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <arm_neon.h>
+#include "arm_nnfunctions.h"
+#include "arm_nnsupportfunctions.h"
+
+/**
+ *  @ingroup Public
+ */
+
+/**
+ * @addtogroup NNConv
+ * @{
+ */
+
+/*
+ * Basic s8 convolution function.
+ *
+ * Refer header file for details. Optimal use case for the DSP/MVE 
implementation is when input and output channels
+ * are multiples of 4 or atleast greater than 4.
+ *
+ */
+arm_cmsis_nn_status arm_convolve_s8(const cmsis_nn_context *ctx,
+                                    const cmsis_nn_conv_params *conv_params,
+                                    const cmsis_nn_per_channel_quant_params 
*quant_params,
+                                    const cmsis_nn_dims *input_dims,
+                                    const int8_t *input_data,
+                                    const cmsis_nn_dims *filter_dims,
+                                    const int8_t *filter_data,
+                                    const cmsis_nn_dims *bias_dims,
+                                    const int32_t *bias_data,
+                                    const cmsis_nn_dims *output_dims,
+                                    int8_t *output_data)
+{
+    (void)bias_dims;
+
+    if (ctx->buf == NULL)
+    {
+        return ARM_CMSIS_NN_ARG_ERROR;
+    }
+    int16_t *buffer_a = (int16_t *)ctx->buf;
+
+    const int32_t input_batches = input_dims->n;
+    const uint16_t input_x = input_dims->w;
+    const uint16_t input_y = input_dims->h;
+    const uint16_t input_ch = input_dims->c;
+    const uint16_t kernel_x = filter_dims->w;
+    const uint16_t kernel_y = filter_dims->h;
+    const uint16_t output_x = output_dims->w;
+    const uint16_t output_y = output_dims->h;
+    const uint16_t output_ch = output_dims->c;
+
+    const uint16_t pad_x = conv_params->padding.w;
+    const uint16_t pad_y = conv_params->padding.h;
+    const uint16_t stride_x = conv_params->stride.w;
+    const uint16_t stride_y = conv_params->stride.h;
+    const int32_t dilation_x = conv_params->dilation.w;
+    const int32_t dilation_y = conv_params->dilation.h;
+    const int32_t out_offset = conv_params->output_offset;
+    const int32_t out_activation_min = conv_params->activation.min;
+    const int32_t out_activation_max = conv_params->activation.max;
+    const int32_t rhs_cols = kernel_x * kernel_y * input_ch;
+    const int32_t input_offset = conv_params->input_offset;
+
+    int32_t *output_mult = quant_params->multiplier;
+    int32_t *output_shift = quant_params->shift;
+
+    int i_batch;
+    for (i_batch = 0; i_batch < input_batches; i_batch++)
+    {
+        const int32_t remainder = rhs_cols % 4;
+        const int32_t aligned_rhs_cols = remainder != 0 ? rhs_cols + 4 - 
remainder : rhs_cols;
+        /**
+         * Use Im2col to speed up conv2d calculations.
+         * Use as a ping-pong buffer for unordered elements.
+         */
+        int8_t *im2col_buf = (int8_t *)buffer_a + aligned_rhs_cols * 2;
+        int16_t *im2col_buf_start_s16 = buffer_a;
+        int8_t *out = output_data;
+        int32_t lhs_rows = 0;
+        /* This part implements the im2col function */
+        for (int i_out_x = 0; i_out_x < output_x; i_out_x++)
+        {
+            const int32_t base_idx_x = stride_x * i_out_x - pad_x;
+            for (int i_out_y = 0; i_out_y < output_y; i_out_y++)
+            {
+                const int32_t base_idx_y = stride_y * i_out_y - pad_y;
+                for (int32_t i_ker_x = 0; i_ker_x < kernel_x; i_ker_x++)
+                {
+                    int32_t k_x = base_idx_x + dilation_x * i_ker_x;
+                    int32_t k_y = base_idx_y - dilation_y;
+                    for (int32_t i_ker_y = 0; i_ker_y < kernel_y; i_ker_y++)
+                    {
+                        k_y += dilation_y;
+                        arm_memcpy_s8(im2col_buf,
+                                      input_data + (k_y * input_x + k_x) * 
input_ch,
+                                      input_ch);
+                        im2col_buf += input_ch;
+                    }
+                }
+                lhs_rows++;
+                /* Extend the input data from int8 to int16, and add offset. */
+                arm_q7_to_q15_with_offset(im2col_buf - rhs_cols,
+                                          im2col_buf_start_s16,
+                                          rhs_cols,
+                                          (int16_t)input_offset);
+                im2col_buf_start_s16 += aligned_rhs_cols;
+                if (lhs_rows & 2)
+                {
+                    out = arm_nn_mat_mult_kernel_s8_s16(filter_data,
+                                                        buffer_a,
+                                                        output_ch,
+                                                        output_shift,
+                                                        output_mult,
+                                                        out_offset,
+                                                        out_activation_min,
+                                                        out_activation_max,
+                                                        rhs_cols,
+                                                        aligned_rhs_cols,
+                                                        bias_data,
+                                                        out);
+                    /* counter reset */
+                    im2col_buf_start_s16 = buffer_a;
+                    im2col_buf = (int8_t *)buffer_a + (aligned_rhs_cols << 1);
+                    lhs_rows = 0;
+                }
+            }
+        }
+        if (lhs_rows != 0)
+        {
+            const int8_t *ker_a = filter_data;
+            int i;
+            for (i = 0; i < output_ch; i++)
+            {
+                /* Load the accumulator with bias first */
+                uint16_t col_count = rhs_cols / 8;
+                int32_t sum = 0;
+                const int16_t *ip_as_col = buffer_a;
+                int32x4_t res_s32 = vdupq_n_s32(0);
+                if (bias_data)
+                {
+                    sum = bias_data[i];
+                }
+                while (col_count)
+                {
+                    int8x8_t filter_s8 = vld1_s8(ker_a);
+                    int16x8_t input_s16 = vld1q_s16(ip_as_col);
+                    int16x8_t filter_s16 = vmovl_s8(filter_s8);
+                    ker_a += 8;
+                    ip_as_col += 8;
+                    res_s32 = vmlal_s16(res_s32,
+                                        vget_low_s16(input_s16),
+                                        vget_low_s16(filter_s16));
+                    res_s32 = vmlal_s16(res_s32,
+                                        vget_high_s16(input_s16),
+                                        vget_high_s16(filter_s16));
+                    col_count --;
+                }
+                sum += vgetq_lane_s32(res_s32, 0);
+                sum += vgetq_lane_s32(res_s32, 1);
+                sum += vgetq_lane_s32(res_s32, 2);
+                sum += vgetq_lane_s32(res_s32, 3);
+                col_count = rhs_cols % 8;
+                while (col_count)
+                {
+                    int8_t ker_a1 = *ker_a++;
+                    int16_t ip_b1 = *ip_as_col++;
+                    sum += ker_a1 * ip_b1;
+                    col_count--;
+                }
+                sum = arm_nn_requantize(sum, output_mult[i], output_shift[i]);
+                sum += out_offset;
+                sum = MAX(sum, out_activation_min);
+                sum = MIN(sum, out_activation_max);
+                *out++ = (int8_t)sum;
+            }
+        }
+        /* Advance to the next batch */
+        input_data += (input_x * input_y * input_ch);
+        output_data += (output_x * output_y * output_ch);
+    }
+    /* Return to application */
+    return ARM_CMSIS_NN_SUCCESS;
+}
+
+/**
+ * @} end of NNConv group
+ */
\ No newline at end of file
diff --git 
a/mlearning/tflite-micro/operators/neon/arm_nn_mat_mult_kernel_s8_s16.c 
b/mlearning/tflite-micro/operators/neon/arm_nn_mat_mult_kernel_s8_s16.c
new file mode 100644
index 000000000..6cbea4de0
--- /dev/null
+++ b/mlearning/tflite-micro/operators/neon/arm_nn_mat_mult_kernel_s8_s16.c
@@ -0,0 +1,363 @@
+/*
+ * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its 
affiliates <open-source-off...@arm.com>
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS NN Library
+ * Title:        arm_nn_mat_mult_kernel_s8_s16.c
+ * Description:  Matrix-multiplication function for convolution
+ *
+ * $Date:        29 May 2023
+ * $Revision:    V.2.0.0
+ *
+ * Target :  Arm(R) M-Profile Architecture
+ * -------------------------------------------------------------------- */
+
+#include <arm_neon.h>
+#include "arm_nnfunctions.h"
+#include "arm_nnsupportfunctions.h"
+
+/*
+ * Matrix-multiplication function for convolution with per-channel 
requantization.
+ *
+ * Refer header file for details.
+ *
+ */
+int8_t *arm_nn_mat_mult_kernel_s8_s16(const int8_t *input_a,
+                                      const int16_t *input_b,
+                                      const uint16_t output_ch,
+                                      const int32_t *out_shift,
+                                      const int32_t *out_mult,
+                                      const int32_t out_offset,
+                                      const int16_t activation_min,
+                                      const int16_t activation_max,
+                                      const int32_t num_col_a,
+                                      const int32_t aligned_num_col_a,
+                                      const int32_t *const output_bias,
+                                      int8_t *out_0)
+{
+    int8_t *out_1 = out_0 + output_ch;
+    const int32_t *bias = output_bias;
+
+    uint16_t row_count = output_ch / 4;
+    const int8_t *ip_a0 = input_a;
+
+    /* this loop over rows in A */
+    while (row_count)
+    {
+        int32_t col_count = num_col_a / 8;
+        const int16_t *ip_b0 = input_b;
+        const int16_t *ip_b1 = ip_b0 + aligned_num_col_a;
+        const int8_t *ip_a[4] = {ip_a0,
+                                 ip_a0 + num_col_a,
+                                 ip_a0 + 2 * num_col_a,
+                                 ip_a0 + 3 * num_col_a};
+        int32_t ch_out[4][2] = {0};
+        int32x4_t res[8];
+
+        for (int i = 0; i < 8; i++)
+        {
+            res[i] = vdupq_n_s32(0);
+        }
+
+        /* Init accumulator with bias for channel N and N + 1 */
+        if (bias)
+        {
+            for (int i = 0; i < 4; i++)
+            {
+                ch_out[i][0] = *bias;
+                ch_out[i][1] = *bias++;
+            }
+        }
+
+        /**
+         * Each time eight int8 data of four filters and eight int16 data
+         * of two inputs are read.First, the filter data is expanded to
+         * int16, and then cross-multiplied to obtain eight calculation 
results.
+         */
+        while (col_count)
+        {
+            int8x8_t filter_s8[4];
+            int16x8_t input_s16[2];
+            int16x8_t filter_s16[4];
+
+            input_s16[0] = vld1q_s16(ip_b0);
+            ip_b0 += 8;
+            input_s16[1] = vld1q_s16(ip_b1);
+            ip_b1 += 8;
+
+            for (int i = 0; i < 4; i++)
+            {
+                filter_s8[i] = vld1_s8(ip_a[i]);
+                ip_a[i] += 8;
+                filter_s16[i] = vmovl_s8(filter_s8[i]);
+                res[i * 2]     = vmlal_s16(res[i * 2],
+                                           vget_low_s16(filter_s16[i]),
+                                           vget_low_s16(input_s16[0]));
+                res[i * 2 + 1] = vmlal_s16(res[i * 2 + 1],
+                                           vget_low_s16(filter_s16[i]),
+                                           vget_low_s16(input_s16[1]));
+                res[i * 2]     = vmlal_s16(res[i * 2],
+                                           vget_high_s16(filter_s16[i]),
+                                           vget_high_s16(input_s16[0]));
+                res[i * 2 + 1] = vmlal_s16(res[i * 2 + 1],
+                                           vget_high_s16(filter_s16[i]),
+                                           vget_high_s16(input_s16[1]));
+            }
+
+            col_count --;
+        }
+        for (int i = 0; i < 4; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                ch_out[i][j] += vgetq_lane_s32(res[i * 2 + j], 0);
+                ch_out[i][j] += vgetq_lane_s32(res[i * 2 + j], 1);
+                ch_out[i][j] += vgetq_lane_s32(res[i * 2 + j], 2);
+                ch_out[i][j] += vgetq_lane_s32(res[i * 2 + j], 3);
+            }
+        }
+
+        col_count = num_col_a % 8;
+        while (col_count)
+        {
+            int16_t b0 = *ip_b0++;
+            int16_t b1 = *ip_b1++;
+
+            for (int i = 0; i < 4; i++)
+            {
+                int8_t input_remaining = *(ip_a[i]++);
+                ch_out[i][0] += input_remaining * b0;
+                ch_out[i][1] += input_remaining * b1;
+            }
+            col_count--;
+        } /* while over col_count */
+
+        for (int i = 0; i < 4; i++)
+        {
+            ch_out[i][0] = arm_nn_requantize(ch_out[i][0], *out_mult, 
*out_shift);
+            ch_out[i][1] = arm_nn_requantize(ch_out[i][1], *out_mult, 
*out_shift);
+            ch_out[i][0] += out_offset;
+            ch_out[i][1] += out_offset;
+            ch_out[i][0] = MAX(ch_out[i][0], activation_min);
+            ch_out[i][1] = MAX(ch_out[i][1], activation_min);
+            ch_out[i][0] = MIN(ch_out[i][0], activation_max);
+            ch_out[i][1] = MIN(ch_out[i][1], activation_max);
+            *out_0++ = (int8_t)ch_out[i][0];
+            *out_1++ = (int8_t)ch_out[i][1];
+            out_mult++;
+            out_shift++;
+        }
+
+        /* skip row */
+        ip_a0 = ip_a[3];
+        row_count--;
+    }
+    row_count = output_ch % 4;
+    if (row_count >= 2)
+    {
+        int32_t col_count = num_col_a / 8;
+        const int8_t *ip_a1 = ip_a0 + num_col_a;
+        const int16_t *ip_b0 = input_b;
+        const int16_t *ip_b1 = ip_b0 + aligned_num_col_a;
+        int32_t ch_out[2][2] = {0};
+        int32x4_t res[4];
+
+        /* Init accumulator with bias for channel N and N + 1 */
+        if (bias)
+        {
+            for (int i = 0; i < 2; i++)
+            {
+                ch_out[i][0] = *bias;
+                ch_out[i][1] = *bias++;
+            }
+        }
+
+        for (int i = 0; i < 4; i++)
+        {
+            res[i] = vdupq_n_s32(0);
+        }
+
+        /**
+         * Each time eight int8 data of four filters and eight int16 data
+         * of two inputs are read.First, the filter data is expanded to
+         * int16, and then cross-multiplied to obtain 8 calculation results.
+         */
+        while (col_count)
+        {
+            int8x8_t filter_s8[2];
+            int16x8_t input_s16[2];
+            int16x8_t filter_s16[2];
+
+            filter_s8[0] = vld1_s8(ip_a0);
+            ip_a0 += 8;
+            filter_s8[1] = vld1_s8(ip_a1);
+            ip_a1 += 8;
+
+            input_s16[0] = vld1q_s16(ip_b0);
+            ip_b0 += 8;
+            input_s16[1] = vld1q_s16(ip_b1);
+            ip_b1 += 8;
+
+            for (int i = 0; i < 2; i++)
+            {
+                filter_s16[i] = vmovl_s8(filter_s8[i]);
+                res[i * 2]     = vmlal_s16(res[i * 2],
+                                           vget_low_s16(filter_s16[i]),
+                                           vget_low_s16(input_s16[0]));
+                res[i * 2 + 1] = vmlal_s16(res[i * 2 + 1],
+                                           vget_low_s16(filter_s16[i]),
+                                           vget_low_s16(input_s16[1]));
+                res[i * 2]     = vmlal_s16(res[i * 2],
+                                           vget_high_s16(filter_s16[i]),
+                                           vget_high_s16(input_s16[0]));
+                res[i * 2 + 1] = vmlal_s16(res[i * 2 + 1],
+                                           vget_high_s16(filter_s16[i]),
+                                           vget_high_s16(input_s16[1]));
+            }
+
+            col_count --;
+        }
+        for (int i = 0; i < 2; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                ch_out[i][j] += vgetq_lane_s32(res[i * 2 + j], 0);
+                ch_out[i][j] += vgetq_lane_s32(res[i * 2 + j], 1);
+                ch_out[i][j] += vgetq_lane_s32(res[i * 2 + j], 2);
+                ch_out[i][j] += vgetq_lane_s32(res[i * 2 + j], 3);
+            }
+        }
+
+        col_count = num_col_a % 8;
+        while (col_count)
+        {
+            int8_t a0 = *ip_a0++; // filter
+            int8_t a1 = *ip_a1++;
+            int16_t b0 = *ip_b0++; // input
+            int16_t b1 = *ip_b1++;
+
+            ch_out[0][0] += a0 * b0;
+            ch_out[1][1] += a1 * b1;
+            ch_out[1][0] += a1 * b0;
+            ch_out[0][1] += a0 * b1;
+            col_count--;
+        } /* while over col_count */
+
+        for (int i = 0; i < 2; i++)
+        {
+            ch_out[i][0] = arm_nn_requantize(ch_out[i][0], *out_mult, 
*out_shift);
+            ch_out[i][1] = arm_nn_requantize(ch_out[i][1], *out_mult, 
*out_shift);
+            ch_out[i][0] += out_offset;
+            ch_out[i][1] += out_offset;
+            ch_out[i][0] = MAX(ch_out[i][0], activation_min);
+            ch_out[i][1] = MAX(ch_out[i][1], activation_min);
+            ch_out[i][0] = MIN(ch_out[i][0], activation_max);
+            ch_out[i][1] = MIN(ch_out[i][1], activation_max);
+            *out_0++ = (int8_t)ch_out[i][0];
+            *out_1++ = (int8_t)ch_out[i][1];
+            out_mult++;
+            out_shift++;
+        }
+
+        /* skip row */
+        ip_a0 += num_col_a;
+        row_count -= 2;
+    }
+
+    /* compute the last odd numbered row if any */
+    if (output_ch & 0x1)
+    {
+        int32_t col_count = num_col_a / 8;
+        const int16_t *ip_b0 = input_b;
+        const int16_t *ip_b1 = ip_b0 + aligned_num_col_a;
+        int32_t ch_out[2] = {0};
+        int32x4_t res[2];
+
+        /* load the bias */
+        if (bias)
+        {
+            ch_out[0] = *bias;
+            ch_out[1] = *bias++;
+        }
+
+        res[0] = vdupq_n_s32(0);
+        res[1] = vdupq_n_s32(0);
+
+        while(col_count)
+        {
+            int8x8_t filter_s8 = vld1_s8(ip_a0);
+            int16x8_t filter_s16 = vmovl_s8(filter_s8);
+            int16x8_t input_0_s16 = vld1q_s16(ip_b0);
+            int16x8_t input_1_s16 = vld1q_s16(ip_b1);
+            ip_a0 += 8;
+            ip_b0 += 8;
+            ip_b1 += 8;
+            res[0] = vmlal_s16(res[0],
+                               vget_low_s16(filter_s16),
+                               vget_low_s16(input_0_s16));
+            res[1] = vmlal_s16(res[1],
+                               vget_low_s16(filter_s16),
+                               vget_low_s16(input_1_s16));
+            res[0] = vmlal_s16(res[0],
+                               vget_high_s16(filter_s16),
+                               vget_high_s16(input_0_s16));
+            res[1] = vmlal_s16(res[1],
+                               vget_high_s16(filter_s16),
+                               vget_high_s16(input_1_s16));
+            col_count --;
+        }
+
+        for (int i = 0; i < 4; i++)
+        {
+            ch_out[0] += vgetq_lane_s32(res[0], i);
+            ch_out[1] += vgetq_lane_s32(res[1], 1);
+        }
+
+        col_count = num_col_a % 8;
+        while (col_count)
+        {
+            int8_t a0 = *ip_a0++;
+            int16_t b0 = *ip_b0++;
+            int16_t b1 = *ip_b1++;
+
+            ch_out[0] += a0 * b0;
+            ch_out[1] += a0 * b1;
+            col_count--;
+        }
+
+        ch_out[0] = arm_nn_requantize(ch_out[0], *out_mult, *out_shift);
+        ch_out[0] += out_offset;
+        ch_out[0] = MAX(ch_out[0], activation_min);
+        ch_out[0] = MIN(ch_out[0], activation_max);
+        *out_0++ = (int8_t)ch_out[0];
+
+        ch_out[1] = arm_nn_requantize(ch_out[1], *out_mult, *out_shift);
+        ch_out[1] += out_offset;
+        ch_out[1] = MAX(ch_out[1], activation_min);
+        ch_out[1] = MIN(ch_out[1], activation_max);
+        *out_1++ = (int8_t)ch_out[1];
+
+        out_mult++;
+        out_shift++;
+    }
+
+    out_0 += output_ch;
+
+    /* return the new output pointer with offset */
+    return out_0;
+}
diff --git a/mlearning/tflite-micro/operators/neon/arm_q7_to_q15_with_offset.c 
b/mlearning/tflite-micro/operators/neon/arm_q7_to_q15_with_offset.c
new file mode 100644
index 000000000..b7156e473
--- /dev/null
+++ b/mlearning/tflite-micro/operators/neon/arm_q7_to_q15_with_offset.c
@@ -0,0 +1,60 @@
+/*
+ * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its 
affiliates <open-source-off...@arm.com>
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in_q7x4 compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in_q7x4 writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <arm_neon.h>
+#include "arm_nnsupportfunctions.h"
+
+/**
+ * @ingroup groupSupport
+ */
+
+/**
+ * @addtogroup supportConversion
+ * @{
+ */
+
+void arm_q7_to_q15_with_offset(const int8_t *src, int16_t *dst, int32_t 
block_size, int16_t offset)
+{
+    int32_t block_cnt;
+
+    block_cnt = block_size / 8;
+    int16x8_t offset_s16 = vdupq_n_s16(offset);
+    while (block_cnt)
+    {
+        int8x8_t src_s8 = vld1_s8(src);
+        int16x8_t src_s16 = vmovl_s8(src_s8);
+        src += 8;
+        src_s16 = vaddq_s16(offset_s16, src_s16);
+        block_cnt--;
+        vst1q_s16(dst, src_s16);
+        dst += 8;
+    }
+
+    block_cnt = block_size % 8;
+    while (block_cnt > 0)
+    {
+        *dst++ = (int16_t)*src++ + offset;
+
+        /* Decrement the loop counter */
+        block_cnt--;
+    }
+}
+
+/**
+ * @} end of Doxygen group
+ */

Reply via email to