This is an automated email from the ASF dual-hosted git repository.

xiaoxiang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nuttx-apps.git

commit 754d9e39a850a219acaf1e512ce3b190c9ed4d56
Author: xinhaiteng <xinhait...@xiaomi.com>
AuthorDate: Fri Nov 24 17:56:37 2023 +0800

    TFLM Cortex-A NEON Conv
    
    Use NEON to accelerate the conv op; the output results match the reference implementation.
    
    Signed-off-by: xinhaiteng <xinhait...@xiaomi.com>
---
 mlearning/tflite-micro/tflite-micro.patch | 94 +++++++++++++++++++++++++++++++
 1 file changed, 94 insertions(+)

diff --git a/mlearning/tflite-micro/tflite-micro.patch b/mlearning/tflite-micro/tflite-micro.patch
index f38291a5a..d39038d5a 100644
--- a/mlearning/tflite-micro/tflite-micro.patch
+++ b/mlearning/tflite-micro/tflite-micro.patch
@@ -12,3 +12,97 @@ index 7638d912..3261be56 100644
  
  namespace tflite {
  namespace tflm_signal {
+diff --git a/tensorflow/lite/kernels/internal/reference/integer_ops/conv.h b/tensorflow/lite/kernels/internal/reference/integer_ops/conv.h
+index eac00576..abfdea8c 100644
+--- a/tensorflow/lite/kernels/internal/reference/integer_ops/conv.h
++++ b/tensorflow/lite/kernels/internal/reference/integer_ops/conv.h
+@@ -18,6 +18,9 @@ limitations under the License.
+ #include <algorithm>
+ 
+ #include "tensorflow/lite/kernels/internal/common.h"
++#ifdef USE_NEON
++#include <arm_neon.h>
++#endif
+ 
+ namespace tflite {
+ namespace reference_integer_ops {
+@@ -133,6 +136,79 @@ inline void ConvPerChannel(
+       }
+     }
+   }
++#ifdef USE_NEON
++  for (int batch = 0; batch < batches; ++batch) {
++    for (int out_y = 0; out_y < output_height; ++out_y) {
++      int in_y_origin = (out_y * stride_height) - pad_height;
++      for (int out_x = 0; out_x < output_width; ++out_x) {
++        int in_x_origin = (out_x * stride_width) - pad_width;
++        int filter_start_offset = 0;
++        for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
++          auto group = out_channel / filters_per_group;
++          int32_t acc = 0;
++          int8x8_t input_v = vdup_n_s8(0);
++          int8x8_t filter_v = vdup_n_s8(0);
++          int16x8_t mid_mul = vdupq_n_s16(0);
++          int32x4_t res_v = vdupq_n_s32(0);
++          int32x4_t filter_offset_v = vdupq_n_s32(0);
++          int input_offset_temp = 0;
++          for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
++            int in_y = in_y_origin + dilation_height_factor * filter_y;
++            for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
++              int in_x = in_x_origin + dilation_width_factor * filter_x;
++              const bool is_point_inside_image = (in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height);
++              if (!is_point_inside_image)
++                continue;
++              int input_start_offset = ((batch * input_height + in_y) * input_width + in_x) * input_depth + group * filter_input_depth;
++              int in_channel = 0;
++              for (; in_channel < (filter_input_depth & -8); in_channel += 8) {
++                input_v = vld1_s8(input_data + input_start_offset);
++                input_start_offset += 8;
++                filter_v = vld1_s8(filter_data + filter_start_offset);
++                filter_start_offset += 8;
++
++                mid_mul = vmovl_s8(filter_v);
++                filter_offset_v = vaddw_s16(filter_offset_v, vget_low_s16(mid_mul));
++                filter_offset_v = vaddw_s16(filter_offset_v, vget_high_s16(mid_mul));
++                mid_mul = vmull_s8(input_v, filter_v);
++                res_v = vaddw_s16(res_v, vget_low_s16(mid_mul));
++                res_v = vaddw_s16(res_v, vget_high_s16(mid_mul));
++
++              }
++
++              for (; in_channel < filter_input_depth; ++in_channel) {
++                acc += (input_data[input_start_offset] + input_offset) * filter_data[filter_start_offset];
++                ++input_start_offset;
++                ++filter_start_offset;
++              }
++            }
++          }
++          acc += vgetq_lane_s32(res_v, 0);
++          acc += vgetq_lane_s32(res_v, 1);
++          acc += vgetq_lane_s32(res_v, 2);
++          acc += vgetq_lane_s32(res_v, 3);
++          input_offset_temp += vgetq_lane_s32(filter_offset_v, 0);
++          input_offset_temp += vgetq_lane_s32(filter_offset_v, 1);
++          input_offset_temp += vgetq_lane_s32(filter_offset_v, 2);
++          input_offset_temp += vgetq_lane_s32(filter_offset_v, 3);
++          acc += input_offset_temp * input_offset;
++
++          if (bias_data)
++          {
++            acc += bias_data[out_channel];
++          }
++          acc = MultiplyByQuantizedMultiplier(
++              acc, output_multiplier[out_channel], output_shift[out_channel]);
++          acc += output_offset;
++          acc = std::max(acc, output_activation_min);
++          acc = std::min(acc, output_activation_max);
++          output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] =
++              static_cast<int8_t>(acc);
++        }
++      }
++    }
++  }
++#endif
+ }
+ 
+ 
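
For context, the accumulation the patch vectorizes is sum_i (input[i] + input_offset) * filter[i], which it splits into sum_i input[i] * filter[i] plus input_offset * sum_i filter[i]; both partial sums are built eight int8 lanes at a time with widening NEON intrinsics (vmull_s8, vmovl_s8, vaddw_s16) and reduced back to a scalar accumulator, guarded by USE_NEON as shown above. Below is a minimal standalone sketch of that pattern, assuming a NEON-capable toolchain; the function name neon_dot_s8 and the test values are illustrative only and are not part of the commit.

#include <arm_neon.h>
#include <cstdint>
#include <cstdio>

// Sketch: sum_i (x[i] + input_offset) * w[i], using the same widening
// NEON pattern as the ConvPerChannel change above.
static int32_t neon_dot_s8(const int8_t* x, const int8_t* w, int n,
                           int32_t input_offset) {
  int32x4_t prod_acc = vdupq_n_s32(0);  // partial sums of x[i] * w[i]
  int32x4_t wsum_acc = vdupq_n_s32(0);  // partial sums of w[i]
  int i = 0;
  for (; i < (n & -8); i += 8) {
    int8x8_t xv = vld1_s8(x + i);
    int8x8_t wv = vld1_s8(w + i);
    int16x8_t wide = vmovl_s8(wv);      // widen filter values to 16 bit
    wsum_acc = vaddw_s16(wsum_acc, vget_low_s16(wide));
    wsum_acc = vaddw_s16(wsum_acc, vget_high_s16(wide));
    int16x8_t prod = vmull_s8(xv, wv);  // 8 widening int8 multiplies
    prod_acc = vaddw_s16(prod_acc, vget_low_s16(prod));
    prod_acc = vaddw_s16(prod_acc, vget_high_s16(prod));
  }
  // Reduce the four lanes, as the patch does with vgetq_lane_s32.
  int32_t acc = vgetq_lane_s32(prod_acc, 0) + vgetq_lane_s32(prod_acc, 1) +
                vgetq_lane_s32(prod_acc, 2) + vgetq_lane_s32(prod_acc, 3);
  int32_t wsum = vgetq_lane_s32(wsum_acc, 0) + vgetq_lane_s32(wsum_acc, 1) +
                 vgetq_lane_s32(wsum_acc, 2) + vgetq_lane_s32(wsum_acc, 3);
  for (; i < n; ++i) {  // scalar tail for the remaining n % 8 elements
    acc += x[i] * w[i];
    wsum += w[i];
  }
  return acc + input_offset * wsum;  // fold the input offset in once
}

int main() {
  int8_t x[19], w[19];
  int32_t ref = 0;
  for (int i = 0; i < 19; ++i) {
    x[i] = static_cast<int8_t>(i - 9);
    w[i] = static_cast<int8_t>(3 * i - 20);
    ref += (x[i] + 5) * w[i];  // plain scalar reference
  }
  std::printf("neon=%d ref=%d\n",
              static_cast<int>(neon_dot_s8(x, w, 19, 5)),
              static_cast<int>(ref));
  return 0;
}

Built for a NEON-capable target (e.g. 32-bit ARM with -mfpu=neon, or AArch64 where NEON is baseline), the two printed values should agree, mirroring the "output results match" claim in the commit message.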
