From: Zhao Zhili <quinkbl...@foxmail.com>

For 8 bit depth:
    ./ffmpeg -threads 1 -f lavfi -t 10 -i 'yuvtestsrc=size=4096x2048,format=yuv444p' -vf 'freezedetect' -f null -benchmark -

    Test results on Snapdragon 845:
    Before:
        frame=  250 fps= 23 q=-0.0 Lsize=N/A time=00:00:10.00 bitrate=N/A 
speed=0.924x
        bench: utime=8.360s stime=2.350s rtime=10.820s
    After:
        frame=  250 fps= 51 q=-0.0 Lsize=N/A time=00:00:10.00 bitrate=N/A 
speed=2.04x
        bench: utime=2.650s stime=2.210s rtime=4.909s

    Test results on HiSilicon Kirin 970:
    Before:
        frame=  250 fps=6.0 q=-0.0 Lsize=N/A time=00:00:10.00 bitrate=N/A 
speed=0.239x
        bench: utime=35.156s stime=6.604s rtime=41.820s
    After:
        frame=  250 fps= 10 q=-0.0 Lsize=N/A time=00:00:10.00 bitrate=N/A 
speed=0.403x
        bench: utime=18.400s stime=6.376s rtime=24.798s

For 16 bit depth:
    ./ffmpeg -threads 1 -f lavfi -t 10 -i 'yuvtestsrc=size=4096x2048,format=yuv444p16' -vf 'freezedetect' -f null -benchmark -

    Test results on Snapdragon 845:
    Before:
        frame=  250 fps= 19 q=-0.0 Lsize=N/A time=00:00:10.00 bitrate=N/A 
speed=0.756x
        bench: utime=8.700s stime=4.410s rtime=13.226s
    After:
        frame=  250 fps= 27 q=-0.0 Lsize=N/A time=00:00:10.00 bitrate=N/A 
speed=1.07x
        bench: utime=4.920s stime=4.350s rtime=9.356s

    Test results on HiSilicon Kirin 970:
    Before:
        frame=  250 fps=4.0 q=-0.0 Lsize=N/A time=00:00:10.00 bitrate=N/A 
speed=0.161x
        bench: utime=48.868s stime=13.124s rtime=62.110s
    After:
        frame=  250 fps=5.1 q=-0.0 Lsize=N/A time=00:00:10.00 bitrate=N/A 
speed=0.205x
        bench: utime=35.600s stime=13.036s rtime=48.708s
---
 libavfilter/aarch64/Makefile         |   2 +
 libavfilter/aarch64/scene_sad_init.c |  37 +++++++
 libavfilter/aarch64/scene_sad_neon.S | 149 +++++++++++++++++++++++++++
 libavfilter/scene_sad.c              |   2 +
 libavfilter/scene_sad.h              |   2 +
 5 files changed, 192 insertions(+)
 create mode 100644 libavfilter/aarch64/scene_sad_init.c
 create mode 100644 libavfilter/aarch64/scene_sad_neon.S

diff --git a/libavfilter/aarch64/Makefile b/libavfilter/aarch64/Makefile
index 6c727f9859..3a458f511f 100644
--- a/libavfilter/aarch64/Makefile
+++ b/libavfilter/aarch64/Makefile
@@ -1,7 +1,9 @@
 OBJS-$(CONFIG_NLMEANS_FILTER)                += aarch64/af_afir_init.o
 OBJS-$(CONFIG_NLMEANS_FILTER)                += aarch64/af_anlmdn_init.o
+OBJS-$(CONFIG_SCENE_SAD)                     += aarch64/scene_sad_init.o
 OBJS-$(CONFIG_NLMEANS_FILTER)                += aarch64/vf_nlmeans_init.o
 
 NEON-OBJS-$(CONFIG_NLMEANS_FILTER)           += aarch64/af_afir_neon.o
 NEON-OBJS-$(CONFIG_NLMEANS_FILTER)           += aarch64/af_anlmdn_neon.o
+NEON-OBJS-$(CONFIG_SCENE_SAD)                += aarch64/scene_sad_neon.o
 NEON-OBJS-$(CONFIG_NLMEANS_FILTER)           += aarch64/vf_nlmeans_neon.o
diff --git a/libavfilter/aarch64/scene_sad_init.c b/libavfilter/aarch64/scene_sad_init.c
new file mode 100644
index 0000000000..8de769ac10
--- /dev/null
+++ b/libavfilter/aarch64/scene_sad_init.c
@@ -0,0 +1,37 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/cpu.h"
+#include "libavfilter/scene_sad.h"
+
+void ff_scene_sad_neon(SCENE_SAD_PARAMS);
+
+void ff_scene_sad16_neon(SCENE_SAD_PARAMS);
+
+ff_scene_sad_fn ff_scene_sad_get_fn_aarch64(int depth)
+{
+    /* Pick the NEON SAD kernel for this sample depth; NULL means none applies. */
+    const int flags = av_get_cpu_flags();
+    if (!have_neon(flags))
+        return NULL;
+    switch (depth) {
+    case 8:  return ff_scene_sad_neon;
+    case 16: return ff_scene_sad16_neon;
+    default: return NULL;
+    }
+}
diff --git a/libavfilter/aarch64/scene_sad_neon.S b/libavfilter/aarch64/scene_sad_neon.S
new file mode 100644
index 0000000000..5b3b027a53
--- /dev/null
+++ b/libavfilter/aarch64/scene_sad_neon.S
@@ -0,0 +1,149 @@
+/*
+ * Copyright (c) 2020 Zhao Zhili
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+// void ff_scene_sadx_neon(const uint8_t *src1, ptrdiff_t stride1,
+//                         const uint8_t *src2, ptrdiff_t stride2,
+//                         ptrdiff_t width, ptrdiff_t height,
+//                         uint64_t *sum)
+.macro scene_sad_neon, depth=8
+       // x0: src1 (advanced by stride1 after every row)
+       // x1: stride1 (added to the src1 pointer as-is, i.e. a byte offset)
+       // x2: src2 (advanced by stride2 after every row)
+       // x3: stride2 (added to the src2 pointer as-is, i.e. a byte offset)
+       // x4: width in samples (index is scaled by 2 when depth is 16)
+       // x5: height in rows
+       // x6: sum (pointer; the 64-bit total is stored through it at the end)
+
+       // x7: samples consumed per vector iteration (64 for 8 bit, 32 for 16 bit)
+       // x8: index of row
+       // x9: width rounded down to a multiple of x7 (end of the vector part)
+       // x10: sad, the running total over all rows
+       // x11: index of column
+       // w12: src1[x] in the scalar loop, then the absolute difference
+       // w13: src2[x] in the scalar loop
+
+       mov     x8, xzr                 // row = 0
+       mov     x10, xzr                // sad = 0
+
+.if \depth == 8
+       mov     x7, #64                 // 64 one-byte samples per 64-byte load
+       and     x9, x4, #0xFFFFFFFFFFFFFFC0     // clear low 6 bits: multiple of 64
+.endif
+
+.if \depth == 16
+       mov     x7, #32                 // 32 two-byte samples per 64-byte load
+       and     x9, x4, #0xFFFFFFFFFFFFFFE0     // clear low 5 bits: multiple of 32
+.endif
+
+1:     cmp     x4, x7          // row start: is width smaller than one vector step?
+       mov     x11, xzr                // column = 0 (mov leaves the flags alone)
+       b.lt    3f                      // yes: the whole row goes through the scalar loop
+
+       mov     v0.d[0], x10            // seed d0 with the running total
+
+       // vector loop: 64 bytes from each source per iteration, until column == x9
+2:
+.if \depth == 8
+       add     x14, x0, x11            // x14 = address of src1[column]
+       add     x15, x2, x11            // x15 = address of src2[column]
+.endif
+
+.if \depth == 16
+       add     x14, x0, x11, lsl #1    // x14 = address of src1[column], 2 bytes per sample
+       add     x15, x2, x11, lsl #1    // x15 = address of src2[column], 2 bytes per sample
+.endif
+       ld1     {v16.4S, v17.4S, v18.4S, v19.4S}, [x14]         // 64 bytes of src1
+       ld1     {v20.4S, v21.4S, v22.4S, v23.4S}, [x15]         // 64 bytes of src2
+       add     x11, x11, x7            // column += step
+       cmp     x9, x11                 // flags are consumed by the b.ne at the loop end
+
+.if \depth == 8
+       uabd    v16.16B, v16.16B, v20.16B       // per-byte absolute difference
+       uabd    v17.16B, v17.16B, v21.16B
+       uabd    v18.16B, v18.16B, v22.16B
+       uabd    v19.16B, v19.16B, v23.16B
+       uaddlv  h16, v16.16B            // widen and sum the 16 byte lanes of each vector
+       uaddlv  h17, v17.16B
+       uaddlv  h18, v18.16B
+       uaddlv  h19, v19.16B
+.endif
+
+.if \depth == 16
+       uabd    v16.8H, v16.8H, v20.8H  // per-halfword absolute difference
+       uabd    v17.8H, v17.8H, v21.8H
+       uabd    v18.8H, v18.8H, v22.8H
+       uabd    v19.8H, v19.8H, v23.8H
+       uaddlv  s16, v16.8H             // widen and sum the 8 halfword lanes of each vector
+       uaddlv  s17, v17.8H
+       uaddlv  s18, v18.8H
+       uaddlv  s19, v19.8H
+.endif
+
+       add     d16, d16, d17           // fold the four partial sums pairwise
+       add     d18, d18, d19
+       add     d0, d0, d16             // and accumulate them into the total in d0
+       add     d0, d0, d18
+
+       b.ne    2b                      // uses the cmp x9, x11 above; SIMD ops do not set flags
+
+       cmp     x9, x4                  // was the width an exact multiple of the step?
+       fmov    x10, d0                 // move the total back to the general register
+       b.eq    4f                      // yes: no tail columns, go to the next row
+
+       // scalar loop (tested at the bottom, so it runs at least once; NOTE(review): a zero width would not stop at the first compare, presumably never passed, confirm with callers)
+3:
+.if \depth == 8
+       ldrb    w12, [x0, x11]          // src1[column]
+       ldrb    w13, [x2, x11]          // src2[column]
+.endif
+
+.if \depth == 16
+       ldrh    w12, [x0, x11, lsl #1]  // src1[column], 2 bytes per sample
+       ldrh    w13, [x2, x11, lsl #1]  // src2[column], 2 bytes per sample
+.endif
+       add     x11, x11, #1            // column++
+       subs    w12, w12, w13           // signed difference, sets the N flag
+       cneg    w12, w12, mi            // negate when negative: absolute difference
+       add     x10, x10, x12           // sad += abs diff (writing w12 zeroed the top half of x12)
+       cmp     x11, x4
+       b.ne    3b                      // until column == width
+
+       // next row (also tested at the bottom; NOTE(review): assumes height is nonzero, confirm with callers)
+4:
+       add     x8, x8, #1              // row++
+       add     x0, x0, x1              // src1 += stride1
+       cmp     x8, x5
+       add     x2, x2, x3              // src2 += stride2 (plain add keeps the flags)
+       b.ne    1b                      // until row == height
+
+5:
+       str     x10, [x6]               // store the total through the sum pointer
+       ret
+.endm
+
+function ff_scene_sad_neon, export=1
+       scene_sad_neon  depth=8
+endfunc
+
+function ff_scene_sad16_neon, export=1
+       scene_sad_neon  depth=16
+endfunc
diff --git a/libavfilter/scene_sad.c b/libavfilter/scene_sad.c
index 73d3eacbfa..ee0c71f659 100644
--- a/libavfilter/scene_sad.c
+++ b/libavfilter/scene_sad.c
@@ -61,6 +61,8 @@ ff_scene_sad_fn ff_scene_sad_get_fn(int depth)
     ff_scene_sad_fn sad = NULL;
     if (ARCH_X86)
         sad = ff_scene_sad_get_fn_x86(depth);
+    if (ARCH_AARCH64)
+        sad = ff_scene_sad_get_fn_aarch64(depth);
     if (!sad) {
         if (depth == 8)
             sad = ff_scene_sad_c;
diff --git a/libavfilter/scene_sad.h b/libavfilter/scene_sad.h
index 173a051f2b..c868200dc4 100644
--- a/libavfilter/scene_sad.h
+++ b/libavfilter/scene_sad.h
@@ -37,6 +37,8 @@ void ff_scene_sad_c(SCENE_SAD_PARAMS);
 
 void ff_scene_sad16_c(SCENE_SAD_PARAMS);
 
+ff_scene_sad_fn ff_scene_sad_get_fn_aarch64(int depth);
+
 ff_scene_sad_fn ff_scene_sad_get_fn_x86(int depth);
 
 ff_scene_sad_fn ff_scene_sad_get_fn(int depth);
-- 
2.22.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Reply via email to