From: Zhao Zhili <zhiliz...@tencent.com>
Test on Apple M1:
rgb24_to_uv_1080_c: 7.2
rgb24_to_uv_1080_neon: 5.5
rgb24_to_uv_1280_c: 8.2
rgb24_to_uv_1280_neon: 6.2
rgb24_to_uv_1920_c: 12.5
rgb24_to_uv_1920_neon: 9.5
rgb24_to_uv_half_540_c: 6.5
rgb24_to_uv_half_540_neon: 3.0
rgb24_to_uv_half_640_c: 7.5
rgb24_to_uv_half_640_neon: 3.2
rgb24_to_uv_half_960_c: 12.5
rgb24_to_uv_half_960_neon: 6.0
rgb24_to_y_1080_c: 4.5
rgb24_to_y_1080_neon: 3.5
rgb24_to_y_1280_c: 5.2
rgb24_to_y_1280_neon: 4.2
rgb24_to_y_1920_c: 8.0
rgb24_to_y_1920_neon: 6.0
Signed-off-by: Zhao Zhili <zhiliz...@tencent.com>
---
libswscale/aarch64/Makefile | 1 +
libswscale/aarch64/input.S | 229 +++++++++++++++++++++++++++++++++++
libswscale/aarch64/swscale.c | 25 ++++
3 files changed, 255 insertions(+)
create mode 100644 libswscale/aarch64/input.S
diff --git a/libswscale/aarch64/Makefile b/libswscale/aarch64/Makefile
index da1d909561..adfd90a1b6 100644
--- a/libswscale/aarch64/Makefile
+++ b/libswscale/aarch64/Makefile
@@ -3,6 +3,7 @@ OBJS += aarch64/rgb2rgb.o \
aarch64/swscale_unscaled.o \
NEON-OBJS += aarch64/hscale.o \
+ aarch64/input.o \
aarch64/output.o \
aarch64/rgb2rgb_neon.o \
aarch64/yuv2rgb_neon.o \
diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S
new file mode 100644
index 0000000000..ee0d223c6e
--- /dev/null
+++ b/libswscale/aarch64/input.S
@@ -0,0 +1,229 @@
+/*
+ * Copyright (c) 2024 Zhao Zhili <quinkbl...@foxmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+.macro rgb24_to_yuv_load_rgb, src
+        ld3             { v16.16b, v17.16b, v18.16b }, [\src]   // de-interleave 16 packed pixels: v16 = r, v17 = g, v18 = b
+        uxtl            v19.8h, v16.8b                          // v19 = r of pixels 0-7, zero-extended to 16 bit
+        uxtl            v20.8h, v17.8b                          // v20 = g of pixels 0-7
+        uxtl            v21.8h, v18.8b                          // v21 = b of pixels 0-7
+        uxtl2           v22.8h, v16.16b                         // v22 = r of pixels 8-15
+        uxtl2           v23.8h, v17.16b                         // v23 = g of pixels 8-15
+        uxtl2           v24.8h, v18.16b                         // v24 = b of pixels 8-15
+.endm
+
+.macro rgb24_to_yuv_product, r, g, b, dst1, dst2, dst, coef0, coef1, coef2, right_shift
+        mov             \dst1\().16b, v6.16b                    // dst1 = const_offset (read from v6)
+        mov             \dst2\().16b, v6.16b                    // dst2 = const_offset (read from v6)
+        smlal           \dst1\().4s, \coef0\().4h, \r\().4h     // dst1 += coef0 * r, low four lanes widened to 32 bit
+        smlal2          \dst2\().4s, \coef0\().8h, \r\().8h     // dst2 += coef0 * r, high four lanes
+        smlal           \dst1\().4s, \coef1\().4h, \g\().4h     // dst1 += coef1 * g
+        smlal2          \dst2\().4s, \coef1\().8h, \g\().8h     // dst2 += coef1 * g
+        smlal           \dst1\().4s, \coef2\().4h, \b\().4h     // dst1 += coef2 * b
+        smlal2          \dst2\().4s, \coef2\().8h, \b\().8h     // dst2 += coef2 * b
+        sqshrn          \dst\().4h, \dst1\().4s, \right_shift   // dst low half  = saturating narrow of (dst1 >> right_shift)
+        sqshrn2         \dst\().8h, \dst2\().4s, \right_shift   // dst high half = saturating narrow of (dst2 >> right_shift)
+.endm
+
+function ff_rgb24ToY_neon, export=1
+ cmp w4, #0 // check width > 0
+ b.le 4f
+
+ ldp w10, w11, [x5], #8 // w10: ry, w11: gy