aarch64: Add rgb24 to yuv implementation

Martin Storsjö Wed, 05 Jun 2024 00:34:39 -0700

On Wed, 5 Jun 2024, Zhao Zhili wrote:

On Jun 5, 2024, at 14:29, Rémi Denis-Courmont <r...@remlab.net> wrote:


Le 4 juin 2024 16:55:01 GMT+03:00, Zhao Zhili <quinkbl...@foxmail.com 
<mailto:quinkbl...@foxmail.com>> a écrit :

From: Zhao Zhili <zhiliz...@tencent.com>

Test on Apple M1:

rgb24_to_uv_1080_c: 7.2
rgb24_to_uv_1080_neon: 5.5
rgb24_to_uv_1280_c: 8.2
rgb24_to_uv_1280_neon: 6.2
rgb24_to_uv_1920_c: 12.5
rgb24_to_uv_1920_neon: 9.5

rgb24_to_uv_half_540_c: 6.5
rgb24_to_uv_half_540_neon: 3.0
rgb24_to_uv_half_640_c: 7.5
rgb24_to_uv_half_640_neon: 3.2
rgb24_to_uv_half_960_c: 12.5
rgb24_to_uv_half_960_neon: 6.0

rgb24_to_y_1080_c: 4.5
rgb24_to_y_1080_neon: 3.5
rgb24_to_y_1280_c: 5.2
rgb24_to_y_1280_neon: 4.2
rgb24_to_y_1920_c: 8.0
rgb24_to_y_1920_neon: 6.0

Signed-off-by: Zhao Zhili <zhiliz...@tencent.com>
---
libswscale/aarch64/Makefile  |   1 +
libswscale/aarch64/input.S   | 229 +++++++++++++++++++++++++++++++++++
libswscale/aarch64/swscale.c |  25 ++++
3 files changed, 255 insertions(+)
create mode 100644 libswscale/aarch64/input.S

diff --git a/libswscale/aarch64/Makefile b/libswscale/aarch64/Makefile
index da1d909561..adfd90a1b6 100644
--- a/libswscale/aarch64/Makefile
+++ b/libswscale/aarch64/Makefile
@@ -3,6 +3,7 @@ OBJS        += aarch64/rgb2rgb.o                \
              aarch64/swscale_unscaled.o       \

NEON-OBJS   += aarch64/hscale.o                 \
+               aarch64/input.o                  \
              aarch64/output.o                 \
              aarch64/rgb2rgb_neon.o           \
              aarch64/yuv2rgb_neon.o           \
diff --git a/libswscale/aarch64/input.S b/libswscale/aarch64/input.S
new file mode 100644
index 0000000000..ee0d223c6e
--- /dev/null
+++ b/libswscale/aarch64/input.S
@@ -0,0 +1,229 @@
+/*
+ * Copyright (c) 2024 Zhao Zhili <quinkbl...@foxmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+.macro rgb24_to_yuv_load_rgb, src
+        ld3             { v16.16b, v17.16b, v18.16b }, [\src]
+        ushll           v19.8h, v16.8b, #0         // v19: r
+        ushll           v20.8h, v17.8b, #0         // v20: g
+        ushll           v21.8h, v18.8b, #0         // v21: b
+        ushll2          v22.8h, v16.16b, #0        // v22: r
+        ushll2          v23.8h, v17.16b, #0        // v23: g
+        ushll2          v24.8h, v18.16b, #0        // v24: b
+.endm
+
+.macro rgb24_to_yuv_product, r, g, b, dst1, dst2, dst, coef0, coef1, coef2, 
right_shift
+        mov             \dst1\().16b, v6.16b                    // dst1 = 
const_offset
+        mov             \dst2\().16b, v6.16b                    // dst2 = 
const_offset
+        smlal           \dst1\().4s, \coef0\().4h, \r\().4h     // dst1 += rx 
* r
+        smlal2          \dst2\().4s, \coef0\().8h, \r\().8h     // dst2 += rx 
* r
+        smlal           \dst1\().4s, \coef1\().4h, \g\().4h     // dst1 += gx 
* g
+        smlal2          \dst2\().4s, \coef1\().8h, \g\().8h     // dst2 += gx 
* g
+        smlal           \dst1\().4s, \coef2\().4h, \b\().4h     // dst1 += bx 
* b
+        smlal2          \dst2\().4s, \coef2\().8h, \b\().8h     // dst2 += bx 
* b
+        sqshrn          \dst\().4h, \dst1\().4s, \right_shift   // dst_lower_half = 
dst1 >> right_shift
+        sqshrn2         \dst\().8h, \dst2\().4s, \right_shift   // dst_higher_half = 
dst2 >> right_shift
+.endm
+
+function ff_rgb24ToY_neon, export=1
+        cmp             w4, #0                  // check width > 0
+        b.le            4f
+
+        ldp             w10, w11, [x5], #8       // w10: ry, w11: gy


I don't think it affects anything on your OoO execution hardware, but you're 
using the result of this load right off the bat in the next instruction. Ditto 
below. This may hurt perfs on not-so-fancy CPUs.


Will do.

+        dup             v0.8h, w10
+        dup             v1.8h, w11
+        ldr             w12, [x5]               // w12: by
+        dup             v2.8h, w12
+
+        mov             w9, #256                // w9 = 1 << (RGB2YUV_SHIFT - 
7)
+        movk            w9, #8, lsl #16         // w9 += 32 << (RGB2YUV_SHIFT 
- 1)
+        dup             v6.4s, w9               // w9: const_offset
+
+        mov             x2, #0                  // w2: i
+        and             w3, w4, #0xFFFFFFF0     // w3 = width / 16 * 16
+        cbz             w3, 3f
+1:
+        rgb24_to_yuv_load_rgb x1
+        rgb24_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9
+        rgb24_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9
+        stp             q16, q17, [x0], #32     // store to dst
+
+        add             w2, w2, #16             // i += 16
+        add             x1, x1, #48             // src += 48
+        cmp             w2, w3                  // i < (width / 16 * 16)
+        b.lt            1b
+        b               3f
+2:
+        ldrb            w13, [x1]               // w13: r
+        ldrb            w14, [x1, #1]           // w14: g
+        ldrb            w15, [x1, #2]           // w15: b


You can reorder instructions a little to use post-index and eliminate the ADD, 
though that won't make much difference.

I don't get why the perf gain is so low, or is this an artefact of Apple CPUs?


I have checked the assembly of C version. The compiler has done pretty well on 
loop unroll and
vectorize on this simple case.

To add some context here; ffmpeg's configure disables autovectorization with GCC (as it does miscompile things semi regularly), but not with Clang. This can give somewhat misleading numbers wrt the relative speedup.

Then additionally, the Apple CPUs do have slightly different performance characteristics than other cores too, indeed. Plus the very coarse timer used on macOS doesn't help either...

FWIW, here are some numbers for this patch from some more traditional CPUs, with a GCC build:


                         Cortex A53      A72      A78
rgb24_to_uv_1080_c:         19471.5   8720.7   7049.7
rgb24_to_uv_1080_neon:       5922.7   3147.5   2274.5
rgb24_to_uv_1280_c:         23067.0  10318.2   8348.5
rgb24_to_uv_1280_neon:       6842.5   3672.5   2656.5
rgb24_to_uv_1920_c:         34595.2  15483.2  12509.7
rgb24_to_uv_1920_neon:      10246.0   5496.7   3976.5
rgb24_to_uv_half_540_c:     11396.0   5481.0   4576.0
rgb24_to_uv_half_540_neon:   3655.7   1687.5   1382.5
rgb24_to_uv_half_640_c:     13546.0   6480.2   5399.0
rgb24_to_uv_half_640_neon:   4202.7   1958.2   1611.2
rgb24_to_uv_half_960_c:     20311.0   9724.2   8068.2
rgb24_to_uv_half_960_neon:   6282.7   2934.2   2372.2
rgb24_to_y_1080_c:          12984.2   4339.7   4074.2
rgb24_to_y_1080_neon:        3492.5   1960.5   1444.7
rgb24_to_y_1280_c:          15384.2   6709.2   4823.5
rgb24_to_y_1280_neon:        4038.2   2265.0   1674.0
rgb24_to_y_1920_c:          23069.7   7708.7   7224.7
rgb24_to_y_1920_neon:        6036.2   3389.0   2514.0


// Martin
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH 2/5] swscale/aarch64: Add rgb24 to yuv implementation

Reply via email to