+#include "libavutil/aarch64/asm.S"
+
+// Load 16 packed 3-byte pixels from [\src] and widen each channel to 16 bits.
+//   v16/v17/v18 : de-interleaved bytes of the r/g/b channel (16 each)
+//   v19/v20/v21 : r/g/b of pixels 0..7,  zero-extended to 8 x 16 bit
+//   v22/v23/v24 : r/g/b of pixels 8..15, zero-extended to 8 x 16 bit
+// \src itself is not advanced (no writeback).
+.macro rgb24_to_yuv_load_rgb, src
+        ld3     { v16.16b, v17.16b, v18.16b }, [\src]
+        uxtl    v19.8h, v16.8b          // r, pixels 0..7
+        uxtl2   v22.8h, v16.16b         // r, pixels 8..15
+        uxtl    v20.8h, v17.8b          // g, pixels 0..7
+        uxtl2   v23.8h, v17.16b         // g, pixels 8..15
+        uxtl    v21.8h, v18.8b          // b, pixels 0..7
+        uxtl2   v24.8h, v18.16b         // b, pixels 8..15
+.endm
+
+// For eight 16-bit lanes compute
+//   \dst = sat_s16((v6 + \coef0*\r + \coef1*\g + \coef2*\b) >> \right_shift)
+//
+//   \r, \g, \b     : 8 x 16-bit channel values (read only)
+//   \coef0..\coef2 : 8 x 16-bit coefficients (signed multiply)
+//   \dst1, \dst2   : 4 x 32-bit scratch accumulators (clobbered)
+//   \dst           : 8 x 16-bit result
+//   \right_shift   : shift immediate, passed including its '#'
+//   v6             : implicit input, constant offset in every 32-bit lane
+.macro rgb24_to_yuv_product, r, g, b, dst1, dst2, dst, coef0, coef1, coef2, right_shift
+        // Seed both accumulators with the constant offset.
+        mov     \dst1\().16b, v6.16b
+        mov     \dst2\().16b, v6.16b
+        // Lanes 0..3: dst1 += coef0 * r + coef1 * g + coef2 * b
+        smlal   \dst1\().4s, \coef0\().4h, \r\().4h
+        smlal   \dst1\().4s, \coef1\().4h, \g\().4h
+        smlal   \dst1\().4s, \coef2\().4h, \b\().4h
+        // Lanes 4..7: dst2 += coef0 * r + coef1 * g + coef2 * b
+        smlal2  \dst2\().4s, \coef0\().8h, \r\().8h
+        smlal2  \dst2\().4s, \coef1\().8h, \g\().8h
+        smlal2  \dst2\().4s, \coef2\().8h, \b\().8h
+        // Shift right, saturate to signed 16 bit, pack low then high half.
+        sqshrn  \dst\().4h, \dst1\().4s, \right_shift
+        sqshrn2 \dst\().8h, \dst2\().4s, \right_shift
+.endm
+
+function ff_rgb24ToY_neon, export=1
+ cmp w4, #0 // check width > 0
+ ldp w10, w11, [x5] // w10: ry, w11: gy
+ ldr w12, [x5, #8] // w12: by
+ b.le 3f
+
+ mov w9, #256 // w9 = 1 << (RGB2YUV_SHIFT -
7)
+ movk w9, #8, lsl #16 // w9 += 32 << (RGB2YUV_SHIFT
- 1)
+ dup v6.4s, w9 // w9: const_offset
+
+ cmp w4, #16
+ dup v0.8h, w10
+ dup v1.8h, w11
+ dup v2.8h, w12
+ b.lt 2f
+1:
+ rgb24_to_yuv_load_rgb x1
+ rgb24_to_yuv_product v19, v20, v21, v25, v26, v16, v0, v1, v2, #9
+ rgb24_to_yuv_product v22, v23, v24, v27, v28, v17, v0, v1, v2, #9
+ sub w4, w4, #16 // width -= 16
+ add x1, x1, #48 // src += 48
+ cmp w4, #16 // width >= 16 ?
+ stp q16, q17, [x0], #32 // store to dst
+ b.ge 1b
+ cbz x4, 3f
+2:
+ ldrb w13, [x1] // w13: r
+ ldrb w14, [x1, #1] // w14: g
+ ldrb w15, [x1, #2] // w15: b
+
+ smaddl x13, w13, w10, x9 // x13 = ry * r + const_offset
+ smaddl x13, w14, w11, x13 // x13 += gy * g
+ smaddl x13, w15, w12, x13 // x13 += by * b
+ asr w13, w13, #9 // x13 >>= 9
+ sub w4, w4, #1 // i++