The branch, master has been updated
via eb14d4582447aa6560a47e5d8c6f142d76540766 (commit)
via 6450e01446f5c8c48e4cd2fd43c805ba2991d9ba (commit)
from f07c12d806b9a6f3ca05870a99d10a818a3aabc1 (commit)
- Log -----------------------------------------------------------------
commit eb14d4582447aa6560a47e5d8c6f142d76540766
Author: Zhao Zhili <[email protected]>
AuthorDate: Thu Aug 21 20:44:37 2025 +0800
Commit: Zhao Zhili <[email protected]>
CommitDate: Mon Sep 1 15:35:16 2025 +0000
avfilter/vf_colordetect: add aarch64 asm
| rpi5 gcc 12 | m1 clang -fno-vectorize | m1 clang
---------------------------------------------------------------------------
alpha_8_full_c: | 32159.2 ( 1.00x) | 135.8 ( 1.00x) | 26.4 ( 1.00x)
alpha_8_full_neon: | 1266.0 (25.40x) | 8.0 (17.03x) | 8.4 ( 3.15x)
alpha_8_limited_c: | 37561.9 ( 1.00x) | 169.1 ( 1.00x) | 47.7 ( 1.00x)
alpha_8_limited_neon: | 3967.0 ( 9.47x) | 12.5 (13.53x) | 13.3 ( 3.59x)
alpha_16_full_c: | 15867.9 ( 1.00x) | 64.5 ( 1.00x) | 13.7 ( 1.00x)
alpha_16_full_neon: | 1256.9 (12.62x) | 7.9 ( 8.15x) | 8.3 ( 1.64x)
alpha_16_limited_c: | 16723.7 ( 1.00x) | 88.7 ( 1.00x) | 103.3 ( 1.00x)
alpha_16_limited_neon: | 4031.3 ( 4.15x) | 12.5 ( 7.08x) | 13.2 ( 7.86x)
range_8_c: | 21819.7 ( 1.00x) | 120.0 ( 1.00x) | 9.4 ( 1.00x)
range_8_neon: | 1148.3 (19.00x) | 4.3 (27.60x) | 4.8 ( 1.97x)
range_16_c: | 10757.1 ( 1.00x) | 45.7 ( 1.00x) | 7.9 ( 1.00x)
range_16_neon: | 1141.5 ( 9.42x) | 4.4 (10.38x) | 4.6 ( 1.72x)
diff --git a/libavfilter/aarch64/Makefile b/libavfilter/aarch64/Makefile
index b68209bc94..c7b7e18467 100644
--- a/libavfilter/aarch64/Makefile
+++ b/libavfilter/aarch64/Makefile
@@ -1,5 +1,7 @@
OBJS-$(CONFIG_BWDIF_FILTER) += aarch64/vf_bwdif_init_aarch64.o
+OBJS-$(CONFIG_COLORDETECT_FILTER) += aarch64/vf_colordetect_init.o
OBJS-$(CONFIG_NLMEANS_FILTER) += aarch64/vf_nlmeans_init.o
NEON-OBJS-$(CONFIG_BWDIF_FILTER) += aarch64/vf_bwdif_neon.o
+NEON-OBJS-$(CONFIG_COLORDETECT_FILTER) += aarch64/vf_colordetect_neon.o
NEON-OBJS-$(CONFIG_NLMEANS_FILTER) += aarch64/vf_nlmeans_neon.o
diff --git a/libavfilter/aarch64/vf_colordetect_init.c
b/libavfilter/aarch64/vf_colordetect_init.c
new file mode 100644
index 0000000000..4db6b90542
--- /dev/null
+++ b/libavfilter/aarch64/vf_colordetect_init.c
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2025 Zhao Zhili <[email protected]>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/cpu.h"
+#include "libavfilter/vf_colordetect.h"
+
+/* Prototypes of the NEON routines implemented in vf_colordetect_neon.S.
+ * The *16 variants are the ones selected by the init function below when
+ * depth > 8. */
+int ff_detect_alpha_full_neon(const uint8_t *color, ptrdiff_t color_stride,
+ const uint8_t *alpha, ptrdiff_t alpha_stride,
+ ptrdiff_t width, ptrdiff_t height,
+ int alpha_max, int mpeg_range, int offset);
+
+int ff_detect_alpha16_full_neon(const uint8_t *color, ptrdiff_t color_stride,
+ const uint8_t *alpha, ptrdiff_t alpha_stride,
+ ptrdiff_t width, ptrdiff_t height,
+ int alpha_max, int mpeg_range, int offset);
+
+int ff_detect_alpha_limited_neon(const uint8_t *color, ptrdiff_t color_stride,
+ const uint8_t *alpha, ptrdiff_t alpha_stride,
+ ptrdiff_t width, ptrdiff_t height,
+ int alpha_max, int mpeg_range, int offset);
+
+int ff_detect_alpha16_limited_neon(const uint8_t *color, ptrdiff_t
color_stride,
+ const uint8_t *alpha, ptrdiff_t
alpha_stride,
+ ptrdiff_t width, ptrdiff_t height,
+ int alpha_max, int mpeg_range, int offset);
+
+/* Range checks: return non-zero when a sample lies outside
+ * [mpeg_min, mpeg_max] (see the assembly for the exact return values). */
+int ff_detect_range_neon(const uint8_t *data, ptrdiff_t stride,
+ ptrdiff_t width, ptrdiff_t height,
+ int mpeg_min, int mpeg_max);
+
+int ff_detect_range16_neon(const uint8_t *data, ptrdiff_t stride,
+ ptrdiff_t width, ptrdiff_t height,
+ int mpeg_min, int mpeg_max);
+
+/**
+ * Install the NEON implementations into dsp when the CPU reports NEON.
+ *
+ * @param dsp         context whose detect_range / detect_alpha pointers are
+ *                    overwritten; left untouched when NEON is unavailable
+ * @param depth       bit depth; values above 8 select the 16-bit routines
+ * @param color_range AVCOL_RANGE_JPEG selects the "full" alpha routine,
+ *                    any other value selects the "limited" one
+ */
+av_cold void ff_color_detect_dsp_init_aarch64(FFColorDetectDSPContext *dsp,
int depth,
+ enum AVColorRange color_range)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (have_neon(cpu_flags)) {
+ dsp->detect_range = depth > 8 ? ff_detect_range16_neon :
ff_detect_range_neon;
+ if (color_range == AVCOL_RANGE_JPEG) /* full range */
+ dsp->detect_alpha = depth > 8 ? ff_detect_alpha16_full_neon :
ff_detect_alpha_full_neon;
+ else /* every other range value uses the limited-range check */
+ dsp->detect_alpha = depth > 8 ? ff_detect_alpha16_limited_neon :
ff_detect_alpha_limited_neon;
+ }
+}
diff --git a/libavfilter/aarch64/vf_colordetect_neon.S
b/libavfilter/aarch64/vf_colordetect_neon.S
new file mode 100644
index 0000000000..f3cca16fed
--- /dev/null
+++ b/libavfilter/aarch64/vf_colordetect_neon.S
@@ -0,0 +1,480 @@
+/*
+ * Copyright (c) 2025 Zhao Zhili <[email protected]>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+#define FF_ALPHA_TRANSPARENT (1 << 0)
+#define FF_ALPHA_STRAIGHT ((1 << 1) | FF_ALPHA_TRANSPARENT)
+
+// Lane masks for the partial vector at the end of a row.
+// Layout: 16 bytes of 255, then (at mask_start) 16 bytes of 0, then 16 bytes
+// of 255. Loading 16 bytes starting k bytes before mask_start gives k bytes
+// of 255 followed by 16 - k bytes of 0; the following 16 bytes are the
+// inverse (k bytes of 0 followed by 16 - k bytes of 255).
+const mask
+ .byte 255, 255, 255, 255, 255, 255, 255, 255
+ .byte 255, 255, 255, 255, 255, 255, 255, 255
+mask_start:
+ .byte 0, 0, 0, 0, 0, 0, 0, 0
+ .byte 0, 0, 0, 0, 0, 0, 0, 0
+ .byte 255, 255, 255, 255, 255, 255, 255, 255
+ .byte 255, 255, 255, 255, 255, 255, 255, 255
+endconst
+
+// v3 = 255 in the first (x7 << shift) bytes, 0 in the rest.
+// x7 is the leftover element count set up by the caller; clobbers x9.
+.macro load_mask_zero, shift=0
+ movrel x9, mask_start
+ sub x9, x9, x7, lsl #(\shift) // step back by the tail size in bytes
+ ldr q3, [x9]
+.endm
+
+// v3 as in load_mask_zero, and v4 = the inverse of v3
+// (0 in the first (x7 << shift) bytes, 255 in the rest). Clobbers x9.
+.macro load_mask, shift=0
+ movrel x9, mask_start
+ sub x9, x9, x7, lsl #(\shift) // step back by the tail size in bytes
+ ld1 {v3.16b, v4.16b}, [x9] // v3 from x9, v4 from x9 + 16
+.endm
+
+/* Scan rows of 8-bit samples. After each row, return 1 if any sample seen
+ * so far was below mpeg_min or above mpeg_max; return 0 once all rows have
+ * been scanned without such a sample.
+ *
+ * x0: const uint8_t *data
+ * x1: ptrdiff_t stride
+ * x2: ptrdiff_t width
+ * x3: ptrdiff_t height
+ * w4: int mpeg_min
+ * w5: int mpeg_max
+ */
+function ff_detect_range_neon, export=1
+ ands x7, x2, #15 // width % 16 (flags are consumed by the b.eq below)
+ bic x8, x2, #15 // width / 16 * 16
+ bic x6, x2, #31 // width / 32 * 32
+ and x10, x2, #16 // check x8 != x6
+ dup v0.16b, w4 // mpeg_min
+ dup v1.16b, w5 // mpeg_max
+ movi v2.16b, #0 // cond
+ sub x1, x1, x8 // stride minus the bytes consumed by post-indexed loads
+ b.eq 1f // width is a multiple of 16: no tail mask needed
+ load_mask_zero
+1: // start of a row
+ cbz x6, 20f // width < 32
+ mov x12, x6
+2: // 32 bytes per iteration
+ ld1 {v5.16b, v6.16b}, [x0], #32
+ cmhi v16.16b, v0.16b, v5.16b // sample < mpeg_min
+ cmhi v17.16b, v5.16b, v1.16b // sample > mpeg_max
+ cmhi v18.16b, v0.16b, v6.16b
+ cmhi v19.16b, v6.16b, v1.16b
+ orr v20.16b, v16.16b, v17.16b
+ orr v21.16b, v18.16b, v19.16b
+ subs x12, x12, #32
+ orr v20.16b, v20.16b, v21.16b
+ orr v2.16b, v2.16b, v20.16b // accumulate into cond
+ b.gt 2b
+20:
+ cbz x10, 3f // bit 4 of width clear: no single 16-byte vector left
+ ldr q20, [x0], #16
+ cmhi v16.16b, v0.16b, v20.16b
+ cmhi v17.16b, v20.16b, v1.16b
+ orr v16.16b, v16.16b, v17.16b
+ orr v2.16b, v2.16b, v16.16b
+3:
+ cbz x7, 4f // no partial vector
+ ldr q21, [x0] // full 16-byte load; only the first x7 bytes belong to the row
+ cmhi v18.16b, v0.16b, v21.16b
+ cmhi v19.16b, v21.16b, v1.16b
+ orr v16.16b, v18.16b, v19.16b
+ and v16.16b, v16.16b, v3.16b // discard lanes past the end of the row
+ orr v2.16b, v2.16b, v16.16b
+4:
+ umaxv b4, v2.16b // non-zero if any lane of cond is set
+ subs x3, x3, #1 // one row done; flags are consumed by b.gt
+ umov w9, v4.b[0]
+ add x0, x0, x1 // advance to the next row
+ cbnz w9, 8f // out-of-range sample seen
+ b.gt 1b // more rows left (cbnz does not modify the flags)
+ mov x0, #0
+ ret
+8:
+ mov x0, #1
+ ret
+endfunc
+
+/* 16-bit variant of the range check: samples are compared as unsigned
+ * halfwords and width is counted in samples (two bytes each). After each
+ * row, return 1 if any sample seen so far was below mpeg_min or above
+ * mpeg_max; return 0 once all rows have been scanned without one.
+ *
+ * x0: const uint8_t *data
+ * x1: ptrdiff_t stride
+ * x2: ptrdiff_t width
+ * x3: ptrdiff_t height
+ * w4: int mpeg_min
+ * w5: int mpeg_max
+ */
+function ff_detect_range16_neon, export=1
+ ands x7, x2, #7 // width % 8 (flags are consumed by the b.eq below)
+ bic x8, x2, #7 // width / 8 * 8
+ bic x6, x2, #15 // width / 16 * 16
+ and x10, x2, #8 // check x8 != x6
+ dup v0.8h, w4 // mpeg_min
+ dup v1.8h, w5 // mpeg_max
+ movi v2.16b, #0 // cond
+ sub x1, x1, x8, lsl #1 // stride minus the bytes consumed by post-indexed loads
+ b.eq 1f // width is a multiple of 8: no tail mask needed
+ load_mask_zero shift=1
+1: // start of a row
+ cbz x6, 20f // width < 16
+ mov x12, x6
+2: // 16 samples (32 bytes) per iteration
+ ld1 {v5.8h, v6.8h}, [x0], #32
+ cmhi v16.8h, v0.8h, v5.8h // sample < mpeg_min
+ cmhi v17.8h, v5.8h, v1.8h // sample > mpeg_max
+ cmhi v18.8h, v0.8h, v6.8h
+ cmhi v19.8h, v6.8h, v1.8h
+ orr v20.16b, v16.16b, v17.16b
+ orr v21.16b, v18.16b, v19.16b
+ subs x12, x12, #16
+ orr v20.16b, v20.16b, v21.16b
+ orr v2.16b, v2.16b, v20.16b // accumulate into cond
+ b.gt 2b
+20:
+ cbz x10, 3f // bit 3 of width clear: no single 8-sample vector left
+ ldr q20, [x0], #16
+ cmhi v16.8h, v0.8h, v20.8h
+ cmhi v17.8h, v20.8h, v1.8h
+ orr v16.16b, v16.16b, v17.16b
+ orr v2.16b, v2.16b, v16.16b
+3:
+ cbz x7, 4f // no partial vector
+ ldr q21, [x0] // full 16-byte load; only the first x7 samples belong to the row
+ cmhi v18.8h, v0.8h, v21.8h
+ cmhi v19.8h, v21.8h, v1.8h
+ orr v16.16b, v18.16b, v19.16b
+ and v16.16b, v16.16b, v3.16b // discard lanes past the end of the row
+ orr v2.16b, v2.16b, v16.16b
+4:
+ umaxv h4, v2.8h // non-zero if any lane of cond is set
+ subs x3, x3, #1 // one row done; flags are consumed by b.gt
+ umov w9, v4.h[0]
+ add x0, x0, x1 // advance to the next row
+ cbnz w9, 8f // out-of-range sample seen
+ b.gt 1b // more rows left (cbnz does not modify the flags)
+ mov x0, #0
+ ret
+8:
+ mov x0, #1
+ ret
+endfunc
+
+/*
+ * Compare 8-bit color samples against the matching alpha samples.
+ * Returns FF_ALPHA_STRAIGHT after the first row in which some color sample
+ * is greater than its alpha sample. Otherwise returns 0 when every alpha
+ * sample equals alpha_max, and FF_ALPHA_TRANSPARENT when one does not.
+ * The mpeg_range and offset arguments of the C prototype are not read;
+ * x7 is reused as scratch.
+ *
+ * x0: const uint8_t *color,
+ * x1: ptrdiff_t color_stride,
+ * x2: const uint8_t *alpha,
+ * x3: ptrdiff_t alpha_stride,
+ * x4: ptrdiff_t width,
+ * x5: ptrdiff_t height,
+ * w6: int alpha_max,
+ */
+function ff_detect_alpha_full_neon, export=1
+ ands x7, x4, #15 // width % 16 (flags are consumed by the b.eq below)
+ bic x8, x4, #15 // width / 16 * 16
+ movi v0.16b, #0 // accumulates color > alpha
+ movi v1.16b, #255 // accumulates alpha == alpha_max
+ dup v2.16b, w6 // alpha_max
+ sub x1, x1, x8 // color_stride - aligned_width
+ sub x3, x3, x8 // alpha_stride - aligned_width
+ b.eq 1f
+
+ // Create mask for non-aligned width
+ load_mask
+1: // start of a row
+ cbz x8, 20f // width < 16
+ mov x12, x8 // x12: aligned_width
+2:
+ ldr q5, [x0], #16 // color
+ ldr q6, [x2], #16 // alpha
+ subs x12, x12, #16
+ cmhi v7.16b, v5.16b, v6.16b // color > alpha
+ cmeq v16.16b, v6.16b, v2.16b // alpha == alpha_max
+ orr v0.16b, v0.16b, v7.16b
+ and v1.16b, v1.16b, v16.16b
+ b.gt 2b
+20:
+ cbz w7, 3f
+ // handle loop tail
+ ldr q5, [x0] // full 16-byte loads; only the first x7 bytes belong to the row
+ ldr q6, [x2]
+ cmhi v7.16b, v5.16b, v6.16b
+ cmeq v16.16b, v6.16b, v2.16b
+ and v7.16b, v7.16b, v3.16b // ignore color > alpha past the end of the row
+ orr v16.16b, v16.16b, v4.16b // count lanes past the end as alpha == alpha_max
+ orr v0.16b, v0.16b, v7.16b
+ and v1.16b, v1.16b, v16.16b
+3:
+ umaxv b17, v0.16b // non-zero if any color > alpha was seen
+ subs x5, x5, #1 // one row done; flags are consumed by b.gt
+ umov w9, v17.b[0]
+ add x0, x0, x1 // next color row
+ add x2, x2, x3 // next alpha row
+ cbnz w9, 4f
+ b.gt 1b // more rows left (cbnz does not modify the flags)
+
+ uminv b1, v1.16b // non-zero only if every alpha lane matched alpha_max
+ umov w9, v1.b[0]
+ mov x0, #0
+ cbnz w9, 5f // all alpha == alpha_max: return 0
+ mov x0, #FF_ALPHA_TRANSPARENT
+ ret
+4:
+ mov x0, #FF_ALPHA_STRAIGHT
+5:
+ ret
+endfunc
+
+/*
+ * 16-bit variant: color and alpha are compared as unsigned halfwords and
+ * width is counted in samples (two bytes each).
+ * Returns FF_ALPHA_STRAIGHT after the first row in which some color sample
+ * is greater than its alpha sample. Otherwise returns 0 when every alpha
+ * sample equals alpha_max, and FF_ALPHA_TRANSPARENT when one does not.
+ * The mpeg_range and offset arguments of the C prototype are not read;
+ * x7 is reused as scratch.
+ *
+ * x0: const uint8_t *color,
+ * x1: ptrdiff_t color_stride,
+ * x2: const uint8_t *alpha,
+ * x3: ptrdiff_t alpha_stride,
+ * x4: ptrdiff_t width,
+ * x5: ptrdiff_t height,
+ * w6: int alpha_max,
+ */
+function ff_detect_alpha16_full_neon, export=1
+ ands x7, x4, #7 // width % 8 (flags are consumed by the b.eq below)
+ bic x8, x4, #7 // width / 8 * 8
+ movi v0.8h, #0 // accumulates color > alpha
+ movi v1.16b, #255 // accumulates alpha == alpha_max
+ dup v2.8h, w6 // alpha_max
+ sub x1, x1, x8, lsl #1 // color_stride -
(aligned_width * 2)
+ sub x3, x3, x8, lsl #1 // alpha_stride -
(aligned_width * 2)
+ b.eq 1f
+
+ // Create mask for non-aligned width
+ load_mask shift=1
+1: // start of a row
+ cbz x8, 20f // width < 8
+ mov x12, x8 // x12: aligned_width
+2:
+ ldr q5, [x0], #16 // color
+ ldr q6, [x2], #16 // alpha
+ subs x12, x12, #8
+ cmhi v7.8h, v5.8h, v6.8h // color > alpha
+ cmeq v16.8h, v6.8h, v2.8h // alpha == alpha_max
+ orr v0.16b, v0.16b, v7.16b
+ and v1.16b, v1.16b, v16.16b
+ b.gt 2b
+20:
+ cbz w7, 3f
+ // handle loop tail
+ ldr q5, [x0] // full 16-byte loads; only the first x7 samples belong to the row
+ ldr q6, [x2]
+ cmhi v7.8h, v5.8h, v6.8h
+ cmeq v16.8h, v6.8h, v2.8h
+ and v7.16b, v7.16b, v3.16b // ignore color > alpha past the end of the row
+ orr v16.16b, v16.16b, v4.16b // count lanes past the end as alpha == alpha_max
+ orr v0.16b, v0.16b, v7.16b
+ and v1.16b, v1.16b, v16.16b
+3:
+ umaxv h17, v0.8h // non-zero if any color > alpha was seen
+ subs x5, x5, #1 // one row done; flags are consumed by b.gt
+ umov w9, v17.h[0]
+ add x0, x0, x1 // next color row
+ add x2, x2, x3 // next alpha row
+ cbnz w9, 4f
+ b.gt 1b // more rows left (cbnz does not modify the flags)
+
+ uminv h1, v1.8h // non-zero only if every alpha lane matched alpha_max
+ umov w9, v1.h[0]
+ mov x0, #0
+ cbnz w9, 5f // all alpha == alpha_max: return 0
+ mov x0, #FF_ALPHA_TRANSPARENT
+ ret
+4:
+ mov x0, #FF_ALPHA_STRAIGHT
+5:
+ ret
+endfunc
+
+/*
+ * Limited-range alpha check on 8-bit samples. Per sample the test is
+ *   alpha_max * color - offset > mpeg_range * alpha
+ * evaluated on 16-bit products, with the subtraction saturating at 0.
+ * Returns FF_ALPHA_STRAIGHT after the first row in which the test is true
+ * for some sample. Otherwise returns 0 when every alpha sample equals
+ * alpha_max, and FF_ALPHA_TRANSPARENT when one does not.
+ *
+ * x0: const uint8_t *color,
+ * x1: ptrdiff_t color_stride,
+ * x2: const uint8_t *alpha,
+ * x3: ptrdiff_t alpha_stride,
+ * x4: ptrdiff_t width,
+ * x5: ptrdiff_t height,
+ * w6: int alpha_max,
+ * w7: int mpeg_range
+ * [sp]: int offset
+ */
+function ff_detect_alpha_limited_neon, export=1
+ dup v17.16b, w7 // mpeg_range (read first: x7 is reused below)
+ ldr w13, [sp] // offset, passed on the stack
+ movi v0.16b, #0 // accumulates the "straight" condition
+ movi v1.16b, #255 // accumulates alpha == alpha_max
+ dup v2.16b, w6 // alpha_max
+ ands x7, x4, #15 // width % 16 (flags are consumed by the b.eq below)
+ bic x8, x4, #15 // width / 16 * 16
+ dup v18.8h, w13 // offset
+ sub x1, x1, x8 // color_stride - aligned_width
+ sub x3, x3, x8 // alpha_stride - aligned_width
+ b.eq 1f
+
+ // Create mask for non-aligned width
+ load_mask
+1: // start of a row
+ cbz x8, 20f // width < 16
+ mov x12, x8 // x12: aligned_width
+2:
+ ldr q5, [x0], #16 // color
+ ldr q6, [x2], #16 // alpha
+ umull v19.8h, v2.8b, v5.8b // alpha_max * color
+ umull2 v20.8h, v2.16b, v5.16b // alpha_max * color
+ umull v21.8h, v17.8b, v6.8b // range * alpha
+ umull2 v22.8h, v17.16b, v6.16b // range * alpha
+ cmeq v16.16b, v6.16b, v2.16b // alpha == alpha_max
+ subs x12, x12, #16
+ uqsub v19.8h, v19.8h, v18.8h // alpha_max * color -
offset
+ uqsub v20.8h, v20.8h, v18.8h // alpha_max * color -
offset
+
+ cmhi v19.8h, v19.8h, v21.8h
+ cmhi v20.8h, v20.8h, v22.8h
+ orr v7.16b, v19.16b, v20.16b // lane positions do not matter here, only non-zero
+ orr v0.16b, v0.16b, v7.16b
+ and v1.16b, v1.16b, v16.16b
+ b.gt 2b
+20:
+ cbz w7, 3f
+ // handle loop tail
+ ldr q5, [x0] // full 16-byte loads; only the first x7 bytes belong to the row
+ ldr q6, [x2]
+ umull v19.8h, v2.8b, v5.8b // alpha_max * color
+ umull2 v20.8h, v2.16b, v5.16b // alpha_max * color
+ umull v21.8h, v17.8b, v6.8b // range * alpha
+ umull2 v22.8h, v17.16b, v6.16b // range * alpha
+ uqsub v19.8h, v19.8h, v18.8h // alpha_max * color -
offset
+ uqsub v20.8h, v20.8h, v18.8h // alpha_max * color -
offset
+
+ cmhi v19.8h, v19.8h, v21.8h
+ cmhi v20.8h, v20.8h, v22.8h
+ uqxtn v7.8b, v19.8h // narrow back to one byte per sample so the byte mask lines up
+ uqxtn2 v7.16b, v20.8h
+ cmeq v16.16b, v6.16b, v2.16b
+
+ and v7.16b, v7.16b, v3.16b // ignore results past the end of the row
+ orr v16.16b, v16.16b, v4.16b // count lanes past the end as alpha == alpha_max
+ orr v0.16b, v0.16b, v7.16b
+ and v1.16b, v1.16b, v16.16b
+3:
+ umaxv b23, v0.16b // non-zero if the condition held for any sample
+ subs x5, x5, #1 // one row done; flags are consumed by b.gt
+ umov w9, v23.b[0]
+ add x0, x0, x1 // next color row
+ add x2, x2, x3 // next alpha row
+ cbnz w9, 4f
+ b.gt 1b // more rows left (cbnz does not modify the flags)
+
+ uminv b1, v1.16b // non-zero only if every alpha lane matched alpha_max
+ umov w9, v1.b[0]
+ mov x0, #0
+ cbnz w9, 5f // all alpha == alpha_max: return 0
+ mov x0, #FF_ALPHA_TRANSPARENT
+ ret
+4:
+ mov x0, #FF_ALPHA_STRAIGHT
+5:
+ ret
+endfunc
+
+/*
+ * Limited-range alpha check on 16-bit samples; width is counted in samples
+ * (two bytes each). Per sample the test is
+ *   alpha_max * color - offset > mpeg_range * alpha
+ * evaluated on 32-bit products, with the subtraction saturating at 0.
+ * Returns FF_ALPHA_STRAIGHT after the first row in which the test is true
+ * for some sample. Otherwise returns 0 when every alpha sample equals
+ * alpha_max, and FF_ALPHA_TRANSPARENT when one does not.
+ *
+ * x0: const uint8_t *color,
+ * x1: ptrdiff_t color_stride,
+ * x2: const uint8_t *alpha,
+ * x3: ptrdiff_t alpha_stride,
+ * x4: ptrdiff_t width,
+ * x5: ptrdiff_t height,
+ * w6: int alpha_max,
+ * w7: int mpeg_range
+ * [sp]: int offset
+ */
+function ff_detect_alpha16_limited_neon, export=1
+ dup v17.8h, w7 // mpeg_range (read first: x7 is reused below)
+ ldr w13, [sp] // offset, passed on the stack
+ movi v0.8h, #0 // accumulates the "straight" condition
+ movi v1.16b, #255 // accumulates alpha == alpha_max
+ dup v2.8h, w6 // alpha_max
+ ands x7, x4, #7 // width % 8 (flags are consumed by the b.eq below)
+ bic x8, x4, #7 // width / 8 * 8
+ dup v18.4s, w13 // offset
+ sub x1, x1, x8, lsl #1 // color_stride -
(aligned_width * 2)
+ sub x3, x3, x8, lsl #1 // alpha_stride -
(aligned_width * 2)
+ b.eq 1f
+
+ // Create mask for non-aligned width
+ load_mask shift=1
+1: // start of a row
+ cbz x8, 20f // width < 8
+ mov x12, x8 // x12: aligned_width
+2:
+ ldr q5, [x0], #16 // color
+ ldr q6, [x2], #16 // alpha
+ umull v19.4s, v2.4h, v5.4h // alpha_max * color
+ umull2 v20.4s, v2.8h, v5.8h // alpha_max * color
+ umull v21.4s, v17.4h, v6.4h // range * alpha
+ umull2 v22.4s, v17.8h, v6.8h // range * alpha
+ cmeq v16.8h, v6.8h, v2.8h // alpha == alpha_max
+ subs x12, x12, #8
+ uqsub v19.4s, v19.4s, v18.4s // alpha_max * color -
offset
+ uqsub v20.4s, v20.4s, v18.4s // alpha_max * color -
offset
+
+ cmhi v19.4s, v19.4s, v21.4s
+ cmhi v20.4s, v20.4s, v22.4s
+ orr v7.16b, v19.16b, v20.16b // lane positions do not matter here, only non-zero
+ orr v0.16b, v0.16b, v7.16b
+ and v1.16b, v1.16b, v16.16b
+ b.gt 2b
+20:
+ cbz w7, 3f
+ // handle loop tail
+ ldr q5, [x0] // full 16-byte loads; only the first x7 samples belong to the row
+ ldr q6, [x2]
+ umull v19.4s, v2.4h, v5.4h // alpha_max * color
+ umull2 v20.4s, v2.8h, v5.8h // alpha_max * color
+ umull v21.4s, v17.4h, v6.4h // range * alpha
+ umull2 v22.4s, v17.8h, v6.8h // range * alpha
+ uqsub v19.4s, v19.4s, v18.4s // alpha_max * color -
offset
+ uqsub v20.4s, v20.4s, v18.4s // alpha_max * color -
offset
+
+ cmhi v19.4s, v19.4s, v21.4s
+ cmhi v20.4s, v20.4s, v22.4s
+ uqxtn v7.4h, v19.4s // narrow back to one halfword per sample so the mask lines up
+ uqxtn2 v7.8h, v20.4s
+ cmeq v16.8h, v6.8h, v2.8h
+
+ and v7.16b, v7.16b, v3.16b // ignore results past the end of the row
+ orr v16.16b, v16.16b, v4.16b // count lanes past the end as alpha == alpha_max
+ orr v0.16b, v0.16b, v7.16b
+ and v1.16b, v1.16b, v16.16b
+3:
+ umaxv s23, v0.4s // non-zero if the condition held for any sample
+ subs x5, x5, #1 // one row done; flags are consumed by b.gt
+ umov w9, v23.s[0]
+ add x0, x0, x1 // next color row
+ add x2, x2, x3 // next alpha row
+ cbnz w9, 4f
+ b.gt 1b // more rows left (cbnz does not modify the flags)
+
+ uminv h1, v1.8h // non-zero only if every alpha lane matched alpha_max
+ umov w9, v1.h[0]
+ mov x0, #0
+ cbnz w9, 5f // all alpha == alpha_max: return 0
+ mov x0, #FF_ALPHA_TRANSPARENT
+ ret
+4:
+ mov x0, #FF_ALPHA_STRAIGHT
+5:
+ ret
+endfunc
diff --git a/libavfilter/vf_colordetect.c b/libavfilter/vf_colordetect.c
index 88374ac3e2..ef7fb25130 100644
--- a/libavfilter/vf_colordetect.c
+++ b/libavfilter/vf_colordetect.c
@@ -236,7 +236,9 @@ av_cold void
ff_color_detect_dsp_init(FFColorDetectDSPContext *dsp, int depth,
dsp->detect_alpha = depth > 8 ? ff_detect_alpha16_limited_c :
ff_detect_alpha_limited_c;
}
-#if ARCH_X86
+#if ARCH_AARCH64
+ ff_color_detect_dsp_init_aarch64(dsp, depth, color_range);
+#elif ARCH_X86
ff_color_detect_dsp_init_x86(dsp, depth, color_range);
#endif
}
diff --git a/libavfilter/vf_colordetect.h b/libavfilter/vf_colordetect.h
index 0c0dc889dc..95f30a6ac2 100644
--- a/libavfilter/vf_colordetect.h
+++ b/libavfilter/vf_colordetect.h
@@ -50,6 +50,8 @@ typedef struct FFColorDetectDSPContext {
void ff_color_detect_dsp_init(FFColorDetectDSPContext *dsp, int depth,
enum AVColorRange color_range);
+void ff_color_detect_dsp_init_aarch64(FFColorDetectDSPContext *dsp, int depth,
+ enum AVColorRange color_range);
void ff_color_detect_dsp_init_x86(FFColorDetectDSPContext *dsp, int depth,
enum AVColorRange color_range);
commit 6450e01446f5c8c48e4cd2fd43c805ba2991d9ba
Author: Zhao Zhili <[email protected]>
AuthorDate: Thu Aug 21 16:40:09 2025 +0800
Commit: Zhao Zhili <[email protected]>
CommitDate: Mon Sep 1 15:35:16 2025 +0000
checkasm/vf_colordetect: test non-aligned width
diff --git a/tests/checkasm/vf_colordetect.c b/tests/checkasm/vf_colordetect.c
index 9a16de7392..18472e9b66 100644
--- a/tests/checkasm/vf_colordetect.c
+++ b/tests/checkasm/vf_colordetect.c
@@ -22,9 +22,9 @@
#include "libavfilter/vf_colordetect.h"
#include "libavutil/mem_internal.h"
-#define WIDTH 256
+#define WIDTH 540
#define HEIGHT 16
-#define STRIDE (WIDTH + 32)
+#define STRIDE FFALIGN(WIDTH, 32)
static void check_range_detect(int depth)
{
-----------------------------------------------------------------------
Summary of changes:
libavfilter/aarch64/Makefile | 2 +
libavfilter/aarch64/vf_colordetect_init.c | 64 ++++
libavfilter/aarch64/vf_colordetect_neon.S | 480 ++++++++++++++++++++++++++++++
libavfilter/vf_colordetect.c | 4 +-
libavfilter/vf_colordetect.h | 2 +
tests/checkasm/vf_colordetect.c | 4 +-
6 files changed, 553 insertions(+), 3 deletions(-)
create mode 100644 libavfilter/aarch64/vf_colordetect_init.c
create mode 100644 libavfilter/aarch64/vf_colordetect_neon.S
hooks/post-receive
--
_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]