[FFmpeg-cvslog] [ffmpeg] branch master updated. eb14d45824 avfilter/vf_colordetect: add aarch64 asm

ffmpeg-git Mon, 01 Sep 2025 08:36:18 -0700

The branch, master has been updated
       via  eb14d4582447aa6560a47e5d8c6f142d76540766 (commit)
       via  6450e01446f5c8c48e4cd2fd43c805ba2991d9ba (commit)
      from  f07c12d806b9a6f3ca05870a99d10a818a3aabc1 (commit)



- Log -----------------------------------------------------------------
commit eb14d4582447aa6560a47e5d8c6f142d76540766
Author:     Zhao Zhili <[email protected]>
AuthorDate: Thu Aug 21 20:44:37 2025 +0800
Commit:     Zhao Zhili <[email protected]>
CommitDate: Mon Sep 1 15:35:16 2025 +0000

    avfilter/vf_colordetect: add aarch64 asm
    
                           | rpi5 gcc 12  | m1 clang -fno-vectorize | m1 clang
    ---------------------------------------------------------------------------
    alpha_8_full_c:        | 32159.2 ( 1.00x) | 135.8 ( 1.00x) |  26.4 ( 1.00x)
    alpha_8_full_neon:     |  1266.0 (25.40x) |   8.0 (17.03x) |   8.4 ( 3.15x)
    alpha_8_limited_c:     | 37561.9 ( 1.00x) | 169.1 ( 1.00x) |  47.7 ( 1.00x)
    alpha_8_limited_neon:  |  3967.0 ( 9.47x) |  12.5 (13.53x) |  13.3 ( 3.59x)
    alpha_16_full_c:       | 15867.9 ( 1.00x) |  64.5 ( 1.00x) |  13.7 ( 1.00x)
    alpha_16_full_neon:    |  1256.9 (12.62x) |   7.9 ( 8.15x) |   8.3 ( 1.64x)
    alpha_16_limited_c:    | 16723.7 ( 1.00x) |  88.7 ( 1.00x) | 103.3 ( 1.00x)
    alpha_16_limited_neon: |  4031.3 ( 4.15x) |  12.5 ( 7.08x) |  13.2 ( 7.86x)
    range_8_c:             | 21819.7 ( 1.00x) | 120.0 ( 1.00x) |   9.4 ( 1.00x)
    range_8_neon:          |  1148.3 (19.00x) |   4.3 (27.60x) |   4.8 ( 1.97x)
    range_16_c:            | 10757.1 ( 1.00x) |  45.7 ( 1.00x) |   7.9 ( 1.00x)
    range_16_neon:         |  1141.5 ( 9.42x) |   4.4 (10.38x) |   4.6 ( 1.72x)

diff --git a/libavfilter/aarch64/Makefile b/libavfilter/aarch64/Makefile
index b68209bc94..c7b7e18467 100644
--- a/libavfilter/aarch64/Makefile
+++ b/libavfilter/aarch64/Makefile
@@ -1,5 +1,7 @@
 OBJS-$(CONFIG_BWDIF_FILTER)                  += aarch64/vf_bwdif_init_aarch64.o
+OBJS-$(CONFIG_COLORDETECT_FILTER)            += aarch64/vf_colordetect_init.o
 OBJS-$(CONFIG_NLMEANS_FILTER)                += aarch64/vf_nlmeans_init.o
 
 NEON-OBJS-$(CONFIG_BWDIF_FILTER)             += aarch64/vf_bwdif_neon.o
+NEON-OBJS-$(CONFIG_COLORDETECT_FILTER)       += aarch64/vf_colordetect_neon.o
 NEON-OBJS-$(CONFIG_NLMEANS_FILTER)           += aarch64/vf_nlmeans_neon.o
diff --git a/libavfilter/aarch64/vf_colordetect_init.c b/libavfilter/aarch64/vf_colordetect_init.c
new file mode 100644
index 0000000000..4db6b90542
--- /dev/null
+++ b/libavfilter/aarch64/vf_colordetect_init.c
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2025 Zhao Zhili <[email protected]>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/cpu.h"
+#include "libavfilter/vf_colordetect.h"
+
+int ff_detect_alpha_full_neon(const uint8_t *color, ptrdiff_t color_stride,
+                              const uint8_t *alpha, ptrdiff_t alpha_stride,
+                              ptrdiff_t width, ptrdiff_t height,
+                              int alpha_max, int mpeg_range, int offset);
+
+int ff_detect_alpha16_full_neon(const uint8_t *color, ptrdiff_t color_stride,
+                                const uint8_t *alpha, ptrdiff_t alpha_stride,
+                                ptrdiff_t width, ptrdiff_t height,
+                                int alpha_max, int mpeg_range, int offset);
+
+int ff_detect_alpha_limited_neon(const uint8_t *color, ptrdiff_t color_stride,
+                                 const uint8_t *alpha, ptrdiff_t alpha_stride,
+                                 ptrdiff_t width, ptrdiff_t height,
+                                 int alpha_max, int mpeg_range, int offset);
+
+int ff_detect_alpha16_limited_neon(const uint8_t *color, ptrdiff_t color_stride,
+                                   const uint8_t *alpha, ptrdiff_t alpha_stride,
+                                   ptrdiff_t width, ptrdiff_t height,
+                                   int alpha_max, int mpeg_range, int offset);
+
+int ff_detect_range_neon(const uint8_t *data, ptrdiff_t stride,
+                         ptrdiff_t width, ptrdiff_t height,
+                         int mpeg_min, int mpeg_max);
+
+int ff_detect_range16_neon(const uint8_t *data, ptrdiff_t stride,
+                           ptrdiff_t width, ptrdiff_t height,
+                           int mpeg_min, int mpeg_max);
+
+av_cold void ff_color_detect_dsp_init_aarch64(FFColorDetectDSPContext *dsp, int depth,
+                                          enum AVColorRange color_range)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (have_neon(cpu_flags)) {
+        dsp->detect_range = depth > 8 ? ff_detect_range16_neon : ff_detect_range_neon;
+        if (color_range == AVCOL_RANGE_JPEG)
+            dsp->detect_alpha = depth > 8 ? ff_detect_alpha16_full_neon : ff_detect_alpha_full_neon;
+        else
+            dsp->detect_alpha = depth > 8 ? ff_detect_alpha16_limited_neon : ff_detect_alpha_limited_neon;
+    }
+}
diff --git a/libavfilter/aarch64/vf_colordetect_neon.S b/libavfilter/aarch64/vf_colordetect_neon.S
new file mode 100644
index 0000000000..f3cca16fed
--- /dev/null
+++ b/libavfilter/aarch64/vf_colordetect_neon.S
@@ -0,0 +1,480 @@
+/*
+ * Copyright (c) 2025 Zhao Zhili <[email protected]>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+#define FF_ALPHA_TRANSPARENT        (1 << 0)
+#define FF_ALPHA_STRAIGHT           ((1 << 1) | FF_ALPHA_TRANSPARENT)
+
+const mask
+        .byte           255, 255, 255, 255, 255, 255, 255, 255
+        .byte           255, 255, 255, 255, 255, 255, 255, 255
+mask_start:
+        .byte           0, 0, 0, 0, 0, 0, 0, 0
+        .byte           0, 0, 0, 0, 0, 0, 0, 0
+        .byte           255, 255, 255, 255, 255, 255, 255, 255
+        .byte           255, 255, 255, 255, 255, 255, 255, 255
+endconst
+
+.macro load_mask_zero, shift=0
+        movrel          x9, mask_start
+        sub             x9, x9, x7, lsl #(\shift)
+        ldr             q3, [x9]
+.endm
+
+.macro load_mask, shift=0
+        movrel          x9, mask_start
+        sub             x9, x9, x7, lsl #(\shift)
+        ld1             {v3.16b, v4.16b}, [x9]
+.endm
+
+/* x0: const uint8_t *data
+ * x1: ptrdiff_t stride
+ * x2: ptrdiff_t width
+ * x3: ptrdiff_t height
+ * w4: int mpeg_min
+ * w5: int mpeg_max
+ */
+function ff_detect_range_neon, export=1
+        ands            x7, x2, #15                 // width % 16
+        bic             x8, x2, #15                 // width / 16 * 16
+        bic             x6, x2, #31                 // width / 32 * 32
+        and             x10, x2, #16                // check x8 != x6
+        dup             v0.16b, w4                  // mpeg_min
+        dup             v1.16b, w5                  // mpeg_max
+        movi            v2.16b, #0                  // cond
+        sub             x1, x1, x8
+        b.eq            1f
+        load_mask_zero
+1:
+        cbz             x6, 20f                     // width < 32
+        mov             x12, x6
+2:
+        ld1             {v5.16b, v6.16b}, [x0], #32
+        cmhi            v16.16b, v0.16b, v5.16b
+        cmhi            v17.16b, v5.16b, v1.16b
+        cmhi            v18.16b, v0.16b, v6.16b
+        cmhi            v19.16b, v6.16b, v1.16b
+        orr             v20.16b, v16.16b, v17.16b
+        orr             v21.16b, v18.16b, v19.16b
+        subs            x12, x12, #32
+        orr             v20.16b, v20.16b, v21.16b
+        orr             v2.16b, v2.16b, v20.16b
+        b.gt            2b
+20:
+        cbz             x10, 3f                     // remaining width < 16
+        ldr             q20, [x0], #16
+        cmhi            v16.16b, v0.16b, v20.16b
+        cmhi            v17.16b, v20.16b, v1.16b
+        orr             v16.16b, v16.16b, v17.16b
+        orr             v2.16b, v2.16b, v16.16b
+3:
+        cbz             x7, 4f
+        ldr             q21, [x0]
+        cmhi            v18.16b, v0.16b, v21.16b
+        cmhi            v19.16b, v21.16b, v1.16b
+        orr             v16.16b, v18.16b, v19.16b
+        and             v16.16b, v16.16b, v3.16b
+        orr             v2.16b, v2.16b, v16.16b
+4:
+        umaxv           b4, v2.16b
+        subs            x3, x3, #1
+        umov            w9, v4.b[0]
+        add             x0, x0, x1
+        cbnz            w9, 8f
+        b.gt            1b
+        mov             x0, #0
+        ret
+8:
+        mov             x0, #1
+        ret
+endfunc
+
+/* x0: const uint8_t *data
+ * x1: ptrdiff_t stride
+ * x2: ptrdiff_t width
+ * x3: ptrdiff_t height
+ * w4: int mpeg_min
+ * w5: int mpeg_max
+ */
+function ff_detect_range16_neon, export=1
+        ands            x7, x2, #7                  // width % 8
+        bic             x8, x2, #7                  // width / 8 * 8
+        bic             x6, x2, #15                 // width / 16 * 16
+        and             x10, x2, #8                 // check x8 != x6
+        dup             v0.8h, w4                   // mpeg_min
+        dup             v1.8h, w5                   // mpeg_max
+        movi            v2.16b, #0                  // cond
+        sub             x1, x1, x8, lsl #1
+        b.eq            1f
+        load_mask_zero  shift=1
+1:
+        cbz             x6, 20f                     // width < 16
+        mov             x12, x6
+2:
+        ld1             {v5.8h, v6.8h}, [x0], #32
+        cmhi            v16.8h, v0.8h, v5.8h
+        cmhi            v17.8h, v5.8h, v1.8h
+        cmhi            v18.8h, v0.8h, v6.8h
+        cmhi            v19.8h, v6.8h, v1.8h
+        orr             v20.16b, v16.16b, v17.16b
+        orr             v21.16b, v18.16b, v19.16b
+        subs            x12, x12, #16
+        orr             v20.16b, v20.16b, v21.16b
+        orr             v2.16b, v2.16b, v20.16b
+        b.gt            2b
+20:
+        cbz             x10, 3f                     // remaining width < 8
+        ldr             q20, [x0], #16
+        cmhi            v16.8h, v0.8h, v20.8h
+        cmhi            v17.8h, v20.8h, v1.8h
+        orr             v16.16b, v16.16b, v17.16b
+        orr             v2.16b, v2.16b, v16.16b
+3:
+        cbz             x7, 4f
+        ldr             q21, [x0]
+        cmhi            v18.8h, v0.8h, v21.8h
+        cmhi            v19.8h, v21.8h, v1.8h
+        orr             v16.16b, v18.16b, v19.16b
+        and             v16.16b, v16.16b, v3.16b
+        orr             v2.16b, v2.16b, v16.16b
+4:
+        umaxv           h4, v2.8h
+        subs            x3, x3, #1
+        umov            w9, v4.h[0]
+        add             x0, x0, x1
+        cbnz            w9, 8f
+        b.gt            1b
+        mov             x0, #0
+        ret
+8:
+        mov             x0, #1
+        ret
+endfunc
+
+/*
+ * x0: const uint8_t *color,
+ * x1: ptrdiff_t color_stride,
+ * x2: const uint8_t *alpha,
+ * x3: ptrdiff_t alpha_stride,
+ * x4: ptrdiff_t width,
+ * x5: ptrdiff_t height,
+ * w6: int alpha_max,
+ */
+function ff_detect_alpha_full_neon, export=1
+        ands            x7, x4, #15             // width % 16
+        bic             x8, x4, #15             // width / 16 * 16
+        movi            v0.16b, #0
+        movi            v1.16b, #255
+        dup             v2.16b, w6              // alpha_max
+        sub             x1, x1, x8              // color_stride - aligned_width
+        sub             x3, x3, x8              // alpha_stride - aligned_width
+        b.eq            1f
+
+        // Create mask for non-aligned width
+        load_mask
+1:
+        cbz             x8, 20f                 // width < 16
+        mov             x12, x8                 // x12: aligned_width
+2:
+        ldr             q5, [x0], #16
+        ldr             q6, [x2], #16
+        subs            x12, x12, #16
+        cmhi            v7.16b, v5.16b, v6.16b
+        cmeq            v16.16b, v6.16b, v2.16b
+        orr             v0.16b, v0.16b, v7.16b
+        and             v1.16b, v1.16b, v16.16b
+        b.gt            2b
+20:
+        cbz             w7, 3f
+        // handle loop tail
+        ldr             q5, [x0]
+        ldr             q6, [x2]
+        cmhi            v7.16b, v5.16b, v6.16b
+        cmeq            v16.16b, v6.16b, v2.16b
+        and             v7.16b, v7.16b, v3.16b
+        orr             v16.16b, v16.16b, v4.16b
+        orr             v0.16b, v0.16b, v7.16b
+        and             v1.16b, v1.16b, v16.16b
+3:
+        umaxv           b17, v0.16b
+        subs            x5, x5, #1
+        umov            w9, v17.b[0]
+        add             x0, x0, x1
+        add             x2, x2, x3
+        cbnz            w9, 4f
+        b.gt            1b
+
+        uminv           b1, v1.16b
+        umov            w9, v1.b[0]
+        mov             x0, #0
+        cbnz            w9, 5f
+        mov             x0, #FF_ALPHA_TRANSPARENT
+        ret
+4:
+        mov             x0, #FF_ALPHA_STRAIGHT
+5:
+        ret
+endfunc
+
+/*
+ * x0: const uint8_t *color,
+ * x1: ptrdiff_t color_stride,
+ * x2: const uint8_t *alpha,
+ * x3: ptrdiff_t alpha_stride,
+ * x4: ptrdiff_t width,
+ * x5: ptrdiff_t height,
+ * w6: int alpha_max,
+ */
+function ff_detect_alpha16_full_neon, export=1
+        ands            x7, x4, #7              // width % 8
+        bic             x8, x4, #7              // width / 8 * 8
+        movi            v0.8h, #0
+        movi            v1.16b, #255
+        dup             v2.8h, w6               // alpha_max
+        sub             x1, x1, x8, lsl #1      // color_stride - (aligned_width * 2)
+        sub             x3, x3, x8, lsl #1      // alpha_stride - (aligned_width * 2)
+        b.eq            1f
+
+        // Create mask for non-aligned width
+        load_mask       shift=1
+1:
+        cbz             x8, 20f                 // width < 8
+        mov             x12, x8                 // x12: aligned_width
+2:
+        ldr             q5, [x0], #16
+        ldr             q6, [x2], #16
+        subs            x12, x12, #8
+        cmhi            v7.8h, v5.8h, v6.8h
+        cmeq            v16.8h, v6.8h, v2.8h
+        orr             v0.16b, v0.16b, v7.16b
+        and             v1.16b, v1.16b, v16.16b
+        b.gt            2b
+20:
+        cbz             w7, 3f
+        // handle loop tail
+        ldr             q5, [x0]
+        ldr             q6, [x2]
+        cmhi            v7.8h, v5.8h, v6.8h
+        cmeq            v16.8h, v6.8h, v2.8h
+        and             v7.16b, v7.16b, v3.16b
+        orr             v16.16b, v16.16b, v4.16b
+        orr             v0.16b, v0.16b, v7.16b
+        and             v1.16b, v1.16b, v16.16b
+3:
+        umaxv           h17, v0.8h
+        subs            x5, x5, #1
+        umov            w9, v17.h[0]
+        add             x0, x0, x1
+        add             x2, x2, x3
+        cbnz            w9, 4f
+        b.gt            1b
+
+        uminv           h1, v1.8h
+        umov            w9, v1.h[0]
+        mov             x0, #0
+        cbnz            w9, 5f
+        mov             x0, #FF_ALPHA_TRANSPARENT
+        ret
+4:
+        mov             x0, #FF_ALPHA_STRAIGHT
+5:
+        ret
+endfunc
+
+/*
+ * x0: const uint8_t *color,
+ * x1: ptrdiff_t color_stride,
+ * x2: const uint8_t *alpha,
+ * x3: ptrdiff_t alpha_stride,
+ * x4: ptrdiff_t width,
+ * x5: ptrdiff_t height,
+ * w6: int alpha_max,
+ * w7: int mpeg_range
+ * [sp]: int offset
+ */
+function ff_detect_alpha_limited_neon, export=1
+        dup             v17.16b, w7             // mpeg_range
+        ldr             w13, [sp]
+        movi            v0.16b, #0
+        movi            v1.16b, #255
+        dup             v2.16b, w6              // alpha_max
+        ands            x7, x4, #15             // width % 16
+        bic             x8, x4, #15             // width / 16 * 16
+        dup             v18.8h, w13             // offset
+        sub             x1, x1, x8              // color_stride - aligned_width
+        sub             x3, x3, x8              // alpha_stride - aligned_width
+        b.eq            1f
+
+        // Create mask for non-aligned width
+        load_mask
+1:
+        cbz             x8, 20f                     // width < 16
+        mov             x12, x8                     // x12: aligned_width
+2:
+        ldr             q5, [x0], #16               // color
+        ldr             q6, [x2], #16               // alpha
+        umull           v19.8h, v2.8b, v5.8b        // alpha_max * color
+        umull2          v20.8h, v2.16b, v5.16b      // alpha_max * color
+        umull           v21.8h, v17.8b, v6.8b       // range * alpha
+        umull2          v22.8h, v17.16b, v6.16b     // range * alpha
+        cmeq            v16.16b, v6.16b, v2.16b
+        subs            x12, x12, #16
+        uqsub           v19.8h, v19.8h, v18.8h      // alpha_max * color - offset
+        uqsub           v20.8h, v20.8h, v18.8h      // alpha_max * color - offset
+
+        cmhi            v19.8h, v19.8h, v21.8h
+        cmhi            v20.8h, v20.8h, v22.8h
+        orr             v7.16b, v19.16b, v20.16b
+        orr             v0.16b, v0.16b, v7.16b
+        and             v1.16b, v1.16b, v16.16b
+        b.gt            2b
+20:
+        cbz             w7, 3f
+        // handle loop tail
+        ldr             q5, [x0]
+        ldr             q6, [x2]
+        umull           v19.8h, v2.8b, v5.8b        // alpha_max * color
+        umull2          v20.8h, v2.16b, v5.16b      // alpha_max * color
+        umull           v21.8h, v17.8b, v6.8b       // range * alpha
+        umull2          v22.8h, v17.16b, v6.16b     // range * alpha
+        uqsub           v19.8h, v19.8h, v18.8h      // alpha_max * color - offset
+        uqsub           v20.8h, v20.8h, v18.8h      // alpha_max * color - offset
+
+        cmhi            v19.8h, v19.8h, v21.8h
+        cmhi            v20.8h, v20.8h, v22.8h
+        uqxtn           v7.8b, v19.8h
+        uqxtn2          v7.16b, v20.8h
+        cmeq            v16.16b, v6.16b, v2.16b
+
+        and             v7.16b, v7.16b, v3.16b
+        orr             v16.16b, v16.16b, v4.16b
+        orr             v0.16b, v0.16b, v7.16b
+        and             v1.16b, v1.16b, v16.16b
+3:
+        umaxv           b23, v0.16b
+        subs            x5, x5, #1
+        umov            w9, v23.b[0]
+        add             x0, x0, x1
+        add             x2, x2, x3
+        cbnz            w9, 4f
+        b.gt            1b
+
+        uminv           b1, v1.16b
+        umov            w9, v1.b[0]
+        mov             x0, #0
+        cbnz            w9, 5f
+        mov             x0, #FF_ALPHA_TRANSPARENT
+        ret
+4:
+        mov             x0, #FF_ALPHA_STRAIGHT
+5:
+        ret
+endfunc
+
+/*
+ * x0: const uint8_t *color,
+ * x1: ptrdiff_t color_stride,
+ * x2: const uint8_t *alpha,
+ * x3: ptrdiff_t alpha_stride,
+ * x4: ptrdiff_t width,
+ * x5: ptrdiff_t height,
+ * w6: int alpha_max,
+ * w7: int mpeg_range
+ * [sp]: int offset
+ */
+function ff_detect_alpha16_limited_neon, export=1
+        dup             v17.8h, w7                  // mpeg_range
+        ldr             w13, [sp]
+        movi            v0.8h, #0
+        movi            v1.16b, #255
+        dup             v2.8h, w6                   // alpha_max
+        ands            x7, x4, #7                  // width % 8
+        bic             x8, x4, #7                  // width / 8 * 8
+        dup             v18.4s, w13                 // offset
+        sub             x1, x1, x8, lsl #1          // color_stride - (aligned_width * 2)
+        sub             x3, x3, x8, lsl #1          // alpha_stride - (aligned_width * 2)
+        b.eq            1f
+
+        // Create mask for non-aligned width
+        load_mask       shift=1
+1:
+        cbz             x8, 20f                     // width < 8
+        mov             x12, x8                     // x12: aligned_width
+2:
+        ldr             q5, [x0], #16
+        ldr             q6, [x2], #16
+        umull           v19.4s, v2.4h, v5.4h        // alpha_max * color
+        umull2          v20.4s, v2.8h, v5.8h        // alpha_max * color
+        umull           v21.4s, v17.4h, v6.4h       // range * alpha
+        umull2          v22.4s, v17.8h, v6.8h       // range * alpha
+        cmeq            v16.8h, v6.8h, v2.8h
+        subs            x12, x12, #8
+        uqsub           v19.4s, v19.4s, v18.4s      // alpha_max * color - offset
+        uqsub           v20.4s, v20.4s, v18.4s      // alpha_max * color - offset
+
+        cmhi            v19.4s, v19.4s, v21.4s
+        cmhi            v20.4s, v20.4s, v22.4s
+        orr             v7.16b, v19.16b, v20.16b
+        orr             v0.16b, v0.16b, v7.16b
+        and             v1.16b, v1.16b, v16.16b
+        b.gt            2b
+20:
+        cbz             w7, 3f
+        // handle loop tail
+        ldr             q5, [x0]
+        ldr             q6, [x2]
+        umull           v19.4s, v2.4h, v5.4h        // alpha_max * color
+        umull2          v20.4s, v2.8h, v5.8h        // alpha_max * color
+        umull           v21.4s, v17.4h, v6.4h       // range * alpha
+        umull2          v22.4s, v17.8h, v6.8h       // range * alpha
+        uqsub           v19.4s, v19.4s, v18.4s      // alpha_max * color - offset
+        uqsub           v20.4s, v20.4s, v18.4s      // alpha_max * color - offset
+
+        cmhi            v19.4s, v19.4s, v21.4s
+        cmhi            v20.4s, v20.4s, v22.4s
+        uqxtn           v7.4h, v19.4s
+        uqxtn2          v7.8h, v20.4s
+        cmeq            v16.8h, v6.8h, v2.8h
+
+        and             v7.16b, v7.16b, v3.16b
+        orr             v16.16b, v16.16b, v4.16b
+        orr             v0.16b, v0.16b, v7.16b
+        and             v1.16b, v1.16b, v16.16b
+3:
+        umaxv           s23, v0.4s
+        subs            x5, x5, #1
+        umov            w9, v23.s[0]
+        add             x0, x0, x1
+        add             x2, x2, x3
+        cbnz            w9, 4f
+        b.gt            1b
+
+        uminv           h1, v1.8h
+        umov            w9, v1.h[0]
+        mov             x0, #0
+        cbnz            w9, 5f
+        mov             x0, #FF_ALPHA_TRANSPARENT
+        ret
+4:
+        mov             x0, #FF_ALPHA_STRAIGHT
+5:
+        ret
+endfunc
diff --git a/libavfilter/vf_colordetect.c b/libavfilter/vf_colordetect.c
index 88374ac3e2..ef7fb25130 100644
--- a/libavfilter/vf_colordetect.c
+++ b/libavfilter/vf_colordetect.c
@@ -236,7 +236,9 @@ av_cold void ff_color_detect_dsp_init(FFColorDetectDSPContext *dsp, int depth,
         dsp->detect_alpha = depth > 8 ? ff_detect_alpha16_limited_c : ff_detect_alpha_limited_c;
     }
 
-#if ARCH_X86
+#if ARCH_AARCH64
+    ff_color_detect_dsp_init_aarch64(dsp, depth, color_range);
+#elif ARCH_X86
     ff_color_detect_dsp_init_x86(dsp, depth, color_range);
 #endif
 }
diff --git a/libavfilter/vf_colordetect.h b/libavfilter/vf_colordetect.h
index 0c0dc889dc..95f30a6ac2 100644
--- a/libavfilter/vf_colordetect.h
+++ b/libavfilter/vf_colordetect.h
@@ -50,6 +50,8 @@ typedef struct FFColorDetectDSPContext {
 void ff_color_detect_dsp_init(FFColorDetectDSPContext *dsp, int depth,
                               enum AVColorRange color_range);
 
+void ff_color_detect_dsp_init_aarch64(FFColorDetectDSPContext *dsp, int depth,
+                                      enum AVColorRange color_range);
 void ff_color_detect_dsp_init_x86(FFColorDetectDSPContext *dsp, int depth,
                                   enum AVColorRange color_range);
 

commit 6450e01446f5c8c48e4cd2fd43c805ba2991d9ba
Author:     Zhao Zhili <[email protected]>
AuthorDate: Thu Aug 21 16:40:09 2025 +0800
Commit:     Zhao Zhili <[email protected]>
CommitDate: Mon Sep 1 15:35:16 2025 +0000

    checkasm/vf_colordetect: test non-aligned width

diff --git a/tests/checkasm/vf_colordetect.c b/tests/checkasm/vf_colordetect.c
index 9a16de7392..18472e9b66 100644
--- a/tests/checkasm/vf_colordetect.c
+++ b/tests/checkasm/vf_colordetect.c
@@ -22,9 +22,9 @@
 #include "libavfilter/vf_colordetect.h"
 #include "libavutil/mem_internal.h"
 
-#define WIDTH  256
+#define WIDTH  540
 #define HEIGHT 16
-#define STRIDE (WIDTH + 32)
+#define STRIDE FFALIGN(WIDTH, 32)
 
 static void check_range_detect(int depth)
 {

-----------------------------------------------------------------------

Summary of changes:
 libavfilter/aarch64/Makefile              |   2 +
 libavfilter/aarch64/vf_colordetect_init.c |  64 ++++
 libavfilter/aarch64/vf_colordetect_neon.S | 480 ++++++++++++++++++++++++++++++
 libavfilter/vf_colordetect.c              |   4 +-
 libavfilter/vf_colordetect.h              |   2 +
 tests/checkasm/vf_colordetect.c           |   4 +-
 6 files changed, 553 insertions(+), 3 deletions(-)
 create mode 100644 libavfilter/aarch64/vf_colordetect_init.c
 create mode 100644 libavfilter/aarch64/vf_colordetect_neon.S


hooks/post-receive
-- 

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]

[FFmpeg-cvslog] [ffmpeg] branch master updated. eb14d45824 avfilter/vf_colordetect: add aarch64 asm

Reply via email to