12-bit (PR #20737)

george.zaguri via ffmpeg-devel Wed, 22 Oct 2025 11:30:37 -0700

PR #20737 opened by george.zaguri
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20737
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20737.patch


RPi4:
put_luma_h_10_4x4_c:                                   261.8 ( 1.00x)
put_luma_h_10_8x8_c:                                  1051.5 ( 1.00x)
put_luma_h_10_8x8_neon:                                231.5 ( 4.54x)
put_luma_h_10_16x16_c:                                4131.0 ( 1.00x)
put_luma_h_10_16x16_neon:                              848.6 ( 4.87x)
put_luma_h_10_32x32_c:                               16469.5 ( 1.00x)
put_luma_h_10_32x32_neon:                             3345.6 ( 4.92x)
put_luma_h_10_64x64_c:                               66734.0 ( 1.00x)
put_luma_h_10_64x64_neon:                            14586.9 ( 4.57x)
put_luma_h_10_128x128_c:                            264228.9 ( 1.00x)
put_luma_h_10_128x128_neon:                          52199.7 ( 5.06x)
put_luma_h_12_4x4_c:                                   262.1 ( 1.00x)
put_luma_h_12_8x8_c:                                  1051.3 ( 1.00x)
put_luma_h_12_8x8_neon:                                230.9 ( 4.55x)
put_luma_h_12_16x16_c:                                4124.4 ( 1.00x)
put_luma_h_12_16x16_neon:                              848.0 ( 4.86x)
put_luma_h_12_32x32_c:                               16446.9 ( 1.00x)
put_luma_h_12_32x32_neon:                             3347.4 ( 4.91x)
put_luma_h_12_64x64_c:                               66770.1 ( 1.00x)
put_luma_h_12_64x64_neon:                            14360.2 ( 4.65x)
put_luma_h_12_128x128_c:                            264419.5 ( 1.00x)
put_luma_h_12_128x128_neon:                          52200.6 ( 5.07x)

M2 Air (with auto-vectorization feature):
put_luma_h_10_4x4_c:                                     0.3 ( 1.00x)
put_luma_h_10_8x8_c:                                     1.0 ( 1.00x)
put_luma_h_10_8x8_neon:                                  0.4 ( 2.58x)
put_luma_h_10_16x16_c:                                   3.0 ( 1.00x)
put_luma_h_10_16x16_neon:                                1.5 ( 2.01x)
put_luma_h_10_32x32_c:                                   9.7 ( 1.00x)
put_luma_h_10_32x32_neon:                                6.2 ( 1.57x)
put_luma_h_10_64x64_c:                                  36.6 ( 1.00x)
put_luma_h_10_64x64_neon:                               23.9 ( 1.53x)
put_luma_h_10_128x128_c:                               134.2 ( 1.00x)
put_luma_h_10_128x128_neon:                             95.4 ( 1.41x)
put_luma_h_12_4x4_c:                                     0.3 ( 1.00x)
put_luma_h_12_8x8_c:                                     1.0 ( 1.00x)
put_luma_h_12_8x8_neon:                                  0.4 ( 2.57x)
put_luma_h_12_16x16_c:                                   3.0 ( 1.00x)
put_luma_h_12_16x16_neon:                                1.5 ( 2.01x)
put_luma_h_12_32x32_c:                                   9.7 ( 1.00x)
put_luma_h_12_32x32_neon:                                6.0 ( 1.63x)
put_luma_h_12_64x64_c:                                  36.5 ( 1.00x)
put_luma_h_12_64x64_neon:                               23.9 ( 1.53x)
put_luma_h_12_128x128_c:                               134.8 ( 1.00x)
put_luma_h_12_128x128_neon:                             95.2 ( 1.42x)


From dba0d5709658f01e40496d1f1fc8a1832e21b708 Mon Sep 17 00:00:00 2001
From: Georgii Zagoruiko <[email protected]>
Date: Wed, 22 Oct 2025 19:22:23 +0100
Subject: [PATCH] aarch64/vvc: Optimisations of put_luma_h() functions for
 10/12-bit

RPi4:
put_luma_h_10_4x4_c:                                   261.8 ( 1.00x)
put_luma_h_10_8x8_c:                                  1051.5 ( 1.00x)
put_luma_h_10_8x8_neon:                                231.5 ( 4.54x)
put_luma_h_10_16x16_c:                                4131.0 ( 1.00x)
put_luma_h_10_16x16_neon:                              848.6 ( 4.87x)
put_luma_h_10_32x32_c:                               16469.5 ( 1.00x)
put_luma_h_10_32x32_neon:                             3345.6 ( 4.92x)
put_luma_h_10_64x64_c:                               66734.0 ( 1.00x)
put_luma_h_10_64x64_neon:                            14586.9 ( 4.57x)
put_luma_h_10_128x128_c:                            264228.9 ( 1.00x)
put_luma_h_10_128x128_neon:                          52199.7 ( 5.06x)
put_luma_h_12_4x4_c:                                   262.1 ( 1.00x)
put_luma_h_12_8x8_c:                                  1051.3 ( 1.00x)
put_luma_h_12_8x8_neon:                                230.9 ( 4.55x)
put_luma_h_12_16x16_c:                                4124.4 ( 1.00x)
put_luma_h_12_16x16_neon:                              848.0 ( 4.86x)
put_luma_h_12_32x32_c:                               16446.9 ( 1.00x)
put_luma_h_12_32x32_neon:                             3347.4 ( 4.91x)
put_luma_h_12_64x64_c:                               66770.1 ( 1.00x)
put_luma_h_12_64x64_neon:                            14360.2 ( 4.65x)
put_luma_h_12_128x128_c:                            264419.5 ( 1.00x)
put_luma_h_12_128x128_neon:                          52200.6 ( 5.07x)

M2 Air (with auto-vectorization feature):
put_luma_h_10_4x4_c:                                     0.3 ( 1.00x)
put_luma_h_10_8x8_c:                                     1.0 ( 1.00x)
put_luma_h_10_8x8_neon:                                  0.4 ( 2.58x)
put_luma_h_10_16x16_c:                                   3.0 ( 1.00x)
put_luma_h_10_16x16_neon:                                1.5 ( 2.01x)
put_luma_h_10_32x32_c:                                   9.7 ( 1.00x)
put_luma_h_10_32x32_neon:                                6.2 ( 1.57x)
put_luma_h_10_64x64_c:                                  36.6 ( 1.00x)
put_luma_h_10_64x64_neon:                               23.9 ( 1.53x)
put_luma_h_10_128x128_c:                               134.2 ( 1.00x)
put_luma_h_10_128x128_neon:                             95.4 ( 1.41x)
put_luma_h_12_4x4_c:                                     0.3 ( 1.00x)
put_luma_h_12_8x8_c:                                     1.0 ( 1.00x)
put_luma_h_12_8x8_neon:                                  0.4 ( 2.57x)
put_luma_h_12_16x16_c:                                   3.0 ( 1.00x)
put_luma_h_12_16x16_neon:                                1.5 ( 2.01x)
put_luma_h_12_32x32_c:                                   9.7 ( 1.00x)
put_luma_h_12_32x32_neon:                                6.0 ( 1.63x)
put_luma_h_12_64x64_c:                                  36.5 ( 1.00x)
put_luma_h_12_64x64_neon:                               23.9 ( 1.53x)
put_luma_h_12_128x128_c:                               134.8 ( 1.00x)
put_luma_h_12_128x128_neon:                             95.2 ( 1.42x)
---
 libavcodec/aarch64/vvc/dsp_init.c | 22 ++++++++
 libavcodec/aarch64/vvc/inter.S    | 90 +++++++++++++++++++++++++++++++
 2 files changed, 112 insertions(+)

diff --git a/libavcodec/aarch64/vvc/dsp_init.c b/libavcodec/aarch64/vvc/dsp_init.c
index b7dc1d89f8..053d453fa7 100644
--- a/libavcodec/aarch64/vvc/dsp_init.c
+++ b/libavcodec/aarch64/vvc/dsp_init.c
@@ -30,6 +30,18 @@
 #define BDOF_BLOCK_SIZE         16
 #define BDOF_MIN_BLOCK_SIZE     4
 
+void ff_vvc_put_luma_h8_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
+    const int height, const int8_t *hf, const int8_t *vf, const int width);
+void ff_vvc_put_luma_h16_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
+    const int height, const int8_t *hf, const int8_t *vf, const int width);
+void ff_vvc_put_luma_h_x16_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
+    const int height, const int8_t *hf, const int8_t *vf, const int width);
+void ff_vvc_put_luma_h8_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
+    const int height, const int8_t *hf, const int8_t *vf, const int width);
+void ff_vvc_put_luma_h16_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
+    const int height, const int8_t *hf, const int8_t *vf, const int width);
+void ff_vvc_put_luma_h_x16_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
+    const int height, const int8_t *hf, const int8_t *vf, const int width);
 
 void ff_alf_classify_sum_neon(int *sum0, int *sum1, int16_t *grad, uint32_t gshift, uint32_t steps);
 
@@ -245,6 +257,11 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
         c->inter.dmvr[0][1] = ff_vvc_dmvr_h_10_neon;
         c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_10_neon;
         c->inter.apply_bdof = ff_vvc_apply_bdof_10_neon;
+        c->inter.put[0][2][0][1] = ff_vvc_put_luma_h8_10_neon;
+        c->inter.put[0][3][0][1] = ff_vvc_put_luma_h16_10_neon;
+        c->inter.put[0][4][0][1] =
+        c->inter.put[0][5][0][1] =
+        c->inter.put[0][6][0][1] = ff_vvc_put_luma_h_x16_10_neon;
 
         c->alf.filter[LUMA] = alf_filter_luma_10_neon;
         c->alf.filter[CHROMA] = alf_filter_chroma_10_neon;
@@ -256,6 +273,11 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
         c->inter.dmvr[0][1] = ff_vvc_dmvr_h_12_neon;
         c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_12_neon;
         c->inter.apply_bdof = ff_vvc_apply_bdof_12_neon;
+        c->inter.put[0][2][0][1] = ff_vvc_put_luma_h8_12_neon;
+        c->inter.put[0][3][0][1] = ff_vvc_put_luma_h16_12_neon;
+        c->inter.put[0][4][0][1] =
+        c->inter.put[0][5][0][1] =
+        c->inter.put[0][6][0][1] = ff_vvc_put_luma_h_x16_12_neon;
 
         c->alf.filter[LUMA] = alf_filter_luma_12_neon;
         c->alf.filter[CHROMA] = alf_filter_chroma_12_neon;
diff --git a/libavcodec/aarch64/vvc/inter.S b/libavcodec/aarch64/vvc/inter.S
index a874edf889..be37df8e38 100644
--- a/libavcodec/aarch64/vvc/inter.S
+++ b/libavcodec/aarch64/vvc/inter.S
@@ -1713,3 +1713,93 @@ endfunc
 #undef GRADIENT_V1_OFFSET
 #undef VX_OFFSET
 #undef VY_OFFSET
+
+#define VVC_MAX_PB_SIZE 128
+
+.macro put_luma_h_x8_16bit_ n, is_w_loop, shift
+        // 8-tap horizontal filter over 16-bit samples. Expands to a whole function body, including ret.
+        // n = 8-column steps unrolled per pass, is_w_loop = 1 repeats the pass across width, shift = narrowing shift.
+        // dst (x0): output, advanced by VVC_MAX_PB_SIZE * 2 bytes per row
+        // _src (x1): input, first load is 6 bytes (three 16-bit samples) before it. _src_stride (x2): added to x1 per row
+        // height (w3): row count, compared after each row (at least one row is processed)
+        // hf (x4): eight int8 taps. vf (x5): not read here
+        // width (w6): read only when is_w_loop == 1
+        mov             x9, #(VVC_MAX_PB_SIZE * 2)  // x9 = dst row pitch in bytes
+        ld1             {v0.8b}, [x4]               // load the eight int8 taps from hf
+        sxtl            v0.8h, v0.8b                // widen taps to 16 bit for the by-element multiplies
+        mov             w7, #0                      // w7 = rows done (y loop runs until w7 >= height)
+1:
+        sub             x11, x1, #6                 // x11 = read pointer, three samples left of the output position
+        add             w7, w7, #1
+        ld1             {v20.8h}, [x11], #16        // v20 = first 8 samples of the sliding window
+        mov             x10, x0                     // x10 = write pointer for this row
+    .if \is_w_loop == 1
+        mov             w8, #(8*\n)                 // w8 = columns covered once the current pass completes
+2:
+    .endif
+    .rept \n
+        ld1             {v16.8h}, [x11], #16        // v16 = next 8 samples
+        ext             v1.16b, v20.16b, v16.16b, #2    // v1..v6, v17 = window v20:v16 shifted by 1..7 samples
+        ext             v2.16b, v20.16b, v16.16b, #4
+        ext             v3.16b, v20.16b, v16.16b, #6
+        ext             v4.16b, v20.16b, v16.16b, #8
+        ext             v5.16b, v20.16b, v16.16b, #10
+        ext             v6.16b, v20.16b, v16.16b, #12
+        ext             v17.16b, v20.16b, v16.16b, #14
+        smull           v21.4s, v20.4h, v0.h[0]     // v21 / v22 = 32-bit sums for outputs 0-3 / 4-7
+        smull2          v22.4s, v20.8h, v0.h[0]
+        smlal           v21.4s, v1.4h, v0.h[1]      // tap k multiplies the window shifted by k samples
+        smlal2          v22.4s, v1.8h, v0.h[1]
+        smlal           v21.4s, v2.4h, v0.h[2]
+        smlal2          v22.4s, v2.8h, v0.h[2]
+        smlal           v21.4s, v3.4h, v0.h[3]
+        smlal2          v22.4s, v3.8h, v0.h[3]
+        smlal           v21.4s, v4.4h, v0.h[4]
+        smlal2          v22.4s, v4.8h, v0.h[4]
+        smlal           v21.4s, v5.4h, v0.h[5]
+        smlal2          v22.4s, v5.8h, v0.h[5]
+        smlal           v21.4s, v6.4h, v0.h[6]
+        smlal2          v22.4s, v6.8h, v0.h[6]
+        smlal           v21.4s, v17.4h, v0.h[7]
+        smlal2          v22.4s, v17.8h, v0.h[7]
+        sqshrn          v21.4h, v21.4s, #(\shift)   // shift right and narrow to 16 bit with signed saturation
+        sqshrn          v22.4h, v22.4s, #(\shift)
+        st1             {v21.4h, v22.4h}, [x10], #16    // store 8 results (16 bytes) and advance the write pointer
+        mov             v20.16b, v16.16b            // slide the window forward by 8 samples
+    .endr
+    .if \is_w_loop == 1
+        cmp             w8, w6                      // flags are consumed by b.lt below, the add leaves them untouched
+        add             w8, w8, #(8*\n)
+        b.lt            2b                          // another pass while covered columns < width (signed compare)
+    .endif
+        cmp             w7, w3                      // flags are consumed by b.lt below, the adds leave them untouched
+        add             x0, x0, x9                  // next dst row
+        add             x1, x1, x2                  // next src row
+        b.lt            1b                          // loop while rows done < height (signed compare)
+        ret
+.endm
+
+function ff_vvc_put_luma_h8_10_neon, export=1
+        put_luma_h_x8_16bit_ 1, 0, 2                // n=1: one 8-column step per row, is_w_loop=0: width is not read, shift=2
+endfunc
+
+function ff_vvc_put_luma_h8_12_neon, export=1
+        put_luma_h_x8_16bit_ 1, 0, 4                // n=1: one 8-column step per row, is_w_loop=0: width is not read, shift=4
+endfunc
+
+function ff_vvc_put_luma_h16_10_neon, export=1
+        put_luma_h_x8_16bit_ 2, 0, 2                // n=2: two 8-column steps (16 columns) per row, is_w_loop=0: width is not read, shift=2
+endfunc
+
+function ff_vvc_put_luma_h16_12_neon, export=1
+        put_luma_h_x8_16bit_ 2, 0, 4                // n=2: two 8-column steps (16 columns) per row, is_w_loop=0: width is not read, shift=4
+endfunc
+
+function ff_vvc_put_luma_h_x16_10_neon, export=1
+        put_luma_h_x8_16bit_ 2, 1, 2                // n=2, is_w_loop=1: 16-column passes repeat until width is covered, shift=2
+endfunc
+
+function ff_vvc_put_luma_h_x16_12_neon, export=1
+        put_luma_h_x8_16bit_ 2, 1, 4                // n=2, is_w_loop=1: 16-column passes repeat until width is covered, shift=4
+endfunc
+
-- 
2.49.1

_______________________________________________
ffmpeg-devel mailing list -- [email protected]
To unsubscribe send an email to [email protected]

[FFmpeg-devel] [PATCH] aarch64/vvc: Optimisations of put_luma_h() functions for 10/12-bit (PR #20737)

Reply via email to