The branch, master has been updated
       via  f790de2a878861a34820d7d8ab64badd192331f5 (commit)
      from  8b71eeb72e74545d10d6e69dc6be612e81148791 (commit)


- Log -----------------------------------------------------------------
commit f790de2a878861a34820d7d8ab64badd192331f5
Author:     Georgii Zagoruiko <[email protected]>
AuthorDate: Mon Nov 24 19:51:26 2025 +0000
Commit:     Martin Storsjö <[email protected]>
CommitDate: Mon Nov 24 21:22:55 2025 +0000

    aarch64/vvc: Optimisations of put_luma_h() functions for 10/12-bit
    
    RPi4 (auto-vectorisation is turned on)
    put_luma_h_10_4x4_c:                                   282.8 ( 1.00x)
    put_luma_h_10_8x8_c:                                  1069.5 ( 1.00x)
    put_luma_h_10_8x8_neon:                                207.5 ( 5.15x)
    put_luma_h_10_16x16_c:                                1999.6 ( 1.00x)
    put_luma_h_10_16x16_neon:                              777.5 ( 2.57x)
    put_luma_h_10_32x32_c:                                6612.9 ( 1.00x)
    put_luma_h_10_32x32_neon:                             3201.6 ( 2.07x)
    put_luma_h_10_64x64_c:                               25059.0 ( 1.00x)
    put_luma_h_10_64x64_neon:                            13623.5 ( 1.84x)
    put_luma_h_10_128x128_c:                             91310.1 ( 1.00x)
    put_luma_h_10_128x128_neon:                          50358.3 ( 1.81x)
    put_luma_h_12_4x4_c:                                   282.1 ( 1.00x)
    put_luma_h_12_8x8_c:                                  1068.4 ( 1.00x)
    put_luma_h_12_8x8_neon:                                207.7 ( 5.14x)
    put_luma_h_12_16x16_c:                                1998.0 ( 1.00x)
    put_luma_h_12_16x16_neon:                              777.5 ( 2.57x)
    put_luma_h_12_32x32_c:                                6612.0 ( 1.00x)
    put_luma_h_12_32x32_neon:                             3201.6 ( 2.07x)
    put_luma_h_12_64x64_c:                               25036.8 ( 1.00x)
    put_luma_h_12_64x64_neon:                            13595.1 ( 1.84x)
    put_luma_h_12_128x128_c:                             91305.8 ( 1.00x)
    put_luma_h_12_128x128_neon:                          50359.7 ( 1.81x)
    
    Apple M2 Air (auto-vectorisation is turned on)
    put_luma_h_10_4x4_c:                                     0.3 ( 1.00x)
    put_luma_h_10_8x8_c:                                     1.0 ( 1.00x)
    put_luma_h_10_8x8_neon:                                  0.4 ( 2.59x)
    put_luma_h_10_16x16_c:                                   2.9 ( 1.00x)
    put_luma_h_10_16x16_neon:                                1.4 ( 2.01x)
    put_luma_h_10_32x32_c:                                   9.4 ( 1.00x)
    put_luma_h_10_32x32_neon:                                5.8 ( 1.62x)
    put_luma_h_10_64x64_c:                                  35.6 ( 1.00x)
    put_luma_h_10_64x64_neon:                               23.6 ( 1.51x)
    put_luma_h_10_128x128_c:                               131.1 ( 1.00x)
    put_luma_h_10_128x128_neon:                             92.6 ( 1.42x)
    put_luma_h_12_4x4_c:                                     0.3 ( 1.00x)
    put_luma_h_12_8x8_c:                                     1.0 ( 1.00x)
    put_luma_h_12_8x8_neon:                                  0.4 ( 2.58x)
    put_luma_h_12_16x16_c:                                   2.9 ( 1.00x)
    put_luma_h_12_16x16_neon:                                1.4 ( 2.00x)
    put_luma_h_12_32x32_c:                                   9.4 ( 1.00x)
    put_luma_h_12_32x32_neon:                                5.8 ( 1.61x)
    put_luma_h_12_64x64_c:                                  35.3 ( 1.00x)
    put_luma_h_12_64x64_neon:                               23.3 ( 1.52x)
    put_luma_h_12_128x128_c:                               131.2 ( 1.00x)
    put_luma_h_12_128x128_neon:                             92.4 ( 1.42x)

diff --git a/libavcodec/aarch64/vvc/dsp_init.c b/libavcodec/aarch64/vvc/dsp_init.c
index b7dc1d89f8..aa75d22b78 100644
--- a/libavcodec/aarch64/vvc/dsp_init.c
+++ b/libavcodec/aarch64/vvc/dsp_init.c
@@ -30,6 +30,18 @@
 #define BDOF_BLOCK_SIZE         16
 #define BDOF_MIN_BLOCK_SIZE     4
 
+void ff_vvc_put_luma_h8_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
+                                const int height, const int8_t *hf, const int8_t *vf, const int width);
+void ff_vvc_put_luma_h16_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
+                                 const int height, const int8_t *hf, const int8_t *vf, const int width);
+void ff_vvc_put_luma_h_x16_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
+                                   const int height, const int8_t *hf, const int8_t *vf, const int width);
+void ff_vvc_put_luma_h8_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
+                                const int height, const int8_t *hf, const int8_t *vf, const int width);
+void ff_vvc_put_luma_h16_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
+                                 const int height, const int8_t *hf, const int8_t *vf, const int width);
+void ff_vvc_put_luma_h_x16_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
+                                   const int height, const int8_t *hf, const int8_t *vf, const int width);
 
 void ff_alf_classify_sum_neon(int *sum0, int *sum1, int16_t *grad, uint32_t gshift, uint32_t steps);
 
@@ -245,6 +257,11 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
         c->inter.dmvr[0][1] = ff_vvc_dmvr_h_10_neon;
         c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_10_neon;
         c->inter.apply_bdof = ff_vvc_apply_bdof_10_neon;
+        c->inter.put[0][2][0][1] = ff_vvc_put_luma_h8_10_neon;
+        c->inter.put[0][3][0][1] = ff_vvc_put_luma_h16_10_neon;
+        c->inter.put[0][4][0][1] =
+        c->inter.put[0][5][0][1] =
+        c->inter.put[0][6][0][1] = ff_vvc_put_luma_h_x16_10_neon;
 
         c->alf.filter[LUMA] = alf_filter_luma_10_neon;
         c->alf.filter[CHROMA] = alf_filter_chroma_10_neon;
@@ -256,6 +273,11 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
         c->inter.dmvr[0][1] = ff_vvc_dmvr_h_12_neon;
         c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_12_neon;
         c->inter.apply_bdof = ff_vvc_apply_bdof_12_neon;
+        c->inter.put[0][2][0][1] = ff_vvc_put_luma_h8_12_neon;
+        c->inter.put[0][3][0][1] = ff_vvc_put_luma_h16_12_neon;
+        c->inter.put[0][4][0][1] =
+        c->inter.put[0][5][0][1] =
+        c->inter.put[0][6][0][1] = ff_vvc_put_luma_h_x16_12_neon;
 
         c->alf.filter[LUMA] = alf_filter_luma_12_neon;
         c->alf.filter[CHROMA] = alf_filter_chroma_12_neon;
diff --git a/libavcodec/aarch64/vvc/inter.S b/libavcodec/aarch64/vvc/inter.S
index a874edf889..41444ec44c 100644
--- a/libavcodec/aarch64/vvc/inter.S
+++ b/libavcodec/aarch64/vvc/inter.S
@@ -1713,3 +1713,122 @@ endfunc
 #undef GRADIENT_V1_OFFSET
 #undef VX_OFFSET
 #undef VY_OFFSET
+
+#define VVC_MAX_PB_SIZE 128
+
+.macro put_luma_h_x8_vector_filter shift    // 8-tap horizontal filter giving 8 results; clobbers v1-v6, v17, v24, v25
+        // in:  v0.8h = the 8 bytes of hf, already widened to 16-bit taps by the caller
+        // in:  32 bytes from _src in v20.8h & v21.8h; v21.8h is only loaded to feed the shifted windows v1.8h,..,v6.8h,v17.8h
+        // out: v24.4h (results 0..3) & v25.4h (results 4..7) are the output vectors to store
+        ext             v1.16b, v20.16b, v21.16b, #2     // v1  = samples 1..8  (2 bytes = 1 sample)
+        ext             v2.16b, v20.16b, v21.16b, #4     // v2  = samples 2..9
+        ext             v3.16b, v20.16b, v21.16b, #6     // v3  = samples 3..10
+        ext             v4.16b, v20.16b, v21.16b, #8     // v4  = samples 4..11
+        ext             v5.16b, v20.16b, v21.16b, #10    // v5  = samples 5..12
+        ext             v6.16b, v20.16b, v21.16b, #12    // v6  = samples 6..13
+        ext             v17.16b, v20.16b, v21.16b, #14   // v17 = samples 7..14
+        smull           v24.4s, v20.4h, v0.h[0]          // 32-bit sums for results 0..3 start with tap 0
+        smull2          v25.4s, v20.8h, v0.h[0]          // 32-bit sums for results 4..7 start with tap 0
+        smlal           v24.4s, v1.4h, v0.h[1]           // each smlal/smlal2 pair below adds one more tap
+        smlal2          v25.4s, v1.8h, v0.h[1]
+        smlal           v24.4s, v2.4h, v0.h[2]
+        smlal2          v25.4s, v2.8h, v0.h[2]
+        smlal           v24.4s, v3.4h, v0.h[3]
+        smlal2          v25.4s, v3.8h, v0.h[3]
+        smlal           v24.4s, v4.4h, v0.h[4]
+        smlal2          v25.4s, v4.8h, v0.h[4]
+        smlal           v24.4s, v5.4h, v0.h[5]
+        smlal2          v25.4s, v5.8h, v0.h[5]
+        smlal           v24.4s, v6.4h, v0.h[6]
+        smlal2          v25.4s, v6.8h, v0.h[6]
+        smlal           v24.4s, v17.4h, v0.h[7]
+        smlal2          v25.4s, v17.8h, v0.h[7]
+        sqshrn          v24.4h, v24.4s, #(\shift)        // shift right by the macro argument, saturate to 16 bits
+        sqshrn          v25.4h, v25.4s, #(\shift)        // same for results 4..7
+.endm
+
+.macro put_luma_h8_xx_neon shift            // function body: horizontally filter 8 samples per row for w3 rows
+        mov             x9, #(VVC_MAX_PB_SIZE * 2)       // x9 = byte step between dst rows (post-increment of the st1)
+        ld1             {v0.8b}, [x4]                    // 8 signed 8-bit taps from x4 (hf in the C prototype)
+        sub             x1, x1, #6                       // back src up by 3 16-bit samples: tap k then hits sample x - 3 + k
+        sxtl            v0.8h, v0.8b                     // widen taps to 16 bits for the by-element multiplies
+1:                                                       // per-row loop
+        ld1             {v20.8h, v21.8h}, [x1], x2       // read 16 samples, then advance src by x2 (_src_stride)
+        put_luma_h_x8_vector_filter \shift
+        subs            w3, w3, #1                       // one row done; the flags are consumed by b.gt below
+        st1             {v24.4h, v25.4h}, [x0], x9       // write 8 results and step dst to the next row (flags untouched)
+        b.gt            1b                               // repeat while rows remain
+        ret
+.endm
+
+.macro put_luma_h16_xx_neon shift           // function body: horizontally filter 16 samples per row for w3 rows
+        mov             x9, #(VVC_MAX_PB_SIZE * 2)       // byte step between dst rows
+        ld1             {v0.8b}, [x4]                    // 8 signed 8-bit taps from x4 (hf in the C prototype)
+        sub             x9, x9, #16                      // the first store of each row already advances dst by 16 bytes
+        sub             x1, x1, #6                       // back src up by 3 16-bit samples: tap k then hits sample x - 3 + k
+        sxtl            v0.8h, v0.8b                     // widen taps to 16 bits for the by-element multiplies
+1:                                                       // per-row loop
+        ld1             {v20.8h, v21.8h, v22.8h}, [x1], x2   // read 24 samples, then advance src by x2 (_src_stride)
+        put_luma_h_x8_vector_filter \shift
+        mov             v20.16b, v21.16b                 // slide the 16-sample window right by 8 samples
+        mov             v21.16b, v22.16b
+        st1             {v24.4h, v25.4h}, [x0], #16      // store results 0..7 before the next filter pass overwrites v24/v25
+        put_luma_h_x8_vector_filter \shift
+        subs            w3, w3, #1                       // one row done; the flags are consumed by b.gt below
+        st1             {v24.4h, v25.4h}, [x0], x9       // store results 8..15 and step dst to the next row
+        b.gt            1b                               // repeat while rows remain
+        ret
+.endm
+
+.macro put_luma_h_x16_xx_neon shift         // function body: filter w6 samples per row, 16 at a time, for w3 rows
+        mov             x9, #(VVC_MAX_PB_SIZE * 2)       // byte step between dst rows
+        ld1             {v0.8b}, [x4]                    // 8 signed 8-bit taps from x4 (hf in the C prototype)
+        sub             x9, x9, w6, uxtw #1              // minus the 2 * width bytes the column loop writes per row
+        sub             x2, x2, w6, uxtw #1              // src stride minus the 2 * width bytes read by the column loop
+        sxtl            v0.8h, v0.8b                     // widen taps to 16 bits for the by-element multiplies
+        sub             x1, x1, #6                       // back src up by 3 16-bit samples: tap k then hits sample x - 3 + k
+        sub             x2, x2, #16                      // ... and minus the 16 bytes read when priming v20 for each row
+1:                                                       // per-row loop
+        ld1             {v20.8h}, [x1], #16              // prime the window with the first 8 samples of the row
+        mov             w8, w6                           // w8 = columns still to produce in this row
+2:                                                       // per-16-column loop
+        ld1             {v21.8h, v22.8h}, [x1], #32      // next 16 samples
+        put_luma_h_x8_vector_filter \shift
+        mov             v20.16b, v21.16b                 // slide the window right by 8 samples
+        mov             v21.16b, v22.16b
+        st1             {v24.4h, v25.4h}, [x0], #16      // store results 0..7 of this group before v24/v25 are reused
+        put_luma_h_x8_vector_filter \shift
+        mov             v20.16b, v21.16b                 // keep the last 8 loaded samples as the head of the next window
+        subs            w8, w8, #16                      // 16 columns done; the flags are consumed by b.gt 2b
+        st1             {v24.4h, v25.4h}, [x0], #16      // store results 8..15 of this group
+        b.gt            2b                               // repeat while columns remain
+        subs            w3, w3, #1                       // one row done; the two adds below do not alter the flags
+        add             x0, x0, x9                       // dst to the start of the next row
+        add             x1, x1, x2                       // src to the start of the next row
+        b.gt            1b                               // repeat while rows remain
+        ret
+.endm
+
+function ff_vvc_put_luma_h8_10_neon, export=1
+        put_luma_h8_xx_neon 2                   // 8 results per row, sums shifted right by 2 for the 10-bit variant
+endfunc
+
+function ff_vvc_put_luma_h8_12_neon, export=1
+        put_luma_h8_xx_neon 4                   // 8 results per row, sums shifted right by 4 for the 12-bit variant
+endfunc
+
+function ff_vvc_put_luma_h16_10_neon, export=1
+        put_luma_h16_xx_neon 2                  // 16 results per row, sums shifted right by 2 for the 10-bit variant
+endfunc
+
+function ff_vvc_put_luma_h16_12_neon, export=1
+        put_luma_h16_xx_neon 4                  // 16 results per row, sums shifted right by 4 for the 12-bit variant
+endfunc
+
+function ff_vvc_put_luma_h_x16_10_neon, export=1
+        put_luma_h_x16_xx_neon 2                // width taken from w6 in steps of 16, sums shifted right by 2 (10-bit)
+endfunc
+
+function ff_vvc_put_luma_h_x16_12_neon, export=1
+        put_luma_h_x16_xx_neon 4                // width taken from w6 in steps of 16, sums shifted right by 4 (12-bit)
+endfunc

-----------------------------------------------------------------------

Summary of changes:
 libavcodec/aarch64/vvc/dsp_init.c |  22 +++++++
 libavcodec/aarch64/vvc/inter.S    | 119 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 141 insertions(+)


hooks/post-receive
-- 

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to