PR #20737 opened by george.zaguri
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20737
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20737.patch
RPi4: put_luma_h_10_4x4_c: 261.8 ( 1.00x) put_luma_h_10_8x8_c: 1051.5 ( 1.00x) put_luma_h_10_8x8_neon: 231.5 ( 4.54x) put_luma_h_10_16x16_c: 4131.0 ( 1.00x) put_luma_h_10_16x16_neon: 848.6 ( 4.87x) put_luma_h_10_32x32_c: 16469.5 ( 1.00x) put_luma_h_10_32x32_neon: 3345.6 ( 4.92x) put_luma_h_10_64x64_c: 66734.0 ( 1.00x) put_luma_h_10_64x64_neon: 14586.9 ( 4.57x) put_luma_h_10_128x128_c: 264228.9 ( 1.00x) put_luma_h_10_128x128_neon: 52199.7 ( 5.06x) put_luma_h_12_4x4_c: 262.1 ( 1.00x) put_luma_h_12_8x8_c: 1051.3 ( 1.00x) put_luma_h_12_8x8_neon: 230.9 ( 4.55x) put_luma_h_12_16x16_c: 4124.4 ( 1.00x) put_luma_h_12_16x16_neon: 848.0 ( 4.86x) put_luma_h_12_32x32_c: 16446.9 ( 1.00x) put_luma_h_12_32x32_neon: 3347.4 ( 4.91x) put_luma_h_12_64x64_c: 66770.1 ( 1.00x) put_luma_h_12_64x64_neon: 14360.2 ( 4.65x) put_luma_h_12_128x128_c: 264419.5 ( 1.00x) put_luma_h_12_128x128_neon: 52200.6 ( 5.07x) M2 Air (with auto-vectorization feature): put_luma_h_10_4x4_c: 0.3 ( 1.00x) put_luma_h_10_8x8_c: 1.0 ( 1.00x) put_luma_h_10_8x8_neon: 0.4 ( 2.58x) put_luma_h_10_16x16_c: 3.0 ( 1.00x) put_luma_h_10_16x16_neon: 1.5 ( 2.01x) put_luma_h_10_32x32_c: 9.7 ( 1.00x) put_luma_h_10_32x32_neon: 6.2 ( 1.57x) put_luma_h_10_64x64_c: 36.6 ( 1.00x) put_luma_h_10_64x64_neon: 23.9 ( 1.53x) put_luma_h_10_128x128_c: 134.2 ( 1.00x) put_luma_h_10_128x128_neon: 95.4 ( 1.41x) put_luma_h_12_4x4_c: 0.3 ( 1.00x) put_luma_h_12_8x8_c: 1.0 ( 1.00x) put_luma_h_12_8x8_neon: 0.4 ( 2.57x) put_luma_h_12_16x16_c: 3.0 ( 1.00x) put_luma_h_12_16x16_neon: 1.5 ( 2.01x) put_luma_h_12_32x32_c: 9.7 ( 1.00x) put_luma_h_12_32x32_neon: 6.0 ( 1.63x) put_luma_h_12_64x64_c: 36.5 ( 1.00x) put_luma_h_12_64x64_neon: 23.9 ( 1.53x) put_luma_h_12_128x128_c: 134.8 ( 1.00x) put_luma_h_12_128x128_neon: 95.2 ( 1.42x) >From dba0d5709658f01e40496d1f1fc8a1832e21b708 Mon Sep 17 00:00:00 2001 From: Georgii Zagoruiko <[email protected]> Date: Wed, 22 Oct 2025 19:22:23 +0100 Subject: [PATCH] aarch64/vvc: Optimisations of put_luma_h() functions for 
10/12-bit RPi4: put_luma_h_10_4x4_c: 261.8 ( 1.00x) put_luma_h_10_8x8_c: 1051.5 ( 1.00x) put_luma_h_10_8x8_neon: 231.5 ( 4.54x) put_luma_h_10_16x16_c: 4131.0 ( 1.00x) put_luma_h_10_16x16_neon: 848.6 ( 4.87x) put_luma_h_10_32x32_c: 16469.5 ( 1.00x) put_luma_h_10_32x32_neon: 3345.6 ( 4.92x) put_luma_h_10_64x64_c: 66734.0 ( 1.00x) put_luma_h_10_64x64_neon: 14586.9 ( 4.57x) put_luma_h_10_128x128_c: 264228.9 ( 1.00x) put_luma_h_10_128x128_neon: 52199.7 ( 5.06x) put_luma_h_12_4x4_c: 262.1 ( 1.00x) put_luma_h_12_8x8_c: 1051.3 ( 1.00x) put_luma_h_12_8x8_neon: 230.9 ( 4.55x) put_luma_h_12_16x16_c: 4124.4 ( 1.00x) put_luma_h_12_16x16_neon: 848.0 ( 4.86x) put_luma_h_12_32x32_c: 16446.9 ( 1.00x) put_luma_h_12_32x32_neon: 3347.4 ( 4.91x) put_luma_h_12_64x64_c: 66770.1 ( 1.00x) put_luma_h_12_64x64_neon: 14360.2 ( 4.65x) put_luma_h_12_128x128_c: 264419.5 ( 1.00x) put_luma_h_12_128x128_neon: 52200.6 ( 5.07x) M2 Air (with auto-vectorization feature): put_luma_h_10_4x4_c: 0.3 ( 1.00x) put_luma_h_10_8x8_c: 1.0 ( 1.00x) put_luma_h_10_8x8_neon: 0.4 ( 2.58x) put_luma_h_10_16x16_c: 3.0 ( 1.00x) put_luma_h_10_16x16_neon: 1.5 ( 2.01x) put_luma_h_10_32x32_c: 9.7 ( 1.00x) put_luma_h_10_32x32_neon: 6.2 ( 1.57x) put_luma_h_10_64x64_c: 36.6 ( 1.00x) put_luma_h_10_64x64_neon: 23.9 ( 1.53x) put_luma_h_10_128x128_c: 134.2 ( 1.00x) put_luma_h_10_128x128_neon: 95.4 ( 1.41x) put_luma_h_12_4x4_c: 0.3 ( 1.00x) put_luma_h_12_8x8_c: 1.0 ( 1.00x) put_luma_h_12_8x8_neon: 0.4 ( 2.57x) put_luma_h_12_16x16_c: 3.0 ( 1.00x) put_luma_h_12_16x16_neon: 1.5 ( 2.01x) put_luma_h_12_32x32_c: 9.7 ( 1.00x) put_luma_h_12_32x32_neon: 6.0 ( 1.63x) put_luma_h_12_64x64_c: 36.5 ( 1.00x) put_luma_h_12_64x64_neon: 23.9 ( 1.53x) put_luma_h_12_128x128_c: 134.8 ( 1.00x) put_luma_h_12_128x128_neon: 95.2 ( 1.42x) --- libavcodec/aarch64/vvc/dsp_init.c | 22 ++++++++ libavcodec/aarch64/vvc/inter.S | 90 +++++++++++++++++++++++++++++++ 2 files changed, 112 insertions(+) diff --git a/libavcodec/aarch64/vvc/dsp_init.c 
b/libavcodec/aarch64/vvc/dsp_init.c index b7dc1d89f8..053d453fa7 100644 --- a/libavcodec/aarch64/vvc/dsp_init.c +++ b/libavcodec/aarch64/vvc/dsp_init.c @@ -30,6 +30,18 @@ #define BDOF_BLOCK_SIZE 16 #define BDOF_MIN_BLOCK_SIZE 4 +void ff_vvc_put_luma_h8_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, + const int height, const int8_t *hf, const int8_t *vf, const int width); +void ff_vvc_put_luma_h16_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, + const int height, const int8_t *hf, const int8_t *vf, const int width); +void ff_vvc_put_luma_h_x16_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, + const int height, const int8_t *hf, const int8_t *vf, const int width); +void ff_vvc_put_luma_h8_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, + const int height, const int8_t *hf, const int8_t *vf, const int width); +void ff_vvc_put_luma_h16_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, + const int height, const int8_t *hf, const int8_t *vf, const int width); +void ff_vvc_put_luma_h_x16_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, + const int height, const int8_t *hf, const int8_t *vf, const int width); void ff_alf_classify_sum_neon(int *sum0, int *sum1, int16_t *grad, uint32_t gshift, uint32_t steps); @@ -245,6 +257,11 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd) c->inter.dmvr[0][1] = ff_vvc_dmvr_h_10_neon; c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_10_neon; c->inter.apply_bdof = ff_vvc_apply_bdof_10_neon; + c->inter.put[0][2][0][1] = ff_vvc_put_luma_h8_10_neon; + c->inter.put[0][3][0][1] = ff_vvc_put_luma_h16_10_neon; + c->inter.put[0][4][0][1] = + c->inter.put[0][5][0][1] = + c->inter.put[0][6][0][1] = ff_vvc_put_luma_h_x16_10_neon; c->alf.filter[LUMA] = alf_filter_luma_10_neon; c->alf.filter[CHROMA] = alf_filter_chroma_10_neon; @@ -256,6 +273,11 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, 
const int bd) c->inter.dmvr[0][1] = ff_vvc_dmvr_h_12_neon; c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_12_neon; c->inter.apply_bdof = ff_vvc_apply_bdof_12_neon; + c->inter.put[0][2][0][1] = ff_vvc_put_luma_h8_12_neon; + c->inter.put[0][3][0][1] = ff_vvc_put_luma_h16_12_neon; + c->inter.put[0][4][0][1] = + c->inter.put[0][5][0][1] = + c->inter.put[0][6][0][1] = ff_vvc_put_luma_h_x16_12_neon; c->alf.filter[LUMA] = alf_filter_luma_12_neon; c->alf.filter[CHROMA] = alf_filter_chroma_12_neon; diff --git a/libavcodec/aarch64/vvc/inter.S b/libavcodec/aarch64/vvc/inter.S index a874edf889..be37df8e38 100644 --- a/libavcodec/aarch64/vvc/inter.S +++ b/libavcodec/aarch64/vvc/inter.S @@ -1713,3 +1713,93 @@ endfunc #undef GRADIENT_V1_OFFSET #undef VX_OFFSET #undef VY_OFFSET + +#define VVC_MAX_PB_SIZE 128 + +.macro put_luma_h_x8_16bit_ n, is_w_loop, shift + // dst .req x0 + // _src .req x1 + // _src_stride .req x2 + // height .req w3 + // hf .req x4 + // vf .req x5 + // width .req w6 + mov x9, #(VVC_MAX_PB_SIZE * 2) + ld1 {v0.8b}, [x4] + sxtl v0.8h, v0.8b + mov w7, #0 // y loop: height +1: + sub x11, x1, #6 + add w7, w7, #1 + ld1 {v20.8h}, [x11], #16 + mov x10, x0 + .if \is_w_loop == 1 + mov w8, #(8*\n) +2: + .endif + .rept \n + ld1 {v16.8h}, [x11], #16 + ext v1.16b, v20.16b, v16.16b, #2 + ext v2.16b, v20.16b, v16.16b, #4 + ext v3.16b, v20.16b, v16.16b, #6 + ext v4.16b, v20.16b, v16.16b, #8 + ext v5.16b, v20.16b, v16.16b, #10 + ext v6.16b, v20.16b, v16.16b, #12 + ext v17.16b, v20.16b, v16.16b, #14 + smull v21.4s, v20.4h, v0.h[0] + smull2 v22.4s, v20.8h, v0.h[0] + smlal v21.4s, v1.4h, v0.h[1] + smlal2 v22.4s, v1.8h, v0.h[1] + smlal v21.4s, v2.4h, v0.h[2] + smlal2 v22.4s, v2.8h, v0.h[2] + smlal v21.4s, v3.4h, v0.h[3] + smlal2 v22.4s, v3.8h, v0.h[3] + smlal v21.4s, v4.4h, v0.h[4] + smlal2 v22.4s, v4.8h, v0.h[4] + smlal v21.4s, v5.4h, v0.h[5] + smlal2 v22.4s, v5.8h, v0.h[5] + smlal v21.4s, v6.4h, v0.h[6] + smlal2 v22.4s, v6.8h, v0.h[6] + smlal v21.4s, v17.4h, v0.h[7] + smlal2 v22.4s, v17.8h, 
v0.h[7] + sqshrn v21.4h, v21.4s, #(\shift) + sqshrn v22.4h, v22.4s, #(\shift) + st1 {v21.4h, v22.4h}, [x10], #16 + mov v20.16b, v16.16b + .endr + .if \is_w_loop == 1 + cmp w8, w6 + add w8, w8, #(8*\n) + b.lt 2b + .endif + cmp w7, w3 + add x0, x0, x9 + add x1, x1, x2 + b.lt 1b + ret +.endm + +function ff_vvc_put_luma_h8_10_neon, export=1 + put_luma_h_x8_16bit_ 1, 0, 2 +endfunc + +function ff_vvc_put_luma_h8_12_neon, export=1 + put_luma_h_x8_16bit_ 1, 0, 4 +endfunc + +function ff_vvc_put_luma_h16_10_neon, export=1 + put_luma_h_x8_16bit_ 2, 0, 2 +endfunc + +function ff_vvc_put_luma_h16_12_neon, export=1 + put_luma_h_x8_16bit_ 2, 0, 4 +endfunc + +function ff_vvc_put_luma_h_x16_10_neon, export=1 + put_luma_h_x8_16bit_ 2, 1, 2 +endfunc + +function ff_vvc_put_luma_h_x16_12_neon, export=1 + put_luma_h_x8_16bit_ 2, 1, 4 +endfunc + -- 2.49.1 _______________________________________________ ffmpeg-devel mailing list -- [email protected] To unsubscribe send an email to [email protected]
