The branch, master has been updated
via f790de2a878861a34820d7d8ab64badd192331f5 (commit)
from 8b71eeb72e74545d10d6e69dc6be612e81148791 (commit)
- Log -----------------------------------------------------------------
commit f790de2a878861a34820d7d8ab64badd192331f5
Author: Georgii Zagoruiko <[email protected]>
AuthorDate: Mon Nov 24 19:51:26 2025 +0000
Commit: Martin Storsjö <[email protected]>
CommitDate: Mon Nov 24 21:22:55 2025 +0000
aarch64/vvc: Optimisations of put_luma_h() functions for 10/12-bit
RPi4 (auto-vectorisation is turned on)
put_luma_h_10_4x4_c: 282.8 ( 1.00x)
put_luma_h_10_8x8_c: 1069.5 ( 1.00x)
put_luma_h_10_8x8_neon: 207.5 ( 5.15x)
put_luma_h_10_16x16_c: 1999.6 ( 1.00x)
put_luma_h_10_16x16_neon: 777.5 ( 2.57x)
put_luma_h_10_32x32_c: 6612.9 ( 1.00x)
put_luma_h_10_32x32_neon: 3201.6 ( 2.07x)
put_luma_h_10_64x64_c: 25059.0 ( 1.00x)
put_luma_h_10_64x64_neon: 13623.5 ( 1.84x)
put_luma_h_10_128x128_c: 91310.1 ( 1.00x)
put_luma_h_10_128x128_neon: 50358.3 ( 1.81x)
put_luma_h_12_4x4_c: 282.1 ( 1.00x)
put_luma_h_12_8x8_c: 1068.4 ( 1.00x)
put_luma_h_12_8x8_neon: 207.7 ( 5.14x)
put_luma_h_12_16x16_c: 1998.0 ( 1.00x)
put_luma_h_12_16x16_neon: 777.5 ( 2.57x)
put_luma_h_12_32x32_c: 6612.0 ( 1.00x)
put_luma_h_12_32x32_neon: 3201.6 ( 2.07x)
put_luma_h_12_64x64_c: 25036.8 ( 1.00x)
put_luma_h_12_64x64_neon: 13595.1 ( 1.84x)
put_luma_h_12_128x128_c: 91305.8 ( 1.00x)
put_luma_h_12_128x128_neon: 50359.7 ( 1.81x)
Apple M2 Air (auto-vectorisation is turned on)
put_luma_h_10_4x4_c: 0.3 ( 1.00x)
put_luma_h_10_8x8_c: 1.0 ( 1.00x)
put_luma_h_10_8x8_neon: 0.4 ( 2.59x)
put_luma_h_10_16x16_c: 2.9 ( 1.00x)
put_luma_h_10_16x16_neon: 1.4 ( 2.01x)
put_luma_h_10_32x32_c: 9.4 ( 1.00x)
put_luma_h_10_32x32_neon: 5.8 ( 1.62x)
put_luma_h_10_64x64_c: 35.6 ( 1.00x)
put_luma_h_10_64x64_neon: 23.6 ( 1.51x)
put_luma_h_10_128x128_c: 131.1 ( 1.00x)
put_luma_h_10_128x128_neon: 92.6 ( 1.42x)
put_luma_h_12_4x4_c: 0.3 ( 1.00x)
put_luma_h_12_8x8_c: 1.0 ( 1.00x)
put_luma_h_12_8x8_neon: 0.4 ( 2.58x)
put_luma_h_12_16x16_c: 2.9 ( 1.00x)
put_luma_h_12_16x16_neon: 1.4 ( 2.00x)
put_luma_h_12_32x32_c: 9.4 ( 1.00x)
put_luma_h_12_32x32_neon: 5.8 ( 1.61x)
put_luma_h_12_64x64_c: 35.3 ( 1.00x)
put_luma_h_12_64x64_neon: 23.3 ( 1.52x)
put_luma_h_12_128x128_c: 131.2 ( 1.00x)
put_luma_h_12_128x128_neon: 92.4 ( 1.42x)
diff --git a/libavcodec/aarch64/vvc/dsp_init.c b/libavcodec/aarch64/vvc/dsp_init.c
index b7dc1d89f8..aa75d22b78 100644
--- a/libavcodec/aarch64/vvc/dsp_init.c
+++ b/libavcodec/aarch64/vvc/dsp_init.c
@@ -30,6 +30,18 @@
#define BDOF_BLOCK_SIZE 16
#define BDOF_MIN_BLOCK_SIZE 4
+void ff_vvc_put_luma_h8_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
+                                const int height, const int8_t *hf, const int8_t *vf, const int width); // installed in put[0][2][0][1] (10-bit)
+void ff_vvc_put_luma_h16_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
+                                 const int height, const int8_t *hf, const int8_t *vf, const int width); // installed in put[0][3][0][1] (10-bit)
+void ff_vvc_put_luma_h_x16_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
+                                   const int height, const int8_t *hf, const int8_t *vf, const int width); // installed in put[0][4..6][0][1] (10-bit)
+void ff_vvc_put_luma_h8_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
+                                const int height, const int8_t *hf, const int8_t *vf, const int width); // installed in put[0][2][0][1] (12-bit)
+void ff_vvc_put_luma_h16_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
+                                 const int height, const int8_t *hf, const int8_t *vf, const int width); // installed in put[0][3][0][1] (12-bit)
+void ff_vvc_put_luma_h_x16_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
+                                   const int height, const int8_t *hf, const int8_t *vf, const int width); // installed in put[0][4..6][0][1] (12-bit)
void ff_alf_classify_sum_neon(int *sum0, int *sum1, int16_t *grad, uint32_t
gshift, uint32_t steps);
@@ -245,6 +257,11 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
c->inter.dmvr[0][1] = ff_vvc_dmvr_h_10_neon;
c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_10_neon;
c->inter.apply_bdof = ff_vvc_apply_bdof_10_neon;
+ c->inter.put[0][2][0][1] = ff_vvc_put_luma_h8_10_neon;
+ c->inter.put[0][3][0][1] = ff_vvc_put_luma_h16_10_neon;
+ c->inter.put[0][4][0][1] =
+ c->inter.put[0][5][0][1] =
+ c->inter.put[0][6][0][1] = ff_vvc_put_luma_h_x16_10_neon;
c->alf.filter[LUMA] = alf_filter_luma_10_neon;
c->alf.filter[CHROMA] = alf_filter_chroma_10_neon;
@@ -256,6 +273,11 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
c->inter.dmvr[0][1] = ff_vvc_dmvr_h_12_neon;
c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_12_neon;
c->inter.apply_bdof = ff_vvc_apply_bdof_12_neon;
+ c->inter.put[0][2][0][1] = ff_vvc_put_luma_h8_12_neon;
+ c->inter.put[0][3][0][1] = ff_vvc_put_luma_h16_12_neon;
+ c->inter.put[0][4][0][1] =
+ c->inter.put[0][5][0][1] =
+ c->inter.put[0][6][0][1] = ff_vvc_put_luma_h_x16_12_neon;
c->alf.filter[LUMA] = alf_filter_luma_12_neon;
c->alf.filter[CHROMA] = alf_filter_chroma_12_neon;
diff --git a/libavcodec/aarch64/vvc/inter.S b/libavcodec/aarch64/vvc/inter.S
index a874edf889..41444ec44c 100644
--- a/libavcodec/aarch64/vvc/inter.S
+++ b/libavcodec/aarch64/vvc/inter.S
@@ -1713,3 +1713,122 @@ endfunc
#undef GRADIENT_V1_OFFSET
#undef VX_OFFSET
#undef VY_OFFSET
+
+#define VVC_MAX_PB_SIZE 128
+
+.macro put_luma_h_x8_vector_filter shift
+ // Inputs: v0.8h = the 8 bytes of hf widened to 16 bit, v20.8h & v21.8h = 32 bytes loaded from _src
+ // v1.8h,..,v6.8h,v17.8h receive v20:v21 shifted by 1..7 elements (v21.8h only feeds these shifts)
+ // Outputs: v24.4h (results 0..3) & v25.4h (results 4..7) are the vectors to store
+ ext v1.16b, v20.16b, v21.16b, #2 // v1 = elements 1..8 of v20:v21
+ ext v2.16b, v20.16b, v21.16b, #4 // v2 = elements 2..9
+ ext v3.16b, v20.16b, v21.16b, #6 // v3 = elements 3..10
+ ext v4.16b, v20.16b, v21.16b, #8 // v4 = elements 4..11
+ ext v5.16b, v20.16b, v21.16b, #10 // v5 = elements 5..12
+ ext v6.16b, v20.16b, v21.16b, #12 // v6 = elements 6..13
+ ext v17.16b, v20.16b, v21.16b, #14 // v17 = elements 7..14
+ smull v24.4s, v20.4h, v0.h[0] // start 32-bit sums, low half: elements 0..3 times tap 0
+ smull2 v25.4s, v20.8h, v0.h[0] // start 32-bit sums, high half: elements 4..7 times tap 0
+ smlal v24.4s, v1.4h, v0.h[1] // accumulate tap 1, low half
+ smlal2 v25.4s, v1.8h, v0.h[1] // accumulate tap 1, high half
+ smlal v24.4s, v2.4h, v0.h[2] // accumulate tap 2, low half
+ smlal2 v25.4s, v2.8h, v0.h[2] // accumulate tap 2, high half
+ smlal v24.4s, v3.4h, v0.h[3] // accumulate tap 3, low half
+ smlal2 v25.4s, v3.8h, v0.h[3] // accumulate tap 3, high half
+ smlal v24.4s, v4.4h, v0.h[4] // accumulate tap 4, low half
+ smlal2 v25.4s, v4.8h, v0.h[4] // accumulate tap 4, high half
+ smlal v24.4s, v5.4h, v0.h[5] // accumulate tap 5, low half
+ smlal2 v25.4s, v5.8h, v0.h[5] // accumulate tap 5, high half
+ smlal v24.4s, v6.4h, v0.h[6] // accumulate tap 6, low half
+ smlal2 v25.4s, v6.8h, v0.h[6] // accumulate tap 6, high half
+ smlal v24.4s, v17.4h, v0.h[7] // accumulate tap 7, low half
+ smlal2 v25.4s, v17.8h, v0.h[7] // accumulate tap 7, high half
+ sqshrn v24.4h, v24.4s, #(\shift) // shift right by the macro argument, saturate and narrow to 16 bit
+ sqshrn v25.4h, v25.4s, #(\shift) // same for the high half
+.endm
+
+.macro put_luma_h8_xx_neon shift
+ mov x9, #(VVC_MAX_PB_SIZE * 2) // x9 = bytes added to x0 (dst) after each row store
+ ld1 {v0.8b}, [x4] // load the 8 filter bytes from x4 (hf)
+ sub x1, x1, #6 // move x1 (_src) back by 6 bytes, i.e. three 16-bit elements
+ sxtl v0.8h, v0.8b // sign-extend the filter bytes to 16 bit
+1:
+ ld1 {v20.8h, v21.8h}, [x1], x2 // load 32 bytes of the row, then x1 += x2 (_src_stride)
+ put_luma_h_x8_vector_filter \shift // v24.4h & v25.4h = 8 filtered results
+ subs w3, w3, #1 // one row of w3 (height) consumed, flags feed b.gt below
+ st1 {v24.4h, v25.4h}, [x0], x9 // store 16 bytes, then x0 += x9
+ b.gt 1b // repeat while rows remain
+ ret
+.endm
+
+.macro put_luma_h16_xx_neon shift
+ mov x9, #(VVC_MAX_PB_SIZE * 2) // bytes x0 (dst) must advance per row
+ ld1 {v0.8b}, [x4] // load the 8 filter bytes from x4 (hf)
+ sub x9, x9, #16 // the first store in the loop already advances x0 by 16
+ sub x1, x1, #6 // move x1 (_src) back by 6 bytes, i.e. three 16-bit elements
+ sxtl v0.8h, v0.8b // sign-extend the filter bytes to 16 bit
+1:
+ ld1 {v20.8h, v21.8h, v22.8h}, [x1], x2 // load 48 bytes of the row, then x1 += x2 (_src_stride)
+ put_luma_h_x8_vector_filter \shift // results 0..7 from v20 & v21
+ mov v20.16b, v21.16b // slide the input window by 8 elements
+ mov v21.16b, v22.16b // v22 becomes the source of the shifted windows
+ st1 {v24.4h, v25.4h}, [x0], #16 // store results 0..7
+ put_luma_h_x8_vector_filter \shift // results 8..15
+ subs w3, w3, #1 // one row of w3 (height) consumed, flags feed b.gt below
+ st1 {v24.4h, v25.4h}, [x0], x9 // store results 8..15, then x0 moves on by x9
+ b.gt 1b // repeat while rows remain
+ ret
+.endm
+
+.macro put_luma_h_x16_xx_neon shift
+ mov x9, #(VVC_MAX_PB_SIZE * 2) // bytes x0 (dst) must advance per row
+ ld1 {v0.8b}, [x4] // load the 8 filter bytes from x4 (hf)
+ sub x9, x9, w6, uxtw #1 // minus 2 * w6 (width): bytes the inner loop stores per row
+ sub x2, x2, w6, uxtw #1 // x2 (_src_stride) minus 2 * w6: bytes the inner loop loads per row
+ sxtl v0.8h, v0.8b // sign-extend the filter bytes to 16 bit
+ sub x1, x1, #6 // move x1 (_src) back by 6 bytes, i.e. three 16-bit elements
+ sub x2, x2, #16 // also discount the 16 bytes loaded at label 1
+1:
+ ld1 {v20.8h}, [x1], #16 // first 8 elements of the row
+ mov w8, w6 // w8 = columns still to produce in this row
+2:
+ ld1 {v21.8h, v22.8h}, [x1], #32 // next 16 elements of the row
+ put_luma_h_x8_vector_filter \shift // 8 results from v20 & v21
+ mov v20.16b, v21.16b // slide the input window by 8 elements
+ mov v21.16b, v22.16b // v22 becomes the source of the shifted windows
+ st1 {v24.4h, v25.4h}, [x0], #16 // store the first 8 results of this step
+ put_luma_h_x8_vector_filter \shift // next 8 results
+ mov v20.16b, v21.16b // keep the last 8 loaded elements for the next iteration
+ subs w8, w8, #16 // 16 columns produced per inner iteration
+ st1 {v24.4h, v25.4h}, [x0], #16 // store the second 8 results of this step
+ b.gt 2b // repeat while columns remain
+ subs w3, w3, #1 // one row of w3 (height) consumed, flags feed b.gt below
+ add x0, x0, x9 // x0 to the start of the next output row
+ add x1, x1, x2 // x1 to the start of the next input row
+ b.gt 1b // repeat while rows remain
+ ret
+.endm
+
+function ff_vvc_put_luma_h8_10_neon, export=1
+ put_luma_h8_xx_neon 2 // 10-bit entry point: the macro body with a narrowing shift of 2
+endfunc
+
+function ff_vvc_put_luma_h8_12_neon, export=1
+ put_luma_h8_xx_neon 4 // 12-bit entry point: the macro body with a narrowing shift of 4
+endfunc
+
+function ff_vvc_put_luma_h16_10_neon, export=1
+ put_luma_h16_xx_neon 2 // 10-bit entry point: the macro body with a narrowing shift of 2
+endfunc
+
+function ff_vvc_put_luma_h16_12_neon, export=1
+ put_luma_h16_xx_neon 4 // 12-bit entry point: the macro body with a narrowing shift of 4
+endfunc
+
+function ff_vvc_put_luma_h_x16_10_neon, export=1
+ put_luma_h_x16_xx_neon 2 // 10-bit entry point: the macro body with a narrowing shift of 2
+endfunc
+
+function ff_vvc_put_luma_h_x16_12_neon, export=1
+ put_luma_h_x16_xx_neon 4 // 12-bit entry point: the macro body with a narrowing shift of 4
+endfunc
-----------------------------------------------------------------------
Summary of changes:
libavcodec/aarch64/vvc/dsp_init.c | 22 +++++++
libavcodec/aarch64/vvc/inter.S | 119 ++++++++++++++++++++++++++++++++++++++
2 files changed, 141 insertions(+)
hooks/post-receive
--
_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]