From: Zhao Zhili <zhiliz...@tencent.com>

put_chroma_h_8_4x4_c:         0.4 ( 1.00x)
put_chroma_h_8_4x4_neon:      0.0 ( 0.00x)
put_chroma_h_8_4x4_i8mm:      0.1 ( 2.67x)
put_chroma_h_8_8x8_c:         1.6 ( 1.00x)
put_chroma_h_8_8x8_neon:      0.1 (11.00x)
put_chroma_h_8_8x8_i8mm:      0.1 (11.00x)
put_chroma_h_8_16x16_c:       6.9 ( 1.00x)
put_chroma_h_8_16x16_neon:    1.1 ( 6.00x)
put_chroma_h_8_16x16_i8mm:    0.7 (10.62x)
put_chroma_h_8_32x32_c:      27.6 ( 1.00x)
put_chroma_h_8_32x32_neon:    4.7 ( 5.95x)
put_chroma_h_8_32x32_i8mm:    4.4 ( 6.28x)
put_chroma_h_8_64x64_c:     116.2 ( 1.00x)
put_chroma_h_8_64x64_neon:   19.1 ( 6.07x)
put_chroma_h_8_64x64_i8mm:   17.1 ( 6.77x)
put_chroma_h_8_128x128_c:   466.6 ( 1.00x)
put_chroma_h_8_128x128_neon: 81.4 ( 5.73x)
put_chroma_h_8_128x128_i8mm: 71.7 ( 6.51x)
---
 libavcodec/aarch64/h26x/dsp.h       |  6 ++-
 libavcodec/aarch64/h26x/epel_neon.S | 60 ++++++++++++++++++++++++++---
 libavcodec/aarch64/vvc/dsp_init.c   |  7 ++++
 3 files changed, 66 insertions(+), 7 deletions(-)
diff --git a/libavcodec/aarch64/h26x/dsp.h b/libavcodec/aarch64/h26x/dsp.h index 6978b900fe..90a42d7108 100644 --- a/libavcodec/aarch64/h26x/dsp.h +++ b/libavcodec/aarch64/h26x/dsp.h @@ -273,7 +273,11 @@ NEON8_FNPROTO_PARTIAL_6(pel_uni_w_pixels, (uint8_t *_dst, ptrdiff_t _dststride, int height, int denom, int wx, int ox, const int8_t *hf, const int8_t *vf, int width),); -NEON8_FNPROTO_PARTIAL_6(qpel_h, (int16_t * dst, +NEON8_FNPROTO_PARTIAL_6(qpel_h, (int16_t *dst, + const uint8_t *_src, ptrdiff_t _srcstride, int height, + const int8_t *hf, const int8_t *vf, int width), _i8mm); + +NEON8_FNPROTO_PARTIAL_6(epel_h, (int16_t *dst, const uint8_t *_src, ptrdiff_t _srcstride, int height, const int8_t *hf, const int8_t *vf, int width), _i8mm); diff --git a/libavcodec/aarch64/h26x/epel_neon.S b/libavcodec/aarch64/h26x/epel_neon.S index 80a0b66a52..cad8f2a5f4 100644 --- a/libavcodec/aarch64/h26x/epel_neon.S +++ b/libavcodec/aarch64/h26x/epel_neon.S @@ -1910,6 +1910,12 @@ endfunc #if HAVE_I8MM ENABLE_I8MM + +function ff_vvc_put_epel_h4_8_neon_i8mm, export=1 + VVC_EPEL_H_HEADER + b 1f +endfunc + function ff_hevc_put_hevc_epel_h4_8_neon_i8mm, export=1 EPEL_H_HEADER 1: ld1 {v4.8b}, [x1], x2 @@ -1953,6 +1959,11 @@ function ff_hevc_put_hevc_epel_h6_8_neon_i8mm, export=1 ret endfunc +function ff_vvc_put_epel_h8_8_neon_i8mm, export=1 + VVC_EPEL_H_HEADER + b 1f +endfunc + function ff_hevc_put_hevc_epel_h8_8_neon_i8mm, export=1 EPEL_H_HEADER 1: ld1 {v4.16b}, [x1], x2 @@ -2003,6 +2014,11 @@ function ff_hevc_put_hevc_epel_h12_8_neon_i8mm, export=1 ret endfunc +function ff_vvc_put_epel_h16_8_neon_i8mm, export=1 + VVC_EPEL_H_HEADER + b 1f +endfunc + function ff_hevc_put_hevc_epel_h16_8_neon_i8mm, export=1 EPEL_H_HEADER 1: ld1 {v0.16b, v1.16b}, [x1], x2 @@ -2077,6 +2093,11 @@ function ff_hevc_put_hevc_epel_h24_8_neon_i8mm, export=1 ret endfunc +function ff_vvc_put_epel_h32_8_neon_i8mm, export=1 + VVC_EPEL_H_HEADER + b 1f +endfunc + function ff_hevc_put_hevc_epel_h32_8_neon_i8mm, export=1 
EPEL_H_HEADER 1: ld1 {v0.16b, v1.16b, v2.16b}, [x1], x2 @@ -2176,11 +2197,8 @@ function ff_hevc_put_hevc_epel_h48_8_neon_i8mm, export=1 ret endfunc -function ff_hevc_put_hevc_epel_h64_8_neon_i8mm, export=1 - EPEL_H_HEADER - sub x2, x2, #64 -1: ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], #64 - subs w3, w3, #1 // height +.macro put_epel_h64_8_neon_i8mm + ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], #64 ext v4.16b, v0.16b, v1.16b, #1 ext v5.16b, v0.16b, v1.16b, #2 ext v6.16b, v0.16b, v1.16b, #3 @@ -2243,7 +2261,37 @@ function ff_hevc_put_hevc_epel_h64_8_neon_i8mm, export=1 xtn2 v22.8h, v26.4s xtn v23.4h, v23.4s xtn2 v23.8h, v27.4s - st4 {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], #64 + st4 {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], x10 +.endm + +function ff_vvc_put_epel_h64_8_neon_i8mm, export=1 + VVC_EPEL_H_HEADER + mov x10, #(VVC_MAX_PB_SIZE * 2 - 64) + sub x2, x2, #64 + b 1f +endfunc + +function ff_hevc_put_hevc_epel_h64_8_neon_i8mm, export=1 + EPEL_H_HEADER + mov x10, #64 + sub x2, x2, #64 +1: + subs w3, w3, #1 // height + put_epel_h64_8_neon_i8mm + b.ne 1b + ret +endfunc + +function ff_vvc_put_epel_h128_8_neon_i8mm, export=1 + VVC_EPEL_H_HEADER + sub x11, x2, #128 + mov x10, #64 + mov x2, #0 +1: + put_epel_h64_8_neon_i8mm + subs w3, w3, #1 + put_epel_h64_8_neon_i8mm + add x1, x1, x11 b.ne 1b ret endfunc diff --git a/libavcodec/aarch64/vvc/dsp_init.c b/libavcodec/aarch64/vvc/dsp_init.c index c8c13eb068..c947885145 100644 --- a/libavcodec/aarch64/vvc/dsp_init.c +++ b/libavcodec/aarch64/vvc/dsp_init.c @@ -127,6 +127,13 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd) c->inter.put[0][4][1][1] = ff_vvc_put_qpel_hv32_8_neon_i8mm; c->inter.put[0][5][1][1] = ff_vvc_put_qpel_hv64_8_neon_i8mm; c->inter.put[0][6][1][1] = ff_vvc_put_qpel_hv128_8_neon_i8mm; + + c->inter.put[1][1][0][1] = ff_vvc_put_epel_h4_8_neon_i8mm; + c->inter.put[1][2][0][1] = ff_vvc_put_epel_h8_8_neon_i8mm; + c->inter.put[1][3][0][1] = ff_vvc_put_epel_h16_8_neon_i8mm; + 
c->inter.put[1][4][0][1] = ff_vvc_put_epel_h32_8_neon_i8mm; + c->inter.put[1][5][0][1] = ff_vvc_put_epel_h64_8_neon_i8mm; + c->inter.put[1][6][0][1] = ff_vvc_put_epel_h128_8_neon_i8mm; } } else if (bd == 10) { c->alf.filter[LUMA] = alf_filter_luma_10_neon; -- 2.42.0 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".