25 Mar 2022, 19:52 by bavi...@riscosopen.org: > checkasm benchmarks on 1.5 GHz Cortex-A72 are as follows. Note that the C > version can still outperform the NEON version in specific cases. The balance > between different code paths is stream-dependent, but in practice the best > case happens about 5% of the time, the worst case happens about 40% of the > time, and the complexity of the remaining cases fall somewhere in between. > Therefore, taking the average of the best and worst case timings is > probably a conservative estimate of the degree by which the NEON code > improves performance. > > vc1dsp.vc1_h_loop_filter4_bestcase_c: 19.0 > vc1dsp.vc1_h_loop_filter4_bestcase_neon: 48.5 > vc1dsp.vc1_h_loop_filter4_worstcase_c: 144.7 > vc1dsp.vc1_h_loop_filter4_worstcase_neon: 76.2 > vc1dsp.vc1_h_loop_filter8_bestcase_c: 41.0 > vc1dsp.vc1_h_loop_filter8_bestcase_neon: 75.0 > vc1dsp.vc1_h_loop_filter8_worstcase_c: 294.0 > vc1dsp.vc1_h_loop_filter8_worstcase_neon: 102.7 > vc1dsp.vc1_h_loop_filter16_bestcase_c: 54.7 > vc1dsp.vc1_h_loop_filter16_bestcase_neon: 130.0 > vc1dsp.vc1_h_loop_filter16_worstcase_c: 569.7 > vc1dsp.vc1_h_loop_filter16_worstcase_neon: 186.7 > vc1dsp.vc1_v_loop_filter4_bestcase_c: 20.2 > vc1dsp.vc1_v_loop_filter4_bestcase_neon: 47.2 > vc1dsp.vc1_v_loop_filter4_worstcase_c: 164.2 > vc1dsp.vc1_v_loop_filter4_worstcase_neon: 68.5 > vc1dsp.vc1_v_loop_filter8_bestcase_c: 43.5 > vc1dsp.vc1_v_loop_filter8_bestcase_neon: 55.2 > vc1dsp.vc1_v_loop_filter8_worstcase_c: 316.2 > vc1dsp.vc1_v_loop_filter8_worstcase_neon: 72.7 > vc1dsp.vc1_v_loop_filter16_bestcase_c: 62.2 > vc1dsp.vc1_v_loop_filter16_bestcase_neon: 103.7 > vc1dsp.vc1_v_loop_filter16_worstcase_c: 646.5 > vc1dsp.vc1_v_loop_filter16_worstcase_neon: 110.7 > > Signed-off-by: Ben Avison <bavi...@riscosopen.org> > --- > libavcodec/arm/vc1dsp_init_neon.c | 14 + > libavcodec/arm/vc1dsp_neon.S | 643 ++++++++++++++++++++++++++++++ > 2 files changed, 657 insertions(+) > > diff --git 
a/libavcodec/arm/vc1dsp_init_neon.c > b/libavcodec/arm/vc1dsp_init_neon.c > index 2cca784f5a..f5f5c702d7 100644 > --- a/libavcodec/arm/vc1dsp_init_neon.c > +++ b/libavcodec/arm/vc1dsp_init_neon.c > @@ -32,6 +32,13 @@ void ff_vc1_inv_trans_4x8_dc_neon(uint8_t *dest, ptrdiff_t > stride, int16_t *bloc > void ff_vc1_inv_trans_8x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t > *block); > void ff_vc1_inv_trans_4x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t > *block); > > +void ff_vc1_v_loop_filter4_neon(uint8_t *src, int stride, int pq); > +void ff_vc1_h_loop_filter4_neon(uint8_t *src, int stride, int pq); > +void ff_vc1_v_loop_filter8_neon(uint8_t *src, int stride, int pq); > +void ff_vc1_h_loop_filter8_neon(uint8_t *src, int stride, int pq); > +void ff_vc1_v_loop_filter16_neon(uint8_t *src, int stride, int pq); > +void ff_vc1_h_loop_filter16_neon(uint8_t *src, int stride, int pq); > + > void ff_put_pixels8x8_neon(uint8_t *block, const uint8_t *pixels, > ptrdiff_t line_size, int rnd); > > @@ -92,6 +99,13 @@ av_cold void ff_vc1dsp_init_neon(VC1DSPContext *dsp) > dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_neon; > dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_neon; > > + dsp->vc1_v_loop_filter4 = ff_vc1_v_loop_filter4_neon; > + dsp->vc1_h_loop_filter4 = ff_vc1_h_loop_filter4_neon; > + dsp->vc1_v_loop_filter8 = ff_vc1_v_loop_filter8_neon; > + dsp->vc1_h_loop_filter8 = ff_vc1_h_loop_filter8_neon; > + dsp->vc1_v_loop_filter16 = ff_vc1_v_loop_filter16_neon; > + dsp->vc1_h_loop_filter16 = ff_vc1_h_loop_filter16_neon; > + > dsp->put_vc1_mspel_pixels_tab[1][ 0] = ff_put_pixels8x8_neon; > FN_ASSIGN(1, 0); > FN_ASSIGN(2, 0); > diff --git a/libavcodec/arm/vc1dsp_neon.S b/libavcodec/arm/vc1dsp_neon.S > index 93f043bf08..a639e81171 100644 > --- a/libavcodec/arm/vc1dsp_neon.S > +++ b/libavcodec/arm/vc1dsp_neon.S > @@ -1161,3 +1161,646 @@ function ff_vc1_inv_trans_4x4_dc_neon, export=1 > vst1.32 {d1[1]}, [r0,:32] > bx lr > endfunc > + > +@ VC-1 in-loop 
deblocking filter for 4 pixel pairs at boundary of > vertically-neighbouring blocks > +@ On entry: > +@ r0 -> top-left pel of lower block > +@ r1 = row stride, bytes > +@ r2 = PQUANT bitstream parameter > +function ff_vc1_v_loop_filter4_neon, export=1 > + sub r3, r0, r1, lsl #2 > + vldr d0, .Lcoeffs > + vld1.32 {d1[0]}, [r0], r1 @ P5 > + vld1.32 {d2[0]}, [r3], r1 @ P1 > + vld1.32 {d3[0]}, [r3], r1 @ P2 > + vld1.32 {d4[0]}, [r0], r1 @ P6 > + vld1.32 {d5[0]}, [r3], r1 @ P3 > + vld1.32 {d6[0]}, [r0], r1 @ P7 > + vld1.32 {d7[0]}, [r3] @ P4 > + vld1.32 {d16[0]}, [r0] @ P8 >
Nice patches, but two notes so far: (1) What's with the weird comment syntax used only in this commit? (2) A different indentation style is used. We try to indent our Arm assembly as: <8 spaces><instruction><spaces up to column 24><instruction arguments>. Take a look at e.g. libavcodec/aarch64/vp9itxfm_neon.S. It's just a convention that stuck around. _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".