Inlined a few comments for ff_pred16x16_top_dc_neon_10, other are similar. At 2021-04-14 20:35:44, "Martin Storsjö" <mar...@martin.st> wrote: >On Tue, 13 Apr 2021, Mikhail Nitenko wrote: > >> Benchmarks: >> pred16x16_dc_10_c: 124.0 >> pred16x16_dc_10_neon: 97.2 >> pred16x16_horizontal_10_c: 71.7 >> pred16x16_horizontal_10_neon: 66.2 >> pred16x16_top_dc_10_c: 90.7 >> pred16x16_top_dc_10_neon: 71.5 >> pred16x16_vertical_10_c: 64.7 >> pred16x16_vertical_10_neon: 61.7 > >When posting benchmark numbers, it's nice if you'd mention what CPU it was >benchmarked on, as the numbers sometimes differ quite a bit between >various CPUs. > >> >> Some functions work slower than C and are left commented out. >> --- >> libavcodec/aarch64/h264pred_init.c | 12 +++ >> libavcodec/aarch64/h264pred_neon.S | 117 +++++++++++++++++++++++++++++ >> 2 files changed, 129 insertions(+) >> >> diff --git a/libavcodec/aarch64/h264pred_init.c >> b/libavcodec/aarch64/h264pred_init.c >> index fc8989ae0d..9a1f13910d 100644 >> --- a/libavcodec/aarch64/h264pred_init.c >> +++ b/libavcodec/aarch64/h264pred_init.c >> @@ -45,6 +45,11 @@ void ff_pred8x8_0lt_dc_neon(uint8_t *src, ptrdiff_t >> stride); >> void ff_pred8x8_l00_dc_neon(uint8_t *src, ptrdiff_t stride); >> void ff_pred8x8_0l0_dc_neon(uint8_t *src, ptrdiff_t stride); >> >> +void ff_pred16x16_top_dc_neon_10(uint8_t *src, ptrdiff_t stride); >> +void ff_pred16x16_dc_neon_10(uint8_t *src, ptrdiff_t stride); >> +void ff_pred16x16_hor_neon_10(uint8_t *src, ptrdiff_t stride); >> +void ff_pred16x16_vert_neon_10(uint8_t *src, ptrdiff_t stride); >> + >> static av_cold void h264_pred_init_neon(H264PredContext *h, int codec_id, >> const int bit_depth, >> const int chroma_format_idc) >> @@ -78,6 +83,12 @@ static av_cold void h264_pred_init_neon(H264PredContext >> *h, int codec_id, >> codec_id != AV_CODEC_ID_VP7 && codec_id != AV_CODEC_ID_VP8) >> h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_plane_neon; >> } >> + if (bit_depth == 10) { >> + h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_neon_10; >> + h->pred16x16[VERT_PRED8x8 ] = ff_pred16x16_vert_neon_10; >> + h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_hor_neon_10; >> + h->pred16x16[TOP_DC_PRED8x8 ] = ff_pred16x16_top_dc_neon_10; >> + } >> } >> >> av_cold void ff_h264_pred_init_aarch64(H264PredContext *h, int codec_id, >> @@ -88,3 +99,4 @@ av_cold void ff_h264_pred_init_aarch64(H264PredContext *h, >> int codec_id, >> if (have_neon(cpu_flags)) >> h264_pred_init_neon(h, codec_id, bit_depth, chroma_format_idc); >> } >> + > >Stray newline added > >> diff --git a/libavcodec/aarch64/h264pred_neon.S >> b/libavcodec/aarch64/h264pred_neon.S >> index 213b40b3e7..5ce50323f8 100644 >> --- a/libavcodec/aarch64/h264pred_neon.S >> +++ b/libavcodec/aarch64/h264pred_neon.S >> @@ -359,3 +359,120 @@ function ff_pred8x8_0l0_dc_neon, export=1 >> dup v1.8b, v1.b[0] >> b .L_pred8x8_dc_end >> endfunc >> + >> +.macro ldcol.16 rd, rs, rt, n=4, hi=0 >> +.if \n >= 4 || \hi == 0 >> + ld1 {\rd\().h}[0], [\rs], \rt >> + ld1 {\rd\().h}[1], [\rs], \rt >> +.endif >> +.if \n >= 4 || \hi == 1 >> + ld1 {\rd\().h}[2], [\rs], \rt >> + ld1 {\rd\().h}[3], [\rs], \rt >> +.endif >> +.if \n == 8 >> + ld1 {\rd\().h}[4], [\rs], \rt >> + ld1 {\rd\().h}[5], [\rs], \rt >> + ld1 {\rd\().h}[6], [\rs], \rt >> + ld1 {\rd\().h}[7], [\rs], \rt >> +.endif >> +.endm >> + >> +// slower than C >> +/* >> +function ff_pred16x16_128_dc_neon_10, export=1 >> + movi v0.8h, #2, lsl #8 // 512, 1 << (bit_depth - 1) >> + >> + b .L_pred16x16_dc_10_end >> +endfunc >> +*/ >> + >> +function ff_pred16x16_top_dc_neon_10, export=1 >> + sub x2, x0, x1 >> + >> + ld1 {v0.8h, v1.8h}, [x2] >> + >> + addv h0, v0.8h >> + addv h1, v1.8h >> +
>> + add v0.4h, v0.4h, v1.4h ld1+addv+addv+add ==> latency 5+5+5+2=17, throughput 1+1+1+1/2=3.5 Dynamic range analyze: sum 16 of 10-bits is up to 14-bits New instructions may: ld1+add+addp+addp+addp ==> latency 5+2+2+2+2=13, throughput 1+1/2+1/2+1/2+1/2=3 or ld1+add+addv ==> latency 5+2+5=12, throughput 1+1/2+1=2.5 btw: we may replace LD1 by LDP to get more bandwidth. >> + >> + rshrn v0.4h, v0.4s, #4 >> + dup v0.8h, v0.h[0] >> + b .L_pred16x16_dc_10_end >> +endfunc >> + >> +// slower than C >> +/* >> +function ff_pred16x16_left_dc_neon_10, export=1 >> + sub x2, x0, #2 // access to the "left" column >> + ldcol.16 v0, x2, x1, 8 >> + ldcol.16 v1, x2, x1, 8 // load "left" column >> + >> + addv h0, v0.8h >> + addv h1, v1.8h >> + >> + add v0.4h, v0.4h, v1.4h >> + >> + rshrn v0.4h, v0.4s, #4 >> + dup v0.8h, v0.h[0] >> + b .L_pred16x16_dc_10_end >> +endfunc >> +*/ >> + >> +function ff_pred16x16_dc_neon_10, export=1 >> + sub x2, x0, x1 // access to the "top" row >> + sub x3, x0, #2 // access to the "left" column >> + >> + ld1 {v0.8h, v1.8h}, [x2] >> + ldcol.16 v2, x3, x1, 8 >> + ldcol.16 v3, x3, x1, 8 // load pixels in "top" row and >> "left" col >> + >> + addv h0, v0.8h >> + addv h1, v1.8h >> + addv h2, v2.8h // sum all pixels in the "top" row and >> "left" col >> + addv h3, v3.8h // (sum stays in h0-h3 registers) >> + >> + add v0.4h, v0.4h, v1.4h >> + add v0.4h, v0.4h, v2.4h >> + add v0.4h, v0.4h, v3.4h // sum registers h0-h3 > >Instead of doing ((v0+v1)+v2)+v3, try doing (v0+v1)+(v2+v3). That reduces >the latency chain by one instruction, when the v2+v3 addition can start >independent of the v0+v1 addition. > >> + >> + rshrn v0.4h, v0.4s, #5 >> + dup v0.8h, v0.h[0] >> +.L_pred16x16_dc_10_end: >> + mov v1.16b, v0.16b >> + mov w3, #8 >> +6: st1 {v0.8h, v1.8h}, [x0], x1 >> + subs w3, w3, #1 >> + st1 {v0.8h, v1.8h}, [x0], x1 >> + b.ne 6b >> + ret >> +endfunc >> + >> +function ff_pred16x16_hor_neon_10, export=1 >> + sub x2, x0, #2 >> + add x3, x0, #16 >> + >> + mov w4, #16 >> +1: ld1r {v0.8h}, [x2], x1 >> + subs w4, w4, #1 >> + st1 {v0.8h}, [x0], x1 >> + st1 {v0.8h}, [x3], x1 >> + b.ne 1b >> + ret >> +endfunc >> + >> +function ff_pred16x16_vert_neon_10, export=1 >> + sub x2, x0, x1 >> + add x1, x1, x1 >> + >> + ld1 {v0.8h, v1.8h}, [x2], x1 >> + >> + mov w3, #8 >> +1: subs w3, w3, #1 >> + st1 {v0.8h, v1.8h}, [x0], x1 >> + st1 {v0.8h, v1.8h}, [x2], x1 >> + >> + b.ne 1b >> + ret >> +endfunc >> + > >Stray newline at the end of the file, git complains about it. > >Other than that, the code looks straightforward and ok to me. > >// Martin > >_______________________________________________ >ffmpeg-devel mailing list >ffmpeg-devel@ffmpeg.org >https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > >To unsubscribe, visit link above, or email >ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe". _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".