Thank you, I've tried your suggestions, and here is what I got (the left column is A53 and the right is A72):
current code: pred16x16_top_dc_10_c: 106.0 93.2 pred16x16_top_dc_10_neon: 87.7 77.5 ld1, add, addv variant: pred16x16_top_dc_10_c: 106.0 95.5 pred16x16_top_dc_10_neon: 87.7 77.2 ld1, add, addp, addp, addp variant: pred16x16_top_dc_10_c: 106.0 93.7 pred16x16_top_dc_10_neon: 93.7 81.2 ldp variant: pred16x16_top_dc_10_c: 106.0 93.7 pred16x16_top_dc_10_neon: 87.7 77.5 so I will change code to use one addv instruction ср, 14 апр. 2021 г. в 16:18, chen <chenm...@163.com>: > > Inlined a few comments for ff_pred16x16_top_dc_neon_10, other are similar. > > At 2021-04-14 20:35:44, "Martin Storsjö" <mar...@martin.st> wrote: > >On Tue, 13 Apr 2021, Mikhail Nitenko wrote: > > > >> Benchmarks: > >> pred16x16_dc_10_c: 124.0 > >> pred16x16_dc_10_neon: 97.2 > >> pred16x16_horizontal_10_c: 71.7 > >> pred16x16_horizontal_10_neon: 66.2 > >> pred16x16_top_dc_10_c: 90.7 > >> pred16x16_top_dc_10_neon: 71.5 > >> pred16x16_vertical_10_c: 64.7 > >> pred16x16_vertical_10_neon: 61.7 > > > >When posting benchmark numbers, it's nice if you'd mention what CPU it was > >benchmarked on, as the numbers sometimes differ quite a bit between > >various CPUs. > > > >> > >> Some functions work slower than C and are left commented out. 
> >> --- > >> libavcodec/aarch64/h264pred_init.c | 12 +++ > >> libavcodec/aarch64/h264pred_neon.S | 117 +++++++++++++++++++++++++++++ > >> 2 files changed, 129 insertions(+) > >> > >> diff --git a/libavcodec/aarch64/h264pred_init.c > >> b/libavcodec/aarch64/h264pred_init.c > >> index fc8989ae0d..9a1f13910d 100644 > >> --- a/libavcodec/aarch64/h264pred_init.c > >> +++ b/libavcodec/aarch64/h264pred_init.c > >> @@ -45,6 +45,11 @@ void ff_pred8x8_0lt_dc_neon(uint8_t *src, ptrdiff_t > >> stride); > >> void ff_pred8x8_l00_dc_neon(uint8_t *src, ptrdiff_t stride); > >> void ff_pred8x8_0l0_dc_neon(uint8_t *src, ptrdiff_t stride); > >> > >> +void ff_pred16x16_top_dc_neon_10(uint8_t *src, ptrdiff_t stride); > >> +void ff_pred16x16_dc_neon_10(uint8_t *src, ptrdiff_t stride); > >> +void ff_pred16x16_hor_neon_10(uint8_t *src, ptrdiff_t stride); > >> +void ff_pred16x16_vert_neon_10(uint8_t *src, ptrdiff_t stride); > >> + > >> static av_cold void h264_pred_init_neon(H264PredContext *h, int codec_id, > >> const int bit_depth, > >> const int chroma_format_idc) > >> @@ -78,6 +83,12 @@ static av_cold void h264_pred_init_neon(H264PredContext > >> *h, int codec_id, > >> codec_id != AV_CODEC_ID_VP7 && codec_id != AV_CODEC_ID_VP8) > >> h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_plane_neon; > >> } > >> + if (bit_depth == 10) { > >> + h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_neon_10; > >> + h->pred16x16[VERT_PRED8x8 ] = ff_pred16x16_vert_neon_10; > >> + h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_hor_neon_10; > >> + h->pred16x16[TOP_DC_PRED8x8 ] = ff_pred16x16_top_dc_neon_10; > >> + } > >> } > >> > >> av_cold void ff_h264_pred_init_aarch64(H264PredContext *h, int codec_id, > >> @@ -88,3 +99,4 @@ av_cold void ff_h264_pred_init_aarch64(H264PredContext > >> *h, int codec_id, > >> if (have_neon(cpu_flags)) > >> h264_pred_init_neon(h, codec_id, bit_depth, chroma_format_idc); > >> } > >> + > > > >Stray newline added > > > >> diff --git a/libavcodec/aarch64/h264pred_neon.S > >> 
b/libavcodec/aarch64/h264pred_neon.S > >> index 213b40b3e7..5ce50323f8 100644 > >> --- a/libavcodec/aarch64/h264pred_neon.S > >> +++ b/libavcodec/aarch64/h264pred_neon.S > >> @@ -359,3 +359,120 @@ function ff_pred8x8_0l0_dc_neon, export=1 > >> dup v1.8b, v1.b[0] > >> b .L_pred8x8_dc_end > >> endfunc > >> + > >> +.macro ldcol.16 rd, rs, rt, n=4, hi=0 > >> +.if \n >= 4 || \hi == 0 > >> + ld1 {\rd\().h}[0], [\rs], \rt > >> + ld1 {\rd\().h}[1], [\rs], \rt > >> +.endif > >> +.if \n >= 4 || \hi == 1 > >> + ld1 {\rd\().h}[2], [\rs], \rt > >> + ld1 {\rd\().h}[3], [\rs], \rt > >> +.endif > >> +.if \n == 8 > >> + ld1 {\rd\().h}[4], [\rs], \rt > >> + ld1 {\rd\().h}[5], [\rs], \rt > >> + ld1 {\rd\().h}[6], [\rs], \rt > >> + ld1 {\rd\().h}[7], [\rs], \rt > >> +.endif > >> +.endm > >> + > >> +// slower than C > >> +/* > >> +function ff_pred16x16_128_dc_neon_10, export=1 > >> + movi v0.8h, #2, lsl #8 // 512, 1 << (bit_depth - 1) > >> + > >> + b .L_pred16x16_dc_10_end > >> +endfunc > >> +*/ > >> + > >> +function ff_pred16x16_top_dc_neon_10, export=1 > >> + sub x2, x0, x1 > >> + > >> + ld1 {v0.8h, v1.8h}, [x2] > >> + > >> + addv h0, v0.8h > >> + addv h1, v1.8h > >> + > > >> + add v0.4h, v0.4h, v1.4h > ld1+addv+addv+add ==> latency 5+5+5+2=17, throughput 1+1+1+1/2=3.5 > > > Dynamic range analyze: sum 16 of 10-bits is up to 14-bits > > > New instructions may: > ld1+add+addp+addp+addp ==> latency 5+2+2+2+2=13, throughput > 1+1/2+1/2+1/2+1/2=3 > or > ld1+add+addv ==> latency 5+2+5=12, throughput 1+1/2+1=2.5 > > > btw: we may replace LD1 by LDP to get more bandwidth. 
> > > >> + > >> + rshrn v0.4h, v0.4s, #4 > >> + dup v0.8h, v0.h[0] > >> + b .L_pred16x16_dc_10_end > >> +endfunc > >> + > >> +// slower than C > >> +/* > >> +function ff_pred16x16_left_dc_neon_10, export=1 > >> + sub x2, x0, #2 // access to the "left" column > >> + ldcol.16 v0, x2, x1, 8 > >> + ldcol.16 v1, x2, x1, 8 // load "left" column > >> + > >> + addv h0, v0.8h > >> + addv h1, v1.8h > >> + > >> + add v0.4h, v0.4h, v1.4h > >> + > >> + rshrn v0.4h, v0.4s, #4 > >> + dup v0.8h, v0.h[0] > >> + b .L_pred16x16_dc_10_end > >> +endfunc > >> +*/ > >> + > >> +function ff_pred16x16_dc_neon_10, export=1 > >> + sub x2, x0, x1 // access to the "top" row > >> + sub x3, x0, #2 // access to the "left" column > >> + > >> + ld1 {v0.8h, v1.8h}, [x2] > >> + ldcol.16 v2, x3, x1, 8 > >> + ldcol.16 v3, x3, x1, 8 // load pixels in "top" row and > >> "left" col > >> + > >> + addv h0, v0.8h > >> + addv h1, v1.8h > >> + addv h2, v2.8h // sum all pixels in the "top" row and > >> "left" col > >> + addv h3, v3.8h // (sum stays in h0-h3 registers) > >> + > >> + add v0.4h, v0.4h, v1.4h > >> + add v0.4h, v0.4h, v2.4h > >> + add v0.4h, v0.4h, v3.4h // sum registers h0-h3 > > > >Instead of doing ((v0+v1)+v2)+v3, try doing (v0+v1)+(v2+v3). That reduces > >the latency chain by one instruction, when the v2+v3 addition can start > >independent of the v0+v1 addition. 
> > > >> + > >> + rshrn v0.4h, v0.4s, #5 > >> + dup v0.8h, v0.h[0] > >> +.L_pred16x16_dc_10_end: > >> + mov v1.16b, v0.16b > >> + mov w3, #8 > >> +6: st1 {v0.8h, v1.8h}, [x0], x1 > >> + subs w3, w3, #1 > >> + st1 {v0.8h, v1.8h}, [x0], x1 > >> + b.ne 6b > >> + ret > >> +endfunc > >> + > >> +function ff_pred16x16_hor_neon_10, export=1 > >> + sub x2, x0, #2 > >> + add x3, x0, #16 > >> + > >> + mov w4, #16 > >> +1: ld1r {v0.8h}, [x2], x1 > >> + subs w4, w4, #1 > >> + st1 {v0.8h}, [x0], x1 > >> + st1 {v0.8h}, [x3], x1 > >> + b.ne 1b > >> + ret > >> +endfunc > >> + > >> +function ff_pred16x16_vert_neon_10, export=1 > >> + sub x2, x0, x1 > >> + add x1, x1, x1 > >> + > >> + ld1 {v0.8h, v1.8h}, [x2], x1 > >> + > >> + mov w3, #8 > >> +1: subs w3, w3, #1 > >> + st1 {v0.8h, v1.8h}, [x0], x1 > >> + st1 {v0.8h, v1.8h}, [x2], x1 > >> + > >> + b.ne 1b > >> + ret > >> +endfunc > >> + > > > >Stray newline at the end of the file, git complains about it. > > > >Other than that, the code looks straightforward and ok to me. > > > >// Martin > > > >_______________________________________________ > >ffmpeg-devel mailing list > >ffmpeg-devel@ffmpeg.org > >https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > > >To unsubscribe, visit link above, or email > >ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe". > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe". -- Mikhail Nitenko _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".