Benchmarks:                          A53     A72
pred8x8_dc_10_c:                    64.2    49.5
pred8x8_dc_10_neon:                 62.7    54.5
pred8x8_dc_128_10_c:                26.0    15.5
pred8x8_dc_128_10_neon:             28.2    16.0
pred8x8_horizontal_10_c:            60.0    27.7
pred8x8_horizontal_10_neon:         34.2    27.7
pred8x8_left_dc_10_c:               42.5    27.5
pred8x8_left_dc_10_neon:            50.7    41.2
pred8x8_mad_cow_dc_0l0_10_c:        55.7    37.2
pred8x8_mad_cow_dc_0l0_10_neon:     46.0    36.5
pred8x8_mad_cow_dc_0lt_10_c:        89.2    67.0
pred8x8_mad_cow_dc_0lt_10_neon:     50.2    46.7
pred8x8_mad_cow_dc_l0t_10_c:        75.5    51.0
pred8x8_mad_cow_dc_l0t_10_neon:     49.7    44.7
pred8x8_mad_cow_dc_l00_10_c:        58.0    38.0
pred8x8_mad_cow_dc_l00_10_neon:     41.0    37.5
pred8x8_plane_10_c:                347.5   288.7
pred8x8_plane_10_neon:             150.2   108.5
pred8x8_top_dc_10_c:                44.5    30.5
pred8x8_top_dc_10_neon:             39.7    31.5
pred8x8_vertical_10_c:              27.5    16.0
pred8x8_vertical_10_neon:           27.7    15.0
pred16x16_plane_10_c:             1245.5  1069.7
pred16x16_plane_10_neon:           349.0   208.7
Signed-off-by: Mikhail Nitenko <mnite...@gmail.com> --- libavcodec/aarch64/h264pred_init.c | 40 +++- libavcodec/aarch64/h264pred_neon.S | 369 ++++++++++++++++++++++++++++- 2 files changed, 402 insertions(+), 7 deletions(-) diff --git a/libavcodec/aarch64/h264pred_init.c b/libavcodec/aarch64/h264pred_init.c index 325a86bfcd..0ae8f70d23 100644 --- a/libavcodec/aarch64/h264pred_init.c +++ b/libavcodec/aarch64/h264pred_init.c @@ -45,10 +45,23 @@ void ff_pred8x8_0lt_dc_neon(uint8_t *src, ptrdiff_t stride); void ff_pred8x8_l00_dc_neon(uint8_t *src, ptrdiff_t stride); void ff_pred8x8_0l0_dc_neon(uint8_t *src, ptrdiff_t stride); -void ff_pred16x16_top_dc_neon_10(uint8_t *src, ptrdiff_t stride); -void ff_pred16x16_dc_neon_10(uint8_t *src, ptrdiff_t stride); -void ff_pred16x16_hor_neon_10(uint8_t *src, ptrdiff_t stride); void ff_pred16x16_vert_neon_10(uint8_t *src, ptrdiff_t stride); +void ff_pred16x16_hor_neon_10(uint8_t *src, ptrdiff_t stride); +void ff_pred16x16_plane_neon_10(uint8_t *src, ptrdiff_t stride); +void ff_pred16x16_dc_neon_10(uint8_t *src, ptrdiff_t stride); +void ff_pred16x16_top_dc_neon_10(uint8_t *src, ptrdiff_t stride); + +void ff_pred8x8_vert_neon_10(uint8_t *src, ptrdiff_t stride); +void ff_pred8x8_hor_neon_10(uint8_t *src, ptrdiff_t stride); +void ff_pred8x8_plane_neon_10(uint8_t *src, ptrdiff_t stride); +void ff_pred8x8_dc_neon_10(uint8_t *src, ptrdiff_t stride); +void ff_pred8x8_128_dc_neon_10(uint8_t *src, ptrdiff_t stride); +void ff_pred8x8_left_dc_neon_10(uint8_t *src, ptrdiff_t stride); +void ff_pred8x8_top_dc_neon_10(uint8_t *src, ptrdiff_t stride); +void ff_pred8x8_l0t_dc_neon_10(uint8_t *src, ptrdiff_t stride); +void ff_pred8x8_0lt_dc_neon_10(uint8_t *src, ptrdiff_t stride); +void ff_pred8x8_l00_dc_neon_10(uint8_t *src, ptrdiff_t stride); +void ff_pred8x8_0l0_dc_neon_10(uint8_t *src, ptrdiff_t stride); static av_cold void h264_pred_init_neon(H264PredContext *h, int codec_id, const int bit_depth, @@ -84,10 +97,31 @@ static av_cold void 
h264_pred_init_neon(H264PredContext *h, int codec_id, h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_plane_neon; } if (bit_depth == 10) { + if (chroma_format_idc <= 1) { + h->pred8x8[VERT_PRED8x8 ] = ff_pred8x8_vert_neon_10; + h->pred8x8[HOR_PRED8x8 ] = ff_pred8x8_hor_neon_10; + if (codec_id != AV_CODEC_ID_VP7 && codec_id != AV_CODEC_ID_VP8) + h->pred8x8[PLANE_PRED8x8] = ff_pred8x8_plane_neon_10; + h->pred8x8[DC_128_PRED8x8 ] = ff_pred8x8_128_dc_neon_10; + if (codec_id != AV_CODEC_ID_RV40 && codec_id != AV_CODEC_ID_VP7 && + codec_id != AV_CODEC_ID_VP8) { + h->pred8x8[DC_PRED8x8 ] = ff_pred8x8_dc_neon_10; + h->pred8x8[LEFT_DC_PRED8x8] = ff_pred8x8_left_dc_neon_10; + h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_neon_10; + h->pred8x8[ALZHEIMER_DC_L0T_PRED8x8] = ff_pred8x8_l0t_dc_neon_10; + h->pred8x8[ALZHEIMER_DC_0LT_PRED8x8] = ff_pred8x8_0lt_dc_neon_10; + h->pred8x8[ALZHEIMER_DC_L00_PRED8x8] = ff_pred8x8_l00_dc_neon_10; + h->pred8x8[ALZHEIMER_DC_0L0_PRED8x8] = ff_pred8x8_0l0_dc_neon_10; + } + } + h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_neon_10; h->pred16x16[VERT_PRED8x8 ] = ff_pred16x16_vert_neon_10; h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_hor_neon_10; h->pred16x16[TOP_DC_PRED8x8 ] = ff_pred16x16_top_dc_neon_10; + if (codec_id != AV_CODEC_ID_SVQ3 && codec_id != AV_CODEC_ID_RV40 && + codec_id != AV_CODEC_ID_VP7 && codec_id != AV_CODEC_ID_VP8) + h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_plane_neon_10; } } diff --git a/libavcodec/aarch64/h264pred_neon.S b/libavcodec/aarch64/h264pred_neon.S index e40bdc8d53..735d20b49c 100644 --- a/libavcodec/aarch64/h264pred_neon.S +++ b/libavcodec/aarch64/h264pred_neon.S @@ -360,16 +360,24 @@ function ff_pred8x8_0l0_dc_neon, export=1 b .L_pred8x8_dc_end endfunc +const p16weight_10, align=4 + .word 1,2,3,4,5,6,7,8 +endconst +const p16weight_10_new, align=4 + .word 0,1,2,3,4,5,6,7 +endconst +const p8weight_10, align=4 + .word 1,2,3,4,1,2,3,4 +endconst + .macro ldcol.16 rd, rs, rt, n=4, hi=0 -.if \n >= 4 || \hi == 0 +.if \n >= 4 
&& \hi == 0 ld1 {\rd\().h}[0], [\rs], \rt ld1 {\rd\().h}[1], [\rs], \rt -.endif -.if \n >= 4 || \hi == 1 ld1 {\rd\().h}[2], [\rs], \rt ld1 {\rd\().h}[3], [\rs], \rt .endif -.if \n == 8 +.if \n == 8 || \hi == 1 ld1 {\rd\().h}[4], [\rs], \rt ld1 {\rd\().h}[5], [\rs], \rt ld1 {\rd\().h}[6], [\rs], \rt @@ -467,3 +475,356 @@ function ff_pred16x16_vert_neon_10, export=1 b.ne 1b ret endfunc + +function ff_pred16x16_plane_neon_10, export=1 + sub x3, x0, x1 + movrel x4, p16weight_10 + add x2, x3, #16 + sub x3, x3, #2 + + ld1 {v0.8h}, [x3] + ld1 {v2.8h}, [x2] + ldcol.16 v1, x3, x1, 8 + add x3, x3, x1 + ldcol.16 v3, x3, x1, 8 + + rev64 v16.8h, v0.8h + trn1 v0.2d, v16.2d, v16.2d + trn2 v0.2d, v16.2d, v0.2d + + rev64 v16.8h, v1.8h + trn1 v1.2d, v16.2d, v16.2d + trn2 v1.2d, v16.2d, v1.2d + + uaddl v7.4s, v2.4h, v3.4h + uaddl2 v16.4s, v2.8h, v3.8h + usubl v4.4s, v2.4h, v0.4h + usubl2 v5.4s, v2.8h, v0.8h + usubl v2.4s, v3.4h, v1.4h + usubl2 v3.4s, v3.8h, v1.8h + + ld1 {v0.4s, v1.4s}, [x4] + + mul v4.4s, v4.4s, v0.4s + mul v5.4s, v5.4s, v1.4s + mul v2.4s, v2.4s, v0.4s + mul v3.4s, v3.4s, v1.4s + + addp v4.4s, v4.4s, v5.4s + addp v2.4s, v2.4s, v3.4s + + addp v4.4s, v4.4s, v4.4s + addp v2.4s, v2.4s, v2.4s + + addp v4.2s, v4.2s, v4.2s + addp v2.2s, v2.2s, v2.2s + mov v2.s[0], v4.s[0] // H and V + + sshll v3.2d, v2.2s, #2 + saddw v2.2d, v3.2d, v2.2s + rshrn v4.2s, v2.2d, #6 + dup v5.4s, v4.s[1] + + add v2.2s, v4.2s, v5.2s + shl v3.4s, v2.4s, #3 + + mov w2, v7.s[0] + mov v7.s[0], v16.s[3] + mov v16.s[3], w2 + + sub v3.4s, v3.4s, v2.4s // 7 * (b + c) + add v7.4s, v7.4s, v0.4s + + shl v2.4s, v7.4s, #4 + sub v2.4s, v2.4s, v3.4s + shl v3.4s, v4.4s, #4 + + movrel x5, p16weight_10_new + ld1 {v0.4s, v1.4s}, [x5] + + sub v6.4s, v5.4s, v3.4s + mul v0.4s, v0.4s, v4.s[0] + mul v1.4s, v1.4s, v4.s[0] + dup v16.4s, v2.s[0] + dup v17.4s, v2.s[0] + dup v18.4s, v4.s[0] + dup v19.4s, v4.s[0] + dup v20.4s, v6.s[0] + dup v21.4s, v6.s[0] + shl v18.4s, v18.4s, #3 + shl v19.4s, v19.4s, #3 + add v16.4s, 
v16.4s, v0.4s + add v17.4s, v17.4s, v1.4s + add v20.4s, v20.4s, v18.4s + add v21.4s, v21.4s, v19.4s + mov w3, #16 + mov w2, #1023 // for clipping + dup v3.8h, w2 +1: + sqshrun v0.4h, v16.4s, #5 + sqshrun2 v0.8h, v17.4s, #5 + + add v16.4s, v16.4s, v18.4s + add v17.4s, v17.4s, v19.4s + + sqshrun v1.4h, v16.4s, #5 + sqshrun2 v1.8h, v17.4s, #5 + + add v16.4s, v16.4s, v20.4s + add v17.4s, v17.4s, v21.4s + + subs w3, w3, #1 + + smin v0.8h, v0.8h, v3.8h + smin v1.8h, v1.8h, v3.8h + st1 {v0.8h, v1.8h}, [x0], x1 + b.ne 1b + ret +endfunc + + +function ff_pred8x8_hor_neon_10, export=1 + sub x2, x0, #2 + mov w3, #8 + +1: ld1r {v0.8h}, [x2], x1 + subs w3, w3, #1 + st1 {v0.8h}, [x0], x1 + b.ne 1b + ret +endfunc + +function ff_pred8x8_vert_neon_10, export=1 + sub x2, x0, x1 + lsl x1, x1, #1 + + ld1 {v0.8h}, [x2], x1 + mov w3, #4 +1: subs w3, w3, #1 + st1 {v0.8h}, [x0], x1 + st1 {v0.8h}, [x2], x1 + b.ne 1b + ret +endfunc + +function ff_pred8x8_plane_neon_10, export=1 + sub x3, x0, x1 + movrel x4, p8weight_10 + movrel x5, p16weight_10 + add x2, x3, #8 + sub x3, x3, #2 + + ld1 {v0.d}[0], [x3] + ld1 {v2.d}[0], [x2], x1 + ldcol.16 v0, x3, x1, hi=1 + add x3, x3, x1 + ldcol.16 v3, x3, x1, 4 + + uaddl v7.4s, v2.4h, v3.4h + rev64 v0.8h, v0.8h + trn1 v2.2d, v2.2d, v3.2d + + usubl2 v3.4s, v2.8h, v0.8h + usubl v2.4s, v2.4h, v0.4h + + ld1 {v6.4s}, [x4] + mul v2.4s, v2.4s, v6.4s + mul v3.4s, v3.4s, v6.4s + ld1 {v0.4s}, [x5] + + saddlp v2.2d, v2.4s + saddlp v3.2d, v3.4s + addp v2.2d, v2.2d, v2.2d + addp v3.2d, v3.2d, v3.2d + mov v2.d[1], v3.d[0] + shl v3.2d, v2.2d, #4 + add v2.2d, v3.2d, v2.2d + rshrn v5.2s, v2.2d, #5 + addp v2.4s, v5.4s, v5.4s + shl v3.4s, v2.4s, #1 + add v3.4s, v3.4s, v2.4s + + rev64 v1.4s, v7.4s + trn1 v7.2d, v1.2d, v1.2d + trn2 v7.2d, v1.2d, v7.2d + + + add v7.4s, v7.4s, v0.4s + shl v2.4s, v7.4s, #4 + sub v2.4s, v2.4s, v3.4s + + movrel x5, p16weight_10_new + ld1 {v6.4s, v7.4s}, [x5] + + mul v6.4s, v6.4s, v5.s[0] + mul v7.4s, v7.4s, v5.s[0] + + dup v1.4s, v2.s[0] + dup 
v2.4s, v2.s[0] + dup v3.4s, v5.s[1] + + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + + mov w3, #8 + mov w2, #1023 // for clipping + dup v4.8h, w2 +1: + sqshrun v0.4h, v1.4s, #5 + sqshrun2 v0.8h, v2.4s, #5 + + subs w3, w3, #1 + + add v1.4s, v1.4s, v3.4s + add v2.4s, v2.4s, v3.4s + + smin v0.8h, v0.8h, v4.8h + st1 {v0.8h}, [x0], x1 + b.ne 1b + ret +endfunc + +function ff_pred8x8_128_dc_neon_10, export=1 + movi v0.8h, #2, lsl #8 // 512, 1 << (bit_depth - 1) + movi v1.8h, #2, lsl #8 + b .L_pred8x8_dc_10_end +endfunc + +function ff_pred8x8_top_dc_neon_10, export=1 + sub x2, x0, x1 + ld1 {v0.8h}, [x2] + + uaddlp v0.4s, v0.8h + addp v0.4s, v0.4s, v0.4s + zip1 v0.4s, v0.4s, v0.4s + rshrn v2.4h, v0.4s, #2 + zip1 v0.8h, v2.8h, v2.8h + zip1 v1.8h, v2.8h, v2.8h + b .L_pred8x8_dc_10_end +endfunc + +function ff_pred8x8_left_dc_neon_10, export=1 + sub x2, x0, #2 + ldcol.16 v0, x2, x1, 8 + + uaddlp v0.4s, v0.8h + addp v0.4s, v0.4s, v0.4s + rshrn v2.4h, v0.4s, #2 + dup v1.8h, v2.h[1] + dup v0.8h, v2.h[0] + b .L_pred8x8_dc_10_end +endfunc + +function ff_pred8x8_dc_neon_10, export=1 + sub x2, x0, x1 + sub x3, x0, #2 + + ld1 {v0.8h}, [x2] + ldcol.16 v1, x3, x1, 8 + + uaddlp v0.4s, v0.8h + uaddlp v1.4s, v1.8h + trn1 v2.2d, v0.2d, v1.2d + trn2 v3.2d, v0.2d, v1.2d + addp v4.4s, v2.4s, v3.4s + addp v5.4s, v4.4s, v4.4s + rshrn v6.4h, v5.4s, #3 + rshrn v7.4h, v4.4s, #2 + dup v0.8h, v6.h[0] + dup v2.8h, v7.h[2] + dup v1.8h, v7.h[3] + dup v3.8h, v6.h[1] + zip1 v0.2d, v0.2d, v2.2d + zip1 v1.2d, v1.2d, v3.2d +.L_pred8x8_dc_10_end: + mov w3, #4 + add x2, x0, x1, lsl #2 + +6: st1 {v0.8h}, [x0], x1 + subs w3, w3, #1 + st1 {v1.8h}, [x2], x1 + b.ne 6b + ret +endfunc + +function ff_pred8x8_l0t_dc_neon_10, export=1 + sub x2, x0, x1 + sub x3, x0, #2 + + ld1 {v0.8h}, [x2] + ldcol.16 v1, x3, x1, 4 + + uaddlp v0.4s, v0.8h + uaddlp v1.2s, v1.4h + addp v0.4s, v0.4s, v0.4s + addp v1.2s, v1.2s, v1.2s + add v1.2s, v1.2s, v0.2s + + rshrn v2.4h, v0.4s, #2 + rshrn v3.4h, v1.4s, #3 // the pred4x4 part + + 
dup v4.4h, v3.h[0] + dup v5.4h, v2.h[0] + dup v6.4h, v2.h[1] + + zip1 v0.2d, v4.2d, v6.2d + zip1 v1.2d, v5.2d, v6.2d + b .L_pred8x8_dc_10_end +endfunc + +function ff_pred8x8_l00_dc_neon_10, export=1 + sub x2, x0, #2 + + ldcol.16 v0, x2, x1, 4 + + uaddlp v0.2s, v0.4h + addp v0.2s, v0.2s, v0.2s + rshrn v0.4h, v0.4s, #2 + + movi v1.8h, #2, lsl #8 // 512 + dup v0.8h, v0.h[0] + b .L_pred8x8_dc_10_end +endfunc + +function ff_pred8x8_0lt_dc_neon_10, export=1 + add x3, x0, x1, lsl #2 + sub x2, x0, x1 + sub x3, x3, #2 + + ld1 {v0.8h}, [x2] + ldcol.16 v1, x3, x1, hi=1 + + uaddlp v0.4s, v0.8h + uaddlp v1.4s, v1.8h + addp v0.4s, v0.4s, v0.4s + addp v1.4s, v1.4s, v1.4s + zip1 v0.2d, v0.2d, v1.2d + add v1.2s, v0.2s, v1.2s + + rshrn v2.4h, v0.4s, #2 + rshrn v3.4h, v1.4s, #3 + + dup v4.4h, v2.h[0] + dup v5.4h, v2.h[3] + dup v6.4h, v2.h[1] + dup v7.4h, v3.h[1] + + zip1 v0.2d, v4.2d, v6.2d + zip1 v1.2d, v5.2d, v7.2d + b .L_pred8x8_dc_10_end +endfunc + +function ff_pred8x8_0l0_dc_neon_10, export=1 + add x2, x0, x1, lsl #2 + sub x2, x2, #2 + + ldcol.16 v1, x2, x1, 4 + + uaddlp v2.4s, v1.8h + addp v2.4s, v2.4s, v2.4s + rshrn v1.4h, v2.4s, #2 + + movi v0.8h, #2, lsl #8 // 512 + dup v1.8h, v1.h[0] + b .L_pred8x8_dc_10_end +endfunc -- 2.32.0 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".