The mismatch between the NEON and C functions can be reproduced on AArch64 devices using the following bitstream and command lines:
wget https://streams.videolan.org/ffmpeg/incoming/replay_intra_pred_16x16.h264
./ffmpeg -cpuflags 0 -threads 1 -i replay_intra_pred_16x16.h264 -f framemd5 -y md5_ref
./ffmpeg -threads 1 -i replay_intra_pred_16x16.h264 -f framemd5 -y md5_neon

Signed-off-by: Bin Peng <[email protected]>
---
 libavcodec/aarch64/h264pred_neon.S | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/libavcodec/aarch64/h264pred_neon.S b/libavcodec/aarch64/h264pred_neon.S
index d0999938ef..795d2ce540 100644
--- a/libavcodec/aarch64/h264pred_neon.S
+++ b/libavcodec/aarch64/h264pred_neon.S
@@ -489,10 +489,10 @@ function ff_pred16x16_plane_neon_10, export=1
         mul             v2.8h, v2.8h, v0.8h
         mul             v3.8h, v3.8h, v0.8h
         addp            v2.8h, v2.8h, v3.8h
-        addp            v2.8h, v2.8h, v2.8h
-        addp            v2.4h, v2.4h, v2.4h
-        sshll           v3.4s, v2.4h, #2
-        saddw           v2.4s, v3.4s, v2.4h
+        saddlp          v2.4s, v2.8h
+        addp            v2.4s, v2.4s, v2.4s
+        shl             v3.4s, v2.4s, #2
+        add             v2.4s, v3.4s, v2.4s
         rshrn           v4.4h, v2.4s, #6
         trn2            v5.4h, v4.4h, v4.4h
         add             v2.4h, v4.4h, v5.4h
@@ -506,14 +506,13 @@ function ff_pred16x16_plane_neon_10, export=1
         sxtl            v6.4s, v5.4h               // c
         mov             v0.h[0], wzr
-        mul             v0.8h, v0.8h, v4.h[0]
         dup             v16.4s, v2.s[0]
         dup             v17.4s, v2.s[0]
         dup             v2.8h, v4.h[0]             // b
         dup             v3.4s, v6.s[0]             // c
         sshll           v2.4s, v2.4h, #3           // b * 8
-        saddw           v16.4s, v16.4s, v0.4h
-        saddw2          v17.4s, v17.4s, v0.8h
+        smlal           v16.4s, v0.4h, v4.h[0]
+        smlal2          v17.4s, v0.8h, v4.h[0]
         sub             v3.4s, v3.4s, v2.4s
         mov             w3, #16
-- 
2.25.1

_______________________________________________
ffmpeg-devel mailing list -- [email protected]
To unsubscribe send an email to [email protected]
