From: Martin Storsjö <mar...@martin.st>

Before:       Cortex A53    A72    A73
vsse_5_neon:        74.7   31.5   26.0
After:
vsse_5_neon:        62.7   32.5   25.7
---
 libavcodec/aarch64/me_cmp_neon.S | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S index 61e4f68335..d8a18cd4b8 100644 --- a/libavcodec/aarch64/me_cmp_neon.S +++ b/libavcodec/aarch64/me_cmp_neon.S @@ -1113,11 +1113,11 @@ function vsse_intra8_neon, export=1 // x3 ptrdiff_t stride // w4 int h + sub w4, w4, #1 // we need to make h-1 iterations ld1 {v0.8b}, [x1], x3 + cmp w4, #3 movi v16.4s, #0 - sub w4, w4, #1 // we need to make h-1 iterations - cmp w4, #3 b.lt 2f 1: @@ -1127,13 +1127,13 @@ function vsse_intra8_neon, export=1 ld1 {v2.8b}, [x1], x3 uabd v30.8b, v0.8b, v1.8b ld1 {v3.8b}, [x1], x3 - umull v29.8h, v30.8b, v30.8b uabd v27.8b, v1.8b, v2.8b - uadalp v16.4s, v29.8h - umull v26.8h, v27.8b, v27.8b + umull v29.8h, v30.8b, v30.8b uabd v25.8b, v2.8b, v3.8b - uadalp v16.4s, v26.8h + umull v26.8h, v27.8b, v27.8b + uadalp v16.4s, v29.8h umull v24.8h, v25.8b, v25.8b + uadalp v16.4s, v26.8h sub w4, w4, #3 uadalp v16.4s, v24.8h cmp w4, #3 -- 2.37.1 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".