On Fri, 16 Jul 2021, Mikhail Nitenko wrote:
Benchmarks:                       A53      A72
pred8x8_dc_10_c:                 64.2     49.5
pred8x8_dc_10_neon:              62.7     54.5
pred8x8_dc_128_10_c:             26.0     15.5
pred8x8_dc_128_10_neon:          28.2     16.0
pred8x8_horizontal_10_c:         60.0     27.7
pred8x8_horizontal_10_neon:      34.2     27.7
pred8x8_left_dc_10_c:            42.5     27.5
pred8x8_left_dc_10_neon:         50.7     41.2
pred8x8_mad_cow_dc_0l0_10_c:     55.7     37.2
pred8x8_mad_cow_dc_0l0_10_neon:  46.0     36.5
pred8x8_mad_cow_dc_0lt_10_c:     89.2     67.0
pred8x8_mad_cow_dc_0lt_10_neon:  50.2     46.7
pred8x8_mad_cow_dc_l0t_10_c:     75.5     51.0
pred8x8_mad_cow_dc_l0t_10_neon:  49.7     44.7
pred8x8_mad_cow_dc_l00_10_c:     58.0     38.0
pred8x8_mad_cow_dc_l00_10_neon:  41.0     37.5
pred8x8_plane_10_c:             347.5    288.7
pred8x8_plane_10_neon:          150.2    108.5
pred8x8_top_dc_10_c:             44.5     30.5
pred8x8_top_dc_10_neon:          39.7     31.5
pred8x8_vertical_10_c:           27.5     16.0
pred8x8_vertical_10_neon:        27.7     15.0
pred16x16_plane_10_c:          1245.5   1069.7
pred16x16_plane_10_neon:        349.0    208.7
Signed-off-by: Mikhail Nitenko <mnite...@gmail.com>
---
libavcodec/aarch64/h264pred_init.c | 40 +++-
libavcodec/aarch64/h264pred_neon.S | 369 ++++++++++++++++++++++++++++-
2 files changed, 402 insertions(+), 7 deletions(-)
diff --git a/libavcodec/aarch64/h264pred_neon.S b/libavcodec/aarch64/h264pred_neon.S
index e40bdc8d53..735d20b49c 100644
--- a/libavcodec/aarch64/h264pred_neon.S
+++ b/libavcodec/aarch64/h264pred_neon.S
@@ -467,3 +475,356 @@ function ff_pred16x16_vert_neon_10, export=1
b.ne 1b
ret
endfunc
+
+function ff_pred16x16_plane_neon_10, export=1
+ sub x3, x0, x1
+ movrel x4, p16weight_10
+ add x2, x3, #16
+ sub x3, x3, #2
+
+ ld1 {v0.8h}, [x3]
+ ld1 {v2.8h}, [x2]
+ ldcol.16 v1, x3, x1, 8
+ add x3, x3, x1
+ ldcol.16 v3, x3, x1, 8
+
+ rev64 v16.8h, v0.8h
+ trn1 v0.2d, v16.2d, v16.2d
+ trn2 v0.2d, v16.2d, v0.2d
+
+ rev64 v16.8h, v1.8h
+ trn1 v1.2d, v16.2d, v16.2d
+ trn2 v1.2d, v16.2d, v1.2d
+
Umm, these trn1+trn2 are really confusing to try to figure out here. Do
you want to swap the two halves of the register, to compensate for not
having a rev128? You can do that with "ext v0.16b, v0.16b, v0.16b, #8"
instead of these two instructions. (And it's better for pipelining to do
two rev64 followed by two ext, instead of interleaving them tightly.)
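Something like this (untested sketch) is what I mean; two rev64 followed
by two ext, which reverses all 8 elements of each register:

        rev64           v0.8h,  v0.8h
        rev64           v1.8h,  v1.8h
        ext             v0.16b, v0.16b, v0.16b, #8      // swap the 64-bit halves
        ext             v1.16b, v1.16b, v1.16b, #8      // full 128-bit element reversal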
+ uaddl v7.4s, v2.4h, v3.4h
I don't think you need to go to 32 bit here? If you add two 10-bit pixels
together, the sum (11 bits) still fits in 16-bit elements just fine. (I
haven't checked how large the intermediates become further into this
calculation, i.e. whether you need to go to 32 bit somewhere close to the
end of the calculation or if you can do it all in 16 bit.)
The same applies to the 8x8 version below too.
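E.g. the initial sums/differences could look something like this (untested,
and it leaves open whether the multiplies and accumulations further down
still fit in 16 bit):

        add             v7.8h,  v2.8h,  v3.8h   // 10 bit + 10 bit = 11 bit, fits in 16-bit lanes
        sub             v4.8h,  v2.8h,  v0.8h   // the differences fit in signed 16 bit too
        sub             v2.8h,  v3.8h,  v1.8h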
+ uaddl2 v16.4s, v2.8h, v3.8h
+ usubl v4.4s, v2.4h, v0.4h
+ usubl2 v5.4s, v2.8h, v0.8h
+ usubl v2.4s, v3.4h, v1.4h
+ usubl2 v3.4s, v3.8h, v1.8h
+
+ ld1 {v0.4s, v1.4s}, [x4]
+
+ mul v4.4s, v4.4s, v0.4s
+ mul v5.4s, v5.4s, v1.4s
+ mul v2.4s, v2.4s, v0.4s
+ mul v3.4s, v3.4s, v1.4s
+
+ addp v4.4s, v4.4s, v5.4s
+ addp v2.4s, v2.4s, v3.4s
+
+ addp v4.4s, v4.4s, v4.4s
+ addp v2.4s, v2.4s, v2.4s
+
+ addp v4.2s, v4.2s, v4.2s
+ addp v2.2s, v2.2s, v2.2s
+ mov v2.s[0], v4.s[0] // H and V
I haven't really studied this in detail, but why do you need to do
elementwise fiddling here, when it isn't needed in the 8-bit version of
the function?
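For reference, the two sums can end up interleaved in one register with
pairwise adds alone, replacing the addp/mov sequence above with something
like this (untested):

        addp            v4.4s,  v4.4s,  v5.4s
        addp            v2.4s,  v2.4s,  v3.4s
        addp            v2.4s,  v4.4s,  v2.4s   // partial H sums in lanes 0-1, partial V sums in lanes 2-3
        addp            v2.4s,  v2.4s,  v2.4s   // lane 0 = H, lane 1 = V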
+
+ sshll v3.2d, v2.2s, #2
+ saddw v2.2d, v3.2d, v2.2s
+ rshrn v4.2s, v2.2d, #6
+ dup v5.4s, v4.s[1]
+
+ add v2.2s, v4.2s, v5.2s
+ shl v3.4s, v2.4s, #3
+
+ mov w2, v7.s[0]
+ mov v7.s[0], v16.s[3]
+ mov v16.s[3], w2
Same here, there's no corresponding elementwise fiddling in the 8-bit
version, so I don't think it should be needed here either?
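As far as I can see, only lane 0 of v7 is consumed afterwards (only v2.s[0]
gets dup'ed further down), and v16 is overwritten before it is read again,
so a single instruction along these lines might do (untested):

        dup             v7.4s,  v16.s[3]        // top-right + bottom-left sum into lane 0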
+
+ sub v3.4s, v3.4s, v2.4s // 7 * (b + c)
+ add v7.4s, v7.4s, v0.4s
+
+ shl v2.4s, v7.4s, #4
+ sub v2.4s, v2.4s, v3.4s
+ shl v3.4s, v4.4s, #4
+
+ movrel x5, p16weight_10_new
+ ld1 {v0.4s, v1.4s}, [x5]
The 8-bit version uses an "ext; mov v0.h[0], wzr" sequence instead of loading
a whole new set of constants here. Would that work here too, or have you
lost the original constant?
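If p16weight_10_new is just p16weight_10 shifted down one lane with a zero
in front (I'm guessing at the constants here), the weights still sitting in
v0/v1 could be reused, something like (untested):

        ext             v1.16b, v0.16b, v1.16b, #12     // upper four weights
        ext             v0.16b, v0.16b, v0.16b, #12     // rotate; lane 0 is a don't-care
        mov             v0.s[0],  wzr                   // zero in front of the lower weights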
+
+ sub v6.4s, v5.4s, v3.4s
+ mul v0.4s, v0.4s, v4.s[0]
+ mul v1.4s, v1.4s, v4.s[0]
+ dup v16.4s, v2.s[0]
+ dup v17.4s, v2.s[0]
+ dup v18.4s, v4.s[0]
+ dup v19.4s, v4.s[0]
+ dup v20.4s, v6.s[0]
+ dup v21.4s, v6.s[0]
+ shl v18.4s, v18.4s, #3
+ shl v19.4s, v19.4s, #3
+ add v16.4s, v16.4s, v0.4s
+ add v17.4s, v17.4s, v1.4s
+ add v20.4s, v20.4s, v18.4s
+ add v21.4s, v21.4s, v19.4s
+ mov w3, #16
+ mov w2, #1023 // for clipping
+ dup v3.8h, w2
Instead of mov+dup, you can load this constant with "mvni v3.8h, #0xFC,
lsl #8", which is equivalent to loading 0x3ff.
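I.e. something like:

        mvni            v3.8h,  #0xFC,  lsl #8          // each lane = ~0xFC00 = 0x03FF = 1023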
+1:
+ sqshrun v0.4h, v16.4s, #5
+ sqshrun2 v0.8h, v17.4s, #5
+
+ add v16.4s, v16.4s, v18.4s
+ add v17.4s, v17.4s, v19.4s
+
+ sqshrun v1.4h, v16.4s, #5
+ sqshrun2 v1.8h, v17.4s, #5
+
+ add v16.4s, v16.4s, v20.4s
+ add v17.4s, v17.4s, v21.4s
+
+ subs w3, w3, #1
+
+ smin v0.8h, v0.8h, v3.8h
+ smin v1.8h, v1.8h, v3.8h
+ st1 {v0.8h, v1.8h}, [x0], x1
+ b.ne 1b
+ ret
+endfunc
+
+
+function ff_pred8x8_hor_neon_10, export=1
+ sub x2, x0, #2
+ mov w3, #8
+
+1: ld1r {v0.8h}, [x2], x1
+ subs w3, w3, #1
+ st1 {v0.8h}, [x0], x1
+ b.ne 1b
+ ret
+endfunc
+
+function ff_pred8x8_vert_neon_10, export=1
+ sub x2, x0, x1
+ lsl x1, x1, #1
+
+ ld1 {v0.8h}, [x2], x1
+ mov w3, #4
+1: subs w3, w3, #1
+ st1 {v0.8h}, [x0], x1
+ st1 {v0.8h}, [x2], x1
+ b.ne 1b
+ ret
+endfunc
+
+function ff_pred8x8_plane_neon_10, export=1
+ sub x3, x0, x1
+ movrel x4, p8weight_10
+ movrel x5, p16weight_10
+ add x2, x3, #8
+ sub x3, x3, #2
+
+ ld1 {v0.d}[0], [x3]
+ ld1 {v2.d}[0], [x2], x1
+ ldcol.16 v0, x3, x1, hi=1
+ add x3, x3, x1
+ ldcol.16 v3, x3, x1, 4
+
+ uaddl v7.4s, v2.4h, v3.4h
+ rev64 v0.8h, v0.8h
+ trn1 v2.2d, v2.2d, v3.2d
+
+ usubl2 v3.4s, v2.8h, v0.8h
+ usubl v2.4s, v2.4h, v0.4h
+
+ ld1 {v6.4s}, [x4]
+ mul v2.4s, v2.4s, v6.4s
+ mul v3.4s, v3.4s, v6.4s
+ ld1 {v0.4s}, [x5]
+
+ saddlp v2.2d, v2.4s
+ saddlp v3.2d, v3.4s
+ addp v2.2d, v2.2d, v2.2d
+ addp v3.2d, v3.2d, v3.2d
+ mov v2.d[1], v3.d[0]
+ shl v3.2d, v2.2d, #4
+ add v2.2d, v3.2d, v2.2d
+ rshrn v5.2s, v2.2d, #5
+ addp v2.4s, v5.4s, v5.4s
+ shl v3.4s, v2.4s, #1
+ add v3.4s, v3.4s, v2.4s
+
+ rev64 v1.4s, v7.4s
+ trn1 v7.2d, v1.2d, v1.2d
+ trn2 v7.2d, v1.2d, v7.2d
+
+
+ add v7.4s, v7.4s, v0.4s
+ shl v2.4s, v7.4s, #4
+ sub v2.4s, v2.4s, v3.4s
+
+ movrel x5, p16weight_10_new
+ ld1 {v6.4s, v7.4s}, [x5]
+
+ mul v6.4s, v6.4s, v5.s[0]
+ mul v7.4s, v7.4s, v5.s[0]
+
+ dup v1.4s, v2.s[0]
+ dup v2.4s, v2.s[0]
+ dup v3.4s, v5.s[1]
+
+ add v1.4s, v1.4s, v6.4s
+ add v2.4s, v2.4s, v7.4s
+
+ mov w3, #8
+ mov w2, #1023 // for clipping
+ dup v4.8h, w2
+1:
+ sqshrun v0.4h, v1.4s, #5
+ sqshrun2 v0.8h, v2.4s, #5
+
+ subs w3, w3, #1
+
+ add v1.4s, v1.4s, v3.4s
+ add v2.4s, v2.4s, v3.4s
+
+ smin v0.8h, v0.8h, v4.8h
+ st1 {v0.8h}, [x0], x1
+ b.ne 1b
+ ret
+endfunc
Some of the same comments as for the 16x16 function above apply to this
function too.
+
+function ff_pred8x8_128_dc_neon_10, export=1
+ movi v0.8h, #2, lsl #8 // 512, 1 << (bit_depth - 1)
+ movi v1.8h, #2, lsl #8
+ b .L_pred8x8_dc_10_end
+endfunc
+
+function ff_pred8x8_top_dc_neon_10, export=1
+ sub x2, x0, x1
+ ld1 {v0.8h}, [x2]
+
+ uaddlp v0.4s, v0.8h
No need to go to 32 bit here; the same applies to most of the other
functions below too.
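E.g. here, something like this should be enough (untested):

        addp            v0.8h,  v0.8h,  v0.8h           // pairwise sums of 10-bit pixels fit in 16-bit lanes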
// Martin