The branch, master has been updated
via f1a155d9754f2f38da121b8935ea1a5483021a5a (commit)
from 0bd5a7d3719456f049f4d29abb313968ccacb28c (commit)
- Log -----------------------------------------------------------------
commit f1a155d9754f2f38da121b8935ea1a5483021a5a
Author: Krzysztof Pyrkosz <[email protected]>
AuthorDate: Fri Sep 5 19:52:11 2025 +0200
Commit: Martin Storsjö <[email protected]>
CommitDate: Sun Sep 21 19:39:27 2025 +0000
avcodec/aarch64/vvc: Optimize dmvr_hv_10
Before and after on A53:
Before:
dmvr_hv_10_12x20_neon: 1838.2 ( 3.02x)
dmvr_hv_10_20x12_neon: 1330.2 ( 1.83x)
dmvr_hv_10_20x20_neon: 2148.2 ( 1.85x)
dmvr_hv_12_12x20_neon: 1839.2 ( 3.02x)
dmvr_hv_12_20x12_neon: 1330.6 ( 1.83x)
dmvr_hv_12_20x20_neon: 2147.2 ( 1.85x)
After:
dmvr_hv_10_12x20_neon: 1755.0 ( 3.17x)
dmvr_hv_10_20x12_neon: 1165.8 ( 2.09x)
dmvr_hv_10_20x20_neon: 1876.1 ( 2.12x)
dmvr_hv_12_12x20_neon: 1754.4 ( 3.17x)
dmvr_hv_12_20x12_neon: 1167.8 ( 2.09x)
dmvr_hv_12_20x20_neon: 1878.8 ( 2.12x)
diff --git a/libavcodec/aarch64/vvc/inter.S b/libavcodec/aarch64/vvc/inter.S
index 01d2ff155c..79ff720cdd 100644
--- a/libavcodec/aarch64/vvc/inter.S
+++ b/libavcodec/aarch64/vvc/inter.S
@@ -599,18 +599,13 @@ function ff_vvc_dmvr_hv_8_neon, export=1
endfunc
function ff_vvc_dmvr_hv_12_neon, export=1
- movi v29.4s, #(12 - 6)
- movi v30.4s, #(1 << (12 - 7)) // offset1
+ mvni v29.4s, #(12 - 6 - 1)
b 0f
endfunc
function ff_vvc_dmvr_hv_10_neon, export=1
- movi v29.4s, #(10 - 6)
- movi v30.4s, #(1 << (10 - 7)) // offset1
+ mvni v29.4s, #(10 - 6 - 1)
0:
- movi v31.4s, #8 // offset2
- neg v29.4s, v29.4s
-
sub sp, sp, #(VVC_MAX_PB_SIZE * 4)
movrel x9, X(ff_vvc_inter_luma_dmvr_filters)
@@ -626,7 +621,6 @@ function ff_vvc_dmvr_hv_10_neon, export=1
add x12, x9, my, lsl #1
ldrb w10, [x12]
ldrb w11, [x12, #1]
- sxtw x6, w6
dup v2.8h, w10 // filter_y[0]
dup v3.8h, w11 // filter_y[1]
@@ -635,7 +629,7 @@ function ff_vvc_dmvr_hv_10_neon, export=1
mov w10, #0 // start filter_y or not
add height, height, #1
sub dst, dst, #(VVC_MAX_PB_SIZE * 2)
- sub src_stride, src_stride, x6, lsl #1
+ sub src_stride, src_stride, w6, sxtw #1
cset w15, gt // width > 16
1:
mov x12, tmp0
@@ -656,14 +650,10 @@ function ff_vvc_dmvr_hv_10_neon, export=1
umlal v18.4s, v17.4h, v1.4h
umlal2 v19.4s, v17.8h, v1.8h
- add v4.4s, v4.4s, v30.4s
- add v5.4s, v5.4s, v30.4s
- add v18.4s, v18.4s, v30.4s
- add v19.4s, v19.4s, v30.4s
- ushl v4.4s, v4.4s, v29.4s
- ushl v5.4s, v5.4s, v29.4s
- ushl v18.4s, v18.4s, v29.4s
- ushl v19.4s, v19.4s, v29.4s
+ urshl v4.4s, v4.4s, v29.4s
+ urshl v5.4s, v5.4s, v29.4s
+ urshl v18.4s, v18.4s, v29.4s
+ urshl v19.4s, v19.4s, v29.4s
uqxtn v6.4h, v4.4s
uqxtn2 v6.8h, v5.4s
uqxtn v7.4h, v18.4s
@@ -681,18 +671,10 @@ function ff_vvc_dmvr_hv_10_neon, export=1
umlal2 v18.4s, v6.8h, v3.8h
umlal v19.4s, v7.4h, v3.4h
umlal2 v20.4s, v7.8h, v3.8h
- add v17.4s, v17.4s, v31.4s
- add v18.4s, v18.4s, v31.4s
- add v19.4s, v19.4s, v31.4s
- add v20.4s, v20.4s, v31.4s
- ushr v17.4s, v17.4s, #4
- ushr v18.4s, v18.4s, #4
- ushr v19.4s, v19.4s, #4
- ushr v20.4s, v20.4s, #4
- uqxtn v6.4h, v17.4s
- uqxtn2 v6.8h, v18.4s
- uqxtn v7.4h, v19.4s
- uqxtn2 v7.8h, v20.4s
+ uqrshrn v6.4h, v17.4s, #4
+ uqrshrn2 v6.8h, v18.4s, #4
+ uqrshrn v7.4h, v19.4s, #4
+ uqrshrn2 v7.8h, v20.4s, #4
stp q6, q7, [x14], #32
b 3f
2:
@@ -704,10 +686,8 @@ function ff_vvc_dmvr_hv_10_neon, export=1
umlal v4.4s, v7.4h, v1.4h
umlal2 v5.4s, v7.8h, v1.8h
- add v4.4s, v4.4s, v30.4s
- add v5.4s, v5.4s, v30.4s
- ushl v4.4s, v4.4s, v29.4s
- ushl v5.4s, v5.4s, v29.4s
+ urshl v4.4s, v4.4s, v29.4s
+ urshl v5.4s, v5.4s, v29.4s
uqxtn v6.4h, v4.4s
uqxtn2 v6.8h, v5.4s
str q6, [x13], #16
@@ -719,10 +699,8 @@ function ff_vvc_dmvr_hv_10_neon, export=1
umull2 v18.4s, v16.8h, v2.8h
umlal v17.4s, v6.4h, v3.4h
umlal2 v18.4s, v6.8h, v3.8h
- add v17.4s, v17.4s, v31.4s
- add v18.4s, v18.4s, v31.4s
- ushr v17.4s, v17.4s, #4
- ushr v18.4s, v18.4s, #4
+ urshr v17.4s, v17.4s, #4
+ urshr v18.4s, v18.4s, #4
uqxtn v16.4h, v17.4s
uqxtn2 v16.8h, v18.4s
str q16, [x14], #16
@@ -731,8 +709,7 @@ function ff_vvc_dmvr_hv_10_neon, export=1
ldr d6, [src], #8
umull v4.4s, v7.4h, v1.4h
umlal v4.4s, v6.4h, v0.4h
- add v4.4s, v4.4s, v30.4s
- ushl v4.4s, v4.4s, v29.4s
+ urshl v4.4s, v4.4s, v29.4s
uqxtn v6.4h, v4.4s
str d6, [x13], #8
@@ -741,8 +718,7 @@ function ff_vvc_dmvr_hv_10_neon, export=1
ldr d16, [x12], #8
umull v17.4s, v16.4h, v2.4h
umlal v17.4s, v6.4h, v3.4h
- add v17.4s, v17.4s, v31.4s
- ushr v17.4s, v17.4s, #4
+ urshr v17.4s, v17.4s, #4
uqxtn v16.4h, v17.4s
str d16, [x14], #8
4:
-----------------------------------------------------------------------
Summary of changes:
libavcodec/aarch64/vvc/inter.S | 58 +++++++++++++-----------------------------
1 file changed, 17 insertions(+), 41 deletions(-)
hooks/post-receive
--
_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]