The branch, master has been updated
via 03c054d43c594211b89d5b2931430dccd7424d58 (commit)
via 56a638d8365370b287c53768f2c8a34a4cf3e417 (commit)
from e5ac70042e91d19110a04a52b7e6fa4703f61200 (commit)
- Log -----------------------------------------------------------------
commit 03c054d43c594211b89d5b2931430dccd7424d58
Author: Krzysztof Pyrkosz <[email protected]>
AuthorDate: Mon Sep 8 20:56:24 2025 +0200
Commit: Martin Storsjö <[email protected]>
CommitDate: Tue Sep 23 11:20:20 2025 +0000
avcodec/aarch64/vvc: Implement dmvr_v_8
A72
dmvr_v_8_12x20_neon: 207.0 ( 4.15x)
dmvr_v_8_20x12_neon: 170.4 ( 4.37x)
dmvr_v_8_20x20_neon: 273.4 ( 4.58x)
A53
dmvr_v_8_12x20_neon: 450.6 ( 4.21x)
dmvr_v_8_20x12_neon: 342.8 ( 3.70x)
dmvr_v_8_20x20_neon: 550.9 ( 3.79x)
diff --git a/libavcodec/aarch64/vvc/dsp_init.c
b/libavcodec/aarch64/vvc/dsp_init.c
index bdfa142a5a..b7dc1d89f8 100644
--- a/libavcodec/aarch64/vvc/dsp_init.c
+++ b/libavcodec/aarch64/vvc/dsp_init.c
@@ -101,6 +101,7 @@ DMVR_FUN(, 12)
DMVR_FUN(h_, 8)
DMVR_FUN(h_, 10)
DMVR_FUN(h_, 12)
+DMVR_FUN(v_, 8)
DMVR_FUN(hv_, 8)
DMVR_FUN(hv_, 10)
DMVR_FUN(hv_, 12)
@@ -195,6 +196,7 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const
int bd)
c->inter.w_avg = vvc_w_avg_8;
c->inter.dmvr[0][0] = ff_vvc_dmvr_8_neon;
c->inter.dmvr[0][1] = ff_vvc_dmvr_h_8_neon;
+ c->inter.dmvr[1][0] = ff_vvc_dmvr_v_8_neon;
c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_8_neon;
c->inter.apply_bdof = ff_vvc_apply_bdof_8_neon;
diff --git a/libavcodec/aarch64/vvc/inter.S b/libavcodec/aarch64/vvc/inter.S
index df6b59510d..a874edf889 100644
--- a/libavcodec/aarch64/vvc/inter.S
+++ b/libavcodec/aarch64/vvc/inter.S
@@ -385,6 +385,62 @@ function ff_vvc_dmvr_12_neon, export=1
ret
endfunc
+// Vertical 2-tap DMVR filter for 8-bit samples. Per output sample:
+//   dst[x] = (c0 * src[x] + c1 * src[x + stride] + 2) >> 2
+//
+// Register use, as read from the code below:
+//   x0  dst: 16-bit outputs, consecutive rows VVC_MAX_PB_SIZE * 2 bytes apart
+//   x1  src: 8-bit samples
+//   x2  src row stride in bytes
+//   w3  number of output rows; must be > 0 (tested only after decrementing)
+//   x5  index of the {c0, c1} byte pair in ff_vvc_inter_luma_dmvr_filters
+//   w6  only bit 4 is examined: set -> 20 columns per row, clear -> 12
+//       (presumably the block width; confirm against the caller)
+// x4 is not used. One more source row is read than output rows are written.
+function ff_vvc_dmvr_v_8_neon, export=1
+        movrel          x7, X(ff_vvc_inter_luma_dmvr_filters)
+        add             x7, x7, x5, lsl #1              // two coefficient bytes per table entry
+        ld2r            {v0.16b, v1.16b}, [x7]          // v0 = c0 in every lane, v1 = c1 in every lane
+        tbz             w6, #4, 12f
+
+        // 20 columns: 16 in v2/v3 plus 4 in the low lanes of v16/v17
+        ldr             s16, [x1, #16]                  // columns 16..19 of the first row, rest zeroed
+        ld1             {v2.16b}, [x1], x2              // columns 0..15 of the first row
+20:
+        ldr             s17, [x1, #16]                  // columns 16..19 of the next row
+        umull           v4.8h, v0.8b, v2.8b
+        umull2          v5.8h, v0.16b, v2.16b
+        ld1             {v3.16b}, [x1], x2              // columns 0..15 of the next row
+        umull           v16.8h, v0.8b, v16.8b
+        umull           v6.8h, v1.8b, v3.8b
+        umull2          v7.8h, v1.16b, v3.16b
+        add             v4.8h, v4.8h, v6.8h
+        umull           v18.8h, v1.8b, v17.8b
+        add             v5.8h, v5.8h, v7.8h
+        urshr           v4.8h, v4.8h, #2
+        add             v19.4h, v16.4h, v18.4h          // only 4 lanes carry the tail columns
+        urshr           v5.8h, v5.8h, #2
+        urshr           v19.4h, v19.4h, #2
+        st1             {v4.8h, v5.8h}, [x0], #32
+        subs            w3, w3, #1
+        mov             v2.16b, v3.16b                  // next row becomes the current row
+        st1             {v19.4h}, [x0], #8
+        mov             v16.16b, v17.16b                // v16 was overwritten by its own product above
+        add             x0, x0, #(VVC_MAX_PB_SIZE * 2 - 32 - 8) // step to the next dst row
+        b.ne            20b
+        ret
+
+12:
+        // 12 columns: 8 in v2/v3 plus 4 in the low lanes of v16/v17
+        ldr             s16, [x1, #8]                   // columns 8..11 of the first row, rest zeroed
+        ld1             {v2.8b}, [x1], x2               // columns 0..7 of the first row
+2:
+        ldr             s17, [x1, #8]                   // columns 8..11 of the next row
+        umull           v4.8h, v0.8b, v2.8b
+        ld1             {v3.8b}, [x1], x2               // columns 0..7 of the next row
+        umull           v16.8h, v0.8b, v16.8b
+        umull           v6.8h, v1.8b, v3.8b
+        add             v4.8h, v4.8h, v6.8h
+        umull           v18.8h, v1.8b, v17.8b
+        // urshr, not srshr: the sums are built from unsigned umull products,
+        // so use the unsigned rounding shift exactly as the 20-column path does.
+        urshr           v4.8h, v4.8h, #2
+        add             v19.4h, v16.4h, v18.4h          // only 4 lanes carry the tail columns
+        urshr           v19.4h, v19.4h, #2
+        st1             {v4.8h}, [x0], #16
+        subs            w3, w3, #1
+        mov             v2.16b, v3.16b                  // next row becomes the current row
+        st1             {v19.4h}, [x0], #8
+        mov             v16.16b, v17.16b                // v16 was overwritten by its own product above
+        add             x0, x0, #(VVC_MAX_PB_SIZE * 2 - 16 - 8) // step to the next dst row
+        b.ne            2b
+        ret
+endfunc
+
function ff_vvc_dmvr_h_8_neon, export=1
movrel x7, X(ff_vvc_inter_luma_dmvr_filters)
add x7, x7, x4, lsl #1
commit 56a638d8365370b287c53768f2c8a34a4cf3e417
Author: Krzysztof Pyrkosz <[email protected]>
AuthorDate: Sun Sep 14 19:13:24 2025 +0200
Commit: Martin Storsjö <[email protected]>
CommitDate: Tue Sep 23 11:20:11 2025 +0000
avcodec/aarch64/vvc: Unroll vvc_bdof_grad_filter_8x_neon
Before and after (for each CPU, the first six results are before this change, the last six after):
A53:
apply_bdof_8_16x8_neon: 2733.1 ( 4.88x)
apply_bdof_8_16x16_neon: 5458.6 ( 4.86x)
apply_bdof_10_16x8_neon: 2789.8 ( 4.64x)
apply_bdof_10_16x16_neon: 5523.8 ( 4.68x)
apply_bdof_12_16x8_neon: 2792.8 ( 4.58x)
apply_bdof_12_16x16_neon: 5519.5 ( 4.63x)
apply_bdof_8_16x8_neon: 2571.8 ( 5.12x)
apply_bdof_8_16x16_neon: 5173.3 ( 5.12x)
apply_bdof_10_16x8_neon: 2635.1 ( 4.87x)
apply_bdof_10_16x16_neon: 5243.0 ( 4.89x)
apply_bdof_12_16x8_neon: 2613.0 ( 4.89x)
apply_bdof_12_16x16_neon: 5231.7 ( 4.90x)
A78:
apply_bdof_8_16x8_neon: 565.3 ( 8.43x)
apply_bdof_8_16x16_neon: 1109.5 ( 8.60x)
apply_bdof_10_16x8_neon: 568.2 ( 7.92x)
apply_bdof_10_16x16_neon: 1114.1 ( 8.08x)
apply_bdof_12_16x8_neon: 570.2 ( 7.87x)
apply_bdof_12_16x16_neon: 1116.3 ( 8.03x)
apply_bdof_8_16x8_neon: 541.4 ( 8.81x)
apply_bdof_8_16x16_neon: 1065.9 ( 8.97x)
apply_bdof_10_16x8_neon: 543.2 ( 8.32x)
apply_bdof_10_16x16_neon: 1071.5 ( 8.39x)
apply_bdof_12_16x8_neon: 544.2 ( 8.25x)
apply_bdof_12_16x16_neon: 1074.1 ( 8.37x)
diff --git a/libavcodec/aarch64/vvc/inter.S b/libavcodec/aarch64/vvc/inter.S
index 79ff720cdd..df6b59510d 100644
--- a/libavcodec/aarch64/vvc/inter.S
+++ b/libavcodec/aarch64/vvc/inter.S
@@ -803,28 +803,21 @@ function vvc_bdof_grad_filter_8x_neon, export=0
src1 .req x5
width .req w6
height .req w7
+ tbnz w6, #4, 16f
-1:
- mov x10, src0
- mov w11, width
- mov x12, gh0
- mov x13, gv0
- mov x14, src1
- mov x15, gh1
- mov x16, gv1
-2:
- ldur q0, [x10, #2]
- ldur q1, [x10, #-2]
- ldr q2, [x10, #(VVC_MAX_PB_SIZE << 1)]
- ldr q3, [x10, #-(VVC_MAX_PB_SIZE << 1)]
+8:
+ ldur q0, [src0, #2]
+ ldur q1, [src0, #-2]
+ ldr q2, [src0, #(VVC_MAX_PB_SIZE << 1)]
+ ldr q3, [src0, #-(VVC_MAX_PB_SIZE << 1)]
sshr v0.8h, v0.8h, #6
sshr v1.8h, v1.8h, #6
- ldur q4, [x14, #2]
- ldur q5, [x14, #-2]
+ ldur q4, [src1, #2]
+ ldur q5, [src1, #-2]
sshr v2.8h, v2.8h, #6
sshr v3.8h, v3.8h, #6
- ldr q6, [x14, #(VVC_MAX_PB_SIZE << 1)]
- ldr q7, [x14, #-(VVC_MAX_PB_SIZE << 1)]
+ ldr q6, [src1, #(VVC_MAX_PB_SIZE << 1)]
+ ldr q7, [src1, #-(VVC_MAX_PB_SIZE << 1)]
// results of gradient_h0
sub v0.8h, v0.8h, v1.8h
// results of gradient_v0
@@ -839,26 +832,20 @@ function vvc_bdof_grad_filter_8x_neon, export=0
// results of gradient_v1
sub v6.8h, v6.8h, v7.8h
- add x10, x10, #16
- add x14, x14, #16
-
// (gradient_h0 + gradient_h1) >> 1
shadd v1.8h, v0.8h, v4.8h
// gradient_h0 - gradient_h1
sub v5.8h, v0.8h, v4.8h
- subs w11, w11, #8
-
// (gradient_v0 + gradient_v1) >> 1
shadd v3.8h, v2.8h, v6.8h
// gradient_v0 - gradient_v1
sub v7.8h, v2.8h, v6.8h
- st1 {v1.8h}, [x12], #16
- st1 {v5.8h}, [x15], #16
- st1 {v3.8h}, [x13], #16
- st1 {v7.8h}, [x16], #16
- b.ne 2b
+ st1 {v1.8h}, [gh0]
+ st1 {v5.8h}, [gh1]
+ st1 {v3.8h}, [gv0]
+ st1 {v7.8h}, [gv1]
subs height, height, #1
add gh0, gh0, #(BDOF_BLOCK_SIZE << 1)
@@ -867,7 +854,84 @@ function vvc_bdof_grad_filter_8x_neon, export=0
add gh1, gh1, #(BDOF_BLOCK_SIZE << 1)
add gv1, gv1, #(BDOF_BLOCK_SIZE << 1)
add src1, src1, #(VVC_MAX_PB_SIZE << 1)
- b.ne 1b
+ b.ne 8b
+ ret
+
+16:
+ ldur q0, [src0, #2]
+ ldur q1, [src0, #18]
+ ldur q16, [src0, #-2]
+ sshr v0.8h, v0.8h, #6
+ ldur q17, [src0, #14]
+ sshr v1.8h, v1.8h, #6
+ ldp q18, q19, [src0, #-(VVC_MAX_PB_SIZE << 1)]
+ sshr v16.8h, v16.8h, #6
+ ldp q2, q3, [src0, #(VVC_MAX_PB_SIZE << 1)]!
+ ldur q20, [src1, #2]
+ sshr v17.8h, v17.8h, #6
+ ldur q21, [src1, #18]
+ sshr v2.8h, v2.8h, #6
+ ldur q22, [src1, #-2]
+ sshr v3.8h, v3.8h, #6
+ ldur q23, [src1, #14]
+ sshr v18.8h, v18.8h, #6
+ ldp q26, q27, [src1, #-(VVC_MAX_PB_SIZE << 1)]
+ sshr v19.8h, v19.8h, #6
+ ldp q24, q25, [src1, #(VVC_MAX_PB_SIZE << 1)]!
+
+ // results of gradient_h0
+ sub v0.8h, v0.8h, v16.8h
+ sub v1.8h, v1.8h, v17.8h
+
+ // results of gradient_v0
+ sub v2.8h, v2.8h, v18.8h
+ sub v3.8h, v3.8h, v19.8h
+
+ sshr v20.8h, v20.8h, #6
+ sshr v21.8h, v21.8h, #6
+ sshr v22.8h, v22.8h, #6
+ sshr v23.8h, v23.8h, #6
+
+ // results of gradient_h1
+ sub v20.8h, v20.8h, v22.8h
+ sub v21.8h, v21.8h, v23.8h
+
+ sshr v24.8h, v24.8h, #6
+ sshr v25.8h, v25.8h, #6
+
+ // gradient_h0 - gradient_h1
+ sub v22.8h, v0.8h, v20.8h
+ sub v23.8h, v1.8h, v21.8h
+
+ // (gradient_h0 + gradient_h1) >> 1
+ shadd v16.8h, v0.8h, v20.8h
+ shadd v17.8h, v1.8h, v21.8h
+
+ st1 {v22.8h, v23.8h}, [gh1], #32
+
+ sshr v26.8h, v26.8h, #6
+ sshr v27.8h, v27.8h, #6
+
+ st1 {v16.8h, v17.8h}, [gh0], #32
+
+ // results of gradient_v1
+ sub v24.8h, v24.8h, v26.8h
+ sub v25.8h, v25.8h, v27.8h
+
+ // (gradient_v0 + gradient_v1) >> 1
+ shadd v18.8h, v2.8h, v24.8h
+ shadd v19.8h, v3.8h, v25.8h
+
+ // gradient_v0 - gradient_v1
+ sub v26.8h, v2.8h, v24.8h
+ sub v27.8h, v3.8h, v25.8h
+
+ st1 {v18.8h,v19.8h}, [gv0], #32
+
+ subs height, height, #1
+ st1 {v26.8h,v27.8h}, [gv1], #32
+
+ b.ne 16b
ret
.unreq gh0
-----------------------------------------------------------------------
Summary of changes:
libavcodec/aarch64/vvc/dsp_init.c | 2 +
libavcodec/aarch64/vvc/inter.S | 176 ++++++++++++++++++++++++++++++++------
2 files changed, 150 insertions(+), 28 deletions(-)
hooks/post-receive
--
_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]