This is an automated email from the git hooks/post-receive script.
Git pushed a commit to branch master
in repository ffmpeg.
The following commit(s) were added to refs/heads/master by this push:
new 200914853d aarch64/sbrdsp: unroll sum64x5 to 16 floats/iter
200914853d is described below
commit 200914853da88aef7f3ae915d64976c7589d5e80
Author: Zhao Zhili <[email protected]>
AuthorDate: Thu May 21 15:26:30 2026 +0800
Commit: Zhao Zhili <[email protected]>
CommitDate: Wed Jun 3 10:40:20 2026 +0000
aarch64/sbrdsp: unroll sum64x5 to 16 floats/iter
With clang and with gcc > 12 on rpi5, the C version is faster than the
previous asm, since the compiler basically performs the same unrolling.
sum64x5_neon:             before          after
Cortex-A76 (gcc 12.4):    72.3 (3.63x)    47.4 (5.56x)
Cortex-A76 (gcc 14.2):    72.3 (0.69x)    47.4 (1.05x)
Apple M1 (clang 16):       0.2 (0.98x)     0.2 (0.99x)
Signed-off-by: Zhao Zhili <[email protected]>
---
libavcodec/aarch64/sbrdsp_neon.S | 45 ++++++++++++++++++++++++++++++----------
1 file changed, 34 insertions(+), 11 deletions(-)
diff --git a/libavcodec/aarch64/sbrdsp_neon.S b/libavcodec/aarch64/sbrdsp_neon.S
index 1fdde6ccb6..d363941717 100644
--- a/libavcodec/aarch64/sbrdsp_neon.S
+++ b/libavcodec/aarch64/sbrdsp_neon.S
@@ -46,17 +46,40 @@ function ff_sbr_sum64x5_neon, export=1
add x3, x0, #192*4
add x4, x0, #256*4
mov x5, #64
-1: ld1 {v0.4s}, [x0]
- ld1 {v1.4s}, [x1], #16
- fadd v0.4s, v0.4s, v1.4s
- ld1 {v2.4s}, [x2], #16
- fadd v0.4s, v0.4s, v2.4s
- ld1 {v3.4s}, [x3], #16
- fadd v0.4s, v0.4s, v3.4s
- ld1 {v4.4s}, [x4], #16
- fadd v0.4s, v0.4s, v4.4s
- st1 {v0.4s}, [x0], #16
- subs x5, x5, #4
+1: ldp q0, q1, [x0]
+ ldp q2, q3, [x0, #32]
+ ldp q4, q5, [x1]
+ ldp q6, q7, [x1, #32]
+ ldp q16, q17, [x2]
+ ldp q18, q19, [x2, #32]
+ add x1, x1, #64
+ add x2, x2, #64
+ subs x5, x5, #16
+ fadd v0.4s, v0.4s, v4.4s
+ fadd v1.4s, v1.4s, v5.4s
+ fadd v2.4s, v2.4s, v6.4s
+ fadd v3.4s, v3.4s, v7.4s
+ ldp q20, q21, [x3]
+ ldp q22, q23, [x3, #32]
+ add x3, x3, #64
+ fadd v0.4s, v0.4s, v16.4s
+ fadd v1.4s, v1.4s, v17.4s
+ fadd v2.4s, v2.4s, v18.4s
+ fadd v3.4s, v3.4s, v19.4s
+ ldp q24, q25, [x4]
+ ldp q26, q27, [x4, #32]
+ add x4, x4, #64
+ fadd v0.4s, v0.4s, v20.4s
+ fadd v1.4s, v1.4s, v21.4s
+ fadd v2.4s, v2.4s, v22.4s
+ fadd v3.4s, v3.4s, v23.4s
+ fadd v0.4s, v0.4s, v24.4s
+ fadd v1.4s, v1.4s, v25.4s
+ fadd v2.4s, v2.4s, v26.4s
+ fadd v3.4s, v3.4s, v27.4s
+ stp q0, q1, [x0]
+ stp q2, q3, [x0, #32]
+ add x0, x0, #64
b.gt 1b
ret
endfunc
_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]