ffmpeg | branch: master | Krzysztof Pyrkosz <ffm...@szaka.eu> | Sat Mar 1 13:59:00 2025 +0100| [38929b824bcc4b3307af3e0711c5c03b823a83e3] | committer: Martin Storsjö
swscale/aarch64: Refactor hscale_16_to_15__fs_4

This patch removes the use of the stack for temporary state and replaces
the interleaved ld4 loads with ld1.

Before/after (for each CPU, the first group of results is before this patch, the second group is after it):

A78 (before)
hscale_16_to_15__fs_4_dstW_8_neon:      86.8 ( 1.72x)
hscale_16_to_15__fs_4_dstW_24_neon:    147.5 ( 2.73x)
hscale_16_to_15__fs_4_dstW_128_neon:   614.0 ( 3.14x)
hscale_16_to_15__fs_4_dstW_144_neon:   680.5 ( 3.18x)
hscale_16_to_15__fs_4_dstW_256_neon:  1193.2 ( 3.19x)
hscale_16_to_15__fs_4_dstW_512_neon:  2305.0 ( 3.27x)

A78 (after)
hscale_16_to_15__fs_4_dstW_8_neon:      86.0 ( 1.74x)
hscale_16_to_15__fs_4_dstW_24_neon:    106.8 ( 3.78x)
hscale_16_to_15__fs_4_dstW_128_neon:   404.0 ( 4.81x)
hscale_16_to_15__fs_4_dstW_144_neon:   451.8 ( 4.80x)
hscale_16_to_15__fs_4_dstW_256_neon:   760.5 ( 5.06x)
hscale_16_to_15__fs_4_dstW_512_neon:  1520.0 ( 5.01x)

A72 (before)
hscale_16_to_15__fs_4_dstW_8_neon:     156.8 ( 1.52x)
hscale_16_to_15__fs_4_dstW_24_neon:    217.8 ( 2.52x)
hscale_16_to_15__fs_4_dstW_128_neon:   906.8 ( 2.90x)
hscale_16_to_15__fs_4_dstW_144_neon:  1014.5 ( 2.91x)
hscale_16_to_15__fs_4_dstW_256_neon:  1751.5 ( 2.96x)
hscale_16_to_15__fs_4_dstW_512_neon:  3469.3 ( 2.97x)

A72 (after)
hscale_16_to_15__fs_4_dstW_8_neon:     151.2 ( 1.54x)
hscale_16_to_15__fs_4_dstW_24_neon:    173.4 ( 3.15x)
hscale_16_to_15__fs_4_dstW_128_neon:   660.0 ( 3.98x)
hscale_16_to_15__fs_4_dstW_144_neon:   735.7 ( 4.00x)
hscale_16_to_15__fs_4_dstW_256_neon:  1273.5 ( 4.09x)
hscale_16_to_15__fs_4_dstW_512_neon:  2488.2 ( 4.16x)

Signed-off-by: Martin Storsjö <mar...@martin.st>

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=38929b824bcc4b3307af3e0711c5c03b823a83e3
---

 libswscale/aarch64/hscale.S | 183 +++++++++++++++++---------------------------
 1 file changed, 70 insertions(+), 113 deletions(-)

diff --git a/libswscale/aarch64/hscale.S b/libswscale/aarch64/hscale.S index 435460c1af..4140fa9c60 100644 --- a/libswscale/aarch64/hscale.S +++ b/libswscale/aarch64/hscale.S @@ -638,6 +638,16 @@ function ff_hscale8to19_X4_neon, export=1 ret endfunc + +.macro hscale_iter src, src2, filter, dst1, dst2 
+ uxtl \src\().4s, \src\().4h + sxtl v19.4s, \filter\().4h + mul \dst1\().4s, \src\().4s, v19.4s + uxtl \src2\().4s, \src2\().4h + sxtl2 \filter\().4s, \filter\().8h + mul \dst2\().4s, \src2\().4s, \filter\().4s +.endm + function ff_hscale16to15_4_neon_asm, export=1 // w0 int shift // x1 int32_t *dst @@ -664,6 +674,7 @@ function ff_hscale16to15_4_neon_asm, export=1 add x5, x5, #32 // shift all filterPos left by one, as uint16_t will be read + ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], #64 // filter[0..7] lsl x8, x8, #1 lsl x9, x9, #1 lsl x10, x10, #1 @@ -674,154 +685,101 @@ function ff_hscale16to15_4_neon_asm, export=1 lsl x15, x15, #1 // load src with given offset - ldr x8, [x3, w8, uxtw] - ldr x9, [x3, w9, uxtw] - ldr x10, [x3, w10, uxtw] - ldr x11, [x3, w11, uxtw] - ldr x12, [x3, w12, uxtw] - ldr x13, [x3, w13, uxtw] - ldr x14, [x3, w14, uxtw] - ldr x15, [x3, w15, uxtw] - - sub sp, sp, #64 - // push src on stack so it can be loaded into vectors later - stp x8, x9, [sp] - stp x10, x11, [sp, #16] - stp x12, x13, [sp, #32] - stp x14, x15, [sp, #48] + ldr d0, [x3, w8, uxtw] + ldr d1, [x3, w9, uxtw] + ldr d2, [x3, w10, uxtw] + ldr d3, [x3, w11, uxtw] + ldr d4, [x3, w12, uxtw] + ldr d5, [x3, w13, uxtw] + ldr d6, [x3, w14, uxtw] + ldr d7, [x3, w15, uxtw] 1: - ld4 {v0.8h, v1.8h, v2.8h, v3.8h}, [sp] - ld4 {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], #64 // filter[0..7] - - // Each of blocks does the following: - // Extend src and filter to 32 bits with uxtl and sxtl - // multiply or multiply and accumulate results - // Extending to 32 bits is necessary, as unit16_t values can't - // be represented as int16_t without type promotion. 
- uxtl v26.4s, v0.4h - sxtl v27.4s, v28.4h - uxtl2 v0.4s, v0.8h - mul v5.4s, v26.4s, v27.4s - sxtl2 v28.4s, v28.8h - uxtl v26.4s, v1.4h - mul v6.4s, v0.4s, v28.4s - - sxtl v27.4s, v29.4h - uxtl2 v0.4s, v1.8h - mla v5.4s, v27.4s, v26.4s - sxtl2 v28.4s, v29.8h - uxtl v26.4s, v2.4h - mla v6.4s, v28.4s, v0.4s - - sxtl v27.4s, v30.4h - uxtl2 v0.4s, v2.8h - mla v5.4s, v27.4s, v26.4s - sxtl2 v28.4s, v30.8h - uxtl v26.4s, v3.4h - mla v6.4s, v28.4s, v0.4s - - sxtl v27.4s, v31.4h - uxtl2 v0.4s, v3.8h - mla v5.4s, v27.4s, v26.4s - sxtl2 v28.4s, v31.8h - sub w2, w2, #8 - mla v6.4s, v28.4s, v0.4s - - sshl v5.4s, v5.4s, v17.4s - sshl v6.4s, v6.4s, v17.4s - smin v5.4s, v5.4s, v18.4s - smin v6.4s, v6.4s, v18.4s - xtn v5.4h, v5.4s - xtn2 v5.8h, v6.4s - - st1 {v5.8h}, [x1], #16 - cmp w2, #16 // load filterPositions into registers for next iteration + + hscale_iter v0, v1, v28, v20, v21 ldp w8, w9, [x5] // filterPos[0], filterPos[1] + hscale_iter v2, v3, v29, v22, v23 ldp w10, w11, [x5, #8] // filterPos[2], filterPos[3] + hscale_iter v4, v5, v30, v24, v25 ldp w12, w13, [x5, #16] // filterPos[4], filterPos[5] + hscale_iter v6, v7, v31, v26, v27 ldp w14, w15, [x5, #24] // filterPos[6], filterPos[7] + subs w2, w2, #8 add x5, x5, #32 + ldp q28, q29, [x4], #32 // filter[0..7] lsl x8, x8, #1 lsl x9, x9, #1 lsl x10, x10, #1 lsl x11, x11, #1 + ldp q30, q31, [x4], #32 // filter[0..7] lsl x12, x12, #1 lsl x13, x13, #1 lsl x14, x14, #1 lsl x15, x15, #1 - ldr x8, [x3, w8, uxtw] - ldr x9, [x3, w9, uxtw] - ldr x10, [x3, w10, uxtw] - ldr x11, [x3, w11, uxtw] - ldr x12, [x3, w12, uxtw] - ldr x13, [x3, w13, uxtw] - ldr x14, [x3, w14, uxtw] - ldr x15, [x3, w15, uxtw] + addp v20.4s, v20.4s, v21.4s + ldr d0, [x3, w8, uxtw] + addp v22.4s, v22.4s, v23.4s + ldr d1, [x3, w9, uxtw] + addp v24.4s, v24.4s, v25.4s + ldr d2, [x3, w10, uxtw] + addp v26.4s, v26.4s, v27.4s + ldr d3, [x3, w11, uxtw] + addp v20.4s, v20.4s, v22.4s + ldr d4, [x3, w12, uxtw] + addp v21.4s, v24.4s, v26.4s + ldr d5, [x3, w13, uxtw] + cmp 
w2, #16 - stp x8, x9, [sp] - stp x10, x11, [sp, #16] - stp x12, x13, [sp, #32] - stp x14, x15, [sp, #48] + sshl v20.4s, v20.4s, v17.4s + ldr d6, [x3, w14, uxtw] + sshl v21.4s, v21.4s, v17.4s + ldr d7, [x3, w15, uxtw] + smin v20.4s, v20.4s, v18.4s + smin v21.4s, v21.4s, v18.4s + xtn v20.4h, v20.4s + xtn2 v20.8h, v21.4s + + st1 {v20.8h}, [x1], #16 b.ge 1b // here we make last iteration, without updating the registers - ld4 {v0.8h, v1.8h, v2.8h, v3.8h}, [sp] - ld4 {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], #64 - - uxtl v26.4s, v0.4h - sxtl v27.4s, v28.4h - uxtl2 v0.4s, v0.8h - mul v5.4s, v26.4s, v27.4s - sxtl2 v28.4s, v28.8h - uxtl v26.4s, v1.4h - mul v6.4s, v0.4s, v28.4s - - sxtl v27.4s, v29.4h - uxtl2 v0.4s, v1.8h - mla v5.4s, v26.4s, v27.4s - sxtl2 v28.4s, v29.8h - uxtl v26.4s, v2.4h - mla v6.4s, v0.4s, v28.4s - sxtl v27.4s, v30.4h - uxtl2 v0.4s, v2.8h - mla v5.4s, v26.4s, v27.4s - sxtl2 v28.4s, v30.8h - uxtl v26.4s, v3.4h - mla v6.4s, v0.4s, v28.4s - - sxtl v27.4s, v31.4h - uxtl2 v0.4s, v3.8h - mla v5.4s, v26.4s, v27.4s - sxtl2 v28.4s, v31.8h + hscale_iter v0, v1, v28, v20, v21 + hscale_iter v2, v3, v29, v22, v23 + hscale_iter v4, v5, v30, v24, v25 + hscale_iter v6, v7, v31, v26, v27 subs w2, w2, #8 - mla v6.4s, v0.4s, v28.4s - sshl v5.4s, v5.4s, v17.4s - sshl v6.4s, v6.4s, v17.4s - smin v5.4s, v5.4s, v18.4s - smin v6.4s, v6.4s, v18.4s - xtn v5.4h, v5.4s - xtn2 v5.8h, v6.4s + addp v20.4s, v20.4s, v21.4s + addp v22.4s, v22.4s, v23.4s + addp v24.4s, v24.4s, v25.4s + addp v26.4s, v26.4s, v27.4s + addp v0.4s, v20.4s, v22.4s + addp v1.4s, v24.4s, v26.4s - st1 {v5.8h}, [x1], #16 - add sp, sp, #64 // restore stack + sshl v0.4s, v0.4s, v17.4s + sshl v1.4s, v1.4s, v17.4s + smin v0.4s, v0.4s, v18.4s + smin v1.4s, v1.4s, v18.4s + xtn v0.4h, v0.4s + xtn2 v0.8h, v1.4s + + st1 {v0.8h}, [x1], #16 cbnz w2, 2f ret 2: ldr w8, [x5], #4 // load filterPos - lsl w8, w8, #1 - add x9, x3, w8, uxtw // src + filterPos + add x9, x3, w8, uxtw #1 // src + filterPos ld1 {v0.4h}, [x9] // load 4 * 
uint16_t ld1 {v31.4h}, [x4], #8 + sub w2, w2, #1 uxtl v0.4s, v0.4h sxtl v31.4s, v31.4h @@ -830,7 +788,6 @@ function ff_hscale16to15_4_neon_asm, export=1 sshl v0.4s, v0.4s, v17.4s smin v0.4s, v0.4s, v18.4s st1 {v0.h}[0], [x1], #2 - sub w2, w2, #1 cbnz w2, 2b // if iterations remain jump to beginning ret _______________________________________________ ffmpeg-cvslog mailing list ffmpeg-cvslog@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog To unsubscribe, visit link above, or email ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".