This is an automated email from the git hooks/post-receive script. Git pushed a commit to branch master in repository ffmpeg.
commit dad212060c777ad21a8f785ba11f1add3a15e432 Author: DROOdotFOO <[email protected]> AuthorDate: Sat May 30 00:16:11 2026 +0200 Commit: Ramiro Polla <[email protected]> CommitDate: Sat Jun 6 19:38:40 2026 +0200 swscale/aarch64/yuv2rgb_neon: 2 lines at a time, gbrp Six dst pointers exhaust the caller-saved registers; spill x19/x20. yuva420p_to_gbrp_neon is routed through the yuv420p path by the dispatcher (gbrp has no alpha channel). Test Name A55-gcc M1-clang A76-gcc ---------------------------------------------------------------------------------------- nv12_to_gbrp_neon 20017.8 (1.15x) 32.8 (1.34x) 10658.0 (1.27x) nv21_to_gbrp_neon 20020.9 (1.15x) 32.5 (1.36x) 10691.1 (1.26x) yuv420p_to_gbrp_neon 19856.3 (1.14x) 31.4 (1.34x) 10348.0 (1.37x) yuva420p_to_gbrp_neon 19859.8 (1.14x) 30.9 (1.27x) 10350.9 (1.37x) Co-authored-by: Ramiro Polla <[email protected]> Signed-off-by: DROOdotFOO <[email protected]> --- libswscale/aarch64/yuv2rgb_neon.S | 100 +++++++++++++++++++++++++++++++++++--- 1 file changed, 93 insertions(+), 7 deletions(-) diff --git a/libswscale/aarch64/yuv2rgb_neon.S b/libswscale/aarch64/yuv2rgb_neon.S index 7ef0e75639..3607f032d9 100644 --- a/libswscale/aarch64/yuv2rgb_neon.S +++ b/libswscale/aarch64/yuv2rgb_neon.S @@ -97,6 +97,11 @@ // double as the line-2 luma and dst pointers. #define l2_srcY x16 #define l2_dst0 x17 +// Planar 2-line variant needs three line-2 dst pointers. x16/x17 are +// already taken by l2_srcY/l2_dst0, so l2_dst1/l2_dst2 land in the +// AAPCS callee-saved range and the 2-line gbrp prologue spills them. +#define l2_dst1 x19 +#define l2_dst2 x20 // -------------------------------------------------------------------- // Source-side argument unpacking. @@ -319,6 +324,35 @@ dst_load_args_packed_2l 3 .endm +// 2-lines-at-a-time planar dst loader. \sp_off is the byte offset at +// which the caller's [sp+0] arg now lives (i.e., however many bytes the +// caller pushed before invoking this macro). declare_2l_gbrp spills +// x19/x20 (16 bytes) and passes 16; the on-stack args end up at: +// [sp + sp_off + 0] int linesize0 +// [sp + sp_off + 8] uint8_t *dst1 +// [sp + sp_off + 16] int linesize1 +// [sp + sp_off + 24] uint8_t *dst2 +// [sp + sp_off + 32] int linesize2 +.macro dst_load_args_planar_2l sp_off + ldr dstPadding0w, [sp, #(\sp_off + 0)] + ldr dst1, [sp, #(\sp_off + 8)] + ldr dstPadding1w, [sp, #(\sp_off + 16)] + ldr dst2, [sp, #(\sp_off + 24)] + ldr dstPadding2w, [sp, #(\sp_off + 32)] + sxtw dstPadding0, dstPadding0w + sxtw dstPadding1, dstPadding1w + sxtw dstPadding2, dstPadding2w + add l2_dst0, dst0, dstPadding0 // l2_dst0 = dst0 + linesize0 + add l2_dst1, dst1, dstPadding1 // l2_dst1 = dst1 + linesize1 + add l2_dst2, dst2, dstPadding2 // l2_dst2 = dst2 + linesize2 + lsl dstPadding0, dstPadding0, #1 + lsl dstPadding1, dstPadding1, #1 + lsl dstPadding2, dstPadding2, #1 + sub dstPadding0, dstPadding0, widthx // = 2*linesize0 - width + sub dstPadding1, dstPadding1, widthx + sub dstPadding2, dstPadding2, widthx +.endm + // -------------------------------------------------------------------- // Per-input chroma load (run inside the inner loop). @@ -406,6 +440,15 @@ add l2_dst0, l2_dst0, dstPadding0 .endm +.macro dst_increment_planar_2l + add dst0, dst0, dstPadding0 + add l2_dst0, l2_dst0, dstPadding0 + add dst1, dst1, dstPadding1 + add l2_dst1, l2_dst1, dstPadding1 + add dst2, dst2, dstPadding2 + add l2_dst2, l2_dst2, dstPadding2 +.endm + // -------------------------------------------------------------------- // Shared compute / pack helpers. @@ -511,6 +554,12 @@ st3 { v4.8b, v5.8b, v6.8b}, [\rdst0], #24 st3 {v16.8b,v17.8b,v18.8b}, [\rdst0], #24 .endif +.ifc \ofmt,gbrp + compute_rgb v18, v4, v6, v19, v5, v7 + st1 { v4.8b, v5.8b }, [\rdst0], #16 + st1 { v6.8b, v7.8b }, [\rdst1], #16 + st1 { v18.8b, v19.8b }, [\rdst2], #16 +.endif .endm // Map ofmt to .set predicates: rgb16=1 for the four 16bpp LE ofmts @@ -765,16 +814,53 @@ endfunc declare_2l_packed \ifmt, bgr24 .endm -// Vertically-subsampled inputs: packed RGB outputs go through the -// 2-lines path; gbrp stays on the single-row declare_func (extended -// in a follow-up). yuv422p has full-height chroma -- no sharing, so -// it keeps the single-row path for every ofmt. +// 2-lines-at-a-time variant for the gbrp planar output. Six dst pointers +// (three per row) exhaust the caller-saved registers, so x19/x20 are +// spilled AAPCS-style. Stack args for the line-1 dst1/dst2/linesize are +// read after the spill, so dst_load_args_planar_2l uses the shifted +// offsets. +.macro declare_2l_gbrp ifmt +function ff_\ifmt\()_to_gbrp_neon, export=1 + uxtw widthx, width + dup v3.8h, y_offset + dup v0.8h, y_coeff + ld1 {v1.1d}, [table_ptr] + + stp x19, x20, [sp, #-0x10]! // callee-saved (line2 planar ptrs) + + src_load_args_\ifmt\()_2l + dst_load_args_planar_2l 16 // 16 = bytes pushed above + + movi v31.8h, #4, lsl #8 // 128 * (1<<3) (loop-invariant) + mov orig_height, height +1: + mov cur_width, width +2: + load_chroma_\ifmt + chroma_to_rgb_offsets + process_row \ifmt, gbrp, srcY, srcY, dst0, dst1, dst2 + process_row \ifmt, gbrp, l2_srcY, l2_srcY, l2_dst0, l2_dst1, l2_dst2 + subs cur_width, cur_width, #16 + b.gt 2b + dst_increment_planar_2l + src_increment_\ifmt\()_2l + subs height, height, #2 + b.gt 1b + mov w0, orig_height + ldp x19, x20, [sp], #0x10 // restore callee-saved + ret +endfunc +.endm + +// Vertically-subsampled inputs: both packed RGB and gbrp go through the +// 2-lines path. yuv422p has full-height chroma -- no sharing, so it +// keeps the single-row path for every ofmt. declare_rgb_funcs_2l_packed nv12 -declare_func nv12, gbrp +declare_2l_gbrp nv12 declare_rgb_funcs_2l_packed nv21 -declare_func nv21, gbrp +declare_2l_gbrp nv21 declare_rgb_funcs_2l_packed yuv420p -declare_func yuv420p, gbrp +declare_2l_gbrp yuv420p declare_rgb_funcs yuv422p .macro declare_rgb16_funcs ifmt _______________________________________________ ffmpeg-cvslog mailing list -- [email protected] To unsubscribe send an email to [email protected]
