This is an automated email from the git hooks/post-receive script. Git pushed a commit to branch master in repository ffmpeg.
commit 4b6f7c2a05f456cc27241f6ed1e44e77aa7861d6
Author:     DROOdotFOO <[email protected]>
AuthorDate: Sat May 30 00:19:45 2026 +0200
Commit:     Ramiro Polla <[email protected]>
CommitDate: Sat Jun 6 19:38:40 2026 +0200

    swscale/aarch64/yuv2rgb_neon: 2 lines at a time, rgb16

    pack_rgb16_2l uses v26-v29 as scratch (luma temps, dead by then)
    instead of v20-v23, so v20-v25 chroma survives the pack step.

    A .error trips if yuva420p hits rgb16 (v28/v29 would clobber alpha);
    the dispatcher routes that combination through yuv420p anyway.

    Test Name                     A55-gcc               M1-clang          A76-gcc
    ----------------------------------------------------------------------------------------
    nv12_to_rgb565le_neon         28531.9 (1.12x)       46.8 (1.28x)      19252.9 (1.09x)
    nv12_to_bgr565le_neon         29018.1 (1.12x)       48.1 (1.17x)      19252.0 (1.09x)
    nv12_to_rgb555le_neon         28531.3 (1.12x)       47.2 (1.24x)      19253.6 (1.09x)
    nv12_to_bgr555le_neon         29012.1 (1.12x)       45.8 (1.22x)      19252.5 (1.09x)
    nv21_to_rgb565le_neon         28532.3 (1.12x)       48.4 (1.15x)      19430.0 (1.09x)
    nv21_to_bgr565le_neon         29013.8 (1.12x)       47.2 (1.21x)      19428.8 (1.09x)
    nv21_to_rgb555le_neon         28533.3 (1.12x)       49.7 (1.16x)      19430.5 (1.09x)
    nv21_to_bgr555le_neon         29011.4 (1.12x)       48.5 (1.18x)      19428.7 (1.09x)
    yuv420p_to_rgb565le_neon      28351.9 (1.11x)       46.4 (1.18x)      19635.3 (1.08x)
    yuv420p_to_bgr565le_neon      28831.8 (1.11x)       50.8 (1.09x)      19634.5 (1.08x)
    yuv420p_to_rgb555le_neon      28351.3 (1.11x)       46.3 (1.23x)      19634.2 (1.08x)
    yuv420p_to_bgr555le_neon      28829.1 (1.11x)       46.5 (1.21x)      19634.3 (1.08x)
    yuva420p_to_rgb565le_neon     28349.5 (1.11x)       51.2 (1.06x)      19634.7 (1.08x)
    yuva420p_to_bgr565le_neon     28833.1 (1.11x)       48.6 (1.17x)      19633.9 (1.08x)
    yuva420p_to_rgb555le_neon     28351.6 (1.11x)       47.8 (1.16x)      19635.2 (1.08x)
    yuva420p_to_bgr555le_neon     28831.5 (1.11x)       46.4 (1.14x)      19634.8 (1.08x)

    Co-authored-by: Ramiro Polla <[email protected]>
    Signed-off-by: DROOdotFOO <[email protected]>
---
 libswscale/aarch64/yuv2rgb_neon.S | 66 ++++++++++++++++++++++++++++++++++++---
 1 file changed, 62 insertions(+), 4 deletions(-)

diff --git 
a/libswscale/aarch64/yuv2rgb_neon.S b/libswscale/aarch64/yuv2rgb_neon.S index 3607f032d9..22cbeb8404 100644 --- a/libswscale/aarch64/yuv2rgb_neon.S +++ b/libswscale/aarch64/yuv2rgb_neon.S @@ -324,6 +324,22 @@ dst_load_args_packed_2l 3 .endm +.macro dst_load_args_rgb565le_2l + dst_load_args_packed_2l 2 +.endm + +.macro dst_load_args_bgr565le_2l + dst_load_args_packed_2l 2 +.endm + +.macro dst_load_args_rgb555le_2l + dst_load_args_packed_2l 2 +.endm + +.macro dst_load_args_bgr555le_2l + dst_load_args_packed_2l 2 +.endm + // 2-lines-at-a-time planar dst loader. \sp_off is the byte offset at // which the caller's [sp+0] arg now lives (i.e., however many bytes the // caller pushed before invoking this macro). declare_2l_gbrp spills @@ -560,6 +576,22 @@ st1 { v6.8b, v7.8b }, [\rdst1], #16 st1 { v18.8b, v19.8b }, [\rdst2], #16 .endif +.if rgb16 + .ifc \ifmt,yuva420p + .error "yuva420p->rgb16 is dispatched through the yuv420p path (rgb16 has no alpha channel)" + .endif + compute_rgb v4, v5, v6, v16, v17, v18 + .if r_first + // rgb*le: (R << hshift) | (G << 5) | B + pack_rgb16_2l v8, v6, v5, v4, gshift, hshift + pack_rgb16_2l v9, v18, v17, v16, gshift, hshift + .else + // bgr*le: (B << hshift) | (G << 5) | R + pack_rgb16_2l v8, v4, v5, v6, gshift, hshift + pack_rgb16_2l v9, v16, v17, v18, gshift, hshift + .endif + st1 { v8.8h, v9.8h}, [\rdst0], #32 +.endif .endm // Map ofmt to .set predicates: rgb16=1 for the four 16bpp LE ofmts @@ -629,6 +661,21 @@ sli \dst\().8h, v23.8h, #\high_shl .endm +// As pack_rgb16 but uses v26-v29 as scratch (luma temps, dead after +// compute_rgb), so v20-v25 chroma contributions survive for the +// second luma row. yuva420p->rgb16 is dispatched through the yuv420p +// path, so v28/v29 aliasing alpha is not a concern here. 
+.macro pack_rgb16_2l dst, low_ch, mid_ch, high_ch, g_shr, high_shl + ushr v26.8b, \high_ch\().8b, #3 + ushr v27.8b, \mid_ch\().8b, #\g_shr + ushr v28.8b, \low_ch\().8b, #3 + uxtl \dst\().8h, v28.8b + uxtl v29.8h, v27.8b + sli \dst\().8h, v29.8h, #5 + uxtl v29.8h, v26.8b + sli \dst\().8h, v29.8h, #\high_shl +.endm + .macro declare_func ifmt ofmt function ff_\ifmt\()_to_\ofmt\()_neon, export=1 set_rgb16_predicates \ofmt @@ -777,12 +824,14 @@ endfunc // subsampled sources (chrSrcVSubSample > 0). .macro declare_2l_packed ifmt ofmt function ff_\ifmt\()_to_\ofmt\()_neon, export=1 + set_rgb16_predicates \ofmt uxtw widthx, width dup v3.8h, y_offset dup v0.8h, y_coeff ld1 {v1.1d}, [table_ptr] src_load_args_\ifmt\()_2l dst_load_args_\ofmt\()_2l + save_d8_d9_if_16bpp movi v31.8h, #4, lsl #8 // 128 * (1<<3) (loop-invariant) movi v30.8b, #255 // alpha = 255 (loop-invariant) @@ -801,6 +850,7 @@ function ff_\ifmt\()_to_\ofmt\()_neon, export=1 subs height, height, #2 b.gt 1b mov w0, orig_height + restore_d8_d9_if_16bpp ret endfunc .endm @@ -870,10 +920,18 @@ declare_rgb_funcs yuv422p declare_func \ifmt, bgr555le .endm -declare_rgb16_funcs nv12 -declare_rgb16_funcs nv21 -declare_rgb16_funcs yuv420p -declare_rgb16_funcs yuv422p +.macro declare_rgb16_funcs_2l ifmt + declare_2l_packed \ifmt, rgb565le + declare_2l_packed \ifmt, bgr565le + declare_2l_packed \ifmt, rgb555le + declare_2l_packed \ifmt, bgr555le +.endm + +// Subsampled inputs take the 2-line rgb16 path; yuv422p stays single-row. +declare_rgb16_funcs_2l nv12 +declare_rgb16_funcs_2l nv21 +declare_rgb16_funcs_2l yuv420p +declare_rgb16_funcs yuv422p .macro declare_yuva_funcs ifmt declare_func \ifmt, argb _______________________________________________ ffmpeg-cvslog mailing list -- [email protected] To unsubscribe send an email to [email protected]
