yuv2rgb_neon: 2 lines at a time, rgb16

DROOdotFOO via ffmpeg-cvslog Sat, 06 Jun 2026 10:43:20 -0700

This is an automated email from the git hooks/post-receive script.

Git pushed a commit to branch master
in repository ffmpeg.


commit 4b6f7c2a05f456cc27241f6ed1e44e77aa7861d6
Author:     DROOdotFOO <[email protected]>
AuthorDate: Sat May 30 00:19:45 2026 +0200
Commit:     Ramiro Polla <[email protected]>
CommitDate: Sat Jun 6 19:38:40 2026 +0200

    swscale/aarch64/yuv2rgb_neon: 2 lines at a time, rgb16
    
    pack_rgb16_2l uses v26-v29 as scratch (luma temps, dead by then)
    instead of v20-v23, so v20-v25 chroma survives the pack step. A
    .error trips if yuva420p hits rgb16 (v28/v29 would clobber alpha);
    the dispatcher routes that combination through yuv420p anyway.
    
    Test Name                                A55-gcc            M1-clang             A76-gcc
    ----------------------------------------------------------------------------------------
    nv12_to_rgb565le_neon            28531.9 (1.12x)        46.8 (1.28x)     19252.9 (1.09x)
    nv12_to_bgr565le_neon            29018.1 (1.12x)        48.1 (1.17x)     19252.0 (1.09x)
    nv12_to_rgb555le_neon            28531.3 (1.12x)        47.2 (1.24x)     19253.6 (1.09x)
    nv12_to_bgr555le_neon            29012.1 (1.12x)        45.8 (1.22x)     19252.5 (1.09x)
    nv21_to_rgb565le_neon            28532.3 (1.12x)        48.4 (1.15x)     19430.0 (1.09x)
    nv21_to_bgr565le_neon            29013.8 (1.12x)        47.2 (1.21x)     19428.8 (1.09x)
    nv21_to_rgb555le_neon            28533.3 (1.12x)        49.7 (1.16x)     19430.5 (1.09x)
    nv21_to_bgr555le_neon            29011.4 (1.12x)        48.5 (1.18x)     19428.7 (1.09x)
    yuv420p_to_rgb565le_neon         28351.9 (1.11x)        46.4 (1.18x)     19635.3 (1.08x)
    yuv420p_to_bgr565le_neon         28831.8 (1.11x)        50.8 (1.09x)     19634.5 (1.08x)
    yuv420p_to_rgb555le_neon         28351.3 (1.11x)        46.3 (1.23x)     19634.2 (1.08x)
    yuv420p_to_bgr555le_neon         28829.1 (1.11x)        46.5 (1.21x)     19634.3 (1.08x)
    yuva420p_to_rgb565le_neon        28349.5 (1.11x)        51.2 (1.06x)     19634.7 (1.08x)
    yuva420p_to_bgr565le_neon        28833.1 (1.11x)        48.6 (1.17x)     19633.9 (1.08x)
    yuva420p_to_rgb555le_neon        28351.6 (1.11x)        47.8 (1.16x)     19635.2 (1.08x)
    yuva420p_to_bgr555le_neon        28831.5 (1.11x)        46.4 (1.14x)     19634.8 (1.08x)
    
    Co-authored-by: Ramiro Polla <[email protected]>
    Signed-off-by: DROOdotFOO <[email protected]>
---
 libswscale/aarch64/yuv2rgb_neon.S | 66 ++++++++++++++++++++++++++++++++++++---
 1 file changed, 62 insertions(+), 4 deletions(-)

diff --git a/libswscale/aarch64/yuv2rgb_neon.S b/libswscale/aarch64/yuv2rgb_neon.S
index 3607f032d9..22cbeb8404 100644
--- a/libswscale/aarch64/yuv2rgb_neon.S
+++ b/libswscale/aarch64/yuv2rgb_neon.S
@@ -324,6 +324,22 @@
         dst_load_args_packed_2l 3
 .endm
 
+.macro dst_load_args_rgb565le_2l                // 2-line dst loader for rgb565le
+        dst_load_args_packed_2l 2               // NOTE(review): 2 is presumably bytes per pixel (16bpp) - confirm against dst_load_args_packed_2l
+.endm
+
+.macro dst_load_args_bgr565le_2l                // 2-line dst loader for bgr565le
+        dst_load_args_packed_2l 2               // same argument as the rgb565le variant
+.endm
+
+.macro dst_load_args_rgb555le_2l                // 2-line dst loader for rgb555le
+        dst_load_args_packed_2l 2               // same argument as the rgb565le variant
+.endm
+
+.macro dst_load_args_bgr555le_2l                // 2-line dst loader for bgr555le
+        dst_load_args_packed_2l 2               // same argument as the rgb565le variant
+.endm
+
 // 2-lines-at-a-time planar dst loader. \sp_off is the byte offset at
 // which the caller's [sp+0] arg now lives (i.e., however many bytes the
 // caller pushed before invoking this macro). declare_2l_gbrp spills
@@ -560,6 +576,22 @@
         st1             {  v6.8b,  v7.8b }, [\rdst1], #16
         st1             { v18.8b, v19.8b }, [\rdst2], #16
 .endif
+.if rgb16
+ .ifc \ifmt,yuva420p
+        .error "yuva420p->rgb16 is dispatched through the yuv420p path (rgb16 has no alpha channel)"
+ .endif
+        compute_rgb     v4, v5, v6, v16, v17, v18
+ .if r_first
+        // rgb*le: (R << hshift) | (G << 5) | B
+        pack_rgb16_2l   v8,  v6,  v5,  v4,  gshift, hshift
+        pack_rgb16_2l   v9,  v18, v17, v16, gshift, hshift
+ .else
+        // bgr*le: (B << hshift) | (G << 5) | R
+        pack_rgb16_2l   v8,  v4,  v5,  v6,  gshift, hshift
+        pack_rgb16_2l   v9,  v16, v17, v18, gshift, hshift
+ .endif
+        st1             { v8.8h, v9.8h}, [\rdst0], #32
+.endif
 .endm
 
 // Map ofmt to .set predicates: rgb16=1 for the four 16bpp LE ofmts
@@ -629,6 +661,21 @@
         sli             \dst\().8h, v23.8h, #\high_shl
 .endm
 
+// As pack_rgb16 but uses v26-v29 as scratch (luma temps, dead after
+// compute_rgb), so v20-v25 chroma contributions survive for the
+// second luma row. yuva420p->rgb16 is dispatched through the yuv420p
+// path, so v28/v29 aliasing alpha is not a concern here.
+.macro pack_rgb16_2l dst, low_ch, mid_ch, high_ch, g_shr, high_shl
+        ushr            v26.8b, \high_ch\().8b, #3              // high channel: drop the low 3 bits, leaving 5
+        ushr            v27.8b, \mid_ch\().8b,  #\g_shr         // mid channel: drop the low g_shr bits
+        ushr            v28.8b, \low_ch\().8b,  #3              // low channel: drop the low 3 bits, leaving 5
+        uxtl            \dst\().8h, v28.8b                      // dst = low channel zero-extended to 16-bit lanes
+        uxtl            v29.8h, v27.8b                          // widen mid channel to 16-bit lanes
+        sli             \dst\().8h, v29.8h, #5                  // insert mid at bit 5; dst bits 0-4 are kept
+        uxtl            v29.8h, v26.8b                          // widen high channel (v29 reused as scratch)
+        sli             \dst\().8h, v29.8h, #\high_shl          // insert high at bit high_shl; lower dst bits are kept
+.endm
+
 .macro declare_func ifmt ofmt
 function ff_\ifmt\()_to_\ofmt\()_neon, export=1
         set_rgb16_predicates \ofmt
@@ -777,12 +824,14 @@ endfunc
 // subsampled sources (chrSrcVSubSample > 0).
 .macro declare_2l_packed ifmt ofmt
 function ff_\ifmt\()_to_\ofmt\()_neon, export=1
+        set_rgb16_predicates \ofmt
         uxtw            widthx, width
         dup             v3.8h, y_offset
         dup             v0.8h, y_coeff
         ld1             {v1.1d}, [table_ptr]
         src_load_args_\ifmt\()_2l
         dst_load_args_\ofmt\()_2l
+        save_d8_d9_if_16bpp
 
         movi            v31.8h, #4, lsl #8                              // 128 * (1<<3) (loop-invariant)
         movi            v30.8b, #255                                    // alpha = 255  (loop-invariant)
@@ -801,6 +850,7 @@ function ff_\ifmt\()_to_\ofmt\()_neon, export=1
         subs            height, height, #2
         b.gt            1b
         mov             w0, orig_height
+        restore_d8_d9_if_16bpp
         ret
 endfunc
 .endm
@@ -870,10 +920,18 @@ declare_rgb_funcs           yuv422p
         declare_func    \ifmt, bgr555le
 .endm
 
-declare_rgb16_funcs nv12
-declare_rgb16_funcs nv21
-declare_rgb16_funcs yuv420p
-declare_rgb16_funcs yuv422p
+.macro declare_rgb16_funcs_2l ifmt              // instantiate declare_2l_packed for ifmt and each 16bpp LE ofmt
+        declare_2l_packed \ifmt, rgb565le
+        declare_2l_packed \ifmt, bgr565le
+        declare_2l_packed \ifmt, rgb555le
+        declare_2l_packed \ifmt, bgr555le
+.endm
+
+// Subsampled inputs take the 2-line rgb16 path; yuv422p stays single-row.
+declare_rgb16_funcs_2l nv12
+declare_rgb16_funcs_2l nv21
+declare_rgb16_funcs_2l yuv420p
+declare_rgb16_funcs    yuv422p
 
 .macro declare_yuva_funcs ifmt
         declare_func    \ifmt, argb

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]

[FFmpeg-cvslog] [ffmpeg] 06/07: swscale/aarch64/yuv2rgb_neon: 2 lines at a time, rgb16

Reply via email to