This is an automated email from the git hooks/post-receive script. Git pushed a commit to branch master in repository ffmpeg.
commit cc7c567920946897d7fd64a096c72bef1254b046 Author: DROOdotFOO <[email protected]> AuthorDate: Tue Jun 9 01:38:46 2026 +0200 Commit: Ramiro Polla <[email protected]> CommitDate: Wed Jun 10 17:54:20 2026 +0000 swscale/aarch64/yuv2rgb_neon: add BE 16bpp output formats BE counterparts to the LE paths in 2e142e52ae; pack adds rev16 before store. nv12/nv21 paths are added but bench-only (no C ref, same as 2e142e52ae). Test Name A55-gcc M1-clang A76-gcc ------------------------------------------------------------------------------------- yuv420p_rgb565be_1920_neon 15086.1 ( 3.91x) 5507.0 ( 4.34x) 19229.1 ( 2.02x) yuv420p_bgr565be_1920_neon 15291.7 ( 3.84x) 5476.9 ( 4.37x) 19229.4 ( 2.02x) yuv420p_rgb555be_1920_neon 15091.5 ( 3.67x) 5569.0 ( 3.97x) 19229.3 ( 1.90x) yuv420p_bgr555be_1920_neon 15298.6 ( 3.62x) 5600.6 ( 3.98x) 19228.8 ( 1.90x) yuv422p_rgb565be_1920_neon 16862.3 ( 4.00x) 6378.8 ( 4.64x) 22110.3 ( 2.07x) yuv422p_bgr565be_1920_neon 17139.3 ( 3.93x) 6448.1 ( 4.50x) 22104.1 ( 2.07x) yuv422p_rgb555be_1920_neon 16853.3 ( 3.98x) 6468.8 ( 4.12x) 22106.4 ( 1.98x) yuv422p_bgr555be_1920_neon 17202.2 ( 3.89x) 6467.0 ( 4.12x) 22110.2 ( 1.98x) yuva420p_rgb565be_1920_neon 15050.2 ( 3.92x) 5452.5 ( 4.39x) 19229.5 ( 2.02x) yuva420p_bgr565be_1920_neon 15346.6 ( 3.84x) 5462.4 ( 4.36x) 19228.9 ( 2.02x) yuva420p_rgb555be_1920_neon 15050.8 ( 3.69x) 5463.3 ( 3.95x) 19228.6 ( 1.90x) yuva420p_bgr555be_1920_neon 15352.8 ( 3.61x) 5543.6 ( 3.89x) 19228.6 ( 1.90x) Co-authored-by: Ramiro Polla <[email protected]> Signed-off-by: DROOdotFOO <[email protected]> --- libswscale/aarch64/swscale_unscaled.c | 24 +++++++ libswscale/aarch64/yuv2rgb_neon.S | 117 +++++++++++++++++++++++++++++----- tests/checkasm/sw_yuv2rgb.c | 56 +++++++++------- 3 files changed, 158 insertions(+), 39 deletions(-) diff --git a/libswscale/aarch64/swscale_unscaled.c b/libswscale/aarch64/swscale_unscaled.c index 5ae8e393a0..c9041db99f 100644 --- a/libswscale/aarch64/swscale_unscaled.c +++ b/libswscale/aarch64/swscale_unscaled.c @@ -95,6 +95,10 @@ DECLARE_FF_YUVX_TO_PACKED_RGB_FUNCS(yuvx, rgb565le) DECLARE_FF_YUVX_TO_PACKED_RGB_FUNCS(yuvx, bgr565le) \ DECLARE_FF_YUVX_TO_PACKED_RGB_FUNCS(yuvx, rgb555le) \ DECLARE_FF_YUVX_TO_PACKED_RGB_FUNCS(yuvx, bgr555le) \ +DECLARE_FF_YUVX_TO_PACKED_RGB_FUNCS(yuvx, rgb565be) \ +DECLARE_FF_YUVX_TO_PACKED_RGB_FUNCS(yuvx, bgr565be) \ +DECLARE_FF_YUVX_TO_PACKED_RGB_FUNCS(yuvx, rgb555be) \ +DECLARE_FF_YUVX_TO_PACKED_RGB_FUNCS(yuvx, bgr555be) \ DECLARE_FF_YUVX_TO_ALL_RGB16_FUNCS(nv12) DECLARE_FF_YUVX_TO_ALL_RGB16_FUNCS(nv21) @@ -161,6 +165,10 @@ static int nv24_to_yuv420p_neon_wrapper(SwsInternal *c, const uint8_t *const src SET_FF_YUVX_TO_RGBX_FUNC(yuvx, YUVX, bgr565le, BGR565LE, accurate_rnd); \ SET_FF_YUVX_TO_RGBX_FUNC(yuvx, YUVX, rgb555le, RGB555LE, accurate_rnd); \ SET_FF_YUVX_TO_RGBX_FUNC(yuvx, YUVX, bgr555le, BGR555LE, accurate_rnd); \ + SET_FF_YUVX_TO_RGBX_FUNC(yuvx, YUVX, rgb565be, RGB565BE, accurate_rnd); \ + SET_FF_YUVX_TO_RGBX_FUNC(yuvx, YUVX, bgr565be, BGR565BE, accurate_rnd); \ + SET_FF_YUVX_TO_RGBX_FUNC(yuvx, YUVX, rgb555be, RGB555BE, accurate_rnd); \ + SET_FF_YUVX_TO_RGBX_FUNC(yuvx, YUVX, bgr555be, BGR555BE, accurate_rnd); \ } while (0) static void get_unscaled_swscale_neon(SwsInternal *c) { @@ -186,6 +194,10 @@ static void get_unscaled_swscale_neon(SwsInternal *c) { SET_FF_YUVX_TO_RGBX_FUNC(yuv420p, YUVA420P, bgr565le, BGR565LE, accurate_rnd); SET_FF_YUVX_TO_RGBX_FUNC(yuv420p, YUVA420P, rgb555le, RGB555LE, accurate_rnd); SET_FF_YUVX_TO_RGBX_FUNC(yuv420p, YUVA420P, bgr555le, BGR555LE, accurate_rnd); + SET_FF_YUVX_TO_RGBX_FUNC(yuv420p, YUVA420P, rgb565be, RGB565BE, accurate_rnd); + SET_FF_YUVX_TO_RGBX_FUNC(yuv420p, YUVA420P, bgr565be, BGR565BE, accurate_rnd); + SET_FF_YUVX_TO_RGBX_FUNC(yuv420p, YUVA420P, rgb555be, RGB555BE, accurate_rnd); + SET_FF_YUVX_TO_RGBX_FUNC(yuv420p, YUVA420P, bgr555be, BGR555BE, accurate_rnd); if (c->opts.dst_format == AV_PIX_FMT_YUV420P && (c->opts.src_format == AV_PIX_FMT_NV24 || c->opts.src_format == AV_PIX_FMT_NV42) && @@ -221,6 +233,10 @@ av_cold SwsFunc ff_yuv2rgb_init_aarch64(SwsInternal *c) case AV_PIX_FMT_BGR565LE: return yuv420p_to_bgr565le_neon_wrapper; case AV_PIX_FMT_RGB555LE: return yuv420p_to_rgb555le_neon_wrapper; case AV_PIX_FMT_BGR555LE: return yuv420p_to_bgr555le_neon_wrapper; + case AV_PIX_FMT_RGB565BE: return yuv420p_to_rgb565be_neon_wrapper; + case AV_PIX_FMT_BGR565BE: return yuv420p_to_bgr565be_neon_wrapper; + case AV_PIX_FMT_RGB555BE: return yuv420p_to_rgb555be_neon_wrapper; + case AV_PIX_FMT_BGR555BE: return yuv420p_to_bgr555be_neon_wrapper; } } else if (c->opts.src_format == AV_PIX_FMT_YUVA420P) { switch (c->opts.dst_format) { @@ -238,6 +254,10 @@ av_cold SwsFunc ff_yuv2rgb_init_aarch64(SwsInternal *c) case AV_PIX_FMT_BGR565LE: return yuv420p_to_bgr565le_neon_wrapper; case AV_PIX_FMT_RGB555LE: return yuv420p_to_rgb555le_neon_wrapper; case AV_PIX_FMT_BGR555LE: return yuv420p_to_bgr555le_neon_wrapper; + case AV_PIX_FMT_RGB565BE: return yuv420p_to_rgb565be_neon_wrapper; + case AV_PIX_FMT_BGR565BE: return yuv420p_to_bgr565be_neon_wrapper; + case AV_PIX_FMT_RGB555BE: return yuv420p_to_rgb555be_neon_wrapper; + case AV_PIX_FMT_BGR555BE: return yuv420p_to_bgr555be_neon_wrapper; } } else if (c->opts.src_format == AV_PIX_FMT_YUV422P) { switch (c->opts.dst_format) { @@ -252,6 +272,10 @@ av_cold SwsFunc ff_yuv2rgb_init_aarch64(SwsInternal *c) case AV_PIX_FMT_BGR565LE: return yuv422p_to_bgr565le_neon_wrapper; case AV_PIX_FMT_RGB555LE: return yuv422p_to_rgb555le_neon_wrapper; case AV_PIX_FMT_BGR555LE: return yuv422p_to_bgr555le_neon_wrapper; + case AV_PIX_FMT_RGB565BE: return yuv422p_to_rgb565be_neon_wrapper; + case AV_PIX_FMT_BGR565BE: return yuv422p_to_bgr565be_neon_wrapper; + case AV_PIX_FMT_RGB555BE: return yuv422p_to_rgb555be_neon_wrapper; + case AV_PIX_FMT_BGR555BE: return yuv422p_to_bgr555be_neon_wrapper; } } return NULL; diff --git a/libswscale/aarch64/yuv2rgb_neon.S b/libswscale/aarch64/yuv2rgb_neon.S index 19e0f1d6a3..734b6dd174 100644 --- a/libswscale/aarch64/yuv2rgb_neon.S +++ b/libswscale/aarch64/yuv2rgb_neon.S @@ -301,6 +301,22 @@ dst_load_args_packed 2 .endm +.macro dst_load_args_rgb565be + dst_load_args_packed 2 +.endm + +.macro dst_load_args_bgr565be + dst_load_args_packed 2 +.endm + +.macro dst_load_args_rgb555be + dst_load_args_packed 2 +.endm + +.macro dst_load_args_bgr555be + dst_load_args_packed 2 +.endm + .macro dst_load_args_gbrp dst_load_args_planar .endm @@ -365,6 +381,22 @@ dst_load_args_packed_2l 2 .endm +.macro dst_load_args_rgb565be_2l + dst_load_args_packed_2l 2 +.endm + +.macro dst_load_args_bgr565be_2l + dst_load_args_packed_2l 2 +.endm + +.macro dst_load_args_rgb555be_2l + dst_load_args_packed_2l 2 +.endm + +.macro dst_load_args_bgr555be_2l + dst_load_args_packed_2l 2 +.endm + // 2-lines-at-a-time planar dst loader. \sp_off is the byte offset at // which the caller's [sp+0] arg now lives (i.e., however many bytes the // caller pushed before invoking this macro). declare_2l_gbrp spills @@ -639,11 +671,11 @@ .endif compute_rgb v4, v5, v6, v16, v17, v18 .if r_first - // rgb*le: (R << hshift) | (G << 5) | B + // rgb*: (R << hshift) | (G << 5) | B pack_rgb16_2l v8, v6, v5, v4, gshift, hshift pack_rgb16_2l v9, v18, v17, v16, gshift, hshift .else - // bgr*le: (B << hshift) | (G << 5) | R + // bgr*: (B << hshift) | (G << 5) | R pack_rgb16_2l v8, v4, v5, v6, gshift, hshift pack_rgb16_2l v9, v16, v17, v18, gshift, hshift .endif @@ -651,15 +683,16 @@ .endif .endm -// Map ofmt to .set predicates: rgb16=1 for the four 16bpp LE ofmts +// Map ofmt to .set predicates: rgb16=1 for the eight 16bpp ofmts // (r_first=1 for rgb*, 0 for bgr*; gshift/hshift = 2/11 for 565, -// 3/10 for 555), letting sibling macros branch on .if rgb16 instead of -// repeating a four-way .ifc cascade. +// 3/10 for 555; is_be=1 for the BE variants), letting sibling macros +// branch on .if rgb16 / .if is_be instead of repeating .ifc cascades. .macro set_rgb16_predicates ofmt .set rgb16, 0 .set r_first, 0 .set gshift, 0 .set hshift, 0 + .set is_be, 0 .ifc \ofmt,rgb565le .set rgb16, 1 .set r_first, 1 @@ -682,6 +715,32 @@ .set gshift, 3 .set hshift, 10 .endif +.ifc \ofmt,rgb565be + .set rgb16, 1 + .set r_first, 1 + .set gshift, 2 + .set hshift, 11 + .set is_be, 1 +.endif +.ifc \ofmt,bgr565be + .set rgb16, 1 + .set gshift, 2 + .set hshift, 11 + .set is_be, 1 +.endif +.ifc \ofmt,rgb555be + .set rgb16, 1 + .set r_first, 1 + .set gshift, 3 + .set hshift, 10 + .set is_be, 1 +.endif +.ifc \ofmt,bgr555be + .set rgb16, 1 + .set gshift, 3 + .set hshift, 10 + .set is_be, 1 +.endif .endm // 16bpp packing uses v8/v9 as the accumulator. AAPCS-64 requires d8/d9 @@ -704,10 +763,10 @@ // Pack 8 pixels of 16bpp output. The three channels are extracted via ushr, // widened to u16, then merged via shift-left-insert: // dst = (high << high_shl) | (mid << 5) | low -// For RGB565LE pass (B, G, R) as (low, mid, high), g_shr=2, high_shl=11. -// For BGR565LE pass (R, G, B), g_shr=2, high_shl=11. -// For RGB555LE pass (B, G, R), g_shr=3, high_shl=10. -// For BGR555LE pass (R, G, B), g_shr=3, high_shl=10. +// For RGB565LE/BE pass (B, G, R) as (low, mid, high), g_shr=2, high_shl=11. +// For BGR565LE/BE pass (R, G, B), g_shr=2, high_shl=11. +// For RGB555LE/BE pass (B, G, R), g_shr=3, high_shl=10. +// For BGR555LE/BE pass (R, G, B), g_shr=3, high_shl=10. // Clobbers v20-v23. .macro pack_rgb16 dst, low_ch, mid_ch, high_ch, g_shr, high_shl ushr v20.8b, \high_ch\().8b, #3 @@ -718,6 +777,9 @@ sli \dst\().8h, v23.8h, #5 uxtl v23.8h, v20.8b sli \dst\().8h, v23.8h, #\high_shl +.if is_be + rev16 \dst\().16b, \dst\().16b +.endif .endm // As pack_rgb16 but uses v26-v29 as scratch (luma temps, dead after @@ -733,6 +795,9 @@ sli \dst\().8h, v29.8h, #5 uxtl v29.8h, v26.8b sli \dst\().8h, v29.8h, #\high_shl +.if is_be + rev16 \dst\().16b, \dst\().16b +.endif .endm .macro declare_func ifmt ofmt @@ -827,11 +892,11 @@ function ff_\ifmt\()_to_\ofmt\()_neon, export=1 .if rgb16 compute_rgb v4,v5,v6, v16,v17,v18 .if r_first - // rgb*le: (R << hshift) | (G << 5) | B + // rgb*: (R << hshift) | (G << 5) | B pack_rgb16 v8, v6, v5, v4, gshift, hshift pack_rgb16 v9, v18, v17, v16, gshift, hshift .else - // bgr*le: (B << hshift) | (G << 5) | R + // bgr*: (B << hshift) | (G << 5) | R pack_rgb16 v8, v4, v5, v6, gshift, hshift pack_rgb16 v9, v16, v17, v18, gshift, hshift .endif @@ -970,25 +1035,43 @@ declare_rgb_funcs_2l_packed yuv420p declare_2l_gbrp yuv420p declare_rgb_funcs yuv422p -.macro declare_rgb16_funcs ifmt +.macro declare_rgb16le_funcs ifmt declare_func \ifmt, rgb565le declare_func \ifmt, bgr565le declare_func \ifmt, rgb555le declare_func \ifmt, bgr555le .endm -.macro declare_rgb16_funcs_2l ifmt +.macro declare_rgb16le_funcs_2l ifmt declare_2l_packed \ifmt, rgb565le declare_2l_packed \ifmt, bgr565le declare_2l_packed \ifmt, rgb555le declare_2l_packed \ifmt, bgr555le .endm +.macro declare_rgb16be_funcs ifmt + declare_func \ifmt, rgb565be + declare_func \ifmt, bgr565be + declare_func \ifmt, rgb555be + declare_func \ifmt, bgr555be +.endm + +.macro declare_rgb16be_funcs_2l ifmt + declare_2l_packed \ifmt, rgb565be + declare_2l_packed \ifmt, bgr565be + declare_2l_packed \ifmt, rgb555be + declare_2l_packed \ifmt, bgr555be +.endm + // Subsampled inputs take the 2-line rgb16 path; yuv422p stays single-row. -declare_rgb16_funcs_2l nv12 -declare_rgb16_funcs_2l nv21 -declare_rgb16_funcs_2l yuv420p -declare_rgb16_funcs yuv422p +declare_rgb16le_funcs_2l nv12 +declare_rgb16be_funcs_2l nv12 +declare_rgb16le_funcs_2l nv21 +declare_rgb16be_funcs_2l nv21 +declare_rgb16le_funcs_2l yuv420p +declare_rgb16be_funcs_2l yuv420p +declare_rgb16le_funcs yuv422p +declare_rgb16be_funcs yuv422p .macro declare_yuva_funcs ifmt declare_func \ifmt, argb diff --git a/tests/checkasm/sw_yuv2rgb.c b/tests/checkasm/sw_yuv2rgb.c index 2b3b1eec61..0f59d09c80 100644 --- a/tests/checkasm/sw_yuv2rgb.c +++ b/tests/checkasm/sw_yuv2rgb.c @@ -46,10 +46,14 @@ static const int dst_fmts[] = { AV_PIX_FMT_BGRA, AV_PIX_FMT_RGB24, AV_PIX_FMT_BGR24, - AV_PIX_FMT_RGB565, - AV_PIX_FMT_BGR565, - AV_PIX_FMT_RGB555, - AV_PIX_FMT_BGR555, + AV_PIX_FMT_RGB565LE, + AV_PIX_FMT_BGR565LE, + AV_PIX_FMT_RGB555LE, + AV_PIX_FMT_BGR555LE, + AV_PIX_FMT_RGB565BE, + AV_PIX_FMT_BGR565BE, + AV_PIX_FMT_RGB555BE, + AV_PIX_FMT_BGR555BE, // AV_PIX_FMT_RGB444, // AV_PIX_FMT_BGR444, // AV_PIX_FMT_RGB8, @@ -71,31 +75,31 @@ static int cmp_off_by_n(const uint8_t *ref, const uint8_t *test, size_t n, int a return 0; } -static int cmp_555_by_n(const uint8_t *ref, const uint8_t *test, size_t n, int accuracy) +static int cmp_555_by_n(const uint8_t *ref, const uint8_t *test, size_t n, int accuracy, int is_be) { - const uint16_t *ref16 = (const uint16_t *) ref; - const uint16_t *test16 = (const uint16_t *) test; for (size_t i = 0; i < n; i++) { - if (abs(( ref16[i] & 0x1f) - ( test16[i] & 0x1f)) > accuracy) + uint16_t r = is_be ? AV_RB16(ref + i * 2) : AV_RL16(ref + i * 2); + uint16_t t = is_be ? AV_RB16(test + i * 2) : AV_RL16(test + i * 2); + if (abs(( r & 0x1f) - ( t & 0x1f)) > accuracy) return 1; - if (abs(((ref16[i] >> 5) & 0x1f) - ((test16[i] >> 5) & 0x1f)) > accuracy) + if (abs(((r >> 5) & 0x1f) - ((t >> 5) & 0x1f)) > accuracy) return 1; - if (abs(((ref16[i] >> 10) & 0x1f) - ((test16[i] >> 10) & 0x1f)) > accuracy) + if (abs(((r >> 10) & 0x1f) - ((t >> 10) & 0x1f)) > accuracy) return 1; } return 0; } -static int cmp_565_by_n(const uint8_t *ref, const uint8_t *test, size_t n, int accuracy) +static int cmp_565_by_n(const uint8_t *ref, const uint8_t *test, size_t n, int accuracy, int is_be) { - const uint16_t *ref16 = (const uint16_t *) ref; - const uint16_t *test16 = (const uint16_t *) test; for (size_t i = 0; i < n; i++) { - if (abs(( ref16[i] & 0x1f) - ( test16[i] & 0x1f)) > accuracy) + uint16_t r = is_be ? AV_RB16(ref + i * 2) : AV_RL16(ref + i * 2); + uint16_t t = is_be ? AV_RB16(test + i * 2) : AV_RL16(test + i * 2); + if (abs(( r & 0x1f) - ( t & 0x1f)) > accuracy) return 1; - if (abs(((ref16[i] >> 5) & 0x3f) - ((test16[i] >> 5) & 0x3f)) > accuracy) + if (abs(((r >> 5) & 0x3f) - ((t >> 5) & 0x3f)) > accuracy) return 1; - if (abs(((ref16[i] >> 11) & 0x1f) - ((test16[i] >> 11) & 0x1f)) > accuracy) + if (abs(((r >> 11) & 0x1f) - ((t >> 11) & 0x1f)) > accuracy) return 1; } return 0; @@ -199,19 +203,27 @@ static void check_yuv2rgb(int src_pix_fmt) dst1_0 + row * dstStride[0], width * sample_size, 3)) fail(); - } else if (dst_pix_fmt == AV_PIX_FMT_RGB565 || - dst_pix_fmt == AV_PIX_FMT_BGR565) { + } else if (dst_pix_fmt == AV_PIX_FMT_RGB565LE || + dst_pix_fmt == AV_PIX_FMT_BGR565LE || + dst_pix_fmt == AV_PIX_FMT_RGB565BE || + dst_pix_fmt == AV_PIX_FMT_BGR565BE) { + int is_be = dst_pix_fmt == AV_PIX_FMT_RGB565BE || + dst_pix_fmt == AV_PIX_FMT_BGR565BE; for (int row = 0; row < srcSliceH; row++) if (cmp_565_by_n(dst0_0 + row * dstStride[0], dst1_0 + row * dstStride[0], - width, 2)) + width, 2, is_be)) fail(); - } else if (dst_pix_fmt == AV_PIX_FMT_RGB555 || - dst_pix_fmt == AV_PIX_FMT_BGR555) { + } else if (dst_pix_fmt == AV_PIX_FMT_RGB555LE || + dst_pix_fmt == AV_PIX_FMT_BGR555LE || + dst_pix_fmt == AV_PIX_FMT_RGB555BE || + dst_pix_fmt == AV_PIX_FMT_BGR555BE) { + int is_be = dst_pix_fmt == AV_PIX_FMT_RGB555BE || + dst_pix_fmt == AV_PIX_FMT_BGR555BE; for (int row = 0; row < srcSliceH; row++) if (cmp_555_by_n(dst0_0 + row * dstStride[0], dst1_0 + row * dstStride[0], - width, 2)) + width, 2, is_be)) fail(); } else if (dst_pix_fmt == AV_PIX_FMT_GBRP) { for (int p = 0; p < 3; p++) _______________________________________________ ffmpeg-cvslog mailing list -- [email protected] To unsubscribe send an email to [email protected]
