This is an automated email from the git hooks/post-receive script. Git pushed a commit to branch master in repository ffmpeg.
commit 11b1721b11becf9692d855802b990cb032de6f3c Author: DROOdotFOO <[email protected]> AuthorDate: Fri May 29 23:51:51 2026 +0200 Commit: Ramiro Polla <[email protected]> CommitDate: Sat Jun 6 19:38:40 2026 +0200 swscale/aarch64/yuv2rgb_neon: reorder params, unify signature Pass src[]/srcStride[] as arrays (x5/x6), move y_offset/y_coeff into register args (w2/w3). Only int-after-pointer stack args remain, so Apple and AAPCS64 lay them out identically; every __APPLE__ is gone. nv12/nv21/yuv420p/yuv422p/yuva420p share one signature. Co-authored-by: Ramiro Polla <[email protected]> Signed-off-by: DROOdotFOO <[email protected]> --- libswscale/aarch64/swscale_unscaled.c | 240 ++++++-------------- libswscale/aarch64/yuv2rgb_neon.S | 404 +++++++++++++++++++--------------- 2 files changed, 297 insertions(+), 347 deletions(-) diff --git a/libswscale/aarch64/swscale_unscaled.c b/libswscale/aarch64/swscale_unscaled.c index aa23f9c955..5ae8e393a0 100644 --- a/libswscale/aarch64/swscale_unscaled.c +++ b/libswscale/aarch64/swscale_unscaled.c @@ -27,15 +27,13 @@ c->yuv2rgb_v2g_coeff, \ c->yuv2rgb_u2b_coeff, \ -#define DECLARE_FF_YUVX_TO_RGBX_FUNCS(ifmt, ofmt) \ +#define DECLARE_FF_YUVX_TO_PACKED_RGB_FUNCS(ifmt, ofmt) \ int ff_##ifmt##_to_##ofmt##_neon(int w, int h, \ - uint8_t *dst, int linesize, \ - const uint8_t *srcY, int linesizeY, \ - const uint8_t *srcU, int linesizeU, \ - const uint8_t *srcV, int linesizeV, \ - const int16_t *table, \ int y_offset, \ - int y_coeff); \ + int y_coeff, \ + const int16_t *table, \ + const uint8_t *const src[], const int srcStride[], \ + uint8_t *dst, int linesize); \ \ static int ifmt##_to_##ofmt##_neon_wrapper(SwsInternal *c, const uint8_t *const src[], \ const int srcStride[], int srcSliceY, \ @@ -44,24 +42,21 @@ static int ifmt##_to_##ofmt##_neon_wrapper(SwsInternal *c, const uint8_t *const const int16_t yuv2rgb_table[] = { YUV_TO_RGB_TABLE }; \ \ return ff_##ifmt##_to_##ofmt##_neon(c->opts.src_w, srcSliceH, \ - dst[0] + srcSliceY * 
dstStride[0], dstStride[0], \ - src[0], srcStride[0], \ - src[1], srcStride[1], \ - src[2], srcStride[2], \ - yuv2rgb_table, \ c->yuv2rgb_y_offset >> 6, \ - c->yuv2rgb_y_coeff); \ + c->yuv2rgb_y_coeff, \ + yuv2rgb_table, \ + src, srcStride, \ + dst[0] + srcSliceY * dstStride[0], \ + dstStride[0]); \ } \ -#define DECLARE_FF_YUVX_TO_GBRP_FUNCS(ifmt, ofmt) \ +#define DECLARE_FF_YUVX_TO_PLANAR_RGB_FUNCS(ifmt, ofmt) \ int ff_##ifmt##_to_##ofmt##_neon(int w, int h, \ - uint8_t *dst, int linesize, \ - const uint8_t *srcY, int linesizeY, \ - const uint8_t *srcU, int linesizeU, \ - const uint8_t *srcV, int linesizeV, \ - const int16_t *table, \ int y_offset, \ int y_coeff, \ + const int16_t *table, \ + const uint8_t *const src[], const int srcStride[], \ + uint8_t *dst0, int linesize0, \ uint8_t *dst1, int linesize1, \ uint8_t *dst2, int linesize2); \ \ @@ -72,120 +67,44 @@ static int ifmt##_to_##ofmt##_neon_wrapper(SwsInternal *c, const uint8_t *const const int16_t yuv2rgb_table[] = { YUV_TO_RGB_TABLE }; \ \ return ff_##ifmt##_to_##ofmt##_neon(c->opts.src_w, srcSliceH, \ - dst[0] + srcSliceY * dstStride[0], dstStride[0], \ - src[0], srcStride[0], \ - src[1], srcStride[1], \ - src[2], srcStride[2], \ - yuv2rgb_table, \ c->yuv2rgb_y_offset >> 6, \ c->yuv2rgb_y_coeff, \ + yuv2rgb_table, \ + src, srcStride, \ + dst[0] + srcSliceY * dstStride[0], dstStride[0], \ dst[1] + srcSliceY * dstStride[1], dstStride[1], \ dst[2] + srcSliceY * dstStride[2], dstStride[2]); \ } \ #define DECLARE_FF_YUVX_TO_ALL_RGBX_FUNCS(yuvx) \ -DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, argb) \ -DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, rgba) \ -DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, abgr) \ -DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, bgra) \ -DECLARE_FF_YUVX_TO_GBRP_FUNCS(yuvx, gbrp) \ -DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, rgb24) \ -DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, bgr24) \ +DECLARE_FF_YUVX_TO_PACKED_RGB_FUNCS(yuvx, argb) \ +DECLARE_FF_YUVX_TO_PACKED_RGB_FUNCS(yuvx, rgba) \ +DECLARE_FF_YUVX_TO_PACKED_RGB_FUNCS(yuvx, abgr) \ 
+DECLARE_FF_YUVX_TO_PACKED_RGB_FUNCS(yuvx, bgra) \ +DECLARE_FF_YUVX_TO_PACKED_RGB_FUNCS(yuvx, rgb24) \ +DECLARE_FF_YUVX_TO_PACKED_RGB_FUNCS(yuvx, bgr24) \ +DECLARE_FF_YUVX_TO_PLANAR_RGB_FUNCS(yuvx, gbrp) \ +DECLARE_FF_YUVX_TO_ALL_RGBX_FUNCS(nv12) +DECLARE_FF_YUVX_TO_ALL_RGBX_FUNCS(nv21) DECLARE_FF_YUVX_TO_ALL_RGBX_FUNCS(yuv420p) DECLARE_FF_YUVX_TO_ALL_RGBX_FUNCS(yuv422p) #define DECLARE_FF_YUVX_TO_ALL_RGB16_FUNCS(yuvx) \ -DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, rgb565le) \ -DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, bgr565le) \ -DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, rgb555le) \ -DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, bgr555le) \ +DECLARE_FF_YUVX_TO_PACKED_RGB_FUNCS(yuvx, rgb565le) \ +DECLARE_FF_YUVX_TO_PACKED_RGB_FUNCS(yuvx, bgr565le) \ +DECLARE_FF_YUVX_TO_PACKED_RGB_FUNCS(yuvx, rgb555le) \ +DECLARE_FF_YUVX_TO_PACKED_RGB_FUNCS(yuvx, bgr555le) \ +DECLARE_FF_YUVX_TO_ALL_RGB16_FUNCS(nv12) +DECLARE_FF_YUVX_TO_ALL_RGB16_FUNCS(nv21) DECLARE_FF_YUVX_TO_ALL_RGB16_FUNCS(yuv420p) DECLARE_FF_YUVX_TO_ALL_RGB16_FUNCS(yuv422p) -#define DECLARE_FF_YUVA420P_TO_RGBX_FUNCS(ofmt) \ -int ff_yuva420p_to_##ofmt##_neon(int w, int h, \ - uint8_t *dst, int linesize, \ - const uint8_t *srcY, int linesizeY, \ - const uint8_t *srcU, int linesizeU, \ - const uint8_t *srcV, int linesizeV, \ - const int16_t *table, \ - int y_offset, int y_coeff, \ - const uint8_t *srcA, int linesizeA); \ - \ -static int yuva420p_to_##ofmt##_neon_wrapper(SwsInternal *c, \ - const uint8_t *const src[], \ - const int srcStride[], int srcSliceY, \ - int srcSliceH, uint8_t *const dst[], \ - const int dstStride[]) { \ - const int16_t yuv2rgb_table[] = { YUV_TO_RGB_TABLE }; \ - \ - return ff_yuva420p_to_##ofmt##_neon(c->opts.src_w, srcSliceH, \ - dst[0] + srcSliceY * dstStride[0], dstStride[0], \ - src[0], srcStride[0], \ - src[1], srcStride[1], \ - src[2], srcStride[2], \ - yuv2rgb_table, \ - c->yuv2rgb_y_offset >> 6, \ - c->yuv2rgb_y_coeff, \ - src[3], srcStride[3]); \ -} - -DECLARE_FF_YUVA420P_TO_RGBX_FUNCS(argb) 
-DECLARE_FF_YUVA420P_TO_RGBX_FUNCS(rgba) -DECLARE_FF_YUVA420P_TO_RGBX_FUNCS(abgr) -DECLARE_FF_YUVA420P_TO_RGBX_FUNCS(bgra) - -#define DECLARE_FF_NVX_TO_RGBX_FUNCS(ifmt, ofmt) \ -int ff_##ifmt##_to_##ofmt##_neon(int w, int h, \ - uint8_t *dst, int linesize, \ - const uint8_t *srcY, int linesizeY, \ - const uint8_t *srcC, int linesizeC, \ - const int16_t *table, \ - int y_offset, \ - int y_coeff); \ - \ -static int ifmt##_to_##ofmt##_neon_wrapper(SwsInternal *c, const uint8_t *const src[], \ - const int srcStride[], int srcSliceY, \ - int srcSliceH, uint8_t *const dst[], \ - const int dstStride[]) { \ - const int16_t yuv2rgb_table[] = { YUV_TO_RGB_TABLE }; \ - \ - return ff_##ifmt##_to_##ofmt##_neon(c->opts.src_w, srcSliceH, \ - dst[0] + srcSliceY * dstStride[0], dstStride[0], \ - src[0], srcStride[0], src[1], srcStride[1], \ - yuv2rgb_table, \ - c->yuv2rgb_y_offset >> 6, \ - c->yuv2rgb_y_coeff); \ -} \ - -#define DECLARE_FF_NVX_TO_GBRP_FUNCS(ifmt, ofmt) \ -int ff_##ifmt##_to_##ofmt##_neon(int w, int h, \ - uint8_t *dst, int linesize, \ - const uint8_t *srcY, int linesizeY, \ - const uint8_t *srcC, int linesizeC, \ - const int16_t *table, \ - int y_offset, \ - int y_coeff, \ - uint8_t *dst1, int linesize1, \ - uint8_t *dst2, int linesize2); \ - \ -static int ifmt##_to_##ofmt##_neon_wrapper(SwsInternal *c, const uint8_t *const src[], \ - const int srcStride[], int srcSliceY, \ - int srcSliceH, uint8_t *const dst[], \ - const int dstStride[]) { \ - const int16_t yuv2rgb_table[] = { YUV_TO_RGB_TABLE }; \ - \ - return ff_##ifmt##_to_##ofmt##_neon(c->opts.src_w, srcSliceH, \ - dst[0] + srcSliceY * dstStride[0], dstStride[0], \ - src[0], srcStride[0], src[1], srcStride[1], \ - yuv2rgb_table, \ - c->yuv2rgb_y_offset >> 6, \ - c->yuv2rgb_y_coeff, \ - dst[1] + srcSliceY * dstStride[1], dstStride[1], \ - dst[2] + srcSliceY * dstStride[2], dstStride[2]); \ -} \ +DECLARE_FF_YUVX_TO_PACKED_RGB_FUNCS(yuva420p, argb) +DECLARE_FF_YUVX_TO_PACKED_RGB_FUNCS(yuva420p, rgba) 
+DECLARE_FF_YUVX_TO_PACKED_RGB_FUNCS(yuva420p, abgr) +DECLARE_FF_YUVX_TO_PACKED_RGB_FUNCS(yuva420p, bgra) void ff_nv24_to_yuv420p_chroma_neon(uint8_t *dst1, int dstStride1, uint8_t *dst2, int dstStride2, @@ -214,32 +133,11 @@ static int nv24_to_yuv420p_neon_wrapper(SwsInternal *c, const uint8_t *const src return srcSliceH; } -#define DECLARE_FF_NVX_TO_ALL_RGBX_FUNCS(nvx) \ -DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, argb) \ -DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, rgba) \ -DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, abgr) \ -DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, bgra) \ -DECLARE_FF_NVX_TO_GBRP_FUNCS(nvx, gbrp) \ -DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, rgb24) \ -DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, bgr24) \ - -DECLARE_FF_NVX_TO_ALL_RGBX_FUNCS(nv12) -DECLARE_FF_NVX_TO_ALL_RGBX_FUNCS(nv21) - -#define DECLARE_FF_NVX_TO_ALL_RGB16_FUNCS(nvx) \ -DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, rgb565le) \ -DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, bgr565le) \ -DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, rgb555le) \ -DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, bgr555le) \ - -DECLARE_FF_NVX_TO_ALL_RGB16_FUNCS(nv12) -DECLARE_FF_NVX_TO_ALL_RGB16_FUNCS(nv21) - /* We need a 16 pixel width alignment. This constraint can easily be removed * for input reading but for the output which is 4-bytes per pixel (RGBA) the * assembly might be writing as much as 4*15=60 extra bytes at the end of the * line, which won't fit the 32-bytes buffer alignment. 
*/ -#define SET_FF_NVX_TO_RGBX_FUNC(ifmt, IFMT, ofmt, OFMT, accurate_rnd) do { \ +#define SET_FF_YUVX_TO_RGBX_FUNC(ifmt, IFMT, ofmt, OFMT, accurate_rnd) do { \ if (c->opts.src_format == AV_PIX_FMT_##IFMT \ && c->opts.dst_format == AV_PIX_FMT_##OFMT \ && !(c->opts.src_h & 1) \ @@ -248,46 +146,46 @@ DECLARE_FF_NVX_TO_ALL_RGB16_FUNCS(nv21) c->convert_unscaled = ifmt##_to_##ofmt##_neon_wrapper; \ } while (0) -#define SET_FF_NVX_TO_ALL_RGBX_FUNC(nvx, NVX, accurate_rnd) do { \ - SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, argb, ARGB, accurate_rnd); \ - SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, rgba, RGBA, accurate_rnd); \ - SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, abgr, ABGR, accurate_rnd); \ - SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, bgra, BGRA, accurate_rnd); \ - SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, gbrp, GBRP, accurate_rnd); \ - SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, rgb24, RGB24, accurate_rnd); \ - SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, bgr24, BGR24, accurate_rnd); \ +#define SET_FF_YUVX_TO_ALL_RGBX_FUNC(yuvx, YUVX, accurate_rnd) do { \ + SET_FF_YUVX_TO_RGBX_FUNC(yuvx, YUVX, argb, ARGB, accurate_rnd); \ + SET_FF_YUVX_TO_RGBX_FUNC(yuvx, YUVX, rgba, RGBA, accurate_rnd); \ + SET_FF_YUVX_TO_RGBX_FUNC(yuvx, YUVX, abgr, ABGR, accurate_rnd); \ + SET_FF_YUVX_TO_RGBX_FUNC(yuvx, YUVX, bgra, BGRA, accurate_rnd); \ + SET_FF_YUVX_TO_RGBX_FUNC(yuvx, YUVX, gbrp, GBRP, accurate_rnd); \ + SET_FF_YUVX_TO_RGBX_FUNC(yuvx, YUVX, rgb24, RGB24, accurate_rnd); \ + SET_FF_YUVX_TO_RGBX_FUNC(yuvx, YUVX, bgr24, BGR24, accurate_rnd); \ } while (0) -#define SET_FF_NVX_TO_ALL_RGB16_FUNC(nvx, NVX, accurate_rnd) do { \ - SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, rgb565le, RGB565LE, accurate_rnd); \ - SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, bgr565le, BGR565LE, accurate_rnd); \ - SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, rgb555le, RGB555LE, accurate_rnd); \ - SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, bgr555le, BGR555LE, accurate_rnd); \ +#define SET_FF_YUVX_TO_ALL_RGB16_FUNC(yuvx, YUVX, accurate_rnd) do { \ + SET_FF_YUVX_TO_RGBX_FUNC(yuvx, YUVX, rgb565le, RGB565LE, 
accurate_rnd); \ + SET_FF_YUVX_TO_RGBX_FUNC(yuvx, YUVX, bgr565le, BGR565LE, accurate_rnd); \ + SET_FF_YUVX_TO_RGBX_FUNC(yuvx, YUVX, rgb555le, RGB555LE, accurate_rnd); \ + SET_FF_YUVX_TO_RGBX_FUNC(yuvx, YUVX, bgr555le, BGR555LE, accurate_rnd); \ } while (0) static void get_unscaled_swscale_neon(SwsInternal *c) { int accurate_rnd = c->opts.flags & SWS_ACCURATE_RND; - SET_FF_NVX_TO_ALL_RGBX_FUNC(nv12, NV12, accurate_rnd); - SET_FF_NVX_TO_ALL_RGBX_FUNC(nv21, NV21, accurate_rnd); - SET_FF_NVX_TO_ALL_RGBX_FUNC(yuv420p, YUV420P, accurate_rnd); - SET_FF_NVX_TO_ALL_RGBX_FUNC(yuv422p, YUV422P, accurate_rnd); - SET_FF_NVX_TO_ALL_RGB16_FUNC(nv12, NV12, accurate_rnd); - SET_FF_NVX_TO_ALL_RGB16_FUNC(nv21, NV21, accurate_rnd); - SET_FF_NVX_TO_ALL_RGB16_FUNC(yuv420p, YUV420P, accurate_rnd); - SET_FF_NVX_TO_ALL_RGB16_FUNC(yuv422p, YUV422P, accurate_rnd); - SET_FF_NVX_TO_RGBX_FUNC(yuva420p, YUVA420P, argb, ARGB, accurate_rnd); - SET_FF_NVX_TO_RGBX_FUNC(yuva420p, YUVA420P, rgba, RGBA, accurate_rnd); - SET_FF_NVX_TO_RGBX_FUNC(yuva420p, YUVA420P, abgr, ABGR, accurate_rnd); - SET_FF_NVX_TO_RGBX_FUNC(yuva420p, YUVA420P, bgra, BGRA, accurate_rnd); - SET_FF_NVX_TO_RGBX_FUNC(yuv420p, YUVA420P, rgb24, RGB24, accurate_rnd); - SET_FF_NVX_TO_RGBX_FUNC(yuv420p, YUVA420P, bgr24, BGR24, accurate_rnd); - SET_FF_NVX_TO_RGBX_FUNC(yuv420p, YUVA420P, gbrp, GBRP, accurate_rnd); + SET_FF_YUVX_TO_ALL_RGBX_FUNC(nv12, NV12, accurate_rnd); + SET_FF_YUVX_TO_ALL_RGBX_FUNC(nv21, NV21, accurate_rnd); + SET_FF_YUVX_TO_ALL_RGBX_FUNC(yuv420p, YUV420P, accurate_rnd); + SET_FF_YUVX_TO_ALL_RGBX_FUNC(yuv422p, YUV422P, accurate_rnd); + SET_FF_YUVX_TO_ALL_RGB16_FUNC(nv12, NV12, accurate_rnd); + SET_FF_YUVX_TO_ALL_RGB16_FUNC(nv21, NV21, accurate_rnd); + SET_FF_YUVX_TO_ALL_RGB16_FUNC(yuv420p, YUV420P, accurate_rnd); + SET_FF_YUVX_TO_ALL_RGB16_FUNC(yuv422p, YUV422P, accurate_rnd); + SET_FF_YUVX_TO_RGBX_FUNC(yuva420p, YUVA420P, argb, ARGB, accurate_rnd); + SET_FF_YUVX_TO_RGBX_FUNC(yuva420p, YUVA420P, rgba, RGBA, 
accurate_rnd); + SET_FF_YUVX_TO_RGBX_FUNC(yuva420p, YUVA420P, abgr, ABGR, accurate_rnd); + SET_FF_YUVX_TO_RGBX_FUNC(yuva420p, YUVA420P, bgra, BGRA, accurate_rnd); + SET_FF_YUVX_TO_RGBX_FUNC(yuv420p, YUVA420P, rgb24, RGB24, accurate_rnd); + SET_FF_YUVX_TO_RGBX_FUNC(yuv420p, YUVA420P, bgr24, BGR24, accurate_rnd); + SET_FF_YUVX_TO_RGBX_FUNC(yuv420p, YUVA420P, gbrp, GBRP, accurate_rnd); /* yuva420p -> 16bpp: alpha is dropped, route through yuv420p NEON path */ - SET_FF_NVX_TO_RGBX_FUNC(yuv420p, YUVA420P, rgb565le, RGB565LE, accurate_rnd); - SET_FF_NVX_TO_RGBX_FUNC(yuv420p, YUVA420P, bgr565le, BGR565LE, accurate_rnd); - SET_FF_NVX_TO_RGBX_FUNC(yuv420p, YUVA420P, rgb555le, RGB555LE, accurate_rnd); - SET_FF_NVX_TO_RGBX_FUNC(yuv420p, YUVA420P, bgr555le, BGR555LE, accurate_rnd); + SET_FF_YUVX_TO_RGBX_FUNC(yuv420p, YUVA420P, rgb565le, RGB565LE, accurate_rnd); + SET_FF_YUVX_TO_RGBX_FUNC(yuv420p, YUVA420P, bgr565le, BGR565LE, accurate_rnd); + SET_FF_YUVX_TO_RGBX_FUNC(yuv420p, YUVA420P, rgb555le, RGB555LE, accurate_rnd); + SET_FF_YUVX_TO_RGBX_FUNC(yuv420p, YUVA420P, bgr555le, BGR555LE, accurate_rnd); if (c->opts.dst_format == AV_PIX_FMT_YUV420P && (c->opts.src_format == AV_PIX_FMT_NV24 || c->opts.src_format == AV_PIX_FMT_NV42) && diff --git a/libswscale/aarch64/yuv2rgb_neon.S b/libswscale/aarch64/yuv2rgb_neon.S index c9a12a06b6..5fb8dfd407 100644 --- a/libswscale/aarch64/yuv2rgb_neon.S +++ b/libswscale/aarch64/yuv2rgb_neon.S @@ -1,6 +1,7 @@ /* * Copyright (c) 2016 Matthieu Bouron <matthieu.bouron stupeflix.com> * Copyright (c) 2016 Clément Bœsch <clement stupeflix.com> + * Copyright (c) 2026 Ramiro Polla * * This file is part of FFmpeg. * @@ -21,179 +22,218 @@ #include "libavutil/aarch64/asm.S" -// Register aliases for the single-row code. The 32/64-bit view split -// covers prologue-transient roles for x8/x10 (table pointer / scalar -// coefficients are dropped into vector regs and the GPR is reused for -// the loop counter / planar output pointer). 
+// Calling convention for ff_<ifmt>_to_<ofmt>_neon: +// w0 int w (width, multiple of 16) +// w1 int h +// w2 int y_offset +// w3 int y_coeff +// x4 const int16_t *yuv2rgb_table +// x5 const uint8_t *const src[] (Y, U/C, V, A as needed) +// x6 const int srcStride[] +// x7 uint8_t *dst0 +// [sp + 0] int linesize0 +// [sp + 8] uint8_t *dst1 (planar only) +// [sp + 16] int linesize1 (planar only) +// [sp + 24] uint8_t *dst2 (planar only) +// [sp + 32] int linesize2 (planar only) +// Passing src/srcStride as arrays keeps every scalar arg in a register and +// leaves only pointer-followed-by-int on the stack, so Apple's natural +// packing and AAPCS64's 8-byte slotting coincide and no per-ABI offset +// branching is needed. + #define width w0 +#define widthx x0 #define height w1 -#define dst0 x2 -#define dstPadding0 w3 -#define srcY x4 -#define srcPaddingY w5 -#define srcC x6 -#define srcU x6 -#define srcPaddingC w7 -#define srcPaddingU w7 -#define srcV x13 -#define srcPaddingV w14 -#define srcA x15 -#define srcPaddingA w16 -#define dst1 x10 -#define dst2 x15 -#define dstPadding1 w12 -#define dstPadding2 w16 - -// Prologue: load table / y_offset / y_coeff from the stack. -#define table_ptr x8 -#define y_offset w9 -#define y_coeff w10 - -// Body loop state. Reuses x8/w9 once the prologue has consumed them. +#define y_offset w2 +#define y_coeff w3 +#define table_ptr x4 + +// Source plane pointers (loaded from src[] in the prologue; the slots are +// reused for srcY/srcC/srcV/srcA once y_offset/y_coeff/table_ptr are +// consumed by dup/ld1). +#define srcY x2 +#define srcC x3 +#define srcU x3 +#define srcV x4 +#define srcA x5 + +// Source plane padding (sign-extended in the prologue so the row-end +// increment is a single 64-bit add).
+#define srcPaddingY x10 +#define srcPaddingC x11 +#define srcPaddingU x11 +#define srcPaddingV x12 +#define srcPaddingA x6 +#define srcPaddingYw w10 +#define srcPaddingCw w11 +#define srcPaddingUw w11 +#define srcPaddingVw w12 +#define srcPaddingAw w6 + +// Destination plane pointers. dst2/dst1 share x5/x6 with srcA/ +// srcPaddingA, but those aliases never coexist in the same function (yuva +// is packed-only; gbrp is yuv420p/yuv422p/nv12/nv21). +#define dst0 x7 +#define dst1 x6 +#define dst2 x5 + +#define dstPadding0 x13 +#define dstPadding1 x14 +#define dstPadding2 x15 +#define dstPadding0w w13 +#define dstPadding1w w14 +#define dstPadding2w w15 + +// Loop state. #define cur_width w8 #define orig_height w9 -#define chroma_rewind w11 +#define chroma_rewind x16 #define tmp w17 +#define tmpx x17 + +// -------------------------------------------------------------------- +// Source-side argument unpacking. + +.macro src_load_args_nv12 + ldp srcPaddingYw, srcPaddingCw, [x6] // srcStride[0], srcStride[1] + ldp srcY, srcC, [x5] // src[0], src[1] + sxtw srcPaddingY, srcPaddingYw + sxtw srcPaddingC, srcPaddingCw + sub srcPaddingY, srcPaddingY, widthx // = srcStride[0] - width + sub srcPaddingC, srcPaddingC, widthx // = srcStride[1] - width (UV interleaved) + neg chroma_rewind, widthx // chroma_rewind = -width +.endm -.macro load_yoff_ycoeff yoff ycoeff -#if defined(__APPLE__) - ldp y_offset, y_coeff, [sp, #\yoff] -#else - ldr y_offset, [sp, #\yoff] - ldr y_coeff, [sp, #\ycoeff] -#endif -.endm - -.macro load_dst1_dst2 a_dst1 a_linesize1 a_dst2 a_linesize2 -#if defined(__APPLE__) -#define DST_OFFSET 8 -#else -#define DST_OFFSET 0 -#endif - ldr dst1, [sp, #\a_dst1 - DST_OFFSET] - ldr dstPadding1, [sp, #\a_linesize1 - DST_OFFSET] - ldr dst2, [sp, #\a_dst2 - DST_OFFSET] - ldr dstPadding2, [sp, #\a_linesize2 - DST_OFFSET] -#undef DST_OFFSET - sub dstPadding1, dstPadding1, width // padding1 = linesize1 - width - sub dstPadding2, dstPadding2, width // padding2 = linesize2 -
width -.endm - -.macro load_args_nv12 ofmt - ldr table_ptr, [sp] // table - load_yoff_ycoeff 8, 16 // y_offset, y_coeff - ld1 {v1.1d}, [table_ptr] - dup v0.8h, y_coeff - dup v3.8h, y_offset -.ifc \ofmt,gbrp - load_dst1_dst2 24, 32, 40, 48 - sub dstPadding0, dstPadding0, width // dstPadding0 = linesize - width (padding) -.else - .ifc \ofmt,rgb24 - add tmp, width, width, lsl #1 - sub dstPadding0, dstPadding0, tmp // dstPadding0 = linesize - width * 3 (padding) - .else - .ifc \ofmt,bgr24 - add tmp, width, width, lsl #1 - sub dstPadding0, dstPadding0, tmp // dstPadding0 = linesize - width * 3 (padding) - .else - .if rgb16 - sub dstPadding0, dstPadding0, width, lsl #1 // dstPadding0 = linesize - width * 2 (padding) - .else - sub dstPadding0, dstPadding0, width, lsl #2 // dstPadding0 = linesize - width * 4 (padding) - .endif - .endif - .endif -.endif - sub srcPaddingY, srcPaddingY, width // srcPaddingY = linesizeY - width (paddingY) - sub srcPaddingC, srcPaddingC, width // srcPaddingC = linesizeC - width (paddingC) - neg chroma_rewind, width +.macro src_load_args_nv21 + src_load_args_nv12 .endm -.macro load_args_nv21 ofmt - load_args_nv12 \ofmt +.macro src_load_args_yuv420p + ldp srcPaddingYw, srcPaddingUw, [x6] // srcStride[0], srcStride[1] + ldr srcPaddingVw, [x6, #8] // srcStride[2] + ldp srcY, srcU, [x5] // src[0], src[1] + ldr srcV, [x5, #16] // src[2] + sxtw srcPaddingY, srcPaddingYw + sxtw srcPaddingU, srcPaddingUw + sxtw srcPaddingV, srcPaddingVw + sub srcPaddingY, srcPaddingY, widthx // = srcStride[0] - width + sub srcPaddingU, srcPaddingU, widthx, lsr #1 // = srcStride[1] - width/2 + sub srcPaddingV, srcPaddingV, widthx, lsr #1 // = srcStride[2] - width/2 + neg chroma_rewind, widthx + asr chroma_rewind, chroma_rewind, #1 // chroma_rewind = -width/2 .endm -.macro load_args_yuv420p ofmt - ldr srcV, [sp] // srcV - ldr srcPaddingV, [sp, #8] // linesizeV - ldr table_ptr, [sp, #16] // table - load_yoff_ycoeff 24, 32 // y_offset, y_coeff - ld1 {v1.1d}, [table_ptr] - 
dup v0.8h, y_coeff - dup v3.8h, y_offset -.ifc \ofmt,gbrp - load_dst1_dst2 40, 48, 56, 64 - sub dstPadding0, dstPadding0, width // dstPadding0 = linesize - width (padding) -.else - .ifc \ofmt,rgb24 - add tmp, width, width, lsl #1 - sub dstPadding0, dstPadding0, tmp // dstPadding0 = linesize - width * 3 (padding) - .else - .ifc \ofmt,bgr24 - add tmp, width, width, lsl #1 - sub dstPadding0, dstPadding0, tmp // dstPadding0 = linesize - width * 3 (padding) - .else - .if rgb16 - sub dstPadding0, dstPadding0, width, lsl #1 // dstPadding0 = linesize - width * 2 (padding) - .else - sub dstPadding0, dstPadding0, width, lsl #2 // dstPadding0 = linesize - width * 4 (padding) - .endif - .endif - .endif +.macro src_load_args_yuv422p + ldp srcPaddingYw, srcPaddingUw, [x6] + ldr srcPaddingVw, [x6, #8] + ldp srcY, srcU, [x5] + ldr srcV, [x5, #16] + sxtw srcPaddingY, srcPaddingYw + sxtw srcPaddingU, srcPaddingUw + sxtw srcPaddingV, srcPaddingVw + sub srcPaddingY, srcPaddingY, widthx + sub srcPaddingU, srcPaddingU, widthx, lsr #1 + sub srcPaddingV, srcPaddingV, widthx, lsr #1 +.endm + +.macro src_load_args_yuva420p + ldp srcPaddingYw, srcPaddingUw, [x6] + ldr srcPaddingVw, [x6, #8] // srcStride[2] + ldr srcPaddingAw, [x6, #12] // srcStride[3] + ldp srcY, srcU, [x5] + ldr srcV, [x5, #16] + ldr srcA, [x5, #24] // src[3] + sxtw srcPaddingY, srcPaddingYw + sxtw srcPaddingU, srcPaddingUw + sxtw srcPaddingV, srcPaddingVw + sxtw srcPaddingA, srcPaddingAw + sub srcPaddingY, srcPaddingY, widthx + sub srcPaddingU, srcPaddingU, widthx, lsr #1 + sub srcPaddingV, srcPaddingV, widthx, lsr #1 + sub srcPaddingA, srcPaddingA, widthx // alpha is full resolution + neg chroma_rewind, widthx + asr chroma_rewind, chroma_rewind, #1 +.endm + +// -------------------------------------------------------------------- +// Destination-side argument unpacking. 
+ +.macro dst_load_args_packed bpp + ldr dstPadding0w, [sp] // linesize0 + sxtw dstPadding0, dstPadding0w +.ifc \bpp,2 + sub dstPadding0, dstPadding0, widthx, lsl #1 // = linesize0 - width*2 .endif - sub srcPaddingY, srcPaddingY, width // srcPaddingY = linesizeY - width (paddingY) - sub srcPaddingU, srcPaddingU, width, lsr #1 // srcPaddingU = linesizeU - width / 2 (paddingU) - sub srcPaddingV, srcPaddingV, width, lsr #1 // srcPaddingV = linesizeV - width / 2 (paddingV) - lsr chroma_rewind, width, #1 - neg chroma_rewind, chroma_rewind -.endm - -.macro load_args_yuva420p ofmt - load_args_yuv420p \ofmt -#if defined(__APPLE__) - ldr srcA, [sp, #32] // srcA - ldr srcPaddingA, [sp, #40] // linesizeA -#else - ldr srcA, [sp, #40] // srcA - ldr srcPaddingA, [sp, #48] // linesizeA -#endif - sub srcPaddingA, srcPaddingA, width // srcPaddingA = linesizeA - width (paddingA) -.endm - -.macro load_args_yuv422p ofmt - ldr srcV, [sp] // srcV - ldr srcPaddingV, [sp, #8] // linesizeV - ldr table_ptr, [sp, #16] // table - load_yoff_ycoeff 24, 32 // y_offset, y_coeff - ld1 {v1.1d}, [table_ptr] - dup v0.8h, y_coeff - dup v3.8h, y_offset -.ifc \ofmt,gbrp - load_dst1_dst2 40, 48, 56, 64 - sub dstPadding0, dstPadding0, width // dstPadding0 = linesize - width (padding) -.else - .ifc \ofmt,rgb24 - add tmp, width, width, lsl #1 - sub dstPadding0, dstPadding0, tmp // dstPadding0 = linesize - width * 3 (padding) - .else - .ifc \ofmt,bgr24 - add tmp, width, width, lsl #1 - sub dstPadding0, dstPadding0, tmp // dstPadding0 = linesize - width * 3 (padding) - .else - .if rgb16 - sub dstPadding0, dstPadding0, width, lsl #1 // dstPadding0 = linesize - width * 2 (padding) - .else - sub dstPadding0, dstPadding0, width, lsl #2 // dstPadding0 = linesize - width * 4 (padding) - .endif - .endif - .endif +.ifc \bpp,3 + sub dstPadding0, dstPadding0, widthx, lsl #1 + sub dstPadding0, dstPadding0, widthx // = linesize0 - width*3 +.endif +.ifc \bpp,4 + sub dstPadding0, dstPadding0, widthx, lsl #2 // = linesize0 
- width*4 .endif - sub srcPaddingY, srcPaddingY, width // srcPaddingY = linesizeY - width (paddingY) - sub srcPaddingU, srcPaddingU, width, lsr #1 // srcPaddingU = linesizeU - width / 2 (paddingU) - sub srcPaddingV, srcPaddingV, width, lsr #1 // srcPaddingV = linesizeV - width / 2 (paddingV) .endm +.macro dst_load_args_planar + ldr dstPadding0w, [sp] // linesize0 + ldr dst1, [sp, #8] // dst1 + ldr dstPadding1w, [sp, #16] // linesize1 + ldr dst2, [sp, #24] // dst2 + ldr dstPadding2w, [sp, #32] // linesize2 + sxtw dstPadding0, dstPadding0w + sxtw dstPadding1, dstPadding1w + sxtw dstPadding2, dstPadding2w + sub dstPadding0, dstPadding0, widthx + sub dstPadding1, dstPadding1, widthx + sub dstPadding2, dstPadding2, widthx +.endm + +.macro dst_load_args_argb + dst_load_args_packed 4 +.endm + +.macro dst_load_args_rgba + dst_load_args_packed 4 +.endm + +.macro dst_load_args_abgr + dst_load_args_packed 4 +.endm + +.macro dst_load_args_bgra + dst_load_args_packed 4 +.endm + +.macro dst_load_args_rgb24 + dst_load_args_packed 3 +.endm + +.macro dst_load_args_bgr24 + dst_load_args_packed 3 +.endm + +.macro dst_load_args_rgb565le + dst_load_args_packed 2 +.endm + +.macro dst_load_args_bgr565le + dst_load_args_packed 2 +.endm + +.macro dst_load_args_rgb555le + dst_load_args_packed 2 +.endm + +.macro dst_load_args_bgr555le + dst_load_args_packed 2 +.endm + +.macro dst_load_args_gbrp + dst_load_args_planar +.endm + +// -------------------------------------------------------------------- +// Per-input chroma load (run inside the inner loop). + .macro load_chroma_nv12 ld2 {v16.8b, v17.8b}, [srcC], #16 ushll v18.8h, v16.8b, #3 @@ -221,10 +261,14 @@ load_chroma_yuv420p .endm +// -------------------------------------------------------------------- +// Row-end chroma increments (single-row code shares one chroma row +// between two consecutive output rows by rewinding on even rows). 
+ .macro increment_nv12 ands tmp, height, #1 - csel tmp, srcPaddingC, chroma_rewind, ne // incC = (h & 1) ? paddingC : -width - add srcC, srcC, tmp, sxtw // srcC += incC + csel tmpx, srcPaddingC, chroma_rewind, ne // incC = (h & 1) ? srcPaddingC : -width + add srcC, srcC, tmpx // srcC += incC .endm .macro increment_nv21 @@ -233,22 +277,25 @@ .macro increment_yuv420p ands tmp, height, #1 - csel tmp, srcPaddingU, chroma_rewind, ne // incU = (h & 1) ? paddingU : -width/2 - add srcU, srcU, tmp, sxtw // srcU += incU - csel tmp, srcPaddingV, chroma_rewind, ne // incV = (h & 1) ? paddingV : -width/2 - add srcV, srcV, tmp, sxtw // srcV += incV + csel tmpx, srcPaddingU, chroma_rewind, ne // incU = (h & 1) ? srcPaddingU : -width/2 + add srcU, srcU, tmpx // srcU += incU + csel tmpx, srcPaddingV, chroma_rewind, ne // incV = (h & 1) ? srcPaddingV : -width/2 + add srcV, srcV, tmpx // srcV += incV .endm .macro increment_yuva420p increment_yuv420p - add srcA, srcA, srcPaddingA, sxtw // srcA += paddingA (every row) + add srcA, srcA, srcPaddingA // srcA += srcPaddingA (every row) .endm .macro increment_yuv422p - add srcU, srcU, srcPaddingU, sxtw // srcU += paddingU - add srcV, srcV, srcPaddingV, sxtw // srcV += paddingV + add srcU, srcU, srcPaddingU // srcU += srcPaddingU + add srcV, srcV, srcPaddingV // srcV += srcPaddingV .endm +// -------------------------------------------------------------------- +// Shared compute / pack helpers. + .macro compute_rgb r1 g1 b1 r2 g2 b2 add \r1\().8h, v26.8h, v20.8h // Y1 + R1 add \r2\().8h, v27.8h, v21.8h // Y2 + R2 @@ -326,7 +373,7 @@ // Pack 8 pixels of 16bpp output. The three channels are extracted via ushr, // widened to u16, then merged via shift-left-insert: -// dst0 = (high << high_shl) | (mid << 5) | low +// dst = (high << high_shl) | (mid << 5) | low // For RGB565LE pass (B, G, R) as (low, mid, high), g_shr=2, high_shl=11. // For BGR565LE pass (R, G, B), g_shr=2, high_shl=11. // For RGB555LE pass (B, G, R), g_shr=3, high_shl=10. 
@@ -346,7 +393,12 @@ .macro declare_func ifmt ofmt function ff_\ifmt\()_to_\ofmt\()_neon, export=1 set_rgb16_predicates \ofmt - load_args_\ifmt \ofmt + uxtw widthx, width // ensure upper 32 bits of widthx are zero + dup v3.8h, y_offset // broadcast y_offset before w2 is reused + dup v0.8h, y_coeff // broadcast y_coeff before w3 is reused + ld1 {v1.1d}, [table_ptr] // load yuv2rgb_table before x4 is reused + src_load_args_\ifmt + dst_load_args_\ofmt save_d8_d9_if_16bpp movi v31.8h, #4, lsl #8 // 128 * (1<<3) (loop-invariant) @@ -424,7 +476,7 @@ function ff_\ifmt\()_to_\ofmt\()_neon, export=1 .else .ifc \ofmt,gbrp compute_rgb v18,v4,v6, v19,v5,v7 - st1 { v4.8b, v5.8b }, [dst0], #16 + st1 { v4.8b, v5.8b }, [dst0], #16 st1 { v6.8b, v7.8b }, [dst1], #16 st1 { v18.8b, v19.8b }, [dst2], #16 .else @@ -447,14 +499,14 @@ function ff_\ifmt\()_to_\ofmt\()_neon, export=1 .endif .endif .endif - subs cur_width, cur_width, #16 // width -= 16 + subs cur_width, cur_width, #16 // cur_width -= 16 b.gt 2b - add dst0, dst0, dstPadding0, sxtw // dst0 += padding + add dst0, dst0, dstPadding0 // dst0 += padding .ifc \ofmt,gbrp - add dst1, dst1, dstPadding1, sxtw // dst1 += padding1 - add dst2, dst2, dstPadding2, sxtw // dst2 += padding2 + add dst1, dst1, dstPadding1 // dst1 += padding1 + add dst2, dst2, dstPadding2 // dst2 += padding2 .endif - add srcY, srcY, srcPaddingY, sxtw // srcY += paddingY + add srcY, srcY, srcPaddingY // srcY += paddingY increment_\ifmt subs height, height, #1 // height -= 1 b.gt 1b _______________________________________________ ffmpeg-cvslog mailing list -- [email protected] To unsubscribe send an email to [email protected]
