From 7260822a578130a713c1455cca6cdd06f1540db8 Mon Sep 17 00:00:00 2001 From: Harshitha Suresh <harshi...@multicorewareinc.com> Date: Mon, 19 May 2025 22:37:20 +0530 Subject: [PATCH] swscale/aarch64/output: Implement neon assembly for yuv2nv12cX_c()
// void ff_yuv2nv12cX_{notswapped,swapped}_neon(
//         enum AVPixelFormat dstFormat, const uint8_t *chrDither,
//         const int16_t *chrFilter, int chrFilterSize,
//         const int16_t **chrUSrc, const int16_t **chrVSrc,
//         uint8_t *dest, int chrDstW)
//
// Vertical chroma scaling with interleaved output, matching yuv2nv12cX_c:
//     u = chrDither[i & 7] << 12;  v = chrDither[(i + 3) & 7] << 12;
//     u/v += src[j][i] * chrFilter[j];  dest = clip_uint8(u >> 19), ...
// The total right shift of 19 is split as sqshrun #16 (32->16, signed
// saturate to unsigned) followed by uqshrn #3 (16->8, unsigned saturate).
//
// The two variants differ only in store order: notswapped writes U,V pairs
// (NV12), swapped writes V,U pairs (NV21).  Since ST2 requires consecutive
// registers, the swap is done by narrowing V into the lower-numbered
// register of each pair instead of U.
//
// NOTE(review): both loops consume the width in steps of 8/16 with no
// scalar tail — assumes chrDstW is a multiple of 8; confirm the dispatch
// guarantees this, otherwise the last group over-reads and over-writes.

.macro yuv2nv12cX_fn suffix, swap
function ff_yuv2nv12cX_\suffix\()_neon, export=1
        // x0 - dstFormat (unused)
        // x1 - const uint8_t *chrDither
        // x2 - const int16_t *chrFilter
        // x3 - int chrFilterSize
        // x4 - const int16_t **chrUSrc
        // x5 - const int16_t **chrVSrc
        // x6 - uint8_t *dest
        // x7 - int chrDstW

        ld1             {v0.8b}, [x1]               // chrDither[0..7]
        ext             v1.8b, v0.8b, v0.8b, #3     // V dither: chrDither[(i+3) & 7]
        uxtl            v0.8h, v0.8b
        uxtl            v1.8h, v1.8b
        ushll           v2.4s, v0.4h, #12           // U dither << 12, lanes 0-3
        ushll2          v3.4s, v0.8h, #12           // U dither << 12, lanes 4-7
        ushll           v4.4s, v1.4h, #12           // V dither << 12, lanes 0-3
        ushll2          v5.4s, v1.8h, #12           // V dither << 12, lanes 4-7

        tst             w7, #15                     // width a multiple of 16?
        b.ne            4f                          // no -> 8-pixel loop
        mov             x8, #0                      // i = 0

1:      // ======== 16 pixels per iteration ========
        mov             v16.16b, v2.16b             // U acc, px 0-3
        mov             v17.16b, v3.16b             // U acc, px 4-7
        mov             v18.16b, v4.16b             // V acc, px 0-3
        mov             v19.16b, v5.16b             // V acc, px 4-7
        mov             v20.16b, v2.16b             // U acc, px 8-11
        mov             v21.16b, v3.16b             // U acc, px 12-15
        mov             v22.16b, v4.16b             // V acc, px 8-11
        mov             v23.16b, v5.16b             // V acc, px 12-15

        mov             w9,  w3                     // j = chrFilterSize
        mov             x10, x2                     // coefficient pointer
        mov             x11, x4                     // chrUSrc cursor
        mov             x12, x5                     // chrVSrc cursor
2:      // -------- filter taps --------
        ldr             h6,  [x10], #2              // chrFilter[j]
        ldr             x13, [x11], #8              // chrUSrc[j]
        ldr             x14, [x12], #8              // chrVSrc[j]
        add             x13, x13, x8, lsl #1        // &chrUSrc[j][i]
        add             x14, x14, x8, lsl #1        // &chrVSrc[j][i]
        ldp             q24, q26, [x13]             // U samples 0-7 / 8-15
        ldp             q25, q27, [x14]             // V samples 0-7 / 8-15
        smlal           v16.4s, v24.4h, v6.h[0]
        smlal2          v17.4s, v24.8h, v6.h[0]
        smlal           v18.4s, v25.4h, v6.h[0]
        smlal2          v19.4s, v25.8h, v6.h[0]
        smlal           v20.4s, v26.4h, v6.h[0]
        smlal2          v21.4s, v26.8h, v6.h[0]
        smlal           v22.4s, v27.4h, v6.h[0]
        smlal2          v23.4s, v27.8h, v6.h[0]
        subs            w9,  w9, #1
        b.gt            2b

        // Narrow and interleave first 8 pixels
        sqshrun         v28.4h, v16.4s, #16         // U >> 16, saturating
        sqshrun2        v28.8h, v17.4s, #16
        sqshrun         v29.4h, v18.4s, #16         // V >> 16, saturating
        sqshrun2        v29.8h, v19.4s, #16
.if \swap
        uqshrn          v30.8b, v29.8h, #3          // V first (NV21)
        uqshrn          v31.8b, v28.8h, #3          // U second
.else
        uqshrn          v30.8b, v28.8h, #3          // U first (NV12)
        uqshrn          v31.8b, v29.8h, #3          // V second
.endif
        // Narrow and interleave next 8 pixels
        sqshrun         v28.4h, v20.4s, #16
        sqshrun2        v28.8h, v21.4s, #16
        sqshrun         v29.4h, v22.4s, #16
        sqshrun2        v29.8h, v23.4s, #16
.if \swap
        uqshrn          v24.8b, v29.8h, #3          // V first (NV21)
        uqshrn          v25.8b, v28.8h, #3          // U second
.else
        uqshrn          v24.8b, v28.8h, #3          // U first (NV12)
        uqshrn          v25.8b, v29.8h, #3          // V second
.endif
        st2             {v30.8b, v31.8b}, [x6], #16
        st2             {v24.8b, v25.8b}, [x6], #16

        subs            w7, w7, #16
        add             x8, x8, #16
        b.gt            1b
        ret

4:      // ======== 8 pixels per iteration ========
        mov             x8, #0                      // i = 0
5:
        mov             v16.16b, v2.16b             // U acc, px 0-3
        mov             v17.16b, v3.16b             // U acc, px 4-7
        mov             v18.16b, v4.16b             // V acc, px 0-3
        mov             v19.16b, v5.16b             // V acc, px 4-7

        mov             w9,  w3                     // j = chrFilterSize
        mov             x10, x2
        mov             x11, x4
        mov             x12, x5
6:      // -------- filter taps --------
        ldr             h6,  [x10], #2              // chrFilter[j]
        ldr             x13, [x11], #8              // chrUSrc[j]
        ldr             x14, [x12], #8              // chrVSrc[j]
        add             x13, x13, x8, lsl #1        // &chrUSrc[j][i]
        add             x14, x14, x8, lsl #1        // &chrVSrc[j][i]
        ld1             {v20.8h}, [x13]             // U samples
        ld1             {v21.8h}, [x14]             // V samples
        smlal           v16.4s, v20.4h, v6.h[0]
        smlal2          v17.4s, v20.8h, v6.h[0]
        smlal           v18.4s, v21.4h, v6.h[0]
        smlal2          v19.4s, v21.8h, v6.h[0]
        subs            w9, w9, #1
        b.gt            6b

        sqshrun         v26.4h, v16.4s, #16
        sqshrun2        v26.8h, v17.4s, #16
        sqshrun         v27.4h, v18.4s, #16
        sqshrun2        v27.8h, v19.4s, #16
.if \swap
        uqshrn          v28.8b, v27.8h, #3          // V first (NV21)
        uqshrn          v29.8b, v26.8h, #3          // U second
.else
        uqshrn          v28.8b, v26.8h, #3          // U first (NV12)
        uqshrn          v29.8b, v27.8h, #3          // V second
.endif
        st2             {v28.8b, v29.8b}, [x6], #16

        subs            w7, w7, #8
        add             x8, x8, #8
        b.gt            5b
        ret
endfunc
.endm

yuv2nv12cX_fn notswapped, 0
yuv2nv12cX_fn swapped,    1
uint8_t *dest, int chrDstW); + +void ff_yuv2nv12cX_swapped_neon(enum AVPixelFormat dstFormat, const uint8_t *chrDither, + const int16_t *chrFilter, int chrFilterSize, + const int16_t **chrUSrc, const int16_t **chrVSrc, + uint8_t *dest, int chrDstW); + #define ASSIGN_SCALE_FUNC2(hscalefn, filtersize, opt) do { \ if (c->srcBpc == 8) { \ if(c->dstBpc <= 14) { \ @@ -201,6 +211,12 @@ void ff_yuv2plane1_8_neon( default: break; \ } +#define ASSIGN_YUV2NV12_FUNC(yuv2nv12fn, opt, dstFormat) \ + if(!isSwappedChroma(dstFormat)) \ + yuv2nv12fn = ff_yuv2nv12cX_notswapped_ ## opt; \ + else \ + yuv2nv12fn = ff_yuv2nv12cX_swapped_ ## opt; + #define NEON_INPUT(name) \ void ff_##name##ToY_neon(uint8_t *dst, const uint8_t *src, const uint8_t *, \ const uint8_t *, int w, uint32_t *coeffs, void *); \ @@ -275,7 +291,9 @@ av_cold void ff_sws_init_swscale_aarch64(SwsInternal *c) ASSIGN_VSCALE_FUNC(c->yuv2plane1, neon); if (c->dstBpc == 8) { c->yuv2planeX = ff_yuv2planeX_8_neon; + ASSIGN_YUV2NV12_FUNC(c->yuv2nv12cX, neon, c->opts.dst_format); } + switch (c->opts.src_format) { case AV_PIX_FMT_ABGR: c->lumToYV12 = ff_abgr32ToY_neon; -- 2.36.0.windows.1
swscale-aarch64-output-Implement-neon-assembly-fo.patch
Description: swscale-aarch64-output-Implement-neon-assembly-fo.patch
_______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".