Hi, Could you please check and review this patch? ________________________________ From: ffmpeg-devel <ffmpeg-devel-boun...@ffmpeg.org> on behalf of Logaprakash Ramajayam <logaprakash.ramaja...@multicorewareinc.com> Sent: Friday, June 6, 2025 2:14 PM To: Kieran Kunhya via ffmpeg-devel <ffmpeg-devel@ffmpeg.org> Cc: Dash Santosh Sathyanarayanan <dash.sathyanaraya...@multicorewareinc.com>; Harshitha Sarangu Suresh <harshi...@multicorewareinc.com> Subject: [FFmpeg-devel] [PATCH] swscale/aarch64/output: Implement neon assembly for yuv2planeX_10_c_template()
Checked FATE tests and gha-aarch64 git workflow.

From 34cdef26eaebcf98916e9881b3a04f4f698f09c6 Mon Sep 17 00:00:00 2001
From: Logaprakash Ramajayam <logaprakash.ramaja...@multicorewareinc.com>
Date: Thu, 5 Jun 2025 01:33:39 -0700
Subject: [PATCH] swscale/aarch64/output: Implement neon assembly for yuv2planeX_10_c_template()

---
 libswscale/aarch64/output.S  | 167 +++++++++++++++++++++++++++++++++++
 libswscale/aarch64/swscale.c |  38 ++++++++
 2 files changed, 205 insertions(+)

diff --git a/libswscale/aarch64/output.S b/libswscale/aarch64/output.S
index 190c438870..e039e820ae 100644
--- a/libswscale/aarch64/output.S
+++ b/libswscale/aarch64/output.S
@@ -20,6 +20,173 @@
 
 #include "libavutil/aarch64/asm.S"
 
+function ff_yuv2planeX_10_neon, export=1
+// x0 = filter (int16_t*), one coefficient per source row
+// w1 = filterSize
+// x2 = src (int16_t**), one row pointer per coefficient
+// x3 = dest (uint16_t*)
+// w4 = dstW
+// w5 = big_endian (non-zero: byte-swap every output sample)
+// w6 = output_bits
+// Only caller-saved SIMD registers (v1-v7, v16-v26) are used: AAPCS64 makes d8-d15 callee-saved and nothing is spilled here.
+        mov             w8, #27
+        sub             w8, w8, w6                      // shift = 11 + 16 - output_bits
+
+        sub             w9, w8, #1
+        mov             w10, #1
+        lsl             w9, w10, w9                     // val = 1 << (shift - 1), the rounding bias
+
+        dup             v1.4s, w9
+        dup             v2.4s, w9                       // accumulator seeds holding val
+
+        mov             w17, #0
+        sub             w16, w17, w8
+        dup             v20.4s, w16                     // (-shift) vector: sshl by a negative count shifts right
+
+        movi            v21.4s, #0                      // lower clip bound
+
+        mov             w10, #1
+        lsl             w10, w10, w6
+        sub             w10, w10, #1                    // (1U << output_bits) - 1
+        dup             v22.4s, w10                     // upper clip bound
+
+        tst             w4, #15                         // if dstW divisible by 16, process 16 elements
+        b.ne            4f                              // else process 8 elements
+
+        mov             x7, #0                          // i = 0
+1:      // Loop: 16 output pixels per iteration
+
+        mov             v3.16b, v1.16b
+        mov             v4.16b, v2.16b
+        mov             v5.16b, v1.16b
+        mov             v6.16b, v2.16b                  // four 4x32-bit accumulators, each seeded with val
+
+        mov             w11, w1                         // tmpfilterSize = filterSize
+        mov             x12, x2                         // srcp = src
+        mov             x13, x0                         // filterp = filter
+// NOTE(review): taps are consumed in pairs; an odd filterSize would read one src pointer and one coefficient past the end - confirm callers only pass even sizes.
+2:      // Filter loop
+
+        ldp             x14, x15, [x12], #16            // get 2 pointers: src[j] and src[j+1]
+        ldr             s7, [x13], #4                   // load filter[j] and filter[j+1]
+        add             x14, x14, x7, lsl #1            // &src[j][i]
+        add             x15, x15, x7, lsl #1            // &src[j+1][i]
+        ld1             {v16.8h, v17.8h}, [x14]
+        ld1             {v18.8h, v19.8h}, [x15]
+
+        // Multiply-accumulate
+        smlal           v3.4s, v16.4h, v7.h[0]
+        smlal2          v4.4s, v16.8h, v7.h[0]
+        smlal           v5.4s, v17.4h, v7.h[0]
+        smlal2          v6.4s, v17.8h, v7.h[0]
+
+        smlal           v3.4s, v18.4h, v7.h[1]
+        smlal2          v4.4s, v18.8h, v7.h[1]
+        smlal           v5.4s, v19.4h, v7.h[1]
+        smlal2          v6.4s, v19.8h, v7.h[1]
+
+        subs            w11, w11, #2                    // tmpfilterSize -= 2
+        b.gt            2b                              // continue filter loop
+
+        // Shift results
+        sshl            v3.4s, v3.4s, v20.4s
+        sshl            v4.4s, v4.4s, v20.4s
+        sshl            v5.4s, v5.4s, v20.4s
+        sshl            v6.4s, v6.4s, v20.4s
+
+        // Clamp to 0
+        smax            v3.4s, v3.4s, v21.4s
+        smax            v4.4s, v4.4s, v21.4s
+        smax            v5.4s, v5.4s, v21.4s
+        smax            v6.4s, v6.4s, v21.4s
+
+        // Clip upper bound
+        smin            v3.4s, v3.4s, v22.4s
+        smin            v4.4s, v4.4s, v22.4s
+        smin            v5.4s, v5.4s, v22.4s
+        smin            v6.4s, v6.4s, v22.4s
+
+        // Narrow to 16-bit
+        xtn             v23.4h, v3.4s
+        xtn2            v23.8h, v4.4s
+        xtn             v24.4h, v5.4s
+        xtn2            v24.8h, v6.4s
+
+        cbz             w5, 3f                          // little endian: store as is
+        rev16           v23.16b, v23.16b
+        rev16           v24.16b, v24.16b                // swap bytes within each 16-bit sample for big endian
+3:
+        // Store 16 pixels
+        st1             {v23.8h}, [x3], #16
+        st1             {v24.8h}, [x3], #16
+
+        add             x7, x7, #16                     // i = i + 16
+        subs            w4, w4, #16                     // dstW = dstW - 16
+        b.gt            1b                              // Continue loop
+        b               8f                              // end
+// NOTE(review): the path below stores 8 pixels per pass, so a dstW that is not a multiple of 8 is rounded up - confirm dest is padded accordingly.
+4:      // Process 8 elements
+        mov             x7, #0                          // i = 0
+5:      // Loop: 8 output pixels per iteration
+
+        mov             v3.16b, v1.16b
+        mov             v4.16b, v2.16b                  // two 4x32-bit accumulators, each seeded with val
+
+        mov             w11, w1                         // tmpfilterSize = filterSize
+        mov             x12, x2                         // srcp = src
+        mov             x13, x0                         // filterp = filter
+// NOTE(review): same pairwise tap consumption as the 16-pixel path - an odd filterSize would over-read.
+6:      // Filter loop
+
+        ldp             x14, x15, [x12], #16            // get 2 pointers: src[j] and src[j+1]
+        ldr             s7, [x13], #4                   // load filter[j] and filter[j+1]
+        add             x14, x14, x7, lsl #1            // &src[j][i]
+        add             x15, x15, x7, lsl #1            // &src[j+1][i]
+        ld1             {v5.8h}, [x14]
+        ld1             {v6.8h}, [x15]
+
+        // Multiply-accumulate
+        smlal           v3.4s, v5.4h, v7.h[0]
+        smlal2          v4.4s, v5.8h, v7.h[0]
+        smlal           v3.4s, v6.4h, v7.h[1]
+        smlal2          v4.4s, v6.8h, v7.h[1]
+
+        subs            w11, w11, #2                    // tmpfilterSize -= 2
+        b.gt            6b                              // loop until filterSize consumed
+
+        // Shift results
+        sshl            v3.4s, v3.4s, v20.4s
+        sshl            v4.4s, v4.4s, v20.4s
+
+        // Clamp to 0
+        smax            v3.4s, v3.4s, v21.4s
+        smax            v4.4s, v4.4s, v21.4s
+
+        // Clip upper bound
+        smin            v3.4s, v3.4s, v22.4s
+        smin            v4.4s, v4.4s, v22.4s
+
+        // Narrow to 16-bit
+        xtn             v25.4h, v3.4s
+        xtn             v26.4h, v4.4s
+
+        cbz             w5, 7f                          // little endian: store as is
+        rev16           v25.8b, v25.8b
+        rev16           v26.8b, v26.8b                  // swap bytes within each 16-bit sample for big endian
+
+7:
+        // Store 8 pixels
+        st1             {v25.4h}, [x3], #8
+        st1             {v26.4h}, [x3], #8
+
+        add             x7, x7, #8                      // i = i + 8
+        subs            w4, w4, #8                      // dstW = dstW - 8
+        b.gt            5b                              // Continue Loop
+
+8:
+        ret
+endfunc
+
 function ff_yuv2planeX_8_neon, export=1
 // x0 - const int16_t *filter,
 // x1 - int filterSize,
diff --git a/libswscale/aarch64/swscale.c b/libswscale/aarch64/swscale.c
index 6e5a721c1f..23cdb7d26e 100644
--- a/libswscale/aarch64/swscale.c
+++ b/libswscale/aarch64/swscale.c
@@ -158,6 +158,29 @@ void ff_hscale ## from_bpc ## to ## to_bpc ## _ ## filter_n ## _ ## opt( \
 
 ALL_SCALE_FUNCS(neon);
 
+void ff_yuv2planeX_10_neon(const int16_t *filter, int filterSize,
+                           const int16_t **src, uint16_t *dest, int dstW,
+                           int big_endian, int output_bits);
+/* Wrap the NEON kernel in the yuv2planeX signature; dither and offset are accepted but not forwarded. */
+#define yuv2NBPS(bits, BE_LE, is_be, template_size, typeX_t) \
+static void yuv2planeX_ ## bits ## BE_LE ## _neon(const int16_t *filter, int filterSize, \
+                              const int16_t **src, uint8_t *dest, int dstW, \
+                              const uint8_t *dither, int offset)\
+{ \
+    ff_yuv2planeX_## template_size ## _neon(filter, \
+                         filterSize, (const typeX_t **) src, \
+                         (uint16_t *) dest, dstW, is_be, bits); \
+}
+
+yuv2NBPS( 9, BE, 1, 10, int16_t)
+yuv2NBPS( 9, LE, 0, 10, int16_t)
+yuv2NBPS(10, BE, 1, 10, int16_t)
+yuv2NBPS(10, LE, 0, 10, int16_t)
+yuv2NBPS(12, BE, 1, 10, int16_t)
+yuv2NBPS(12, LE, 0, 10, int16_t)
+yuv2NBPS(14, BE, 1, 10, int16_t)
+yuv2NBPS(14, LE, 0, 10, int16_t)
+
 void ff_yuv2planeX_8_neon(const int16_t *filter, int filterSize,
                           const int16_t **src, uint8_t *dest, int dstW,
                           const uint8_t *dither, int offset);
@@ -268,6 +291,8 @@ av_cold void ff_sws_init_range_convert_aarch64(SwsInternal *c)
 av_cold void ff_sws_init_swscale_aarch64(SwsInternal *c)
 {
     int cpu_flags = av_get_cpu_flags();
+    enum AVPixelFormat dstFormat = c->opts.dst_format;
+    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(dstFormat);
 
     if (have_neon(cpu_flags)) {
         ASSIGN_SCALE_FUNC(c->hyScale, c->hLumFilterSize, neon);
@@ -276,6 +301,19 @@ av_cold void ff_sws_init_swscale_aarch64(SwsInternal *c)
         if (c->dstBpc == 8) {
             c->yuv2planeX = ff_yuv2planeX_8_neon;
         }
+
+        if (isNBPS(dstFormat) && !isSemiPlanarYUV(dstFormat)) {
+            if (desc->comp[0].depth == 9) {
+                c->yuv2planeX = isBE(dstFormat) ? yuv2planeX_9BE_neon : yuv2planeX_9LE_neon;
+            } else if (desc->comp[0].depth == 10) {
+                c->yuv2planeX = isBE(dstFormat) ? yuv2planeX_10BE_neon : yuv2planeX_10LE_neon;
+            } else if (desc->comp[0].depth == 12) {
+                c->yuv2planeX = isBE(dstFormat) ? yuv2planeX_12BE_neon : yuv2planeX_12LE_neon;
+            } else if (desc->comp[0].depth == 14) {
+                c->yuv2planeX = isBE(dstFormat) ? yuv2planeX_14BE_neon : yuv2planeX_14LE_neon;
+            } else
+                av_assert0(0);
+        }
         switch (c->opts.src_format) {
         case AV_PIX_FMT_ABGR:
             c->lumToYV12 = ff_abgr32ToY_neon;
-- 
2.36.0.windows.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".