I have added an assembly implementation and sent it as a new patch. Thanks. ________________________________ From: ffmpeg-devel <ffmpeg-devel-boun...@ffmpeg.org> on behalf of Zhao Zhili <quinkblack-at-foxmail....@ffmpeg.org> Sent: 26 May 2025 15:04 To: FFmpeg development discussions and patches <ffmpeg-devel@ffmpeg.org> Cc: Dash Santosh Sathyanarayanan <dash.sathyanaraya...@multicorewareinc.com> Subject: Re: [FFmpeg-devel] [PATCH] swscale/output: Implement neon intrinsics for yuv2nv12cX_c()
> On May 26, 2025, at 16:40, Harshitha Sarangu Suresh > <harshi...@multicorewareinc.com> wrote: > > Hi, > Did you get a chance to review this patch? Thank you for your contribution. However, we use manual assembly instead of intrinsics for neon. > > Get Outlook for Android<https://aka.ms/AAb9ysg> > ________________________________ > From: Harshitha Sarangu Suresh > Sent: Thursday, May 22, 2025 7:24:15 PM > To: ffmpeg-devel@ffmpeg.org <ffmpeg-devel@ffmpeg.org> > Cc: Dash Santosh Sathyanarayanan <dash.sathyanaraya...@multicorewareinc.com> > Subject: [FFmpeg-devel] [PATCH] swscale/output: Implement neon intrinsics for > yuv2nv12cX_c() > > This optimization provides 6x improvement for the module. The boost in > performance was calculated by adding C timers inside the C function and the > optimized neon intrinsic function. > > > From 1deceb0394a5acdf70677870dc252fd66a91dd9f Mon Sep 17 00:00:00 2001 > From: Harshitha Suresh <harshi...@multicorewareinc.com> > Date: Mon, 19 May 2025 22:37:20 +0530 > Subject: [PATCH] swscale/output: Implement neon intrinsics for yuv2nv12cX_c() > > --- > libswscale/aarch64/swscale.c | 151 +++++++++++++++++++++++++++++++++++ > 1 file changed, 151 insertions(+) > > diff --git a/libswscale/aarch64/swscale.c b/libswscale/aarch64/swscale.c > index 6e5a721c1f..fb59c3f1b0 100644 > --- a/libswscale/aarch64/swscale.c > +++ b/libswscale/aarch64/swscale.c > @@ -21,6 +21,9 @@ > #include "libswscale/swscale.h" > #include "libswscale/swscale_internal.h" > #include "libavutil/aarch64/cpu.h" > +#if defined (__aarch64__) > +#include <arm_neon.h> > +#endif > > void ff_hscale16to15_4_neon_asm(int shift, int16_t *_dst, int dstW, > const uint8_t *_src, const int16_t *filter, > @@ -142,6 +145,153 @@ static void ff_hscale16to19_X4_neon(SwsInternal *c, > int16_t *_dst, int dstW, > > } > > +static void ff_yuv2nv12cX_neon(enum AVPixelFormat dstFormat, const uint8_t > *chrDither, > + const int16_t *chrFilter, int chrFilterSize, > + const int16_t **chrUSrc, const 
int16_t **chrVSrc, > + uint8_t *dest, int chrDstW) > +{ > + > + int i; > + int u_dither[8], v_dither[8]; > + for (i = 0; i < 8; i++) { > + u_dither[i] = chrDither[i & 7] << 12; > + v_dither[i] = chrDither[(i + 3) & 7] << 12; > + } > + int32x4_t u0 = vld1q_s32(&u_dither[0]); > + int32x4_t u1 = vld1q_s32(&u_dither[4]); > + int32x4_t v0 = vld1q_s32(&v_dither[0]); > + int32x4_t v1 = vld1q_s32(&v_dither[4]); > + > + if (!isSwappedChroma(dstFormat)) > + { > + for (i = 0; i <= chrDstW - 8; i += 8) > + { > + int32x4_t udst0 = u0; > + int32x4_t udst1 = u1; > + int32x4_t vdst0 = v0; > + int32x4_t vdst1 = v1; > + > + for (int j = 0; j < chrFilterSize; j++) > + { > + int16x8_t usrc0 = vld1q_s16(&chrUSrc[j][i]); > + int16x8_t vsrc0 = vld1q_s16(&chrVSrc[j][i]); > + > + int32x4_t usrc0_low = vmovl_s16(vget_low_s16(usrc0)); > + int32x4_t usrc0_high = vmovl_s16(vget_high_s16(usrc0)); > + int32x4_t vsrc0_low = vmovl_s16(vget_low_s16(vsrc0)); > + int32x4_t vsrc0_high = vmovl_s16(vget_high_s16(vsrc0)); > + > + udst0 = vmlaq_n_s32(udst0, usrc0_low, chrFilter[j]); > + udst1 = vmlaq_n_s32(udst1, usrc0_high, chrFilter[j]); > + vdst0 = vmlaq_n_s32(vdst0, vsrc0_low, chrFilter[j]); > + vdst1 = vmlaq_n_s32(vdst1, vsrc0_high, chrFilter[j]); > + > + } > + // Right shift by 19 > + udst0 = vshrq_n_s32(udst0, 19); > + udst1 = vshrq_n_s32(udst1, 19); > + vdst0 = vshrq_n_s32(vdst0, 19); > + vdst1 = vshrq_n_s32(vdst1, 19); > + > + // Convert to 16-bit and then to uint8, with saturation > + int16x8_t u16 = vcombine_s16(vqmovn_s32(udst0), > vqmovn_s32(udst1)); > + int16x8_t v16 = vcombine_s16(vqmovn_s32(vdst0), > vqmovn_s32(vdst1)); > + > + uint8x8_t u8 = vqmovun_s16(u16); > + uint8x8_t v8 = vqmovun_s16(v16); > + > + // Store interleaved u/v as UV UV UV... 
> + uint8x8x2_t uv; > + uv.val[0] = u8; > + uv.val[1] = v8; > + vst2_u8(dest + 2 * i, uv); > + } > + > + // Handle remaining pixels with scalar fallback > + for (; i < chrDstW; i++) > + { > + int u = chrDither[i & 7] << 12; > + int v = chrDither[(i + 3) & 7] << 12; > + > + for (int j = 0; j < chrFilterSize; j++) > + { > + u += chrUSrc[j][i] * chrFilter[j]; > + v += chrVSrc[j][i] * chrFilter[j]; > + } > + > + uint8_t uu = av_clip_uint8(u >> 19); > + uint8_t vv = av_clip_uint8(v >> 19); > + dest[2 * i] = uu; > + dest[2 * i + 1] = vv; > + } > + } > + else > + { > + if (!isSwappedChroma(dstFormat)) > + { > + for (i = 0; i <= chrDstW - 8; i += 8) > + { > + int32x4_t udst0 = u0; > + int32x4_t udst1 = u1; > + int32x4_t vdst0 = v0; > + int32x4_t vdst1 = v1; > + > + for (int j = 0; j < chrFilterSize; j++) > + { > + int16x8_t usrc = vld1q_s16(&chrUSrc[j][i]); > + int16x8_t vsrc = vld1q_s16(&chrVSrc[j][i]); > + > + int32x4_t usrc_low = vmovl_s16(vget_low_s16(usrc)); > + int32x4_t usrc_high = vmovl_s16(vget_high_s16(usrc)); > + int32x4_t vsrc_low = vmovl_s16(vget_low_s16(vsrc)); > + int32x4_t vsrc_high = vmovl_s16(vget_high_s16(vsrc)); > + > + udst0 = vmlaq_n_s32(udst0, usrc_low, chrFilter[j]); > + udst1 = vmlaq_n_s32(udst1, usrc_high, chrFilter[j]); > + vdst0 = vmlaq_n_s32(vdst0, vsrc_low, chrFilter[j]); > + vdst1 = vmlaq_n_s32(vdst1, vsrc_high, chrFilter[j]); > + } > + // Right shift by 19 > + udst0 = vshrq_n_s32(udst0, 19); > + udst1 = vshrq_n_s32(udst1, 19); > + vdst0 = vshrq_n_s32(vdst0, 19); > + vdst1 = vshrq_n_s32(vdst1, 19); > + > + // Convert to 16-bit and then to uint8, with saturation > + int16x8_t u16 = vcombine_s16(vqmovn_s32(udst0), > vqmovn_s32(udst1)); > + int16x8_t v16 = vcombine_s16(vqmovn_s32(vdst0), > vqmovn_s32(vdst1)); > + > + uint8x8_t u8 = vqmovun_s16(u16); > + uint8x8_t v8 = vqmovun_s16(v16); > + > + // Store interleaved u/v as UV UV UV... 
> + uint8x8x2_t uv; > + uv.val[0] = v8; > + uv.val[1] = u8; > + vst2_u8(dest + 2 * i, uv); > + } > + > + // Handle remaining pixels with scalar fallback > + for (; i < chrDstW; i++) > + { > + int u = chrDither[i & 7] << 12; > + int v = chrDither[(i + 3) & 7] << 12; > + > + for (int j = 0; j < chrFilterSize; j++) > + { > + u += chrUSrc[j][i] * chrFilter[j]; > + v += chrVSrc[j][i] * chrFilter[j]; > + } > + > + uint8_t uu = av_clip_uint8(u >> 19); > + uint8_t vv = av_clip_uint8(v >> 19); > + dest[2 * i] = vv; > + dest[2 * i + 1] = uu; > + } > + } > + } > +} > + > #define SCALE_FUNC(filter_n, from_bpc, to_bpc, opt) \ > void ff_hscale ## from_bpc ## to ## to_bpc ## _ ## filter_n ## _ ## opt( \ > SwsInternal *c, int16_t > *data, \ > @@ -275,6 +425,7 @@ av_cold void ff_sws_init_swscale_aarch64(SwsInternal *c) > ASSIGN_VSCALE_FUNC(c->yuv2plane1, neon); > if (c->dstBpc == 8) { > c->yuv2planeX = ff_yuv2planeX_8_neon; > + c->yuv2nv12cX = ff_yuv2nv12cX_neon; > } > switch (c->opts.src_format) { > case AV_PIX_FMT_ABGR: > -- > 2.36.0.windows.1 > > > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe". _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe". _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".