The current algorithm could be improved further — could you combine these optimizations with your patches? The extra VPERM makes the code a little slower.
On Haswell the current algorithm: RSHIFT_COPY m6, m2, 1 ; UYVY UYVY -> YVYU YVY... pand m6, m1; YxYx YxYx... RSHIFT_COPY m7, m3, 1 ; UYVY UYVY -> YVYU YVY... pand m7, m1 ; YxYx YxYx... packuswb m6, m7 ; YYYY YYYY... Latency: 1 + 1 + 1 + 1 + 1 = 5 Proposed: pshufb m6, m2, mX ; UYVY UYVY -> xxxx YYYY pshufb m7, m3, mX punpcklqdq m6, m7 ; YYYY YYYY Latency: 1 + 1 + 1 = 3 I guess the current algorithm is optimized for compatibility with SSE2, since PSHUFB was only added in SSSE3. Now that we are optimizing for AVX, AVX2 and AVX-512, I suggest we use the proposed algorithm to get more performance. Regards, Min Chen At 2021-09-28 13:34:03, "Wu Jianhua" <jianhua...@intel.com> wrote: >With the accelerating by means of AVX2, the uyvytoyuv422 can be faster > >Performance data(Less is better): > uyvytoyuv422_sse2 0.49381 > uyvytoyuv422_avx 0.42981 > uyvytoyuv422_avx2 0.27915 > >Signed-off-by: Wu Jianhua <jianhua...@intel.com> >--- > libswscale/x86/rgb2rgb.c | 6 +++++ > libswscale/x86/rgb_2_rgb.asm | 48 +++++++++++++++++++++++++++--------- > 2 files changed, 42 insertions(+), 12 deletions(-) > >diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c >index c9ff33ab77..a965a1755c 100644 >--- a/libswscale/x86/rgb2rgb.c >+++ b/libswscale/x86/rgb2rgb.c >@@ -164,6 +164,9 @@ void ff_uyvytoyuv422_sse2(uint8_t *ydst, uint8_t *udst, >uint8_t *vdst, > void ff_uyvytoyuv422_avx(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, > const uint8_t *src, int width, int height, > int lumStride, int chromStride, int srcStride); >+void ff_uyvytoyuv422_avx2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, >+ const uint8_t *src, int width, int height, >+ int lumStride, int chromStride, int srcStride); > #endif > > av_cold void rgb2rgb_init_x86(void) >@@ -216,5 +219,8 @@ av_cold void rgb2rgb_init_x86(void) > if (EXTERNAL_AVX(cpu_flags)) { > uyvytoyuv422 = ff_uyvytoyuv422_avx; > } >+ if (EXTERNAL_AVX2_FAST(cpu_flags)) { >+ uyvytoyuv422 = ff_uyvytoyuv422_avx2; >+ } > #endif > } >diff --git 
a/libswscale/x86/rgb_2_rgb.asm b/libswscale/x86/rgb_2_rgb.asm >index 3380a1272c..683bd067a5 100644 >--- a/libswscale/x86/rgb_2_rgb.asm >+++ b/libswscale/x86/rgb_2_rgb.asm >@@ -31,9 +31,16 @@ pb_shuffle0321: db 0, 3, 2, 1, 4, 7, 6, 5, 8, 11, 10, 9, >12, 15, 14, 13 > pb_shuffle1230: db 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12 > pb_shuffle3012: db 3, 0, 1, 2, 7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14 > pb_shuffle3210: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 >+pd_permd256_uv: dd 0, 4, 1, 5, 2, 6, 3, 7 > > SECTION .text > >+%macro VPERM 5 >+%if mmsize == %2 >+ vperm%1 %3, %4, %5 >+%endif >+%endmacro >+ > %macro RSHIFT_COPY 3 > ; %1 dst ; %2 src ; %3 shift > %if cpuflag(avx) >@@ -198,11 +205,15 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, >w, h, lum_stride, chrom_s > mov whalfq, wq > shr whalfq, 1 ; whalf = width / 2 > >- lea srcq, [srcq + wq * 2] >+ lea srcq, [srcq + wq * 2] > add ydstq, wq > add udstq, whalfq > add vdstq, whalfq > >+%if mmsize == 32 >+ movu m15, [pd_permd256_uv] >+%endif >+ > .loop_line: > mov xq, wq > mov wtwoq, wq >@@ -251,8 +262,10 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, >h, lum_stride, chrom_s > > RSHIFT_COPY m7, m3, 1 ; UYVY UYVY -> YVYU YVY... > pand m7, m1 ; YxYx YxYx... >- > packuswb m6, m7 ; YYYY YYYY... >+ >+ VPERM q, 32, m6, m6, 0xd8 >+ > movu [ydstq + wq], m6 > > ; extract y part 2 >@@ -261,8 +274,10 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, >h, lum_stride, chrom_s > > RSHIFT_COPY m7, m5, 1 ; UYVY UYVY -> YVYU YVY... > pand m7, m1 ; YxYx YxYx... >+ packuswb m6, m7 ; YYYY YYYY... >+ >+ VPERM q, 32, m6, m6, 0xd8 > >- packuswb m6, m7 ; YYYY YYYY... > movu [ydstq + wq + mmsize], m6 > > ; extract uv >@@ -275,17 +290,21 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, >w, h, lum_stride, chrom_s > packuswb m4, m5 ; UVUV... > > ; U >- pand m6, m2, m1 ; UxUx... >- pand m7, m4, m1 ; UxUx... >+ pand m6, m2, m1 ; UxUx... >+ pand m7, m4, m1 ; UxUx... 
>+ packuswb m6, m7 ; UUUU > >- packuswb m6, m7 ; UUUU >- movu [udstq + whalfq], m6 >+ VPERM d, 32, m6, m15, m6 > >+ movu [udstq + whalfq], m6 > > ; V >- psrlw m2, 8 ; VxVx... >- psrlw m4, 8 ; VxVx... >- packuswb m2, m4 ; VVVV >+ psrlw m2, 8 ; VxVx... >+ psrlw m4, 8 ; VxVx... >+ packuswb m2, m4 ; VVVV >+ >+ VPERM d, 32, m2, m15, m2 >+ > movu [vdstq + whalfq], m2 > > add whalfq, mmsize >@@ -294,13 +313,13 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, >w, h, lum_stride, chrom_s > jl .loop_simd > > .end_line: >- add srcq, src_strideq >+ add srcq, src_strideq > add ydstq, lum_strideq > add udstq, chrom_strideq > add vdstq, chrom_strideq > > ;restore initial state of line variable >- mov wq, back_wq >+ mov wq, back_wq > mov xq, wq > mov whalfq, wq > shr whalfq, 1 ; whalf = width / 2 >@@ -316,4 +335,9 @@ UYVY_TO_YUV422 > > INIT_XMM avx > UYVY_TO_YUV422 >+ >+%if HAVE_AVX2_EXTERNAL >+INIT_YMM avx2 >+UYVY_TO_YUV422 >+%endif > %endif >-- >2.17.1 > >_______________________________________________ >ffmpeg-devel mailing list >ffmpeg-devel@ffmpeg.org >https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > >To unsubscribe, visit link above, or email >ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe". _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".