On Mon, 3 Mar 2025, 16:38 Shreesh Adiga, <16567adigashre...@gmail.com> wrote:
> On Thu, Feb 20, 2025 at 6:51 PM Shreesh Adiga
> <16567adigashre...@gmail.com> wrote:
> >
> > Currently the AVX2 version of uyvytoyuv422 does the following in the
> > SIMD loop:
> > 4 vinserti128 instructions to interleave the vector lanes during the
> > load from memory.
> > 4 vperm2i128 instructions inside the 4 RSHIFT_COPY calls to achieve
> > the desired layout.
> >
> > This patch replaces those 8 instructions with 2 vpermq and 2 vpermd
> > using a shuffle vector, similar to the AVX512ICL version.
> >
> > Observed the following numbers on various microarchitectures:
> >
> > On AMD Zen3 laptop:
> > Before:
> > uyvytoyuv422_c:         51979.7 ( 1.00x)
> > uyvytoyuv422_sse2:       5410.5 ( 9.61x)
> > uyvytoyuv422_avx:        4642.7 (11.20x)
> > uyvytoyuv422_avx2:       4249.0 (12.23x)
> >
> > After:
> > uyvytoyuv422_c:         51659.8 ( 1.00x)
> > uyvytoyuv422_sse2:       5420.8 ( 9.53x)
> > uyvytoyuv422_avx:        4651.2 (11.11x)
> > uyvytoyuv422_avx2:       3953.8 (13.07x)
> >
> > On Intel Macbook Pro 2019:
> > Before:
> > uyvytoyuv422_c:        185014.4 ( 1.00x)
> > uyvytoyuv422_sse2:      22800.4 ( 8.11x)
> > uyvytoyuv422_avx:       19796.9 ( 9.35x)
> > uyvytoyuv422_avx2:      13141.9 (14.08x)
> >
> > After:
> > uyvytoyuv422_c:        185093.4 ( 1.00x)
> > uyvytoyuv422_sse2:      22795.4 ( 8.12x)
> > uyvytoyuv422_avx:       19791.9 ( 9.35x)
> > uyvytoyuv422_avx2:      12043.1 (15.37x)
> >
> > On AMD Zen4 desktop:
> > Before:
> > uyvytoyuv422_c:         29105.0 ( 1.00x)
> > uyvytoyuv422_sse2:       3888.0 ( 7.49x)
> > uyvytoyuv422_avx:        3374.2 ( 8.63x)
> > uyvytoyuv422_avx2:       2649.8 (10.98x)
> > uyvytoyuv422_avx512icl:  1615.0 (18.02x)
> >
> > After:
> > uyvytoyuv422_c:         29093.4 ( 1.00x)
> > uyvytoyuv422_sse2:       3874.4 ( 7.51x)
> > uyvytoyuv422_avx:        3371.6 ( 8.63x)
> > uyvytoyuv422_avx2:       2174.6 (13.38x)
> > uyvytoyuv422_avx512icl:  1625.1 (17.90x)
> >
> > Signed-off-by: Shreesh Adiga <16567adigashre...@gmail.com>
> > ---
> >  libswscale/x86/rgb_2_rgb.asm | 68 ++++++++++++++++++------------------
> >  1 file changed, 34 insertions(+), 34 deletions(-)
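[For readers who want to see why a single cross-lane permute is enough here, a minimal standalone sketch in plain C with AVX2 intrinsics, not the patch's asm itself; compile with e.g. gcc -mavx2. It shows that a 256-bit packuswb packs each 128-bit lane independently, leaving the two halves interleaved, and that one vpermq with immediate 0xd8 (qword order 0,2,1,3) restores linear order, which is the same fixup the patch inserts after each luma packuswb. The test values are illustrative.]

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* 32 words holding the values 0..31, standing in for the masked Y words. */
    uint16_t src[32];
    for (int i = 0; i < 32; i++)
        src[i] = (uint16_t)i;

    __m256i a = _mm256_loadu_si256((const __m256i *)src);        /* words 0..15  */
    __m256i b = _mm256_loadu_si256((const __m256i *)(src + 16)); /* words 16..31 */

    /* packuswb packs per 128-bit lane, so the bytes come out as
     * [0..7, 16..23 | 8..15, 24..31]: the two halves are interleaved. */
    __m256i packed = _mm256_packus_epi16(a, b);

    /* vpermq with imm 0xd8 selects qwords 0,2,1,3, crossing the lanes once
     * and restoring linear order 0..31. */
    __m256i linear = _mm256_permute4x64_epi64(packed, 0xd8);

    uint8_t out[32];
    _mm256_storeu_si256((__m256i *)out, linear);
    for (int i = 0; i < 32; i++)
        printf("%d ", out[i]); /* prints 0 1 2 ... 31 */
    printf("\n");
    return 0;
}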
> > diff --git a/libswscale/x86/rgb_2_rgb.asm b/libswscale/x86/rgb_2_rgb.asm
> > index 6e4df17298..871bb21127 100644
> > --- a/libswscale/x86/rgb_2_rgb.asm
> > +++ b/libswscale/x86/rgb_2_rgb.asm
> > @@ -49,18 +49,21 @@ shuf_perm2b: db 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25,
> >                  97, 99, 101, 103, 105, 107, 109, 111, 113, 115, 117, 119, 121, 123, 125, 127
> >  %endif
> >
> > +%if HAVE_AVX2_EXTERNAL
> > +; shuffle vector to rearrange packuswb result to be linear
> > +shuf_packus_avx2: db 0, 0, 0, 0, 4, 0, 0, 0, 1, 0, 0, 0, 5, 0, 0, 0,\
> > +                     2, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0,
> > +%endif
> > +
> >  SECTION .text
> >
> > -%macro RSHIFT_COPY 5
> > +%macro RSHIFT_COPY 3
> >  ; %1 dst ; %2 src ; %3 shift
> > -%if mmsize == 32
> > -    vperm2i128 %1, %2, %3, %5
> > -    RSHIFT     %1, %4
> > -%elif cpuflag(avx)
> > -    psrldq     %1, %2, %4
> > +%if cpuflag(avx) || cpuflag(avx2) || cpuflag(avx512icl)
> > +    psrldq     %1, %2, %3
> >  %else
> >      mova       %1, %2
> > -    RSHIFT     %1, %4
> > +    RSHIFT     %1, %3
> >  %endif
> >  %endmacro
> >
> > @@ -170,18 +173,16 @@ SHUFFLE_BYTES 1, 2, 0, 3
> >  ;                     int lumStride, int chromStride, int srcStride)
> >  ;-----------------------------------------------------------------------------------------------
> >  %macro UYVY_TO_YUV422 0
> > -%if mmsize == 64
> > -; need two more registers to store shuffle vectors for AVX512ICL
> > -cglobal uyvytoyuv422, 9, 14, 10, ydst, udst, vdst, src, w, h, lum_stride, chrom_stride, src_stride, wtwo, whalf, tmp, x, back_w
> > -%else
> > -cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_stride, src_stride, wtwo, whalf, tmp, x, back_w
> > -%endif
> > +cglobal uyvytoyuv422, 9, 14, 8 + cpuflag(avx2) + cpuflag(avx512icl), ydst, udst, vdst, src, w, h, lum_stride, chrom_stride, src_stride, wtwo, whalf, tmp, x, back_w
> >      pxor         m0, m0
> >  %if mmsize == 64
> >      vpternlogd   m1, m1, m1, 0xff ; m1 = _mm512_set1_epi8(0xff)
> >      movu         m8, [shuf_packus]
> >      movu         m9, [shuf_perm2b]
> >  %else
> > +    %if cpuflag(avx2)
> > +    movu         m8, [shuf_packus_avx2]
> > +    %endif
> >      pcmpeqw      m1, m1
> >  %endif
> >      psrlw        m1, 8
> >
> > @@ -295,21 +296,10 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s
> >      jge .end_line
> >
> >  .loop_simd:
> > -%if mmsize == 32
> > -    movu        xm2, [srcq + wtwoq         ]
> > -    movu        xm3, [srcq + wtwoq + 16    ]
> > -    movu        xm4, [srcq + wtwoq + 16 * 2]
> > -    movu        xm5, [srcq + wtwoq + 16 * 3]
> > -    vinserti128  m2, m2, [srcq + wtwoq + 16 * 4], 1
> > -    vinserti128  m3, m3, [srcq + wtwoq + 16 * 5], 1
> > -    vinserti128  m4, m4, [srcq + wtwoq + 16 * 6], 1
> > -    vinserti128  m5, m5, [srcq + wtwoq + 16 * 7], 1
> > -%else
> >      movu         m2, [srcq + wtwoq             ]
> >      movu         m3, [srcq + wtwoq + mmsize    ]
> >      movu         m4, [srcq + wtwoq + mmsize * 2]
> >      movu         m5, [srcq + wtwoq + mmsize * 3]
> > -%endif
> >
> >  %if mmsize == 64
> >      ; extract y part 1
> > @@ -323,23 +313,29 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s
> >      movu   [ydstq + wq + mmsize], m7
> >  %else
> >      ; extract y part 1
> > -    RSHIFT_COPY  m6, m2, m4, 1, 0x20 ; UYVY UYVY -> YVYU YVY...
> > -    pand         m6, m1; YxYx YxYx...
> > +    RSHIFT_COPY  m6, m2, 1 ; UYVY UYVY -> YVYU YVY...
> > +    pand         m6, m1    ; YxYx YxYx...
> >
> > -    RSHIFT_COPY  m7, m3, m5, 1, 0x20 ; UYVY UYVY -> YVYU YVY...
> > -    pand         m7, m1 ; YxYx YxYx...
> > +    RSHIFT_COPY  m7, m3, 1 ; UYVY UYVY -> YVYU YVY...
> > +    pand         m7, m1    ; YxYx YxYx...
> >
> > -    packuswb     m6, m7 ; YYYY YYYY...
> > +    packuswb     m6, m7    ; YYYY YYYY...
> > +%if mmsize == 32
> > +    vpermq       m6, m6, 0xd8
> > +%endif
> >      movu   [ydstq + wq], m6
> >
> >      ; extract y part 2
> > -    RSHIFT_COPY  m6, m4, m2, 1, 0x13 ; UYVY UYVY -> YVYU YVY...
> > -    pand         m6, m1; YxYx YxYx...
> > +    RSHIFT_COPY  m6, m4, 1 ; UYVY UYVY -> YVYU YVY...
> > +    pand         m6, m1    ; YxYx YxYx...
> >
> > -    RSHIFT_COPY  m7, m5, m3, 1, 0x13 ; UYVY UYVY -> YVYU YVY...
> > -    pand         m7, m1 ; YxYx YxYx...
> > +    RSHIFT_COPY  m7, m5, 1 ; UYVY UYVY -> YVYU YVY...
> > +    pand         m7, m1    ; YxYx YxYx...
> >
> > -    packuswb     m6, m7 ; YYYY YYYY...
> > +    packuswb     m6, m7    ; YYYY YYYY...
> > +%if mmsize == 32
> > +    vpermq       m6, m6, 0xd8
> > +%endif
> >      movu   [ydstq + wq + mmsize], m6
> >  %endif
> >
> > @@ -359,6 +355,8 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s
> >      packuswb     m6, m7 ; UUUU
> >  %if mmsize == 64
> >      vpermb       m6, m8, m6
> > +%elif mmsize == 32
> > +    vpermd       m6, m8, m6
> >  %endif
> >      movu   [udstq + whalfq], m6
> >
> > @@ -369,6 +367,8 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s
> >      packuswb     m2, m4 ; VVVV
> >  %if mmsize == 64
> >      vpermb       m2, m8, m2
> > +%elif mmsize == 32
> > +    vpermd       m2, m8, m2
> >  %endif
> >      movu   [vdstq + whalfq], m2
> >
> > --
> > 2.45.3
> >
>
> Hello Maintainers,
>
> Any feedback on this submission?
>
> Thanks,
> Shreesh

LGTM. I will push when I am at a PC

Kieran
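[For completeness, the chroma side of the same trick. In the patch, the U (or V) bytes go through two rounds of per-lane packuswb, which leaves them in dword-sized blocks out of linear order; the new vpermd with shuf_packus_avx2 puts them back. Below is a sketch of my reading of that pipeline for the U plane, again in plain C with AVX2 intrinsics rather than the FFmpeg asm; the UYVY test pattern and variable names are illustrative.]

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* 128 bytes of synthetic UYVY, one AVX2 loop iteration:
     * pixel pair k contributes U = k, V = 100 + k, Y = 0. */
    uint8_t src[128];
    for (int k = 0; k < 32; k++) {
        src[4 * k + 0] = (uint8_t)k;         /* U_k */
        src[4 * k + 1] = 0;                  /* Y   */
        src[4 * k + 2] = (uint8_t)(100 + k); /* V_k */
        src[4 * k + 3] = 0;                  /* Y   */
    }

    __m256i mask = _mm256_set1_epi16(0x00ff); /* the m1 word mask from the asm */
    __m256i m2 = _mm256_and_si256(_mm256_loadu_si256((const __m256i *)(src +  0)), mask);
    __m256i m3 = _mm256_and_si256(_mm256_loadu_si256((const __m256i *)(src + 32)), mask);
    __m256i m4 = _mm256_and_si256(_mm256_loadu_si256((const __m256i *)(src + 64)), mask);
    __m256i m5 = _mm256_and_si256(_mm256_loadu_si256((const __m256i *)(src + 96)), mask);

    /* First per-lane pack: UVUV... within each 128-bit lane. */
    __m256i uv01 = _mm256_packus_epi16(m2, m3);
    __m256i uv23 = _mm256_packus_epi16(m4, m5);

    /* Keep the U bytes (even positions) and pack a second time. The U
     * blocks now sit in dword order [U0-3][U8-11][U16-19][U24-27]
     * [U4-7][U12-15][U20-23][U28-31]. */
    __m256i u01 = _mm256_and_si256(uv01, mask);
    __m256i u23 = _mm256_and_si256(uv23, mask);
    __m256i u   = _mm256_packus_epi16(u01, u23);

    /* vpermd with the dword pattern {0,4,1,5,2,6,3,7}, which is exactly
     * shuf_packus_avx2 read as little-endian dwords, linearizes the blocks. */
    __m256i idx = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7);
    u = _mm256_permutevar8x32_epi32(u, idx);

    uint8_t out[32];
    _mm256_storeu_si256((__m256i *)out, u);
    for (int i = 0; i < 32; i++)
        printf("%d ", out[i]); /* prints 0 1 2 ... 31: U0..U31 in order */
    printf("\n");
    return 0;
}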