On Mon, Feb 3, 2025 at 10:03 PM Shreesh Adiga <16567adigashre...@gmail.com> wrote:
>
> The scalar loop is replaced with masked AVX512 instructions.
> For extracting the Y from UYVY, vperm2b is used instead of
> various AND and packuswb operations.
>
> Instead of loading the vectors with interleaved lanes as done
> in the AVX2 version, a normal load is used. After packuswb,
> an extra permute operation is done for U and V to get the
> required layout.
>
> AMD 7950x Zen 4 benchmark data:
> uyvytoyuv422_c:          29105.0 ( 1.00x)
> uyvytoyuv422_sse2:        3888.0 ( 7.49x)
> uyvytoyuv422_avx:         3374.2 ( 8.63x)
> uyvytoyuv422_avx2:        2649.8 (10.98x)
> uyvytoyuv422_avx512icl:   1615.0 (18.02x)
>
> Signed-off-by: Shreesh Adiga <16567adigashre...@gmail.com>
> ---
>  libswscale/x86/rgb2rgb.c     |   6 ++
>  libswscale/x86/rgb_2_rgb.asm | 105 +++++++++++++++++++++++++++++++++++
>  2 files changed, 111 insertions(+)
>
> diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c
> index 4cbed54b35..6601dad233 100644
> --- a/libswscale/x86/rgb2rgb.c
> +++ b/libswscale/x86/rgb2rgb.c
> @@ -2383,6 +2383,9 @@ void ff_uyvytoyuv422_avx(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
>  void ff_uyvytoyuv422_avx2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
>                            const uint8_t *src, int width, int height,
>                            int lumStride, int chromStride, int srcStride);
> +void ff_uyvytoyuv422_avx512icl(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
> +                               const uint8_t *src, int width, int height,
> +                               int lumStride, int chromStride, int srcStride);
>  #endif
>
>  #define DEINTERLEAVE_BYTES(cpuext)                                          \
> @@ -2477,6 +2480,9 @@ av_cold void rgb2rgb_init_x86(void)
>      }
>      if (EXTERNAL_AVX2_FAST(cpu_flags)) {
>          uyvytoyuv422 = ff_uyvytoyuv422_avx2;
> +    }
> +    if (EXTERNAL_AVX512ICL(cpu_flags)) {
> +        uyvytoyuv422 = ff_uyvytoyuv422_avx512icl;
>  #endif
>      }
>  #endif
> diff --git a/libswscale/x86/rgb_2_rgb.asm b/libswscale/x86/rgb_2_rgb.asm
> index ca7a481255..6e4df17298 100644
> --- a/libswscale/x86/rgb_2_rgb.asm
> +++ b/libswscale/x86/rgb_2_rgb.asm
> @@ -35,6 +35,20 @@ pb_shuffle2013: db 2, 0, 1, 3, 6, 4, 5, 7, 10, 8, 9, 11, 14, 12, 13, 15
>  pb_shuffle2130: db 2, 1, 3, 0, 6, 5, 7, 4, 10, 9, 11, 8, 14, 13, 15, 12
>  pb_shuffle1203: db 1, 2, 0, 3, 5, 6, 4, 7, 9, 10, 8, 11, 13, 14, 12, 15
>
> +%if HAVE_AVX512ICL_EXTERNAL
> +; shuffle vector to rearrange packuswb result to be linear
> +shuf_packus: db  0,  1,  2,  3, 16, 17, 18, 19, 32, 33, 34, 35, 48, 49, 50, 51,\
> +              4,  5,  6,  7, 20, 21, 22, 23, 36, 37, 38, 39, 52, 53, 54, 55,\
> +              8,  9, 10, 11, 24, 25, 26, 27, 40, 41, 42, 43, 56, 57, 58, 59,\
> +             12, 13, 14, 15, 28, 29, 30, 31, 44, 45, 46, 47, 60, 61, 62, 63
> +
> +; shuffle vector to combine odd elements from two vectors to extract Y
> +shuf_perm2b: db   1,   3,   5,   7,   9,  11,  13,  15,  17,  19,  21,  23,  25,  27,  29,  31,\
> +             33,  35,  37,  39,  41,  43,  45,  47,  49,  51,  53,  55,  57,  59,  61,  63,\
> +             65,  67,  69,  71,  73,  75,  77,  79,  81,  83,  85,  87,  89,  91,  93,  95,\
> +             97,  99, 101, 103, 105, 107, 109, 111, 113, 115, 117, 119, 121, 123, 125, 127
> +%endif
> +
>  SECTION .text
>
>  %macro RSHIFT_COPY 5
> @@ -156,9 +170,20 @@ SHUFFLE_BYTES 1, 2, 0, 3
>  ;                      int lumStride, int chromStride, int srcStride)
>  ;-----------------------------------------------------------------------------------------------
>  %macro UYVY_TO_YUV422 0
> +%if mmsize == 64
> +; need two more registers to store shuffle vectors for AVX512ICL
> +cglobal uyvytoyuv422, 9, 14, 10, ydst, udst, vdst, src, w, h, lum_stride, chrom_stride, src_stride, wtwo, whalf, tmp, x, back_w
> +%else
>  cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_stride, src_stride, wtwo, whalf, tmp, x, back_w
> +%endif
>      pxor         m0, m0
> +%if mmsize == 64
> +    vpternlogd   m1, m1, m1, 0xff ; m1 = _mm512_set1_epi8(0xff)
> +    movu         m8, [shuf_packus]
> +    movu         m9, [shuf_perm2b]
> +%else
>      pcmpeqw      m1, m1
> +%endif
>      psrlw        m1, 8
>
>      movsxdifnidn wq, wd
> @@ -188,6 +213,63 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s
>      and xq, mmsize * 2 - 1
>      je .loop_simd
>
> +%if mmsize == 64
> +    shr xq, 1
> +    mov tmpq, -1
> +    shlx tmpq, tmpq, xq
> +    not tmpq
> +    kmovq k7, tmpq ; write mask for U/V
> +    kmovd k1, tmpd ; write mask for 1st half of Y
> +    kmovw k3, tmpd ; read mask for 1st vector
> +    shr tmpq, 16
> +    kmovw k4, tmpd ; read mask for 2nd vector
> +    shr tmpq, 16
> +    kmovd k2, tmpd ; write mask for 2nd half of Y
> +    kmovw k5, tmpd ; read mask for 3rd vector
> +    shr tmpd, 16
> +    kmovw k6, tmpd ; read mask for 4th vector
> +
> +    vmovdqu32 m2{k3}{z}, [srcq + wtwoq             ]
> +    vmovdqu32 m3{k4}{z}, [srcq + wtwoq + mmsize    ]
> +    vmovdqu32 m4{k5}{z}, [srcq + wtwoq + mmsize * 2]
> +    vmovdqu32 m5{k6}{z}, [srcq + wtwoq + mmsize * 3]
> +
> +    ; extract y part 1
> +    mova      m6, m9
> +    vpermi2b  m6, m2, m3 ; UYVY UYVY -> YYYY using permute
> +    vmovdqu16 [ydstq + wq]{k1}, m6
> +
> +    ; extract y part 2
> +    mova      m7, m9
> +    vpermi2b  m7, m4, m5 ; UYVY UYVY -> YYYY using permute
> +    vmovdqu16 [ydstq + wq + mmsize]{k2}, m7
> +
> +    ; extract uv
> +    pand      m2, m1 ; UxVx...
> +    pand      m3, m1 ; UxVx...
> +    pand      m4, m1 ; UxVx...
> +    pand      m5, m1 ; UxVx...
> +    packuswb  m2, m3 ; UVUV...
> +    packuswb  m4, m5 ; UVUV...
> +
> +    ; U
> +    pand      m6, m2, m1 ; UxUx...
> +    pand      m7, m4, m1 ; UxUx...
> +    packuswb  m6, m7     ; UUUU
> +    vpermb    m6, m8, m6
> +    vmovdqu8  [udstq + whalfq]{k7}, m6
> +
> +    ; V
> +    psrlw     m2, 8      ; VxVx...
> +    psrlw     m4, 8      ; VxVx...
> +    packuswb  m2, m4     ; VVVV
> +    vpermb    m2, m8, m2
> +    vmovdqu8  [vdstq + whalfq]{k7}, m2
> +
> +    lea wq,    [   wq + 2 * xq]
> +    lea wtwoq, [wtwoq + 4 * xq]
> +    add whalfq, xq
> +%else
>  .loop_scalar:
>      mov tmpb, [srcq + wtwoq + 0]
>      mov [udstq + whalfq], tmpb
> @@ -206,6 +288,7 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s
>      add whalfq, 1
>      sub xq, 2
>      jg .loop_scalar
> +%endif
>
>      ; check if simd loop is need
>      cmp wq, 0
> @@ -228,6 +311,17 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s
>      movu m5, [srcq + wtwoq + mmsize * 3]
>  %endif
>
> +%if mmsize == 64
> +    ; extract y part 1
> +    mova      m6, m9
> +    vpermi2b  m6, m2, m3 ; UYVY UYVY -> YYYY using permute
> +    movu      [ydstq + wq], m6
> +
> +    ; extract y part 2
> +    mova      m7, m9
> +    vpermi2b  m7, m4, m5 ; UYVY UYVY -> YYYY using permute
> +    movu      [ydstq + wq + mmsize], m7
> +%else
>      ; extract y part 1
>      RSHIFT_COPY m6, m2, m4, 1, 0x20 ; UYVY UYVY -> YVYU YVY...
>      pand m6, m1; YxYx YxYx...
> @@ -247,6 +341,7 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s
>
>      packuswb m6, m7 ; YYYY YYYY...
>      movu [ydstq + wq + mmsize], m6
> +%endif
>
>      ; extract uv
>      pand m2, m1 ; UxVx...
> @@ -262,6 +357,9 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s
>      pand m7, m4, m1 ; UxUx...
>
>      packuswb m6, m7 ; UUUU
> +%if mmsize == 64
> +    vpermb m6, m8, m6
> +%endif
>      movu [udstq + whalfq], m6
>
>
> @@ -269,6 +367,9 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s
>      psrlw m2, 8 ; VxVx...
>      psrlw m4, 8 ; VxVx...
>      packuswb m2, m4 ; VVVV
> +%if mmsize == 64
> +    vpermb m2, m8, m2
> +%endif
>      movu [vdstq + whalfq], m2
>
>      add whalfq, mmsize
> @@ -303,4 +404,8 @@ UYVY_TO_YUV422
>  INIT_YMM avx2
>  UYVY_TO_YUV422
>  %endif
> +%if HAVE_AVX512ICL_EXTERNAL
> +INIT_ZMM avx512icl
> +UYVY_TO_YUV422
> +%endif
>  %endif
> --
> 2.45.3
>
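
A note on the approach, in case it helps review: below is a rough, standalone C intrinsics sketch of the vperm2b-based Y extraction used above. It is illustrative only, not the patch code; extract_y() and the runtime index setup are hypothetical (the asm keeps the indices in the shuf_perm2b constant). The index bytes 1, 3, ..., 127 select every odd byte of the concatenated 128-byte input, so a single two-source byte permute produces all 64 Y samples from two UYVY vectors.

    #include <immintrin.h>
    #include <stdint.h>

    /* Illustrative helper, not FFmpeg API: gather the 64 Y bytes (the odd
     * bytes) from two 64-byte UYVY vectors with a single vpermi2b. */
    static __m512i extract_y(__m512i uyvy0, __m512i uyvy1)
    {
        uint8_t idx[64];
        for (int i = 0; i < 64; i++)
            idx[i] = (uint8_t)(2 * i + 1); /* 1, 3, ..., 127; indices >= 64 pick uyvy1 */
        return _mm512_permutex2var_epi8(uyvy0, _mm512_loadu_si512(idx), uyvy1);
    }

One vpermi2b per pair of input vectors replaces the RSHIFT_COPY/pand/packuswb sequence that the SSE2/AVX2 paths use for Y.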
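
The scalar tail loop is likewise replaced by masked loads and stores. The general idea, in a much-simplified byte-granularity form (the patch derives separate k-masks for the four reads and for the Y, U and V writes), is sketched below; copy_tail() is a hypothetical helper, not the patch code.

    #include <immintrin.h>
    #include <stddef.h>
    #include <stdint.h>

    /* Illustration only: handle a tail of n <= 64 bytes with one masked
     * load/store instead of a scalar loop. */
    static void copy_tail(uint8_t *dst, const uint8_t *src, size_t n)
    {
        __mmask64 k = (n >= 64) ? ~(__mmask64)0 : (((__mmask64)1 << n) - 1);
        __m512i   v = _mm512_maskz_loadu_epi8(k, src); /* masked-off lanes read as zero */
        _mm512_mask_storeu_epi8(dst, k, v);            /* masked-off lanes left untouched */
    }

This is why the tail is handled in a single pass in the patch, after which the w, wtwo and whalf offsets are simply advanced by the tail length.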

Hi maintainers,

Would anyone be willing to review this patch and provide feedback on getting it accepted? As a new contributor interested in contributing ASM, I was hoping to work on the AVX512ICL tasks mentioned on https://trac.ffmpeg.org/wiki/SmallASMTasks.

Thanks,
Shreesh