Hi Andreas, I am not sure if that is needed. I can add the data observed on my machine (AMD 7950x Zen 4), I think this will vary from machine to machine. It is expected to be around 2x compared to AVX2 and there is no core change apart from processing the scalar loop with masked instructions.
The data doesn't entirely look consistent as per my expectations. All the shuffle variants are equivalent in the work they do, yet the speedups are not consistent as per the report. shuffle_bytes_0321_c: 56.5 ( 1.00x) shuffle_bytes_0321_ssse3: 15.2 ( 3.70x) shuffle_bytes_0321_avx2: 10.2 ( 5.51x) shuffle_bytes_0321_avx512icl: 9.2 ( 6.11x) shuffle_bytes_1230_c: 84.5 ( 1.00x) shuffle_bytes_1230_ssse3: 14.2 ( 5.93x) shuffle_bytes_1230_avx2: 15.2 ( 5.54x) shuffle_bytes_1230_avx512icl: 11.2 ( 7.51x) shuffle_bytes_2103_c: 48.5 ( 1.00x) shuffle_bytes_2103_ssse3: 21.2 ( 2.28x) shuffle_bytes_2103_avx2: 13.8 ( 3.53x) shuffle_bytes_2103_avx512icl: 9.2 ( 5.24x) shuffle_bytes_3012_c: 84.5 ( 1.00x) shuffle_bytes_3012_ssse3: 14.2 ( 5.93x) shuffle_bytes_3012_avx2: 16.2 ( 5.20x) shuffle_bytes_3012_avx512icl: 10.2 ( 8.24x) shuffle_bytes_3210_c: 89.2 ( 1.00x) shuffle_bytes_3210_ssse3: 24.2 ( 3.68x) shuffle_bytes_3210_avx2: 16.2 ( 5.49x) shuffle_bytes_3210_avx512icl: 9.2 ( 9.65x) I can add the details to commit message if you can confirm if it is needed. Thanks, Shreesh On Wed, Jan 29, 2025 at 5:46 PM Andreas Rheinhardt < andreas.rheinha...@outlook.com> wrote: > Shreesh Adiga: > > Signed-off-by: Shreesh Adiga <16567adigashre...@gmail.com> > > --- > > v3: Fix build failure on older nasm by replacing "kmovw k, tmpw" > > with "kmov k, tmpd" which matches "kmovw k, r32" syntax. > > v2: Tried to align operands and improve indentation for ASM routine. > > libswscale/x86/rgb2rgb.c | 21 +++++++++ > > libswscale/x86/rgb_2_rgb.asm | 90 +++++++++++++++++++++++------------- > > 2 files changed, 80 insertions(+), 31 deletions(-) > > > > diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c > > index 6790551a38..4cbed54b35 100644 > > --- a/libswscale/x86/rgb2rgb.c > > +++ b/libswscale/x86/rgb2rgb.c > > @@ -2364,6 +2364,16 @@ void ff_shuffle_bytes_2013_avx2(const uint8_t > *src, uint8_t *dst, int src_size); > > void ff_shuffle_bytes_2130_avx2(const uint8_t *src, uint8_t *dst, int > src_size); > > void ff_shuffle_bytes_1203_avx2(const uint8_t *src, uint8_t *dst, int > src_size); > > > > +void ff_shuffle_bytes_2103_avx512icl(const uint8_t *src, uint8_t *dst, > int src_size); > > +void ff_shuffle_bytes_0321_avx512icl(const uint8_t *src, uint8_t *dst, > int src_size); > > +void ff_shuffle_bytes_1230_avx512icl(const uint8_t *src, uint8_t *dst, > int src_size); > > +void ff_shuffle_bytes_3012_avx512icl(const uint8_t *src, uint8_t *dst, > int src_size); > > +void ff_shuffle_bytes_3210_avx512icl(const uint8_t *src, uint8_t *dst, > int src_size); > > +void ff_shuffle_bytes_3102_avx512icl(const uint8_t *src, uint8_t *dst, > int src_size); > > +void ff_shuffle_bytes_2013_avx512icl(const uint8_t *src, uint8_t *dst, > int src_size); > > +void ff_shuffle_bytes_2130_avx512icl(const uint8_t *src, uint8_t *dst, > int src_size); > > +void ff_shuffle_bytes_1203_avx512icl(const uint8_t *src, uint8_t *dst, > int src_size); > > + > > void ff_uyvytoyuv422_sse2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, > > const uint8_t *src, int width, int height, > > int lumStride, int chromStride, int > srcStride); > > @@ -2454,6 +2464,17 @@ av_cold void rgb2rgb_init_x86(void) > > shuffle_bytes_2130 = ff_shuffle_bytes_2130_avx2; > > shuffle_bytes_1203 = ff_shuffle_bytes_1203_avx2; > > } > > + if (EXTERNAL_AVX512ICL(cpu_flags)) { > > + shuffle_bytes_0321 = ff_shuffle_bytes_0321_avx512icl; > > + shuffle_bytes_2103 = ff_shuffle_bytes_2103_avx512icl; > > + shuffle_bytes_1230 = ff_shuffle_bytes_1230_avx512icl; > > + shuffle_bytes_3012 = ff_shuffle_bytes_3012_avx512icl; > > + shuffle_bytes_3210 = ff_shuffle_bytes_3210_avx512icl; > > + shuffle_bytes_3102 = ff_shuffle_bytes_3102_avx512icl; > > + shuffle_bytes_2013 = ff_shuffle_bytes_2013_avx512icl; > > + shuffle_bytes_2130 = ff_shuffle_bytes_2130_avx512icl; > > + shuffle_bytes_1203 = ff_shuffle_bytes_1203_avx512icl; > > + } > > if (EXTERNAL_AVX2_FAST(cpu_flags)) { > > uyvytoyuv422 = ff_uyvytoyuv422_avx2; > > #endif > > diff --git a/libswscale/x86/rgb_2_rgb.asm b/libswscale/x86/rgb_2_rgb.asm > > index b468beb12d..ca7a481255 100644 > > --- a/libswscale/x86/rgb_2_rgb.asm > > +++ b/libswscale/x86/rgb_2_rgb.asm > > @@ -57,40 +57,53 @@ SECTION .text > > %macro SHUFFLE_BYTES 4 > > cglobal shuffle_bytes_%1%2%3%4, 3, 5, 2, src, dst, w, tmp, x > > VBROADCASTI128 m0, [pb_shuffle%1%2%3%4] > > - movsxdifnidn wq, wd > > - mov xq, wq > > - > > - add srcq, wq > > - add dstq, wq > > - neg wq > > - > > -;calc scalar loop > > + movsxdifnidn wq, wd > > + mov xq, wq > > + > > + add srcq, wq > > + add dstq, wq > > + neg wq > > + > > +%if mmsize == 64 > > + and xq, mmsize - 4 > > + shr xq, 2 > > + mov tmpd, -1 > > + shlx tmpd, tmpd, xd > > + not tmpd > > + kmovw k7, tmpd > > + vmovdqu32 m1{k7}{z}, [srcq + wq] > > + pshufb m1, m0 > > + vmovdqu32 [dstq + wq]{k7}, m1 > > + lea wq, [wq + 4 * xq] > > +%else > > + ;calc scalar loop > > and xq, mmsize-4 > > je .loop_simd > > > > -.loop_scalar: > > - mov tmpb, [srcq + wq + %1] > > - mov [dstq+wq + 0], tmpb > > - mov tmpb, [srcq + wq + %2] > > - mov [dstq+wq + 1], tmpb > > - mov tmpb, [srcq + wq + %3] > > - mov [dstq+wq + 2], tmpb > > - mov tmpb, [srcq + wq + %4] > > - mov [dstq+wq + 3], tmpb > > - add wq, 4 > > - sub xq, 4 > > - jg .loop_scalar > > - > > -;check if src_size < mmsize > > -cmp wq, 0 > > -jge .end > > - > > -.loop_simd: > > - movu m1, [srcq+wq] > > - pshufb m1, m0 > > - movu [dstq+wq], m1 > > - add wq, mmsize > > - jl .loop_simd > > + .loop_scalar: > > + mov tmpb, [srcq + wq + %1] > > + mov [dstq+wq + 0], tmpb > > + mov tmpb, [srcq + wq + %2] > > + mov [dstq+wq + 1], tmpb > > + mov tmpb, [srcq + wq + %3] > > + mov [dstq+wq + 2], tmpb > > + mov tmpb, [srcq + wq + %4] > > + mov [dstq+wq + 3], tmpb > > + add wq, 4 > > + sub xq, 4 > > + jg .loop_scalar > > +%endif > > + > > + ;check if src_size < mmsize > > + cmp wq, 0 > > + jge .end > > + > > + .loop_simd: > > + movu m1, [srcq + wq] > > + pshufb m1, m0 > > + movu [dstq + wq], m1 > > + add wq, mmsize > > + jl .loop_simd > > > > .end: > > RET > > @@ -122,6 +135,21 @@ SHUFFLE_BYTES 1, 2, 0, 3 > > %endif > > %endif > > > > +%if ARCH_X86_64 > > +%if HAVE_AVX512ICL_EXTERNAL > > +INIT_ZMM avx512icl > > +SHUFFLE_BYTES 2, 1, 0, 3 > > +SHUFFLE_BYTES 0, 3, 2, 1 > > +SHUFFLE_BYTES 1, 2, 3, 0 > > +SHUFFLE_BYTES 3, 0, 1, 2 > > +SHUFFLE_BYTES 3, 2, 1, 0 > > +SHUFFLE_BYTES 3, 1, 0, 2 > > +SHUFFLE_BYTES 2, 0, 1, 3 > > +SHUFFLE_BYTES 2, 1, 3, 0 > > +SHUFFLE_BYTES 1, 2, 0, 3 > > +%endif > > +%endif > > + > > > ;----------------------------------------------------------------------------------------------- > > ; uyvytoyuv422(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, > > ; const uint8_t *src, int width, int height, > > -- > > 2.45.3 > > > Why does the commit message not contain speedup numbers? > > - Andreas > > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe". > _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".