[FFmpeg-devel] [PATCH] swscale/x86/rgb2rgb: add AVX512ICL version of uyvytoyuv422
The scalar loop is replaced with masked AVX512 instructions. For extracting the Y from UYVY, vperm2b is used instead of various AND and packuswb. Instead of loading the vectors with interleaved lanes as done in AVX2 version, normal load is used. At the end of packuswb, for U and V, an extra permute operation is done to get the required layout. AMD 7950x Zen 4 benchmark data: uyvytoyuv422_c: 29105.0 ( 1.00x) uyvytoyuv422_sse2:3888.0 ( 7.49x) uyvytoyuv422_avx: 3374.2 ( 8.63x) uyvytoyuv422_avx2:2649.8 (10.98x) uyvytoyuv422_avx512icl: 1615.0 (18.02x) Signed-off-by: Shreesh Adiga <16567adigashre...@gmail.com> --- libswscale/x86/rgb2rgb.c | 6 ++ libswscale/x86/rgb_2_rgb.asm | 105 +++ 2 files changed, 111 insertions(+) diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c index 4cbed54b35..6601dad233 100644 --- a/libswscale/x86/rgb2rgb.c +++ b/libswscale/x86/rgb2rgb.c @@ -2383,6 +2383,9 @@ void ff_uyvytoyuv422_avx(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, void ff_uyvytoyuv422_avx2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, int width, int height, int lumStride, int chromStride, int srcStride); +void ff_uyvytoyuv422_avx512icl(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, + const uint8_t *src, int width, int height, + int lumStride, int chromStride, int srcStride); #endif #define DEINTERLEAVE_BYTES(cpuext)\ @@ -2477,6 +2480,9 @@ av_cold void rgb2rgb_init_x86(void) } if (EXTERNAL_AVX2_FAST(cpu_flags)) { uyvytoyuv422 = ff_uyvytoyuv422_avx2; +} +if (EXTERNAL_AVX512ICL(cpu_flags)) { +uyvytoyuv422 = ff_uyvytoyuv422_avx512icl; #endif } #endif diff --git a/libswscale/x86/rgb_2_rgb.asm b/libswscale/x86/rgb_2_rgb.asm index ca7a481255..6e4df17298 100644 --- a/libswscale/x86/rgb_2_rgb.asm +++ b/libswscale/x86/rgb_2_rgb.asm @@ -35,6 +35,20 @@ pb_shuffle2013: db 2, 0, 1, 3, 6, 4, 5, 7, 10, 8, 9, 11, 14, 12, 13, 15 pb_shuffle2130: db 2, 1, 3, 0, 6, 5, 7, 4, 10, 9, 11, 8, 14, 13, 15, 12 pb_shuffle1203: db 1, 2, 0, 3, 5, 6, 4, 7, 9, 10, 8, 11, 13, 14, 12, 
15 +%if HAVE_AVX512ICL_EXTERNAL +; shuffle vector to rearrange packuswb result to be linear +shuf_packus: db 0, 1, 2, 3, 16, 17, 18, 19, 32, 33, 34, 35, 48, 49, 50, 51,\ + 4, 5, 6, 7, 20, 21, 22, 23, 36, 37, 38, 39, 52, 53, 54, 55,\ + 8, 9, 10, 11, 24, 25, 26, 27, 40, 41, 42, 43, 56, 57, 58, 59,\ +12, 13, 14, 15, 28, 29, 30, 31, 44, 45, 46, 47, 60, 61, 62, 63 + +; shuffle vector to combine odd elements from two vectors to extract Y +shuf_perm2b: db 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,\ +33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63,\ +65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 95,\ +97, 99, 101, 103, 105, 107, 109, 111, 113, 115, 117, 119, 121, 123, 125, 127 +%endif + SECTION .text %macro RSHIFT_COPY 5 @@ -156,9 +170,20 @@ SHUFFLE_BYTES 1, 2, 0, 3 ; int lumStride, int chromStride, int srcStride) ;--- %macro UYVY_TO_YUV422 0 +%if mmsize == 64 +; need two more registers to store shuffle vectors for AVX512ICL +cglobal uyvytoyuv422, 9, 14, 10, ydst, udst, vdst, src, w, h, lum_stride, chrom_stride, src_stride, wtwo, whalf, tmp, x, back_w +%else cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_stride, src_stride, wtwo, whalf, tmp, x, back_w +%endif pxor m0, m0 +%if mmsize == 64 +vpternlogd m1, m1, m1, 0xff ; m1 = _mm512_set1_epi8(0xff) +movu m8, [shuf_packus] +movu m9, [shuf_perm2b] +%else pcmpeqw m1, m1 +%endif psrlwm1, 8 movsxdifnidnwq, wd @@ -188,6 +213,63 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s and xq, mmsize * 2 - 1 je .loop_simd +%if mmsize == 64 +shr xq, 1 +mov tmpq, -1 +shlx tmpq, tmpq, xq +not tmpq +kmovq k7, tmpq ; write mask for U/V +kmovd k1, tmpd ; write mask for 1st half of Y +kmovw k3, tmpd ; read mask for 1st vector +shr tmpq, 16 +kmovw k4, tmpd ; read mask for 2nd vector +shr tmpq, 16 +kmovd k2, tmpd ; write mask for 2nd half of Y +kmovw k5, tmpd ; read mask for 3rd vector +shr tmpd, 16 +kmovw k6, tmpd ; read mask for 4th 
vector + +vmovdqu32 m2{k3}{z}, [srcq + wtwoq ] +vmovdqu32 m3{k4}{z}, [srcq +
Re: [FFmpeg-devel] [PATCH v3] swscale/x86/rgb2rgb: add AVX512ICL versions of shuffle_bytes
Hi Andreas, I am not sure if that is needed. I can add the data observed on my machine (AMD 7950x Zen 4), but I think it will vary from machine to machine. The speedup is expected to be around 2x compared to AVX2, and there is no change to the core loop apart from replacing the scalar loop with masked instructions. The data is not entirely consistent with my expectations: all the shuffle variants do an equivalent amount of work, yet the reported speedups differ between them. shuffle_bytes_0321_c: 56.5 ( 1.00x) shuffle_bytes_0321_ssse3: 15.2 ( 3.70x) shuffle_bytes_0321_avx2: 10.2 ( 5.51x) shuffle_bytes_0321_avx512icl: 9.2 ( 6.11x) shuffle_bytes_1230_c: 84.5 ( 1.00x) shuffle_bytes_1230_ssse3: 14.2 ( 5.93x) shuffle_bytes_1230_avx2: 15.2 ( 5.54x) shuffle_bytes_1230_avx512icl: 11.2 ( 7.51x) shuffle_bytes_2103_c: 48.5 ( 1.00x) shuffle_bytes_2103_ssse3: 21.2 ( 2.28x) shuffle_bytes_2103_avx2: 13.8 ( 3.53x) shuffle_bytes_2103_avx512icl: 9.2 ( 5.24x) shuffle_bytes_3012_c: 84.5 ( 1.00x) shuffle_bytes_3012_ssse3: 14.2 ( 5.93x) shuffle_bytes_3012_avx2: 16.2 ( 5.20x) shuffle_bytes_3012_avx512icl: 10.2 ( 8.24x) shuffle_bytes_3210_c: 89.2 ( 1.00x) shuffle_bytes_3210_ssse3: 24.2 ( 3.68x) shuffle_bytes_3210_avx2: 16.2 ( 5.49x) shuffle_bytes_3210_avx512icl: 9.2 ( 9.65x) I can add these details to the commit message if you confirm that they are needed. Thanks, Shreesh On Wed, Jan 29, 2025 at 5:46 PM Andreas Rheinhardt < andreas.rheinha...@outlook.com> wrote: > Shreesh Adiga: > > Signed-off-by: Shreesh Adiga <16567adigashre...@gmail.com> > > --- > > v3: Fix build failure on older nasm by replacing "kmovw k, tmpw" > > with "kmov k, tmpd" which matches "kmovw k, r32" syntax. > > v2: Tried to align operands and improve indentation for ASM routine. 
> > libswscale/x86/rgb2rgb.c | 21 + > > libswscale/x86/rgb_2_rgb.asm | 90 +++- > > 2 files changed, 80 insertions(+), 31 deletions(-) > > > > diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c > > index 6790551a38..4cbed54b35 100644 > > --- a/libswscale/x86/rgb2rgb.c > > +++ b/libswscale/x86/rgb2rgb.c > > @@ -2364,6 +2364,16 @@ void ff_shuffle_bytes_2013_avx2(const uint8_t > *src, uint8_t *dst, int src_size); > > void ff_shuffle_bytes_2130_avx2(const uint8_t *src, uint8_t *dst, int > src_size); > > void ff_shuffle_bytes_1203_avx2(const uint8_t *src, uint8_t *dst, int > src_size); > > > > +void ff_shuffle_bytes_2103_avx512icl(const uint8_t *src, uint8_t *dst, > int src_size); > > +void ff_shuffle_bytes_0321_avx512icl(const uint8_t *src, uint8_t *dst, > int src_size); > > +void ff_shuffle_bytes_1230_avx512icl(const uint8_t *src, uint8_t *dst, > int src_size); > > +void ff_shuffle_bytes_3012_avx512icl(const uint8_t *src, uint8_t *dst, > int src_size); > > +void ff_shuffle_bytes_3210_avx512icl(const uint8_t *src, uint8_t *dst, > int src_size); > > +void ff_shuffle_bytes_3102_avx512icl(const uint8_t *src, uint8_t *dst, > int src_size); > > +void ff_shuffle_bytes_2013_avx512icl(const uint8_t *src, uint8_t *dst, > int src_size); > > +void ff_shuffle_bytes_2130_avx512icl(const uint8_t *src, uint8_t *dst, > int src_size); > > +void ff_shuffle_bytes_1203_avx512icl(const uint8_t *src, uint8_t *dst, > int src_size); > > + > > void ff_uyvytoyuv422_sse2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, > >const uint8_t *src, int width, int height, > >int lumStride, int chromStride, int > srcStride); > > @@ -2454,6 +2464,17 @@ av_cold void rgb2rgb_init_x86(void) > > shuffle_bytes_2130 = ff_shuffle_bytes_2130_avx2; > > shuffle_bytes_1203 = ff_shuffle_bytes_1203_avx2; > > } > > +if (EXTERNAL_AVX512ICL(cpu_flags)) { > > +shuffle_bytes_0321 = ff_shuffle_bytes_0321_avx512icl; > > +shuffle_bytes_2103 = ff_shuffle_bytes_2103_avx512icl; > > +shuffle_bytes_1230 = 
ff_shuffle_bytes_1230_avx512icl; > > +shuffle_bytes_3012 = ff_shuffle_bytes_3012_avx512icl; > > +shuffle_bytes_3210 = ff_shuffle_bytes_3210_av
Re: [FFmpeg-devel] [PATCH] swscale/x86/rgb2rgb: add AVX512ICL versions of shuffle_bytes
> Try running it several times using the same seed, so > "tests/checkasm/checkasm --test=sw_rgb --bench 17575157", and make sure > no power saving feature is enabled (so the CPU frequency doesn't change > based on load). That may help getting consistent results. After running "echo performance | tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor" and recompiling ffmpeg with "--enable-linux-perf", I now see the following numbers: shuffle_bytes_0321_c: 56.5 ( 1.00x) shuffle_bytes_0321_ssse3: 18.0 ( 3.14x) shuffle_bytes_0321_avx2: 10.0 ( 5.65x) shuffle_bytes_0321_avx512icl: 9.0 ( 6.28x) shuffle_bytes_1230_c: 84.5 ( 1.00x) shuffle_bytes_1230_ssse3: 18.2 ( 4.63x) shuffle_bytes_1230_avx2: 22.2 ( 3.80x) shuffle_bytes_1230_avx512icl: 10.0 ( 8.45x) shuffle_bytes_2103_c: 49.8 ( 1.00x) shuffle_bytes_2103_ssse3: 21.2 ( 2.34x) shuffle_bytes_2103_avx2: 17.5 ( 2.84x) shuffle_bytes_2103_avx512icl: 7.5 ( 6.63x) shuffle_bytes_3012_c: 84.5 ( 1.00x) shuffle_bytes_3012_ssse3: 17.0 ( 4.97x) shuffle_bytes_3012_avx2: 16.0 ( 5.28x) shuffle_bytes_3012_avx512icl: 16.2 ( 5.20x) shuffle_bytes_3210_c: 92.8 ( 1.00x) shuffle_bytes_3210_ssse3: 25.8 ( 3.60x) shuffle_bytes_3210_avx2: 14.0 ( 6.62x) shuffle_bytes_3210_avx512icl: 9.0 (10.31x) Thanks, Shreesh ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH v3] swscale/x86/rgb2rgb: add AVX512ICL versions of shuffle_bytes
Signed-off-by: Shreesh Adiga <16567adigashre...@gmail.com> --- v3: Fix build failure on older nasm by replacing "kmovw k, tmpw" with "kmov k, tmpd" which matches "kmovw k, r32" syntax. v2: Tried to align operands and improve indentation for ASM routine. libswscale/x86/rgb2rgb.c | 21 + libswscale/x86/rgb_2_rgb.asm | 90 +++- 2 files changed, 80 insertions(+), 31 deletions(-) diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c index 6790551a38..4cbed54b35 100644 --- a/libswscale/x86/rgb2rgb.c +++ b/libswscale/x86/rgb2rgb.c @@ -2364,6 +2364,16 @@ void ff_shuffle_bytes_2013_avx2(const uint8_t *src, uint8_t *dst, int src_size); void ff_shuffle_bytes_2130_avx2(const uint8_t *src, uint8_t *dst, int src_size); void ff_shuffle_bytes_1203_avx2(const uint8_t *src, uint8_t *dst, int src_size); +void ff_shuffle_bytes_2103_avx512icl(const uint8_t *src, uint8_t *dst, int src_size); +void ff_shuffle_bytes_0321_avx512icl(const uint8_t *src, uint8_t *dst, int src_size); +void ff_shuffle_bytes_1230_avx512icl(const uint8_t *src, uint8_t *dst, int src_size); +void ff_shuffle_bytes_3012_avx512icl(const uint8_t *src, uint8_t *dst, int src_size); +void ff_shuffle_bytes_3210_avx512icl(const uint8_t *src, uint8_t *dst, int src_size); +void ff_shuffle_bytes_3102_avx512icl(const uint8_t *src, uint8_t *dst, int src_size); +void ff_shuffle_bytes_2013_avx512icl(const uint8_t *src, uint8_t *dst, int src_size); +void ff_shuffle_bytes_2130_avx512icl(const uint8_t *src, uint8_t *dst, int src_size); +void ff_shuffle_bytes_1203_avx512icl(const uint8_t *src, uint8_t *dst, int src_size); + void ff_uyvytoyuv422_sse2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, int width, int height, int lumStride, int chromStride, int srcStride); @@ -2454,6 +2464,17 @@ av_cold void rgb2rgb_init_x86(void) shuffle_bytes_2130 = ff_shuffle_bytes_2130_avx2; shuffle_bytes_1203 = ff_shuffle_bytes_1203_avx2; } +if (EXTERNAL_AVX512ICL(cpu_flags)) { +shuffle_bytes_0321 = 
ff_shuffle_bytes_0321_avx512icl; +shuffle_bytes_2103 = ff_shuffle_bytes_2103_avx512icl; +shuffle_bytes_1230 = ff_shuffle_bytes_1230_avx512icl; +shuffle_bytes_3012 = ff_shuffle_bytes_3012_avx512icl; +shuffle_bytes_3210 = ff_shuffle_bytes_3210_avx512icl; +shuffle_bytes_3102 = ff_shuffle_bytes_3102_avx512icl; +shuffle_bytes_2013 = ff_shuffle_bytes_2013_avx512icl; +shuffle_bytes_2130 = ff_shuffle_bytes_2130_avx512icl; +shuffle_bytes_1203 = ff_shuffle_bytes_1203_avx512icl; +} if (EXTERNAL_AVX2_FAST(cpu_flags)) { uyvytoyuv422 = ff_uyvytoyuv422_avx2; #endif diff --git a/libswscale/x86/rgb_2_rgb.asm b/libswscale/x86/rgb_2_rgb.asm index b468beb12d..ca7a481255 100644 --- a/libswscale/x86/rgb_2_rgb.asm +++ b/libswscale/x86/rgb_2_rgb.asm @@ -57,40 +57,53 @@ SECTION .text %macro SHUFFLE_BYTES 4 cglobal shuffle_bytes_%1%2%3%4, 3, 5, 2, src, dst, w, tmp, x VBROADCASTI128m0, [pb_shuffle%1%2%3%4] -movsxdifnidn wq, wd -mov xq, wq - -addsrcq, wq -adddstq, wq -neg wq - -;calc scalar loop +movsxdifnidn wq, wd +mov xq, wq + +add srcq, wq +add dstq, wq +neg wq + +%if mmsize == 64 +andxq, mmsize - 4 +shrxq, 2 +mov tmpd, -1 +shlx tmpd, tmpd, xd +not tmpd +kmovw k7, tmpd +vmovdqu32 m1{k7}{z}, [srcq + wq] +pshufb m1, m0 +vmovdqu32 [dstq + wq]{k7}, m1 +leawq, [wq + 4 * xq] +%else +;calc scalar loop and xq, mmsize-4 je .loop_simd -.loop_scalar: - mov tmpb, [srcq + wq + %1] - mov [dstq+wq + 0], tmpb - mov tmpb, [srcq + wq + %2] - mov [dstq+wq + 1], tmpb - mov tmpb, [srcq + wq + %3] - mov [dstq+wq + 2], tmpb - mov tmpb, [srcq + wq + %4] - mov [dstq+wq + 3], tmpb - addwq, 4 - subxq, 4 - jg .loop_scalar - -;check if src_size < mmsize -cmp wq, 0 -jge .end - -.loop_simd: -movu m1, [srcq+wq] -pshufb m1, m0 -movu[dstq+wq], m1 -addwq, mmsize -jl .loop_simd +.loop_scalar: +mov tmpb, [srcq + wq + %1] +mov [dstq+wq + 0], tmpb +mov tmpb, [srcq + wq + %2] +mov [dstq+wq + 1], tmpb +mov tmpb, [srcq + wq + %3] +mov [dstq+wq + 2], tmpb +mov tmpb, [srcq + wq + %4] +mov [dstq+wq + 3], tmpb +addwq, 4 +subxq, 4 +jg 
.loop_scalar +%endif + +;check if src_size < mmsize +cmp wq, 0 +jge .end + +.loop_simd: +movu
Re: [FFmpeg-devel] [PATCH] swscale/x86/rgb2rgb: add AVX512ICL versions of shuffle_bytes
> Thanks for the patch. Could you please compile and run > tests/checkasm/checkasm with "--test=sw_rgb --bench" and paste the > results for the shuffle_bytes functions, to see if there's a speed up > compared to the AVX2 implementation? I ran the command "tests/checkasm/checkasm --test=sw_rgb --bench" and I see the below output: benchmarking with native FFmpeg timers nop: 45.0 checkasm: using random seed 17575157 checkasm: bench runs 1024 (1 << 10) SSE2: - sw_rgb.uyvytoyuv422 [OK] - sw_rgb.interleave_bytes [OK] - sw_rgb.deinterleave_bytes [OK] - sw_rgb.rgb_to_y [OK] - sw_rgb.rgb_to_uv [OK] SSSE3: - sw_rgb.shuffle_bytes_2103 [OK] - sw_rgb.shuffle_bytes_0321 [OK] - sw_rgb.shuffle_bytes_1230 [OK] - sw_rgb.shuffle_bytes_3012 [OK] - sw_rgb.shuffle_bytes_3210 [OK] - sw_rgb.rgb_to_y [OK] - sw_rgb.rgb_to_uv [OK] AVX: - sw_rgb.uyvytoyuv422 [OK] - sw_rgb.deinterleave_bytes [OK] - sw_rgb.rgb_to_y [OK] - sw_rgb.rgb_to_uv [OK] AVX2: - sw_rgb.shuffle_bytes_2103 [OK] - sw_rgb.shuffle_bytes_0321 [OK] - sw_rgb.shuffle_bytes_1230 [OK] - sw_rgb.shuffle_bytes_3012 [OK] - sw_rgb.shuffle_bytes_3210 [OK] - sw_rgb.uyvytoyuv422 [OK] - sw_rgb.rgb_to_y [OK] - sw_rgb.rgb_to_uv [OK] AVX-512ICL: - sw_rgb.shuffle_bytes_2103 [OK] - sw_rgb.shuffle_bytes_0321 [OK] - sw_rgb.shuffle_bytes_1230 [OK] - sw_rgb.shuffle_bytes_3012 [OK] - sw_rgb.shuffle_bytes_3210 [OK] checkasm: all 184 tests passed shuffle_bytes_0321_c: 45.0 ( 1.00x) shuffle_bytes_0321_ssse3: 11.2 ( 4.00x) shuffle_bytes_0321_avx2:11.2 ( 4.00x) shuffle_bytes_0321_avx512icl: 11.2 ( 4.00x) shuffle_bytes_1230_c: 67.5 ( 1.00x) shuffle_bytes_1230_ssse3: 11.2 ( 6.00x) shuffle_bytes_1230_avx2:11.2 ( 6.00x) shuffle_bytes_1230_avx512icl:0.0 ( 0.00x) shuffle_bytes_2103_c: 45.0 ( 1.00x) shuffle_bytes_2103_ssse3: 11.2 ( 4.00x) shuffle_bytes_2103_avx2: 0.0 ( 0.00x) shuffle_bytes_2103_avx512icl:0.0 ( 0.00x) shuffle_bytes_3012_c: 67.5 ( 1.00x) shuffle_bytes_3012_ssse3: 11.2 ( 6.00x) shuffle_bytes_3012_avx2:11.2 ( 6.00x) shuffle_bytes_3012_avx512icl:0.0 ( 
0.00x) shuffle_bytes_3210_c: 67.5 ( 1.00x) shuffle_bytes_3210_ssse3: 11.2 ( 6.00x) shuffle_bytes_3210_avx2: 11.2 ( 6.00x) shuffle_bytes_3210_avx512icl: 0.0 ( 0.00x) I've not included the other functions printed by the bench command. I'm not sure if I'm missing something, but the output doesn't look consistent to me. There are many 0.0 entries, and I don't see any difference between ssse3 and avx2 either. I'm running this on an AMD Ryzen 7950x Zen4 machine. I've inspected the assembly output for one of the ssse3/avx2/avx512 functions and it looks as I expected. Therefore I'm not sure whether checkasm is measuring accurately here. Please let me know if I'm missing something here; I'm new to FFmpeg development and this is my first patch submission. Thanks, Shreesh ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH v2] swscale/x86/rgb2rgb: add AVX512ICL versions of shuffle_bytes
Signed-off-by: Shreesh Adiga <16567adigashre...@gmail.com> --- v2: Tried to align operands and improve indentation for ASM routine. libswscale/x86/rgb2rgb.c | 21 + libswscale/x86/rgb_2_rgb.asm | 90 +++- 2 files changed, 80 insertions(+), 31 deletions(-) diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c index 6790551a38..4cbed54b35 100644 --- a/libswscale/x86/rgb2rgb.c +++ b/libswscale/x86/rgb2rgb.c @@ -2364,6 +2364,16 @@ void ff_shuffle_bytes_2013_avx2(const uint8_t *src, uint8_t *dst, int src_size); void ff_shuffle_bytes_2130_avx2(const uint8_t *src, uint8_t *dst, int src_size); void ff_shuffle_bytes_1203_avx2(const uint8_t *src, uint8_t *dst, int src_size); +void ff_shuffle_bytes_2103_avx512icl(const uint8_t *src, uint8_t *dst, int src_size); +void ff_shuffle_bytes_0321_avx512icl(const uint8_t *src, uint8_t *dst, int src_size); +void ff_shuffle_bytes_1230_avx512icl(const uint8_t *src, uint8_t *dst, int src_size); +void ff_shuffle_bytes_3012_avx512icl(const uint8_t *src, uint8_t *dst, int src_size); +void ff_shuffle_bytes_3210_avx512icl(const uint8_t *src, uint8_t *dst, int src_size); +void ff_shuffle_bytes_3102_avx512icl(const uint8_t *src, uint8_t *dst, int src_size); +void ff_shuffle_bytes_2013_avx512icl(const uint8_t *src, uint8_t *dst, int src_size); +void ff_shuffle_bytes_2130_avx512icl(const uint8_t *src, uint8_t *dst, int src_size); +void ff_shuffle_bytes_1203_avx512icl(const uint8_t *src, uint8_t *dst, int src_size); + void ff_uyvytoyuv422_sse2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, int width, int height, int lumStride, int chromStride, int srcStride); @@ -2454,6 +2464,17 @@ av_cold void rgb2rgb_init_x86(void) shuffle_bytes_2130 = ff_shuffle_bytes_2130_avx2; shuffle_bytes_1203 = ff_shuffle_bytes_1203_avx2; } +if (EXTERNAL_AVX512ICL(cpu_flags)) { +shuffle_bytes_0321 = ff_shuffle_bytes_0321_avx512icl; +shuffle_bytes_2103 = ff_shuffle_bytes_2103_avx512icl; +shuffle_bytes_1230 = ff_shuffle_bytes_1230_avx512icl; 
+shuffle_bytes_3012 = ff_shuffle_bytes_3012_avx512icl; +shuffle_bytes_3210 = ff_shuffle_bytes_3210_avx512icl; +shuffle_bytes_3102 = ff_shuffle_bytes_3102_avx512icl; +shuffle_bytes_2013 = ff_shuffle_bytes_2013_avx512icl; +shuffle_bytes_2130 = ff_shuffle_bytes_2130_avx512icl; +shuffle_bytes_1203 = ff_shuffle_bytes_1203_avx512icl; +} if (EXTERNAL_AVX2_FAST(cpu_flags)) { uyvytoyuv422 = ff_uyvytoyuv422_avx2; #endif diff --git a/libswscale/x86/rgb_2_rgb.asm b/libswscale/x86/rgb_2_rgb.asm index b468beb12d..3a5e217111 100644 --- a/libswscale/x86/rgb_2_rgb.asm +++ b/libswscale/x86/rgb_2_rgb.asm @@ -57,40 +57,53 @@ SECTION .text %macro SHUFFLE_BYTES 4 cglobal shuffle_bytes_%1%2%3%4, 3, 5, 2, src, dst, w, tmp, x VBROADCASTI128m0, [pb_shuffle%1%2%3%4] -movsxdifnidn wq, wd -mov xq, wq - -addsrcq, wq -adddstq, wq -neg wq - -;calc scalar loop +movsxdifnidn wq, wd +mov xq, wq + +add srcq, wq +add dstq, wq +neg wq + +%if mmsize == 64 +andxq, mmsize - 4 +shrxq, 2 +mov tmpd, -1 +shlx tmpd, tmpd, xd +not tmpd +kmovw k7, tmpw +vmovdqu32 m1{k7}{z}, [srcq + wq] +pshufb m1, m0 +vmovdqu32 [dstq + wq]{k7}, m1 +leawq, [wq + 4 * xq] +%else +;calc scalar loop and xq, mmsize-4 je .loop_simd -.loop_scalar: - mov tmpb, [srcq + wq + %1] - mov [dstq+wq + 0], tmpb - mov tmpb, [srcq + wq + %2] - mov [dstq+wq + 1], tmpb - mov tmpb, [srcq + wq + %3] - mov [dstq+wq + 2], tmpb - mov tmpb, [srcq + wq + %4] - mov [dstq+wq + 3], tmpb - addwq, 4 - subxq, 4 - jg .loop_scalar - -;check if src_size < mmsize -cmp wq, 0 -jge .end - -.loop_simd: -movu m1, [srcq+wq] -pshufb m1, m0 -movu[dstq+wq], m1 -addwq, mmsize -jl .loop_simd +.loop_scalar: +mov tmpb, [srcq + wq + %1] +mov [dstq+wq + 0], tmpb +mov tmpb, [srcq + wq + %2] +mov [dstq+wq + 1], tmpb +mov tmpb, [srcq + wq + %3] +mov [dstq+wq + 2], tmpb +mov tmpb, [srcq + wq + %4] +mov [dstq+wq + 3], tmpb +addwq, 4 +subxq, 4 +jg .loop_scalar +%endif + +;check if src_size < mmsize +cmp wq, 0 +jge .end + +.loop_simd: +movum1, [srcq + wq] +pshufb m1, m0 +movu [dstq + wq], 
m1 +add wq, mmsize +jl .loop_simd .e
[FFmpeg-devel] [PATCH] swscale/x86/rgb2rgb: add AVX512ICL versions of shuffle_bytes
Signed-off-by: Shreesh Adiga <16567adigashre...@gmail.com> --- libswscale/x86/rgb2rgb.c | 21 + libswscale/x86/rgb_2_rgb.asm | 28 2 files changed, 49 insertions(+) diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c index 6790551a38..4cbed54b35 100644 --- a/libswscale/x86/rgb2rgb.c +++ b/libswscale/x86/rgb2rgb.c @@ -2364,6 +2364,16 @@ void ff_shuffle_bytes_2013_avx2(const uint8_t *src, uint8_t *dst, int src_size); void ff_shuffle_bytes_2130_avx2(const uint8_t *src, uint8_t *dst, int src_size); void ff_shuffle_bytes_1203_avx2(const uint8_t *src, uint8_t *dst, int src_size); +void ff_shuffle_bytes_2103_avx512icl(const uint8_t *src, uint8_t *dst, int src_size); +void ff_shuffle_bytes_0321_avx512icl(const uint8_t *src, uint8_t *dst, int src_size); +void ff_shuffle_bytes_1230_avx512icl(const uint8_t *src, uint8_t *dst, int src_size); +void ff_shuffle_bytes_3012_avx512icl(const uint8_t *src, uint8_t *dst, int src_size); +void ff_shuffle_bytes_3210_avx512icl(const uint8_t *src, uint8_t *dst, int src_size); +void ff_shuffle_bytes_3102_avx512icl(const uint8_t *src, uint8_t *dst, int src_size); +void ff_shuffle_bytes_2013_avx512icl(const uint8_t *src, uint8_t *dst, int src_size); +void ff_shuffle_bytes_2130_avx512icl(const uint8_t *src, uint8_t *dst, int src_size); +void ff_shuffle_bytes_1203_avx512icl(const uint8_t *src, uint8_t *dst, int src_size); + void ff_uyvytoyuv422_sse2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, int width, int height, int lumStride, int chromStride, int srcStride); @@ -2454,6 +2464,17 @@ av_cold void rgb2rgb_init_x86(void) shuffle_bytes_2130 = ff_shuffle_bytes_2130_avx2; shuffle_bytes_1203 = ff_shuffle_bytes_1203_avx2; } +if (EXTERNAL_AVX512ICL(cpu_flags)) { +shuffle_bytes_0321 = ff_shuffle_bytes_0321_avx512icl; +shuffle_bytes_2103 = ff_shuffle_bytes_2103_avx512icl; +shuffle_bytes_1230 = ff_shuffle_bytes_1230_avx512icl; +shuffle_bytes_3012 = ff_shuffle_bytes_3012_avx512icl; +shuffle_bytes_3210 = 
ff_shuffle_bytes_3210_avx512icl; +shuffle_bytes_3102 = ff_shuffle_bytes_3102_avx512icl; +shuffle_bytes_2013 = ff_shuffle_bytes_2013_avx512icl; +shuffle_bytes_2130 = ff_shuffle_bytes_2130_avx512icl; +shuffle_bytes_1203 = ff_shuffle_bytes_1203_avx512icl; +} if (EXTERNAL_AVX2_FAST(cpu_flags)) { uyvytoyuv422 = ff_uyvytoyuv422_avx2; #endif diff --git a/libswscale/x86/rgb_2_rgb.asm b/libswscale/x86/rgb_2_rgb.asm index b468beb12d..64b0988c4a 100644 --- a/libswscale/x86/rgb_2_rgb.asm +++ b/libswscale/x86/rgb_2_rgb.asm @@ -64,6 +64,18 @@ cglobal shuffle_bytes_%1%2%3%4, 3, 5, 2, src, dst, w, tmp, x adddstq, wq neg wq +%if mmsize == 64 +and xq, mmsize-4 +shr xq, 2 +mov tmpd, -1 +shlx tmpd, tmpd, xd +not tmpd +kmovw k7, tmpw +vmovdqu32 m1{k7}{z}, [srcq + wq] +pshufb m1, m0 +vmovdqu32 [dstq + wq]{k7}, m1 +lea wq, [wq + 4 * xq] +%else ;calc scalar loop and xq, mmsize-4 je .loop_simd @@ -80,6 +92,7 @@ cglobal shuffle_bytes_%1%2%3%4, 3, 5, 2, src, dst, w, tmp, x addwq, 4 subxq, 4 jg .loop_scalar +%endif ;check if src_size < mmsize cmp wq, 0 @@ -122,6 +135,21 @@ SHUFFLE_BYTES 1, 2, 0, 3 %endif %endif +%if ARCH_X86_64 +%if HAVE_AVX512ICL_EXTERNAL +INIT_ZMM avx512icl +SHUFFLE_BYTES 2, 1, 0, 3 +SHUFFLE_BYTES 0, 3, 2, 1 +SHUFFLE_BYTES 1, 2, 3, 0 +SHUFFLE_BYTES 3, 0, 1, 2 +SHUFFLE_BYTES 3, 2, 1, 0 +SHUFFLE_BYTES 3, 1, 0, 2 +SHUFFLE_BYTES 2, 0, 1, 3 +SHUFFLE_BYTES 2, 1, 3, 0 +SHUFFLE_BYTES 1, 2, 0, 3 +%endif +%endif + ;--- ; uyvytoyuv422(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, ; const uint8_t *src, int width, int height, -- 2.45.3 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
Re: [FFmpeg-devel] [PATCH] swscale/x86/rgb2rgb: add AVX512ICL version of uyvytoyuv422
On Mon, Feb 3, 2025 at 10:03 PM Shreesh Adiga <16567adigashre...@gmail.com> wrote: > > The scalar loop is replaced with masked AVX512 instructions. > For extracting the Y from UYVY, vperm2b is used instead of > various AND and packuswb. > > Instead of loading the vectors with interleaved lanes as done > in AVX2 version, normal load is used. At the end of packuswb, > for U and V, an extra permute operation is done to get the > required layout. > > AMD 7950x Zen 4 benchmark data: > uyvytoyuv422_c: 29105.0 ( 1.00x) > uyvytoyuv422_sse2:3888.0 ( 7.49x) > uyvytoyuv422_avx: 3374.2 ( 8.63x) > uyvytoyuv422_avx2:2649.8 (10.98x) > uyvytoyuv422_avx512icl: 1615.0 (18.02x) > > Signed-off-by: Shreesh Adiga <16567adigashre...@gmail.com> > --- > libswscale/x86/rgb2rgb.c | 6 ++ > libswscale/x86/rgb_2_rgb.asm | 105 +++ > 2 files changed, 111 insertions(+) > > diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c > index 4cbed54b35..6601dad233 100644 > --- a/libswscale/x86/rgb2rgb.c > +++ b/libswscale/x86/rgb2rgb.c > @@ -2383,6 +2383,9 @@ void ff_uyvytoyuv422_avx(uint8_t *ydst, uint8_t *udst, > uint8_t *vdst, > void ff_uyvytoyuv422_avx2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, >const uint8_t *src, int width, int height, >int lumStride, int chromStride, int srcStride); > +void ff_uyvytoyuv422_avx512icl(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, > + const uint8_t *src, int width, int height, > + int lumStride, int chromStride, int > srcStride); > #endif > > #define DEINTERLEAVE_BYTES(cpuext) > \ > @@ -2477,6 +2480,9 @@ av_cold void rgb2rgb_init_x86(void) > } > if (EXTERNAL_AVX2_FAST(cpu_flags)) { > uyvytoyuv422 = ff_uyvytoyuv422_avx2; > +} > +if (EXTERNAL_AVX512ICL(cpu_flags)) { > +uyvytoyuv422 = ff_uyvytoyuv422_avx512icl; > #endif > } > #endif > diff --git a/libswscale/x86/rgb_2_rgb.asm b/libswscale/x86/rgb_2_rgb.asm > index ca7a481255..6e4df17298 100644 > --- a/libswscale/x86/rgb_2_rgb.asm > +++ b/libswscale/x86/rgb_2_rgb.asm > @@ -35,6 +35,20 @@ pb_shuffle2013: db 2, 
0, 1, 3, 6, 4, 5, 7, 10, 8, 9, 11, > 14, 12, 13, 15 > pb_shuffle2130: db 2, 1, 3, 0, 6, 5, 7, 4, 10, 9, 11, 8, 14, 13, 15, 12 > pb_shuffle1203: db 1, 2, 0, 3, 5, 6, 4, 7, 9, 10, 8, 11, 13, 14, 12, 15 > > +%if HAVE_AVX512ICL_EXTERNAL > +; shuffle vector to rearrange packuswb result to be linear > +shuf_packus: db 0, 1, 2, 3, 16, 17, 18, 19, 32, 33, 34, 35, 48, 49, 50, > 51,\ > + 4, 5, 6, 7, 20, 21, 22, 23, 36, 37, 38, 39, 52, 53, 54, > 55,\ > + 8, 9, 10, 11, 24, 25, 26, 27, 40, 41, 42, 43, 56, 57, 58, > 59,\ > +12, 13, 14, 15, 28, 29, 30, 31, 44, 45, 46, 47, 60, 61, 62, > 63 > + > +; shuffle vector to combine odd elements from two vectors to extract Y > +shuf_perm2b: db 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, > 25, 27, 29, 31,\ > +33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, > 57, 59, 61, 63,\ > +65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, > 89, 91, 93, 95,\ > +97, 99, 101, 103, 105, 107, 109, 111, 113, 115, 117, 119, > 121, 123, 125, 127 > +%endif > + > SECTION .text > > %macro RSHIFT_COPY 5 > @@ -156,9 +170,20 @@ SHUFFLE_BYTES 1, 2, 0, 3 > ; int lumStride, int chromStride, int srcStride) > > ;--- > %macro UYVY_TO_YUV422 0 > +%if mmsize == 64 > +; need two more registers to store shuffle vectors for AVX512ICL > +cglobal uyvytoyuv422, 9, 14, 10, ydst, udst, vdst, src, w, h, lum_stride, > chrom_stride, src_stride, wtwo, whalf, tmp, x, back_w > +%else > cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, > chrom_stride, src_stride, wtwo, whalf, tmp, x, back_w > +%endif > pxor m0, m0 > +%if mmsize == 64 > +vpternlogd m1, m1, m1, 0xff ; m1 = _mm512_set1_epi8(0xff) > +movu m8, [shuf_packus] > +movu m9, [shuf_perm2b] > +%else > pcmpeqw m1, m1 > +%endif > psrlwm1, 8 > > movsxdifnidnwq, wd > @@ -188,6 +213,63 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, > w, h, lum_stride, chrom_s > and xq
Re: [FFmpeg-devel] [PATCH] swscale/x86/rgb2rgb: optimize AVX2 version of uyvytoyuv422
On Thu, Feb 20, 2025 at 6:51 PM Shreesh Adiga <16567adigashre...@gmail.com> wrote: > > Currently the AVX2 version of uyvytoyuv422 in the SIMD loop does the > following: > 4 vinsertq to have interleaving of the vector lanes during load from memory. > 4 vperm2i128 inside 4 RSHIFT_COPY calls to achieve the desired layout. > > This patch replaces the above 8 instructions with 2 vpermq and > 2 vpermd with a vector register similar to AVX512ICL version. > > Observed the following numbers on various microarchitectures: > > On AMD Zen3 laptop: > Before: > uyvytoyuv422_c: 51979.7 ( 1.00x) > uyvytoyuv422_sse2:5410.5 ( 9.61x) > uyvytoyuv422_avx: 4642.7 (11.20x) > uyvytoyuv422_avx2:4249.0 (12.23x) > > After: > uyvytoyuv422_c: 51659.8 ( 1.00x) > uyvytoyuv422_sse2:5420.8 ( 9.53x) > uyvytoyuv422_avx: 4651.2 (11.11x) > uyvytoyuv422_avx2:3953.8 (13.07x) > > On Intel Macbook Pro 2019: > Before: > uyvytoyuv422_c: 185014.4 ( 1.00x) > uyvytoyuv422_sse2: 22800.4 ( 8.11x) > uyvytoyuv422_avx:19796.9 ( 9.35x) > uyvytoyuv422_avx2: 13141.9 (14.08x) > > After: > uyvytoyuv422_c: 185093.4 ( 1.00x) > uyvytoyuv422_sse2: 22795.4 ( 8.12x) > uyvytoyuv422_avx:19791.9 ( 9.35x) > uyvytoyuv422_avx2: 12043.1 (15.37x) > > On AMD Zen4 desktop: > Before: > uyvytoyuv422_c: 29105.0 ( 1.00x) > uyvytoyuv422_sse2:3888.0 ( 7.49x) > uyvytoyuv422_avx: 3374.2 ( 8.63x) > uyvytoyuv422_avx2:2649.8 (10.98x) > uyvytoyuv422_avx512icl: 1615.0 (18.02x) > > After: > uyvytoyuv422_c: 29093.4 ( 1.00x) > uyvytoyuv422_sse2:3874.4 ( 7.51x) > uyvytoyuv422_avx: 3371.6 ( 8.63x) > uyvytoyuv422_avx2:2174.6 (13.38x) > uyvytoyuv422_avx512icl: 1625.1 (17.90x) > > Signed-off-by: Shreesh Adiga <16567adigashre...@gmail.com> > --- > libswscale/x86/rgb_2_rgb.asm | 68 ++-- > 1 file changed, 34 insertions(+), 34 deletions(-) > > diff --git a/libswscale/x86/rgb_2_rgb.asm b/libswscale/x86/rgb_2_rgb.asm > index 6e4df17298..871bb21127 100644 > --- a/libswscale/x86/rgb_2_rgb.asm > +++ b/libswscale/x86/rgb_2_rgb.asm > @@ -49,18 +49,21 @@ shuf_perm2b: 
db 1, 3, 5, 7, 9, 11, 13, 15, > 17, 19, 21, 23, 25, > 97, 99, 101, 103, 105, 107, 109, 111, 113, 115, 117, 119, > 121, 123, 125, 127 > %endif > > +%if HAVE_AVX2_EXTERNAL > +; shuffle vector to rearrange packuswb result to be linear > +shuf_packus_avx2: db 0, 0, 0, 0, 4, 0, 0, 0, 1, 0, 0, 0, 5, 0, 0, 0,\ > + 2, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0, > +%endif > + > SECTION .text > > -%macro RSHIFT_COPY 5 > +%macro RSHIFT_COPY 3 > ; %1 dst ; %2 src ; %3 shift > -%if mmsize == 32 > -vperm2i128 %1, %2, %3, %5 > -RSHIFT %1, %4 > -%elif cpuflag(avx) > -psrldq %1, %2, %4 > +%if cpuflag(avx) || cpuflag(avx2) || cpuflag(avx512icl) > +psrldq %1, %2, %3 > %else > mova %1, %2 > -RSHIFT %1, %4 > +RSHIFT %1, %3 > %endif > %endmacro > > @@ -170,18 +173,16 @@ SHUFFLE_BYTES 1, 2, 0, 3 > ; int lumStride, int chromStride, int srcStride) > > ;--- > %macro UYVY_TO_YUV422 0 > -%if mmsize == 64 > -; need two more registers to store shuffle vectors for AVX512ICL > -cglobal uyvytoyuv422, 9, 14, 10, ydst, udst, vdst, src, w, h, lum_stride, > chrom_stride, src_stride, wtwo, whalf, tmp, x, back_w > -%else > -cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, > chrom_stride, src_stride, wtwo, whalf, tmp, x, back_w > -%endif > +cglobal uyvytoyuv422, 9, 14, 8 + cpuflag(avx2) + cpuflag(avx512icl), ydst, > udst, vdst, src, w, h, lum_stride, chrom_stride, src_stride, wtwo, whalf, > tmp, x, back_w > pxor m0, m0 > %if mmsize == 64
[FFmpeg-devel] [PATCH] swscale/x86/rgb2rgb: optimize AVX2 version of uyvytoyuv422
Currently the AVX2 version of uyvytoyuv422 in the SIMD loop does the following: 4 vinsertq to have interleaving of the vector lanes during load from memory. 4 vperm2i128 inside 4 RSHIFT_COPY calls to achieve the desired layout. This patch replaces the above 8 instructions with 2 vpermq and 2 vpermd with a vector register similar to AVX512ICL version. Observed the following numbers on various microarchitectures: On AMD Zen3 laptop: Before: uyvytoyuv422_c: 51979.7 ( 1.00x) uyvytoyuv422_sse2:5410.5 ( 9.61x) uyvytoyuv422_avx: 4642.7 (11.20x) uyvytoyuv422_avx2:4249.0 (12.23x) After: uyvytoyuv422_c: 51659.8 ( 1.00x) uyvytoyuv422_sse2:5420.8 ( 9.53x) uyvytoyuv422_avx: 4651.2 (11.11x) uyvytoyuv422_avx2:3953.8 (13.07x) On Intel Macbook Pro 2019: Before: uyvytoyuv422_c: 185014.4 ( 1.00x) uyvytoyuv422_sse2: 22800.4 ( 8.11x) uyvytoyuv422_avx:19796.9 ( 9.35x) uyvytoyuv422_avx2: 13141.9 (14.08x) After: uyvytoyuv422_c: 185093.4 ( 1.00x) uyvytoyuv422_sse2: 22795.4 ( 8.12x) uyvytoyuv422_avx:19791.9 ( 9.35x) uyvytoyuv422_avx2: 12043.1 (15.37x) On AMD Zen4 desktop: Before: uyvytoyuv422_c: 29105.0 ( 1.00x) uyvytoyuv422_sse2:3888.0 ( 7.49x) uyvytoyuv422_avx: 3374.2 ( 8.63x) uyvytoyuv422_avx2:2649.8 (10.98x) uyvytoyuv422_avx512icl: 1615.0 (18.02x) After: uyvytoyuv422_c: 29093.4 ( 1.00x) uyvytoyuv422_sse2:3874.4 ( 7.51x) uyvytoyuv422_avx: 3371.6 ( 8.63x) uyvytoyuv422_avx2:2174.6 (13.38x) uyvytoyuv422_avx512icl: 1625.1 (17.90x) Signed-off-by: Shreesh Adiga <16567adigashre...@gmail.com> --- libswscale/x86/rgb_2_rgb.asm | 68 ++-- 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/libswscale/x86/rgb_2_rgb.asm b/libswscale/x86/rgb_2_rgb.asm index 6e4df17298..871bb21127 100644 --- a/libswscale/x86/rgb_2_rgb.asm +++ b/libswscale/x86/rgb_2_rgb.asm @@ -49,18 +49,21 @@ shuf_perm2b: db 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 97, 99, 101, 103, 105, 107, 109, 111, 113, 115, 117, 119, 121, 123, 125, 127 %endif +%if HAVE_AVX2_EXTERNAL +; shuffle vector to rearrange packuswb result to 
be linear +shuf_packus_avx2: db 0, 0, 0, 0, 4, 0, 0, 0, 1, 0, 0, 0, 5, 0, 0, 0,\ + 2, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0, +%endif + SECTION .text -%macro RSHIFT_COPY 5 +%macro RSHIFT_COPY 3 ; %1 dst ; %2 src ; %3 shift -%if mmsize == 32 -vperm2i128 %1, %2, %3, %5 -RSHIFT %1, %4 -%elif cpuflag(avx) -psrldq %1, %2, %4 +%if cpuflag(avx) || cpuflag(avx2) || cpuflag(avx512icl) +psrldq %1, %2, %3 %else mova %1, %2 -RSHIFT %1, %4 +RSHIFT %1, %3 %endif %endmacro @@ -170,18 +173,16 @@ SHUFFLE_BYTES 1, 2, 0, 3 ; int lumStride, int chromStride, int srcStride) ;--- %macro UYVY_TO_YUV422 0 -%if mmsize == 64 -; need two more registers to store shuffle vectors for AVX512ICL -cglobal uyvytoyuv422, 9, 14, 10, ydst, udst, vdst, src, w, h, lum_stride, chrom_stride, src_stride, wtwo, whalf, tmp, x, back_w -%else -cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_stride, src_stride, wtwo, whalf, tmp, x, back_w -%endif +cglobal uyvytoyuv422, 9, 14, 8 + cpuflag(avx2) + cpuflag(avx512icl), ydst, udst, vdst, src, w, h, lum_stride, chrom_stride, src_stride, wtwo, whalf, tmp, x, back_w pxor m0, m0 %if mmsize == 64 vpternlogd m1, m1, m1, 0xff ; m1 = _mm512_set1_epi8(0xff) movu m8, [shuf_packus] movu m9, [shuf_perm2b] %else +%if cpuflag(avx2) +movu m8, [shuf_packus_avx2] +%endif pcmpeqw m1, m1 %endif psrlwm1, 8 @@ -295,21 +296,10 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s jge .end_line .loop_simd: -%if mmsize == 32 -movu xm2, [srcq + wtwoq ] -movu xm3, [srcq + wtwoq + 16] -movu xm4, [srcq + wtwoq + 16 * 2] -movu xm5, [srcq + wtwoq +