[FFmpeg-devel] [PATCH] swscale/x86/rgb2rgb: add AVX512ICL version of uyvytoyuv422

2025-02-03 Thread Shreesh Adiga
The scalar loop is replaced with masked AVX512 instructions.
For extracting the Y from UYVY, vperm2b is used instead of
various AND and packuswb.

Instead of loading the vectors with interleaved lanes as the AVX2
version does, a normal load is used. After packuswb, an extra
permute operation is applied to U and V to get the required
layout.

AMD 7950x Zen 4 benchmark data:
uyvytoyuv422_c:  29105.0 ( 1.00x)
uyvytoyuv422_sse2:3888.0 ( 7.49x)
uyvytoyuv422_avx: 3374.2 ( 8.63x)
uyvytoyuv422_avx2:2649.8 (10.98x)
uyvytoyuv422_avx512icl:   1615.0 (18.02x)

Signed-off-by: Shreesh Adiga <16567adigashre...@gmail.com>
---
 libswscale/x86/rgb2rgb.c |   6 ++
 libswscale/x86/rgb_2_rgb.asm | 105 +++
 2 files changed, 111 insertions(+)

diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c
index 4cbed54b35..6601dad233 100644
--- a/libswscale/x86/rgb2rgb.c
+++ b/libswscale/x86/rgb2rgb.c
@@ -2383,6 +2383,9 @@ void ff_uyvytoyuv422_avx(uint8_t *ydst, uint8_t *udst, 
uint8_t *vdst,
 void ff_uyvytoyuv422_avx2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
   const uint8_t *src, int width, int height,
   int lumStride, int chromStride, int srcStride);
+void ff_uyvytoyuv422_avx512icl(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+   const uint8_t *src, int width, int height,
+   int lumStride, int chromStride, int srcStride);
 #endif
 
 #define DEINTERLEAVE_BYTES(cpuext)\
@@ -2477,6 +2480,9 @@ av_cold void rgb2rgb_init_x86(void)
 }
 if (EXTERNAL_AVX2_FAST(cpu_flags)) {
 uyvytoyuv422 = ff_uyvytoyuv422_avx2;
+}
+if (EXTERNAL_AVX512ICL(cpu_flags)) {
+uyvytoyuv422 = ff_uyvytoyuv422_avx512icl;
 #endif
 }
 #endif
diff --git a/libswscale/x86/rgb_2_rgb.asm b/libswscale/x86/rgb_2_rgb.asm
index ca7a481255..6e4df17298 100644
--- a/libswscale/x86/rgb_2_rgb.asm
+++ b/libswscale/x86/rgb_2_rgb.asm
@@ -35,6 +35,20 @@ pb_shuffle2013: db 2, 0, 1, 3, 6, 4, 5, 7, 10, 8, 9, 11, 14, 
12, 13, 15
 pb_shuffle2130: db 2, 1, 3, 0, 6, 5, 7, 4, 10, 9, 11, 8, 14, 13, 15, 12
 pb_shuffle1203: db 1, 2, 0, 3, 5, 6, 4, 7, 9, 10, 8, 11, 13, 14, 12, 15
 
+%if HAVE_AVX512ICL_EXTERNAL
+; shuffle vector to rearrange packuswb result to be linear
+shuf_packus: db  0,  1,  2,  3, 16, 17, 18, 19, 32, 33, 34, 35, 48, 49, 50, 
51,\
+ 4,  5,  6,  7, 20, 21, 22, 23, 36, 37, 38, 39, 52, 53, 54, 
55,\
+ 8,  9, 10, 11, 24, 25, 26, 27, 40, 41, 42, 43, 56, 57, 58, 
59,\
+12, 13, 14, 15, 28, 29, 30, 31, 44, 45, 46, 47, 60, 61, 62, 63
+
+; shuffle vector to combine odd elements from two vectors to extract Y
+shuf_perm2b: db  1,  3,   5,   7,   9,  11,  13,  15,  17,  19,  21,  23,  25, 
 27,  29,  31,\
+33, 35,  37,  39,  41,  43,  45,  47,  49,  51,  53,  55,  57, 
 59,  61,  63,\
+65, 67,  69,  71,  73,  75,  77,  79,  81,  83,  85,  87,  89, 
 91,  93,  95,\
+97, 99, 101, 103, 105, 107, 109, 111, 113, 115, 117, 119, 121, 
123, 125, 127
+%endif
+
 SECTION .text
 
 %macro RSHIFT_COPY 5
@@ -156,9 +170,20 @@ SHUFFLE_BYTES 1, 2, 0, 3
 ;  int lumStride, int chromStride, int srcStride)
 
;---
 %macro UYVY_TO_YUV422 0
+%if mmsize == 64
+; need two more registers to store shuffle vectors for AVX512ICL
+cglobal uyvytoyuv422, 9, 14, 10, ydst, udst, vdst, src, w, h, lum_stride, 
chrom_stride, src_stride, wtwo, whalf, tmp, x, back_w
+%else
 cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, 
chrom_stride, src_stride, wtwo, whalf, tmp, x, back_w
+%endif
 pxor m0, m0
+%if mmsize == 64
+vpternlogd   m1, m1, m1, 0xff  ; m1 = _mm512_set1_epi8(0xff)
+movu m8, [shuf_packus]
+movu m9, [shuf_perm2b]
+%else
 pcmpeqw  m1, m1
+%endif
 psrlwm1, 8
 
 movsxdifnidnwq, wd
@@ -188,6 +213,63 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, 
h, lum_stride, chrom_s
 and   xq, mmsize * 2 - 1
 je .loop_simd
 
+%if mmsize == 64
+shr xq, 1
+mov   tmpq, -1
+shlx  tmpq, tmpq, xq
+not   tmpq
+kmovq   k7, tmpq ; write mask for U/V
+kmovd   k1, tmpd ; write mask for 1st half of Y
+kmovw   k3, tmpd ; read mask for 1st vector
+shr   tmpq, 16
+kmovw   k4, tmpd ; read mask for 2nd vector
+shr   tmpq, 16
+kmovd   k2, tmpd ; write mask for 2nd half of Y
+kmovw   k5, tmpd ; read mask for 3rd vector
+shr   tmpd, 16
+kmovw   k6, tmpd ; read mask for 4th vector
+
+vmovdqu32  m2{k3}{z}, [srcq + wtwoq ]
+vmovdqu32  m3{k4}{z}, [srcq +

Re: [FFmpeg-devel] [PATCH v3] swscale/x86/rgb2rgb: add AVX512ICL versions of shuffle_bytes

2025-01-29 Thread Shreesh Adiga
Hi Andreas,

I am not sure if that is needed. I can add the data observed on my machine
(AMD 7950x Zen 4), but I think this will vary from machine to machine. The
speedup is expected to be around 2x compared to AVX2, and there is no core
change apart from processing the scalar loop with masked instructions.

The data is not entirely consistent with my expectations: all the shuffle
variants are equivalent in the work they do, yet the speedups in the report
differ between them.

shuffle_bytes_0321_c:   56.5 ( 1.00x)
shuffle_bytes_0321_ssse3:   15.2 ( 3.70x)
shuffle_bytes_0321_avx2:10.2 ( 5.51x)
shuffle_bytes_0321_avx512icl:9.2 ( 6.11x)
shuffle_bytes_1230_c:   84.5 ( 1.00x)
shuffle_bytes_1230_ssse3:   14.2 ( 5.93x)
shuffle_bytes_1230_avx2:15.2 ( 5.54x)
shuffle_bytes_1230_avx512icl:   11.2 ( 7.51x)
shuffle_bytes_2103_c:   48.5 ( 1.00x)
shuffle_bytes_2103_ssse3:   21.2 ( 2.28x)
shuffle_bytes_2103_avx2:13.8 ( 3.53x)
shuffle_bytes_2103_avx512icl:9.2 ( 5.24x)
shuffle_bytes_3012_c:   84.5 ( 1.00x)
shuffle_bytes_3012_ssse3:   14.2 ( 5.93x)
shuffle_bytes_3012_avx2:16.2 ( 5.20x)
shuffle_bytes_3012_avx512icl:   10.2 ( 8.24x)
shuffle_bytes_3210_c:   89.2 ( 1.00x)
shuffle_bytes_3210_ssse3:   24.2 ( 3.68x)
shuffle_bytes_3210_avx2:16.2 ( 5.49x)
shuffle_bytes_3210_avx512icl:9.2 ( 9.65x)

I can add the details to commit message if you can confirm if it is needed.

Thanks,
Shreesh

On Wed, Jan 29, 2025 at 5:46 PM Andreas Rheinhardt <
andreas.rheinha...@outlook.com> wrote:

> Shreesh Adiga:
> > Signed-off-by: Shreesh Adiga <16567adigashre...@gmail.com>
> > ---
> > v3: Fix build failure on older nasm by replacing "kmovw k, tmpw"
> > with "kmovw k, tmpd" which matches "kmovw k, r32" syntax.
> > v2: Tried to align operands and improve indentation for ASM routine.
> >  libswscale/x86/rgb2rgb.c | 21 +
> >  libswscale/x86/rgb_2_rgb.asm | 90 +++-
> >  2 files changed, 80 insertions(+), 31 deletions(-)
> >
> > diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c
> > index 6790551a38..4cbed54b35 100644
> > --- a/libswscale/x86/rgb2rgb.c
> > +++ b/libswscale/x86/rgb2rgb.c
> > @@ -2364,6 +2364,16 @@ void ff_shuffle_bytes_2013_avx2(const uint8_t
> *src, uint8_t *dst, int src_size);
> >  void ff_shuffle_bytes_2130_avx2(const uint8_t *src, uint8_t *dst, int
> src_size);
> >  void ff_shuffle_bytes_1203_avx2(const uint8_t *src, uint8_t *dst, int
> src_size);
> >
> > +void ff_shuffle_bytes_2103_avx512icl(const uint8_t *src, uint8_t *dst,
> int src_size);
> > +void ff_shuffle_bytes_0321_avx512icl(const uint8_t *src, uint8_t *dst,
> int src_size);
> > +void ff_shuffle_bytes_1230_avx512icl(const uint8_t *src, uint8_t *dst,
> int src_size);
> > +void ff_shuffle_bytes_3012_avx512icl(const uint8_t *src, uint8_t *dst,
> int src_size);
> > +void ff_shuffle_bytes_3210_avx512icl(const uint8_t *src, uint8_t *dst,
> int src_size);
> > +void ff_shuffle_bytes_3102_avx512icl(const uint8_t *src, uint8_t *dst,
> int src_size);
> > +void ff_shuffle_bytes_2013_avx512icl(const uint8_t *src, uint8_t *dst,
> int src_size);
> > +void ff_shuffle_bytes_2130_avx512icl(const uint8_t *src, uint8_t *dst,
> int src_size);
> > +void ff_shuffle_bytes_1203_avx512icl(const uint8_t *src, uint8_t *dst,
> int src_size);
> > +
> >  void ff_uyvytoyuv422_sse2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
> >const uint8_t *src, int width, int height,
> >int lumStride, int chromStride, int
> srcStride);
> > @@ -2454,6 +2464,17 @@ av_cold void rgb2rgb_init_x86(void)
> >  shuffle_bytes_2130 = ff_shuffle_bytes_2130_avx2;
> >  shuffle_bytes_1203 = ff_shuffle_bytes_1203_avx2;
> >  }
> > +if (EXTERNAL_AVX512ICL(cpu_flags)) {
> > +shuffle_bytes_0321 = ff_shuffle_bytes_0321_avx512icl;
> > +shuffle_bytes_2103 = ff_shuffle_bytes_2103_avx512icl;
> > +shuffle_bytes_1230 = ff_shuffle_bytes_1230_avx512icl;
> > +shuffle_bytes_3012 = ff_shuffle_bytes_3012_avx512icl;
> > +shuffle_bytes_3210 = ff_shuffle_bytes_3210_av

Re: [FFmpeg-devel] [PATCH] swscale/x86/rgb2rgb: add AVX512ICL versions of shuffle_bytes

2025-01-25 Thread Shreesh Adiga
> Try running it several times using the same seed, so
> "tests/checkasm/checkasm --test=sw_rgb --bench 17575157", and make sure
> no power saving feature is enabled (so the CPU frequency doesn't change
> based on load). That may help getting consistent results.

After running "echo performance | tee
/sys/devices/system/cpu/cpu*/cpufreq/scaling_governor" and
recompiling ffmpeg with "--enable-linux-perf", I am seeing the below
numbers:

shuffle_bytes_0321_c:   56.5 ( 1.00x)
shuffle_bytes_0321_ssse3:   18.0 ( 3.14x)
shuffle_bytes_0321_avx2:10.0 ( 5.65x)
shuffle_bytes_0321_avx512icl:9.0 ( 6.28x)
shuffle_bytes_1230_c:   84.5 ( 1.00x)
shuffle_bytes_1230_ssse3:   18.2 ( 4.63x)
shuffle_bytes_1230_avx2:22.2 ( 3.80x)
shuffle_bytes_1230_avx512icl:   10.0 ( 8.45x)
shuffle_bytes_2103_c:   49.8 ( 1.00x)
shuffle_bytes_2103_ssse3:   21.2 ( 2.34x)
shuffle_bytes_2103_avx2:17.5 ( 2.84x)
shuffle_bytes_2103_avx512icl:7.5 ( 6.63x)
shuffle_bytes_3012_c:   84.5 ( 1.00x)
shuffle_bytes_3012_ssse3:   17.0 ( 4.97x)
shuffle_bytes_3012_avx2:16.0 ( 5.28x)
shuffle_bytes_3012_avx512icl:   16.2 ( 5.20x)
shuffle_bytes_3210_c:   92.8 ( 1.00x)
shuffle_bytes_3210_ssse3:   25.8 ( 3.60x)
shuffle_bytes_3210_avx2:14.0 ( 6.62x)
shuffle_bytes_3210_avx512icl:9.0 (10.31x)

Thanks,
Shreesh
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH v3] swscale/x86/rgb2rgb: add AVX512ICL versions of shuffle_bytes

2025-01-28 Thread Shreesh Adiga
Signed-off-by: Shreesh Adiga <16567adigashre...@gmail.com>
---
v3: Fix build failure on older nasm by replacing "kmovw k, tmpw"
with "kmovw k, tmpd" which matches "kmovw k, r32" syntax.
v2: Tried to align operands and improve indentation for ASM routine.
 libswscale/x86/rgb2rgb.c | 21 +
 libswscale/x86/rgb_2_rgb.asm | 90 +++-
 2 files changed, 80 insertions(+), 31 deletions(-)

diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c
index 6790551a38..4cbed54b35 100644
--- a/libswscale/x86/rgb2rgb.c
+++ b/libswscale/x86/rgb2rgb.c
@@ -2364,6 +2364,16 @@ void ff_shuffle_bytes_2013_avx2(const uint8_t *src, 
uint8_t *dst, int src_size);
 void ff_shuffle_bytes_2130_avx2(const uint8_t *src, uint8_t *dst, int 
src_size);
 void ff_shuffle_bytes_1203_avx2(const uint8_t *src, uint8_t *dst, int 
src_size);

+void ff_shuffle_bytes_2103_avx512icl(const uint8_t *src, uint8_t *dst, int 
src_size);
+void ff_shuffle_bytes_0321_avx512icl(const uint8_t *src, uint8_t *dst, int 
src_size);
+void ff_shuffle_bytes_1230_avx512icl(const uint8_t *src, uint8_t *dst, int 
src_size);
+void ff_shuffle_bytes_3012_avx512icl(const uint8_t *src, uint8_t *dst, int 
src_size);
+void ff_shuffle_bytes_3210_avx512icl(const uint8_t *src, uint8_t *dst, int 
src_size);
+void ff_shuffle_bytes_3102_avx512icl(const uint8_t *src, uint8_t *dst, int 
src_size);
+void ff_shuffle_bytes_2013_avx512icl(const uint8_t *src, uint8_t *dst, int 
src_size);
+void ff_shuffle_bytes_2130_avx512icl(const uint8_t *src, uint8_t *dst, int 
src_size);
+void ff_shuffle_bytes_1203_avx512icl(const uint8_t *src, uint8_t *dst, int 
src_size);
+
 void ff_uyvytoyuv422_sse2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
   const uint8_t *src, int width, int height,
   int lumStride, int chromStride, int srcStride);
@@ -2454,6 +2464,17 @@ av_cold void rgb2rgb_init_x86(void)
 shuffle_bytes_2130 = ff_shuffle_bytes_2130_avx2;
 shuffle_bytes_1203 = ff_shuffle_bytes_1203_avx2;
 }
+if (EXTERNAL_AVX512ICL(cpu_flags)) {
+shuffle_bytes_0321 = ff_shuffle_bytes_0321_avx512icl;
+shuffle_bytes_2103 = ff_shuffle_bytes_2103_avx512icl;
+shuffle_bytes_1230 = ff_shuffle_bytes_1230_avx512icl;
+shuffle_bytes_3012 = ff_shuffle_bytes_3012_avx512icl;
+shuffle_bytes_3210 = ff_shuffle_bytes_3210_avx512icl;
+shuffle_bytes_3102 = ff_shuffle_bytes_3102_avx512icl;
+shuffle_bytes_2013 = ff_shuffle_bytes_2013_avx512icl;
+shuffle_bytes_2130 = ff_shuffle_bytes_2130_avx512icl;
+shuffle_bytes_1203 = ff_shuffle_bytes_1203_avx512icl;
+}
 if (EXTERNAL_AVX2_FAST(cpu_flags)) {
 uyvytoyuv422 = ff_uyvytoyuv422_avx2;
 #endif
diff --git a/libswscale/x86/rgb_2_rgb.asm b/libswscale/x86/rgb_2_rgb.asm
index b468beb12d..ca7a481255 100644
--- a/libswscale/x86/rgb_2_rgb.asm
+++ b/libswscale/x86/rgb_2_rgb.asm
@@ -57,40 +57,53 @@ SECTION .text
 %macro SHUFFLE_BYTES 4
 cglobal shuffle_bytes_%1%2%3%4, 3, 5, 2, src, dst, w, tmp, x
 VBROADCASTI128m0, [pb_shuffle%1%2%3%4]
-movsxdifnidn wq, wd
-mov xq, wq
-
-addsrcq, wq
-adddstq, wq
-neg  wq
-
-;calc scalar loop
+movsxdifnidn  wq, wd
+mov   xq, wq
+
+add srcq, wq
+add dstq, wq
+neg   wq
+
+%if mmsize == 64
+andxq, mmsize - 4
+shrxq, 2
+mov  tmpd, -1
+shlx tmpd, tmpd, xd
+not  tmpd
+kmovw  k7, tmpd
+vmovdqu32   m1{k7}{z}, [srcq + wq]
+pshufb m1, m0
+vmovdqu32 [dstq + wq]{k7}, m1
+leawq, [wq + 4 * xq]
+%else
+;calc scalar loop
 and xq, mmsize-4
 je .loop_simd

-.loop_scalar:
-   mov  tmpb, [srcq + wq + %1]
-   mov [dstq+wq + 0], tmpb
-   mov  tmpb, [srcq + wq + %2]
-   mov [dstq+wq + 1], tmpb
-   mov  tmpb, [srcq + wq + %3]
-   mov [dstq+wq + 2], tmpb
-   mov  tmpb, [srcq + wq + %4]
-   mov [dstq+wq + 3], tmpb
-   addwq, 4
-   subxq, 4
-   jg .loop_scalar
-
-;check if src_size < mmsize
-cmp wq, 0
-jge .end
-
-.loop_simd:
-movu   m1, [srcq+wq]
-pshufb m1, m0
-movu[dstq+wq], m1
-addwq, mmsize
-jl .loop_simd
+.loop_scalar:
+mov  tmpb, [srcq + wq + %1]
+mov [dstq+wq + 0], tmpb
+mov  tmpb, [srcq + wq + %2]
+mov [dstq+wq + 1], tmpb
+mov  tmpb, [srcq + wq + %3]
+mov [dstq+wq + 2], tmpb
+mov  tmpb, [srcq + wq + %4]
+mov [dstq+wq + 3], tmpb
+addwq, 4
+subxq, 4
+jg .loop_scalar
+%endif
+
+;check if src_size < mmsize
+cmp wq, 0
+jge .end
+
+.loop_simd:
+movu 

Re: [FFmpeg-devel] [PATCH] swscale/x86/rgb2rgb: add AVX512ICL versions of shuffle_bytes

2025-01-25 Thread Shreesh Adiga
> Thanks for the patch. Could you please compile and run
> tests/checkasm/checkasm with "--test=sw_rgb --bench" and paste the
> results for the shuffle_bytes functions, to see if there's a speed up
> compared to the AVX2 implementation?

I ran the command "tests/checkasm/checkasm --test=sw_rgb --bench" and I see
the below output:
benchmarking with native FFmpeg timers
nop: 45.0
checkasm: using random seed 17575157
checkasm: bench runs 1024 (1 << 10)
SSE2:
 - sw_rgb.uyvytoyuv422   [OK]
 - sw_rgb.interleave_bytes   [OK]
 - sw_rgb.deinterleave_bytes [OK]
 - sw_rgb.rgb_to_y   [OK]
 - sw_rgb.rgb_to_uv  [OK]
SSSE3:
 - sw_rgb.shuffle_bytes_2103 [OK]
 - sw_rgb.shuffle_bytes_0321 [OK]
 - sw_rgb.shuffle_bytes_1230 [OK]
 - sw_rgb.shuffle_bytes_3012 [OK]
 - sw_rgb.shuffle_bytes_3210 [OK]
 - sw_rgb.rgb_to_y   [OK]
 - sw_rgb.rgb_to_uv  [OK]
AVX:
 - sw_rgb.uyvytoyuv422   [OK]
 - sw_rgb.deinterleave_bytes [OK]
 - sw_rgb.rgb_to_y   [OK]
 - sw_rgb.rgb_to_uv  [OK]
AVX2:
 - sw_rgb.shuffle_bytes_2103 [OK]
 - sw_rgb.shuffle_bytes_0321 [OK]
 - sw_rgb.shuffle_bytes_1230 [OK]
 - sw_rgb.shuffle_bytes_3012 [OK]
 - sw_rgb.shuffle_bytes_3210 [OK]
 - sw_rgb.uyvytoyuv422   [OK]
 - sw_rgb.rgb_to_y   [OK]
 - sw_rgb.rgb_to_uv  [OK]
AVX-512ICL:
 - sw_rgb.shuffle_bytes_2103 [OK]
 - sw_rgb.shuffle_bytes_0321 [OK]
 - sw_rgb.shuffle_bytes_1230 [OK]
 - sw_rgb.shuffle_bytes_3012 [OK]
 - sw_rgb.shuffle_bytes_3210 [OK]
checkasm: all 184 tests passed
shuffle_bytes_0321_c:   45.0 ( 1.00x)
shuffle_bytes_0321_ssse3:   11.2 ( 4.00x)
shuffle_bytes_0321_avx2:11.2 ( 4.00x)
shuffle_bytes_0321_avx512icl:   11.2 ( 4.00x)
shuffle_bytes_1230_c:   67.5 ( 1.00x)
shuffle_bytes_1230_ssse3:   11.2 ( 6.00x)
shuffle_bytes_1230_avx2:11.2 ( 6.00x)
shuffle_bytes_1230_avx512icl:0.0 ( 0.00x)
shuffle_bytes_2103_c:   45.0 ( 1.00x)
shuffle_bytes_2103_ssse3:   11.2 ( 4.00x)
shuffle_bytes_2103_avx2: 0.0 ( 0.00x)
shuffle_bytes_2103_avx512icl:0.0 ( 0.00x)
shuffle_bytes_3012_c:   67.5 ( 1.00x)
shuffle_bytes_3012_ssse3:   11.2 ( 6.00x)
shuffle_bytes_3012_avx2:11.2 ( 6.00x)
shuffle_bytes_3012_avx512icl:0.0 ( 0.00x)
shuffle_bytes_3210_c:   67.5 ( 1.00x)
shuffle_bytes_3210_ssse3:   11.2 ( 6.00x)
shuffle_bytes_3210_avx2:11.2 ( 6.00x)
shuffle_bytes_3210_avx512icl:0.0 ( 0.00x)

I have not included the other functions printed by the bench command.
I'm not sure if I'm missing something, but the output doesn't look consistent
to me: there are many 0.0 entries, and I don't see any difference between
ssse3 and avx2 either.
I'm running this on an AMD Ryzen 7950x (Zen 4) machine.

I've inspected the assembly output for one of the ssse3/avx2/avx512 variants
and it seems to match my expectations.
Therefore I'm not sure whether checkasm is measuring accurately here.
Please let me know if I'm missing something; I'm new to FFmpeg
development and this is my first patch submission.

Thanks,
Shreesh
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH v2] swscale/x86/rgb2rgb: add AVX512ICL versions of shuffle_bytes

2025-01-25 Thread Shreesh Adiga
Signed-off-by: Shreesh Adiga <16567adigashre...@gmail.com>
---
v2: Tried to align operands and improve indentation for ASM routine.

 libswscale/x86/rgb2rgb.c | 21 +
 libswscale/x86/rgb_2_rgb.asm | 90 +++-
 2 files changed, 80 insertions(+), 31 deletions(-)

diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c
index 6790551a38..4cbed54b35 100644
--- a/libswscale/x86/rgb2rgb.c
+++ b/libswscale/x86/rgb2rgb.c
@@ -2364,6 +2364,16 @@ void ff_shuffle_bytes_2013_avx2(const uint8_t *src, 
uint8_t *dst, int src_size);
 void ff_shuffle_bytes_2130_avx2(const uint8_t *src, uint8_t *dst, int 
src_size);
 void ff_shuffle_bytes_1203_avx2(const uint8_t *src, uint8_t *dst, int 
src_size);

+void ff_shuffle_bytes_2103_avx512icl(const uint8_t *src, uint8_t *dst, int 
src_size);
+void ff_shuffle_bytes_0321_avx512icl(const uint8_t *src, uint8_t *dst, int 
src_size);
+void ff_shuffle_bytes_1230_avx512icl(const uint8_t *src, uint8_t *dst, int 
src_size);
+void ff_shuffle_bytes_3012_avx512icl(const uint8_t *src, uint8_t *dst, int 
src_size);
+void ff_shuffle_bytes_3210_avx512icl(const uint8_t *src, uint8_t *dst, int 
src_size);
+void ff_shuffle_bytes_3102_avx512icl(const uint8_t *src, uint8_t *dst, int 
src_size);
+void ff_shuffle_bytes_2013_avx512icl(const uint8_t *src, uint8_t *dst, int 
src_size);
+void ff_shuffle_bytes_2130_avx512icl(const uint8_t *src, uint8_t *dst, int 
src_size);
+void ff_shuffle_bytes_1203_avx512icl(const uint8_t *src, uint8_t *dst, int 
src_size);
+
 void ff_uyvytoyuv422_sse2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
   const uint8_t *src, int width, int height,
   int lumStride, int chromStride, int srcStride);
@@ -2454,6 +2464,17 @@ av_cold void rgb2rgb_init_x86(void)
 shuffle_bytes_2130 = ff_shuffle_bytes_2130_avx2;
 shuffle_bytes_1203 = ff_shuffle_bytes_1203_avx2;
 }
+if (EXTERNAL_AVX512ICL(cpu_flags)) {
+shuffle_bytes_0321 = ff_shuffle_bytes_0321_avx512icl;
+shuffle_bytes_2103 = ff_shuffle_bytes_2103_avx512icl;
+shuffle_bytes_1230 = ff_shuffle_bytes_1230_avx512icl;
+shuffle_bytes_3012 = ff_shuffle_bytes_3012_avx512icl;
+shuffle_bytes_3210 = ff_shuffle_bytes_3210_avx512icl;
+shuffle_bytes_3102 = ff_shuffle_bytes_3102_avx512icl;
+shuffle_bytes_2013 = ff_shuffle_bytes_2013_avx512icl;
+shuffle_bytes_2130 = ff_shuffle_bytes_2130_avx512icl;
+shuffle_bytes_1203 = ff_shuffle_bytes_1203_avx512icl;
+}
 if (EXTERNAL_AVX2_FAST(cpu_flags)) {
 uyvytoyuv422 = ff_uyvytoyuv422_avx2;
 #endif
diff --git a/libswscale/x86/rgb_2_rgb.asm b/libswscale/x86/rgb_2_rgb.asm
index b468beb12d..3a5e217111 100644
--- a/libswscale/x86/rgb_2_rgb.asm
+++ b/libswscale/x86/rgb_2_rgb.asm
@@ -57,40 +57,53 @@ SECTION .text
 %macro SHUFFLE_BYTES 4
 cglobal shuffle_bytes_%1%2%3%4, 3, 5, 2, src, dst, w, tmp, x
 VBROADCASTI128m0, [pb_shuffle%1%2%3%4]
-movsxdifnidn wq, wd
-mov xq, wq
-
-addsrcq, wq
-adddstq, wq
-neg  wq
-
-;calc scalar loop
+movsxdifnidn  wq, wd
+mov   xq, wq
+
+add srcq, wq
+add dstq, wq
+neg   wq
+
+%if mmsize == 64
+andxq, mmsize - 4
+shrxq, 2
+mov  tmpd, -1
+shlx tmpd, tmpd, xd
+not  tmpd
+kmovw  k7, tmpw
+vmovdqu32   m1{k7}{z}, [srcq + wq]
+pshufb m1, m0
+vmovdqu32 [dstq + wq]{k7}, m1
+leawq, [wq + 4 * xq]
+%else
+;calc scalar loop
 and xq, mmsize-4
 je .loop_simd

-.loop_scalar:
-   mov  tmpb, [srcq + wq + %1]
-   mov [dstq+wq + 0], tmpb
-   mov  tmpb, [srcq + wq + %2]
-   mov [dstq+wq + 1], tmpb
-   mov  tmpb, [srcq + wq + %3]
-   mov [dstq+wq + 2], tmpb
-   mov  tmpb, [srcq + wq + %4]
-   mov [dstq+wq + 3], tmpb
-   addwq, 4
-   subxq, 4
-   jg .loop_scalar
-
-;check if src_size < mmsize
-cmp wq, 0
-jge .end
-
-.loop_simd:
-movu   m1, [srcq+wq]
-pshufb m1, m0
-movu[dstq+wq], m1
-addwq, mmsize
-jl .loop_simd
+.loop_scalar:
+mov  tmpb, [srcq + wq + %1]
+mov [dstq+wq + 0], tmpb
+mov  tmpb, [srcq + wq + %2]
+mov [dstq+wq + 1], tmpb
+mov  tmpb, [srcq + wq + %3]
+mov [dstq+wq + 2], tmpb
+mov  tmpb, [srcq + wq + %4]
+mov [dstq+wq + 3], tmpb
+addwq, 4
+subxq, 4
+jg .loop_scalar
+%endif
+
+;check if src_size < mmsize
+cmp wq, 0
+jge .end
+
+.loop_simd:
+movum1, [srcq + wq]
+pshufb  m1, m0
+movu   [dstq + wq], m1
+add wq, mmsize
+jl .loop_simd

 .e

[FFmpeg-devel] [PATCH] swscale/x86/rgb2rgb: add AVX512ICL versions of shuffle_bytes

2025-01-25 Thread Shreesh Adiga
Signed-off-by: Shreesh Adiga <16567adigashre...@gmail.com>
---
 libswscale/x86/rgb2rgb.c | 21 +
 libswscale/x86/rgb_2_rgb.asm | 28 
 2 files changed, 49 insertions(+)

diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c
index 6790551a38..4cbed54b35 100644
--- a/libswscale/x86/rgb2rgb.c
+++ b/libswscale/x86/rgb2rgb.c
@@ -2364,6 +2364,16 @@ void ff_shuffle_bytes_2013_avx2(const uint8_t *src, 
uint8_t *dst, int src_size);
 void ff_shuffle_bytes_2130_avx2(const uint8_t *src, uint8_t *dst, int 
src_size);
 void ff_shuffle_bytes_1203_avx2(const uint8_t *src, uint8_t *dst, int 
src_size);
 
+void ff_shuffle_bytes_2103_avx512icl(const uint8_t *src, uint8_t *dst, int 
src_size);
+void ff_shuffle_bytes_0321_avx512icl(const uint8_t *src, uint8_t *dst, int 
src_size);
+void ff_shuffle_bytes_1230_avx512icl(const uint8_t *src, uint8_t *dst, int 
src_size);
+void ff_shuffle_bytes_3012_avx512icl(const uint8_t *src, uint8_t *dst, int 
src_size);
+void ff_shuffle_bytes_3210_avx512icl(const uint8_t *src, uint8_t *dst, int 
src_size);
+void ff_shuffle_bytes_3102_avx512icl(const uint8_t *src, uint8_t *dst, int 
src_size);
+void ff_shuffle_bytes_2013_avx512icl(const uint8_t *src, uint8_t *dst, int 
src_size);
+void ff_shuffle_bytes_2130_avx512icl(const uint8_t *src, uint8_t *dst, int 
src_size);
+void ff_shuffle_bytes_1203_avx512icl(const uint8_t *src, uint8_t *dst, int 
src_size);
+
 void ff_uyvytoyuv422_sse2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
   const uint8_t *src, int width, int height,
   int lumStride, int chromStride, int srcStride);
@@ -2454,6 +2464,17 @@ av_cold void rgb2rgb_init_x86(void)
 shuffle_bytes_2130 = ff_shuffle_bytes_2130_avx2;
 shuffle_bytes_1203 = ff_shuffle_bytes_1203_avx2;
 }
+if (EXTERNAL_AVX512ICL(cpu_flags)) {
+shuffle_bytes_0321 = ff_shuffle_bytes_0321_avx512icl;
+shuffle_bytes_2103 = ff_shuffle_bytes_2103_avx512icl;
+shuffle_bytes_1230 = ff_shuffle_bytes_1230_avx512icl;
+shuffle_bytes_3012 = ff_shuffle_bytes_3012_avx512icl;
+shuffle_bytes_3210 = ff_shuffle_bytes_3210_avx512icl;
+shuffle_bytes_3102 = ff_shuffle_bytes_3102_avx512icl;
+shuffle_bytes_2013 = ff_shuffle_bytes_2013_avx512icl;
+shuffle_bytes_2130 = ff_shuffle_bytes_2130_avx512icl;
+shuffle_bytes_1203 = ff_shuffle_bytes_1203_avx512icl;
+}
 if (EXTERNAL_AVX2_FAST(cpu_flags)) {
 uyvytoyuv422 = ff_uyvytoyuv422_avx2;
 #endif
diff --git a/libswscale/x86/rgb_2_rgb.asm b/libswscale/x86/rgb_2_rgb.asm
index b468beb12d..64b0988c4a 100644
--- a/libswscale/x86/rgb_2_rgb.asm
+++ b/libswscale/x86/rgb_2_rgb.asm
@@ -64,6 +64,18 @@ cglobal shuffle_bytes_%1%2%3%4, 3, 5, 2, src, dst, w, tmp, x
 adddstq, wq
 neg  wq
 
+%if mmsize == 64
+and xq, mmsize-4
+shr xq, 2
+mov tmpd, -1
+shlx tmpd, tmpd, xd
+not tmpd
+kmovw k7, tmpw
+vmovdqu32 m1{k7}{z}, [srcq + wq]
+pshufb m1, m0
+vmovdqu32 [dstq + wq]{k7}, m1
+lea wq, [wq + 4 * xq]
+%else
 ;calc scalar loop
 and xq, mmsize-4
 je .loop_simd
@@ -80,6 +92,7 @@ cglobal shuffle_bytes_%1%2%3%4, 3, 5, 2, src, dst, w, tmp, x
addwq, 4
subxq, 4
jg .loop_scalar
+%endif
 
 ;check if src_size < mmsize
 cmp wq, 0
@@ -122,6 +135,21 @@ SHUFFLE_BYTES 1, 2, 0, 3
 %endif
 %endif
 
+%if ARCH_X86_64
+%if HAVE_AVX512ICL_EXTERNAL
+INIT_ZMM avx512icl
+SHUFFLE_BYTES 2, 1, 0, 3
+SHUFFLE_BYTES 0, 3, 2, 1
+SHUFFLE_BYTES 1, 2, 3, 0
+SHUFFLE_BYTES 3, 0, 1, 2
+SHUFFLE_BYTES 3, 2, 1, 0
+SHUFFLE_BYTES 3, 1, 0, 2
+SHUFFLE_BYTES 2, 0, 1, 3
+SHUFFLE_BYTES 2, 1, 3, 0
+SHUFFLE_BYTES 1, 2, 0, 3
+%endif
+%endif
+
 
;---
 ; uyvytoyuv422(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
 ;  const uint8_t *src, int width, int height,
-- 
2.45.3

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH] swscale/x86/rgb2rgb: add AVX512ICL version of uyvytoyuv422

2025-02-18 Thread Shreesh Adiga
On Mon, Feb 3, 2025 at 10:03 PM Shreesh Adiga
<16567adigashre...@gmail.com> wrote:
>
> The scalar loop is replaced with masked AVX512 instructions.
> For extracting the Y from UYVY, vperm2b is used instead of
> various AND and packuswb.
>
> Instead of loading the vectors with interleaved lanes as done
> in AVX2 version, normal load is used. At the end of packuswb,
> for U and V, an extra permute operation is done to get the
> required layout.
>
> AMD 7950x Zen 4 benchmark data:
> uyvytoyuv422_c:  29105.0 ( 1.00x)
> uyvytoyuv422_sse2:3888.0 ( 7.49x)
> uyvytoyuv422_avx: 3374.2 ( 8.63x)
> uyvytoyuv422_avx2:2649.8 (10.98x)
> uyvytoyuv422_avx512icl:       1615.0 (18.02x)
>
> Signed-off-by: Shreesh Adiga <16567adigashre...@gmail.com>
> ---
>  libswscale/x86/rgb2rgb.c |   6 ++
>  libswscale/x86/rgb_2_rgb.asm | 105 +++
>  2 files changed, 111 insertions(+)
>
> diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c
> index 4cbed54b35..6601dad233 100644
> --- a/libswscale/x86/rgb2rgb.c
> +++ b/libswscale/x86/rgb2rgb.c
> @@ -2383,6 +2383,9 @@ void ff_uyvytoyuv422_avx(uint8_t *ydst, uint8_t *udst, 
> uint8_t *vdst,
>  void ff_uyvytoyuv422_avx2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
>const uint8_t *src, int width, int height,
>int lumStride, int chromStride, int srcStride);
> +void ff_uyvytoyuv422_avx512icl(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
> +   const uint8_t *src, int width, int height,
> +   int lumStride, int chromStride, int 
> srcStride);
>  #endif
>
>  #define DEINTERLEAVE_BYTES(cpuext)   
>  \
> @@ -2477,6 +2480,9 @@ av_cold void rgb2rgb_init_x86(void)
>  }
>  if (EXTERNAL_AVX2_FAST(cpu_flags)) {
>  uyvytoyuv422 = ff_uyvytoyuv422_avx2;
> +}
> +if (EXTERNAL_AVX512ICL(cpu_flags)) {
> +uyvytoyuv422 = ff_uyvytoyuv422_avx512icl;
>  #endif
>  }
>  #endif
> diff --git a/libswscale/x86/rgb_2_rgb.asm b/libswscale/x86/rgb_2_rgb.asm
> index ca7a481255..6e4df17298 100644
> --- a/libswscale/x86/rgb_2_rgb.asm
> +++ b/libswscale/x86/rgb_2_rgb.asm
> @@ -35,6 +35,20 @@ pb_shuffle2013: db 2, 0, 1, 3, 6, 4, 5, 7, 10, 8, 9, 11, 
> 14, 12, 13, 15
>  pb_shuffle2130: db 2, 1, 3, 0, 6, 5, 7, 4, 10, 9, 11, 8, 14, 13, 15, 12
>  pb_shuffle1203: db 1, 2, 0, 3, 5, 6, 4, 7, 9, 10, 8, 11, 13, 14, 12, 15
>
> +%if HAVE_AVX512ICL_EXTERNAL
> +; shuffle vector to rearrange packuswb result to be linear
> +shuf_packus: db  0,  1,  2,  3, 16, 17, 18, 19, 32, 33, 34, 35, 48, 49, 50, 
> 51,\
> + 4,  5,  6,  7, 20, 21, 22, 23, 36, 37, 38, 39, 52, 53, 54, 
> 55,\
> + 8,  9, 10, 11, 24, 25, 26, 27, 40, 41, 42, 43, 56, 57, 58, 
> 59,\
> +12, 13, 14, 15, 28, 29, 30, 31, 44, 45, 46, 47, 60, 61, 62, 
> 63
> +
> +; shuffle vector to combine odd elements from two vectors to extract Y
> +shuf_perm2b: db  1,  3,   5,   7,   9,  11,  13,  15,  17,  19,  21,  23,  
> 25,  27,  29,  31,\
> +33, 35,  37,  39,  41,  43,  45,  47,  49,  51,  53,  55,  
> 57,  59,  61,  63,\
> +65, 67,  69,  71,  73,  75,  77,  79,  81,  83,  85,  87,  
> 89,  91,  93,  95,\
> +97, 99, 101, 103, 105, 107, 109, 111, 113, 115, 117, 119, 
> 121, 123, 125, 127
> +%endif
> +
>  SECTION .text
>
>  %macro RSHIFT_COPY 5
> @@ -156,9 +170,20 @@ SHUFFLE_BYTES 1, 2, 0, 3
>  ;  int lumStride, int chromStride, int srcStride)
>  
> ;---
>  %macro UYVY_TO_YUV422 0
> +%if mmsize == 64
> +; need two more registers to store shuffle vectors for AVX512ICL
> +cglobal uyvytoyuv422, 9, 14, 10, ydst, udst, vdst, src, w, h, lum_stride, 
> chrom_stride, src_stride, wtwo, whalf, tmp, x, back_w
> +%else
>  cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, 
> chrom_stride, src_stride, wtwo, whalf, tmp, x, back_w
> +%endif
>  pxor m0, m0
> +%if mmsize == 64
> +vpternlogd   m1, m1, m1, 0xff  ; m1 = _mm512_set1_epi8(0xff)
> +movu m8, [shuf_packus]
> +movu m9, [shuf_perm2b]
> +%else
>  pcmpeqw  m1, m1
> +%endif
>  psrlwm1, 8
>
>  movsxdifnidnwq, wd
> @@ -188,6 +213,63 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, 
> w, h, lum_stride, chrom_s
>  and   xq

Re: [FFmpeg-devel] [PATCH] swscale/x86/rgb2rgb: optimize AVX2 version of uyvytoyuv422

2025-03-03 Thread Shreesh Adiga
On Thu, Feb 20, 2025 at 6:51 PM Shreesh Adiga
<16567adigashre...@gmail.com> wrote:
>
> Currently, the SIMD loop of the AVX2 version of uyvytoyuv422 does the
> following:
> - 4 vinsertq to interleave the vector lanes while loading from memory.
> - 4 vperm2i128 inside 4 RSHIFT_COPY calls to achieve the desired layout.
>
> This patch replaces the above 8 instructions with 2 vpermq and
> 2 vpermd that use a shuffle vector register, similar to the AVX512ICL
> version.
>
> Observed the following numbers on various microarchitectures:
>
> On AMD Zen3 laptop:
> Before:
> uyvytoyuv422_c:  51979.7 ( 1.00x)
> uyvytoyuv422_sse2:5410.5 ( 9.61x)
> uyvytoyuv422_avx: 4642.7 (11.20x)
> uyvytoyuv422_avx2:4249.0 (12.23x)
>
> After:
> uyvytoyuv422_c:  51659.8 ( 1.00x)
> uyvytoyuv422_sse2:5420.8 ( 9.53x)
> uyvytoyuv422_avx: 4651.2 (11.11x)
> uyvytoyuv422_avx2:3953.8 (13.07x)
>
> On Intel Macbook Pro 2019:
> Before:
> uyvytoyuv422_c: 185014.4 ( 1.00x)
> uyvytoyuv422_sse2:   22800.4 ( 8.11x)
> uyvytoyuv422_avx:19796.9 ( 9.35x)
> uyvytoyuv422_avx2:   13141.9 (14.08x)
>
> After:
> uyvytoyuv422_c: 185093.4 ( 1.00x)
> uyvytoyuv422_sse2:   22795.4 ( 8.12x)
> uyvytoyuv422_avx:19791.9 ( 9.35x)
> uyvytoyuv422_avx2:   12043.1 (15.37x)
>
> On AMD Zen4 desktop:
> Before:
> uyvytoyuv422_c:  29105.0 ( 1.00x)
> uyvytoyuv422_sse2:3888.0 ( 7.49x)
> uyvytoyuv422_avx: 3374.2 ( 8.63x)
> uyvytoyuv422_avx2:2649.8 (10.98x)
> uyvytoyuv422_avx512icl:   1615.0 (18.02x)
>
> After:
> uyvytoyuv422_c:  29093.4 ( 1.00x)
> uyvytoyuv422_sse2:3874.4 ( 7.51x)
> uyvytoyuv422_avx: 3371.6 ( 8.63x)
> uyvytoyuv422_avx2:2174.6 (13.38x)
> uyvytoyuv422_avx512icl:   1625.1 (17.90x)
>
> Signed-off-by: Shreesh Adiga <16567adigashre...@gmail.com>
> ---
>  libswscale/x86/rgb_2_rgb.asm | 68 ++--
>  1 file changed, 34 insertions(+), 34 deletions(-)
>
> diff --git a/libswscale/x86/rgb_2_rgb.asm b/libswscale/x86/rgb_2_rgb.asm
> index 6e4df17298..871bb21127 100644
> --- a/libswscale/x86/rgb_2_rgb.asm
> +++ b/libswscale/x86/rgb_2_rgb.asm
> @@ -49,18 +49,21 @@ shuf_perm2b: db  1,  3,   5,   7,   9,  11,  13,  15,  
> 17,  19,  21,  23,  25,
>  97, 99, 101, 103, 105, 107, 109, 111, 113, 115, 117, 119, 
> 121, 123, 125, 127
>  %endif
>
> +%if HAVE_AVX2_EXTERNAL
> +; shuffle vector to rearrange packuswb result to be linear
> +shuf_packus_avx2: db  0, 0, 0, 0, 4, 0, 0, 0, 1, 0, 0, 0, 5, 0, 0, 0,\
> +  2, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0,
> +%endif
> +
>  SECTION .text
>
> -%macro RSHIFT_COPY 5
> +%macro RSHIFT_COPY 3
>  ; %1 dst ; %2 src ; %3 shift
> -%if mmsize == 32
> -vperm2i128 %1, %2, %3, %5
> -RSHIFT %1, %4
> -%elif cpuflag(avx)
> -psrldq  %1, %2, %4
> +%if cpuflag(avx) || cpuflag(avx2) || cpuflag(avx512icl)
> +psrldq  %1, %2, %3
>  %else
>  mova   %1, %2
> -RSHIFT %1, %4
> +RSHIFT %1, %3
>  %endif
>  %endmacro
>
> @@ -170,18 +173,16 @@ SHUFFLE_BYTES 1, 2, 0, 3
>  ;  int lumStride, int chromStride, int srcStride)
>  
> ;---
>  %macro UYVY_TO_YUV422 0
> -%if mmsize == 64
> -; need two more registers to store shuffle vectors for AVX512ICL
> -cglobal uyvytoyuv422, 9, 14, 10, ydst, udst, vdst, src, w, h, lum_stride, 
> chrom_stride, src_stride, wtwo, whalf, tmp, x, back_w
> -%else
> -cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, 
> chrom_stride, src_stride, wtwo, whalf, tmp, x, back_w
> -%endif
> +cglobal uyvytoyuv422, 9, 14, 8 + cpuflag(avx2) + cpuflag(avx512icl), ydst, 
> udst, vdst, src, w, h, lum_stride, chrom_stride, src_stride, wtwo, whalf, 
> tmp, x, back_w
>  pxor m0, m0
>  %if mmsize == 64

[FFmpeg-devel] [PATCH] swscale/x86/rgb2rgb: optimize AVX2 version of uyvytoyuv422

2025-02-20 Thread Shreesh Adiga
Currently, the SIMD loop of the AVX2 version of uyvytoyuv422 does the
following:
- 4 vinsertq to interleave the vector lanes while loading from memory.
- 4 vperm2i128 inside 4 RSHIFT_COPY calls to achieve the desired layout.

This patch replaces the above 8 instructions with 2 vpermq and
2 vpermd that use a shuffle vector register, similar to the AVX512ICL
version.

Observed the following numbers on various microarchitectures:

On AMD Zen3 laptop:
Before:
uyvytoyuv422_c:  51979.7 ( 1.00x)
uyvytoyuv422_sse2:5410.5 ( 9.61x)
uyvytoyuv422_avx: 4642.7 (11.20x)
uyvytoyuv422_avx2:4249.0 (12.23x)

After:
uyvytoyuv422_c:  51659.8 ( 1.00x)
uyvytoyuv422_sse2:5420.8 ( 9.53x)
uyvytoyuv422_avx: 4651.2 (11.11x)
uyvytoyuv422_avx2:3953.8 (13.07x)

On Intel Macbook Pro 2019:
Before:
uyvytoyuv422_c: 185014.4 ( 1.00x)
uyvytoyuv422_sse2:   22800.4 ( 8.11x)
uyvytoyuv422_avx:19796.9 ( 9.35x)
uyvytoyuv422_avx2:   13141.9 (14.08x)

After:
uyvytoyuv422_c: 185093.4 ( 1.00x)
uyvytoyuv422_sse2:   22795.4 ( 8.12x)
uyvytoyuv422_avx:19791.9 ( 9.35x)
uyvytoyuv422_avx2:   12043.1 (15.37x)

On AMD Zen4 desktop:
Before:
uyvytoyuv422_c:  29105.0 ( 1.00x)
uyvytoyuv422_sse2:3888.0 ( 7.49x)
uyvytoyuv422_avx: 3374.2 ( 8.63x)
uyvytoyuv422_avx2:2649.8 (10.98x)
uyvytoyuv422_avx512icl:   1615.0 (18.02x)

After:
uyvytoyuv422_c:  29093.4 ( 1.00x)
uyvytoyuv422_sse2:3874.4 ( 7.51x)
uyvytoyuv422_avx: 3371.6 ( 8.63x)
uyvytoyuv422_avx2:2174.6 (13.38x)
uyvytoyuv422_avx512icl:   1625.1 (17.90x)

Signed-off-by: Shreesh Adiga <16567adigashre...@gmail.com>
---
 libswscale/x86/rgb_2_rgb.asm | 68 ++--
 1 file changed, 34 insertions(+), 34 deletions(-)

diff --git a/libswscale/x86/rgb_2_rgb.asm b/libswscale/x86/rgb_2_rgb.asm
index 6e4df17298..871bb21127 100644
--- a/libswscale/x86/rgb_2_rgb.asm
+++ b/libswscale/x86/rgb_2_rgb.asm
@@ -49,18 +49,21 @@ shuf_perm2b: db  1,  3,   5,   7,   9,  11,  13,  15,  17,  
19,  21,  23,  25,
 97, 99, 101, 103, 105, 107, 109, 111, 113, 115, 117, 119, 121, 
123, 125, 127
 %endif
 
+%if HAVE_AVX2_EXTERNAL
+; shuffle vector to rearrange packuswb result to be linear
+shuf_packus_avx2: db  0, 0, 0, 0, 4, 0, 0, 0, 1, 0, 0, 0, 5, 0, 0, 0,\
+  2, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0,
+%endif
+
 SECTION .text
 
-%macro RSHIFT_COPY 5
+%macro RSHIFT_COPY 3
 ; %1 dst ; %2 src ; %3 shift
-%if mmsize == 32
-vperm2i128 %1, %2, %3, %5
-RSHIFT %1, %4
-%elif cpuflag(avx)
-psrldq  %1, %2, %4
+%if cpuflag(avx) || cpuflag(avx2) || cpuflag(avx512icl)
+psrldq  %1, %2, %3
 %else
 mova   %1, %2
-RSHIFT %1, %4
+RSHIFT %1, %3
 %endif
 %endmacro
 
@@ -170,18 +173,16 @@ SHUFFLE_BYTES 1, 2, 0, 3
 ;  int lumStride, int chromStride, int srcStride)
 
;---
 %macro UYVY_TO_YUV422 0
-%if mmsize == 64
-; need two more registers to store shuffle vectors for AVX512ICL
-cglobal uyvytoyuv422, 9, 14, 10, ydst, udst, vdst, src, w, h, lum_stride, 
chrom_stride, src_stride, wtwo, whalf, tmp, x, back_w
-%else
-cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, 
chrom_stride, src_stride, wtwo, whalf, tmp, x, back_w
-%endif
+cglobal uyvytoyuv422, 9, 14, 8 + cpuflag(avx2) + cpuflag(avx512icl), ydst, 
udst, vdst, src, w, h, lum_stride, chrom_stride, src_stride, wtwo, whalf, tmp, 
x, back_w
 pxor m0, m0
 %if mmsize == 64
 vpternlogd   m1, m1, m1, 0xff  ; m1 = _mm512_set1_epi8(0xff)
 movu m8, [shuf_packus]
 movu m9, [shuf_perm2b]
 %else
+%if cpuflag(avx2)
+movu m8, [shuf_packus_avx2]
+%endif
 pcmpeqw  m1, m1
 %endif
 psrlwm1, 8
@@ -295,21 +296,10 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, 
h, lum_stride, chrom_s
 jge .end_line
 
 .loop_simd:
-%if mmsize == 32
-movu   xm2, [srcq + wtwoq ]
-movu   xm3, [srcq + wtwoq + 16]
-movu   xm4, [srcq + wtwoq + 16 * 2]
-movu   xm5, [srcq + wtwoq +