[FFmpeg-cvslog] lavc/vp7dsp: add R-V V vp7_idct_dc_add4uv

2024-06-05 Thread Rémi Denis-Courmont
ffmpeg | branch: master | Rémi Denis-Courmont  | Sun Jun  2 
12:03:33 2024 +0300| [121fb846b97db5afd3a24b0bde2be151400104be] | committer: 
Rémi Denis-Courmont

lavc/vp7dsp: add R-V V vp7_idct_dc_add4uv

This is almost the same story as vp7_idct_dc_add4y. We just have to use
strided loads of two 64-bit elements to account for the different data
layout in memory.

T-Head C908:
vp7_idct_dc_add4uv_c:   7.5
vp7_idct_dc_add4uv_rvv_i64: 2.0
vp8_idct_dc_add4uv_c:   6.2
vp8_idct_dc_add4uv_rvv_i32: 2.2 (before)
vp8_idct_dc_add4uv_rvv_i64: 2.0

SpacemiT X60:
vp7_idct_dc_add4uv_c:   6.7
vp7_idct_dc_add4uv_rvv_i64: 2.2
vp8_idct_dc_add4uv_c:   5.7
vp8_idct_dc_add4uv_rvv_i32: 2.5 (before)
vp8_idct_dc_add4uv_rvv_i64: 2.0

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=121fb846b97db5afd3a24b0bde2be151400104be
---

 libavcodec/riscv/vp7dsp_init.c |  3 +++
 libavcodec/riscv/vp7dsp_rvv.S  |  6 +++--
 libavcodec/riscv/vp8dsp_init.c |  3 +--
 libavcodec/riscv/vp8dsp_rvv.S  | 50 +++---
 4 files changed, 45 insertions(+), 17 deletions(-)

diff --git a/libavcodec/riscv/vp7dsp_init.c b/libavcodec/riscv/vp7dsp_init.c
index fa5fb9d2ae..9b8357ec05 100644
--- a/libavcodec/riscv/vp7dsp_init.c
+++ b/libavcodec/riscv/vp7dsp_init.c
@@ -29,6 +29,7 @@ void ff_vp7_luma_dc_wht_rvv(int16_t block[4][4][16], int16_t 
dc[16]);
 void ff_vp7_idct_add_rvv(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
 void ff_vp78_idct_dc_add_rvv(uint8_t *, int16_t block[16], ptrdiff_t, int dc);
 void ff_vp7_idct_dc_add4y_rvv(uint8_t *dst, int16_t block[4][16], ptrdiff_t);
+void ff_vp7_idct_dc_add4uv_rvv(uint8_t *dst, int16_t block[4][16], ptrdiff_t);
 
 static void ff_vp7_idct_dc_add_rvv(uint8_t *dst, int16_t block[16],
ptrdiff_t stride)
@@ -51,6 +52,8 @@ av_cold void ff_vp7dsp_init_riscv(VP8DSPContext *c)
 #endif
 c->vp8_idct_dc_add = ff_vp7_idct_dc_add_rvv;
 c->vp8_idct_dc_add4y  = ff_vp7_idct_dc_add4y_rvv;
+if (flags & AV_CPU_FLAG_RVV_I64)
+c->vp8_idct_dc_add4uv = ff_vp7_idct_dc_add4uv_rvv;
 }
 #endif
 }
diff --git a/libavcodec/riscv/vp7dsp_rvv.S b/libavcodec/riscv/vp7dsp_rvv.S
index 09dcbf3857..856b0e8c96 100644
--- a/libavcodec/riscv/vp7dsp_rvv.S
+++ b/libavcodec/riscv/vp7dsp_rvv.S
@@ -128,7 +128,8 @@ func ff_vp7_idct_add_rvv, zve32x
 endfunc
 #endif
 
-func ff_vp7_idct_dc_add4y_rvv, zve32x
+.irp type, y, uv
+func ff_vp7_idct_dc_add4\type\()_rvv, zve32x
 li   t0, 32
 vsetivli zero, 4, e16, mf2, ta, ma
 li   t1, 23170
@@ -141,5 +142,6 @@ func ff_vp7_idct_dc_add4y_rvv, zve32x
 vadd.vx  v0, v0, t2
 vsetvli  zero, zero, e16, mf2, ta, ma
 vnsra.wi v8, v0, 18   # 4x DC
-tail ff_vp78_idct_dc_add4y_rvv
+tail ff_vp78_idct_dc_add4\type\()_rvv
 endfunc
+.endr
diff --git a/libavcodec/riscv/vp8dsp_init.c b/libavcodec/riscv/vp8dsp_init.c
index 836237b41c..5911d195ba 100644
--- a/libavcodec/riscv/vp8dsp_init.c
+++ b/libavcodec/riscv/vp8dsp_init.c
@@ -131,9 +131,8 @@ av_cold void ff_vp8dsp_init_riscv(VP8DSPContext *c)
 #endif
 c->vp8_idct_dc_add = ff_vp8_idct_dc_add_rvv;
 c->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_rvv;
-if (flags & AV_CPU_FLAG_RVB_ADDR) {
+if (flags & AV_CPU_FLAG_RVV_I64)
 c->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_rvv;
-}
 }
 #endif
 }
diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S
index 458eebb306..c83f9eec71 100644
--- a/libavcodec/riscv/vp8dsp_rvv.S
+++ b/libavcodec/riscv/vp8dsp_rvv.S
@@ -157,6 +157,43 @@ func ff_vp78_idct_dc_add4y_rvv, zve32x
 ret
 endfunc
 
+func ff_vp8_idct_dc_add4uv_rvv, zve32x
+li   t0, 32
+vsetivli zero, 4, e16, mf2, ta, ma
+li   t1, 4 - (128 << 3)
+vlse16.v v8, (a1), t0
+vadd.vx  v8, v8, t1
+vsra.vi  v8, v8, 3
+# fall through
+endfunc
+
+.variant_cc ff_vp78_idct_dc_add4uv_rvv
+func ff_vp78_idct_dc_add4uv_rvv, zve64x
+vsetivlizero, 16, e16, m2, ta, ma
+vid.v   v4
+li  a4, 4
+vsrl.vi v4, v4, 2
+li  t1, 128
+vrgather.vv v0, v8, v4 # replicate each DC four times
+sllit2, a2, 2
+vsetivlizero, 2, e64, m1, ta, ma
+1:
+vlse64.vv8, (a0), t2
+addia4, a4, -1
+vsetivlizero, 16, e8, m1, ta, ma
+vwaddu.wv   v16, v0, v8
+sh  zero, (a1)
+vnclip.wi   v8, v16, 0
+addia1, a1, 32
+vxor.vx v8, v8, t1
+vsetivlizero, 2, e64, m1, ta, ma
+vsse64.vv8, (a0), t2
+add a0, a0, a2
+bneza4, 1b
+
+ret
+endfunc
+
 .macro vp8_idct_dc_add
 vlse32.v  v0, (a0), a2
 lha5, 0(a1)
@@ -179,19 +216,6 @@ endfunc
 addi  a1, a1, 32
 .endm
 
-func ff_vp8_idct_dc_add4uv_rvv, zve32x
- 

[FFmpeg-cvslog] lavc/vp8dsp: rework R-V V idct_dc_add4y

2024-06-05 Thread Rémi Denis-Courmont
ffmpeg | branch: master | Rémi Denis-Courmont  | Sun Jun  2 
12:13:25 2024 +0300| [225de53c9d446ddf1cc3ece6e99c06c8cce9b78f] | committer: 
Rémi Denis-Courmont

lavc/vp8dsp: rework R-V V idct_dc_add4y

DCT-related FFmpeg functions often add an unsigned 8-bit sample to a
signed 16-bit coefficient, then clip the result back to an unsigned
8-bit value. RISC-V has no signed 16-bit to unsigned 8-bit clip, so
instead our most common sequence is:
VWADDU.WV
set SEW to 16 bits
VMAX.VV zero # clip negative values to 0
set SEW to 8 bits
VNCLIPU.WI   # clip values over 255 to 255 and narrow

Here we use a different sequence which does not require toggling the
vector type. This assumes that the wide addend vector is biased by
-128:
VWADDU.WV
VNCLIP.WI    # clip values to signed 8-bit and narrow
VXOR.VX 0x80 # flip sign bit (convert signed to unsigned)

Also, the VMAX is effectively replaced by a VXOR of half the width. In
this function, the -128 bias comes for free, as we add a constant to the
wide vector in the prologue anyway.

On C908, this has no observable effects. On X60, this improves
microbenchmarks by about 20%.

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=225de53c9d446ddf1cc3ece6e99c06c8cce9b78f
---

 libavcodec/riscv/vp7dsp_rvv.S |  2 +-
 libavcodec/riscv/vp8dsp_rvv.S | 14 +++---
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/libavcodec/riscv/vp7dsp_rvv.S b/libavcodec/riscv/vp7dsp_rvv.S
index 39b23c2e79..09dcbf3857 100644
--- a/libavcodec/riscv/vp7dsp_rvv.S
+++ b/libavcodec/riscv/vp7dsp_rvv.S
@@ -134,7 +134,7 @@ func ff_vp7_idct_dc_add4y_rvv, zve32x
 li   t1, 23170
 vlse16.v v8, (a1), t0 # block[0..3][0]
 vwmul.vx v0, v8, t1
-li   t2, 0x2
+li   t2, 0x2 - (128 << 18)
 vsetvli  zero, zero, e32, m1, ta, ma
 vsra.vi  v0, v0, 14
 vmul.vx  v0, v0, t1
diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S
index 8ea0a0c9bd..458eebb306 100644
--- a/libavcodec/riscv/vp8dsp_rvv.S
+++ b/libavcodec/riscv/vp8dsp_rvv.S
@@ -125,31 +125,31 @@ endfunc
 func ff_vp8_idct_dc_add4y_rvv, zve32x
 li   t0, 32
 vsetivli zero, 4, e16, mf2, ta, ma
+li   t1, 4 - (128 << 3)
 vlse16.v v8, (a1), t0
-vadd.vi  v8, v8, 4
+vadd.vx  v8, v8, t1
 vsra.vi  v8, v8, 3
 # fall through
 endfunc
 
 .variant_cc ff_vp78_idct_dc_add4y_rvv
-# v8 = [dc0, dc1, dc2, dc3]
+# v8 = [dc0 - 128, dc1 - 128, dc2 - 128, dc3 - 128]
 func ff_vp78_idct_dc_add4y_rvv, zve32x
 vsetivlizero, 16, e16, m2, ta, ma
 vid.v   v4
+li  a4, 4
 vsrl.vi v4, v4, 2
+li  t1, 128
 vrgather.vv v0, v8, v4 # replicate each DC four times
 vsetvli zero, zero, e8, m1, ta, ma
-li  a4, 4
 1:
 vle8.v  v8, (a0)
 addia4, a4, -1
 vwaddu.wv   v16, v0, v8
 sh  zero, (a1)
-vsetvli zero, zero, e16, m2, ta, ma
-vmax.vx v16, v16, zero
+vnclip.wi   v8, v16, 0
 addia1, a1, 32
-vsetvli zero, zero, e8, m1, ta, ma
-vnclipu.wi  v8, v16, 0
+vxor.vx v8, v8, t1
 vse8.v  v8, (a0)
 add a0, a0, a2
 bneza4, 1b

___
ffmpeg-cvslog mailing list
ffmpeg-cvslog@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog

To unsubscribe, visit link above, or email
ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-cvslog] lavc/vp8dsp: remove no longer used macros

2024-06-05 Thread Rémi Denis-Courmont
ffmpeg | branch: master | Rémi Denis-Courmont  | Sun Jun  2 
13:23:24 2024 +0300| [0415bb74c81ab0019e48bd2989ddf48d66918e9e] | committer: 
Rémi Denis-Courmont

lavc/vp8dsp: remove no longer used macros

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=0415bb74c81ab0019e48bd2989ddf48d66918e9e
---

 libavcodec/riscv/vp8dsp_rvv.S | 22 --
 1 file changed, 22 deletions(-)

diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S
index c83f9eec71..82489a7f14 100644
--- a/libavcodec/riscv/vp8dsp_rvv.S
+++ b/libavcodec/riscv/vp8dsp_rvv.S
@@ -194,28 +194,6 @@ func ff_vp78_idct_dc_add4uv_rvv, zve64x
 ret
 endfunc
 
-.macro vp8_idct_dc_add
-vlse32.v  v0, (a0), a2
-lha5, 0(a1)
-shzero, 0(a1)
-addi  a5, a5, 4
-srai  t1, a5, 3
-vsetivli  zero, 4*4, e16, m2, ta, ma
-vzext.vf2 v2, v0
-vadd.vx   v2, v2, t1
-vmax.vx   v2, v2, zero
-vsetvli   zero, zero, e8, m1, ta, ma
-vnclipu.wiv0, v2, 0
-vsetivli  zero, 4, e8, mf4, ta, ma
-vsse32.v  v0, (a0), a2
-.endm
-
-.macro vp8_idct_dc_addy
-vp8_idct_dc_add
-addi  a0, a0, 4
-addi  a1, a1, 32
-.endm
-
 .macro bilin_load dst type mn
 .ifc \type,v
 add t5, a2, a3

___
ffmpeg-cvslog mailing list
ffmpeg-cvslog@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog

To unsubscribe, visit link above, or email
ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-cvslog] libavcodec/libxvid: code cleanup (replace magic numbers)

2024-06-05 Thread Ramiro Polla
ffmpeg | branch: master | Ramiro Polla  | Thu May 30 
23:20:07 2024 +0200| [01b1f4c9a5d1b7a2421f9bd6d600bcbe2519564a] | committer: 
Ramiro Polla

libavcodec/libxvid: code cleanup (replace magic numbers)

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=01b1f4c9a5d1b7a2421f9bd6d600bcbe2519564a
---

 libavcodec/libxvid.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libavcodec/libxvid.c b/libavcodec/libxvid.c
index b9ac39429d..a490f16b3f 100644
--- a/libavcodec/libxvid.c
+++ b/libavcodec/libxvid.c
@@ -422,13 +422,13 @@ static av_cold int xvid_encode_init(AVCodecContext *avctx)
 
 /* Decide how we should decide blocks */
 switch (avctx->mb_decision) {
-case 2:
+case FF_MB_DECISION_RD:
 x->vop_flags |=  XVID_VOP_MODEDECISION_RD;
 x->me_flags  |=  XVID_ME_HALFPELREFINE8_RD|
  XVID_ME_QUARTERPELREFINE8_RD |
  XVID_ME_EXTSEARCH_RD |
  XVID_ME_CHECKPREDICTION_RD;
-case 1:
+case FF_MB_DECISION_BITS:
 if (!(x->vop_flags & XVID_VOP_MODEDECISION_RD))
 x->vop_flags |= XVID_VOP_FAST_MODEDECISION_RD;
 x->me_flags |= XVID_ME_HALFPELREFINE16_RD |

___
ffmpeg-cvslog mailing list
ffmpeg-cvslog@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog

To unsubscribe, visit link above, or email
ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-cvslog] avcodec/mpegvideo_enc: give magic number a name

2024-06-05 Thread Ramiro Polla
ffmpeg | branch: master | Ramiro Polla  | Tue Jun  4 
15:05:35 2024 +0200| [2d24a80e5e419217f4e9ae22ab85a92a069756e0] | committer: 
Ramiro Polla

avcodec/mpegvideo_enc: give magic number a name

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=2d24a80e5e419217f4e9ae22ab85a92a069756e0
---

 libavcodec/mpegvideo_enc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libavcodec/mpegvideo_enc.c b/libavcodec/mpegvideo_enc.c
index 73a9082265..82bab43e14 100644
--- a/libavcodec/mpegvideo_enc.c
+++ b/libavcodec/mpegvideo_enc.c
@@ -562,7 +562,7 @@ av_cold int ff_mpv_encode_init(AVCodecContext *avctx)
 
 if ((s->mpv_flags & FF_MPV_FLAG_QP_RD) &&
 avctx->mb_decision != FF_MB_DECISION_RD) {
-av_log(avctx, AV_LOG_ERROR, "QP RD needs mbd=2\n");
+av_log(avctx, AV_LOG_ERROR, "QP RD needs mbd=rd\n");
 return AVERROR(EINVAL);
 }
 

___
ffmpeg-cvslog mailing list
ffmpeg-cvslog@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog

To unsubscribe, visit link above, or email
ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-cvslog] checkasm/sw_rgb: test rgb24/bgr24 to yuv

2024-06-05 Thread Zhao Zhili
ffmpeg | branch: master | Zhao Zhili  | Thu Jun  6 
01:34:45 2024 +0800| [47ba87551c24429a2a838d8da5e3e3e712584173] | committer: 
James Almer

checkasm/sw_rgb: test rgb24/bgr24 to yuv

Line width 8 is meant to test corner cases; its performance
doesn't matter. Width 1080 is also a case that is not aligned
to 16.

Width 1920 is meant for benchmarking (together with the --runs option).

Signed-off-by: James Almer 

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=47ba87551c24429a2a838d8da5e3e3e712584173
---

 tests/checkasm/sw_rgb.c | 109 
 1 file changed, 109 insertions(+)

diff --git a/tests/checkasm/sw_rgb.c b/tests/checkasm/sw_rgb.c
index 7cd815e5be..b51d0836c3 100644
--- a/tests/checkasm/sw_rgb.c
+++ b/tests/checkasm/sw_rgb.c
@@ -22,8 +22,11 @@
 #include "libavutil/common.h"
 #include "libavutil/intreadwrite.h"
 #include "libavutil/mem_internal.h"
+#include "libavutil/pixdesc.h"
 
 #include "libswscale/rgb2rgb.h"
+#include "libswscale/swscale.h"
+#include "libswscale/swscale_internal.h"
 
 #include "checkasm.h"
 
@@ -179,8 +182,100 @@ static void check_interleave_bytes(void)
 }
 }
 
+#define MAX_LINE_SIZE 1920
+static const int input_sizes[] = {8, 128, 1080, MAX_LINE_SIZE};
+static const enum AVPixelFormat rgb_formats[] = {
+AV_PIX_FMT_RGB24,
+AV_PIX_FMT_BGR24,
+};
+
+static void check_rgb_to_y(struct SwsContext *ctx)
+{
+LOCAL_ALIGNED_32(uint8_t, src, [MAX_LINE_SIZE * 3]);
+LOCAL_ALIGNED_32(uint8_t, dst0_y, [MAX_LINE_SIZE * 2]);
+LOCAL_ALIGNED_32(uint8_t, dst1_y, [MAX_LINE_SIZE * 2]);
+
+declare_func(void, uint8_t *dst, const uint8_t *src,
+ const uint8_t *unused1, const uint8_t *unused2, int width,
+ uint32_t *rgb2yuv, void *opq);
+
+randomize_buffers(src, MAX_LINE_SIZE * 3);
+
+for (int i = 0; i < FF_ARRAY_ELEMS(rgb_formats); i++) {
+const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(rgb_formats[i]);
+
+ctx->srcFormat = rgb_formats[i];
+ff_sws_init_scale(ctx);
+
+for (int j = 0; j < FF_ARRAY_ELEMS(input_sizes); j++) {
+int w = input_sizes[j];
+
+if (check_func(ctx->lumToYV12, "%s_to_y_%d", desc->name, w)) {
+memset(dst0_y, 0xFA, MAX_LINE_SIZE * 2);
+memset(dst1_y, 0xFA, MAX_LINE_SIZE * 2);
+
+call_ref(dst0_y, src, NULL, NULL, w, ctx->input_rgb2yuv_table, 
NULL);
+call_new(dst1_y, src, NULL, NULL, w, ctx->input_rgb2yuv_table, 
NULL);
+
+if (memcmp(dst0_y, dst1_y, w * 2))
+fail();
+
+bench_new(dst1_y, src, NULL, NULL, w, 
ctx->input_rgb2yuv_table, NULL);
+}
+}
+}
+}
+
+static void check_rgb_to_uv(struct SwsContext *ctx)
+{
+LOCAL_ALIGNED_32(uint8_t, src, [MAX_LINE_SIZE * 3]);
+LOCAL_ALIGNED_32(uint8_t, dst0_u, [MAX_LINE_SIZE * 2]);
+LOCAL_ALIGNED_32(uint8_t, dst0_v, [MAX_LINE_SIZE * 2]);
+LOCAL_ALIGNED_32(uint8_t, dst1_u, [MAX_LINE_SIZE * 2]);
+LOCAL_ALIGNED_32(uint8_t, dst1_v, [MAX_LINE_SIZE * 2]);
+
+declare_func(void, uint8_t *dstU, uint8_t *dstV,
+ const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
+ int width, uint32_t *pal, void *opq);
+
+randomize_buffers(src, MAX_LINE_SIZE * 3);
+
+for (int i = 0; i < 2 * FF_ARRAY_ELEMS(rgb_formats); i++) {
+enum AVPixelFormat src_fmt = rgb_formats[i / 2];
+const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(src_fmt);
+
+ctx->chrSrcHSubSample = (i % 2) ? 0 : 1;
+ctx->srcFormat = src_fmt;
+ctx->dstFormat = ctx->chrSrcHSubSample ? AV_PIX_FMT_YUV420P : 
AV_PIX_FMT_YUV444P;
+ff_sws_init_scale(ctx);
+
+for (int j = 0; j < FF_ARRAY_ELEMS(input_sizes); j++) {
+int w = input_sizes[j] >> ctx->chrSrcHSubSample;
+
+if (check_func(ctx->chrToYV12, "%s_to_uv%s_%d", desc->name,
+   ctx->chrSrcHSubSample ? "_half" : "",
+   input_sizes[j])) {
+memset(dst0_u, 0xFF, MAX_LINE_SIZE * 2);
+memset(dst0_v, 0xFF, MAX_LINE_SIZE * 2);
+memset(dst1_u, 0xFF, MAX_LINE_SIZE * 2);
+memset(dst1_v, 0xFF, MAX_LINE_SIZE * 2);
+
+call_ref(dst0_u, dst0_v, NULL, src, src, w, 
ctx->input_rgb2yuv_table, NULL);
+call_new(dst1_u, dst1_v, NULL, src, src, w, 
ctx->input_rgb2yuv_table, NULL);
+
+if (memcmp(dst0_u, dst1_u, w * 2) || memcmp(dst0_v, dst1_v, w 
* 2))
+fail();
+
+bench_new(dst1_u, dst1_v, NULL, src, src, w, 
ctx->input_rgb2yuv_table, NULL);
+}
+}
+}
+}
+
 void checkasm_check_sw_rgb(void)
 {
+struct SwsContext *ctx;
+
 ff_sws_rgb2rgb_init();
 
 check_shuffle_bytes(shuffle_bytes_2103, "shuffle_bytes_2103");
@@ -203,4 +298,18 @@ void checkasm_check_sw_rgb(void)
 
 check_interleave_bytes()