From 2b3d9c2776e56258b9bb4c15c7f315156d8a09f2 Mon Sep 17 00:00:00 2001
From: Martin Vignali <martin.vignali@gmail.com>
Date: Tue, 20 Mar 2018 21:02:49 +0100
Subject: [PATCH 2/4] swscale/rgb : add X86 SIMD (SSSE3), for 
 shuffle_bytes_1230, shuffle_bytes_3012, shuffle_bytes_3210

---
 libswscale/rgb2rgb.c          | 23 ++++-------------------
 libswscale/rgb2rgb.h          |  6 +++---
 libswscale/rgb2rgb_template.c | 21 +++++++++++++++++++++
 libswscale/x86/rgb2rgb.c      |  6 ++++++
 libswscale/x86/rgb_2_rgb.asm  |  6 ++++++
 5 files changed, 40 insertions(+), 22 deletions(-)

diff --git a/libswscale/rgb2rgb.c b/libswscale/rgb2rgb.c
index 04b7908b5e..eab8e6aebb 100644
--- a/libswscale/rgb2rgb.c
+++ b/libswscale/rgb2rgb.c
@@ -53,6 +53,10 @@ void (*rgb15to32)(const uint8_t *src, uint8_t *dst, int src_size);
 
 void (*shuffle_bytes_0321)(const uint8_t *src, uint8_t *dst, int src_size);
 void (*shuffle_bytes_2103)(const uint8_t *src, uint8_t *dst, int src_size);
+void (*shuffle_bytes_1230)(const uint8_t *src, uint8_t *dst, int src_size);
+void (*shuffle_bytes_3012)(const uint8_t *src, uint8_t *dst, int src_size);
+void (*shuffle_bytes_3210)(const uint8_t *src, uint8_t *dst, int src_size);
+
 
 void (*yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc,
                    const uint8_t *vsrc, uint8_t *dst,
@@ -319,25 +323,6 @@ void rgb12tobgr12(const uint8_t *src, uint8_t *dst, int src_size)
     }
 }
 
-
-#define DEFINE_SHUFFLE_BYTES(a, b, c, d)                                \
-void shuffle_bytes_ ## a ## b ## c ## d(const uint8_t *src,             \
-                                        uint8_t *dst, int src_size)     \
-{                                                                       \
-    int i;                                                              \
-                                                                        \
-    for (i = 0; i < src_size; i += 4) {                                 \
-        dst[i + 0] = src[i + a];                                        \
-        dst[i + 1] = src[i + b];                                        \
-        dst[i + 2] = src[i + c];                                        \
-        dst[i + 3] = src[i + d];                                        \
-    }                                                                   \
-}
-
-DEFINE_SHUFFLE_BYTES(1, 2, 3, 0)
-DEFINE_SHUFFLE_BYTES(3, 0, 1, 2)
-DEFINE_SHUFFLE_BYTES(3, 2, 1, 0)
-
 #define DEFINE_RGB48TOBGR48(need_bswap, swap)                           \
 void rgb48tobgr48_ ## need_bswap(const uint8_t *src,                    \
                                  uint8_t *dst, int src_size)            \
diff --git a/libswscale/rgb2rgb.h b/libswscale/rgb2rgb.h
index 6994839299..08de546598 100644
--- a/libswscale/rgb2rgb.h
+++ b/libswscale/rgb2rgb.h
@@ -76,9 +76,9 @@ void rgb15tobgr15(const uint8_t *src, uint8_t *dst, int src_size);
 void rgb12tobgr12(const uint8_t *src, uint8_t *dst, int src_size);
 void    rgb12to15(const uint8_t *src, uint8_t *dst, int src_size);
 
-void shuffle_bytes_1230(const uint8_t *src, uint8_t *dst, int src_size);
-void shuffle_bytes_3012(const uint8_t *src, uint8_t *dst, int src_size);
-void shuffle_bytes_3210(const uint8_t *src, uint8_t *dst, int src_size);
+extern void (*shuffle_bytes_1230)(const uint8_t *src, uint8_t *dst, int src_size);
+extern void (*shuffle_bytes_3012)(const uint8_t *src, uint8_t *dst, int src_size);
+extern void (*shuffle_bytes_3210)(const uint8_t *src, uint8_t *dst, int src_size);
 
 void ff_rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
                       uint8_t *vdst, int width, int height, int lumStride,
diff --git a/libswscale/rgb2rgb_template.c b/libswscale/rgb2rgb_template.c
index 499d25b26d..fb7d663ccc 100644
--- a/libswscale/rgb2rgb_template.c
+++ b/libswscale/rgb2rgb_template.c
@@ -342,6 +342,24 @@ static inline void shuffle_bytes_0321_c(const uint8_t *src, uint8_t *dst,
     }
 }
 
+#define DEFINE_SHUFFLE_BYTES(name, a, b, c, d)                          \
+static void shuffle_bytes_##name (const uint8_t *src,                   \
+                                        uint8_t *dst, int src_size)     \
+{                                                                       \
+    int i;                                                              \
+                                                                        \
+    for (i = 0; i < src_size; i += 4) {                                 \
+        dst[i + 0] = src[i + a];                                        \
+        dst[i + 1] = src[i + b];                                        \
+        dst[i + 2] = src[i + c];                                        \
+        dst[i + 3] = src[i + d];                                        \
+    }                                                                   \
+}
+
+DEFINE_SHUFFLE_BYTES(1230_c, 1, 2, 3, 0)
+DEFINE_SHUFFLE_BYTES(3012_c, 3, 0, 1, 2)
+DEFINE_SHUFFLE_BYTES(3210_c, 3, 2, 1, 0)
+
 static inline void rgb24tobgr24_c(const uint8_t *src, uint8_t *dst, int src_size)
 {
     unsigned i;
@@ -949,6 +967,9 @@ static av_cold void rgb2rgb_init_c(void)
 #else
     shuffle_bytes_0321 = shuffle_bytes_0321_c;
     shuffle_bytes_2103 = shuffle_bytes_2103_c;
+    shuffle_bytes_1230 = shuffle_bytes_1230_c;
+    shuffle_bytes_3012 = shuffle_bytes_3012_c;
+    shuffle_bytes_3210 = shuffle_bytes_3210_c;
 #endif
     rgb32tobgr16       = rgb32tobgr16_c;
     rgb32tobgr15       = rgb32tobgr15_c;
diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c
index 5caabf03ed..e5f318a72c 100644
--- a/libswscale/x86/rgb2rgb.c
+++ b/libswscale/x86/rgb2rgb.c
@@ -146,6 +146,9 @@ DECLARE_ALIGNED(8, extern const uint64_t, ff_bgr2UVOffset);
 
 void ff_shuffle_bytes_2103_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
 void ff_shuffle_bytes_0321_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
+void ff_shuffle_bytes_1230_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
+void ff_shuffle_bytes_3012_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
+void ff_shuffle_bytes_3210_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
 
 av_cold void rgb2rgb_init_x86(void)
 {
@@ -167,5 +170,8 @@ av_cold void rgb2rgb_init_x86(void)
     if (EXTERNAL_SSSE3(cpu_flags)) {
         shuffle_bytes_0321 = ff_shuffle_bytes_0321_ssse3;
         shuffle_bytes_2103 = ff_shuffle_bytes_2103_ssse3;
+        shuffle_bytes_1230 = ff_shuffle_bytes_1230_ssse3;
+        shuffle_bytes_3012 = ff_shuffle_bytes_3012_ssse3;
+        shuffle_bytes_3210 = ff_shuffle_bytes_3210_ssse3;
     }
 }
diff --git a/libswscale/x86/rgb_2_rgb.asm b/libswscale/x86/rgb_2_rgb.asm
index e4104d9be2..db45e313d8 100644
--- a/libswscale/x86/rgb_2_rgb.asm
+++ b/libswscale/x86/rgb_2_rgb.asm
@@ -26,6 +26,9 @@ SECTION_RODATA
 
 pb_shuffle2103: db 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15
 pb_shuffle0321: db 0, 3, 2, 1, 4, 7, 6, 5, 8, 11, 10, 9, 12, 15, 14, 13
+pb_shuffle1230: db 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
+pb_shuffle3012: db 3, 0, 1, 2, 7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14
+pb_shuffle3210: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
 
 SECTION .text
 
@@ -78,3 +81,6 @@ jge .end
 INIT_XMM ssse3
 SHUFFLE_BYTES 2, 1, 0, 3
 SHUFFLE_BYTES 0, 3, 2, 1
+SHUFFLE_BYTES 1, 2, 3, 0
+SHUFFLE_BYTES 3, 0, 1, 2
+SHUFFLE_BYTES 3, 2, 1, 0
-- 
2.14.3 (Apple Git-98)

