From b844f1992824efe606610283e9f4c3a892bcede7 Mon Sep 17 00:00:00 2001
From: Martin Vignali <martin.vignali@gmail.com>
Date: Mon, 8 Oct 2018 19:18:36 +0200
Subject: [PATCH] swscale/x86/rgb2rgb : port shuffle2103 to external asm
 (mmx/mmxext)

and remove the inline asm version
---
 libswscale/x86/rgb2rgb.c          |  9 +++-
 libswscale/x86/rgb2rgb_template.c | 63 --------------------------
 libswscale/x86/rgb_2_rgb.asm      | 94 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 102 insertions(+), 64 deletions(-)

diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c
index 1191081440..dd508fab9b 100644
--- a/libswscale/x86/rgb2rgb.c
+++ b/libswscale/x86/rgb2rgb.c
@@ -143,7 +143,8 @@ DECLARE_ALIGNED(8, extern const uint64_t, ff_bgr2UVOffset);
 */
 
 #endif /* HAVE_INLINE_ASM */
-
+void ff_shuffle_bytes_2103_mmx(const uint8_t *src, uint8_t *dst, int src_size);
+void ff_shuffle_bytes_2103_mmxext(const uint8_t *src, uint8_t *dst, int src_size);
 void ff_shuffle_bytes_2103_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
 void ff_shuffle_bytes_0321_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
 void ff_shuffle_bytes_1230_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
@@ -176,6 +177,12 @@ av_cold void rgb2rgb_init_x86(void)
         rgb2rgb_init_avx();
 #endif /* HAVE_INLINE_ASM */
 
+    if (EXTERNAL_MMX(cpu_flags)) {
+        shuffle_bytes_2103 = ff_shuffle_bytes_2103_mmx;
+    }
+    if (EXTERNAL_MMXEXT(cpu_flags)) {
+        shuffle_bytes_2103 = ff_shuffle_bytes_2103_mmxext;
+    }
     if (EXTERNAL_SSE2(cpu_flags)) {
 #if ARCH_X86_64
         uyvytoyuv422 = ff_uyvytoyuv422_sse2;
diff --git a/libswscale/x86/rgb2rgb_template.c b/libswscale/x86/rgb2rgb_template.c
index d59bd5679e..ae2469e663 100644
--- a/libswscale/x86/rgb2rgb_template.c
+++ b/libswscale/x86/rgb2rgb_template.c
@@ -1034,68 +1034,6 @@ static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, int src_s
     }
 }
 
-static inline void RENAME(shuffle_bytes_2103)(const uint8_t *src, uint8_t *dst, int src_size)
-{
-    x86_reg idx = 15 - src_size;
-    const uint8_t *s = src-idx;
-    uint8_t *d = dst-idx;
-    __asm__ volatile(
-        "test          %0, %0           \n\t"
-        "jns           2f               \n\t"
-        PREFETCH"       (%1, %0)        \n\t"
-        "movq          %3, %%mm7        \n\t"
-        "pxor          %4, %%mm7        \n\t"
-        "movq       %%mm7, %%mm6        \n\t"
-        "pxor          %5, %%mm7        \n\t"
-        ".p2align       4               \n\t"
-        "1:                             \n\t"
-        PREFETCH"     32(%1, %0)        \n\t"
-        "movq           (%1, %0), %%mm0 \n\t"
-        "movq          8(%1, %0), %%mm1 \n\t"
-# if COMPILE_TEMPLATE_MMXEXT
-        "pshufw      $177, %%mm0, %%mm3 \n\t"
-        "pshufw      $177, %%mm1, %%mm5 \n\t"
-        "pand       %%mm7, %%mm0        \n\t"
-        "pand       %%mm6, %%mm3        \n\t"
-        "pand       %%mm7, %%mm1        \n\t"
-        "pand       %%mm6, %%mm5        \n\t"
-        "por        %%mm3, %%mm0        \n\t"
-        "por        %%mm5, %%mm1        \n\t"
-# else
-        "movq       %%mm0, %%mm2        \n\t"
-        "movq       %%mm1, %%mm4        \n\t"
-        "pand       %%mm7, %%mm0        \n\t"
-        "pand       %%mm6, %%mm2        \n\t"
-        "pand       %%mm7, %%mm1        \n\t"
-        "pand       %%mm6, %%mm4        \n\t"
-        "movq       %%mm2, %%mm3        \n\t"
-        "movq       %%mm4, %%mm5        \n\t"
-        "pslld        $16, %%mm2        \n\t"
-        "psrld        $16, %%mm3        \n\t"
-        "pslld        $16, %%mm4        \n\t"
-        "psrld        $16, %%mm5        \n\t"
-        "por        %%mm2, %%mm0        \n\t"
-        "por        %%mm4, %%mm1        \n\t"
-        "por        %%mm3, %%mm0        \n\t"
-        "por        %%mm5, %%mm1        \n\t"
-# endif
-        MOVNTQ"     %%mm0,  (%2, %0)    \n\t"
-        MOVNTQ"     %%mm1, 8(%2, %0)    \n\t"
-        "add          $16, %0           \n\t"
-        "js            1b               \n\t"
-        SFENCE"                         \n\t"
-        EMMS"                           \n\t"
-        "2:                             \n\t"
-        : "+&r"(idx)
-        : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
-        : "memory");
-    for (; idx<15; idx+=4) {
-        register unsigned v  = *(const uint32_t *)&s[idx], g = v & 0xff00ff00;
-        v &= 0xff00ff;
-        *(uint32_t *)&d[idx] = (v>>16) + g + (v<<16);
-    }
-}
-
 static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
 {
     unsigned i;
@@ -2572,7 +2510,6 @@ static av_cold void RENAME(rgb2rgb_init)(void)
     rgb24to15          = RENAME(rgb24to15);
     rgb24to16          = RENAME(rgb24to16);
     rgb24tobgr24       = RENAME(rgb24tobgr24);
-    shuffle_bytes_2103 = RENAME(shuffle_bytes_2103);
     rgb32tobgr16       = RENAME(rgb32tobgr16);
     rgb32tobgr15       = RENAME(rgb32tobgr15);
     yv12toyuy2         = RENAME(yv12toyuy2);
diff --git a/libswscale/x86/rgb_2_rgb.asm b/libswscale/x86/rgb_2_rgb.asm
index 156b4d2c74..b753da0ba9 100644
--- a/libswscale/x86/rgb_2_rgb.asm
+++ b/libswscale/x86/rgb_2_rgb.asm
@@ -24,6 +24,7 @@
 
 SECTION_RODATA
 
+pb_mask_shuffle2103_mmx times 8 dw 255
 pb_shuffle2103: db 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15
 pb_shuffle0321: db 0, 3, 2, 1, 4, 7, 6, 5, 8, 11, 10, 9, 12, 15, 14, 13
 pb_shuffle1230: db 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
@@ -42,6 +43,99 @@ SECTION .text
 %endif
 %endmacro
 
+;------------------------------------------------------------------------------
+; shuffle_bytes_2103 mmx/mmext (const uint8_t *src, uint8_t *dst, int src_size)
+;------------------------------------------------------------------------------
+%macro SHUFFLE_BYTES_2103_MMX 0
+cglobal shuffle_bytes_2103, 3, 5, 2, src, dst, w, tmp, x
+    mova   m6, [pb_mask_shuffle2103_mmx]
+    mova   m7, m6
+    psllq  m7, 8
+
+    movsxdifnidn wq, wd
+    mov xq, wq
+
+    add        srcq, wq
+    add        dstq, wq
+    neg          wq
+
+;calc scalar loop
+    and xq, mmsize*2 -4
+    je .loop_simd
+
+.loop_scalar:
+   mov          tmpb, [srcq + wq + 2]
+   mov [dstq+wq + 0], tmpb
+   mov          tmpb, [srcq + wq + 1]
+   mov [dstq+wq + 1], tmpb
+   mov          tmpb, [srcq + wq + 0]
+   mov [dstq+wq + 2], tmpb
+   mov          tmpb, [srcq + wq + 3]
+   mov [dstq+wq + 3], tmpb
+   add            wq, 4
+   sub            xq, 4
+   jg .loop_scalar
+
+;check if src_size < mmsize * 2
+cmp wq, 0
+jge .end
+
+.loop_simd:
+    movu     m0, [srcq+wq]
+    movu     m1, [srcq+wq+8]
+
+%if cpuflag(mmxext)
+    pshufw   m3, m0, 177
+    pshufw   m5, m1, 177
+
+    pand     m0, m7
+    pand     m3, m6
+
+    pand     m1, m7
+    pand     m5, m6
+
+    por      m0, m3
+    por      m1, m5
+%else
+    mova     m2, m0
+    mova     m4, m1
+
+    pand     m0, m7
+    pand     m2, m6
+
+    pand     m1, m7
+    pand     m4, m6
+
+    mova     m3, m2
+    mova     m5, m4
+
+    pslld    m2, 16
+    psrld    m3, 16
+
+    pslld    m4, 16
+    psrld    m5, 16
+
+    por      m0, m2
+    por      m1, m4
+
+    por      m0, m3
+    por      m1, m5
+%endif
+    movu      [dstq+wq], m0
+    movu  [dstq+wq + 8], m1
+
+    add              wq, mmsize*2
+    jl .loop_simd
+
+.end:
+    RET
+%endmacro
+
+INIT_MMX mmx
+SHUFFLE_BYTES_2103_MMX
+
+INIT_MMX mmxext
+SHUFFLE_BYTES_2103_MMX
 ;------------------------------------------------------------------------------
 ; shuffle_bytes_## (const uint8_t *src, uint8_t *dst, int src_size)
 ;------------------------------------------------------------------------------
-- 
2.14.3 (Apple Git-98)

