Remove the MMX versions of these functions and modify the SSE
implementations to avoid using MMX registers.

Signed-off-by: Frank Plowman <p...@frankplowman.com>
---
This wasn't wholly straightforward, as the existing SSE implementation
did not use SSE exclusively but rather a blend of SSE and MMX.  I would
appreciate some review, as I am not particularly familiar with x86
assembly.  Of particular interest are the changes to
{READ,WRITE}_NUM_BYTES.  The new implementation does not make as
economical use of the XMM registers as the old one did, but it still
does not exhaust them -- was this more of a concern when this was
originally written?
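
For what it's worth, my reading of the new READ_NUM_BYTES for a 12-byte
width (%2 == 12, mmsize == 16) is that it expands to roughly the
following -- this is a hand expansion on my part, not assembler output,
so please say if I have misread the macro:

    ; %rep 12/16 runs zero times, so there is no full 16-byte movu
    movq   xmm0, [srcq+0]    ; (12-0) >= 8 and 12 is not > 16: 8-byte load
    movd   xmm1, [srcq+8]    ; (12-8) >= 4 but not > 4: 4-byte load
    ; (12-12) == 0, so nothing goes through valb/valw/vald

WRITE_NUM_BYTES mirrors this with the corresponding stores.  As far as
I can tell, the old code for this width kept both pieces in MMX
registers instead of using two XMM registers.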
---
 libavcodec/x86/videodsp.asm    | 63 ++++++----------------
 libavcodec/x86/videodsp_init.c | 99 +++++++++++++++++-----------------
 2 files changed, 65 insertions(+), 97 deletions(-)

diff --git a/libavcodec/x86/videodsp.asm b/libavcodec/x86/videodsp.asm
index 3cc07878d3..20dcd3e2b9 100644
--- a/libavcodec/x86/videodsp.asm
+++ b/libavcodec/x86/videodsp.asm
@@ -123,54 +123,43 @@ hvar_fn
 ;         - if (%2 & 8)  fills 8 bytes into xmm$next
 ;         - if (%2 & 4)  fills 4 bytes into xmm$next
 ;         - if (%2 & 3)  fills 1, 2 or 4 bytes in eax
-; on mmx, - fills mm0-7 for consecutive sets of 8 pixels
-;         - if (%2 & 4)  fills 4 bytes into mm$next
-;         - if (%2 & 3)  fills 1, 2 or 4 bytes in eax
 ; writing data out is in the same way
 %macro READ_NUM_BYTES 2
 %assign %%off 0     ; offset in source buffer
-%assign %%mmx_idx 0 ; mmx register index
 %assign %%xmm_idx 0 ; xmm register index
 
 %rep %2/mmsize
-%if mmsize == 16
     movu   xmm %+ %%xmm_idx, [srcq+%%off]
 %assign %%xmm_idx %%xmm_idx+1
-%else ; mmx
-    movu    mm %+ %%mmx_idx, [srcq+%%off]
-%assign %%mmx_idx %%mmx_idx+1
-%endif
 %assign %%off %%off+mmsize
 %endrep ; %2/mmsize
 
-%if mmsize == 16
 %if (%2-%%off) >= 8
 %if %2 > 16 && (%2-%%off) > 8
     movu   xmm %+ %%xmm_idx, [srcq+%2-16]
 %assign %%xmm_idx %%xmm_idx+1
 %assign %%off %2
 %else
-    movq    mm %+ %%mmx_idx, [srcq+%%off]
-%assign %%mmx_idx %%mmx_idx+1
+    movq   xmm %+ %%xmm_idx, [srcq+%%off]
+%assign %%xmm_idx %%xmm_idx+1
 %assign %%off %%off+8
 %endif
 %endif ; (%2-%%off) >= 8
-%endif
 
 %if (%2-%%off) >= 4
 %if %2 > 8 && (%2-%%off) > 4
-    movq    mm %+ %%mmx_idx, [srcq+%2-8]
+    movq   xmm %+ %%xmm_idx, [srcq+%2-8]
 %assign %%off %2
 %else
-    movd    mm %+ %%mmx_idx, [srcq+%%off]
+    movd   xmm %+ %%xmm_idx, [srcq+%%off]
 %assign %%off %%off+4
 %endif
-%assign %%mmx_idx %%mmx_idx+1
+%assign %%xmm_idx %%xmm_idx+1
 %endif ; (%2-%%off) >= 4
 
 %if (%2-%%off) >= 1
 %if %2 >= 4
-    movd mm %+ %%mmx_idx, [srcq+%2-4]
+    movd xmm %+ %%xmm_idx, [srcq+%2-4]
 %elif (%2-%%off) == 1
     mov            valb, [srcq+%2-1]
 %elif (%2-%%off) == 2
@@ -185,48 +174,40 @@ hvar_fn
 
 %macro WRITE_NUM_BYTES 2
 %assign %%off 0     ; offset in destination buffer
-%assign %%mmx_idx 0 ; mmx register index
 %assign %%xmm_idx 0 ; xmm register index
 
 %rep %2/mmsize
-%if mmsize == 16
     movu   [dstq+%%off], xmm %+ %%xmm_idx
 %assign %%xmm_idx %%xmm_idx+1
-%else ; mmx
-    movu   [dstq+%%off], mm %+ %%mmx_idx
-%assign %%mmx_idx %%mmx_idx+1
-%endif
 %assign %%off %%off+mmsize
 %endrep ; %2/mmsize
 
-%if mmsize == 16
 %if (%2-%%off) >= 8
 %if %2 > 16 && (%2-%%off) > 8
     movu   [dstq+%2-16], xmm %+ %%xmm_idx
 %assign %%xmm_idx %%xmm_idx+1
 %assign %%off %2
 %else
-    movq   [dstq+%%off], mm %+ %%mmx_idx
-%assign %%mmx_idx %%mmx_idx+1
+    movq   [dstq+%%off], xmm %+ %%xmm_idx
+%assign %%xmm_idx %%xmm_idx+1
 %assign %%off %%off+8
 %endif
 %endif ; (%2-%%off) >= 8
-%endif
 
 %if (%2-%%off) >= 4
 %if %2 > 8 && (%2-%%off) > 4
-    movq    [dstq+%2-8], mm %+ %%mmx_idx
+    movq    [dstq+%2-8], xmm %+ %%xmm_idx
 %assign %%off %2
 %else
-    movd   [dstq+%%off], mm %+ %%mmx_idx
+    movd   [dstq+%%off], xmm %+ %%xmm_idx
 %assign %%off %%off+4
 %endif
-%assign %%mmx_idx %%mmx_idx+1
+%assign %%xmm_idx %%xmm_idx+1
 %endif ; (%2-%%off) >= 4
 
 %if (%2-%%off) >= 1
 %if %2 >= 4
-    movd    [dstq+%2-4], mm %+ %%mmx_idx
+    movd    [dstq+%2-4], xmm %+ %%xmm_idx
 %elif (%2-%%off) == 1
     mov     [dstq+%2-1], valb
 %elif (%2-%%off) == 2
@@ -318,11 +299,8 @@ cglobal emu_edge_vfix %+ %%n, 1, 5, 1, dst, src, start_y, end_y, bh
 %endrep ; 1+%2-%1
 %endmacro ; VERTICAL_EXTEND
 
-INIT_MMX mmx
-VERTICAL_EXTEND 1, 15
-
-INIT_XMM sse
-VERTICAL_EXTEND 16, 22
+INIT_XMM sse2
+VERTICAL_EXTEND 1, 22
 
 ; left/right (horizontal) fast extend functions
 ; these are essentially identical to the vertical extend ones above,
@@ -337,11 +315,7 @@ VERTICAL_EXTEND 16, 22
     imul           vald, 0x01010101
 %if %1 >= 8
     movd             m0, vald
-%if mmsize == 16
     pshufd           m0, m0, q0000
-%else
-    punpckldq        m0, m0
-%endif ; mmsize == 16
 %endif ; %1 > 16
 %endif ; avx2
 %endmacro ; READ_V_PIXEL
@@ -356,7 +330,6 @@ VERTICAL_EXTEND 16, 22
 %assign %%off %%off+mmsize
 %endrep ; %1/mmsize
 
-%if mmsize == 16
 %if %1-%%off >= 8
 %if %1 > 16 && %1-%%off > 8
     movu     [%2+%1-16], m0
@@ -366,7 +339,6 @@ VERTICAL_EXTEND 16, 22
 %assign %%off %%off+8
 %endif
 %endif ; %1-%%off >= 8
-%endif ; mmsize == 16
 
 %if %1-%%off >= 4
 %if %1 > 8 && %1-%%off > 4
@@ -415,18 +387,15 @@ cglobal emu_edge_hfix %+ %%n, 4, 5, 1, dst, dst_stride, start_x, bh, val
 %endrep ; 1+(%2-%1)/2
 %endmacro ; H_EXTEND
 
-INIT_MMX mmx
-H_EXTEND 2, 14
-
 INIT_XMM sse2
-H_EXTEND 16, 22
+H_EXTEND 2, 22
 
 %if HAVE_AVX2_EXTERNAL
 INIT_XMM avx2
 H_EXTEND 8, 22
 %endif
 
-INIT_MMX mmxext
+INIT_XMM sse2
 cglobal prefetch, 3, 3, 0, buf, stride, h
 .loop:
     prefetcht0 [bufq]
diff --git a/libavcodec/x86/videodsp_init.c b/libavcodec/x86/videodsp_init.c
index ae9db95624..376eec9c23 100644
--- a/libavcodec/x86/videodsp_init.c
+++ b/libavcodec/x86/videodsp_init.c
@@ -37,37 +37,37 @@ typedef void emu_edge_vvar_func(uint8_t *dst, x86_reg dst_stride,
                                 x86_reg start_y, x86_reg end_y, x86_reg bh,
                                 x86_reg w);
 
-extern emu_edge_vfix_func ff_emu_edge_vfix1_mmx;
-extern emu_edge_vfix_func ff_emu_edge_vfix2_mmx;
-extern emu_edge_vfix_func ff_emu_edge_vfix3_mmx;
-extern emu_edge_vfix_func ff_emu_edge_vfix4_mmx;
-extern emu_edge_vfix_func ff_emu_edge_vfix5_mmx;
-extern emu_edge_vfix_func ff_emu_edge_vfix6_mmx;
-extern emu_edge_vfix_func ff_emu_edge_vfix7_mmx;
-extern emu_edge_vfix_func ff_emu_edge_vfix8_mmx;
-extern emu_edge_vfix_func ff_emu_edge_vfix9_mmx;
-extern emu_edge_vfix_func ff_emu_edge_vfix10_mmx;
-extern emu_edge_vfix_func ff_emu_edge_vfix11_mmx;
-extern emu_edge_vfix_func ff_emu_edge_vfix12_mmx;
-extern emu_edge_vfix_func ff_emu_edge_vfix13_mmx;
-extern emu_edge_vfix_func ff_emu_edge_vfix14_mmx;
-extern emu_edge_vfix_func ff_emu_edge_vfix15_mmx;
-extern emu_edge_vfix_func ff_emu_edge_vfix16_sse;
-extern emu_edge_vfix_func ff_emu_edge_vfix17_sse;
-extern emu_edge_vfix_func ff_emu_edge_vfix18_sse;
-extern emu_edge_vfix_func ff_emu_edge_vfix19_sse;
-extern emu_edge_vfix_func ff_emu_edge_vfix20_sse;
-extern emu_edge_vfix_func ff_emu_edge_vfix21_sse;
-extern emu_edge_vfix_func ff_emu_edge_vfix22_sse;
-static emu_edge_vfix_func * const vfixtbl_sse[22] = {
-    ff_emu_edge_vfix1_mmx,  ff_emu_edge_vfix2_mmx,  ff_emu_edge_vfix3_mmx,
-    ff_emu_edge_vfix4_mmx,  ff_emu_edge_vfix5_mmx,  ff_emu_edge_vfix6_mmx,
-    ff_emu_edge_vfix7_mmx,  ff_emu_edge_vfix8_mmx,  ff_emu_edge_vfix9_mmx,
-    ff_emu_edge_vfix10_mmx, ff_emu_edge_vfix11_mmx, ff_emu_edge_vfix12_mmx,
-    ff_emu_edge_vfix13_mmx, ff_emu_edge_vfix14_mmx, ff_emu_edge_vfix15_mmx,
-    ff_emu_edge_vfix16_sse, ff_emu_edge_vfix17_sse, ff_emu_edge_vfix18_sse,
-    ff_emu_edge_vfix19_sse, ff_emu_edge_vfix20_sse, ff_emu_edge_vfix21_sse,
-    ff_emu_edge_vfix22_sse
+extern emu_edge_vfix_func ff_emu_edge_vfix1_sse2;
+extern emu_edge_vfix_func ff_emu_edge_vfix2_sse2;
+extern emu_edge_vfix_func ff_emu_edge_vfix3_sse2;
+extern emu_edge_vfix_func ff_emu_edge_vfix4_sse2;
+extern emu_edge_vfix_func ff_emu_edge_vfix5_sse2;
+extern emu_edge_vfix_func ff_emu_edge_vfix6_sse2;
+extern emu_edge_vfix_func ff_emu_edge_vfix7_sse2;
+extern emu_edge_vfix_func ff_emu_edge_vfix8_sse2;
+extern emu_edge_vfix_func ff_emu_edge_vfix9_sse2;
+extern emu_edge_vfix_func ff_emu_edge_vfix10_sse2;
+extern emu_edge_vfix_func ff_emu_edge_vfix11_sse2;
+extern emu_edge_vfix_func ff_emu_edge_vfix12_sse2;
+extern emu_edge_vfix_func ff_emu_edge_vfix13_sse2;
+extern emu_edge_vfix_func ff_emu_edge_vfix14_sse2;
+extern emu_edge_vfix_func ff_emu_edge_vfix15_sse2;
+extern emu_edge_vfix_func ff_emu_edge_vfix16_sse2;
+extern emu_edge_vfix_func ff_emu_edge_vfix17_sse2;
+extern emu_edge_vfix_func ff_emu_edge_vfix18_sse2;
+extern emu_edge_vfix_func ff_emu_edge_vfix19_sse2;
+extern emu_edge_vfix_func ff_emu_edge_vfix20_sse2;
+extern emu_edge_vfix_func ff_emu_edge_vfix21_sse2;
+extern emu_edge_vfix_func ff_emu_edge_vfix22_sse2;
+static emu_edge_vfix_func * const vfixtbl_sse2[22] = {
+    ff_emu_edge_vfix1_sse2,  ff_emu_edge_vfix2_sse2,  ff_emu_edge_vfix3_sse2,
+    ff_emu_edge_vfix4_sse2,  ff_emu_edge_vfix5_sse2,  ff_emu_edge_vfix6_sse2,
+    ff_emu_edge_vfix7_sse2,  ff_emu_edge_vfix8_sse2,  ff_emu_edge_vfix9_sse2,
+    ff_emu_edge_vfix10_sse2, ff_emu_edge_vfix11_sse2, ff_emu_edge_vfix12_sse2,
+    ff_emu_edge_vfix13_sse2, ff_emu_edge_vfix14_sse2, ff_emu_edge_vfix15_sse2,
+    ff_emu_edge_vfix16_sse2, ff_emu_edge_vfix17_sse2, ff_emu_edge_vfix18_sse2,
+    ff_emu_edge_vfix19_sse2, ff_emu_edge_vfix20_sse2, ff_emu_edge_vfix21_sse2,
+    ff_emu_edge_vfix22_sse2
 };
 extern emu_edge_vvar_func ff_emu_edge_vvar_sse;
 
@@ -76,21 +76,21 @@ typedef void emu_edge_hfix_func(uint8_t *dst, x86_reg dst_stride,
 typedef void emu_edge_hvar_func(uint8_t *dst, x86_reg dst_stride,
                                 x86_reg start_x, x86_reg n_words, x86_reg bh);
 
-extern emu_edge_hfix_func ff_emu_edge_hfix2_mmx;
-extern emu_edge_hfix_func ff_emu_edge_hfix4_mmx;
-extern emu_edge_hfix_func ff_emu_edge_hfix6_mmx;
-extern emu_edge_hfix_func ff_emu_edge_hfix8_mmx;
-extern emu_edge_hfix_func ff_emu_edge_hfix10_mmx;
-extern emu_edge_hfix_func ff_emu_edge_hfix12_mmx;
-extern emu_edge_hfix_func ff_emu_edge_hfix14_mmx;
+extern emu_edge_hfix_func ff_emu_edge_hfix2_sse2;
+extern emu_edge_hfix_func ff_emu_edge_hfix4_sse2;
+extern emu_edge_hfix_func ff_emu_edge_hfix6_sse2;
+extern emu_edge_hfix_func ff_emu_edge_hfix8_sse2;
+extern emu_edge_hfix_func ff_emu_edge_hfix10_sse2;
+extern emu_edge_hfix_func ff_emu_edge_hfix12_sse2;
+extern emu_edge_hfix_func ff_emu_edge_hfix14_sse2;
 extern emu_edge_hfix_func ff_emu_edge_hfix16_sse2;
 extern emu_edge_hfix_func ff_emu_edge_hfix18_sse2;
 extern emu_edge_hfix_func ff_emu_edge_hfix20_sse2;
 extern emu_edge_hfix_func ff_emu_edge_hfix22_sse2;
 static emu_edge_hfix_func * const hfixtbl_sse2[11] = {
-    ff_emu_edge_hfix2_mmx,  ff_emu_edge_hfix4_mmx,  ff_emu_edge_hfix6_mmx,
-    ff_emu_edge_hfix8_mmx,  ff_emu_edge_hfix10_mmx, ff_emu_edge_hfix12_mmx,
-    ff_emu_edge_hfix14_mmx, ff_emu_edge_hfix16_sse2, ff_emu_edge_hfix18_sse2,
+    ff_emu_edge_hfix2_sse2,  ff_emu_edge_hfix4_sse2,  ff_emu_edge_hfix6_sse2,
+    ff_emu_edge_hfix8_sse2,  ff_emu_edge_hfix10_sse2, ff_emu_edge_hfix12_sse2,
+    ff_emu_edge_hfix14_sse2, ff_emu_edge_hfix16_sse2, ff_emu_edge_hfix18_sse2,
     ff_emu_edge_hfix20_sse2, ff_emu_edge_hfix22_sse2
 };
 extern emu_edge_hvar_func ff_emu_edge_hvar_sse2;
@@ -104,7 +104,7 @@ extern emu_edge_hfix_func ff_emu_edge_hfix18_avx2;
 extern emu_edge_hfix_func ff_emu_edge_hfix20_avx2;
 extern emu_edge_hfix_func ff_emu_edge_hfix22_avx2;
 static emu_edge_hfix_func * const hfixtbl_avx2[11] = {
-    ff_emu_edge_hfix2_mmx,  ff_emu_edge_hfix4_mmx,  ff_emu_edge_hfix6_mmx,
+    ff_emu_edge_hfix2_sse2,  ff_emu_edge_hfix4_sse2,  ff_emu_edge_hfix6_sse2,
     ff_emu_edge_hfix8_avx2,  ff_emu_edge_hfix10_avx2, ff_emu_edge_hfix12_avx2,
     ff_emu_edge_hfix14_avx2, ff_emu_edge_hfix16_avx2, ff_emu_edge_hfix18_avx2,
     ff_emu_edge_hfix20_avx2, ff_emu_edge_hfix22_avx2
@@ -196,7 +196,7 @@ static av_noinline void emulated_edge_mc_sse2(uint8_t *buf, const uint8_t *src,
                                               int h)
 {
     emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h,
-                     src_x, src_y, w, h, vfixtbl_sse, &ff_emu_edge_vvar_sse,
+                     src_x, src_y, w, h, vfixtbl_sse2, &ff_emu_edge_vvar_sse,
                      hfixtbl_sse2, &ff_emu_edge_hvar_sse2);
 }
 
@@ -209,24 +209,23 @@ static av_noinline void emulated_edge_mc_avx2(uint8_t *buf, const uint8_t *src,
                                               int h)
 {
     emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h,
-                     src_x, src_y, w, h, vfixtbl_sse, &ff_emu_edge_vvar_sse,
+                     src_x, src_y, w, h, vfixtbl_sse2, &ff_emu_edge_vvar_sse,
                      hfixtbl_avx2, &ff_emu_edge_hvar_avx2);
 }
 #endif /* HAVE_AVX2_EXTERNAL */
 #endif /* HAVE_X86ASM */
 
-void ff_prefetch_mmxext(const uint8_t *buf, ptrdiff_t stride, int h);
+void ff_prefetch_sse2(const uint8_t *buf, ptrdiff_t stride, int h);
 
 av_cold void ff_videodsp_init_x86(VideoDSPContext *ctx, int bpc)
 {
 #if HAVE_X86ASM
     int cpu_flags = av_get_cpu_flags();
 
-    if (EXTERNAL_MMXEXT(cpu_flags)) {
-        ctx->prefetch = ff_prefetch_mmxext;
-    }
-    if (EXTERNAL_SSE2(cpu_flags) && bpc <= 8) {
-        ctx->emulated_edge_mc = emulated_edge_mc_sse2;
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        ctx->prefetch = ff_prefetch_sse2;
+        if (bpc <= 8)
+            ctx->emulated_edge_mc = emulated_edge_mc_sse2;
     }
 #if HAVE_AVX2_EXTERNAL
     if (EXTERNAL_AVX2(cpu_flags) && bpc <= 8) {
-- 
2.46.0
