The branch, master has been updated
       via  32f32537b6242364b42140443bd8e03a0c2a0b92 (commit)
       via  ade54335b2feea2b8c661449d2bf6eaced3fb48c (commit)
       via  625f5c993cf99a2adf446d8eba7b947999f14267 (commit)
       via  a26b99f7933cffd209342905669a6ffa2a537faf (commit)
       via  b03b09aeda1cd890f71f9e6b0bec0a062af4e3be (commit)
       via  a7013f813c7cd13b3ccb066bd0cb1231b9749818 (commit)
       via  86f8adc58e0443024f5ad992ecdb959f0d6d8d95 (commit)
      from  d4e0d5ed48aa9c0e11b9ddeea8c2d14632314089 (commit)


- Log -----------------------------------------------------------------
commit 32f32537b6242364b42140443bd8e03a0c2a0b92
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Wed Nov 5 05:08:28 2025 +0100
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Sat Nov 8 18:48:54 2025 +0100

    avcodec/dvdec,mjpegdec: Remove emms_c
    
    It is no longer necessary now that the IDCTDSP is always ABI-compliant
    (and free of MMX).
    
    Reviewed-by: Lynne <[email protected]>
    Signed-off-by: Andreas Rheinhardt <[email protected]>
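
    To illustrate why emms_c() was needed at all: the MMX registers
    alias the x87 FPU register stack, so code that may have executed
    MMX instructions had to issue `emms' before x87 float math could
    safely resume; on x86, FFmpeg's emms_c() boils down to that one
    instruction. A minimal standalone sketch (not FFmpeg code, the
    function name is made up):

        #include <stdio.h>

        /* Stand-in for an MMX DSP routine. */
        static double third_after_mmx(void)
        {
        #if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
            __asm__ volatile ("pxor %%mm0, %%mm0" ::: "mm0"); /* marks the x87 stack as in use */
            __asm__ volatile ("emms");   /* resets the FPU tag word, as emms_c() does on x86 */
        #endif
            return 1.0 / 3.0;            /* x87 float math is only safe again after emms */
        }

        int main(void)
        {
            printf("%f\n", third_after_mmx());
            return 0;
        }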

diff --git a/libavcodec/dvdec.c b/libavcodec/dvdec.c
index 242708c70a..4799ec96dc 100644
--- a/libavcodec/dvdec.c
+++ b/libavcodec/dvdec.c
@@ -36,7 +36,6 @@
  */
 
 #include "libavutil/avassert.h"
-#include "libavutil/emms.h"
 #include "libavutil/internal.h"
 #include "libavutil/mem_internal.h"
 #include "libavutil/thread.h"
@@ -683,8 +682,6 @@ static int dvvideo_decode_frame(AVCodecContext *avctx, AVFrame *frame,
     avctx->execute(avctx, dv_decode_video_segment, s->work_chunks, NULL,
                    dv_work_pool_size(s->sys), sizeof(DVwork_chunk));
 
-    emms_c();
-
     /* return image */
     *got_frame = 1;
 
diff --git a/libavcodec/mjpegdec.c b/libavcodec/mjpegdec.c
index 5fd77073da..fb39c4e9fd 100644
--- a/libavcodec/mjpegdec.c
+++ b/libavcodec/mjpegdec.c
@@ -33,7 +33,6 @@
 #include "config_components.h"
 
 #include "libavutil/attributes.h"
-#include "libavutil/emms.h"
 #include "libavutil/imgutils.h"
 #include "libavutil/avassert.h"
 #include "libavutil/mem.h"
@@ -1824,7 +1823,6 @@ next_field:
         }
     }
 
-    emms_c();
     return 0;
  out_of_range:
     av_log(s->avctx, AV_LOG_ERROR, "decode_sos: ac/dc index out of range\n");

commit ade54335b2feea2b8c661449d2bf6eaced3fb48c
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Tue Nov 4 14:25:54 2025 +0100
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Sat Nov 8 18:48:54 2025 +0100

    avcodec/x86/simple_idct: Port to SSE2
    
    Before this commit, the (32-bit only) simple IDCT came in three
    versions: a pure MMX IDCT, and idct-put and idct-add versions
    that used SSE2 at the put and add stage but still used pure MMX
    for the actual IDCT.
    
    This commit ports said IDCT to SSE2; this was entirely trivial
    for the IDCT1-5 and IDCT7 parts (where one can directly use
    the full register width) and was easy for IDCT6 and IDCT8
    (involving a few movhps and pshufds). Unfortunately, DC_COND_IDCT
    and Z_COND_IDCT still use only the lower half of the registers.
    
    This saved 4658B here; the benchmarking option of the dct test tool
    showed a 15% speedup.
    
    Reviewed-by: Lynne <[email protected]>
    Signed-off-by: Andreas Rheinhardt <[email protected]>
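
    The recurring packssdw + pshufd 0xD8 pattern in the ported macros
    below can be seen in isolation with SSE2 intrinsics (a minimal
    sketch, not part of the commit): when both operands carry their
    payload only in the low qword (as after a movq load), packssdw
    leaves the useful word-pairs in dwords 0 and 2 of the result, and
    pshufd with immediate 0xD8 (source dword order 0,2,1,3) makes them
    contiguous again.

        #include <emmintrin.h>
        #include <stdio.h>

        int main(void)
        {
            /* payload only in the low qword, as after a movq load */
            __m128i a = _mm_set_epi32(0, 0, 2, 1);
            __m128i b = _mm_set_epi32(0, 0, 4, 3);
            __m128i p = _mm_packs_epi32(a, b);  /* words: 1 2 0 0  3 4 0 0 */
            p = _mm_shuffle_epi32(p, 0xD8);     /* words: 1 2 3 4  0 0 0 0 */

            short out[8];
            _mm_storeu_si128((__m128i *)out, p);
            for (int i = 0; i < 8; i++)
                printf("%d ", out[i]);          /* prints 1 2 3 4 0 0 0 0 */
            printf("\n");
            return 0;
        }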

diff --git a/libavcodec/tests/x86/dct.c b/libavcodec/tests/x86/dct.c
index e864de6904..f879ab1d42 100644
--- a/libavcodec/tests/x86/dct.c
+++ b/libavcodec/tests/x86/dct.c
@@ -90,7 +90,7 @@ static const struct algo idct_tab_arch[] = {
 #endif
 #else
 #if HAVE_SSE2_EXTERNAL
-    { "SIMPLE-SSE2",   ff_simple_idct_mmx,  FF_IDCT_PERM_SIMPLE, 
AV_CPU_FLAG_SSE2},
+    { "SIMPLE-SSE2",   ff_simple_idct_sse2,  FF_IDCT_PERM_SIMPLE, 
AV_CPU_FLAG_SSE2},
 #endif
 #endif
 #endif
diff --git a/libavcodec/x86/idctdsp_init.c b/libavcodec/x86/idctdsp_init.c
index 281d143ade..9c7f235b3f 100644
--- a/libavcodec/x86/idctdsp_init.c
+++ b/libavcodec/x86/idctdsp_init.c
@@ -76,7 +76,7 @@ av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
             (avctx->idct_algo == FF_IDCT_AUTO ||
                 avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
                 avctx->idct_algo == FF_IDCT_SIMPLEMMX)) {
-                c->idct      = ff_simple_idct_mmx;
+                c->idct      = ff_simple_idct_sse2;
                 c->idct_put  = ff_simple_idct_put_sse2;
                 c->idct_add  = ff_simple_idct_add_sse2;
                 c->perm_type = FF_IDCT_PERM_SIMPLE;
diff --git a/libavcodec/x86/simple_idct.asm b/libavcodec/x86/simple_idct.asm
index c79519372a..0dc03738e4 100644
--- a/libavcodec/x86/simple_idct.asm
+++ b/libavcodec/x86/simple_idct.asm
@@ -1,5 +1,5 @@
 ;
-; Simple IDCT MMX
+; Simple IDCT SSE2
 ;
 ; Copyright (c) 2001, 2002 Michael Niedermayer <[email protected]>
 ;
@@ -30,8 +30,8 @@ SECTION_RODATA
 %if ARCH_X86_32
 cextern pb_80
 
+d40000: dd 4 << 16, 0 ; must be 16-byte aligned
 wm1010: dw 0, 0xffff, 0, 0xffff
-d40000: dd 4 << 16, 0
 
 ; 23170.475006
 ; 22725.260826
@@ -57,650 +57,675 @@ d40000: dd 4 << 16, 0
 coeffs:
     dw 1 << (ROW_SHIFT - 1), 0
     dw 1 << (ROW_SHIFT - 1), 0
+    dw 1 << (ROW_SHIFT - 1), 0
+    dw 1 << (ROW_SHIFT - 1), 0
+    dw 1 << (ROW_SHIFT - 1), 1
+    dw 1 << (ROW_SHIFT - 1), 0
     dw 1 << (ROW_SHIFT - 1), 1
     dw 1 << (ROW_SHIFT - 1), 0
 
-    dw C4,  C4,  C4,  C4
-    dw C4, -C4,  C4, -C4
+    dw C4,  C4,  C4,  C4, C4,  C4,  C4,  C4
+    dw C4, -C4,  C4, -C4, C4, -C4,  C4, -C4
 
-    dw C2,  C6,  C2,  C6
-    dw C6, -C2,  C6, -C2
+    dw C2,  C6,  C2,  C6, C2,  C6,  C2,  C6
+    dw C6, -C2,  C6, -C2, C6, -C2,  C6, -C2
 
-    dw C1,  C3,  C1,  C3
-    dw C5,  C7,  C5,  C7
+    dw C1,  C3,  C1,  C3, C1,  C3,  C1,  C3
+    dw C5,  C7,  C5,  C7, C5,  C7,  C5,  C7
 
-    dw C3, -C7,  C3, -C7
-    dw -C1, -C5, -C1, -C5
+    dw  C3, -C7,  C3, -C7,  C3, -C7,  C3, -C7
+    dw -C1, -C5, -C1, -C5, -C1, -C5, -C1, -C5
 
-    dw C5, -C1,  C5, -C1
-    dw C7,  C3,  C7,  C3
+    dw C5, -C1,  C5, -C1, C5, -C1,  C5, -C1
+    dw C7,  C3,  C7,  C3, C7,  C3,  C7,  C3
 
-    dw C7, -C5,  C7, -C5
-    dw C3, -C1,  C3, -C1
+    dw C7, -C5,  C7, -C5, C7, -C5,  C7, -C5
+    dw C3, -C1,  C3, -C1, C3, -C1,  C3, -C1
 
 SECTION .text
 
 %macro DC_COND_IDCT 7
-    movq            mm0, [blockq + %1]  ; R4     R0      r4      r0
-    movq            mm1, [blockq + %2]  ; R6     R2      r6      r2
-    movq            mm2, [blockq + %3]  ; R3     R1      r3      r1
-    movq            mm3, [blockq + %4]  ; R7     R5      r7      r5
-    movq            mm4, [wm1010]
-    pand            mm4, mm0
-    por             mm4, mm1
-    por             mm4, mm2
-    por             mm4, mm3
-    packssdw        mm4, mm4
-    movd            t0d, mm4
+    movq             m0, [blockq + %1]  ; R4     R0      r4      r0
+    movq             m1, [blockq + %2]  ; R6     R2      r6      r2
+    movq             m2, [blockq + %3]  ; R3     R1      r3      r1
+    movq             m3, [blockq + %4]  ; R7     R5      r7      r5
+    movq             m4, [wm1010]
+    pand             m4, m0
+    por              m4, m1
+    por              m4, m2
+    por              m4, m3
+    packssdw         m4, m4
+    movd            t0d, m4
     or              t0d, t0d
     jz              %%1
-    movq            mm4, [coeffs + 16]  ; C4     C4      C4      C4
-    pmaddwd         mm4, mm0            ; C4R4+C4R0      C4r4+C4r0
-    movq            mm5, [coeffs + 24]  ; -C4    C4      -C4     C4
-    pmaddwd         mm0, mm5            ; -C4R4+C4R0     -C4r4+C4r0
-    movq            mm5, [coeffs + 32]  ; C6     C2      C6      C2
-    pmaddwd         mm5, mm1            ; C6R6+C2R2      C6r6+C2r2
-    movq            mm6, [coeffs + 40]  ; -C2    C6      -C2     C6
-    pmaddwd         mm1, mm6            ; -C2R6+C6R2     -C2r6+C6r2
-    movq            mm7, [coeffs + 48]  ; C3     C1      C3      C1
-    pmaddwd         mm7, mm2            ; C3R3+C1R1      C3r3+C1r1
-    paddd           mm4, [coeffs + 8]
-    movq            mm6, mm4            ; C4R4+C4R0      C4r4+C4r0
-    paddd           mm4, mm5            ; A0             a0
-    psubd           mm6, mm5            ; A3             a3
-    movq            mm5, [coeffs + 56]  ; C7     C5      C7      C5
-    pmaddwd         mm5, mm3            ; C7R7+C5R5      C7r7+C5r5
-    paddd           mm0, [coeffs + 8]
-    paddd           mm1, mm0            ; A1             a1
-    paddd           mm0, mm0
-    psubd           mm0, mm1            ; A2             a2
-    pmaddwd         mm2, [coeffs + 64]  ; -C7R3+C3R1     -C7r3+C3r1
-    paddd           mm7, mm5            ; B0             b0
-    movq            mm5, [coeffs + 72]  ; -C5    -C1     -C5     -C1
-    pmaddwd         mm5, mm3            ; -C5R7-C1R5     -C5r7-C1r5
-    paddd           mm7, mm4            ; A0+B0          a0+b0
-    paddd           mm4, mm4            ; 2A0            2a0
-    psubd           mm4, mm7            ; A0-B0          a0-b0
-    paddd           mm5, mm2            ; B1             b1
-    psrad           mm7, %7
-    psrad           mm4, %7
-    movq            mm2, mm1            ; A1             a1
-    paddd           mm1, mm5            ; A1+B1          a1+b1
-    psubd           mm2, mm5            ; A1-B1          a1-b1
-    psrad           mm1, %7
-    psrad           mm2, %7
-    packssdw        mm7, mm1            ; A1+B1  a1+b1   A0+B0   a0+b0
-    packssdw        mm2, mm4            ; A0-B0  a0-b0   A1-B1   a1-b1
-    movq           [%5], mm7
-    movq            mm1, [blockq + %3]  ; R3     R1      r3      r1
-    movq            mm4, [coeffs + 80]  ; -C1    C5      -C1     C5
-    movq      [24 + %5], mm2
-    pmaddwd         mm4, mm1            ; -C1R3+C5R1     -C1r3+C5r1
-    movq            mm7, [coeffs + 88]  ; C3     C7      C3      C7
-    pmaddwd         mm1, [coeffs + 96]  ; -C5R3+C7R1     -C5r3+C7r1
-    pmaddwd         mm7, mm3            ; C3R7+C7R5      C3r7+C7r5
-    movq            mm2, mm0            ; A2             a2
-    pmaddwd         mm3, [coeffs + 104] ; -C1R7+C3R5     -C1r7+C3r5
-    paddd           mm4, mm7            ; B2             b2
-    paddd           mm2, mm4            ; A2+B2          a2+b2
-    psubd           mm0, mm4            ; a2-B2          a2-b2
-    psrad           mm2, %7
-    psrad           mm0, %7
-    movq            mm4, mm6            ; A3             a3
-    paddd           mm3, mm1            ; B3             b3
-    paddd           mm6, mm3            ; A3+B3          a3+b3
-    psubd           mm4, mm3            ; a3-B3          a3-b3
-    psrad           mm6, %7
-    packssdw        mm2, mm6            ; A3+B3  a3+b3   A2+B2   a2+b2
-    movq       [8 + %5], mm2
-    psrad           mm4, %7
-    packssdw        mm4, mm0            ; A2-B2  a2-b2   A3-B3   a3-b3
-    movq      [16 + %5], mm4
+    movq             m4, [coeffs + 32]  ; C4     C4      C4      C4
+    pmaddwd          m4, m0             ; C4R4+C4R0      C4r4+C4r0
+    movq             m5, [coeffs + 48]  ; -C4    C4      -C4     C4
+    pmaddwd          m0, m5             ; -C4R4+C4R0     -C4r4+C4r0
+    movq             m5, [coeffs + 64]  ; C6     C2      C6      C2
+    pmaddwd          m5, m1             ; C6R6+C2R2      C6r6+C2r2
+    movq             m6, [coeffs + 80]  ; -C2    C6      -C2     C6
+    pmaddwd          m1, m6             ; -C2R6+C6R2     -C2r6+C6r2
+    movq             m7, [coeffs + 96]  ; C3     C1      C3      C1
+    pmaddwd          m7, m2             ; C3R3+C1R1      C3r3+C1r1
+    paddd            m4, [coeffs + 16]
+    movq             m6, m4             ; C4R4+C4R0      C4r4+C4r0
+    paddd            m4, m5             ; A0             a0
+    psubd            m6, m5             ; A3             a3
+    movq             m5, [coeffs + 112] ; C7     C5      C7      C5
+    pmaddwd          m5, m3             ; C7R7+C5R5      C7r7+C5r5
+    paddd            m0, [coeffs + 16]
+    paddd            m1, m0             ; A1             a1
+    paddd            m0, m0
+    psubd            m0, m1             ; A2             a2
+    pmaddwd          m2, [coeffs + 128] ; -C7R3+C3R1     -C7r3+C3r1
+    paddd            m7, m5             ; B0             b0
+    movq             m5, [coeffs + 144] ; -C5    -C1     -C5     -C1
+    pmaddwd          m5, m3             ; -C5R7-C1R5     -C5r7-C1r5
+    paddd            m7, m4             ; A0+B0          a0+b0
+    paddd            m4, m4             ; 2A0            2a0
+    psubd            m4, m7             ; A0-B0          a0-b0
+    paddd            m5, m2             ; B1             b1
+    psrad            m7, %7
+    psrad            m4, %7
+    movq             m2, m1             ; A1             a1
+    paddd            m1, m5             ; A1+B1          a1+b1
+    psubd            m2, m5             ; A1-B1          a1-b1
+    psrad            m1, %7
+    psrad            m2, %7
+    packssdw         m7, m1             ; A1+B1  a1+b1   A0+B0   a0+b0
+    pshufd           m7, m7, 0xD8
+    packssdw         m2, m4             ; A0-B0  a0-b0   A1-B1   a1-b1
+    pshufd           m2, m2, 0xD8
+    movq           [%5], m7
+    movq             m1, [blockq + %3]  ; R3     R1      r3      r1
+    movq             m4, [coeffs + 160] ; -C1    C5      -C1     C5
+    movq      [24 + %5], m2
+    pmaddwd          m4, m1             ; -C1R3+C5R1     -C1r3+C5r1
+    movq             m7, [coeffs + 176] ; C3     C7      C3      C7
+    pmaddwd          m1, [coeffs + 192] ; -C5R3+C7R1     -C5r3+C7r1
+    pmaddwd          m7, m3             ; C3R7+C7R5      C3r7+C7r5
+    movq             m2, m0             ; A2             a2
+    pmaddwd          m3, [coeffs + 208] ; -C1R7+C3R5     -C1r7+C3r5
+    paddd            m4, m7             ; B2             b2
+    paddd            m2, m4             ; A2+B2          a2+b2
+    psubd            m0, m4             ; a2-B2          a2-b2
+    psrad            m2, %7
+    psrad            m0, %7
+    movq             m4, m6             ; A3             a3
+    paddd            m3, m1             ; B3             b3
+    paddd            m6, m3             ; A3+B3          a3+b3
+    psubd            m4, m3             ; a3-B3          a3-b3
+    psrad            m6, %7
+    packssdw         m2, m6             ; A3+B3  a3+b3   A2+B2   a2+b2
+    pshufd           m2, m2, 0xD8
+    movq       [8 + %5], m2
+    psrad            m4, %7
+    packssdw         m4, m0             ; A2-B2  a2-b2   A3-B3   a3-b3
+    pshufd           m4, m4, 0xD8
+    movq      [16 + %5], m4
     jmp             %%2
 %%1:
-    pslld           mm0, 16
-    paddd           mm0, [d40000]
-    psrad           mm0, 13
-    packssdw        mm0, mm0
-    movq           [%5], mm0
-    movq       [8 + %5], mm0
-    movq      [16 + %5], mm0
-    movq      [24 + %5], mm0
+    pslld            m0, 16
+    ; d40000 is only eight bytes long, so this will clobber
+    ; the upper half of m0 with wm1010. It doesn't matter due to pshufd below.
+    paddd            m0, [d40000]
+    psrad            m0, 13
+    packssdw         m0, m0
+    pshufd           m0, m0, 0x0
+    mova           [%5], m0
+    mova      [16 + %5], m0
 %%2:
 %endmacro
 
 %macro Z_COND_IDCT 8
-    movq            mm0, [blockq + %1]  ; R4     R0      r4      r0
-    movq            mm1, [blockq + %2]  ; R6     R2      r6      r2
-    movq            mm2, [blockq + %3]  ; R3     R1      r3      r1
-    movq            mm3, [blockq + %4]  ; R7     R5      r7      r5
-    movq            mm4, mm0
-    por             mm4, mm1
-    por             mm4, mm2
-    por             mm4, mm3
-    packssdw        mm4, mm4
-    movd            t0d, mm4
+    movq             m0, [blockq + %1]  ; R4     R0      r4      r0
+    movq             m1, [blockq + %2]  ; R6     R2      r6      r2
+    movq             m2, [blockq + %3]  ; R3     R1      r3      r1
+    movq             m3, [blockq + %4]  ; R7     R5      r7      r5
+    movq             m4, m0
+    por              m4, m1
+    por              m4, m2
+    por              m4, m3
+    packssdw         m4, m4
+    movd            t0d, m4
     or              t0d, t0d
     jz               %8
-    movq            mm4, [coeffs + 16]  ; C4     C4      C4      C4
-    pmaddwd         mm4, mm0            ; C4R4+C4R0      C4r4+C4r0
-    movq            mm5, [coeffs + 24]  ; -C4    C4      -C4     C4
-    pmaddwd         mm0, mm5            ; -C4R4+C4R0     -C4r4+C4r0
-    movq            mm5, [coeffs + 32]  ; C6     C2      C6      C2
-    pmaddwd         mm5, mm1            ; C6R6+C2R2      C6r6+C2r2
-    movq            mm6, [coeffs + 40]  ; -C2    C6      -C2     C6
-    pmaddwd         mm1, mm6            ; -C2R6+C6R2     -C2r6+C6r2
-    movq            mm7, [coeffs + 48]  ; C3     C1      C3      C1
-    pmaddwd         mm7, mm2            ; C3R3+C1R1      C3r3+C1r1
-    paddd           mm4, [coeffs]
-    movq            mm6, mm4            ; C4R4+C4R0      C4r4+C4r0
-    paddd           mm4, mm5            ; A0             a0
-    psubd           mm6, mm5            ; A3             a3
-    movq            mm5, [coeffs + 56]  ; C7     C5      C7      C5
-    pmaddwd         mm5, mm3            ; C7R7+C5R5      C7r7+C5r5
-    paddd           mm0, [coeffs]
-    paddd           mm1, mm0            ; A1             a1
-    paddd           mm0, mm0
-    psubd           mm0, mm1            ; A2             a2
-    pmaddwd         mm2, [coeffs + 64]  ; -C7R3+C3R1     -C7r3+C3r1
-    paddd           mm7, mm5            ; B0             b0
-    movq            mm5, [coeffs + 72]  ; -C5    -C1     -C5     -C1
-    pmaddwd         mm5, mm3            ; -C5R7-C1R5     -C5r7-C1r5
-    paddd           mm7, mm4            ; A0+B0          a0+b0
-    paddd           mm4, mm4            ; 2A0            2a0
-    psubd           mm4, mm7            ; A0-B0          a0-b0
-    paddd           mm5, mm2            ; B1             b1
-    psrad           mm7, %7
-    psrad           mm4, %7
-    movq            mm2, mm1            ; A1             a1
-    paddd           mm1, mm5            ; A1+B1          a1+b1
-    psubd           mm2, mm5            ; A1-B1          a1-b1
-    psrad           mm1, %7
-    psrad           mm2, %7
-    packssdw        mm7, mm1            ; A1+B1  a1+b1   A0+B0   a0+b0
-    packssdw        mm2, mm4            ; A0-B0  a0-b0   A1-B1   a1-b1
-    movq           [%5], mm7
-    movq            mm1, [blockq + %3]  ; R3     R1      r3      r1
-    movq            mm4, [coeffs + 80]  ; -C1    C5      -C1     C5
-    movq      [24 + %5], mm2
-    pmaddwd         mm4, mm1            ; -C1R3+C5R1     -C1r3+C5r1
-    movq            mm7, [coeffs + 88]  ; C3     C7      C3      C7
-    pmaddwd         mm1, [coeffs + 96]  ; -C5R3+C7R1     -C5r3+C7r1
-    pmaddwd         mm7, mm3            ; C3R7+C7R5      C3r7+C7r5
-    movq            mm2, mm0            ; A2             a2
-    pmaddwd         mm3, [coeffs + 104] ; -C1R7+C3R5     -C1r7+C3r5
-    paddd           mm4, mm7            ; B2             b2
-    paddd           mm2, mm4            ; A2+B2          a2+b2
-    psubd           mm0, mm4            ; a2-B2          a2-b2
-    psrad           mm2, %7
-    psrad           mm0, %7
-    movq            mm4, mm6            ; A3             a3
-    paddd           mm3, mm1            ; B3             b3
-    paddd           mm6, mm3            ; A3+B3          a3+b3
-    psubd           mm4, mm3            ; a3-B3          a3-b3
-    psrad           mm6, %7
-    packssdw        mm2, mm6            ; A3+B3  a3+b3   A2+B2   a2+b2
-    movq       [8 + %5], mm2
-    psrad           mm4, %7
-    packssdw        mm4, mm0            ; A2-B2  a2-b2   A3-B3   a3-b3
-    movq      [16 + %5], mm4
+    movq             m4, [coeffs + 32]  ; C4     C4      C4      C4
+    pmaddwd          m4, m0             ; C4R4+C4R0      C4r4+C4r0
+    movq             m5, [coeffs + 48]  ; -C4    C4      -C4     C4
+    pmaddwd          m0, m5             ; -C4R4+C4R0     -C4r4+C4r0
+    movq             m5, [coeffs + 64]  ; C6     C2      C6      C2
+    pmaddwd          m5, m1             ; C6R6+C2R2      C6r6+C2r2
+    movq             m6, [coeffs + 80]  ; -C2    C6      -C2     C6
+    pmaddwd          m1, m6             ; -C2R6+C6R2     -C2r6+C6r2
+    movq             m7, [coeffs + 96]  ; C3     C1      C3      C1
+    pmaddwd          m7, m2             ; C3R3+C1R1      C3r3+C1r1
+    paddd            m4, [coeffs]
+    movq             m6, m4             ; C4R4+C4R0      C4r4+C4r0
+    paddd            m4, m5             ; A0             a0
+    psubd            m6, m5             ; A3             a3
+    movq             m5, [coeffs + 112] ; C7     C5      C7      C5
+    pmaddwd          m5, m3             ; C7R7+C5R5      C7r7+C5r5
+    paddd            m0, [coeffs]
+    paddd            m1, m0             ; A1             a1
+    paddd            m0, m0
+    psubd            m0, m1             ; A2             a2
+    pmaddwd          m2, [coeffs + 128] ; -C7R3+C3R1     -C7r3+C3r1
+    paddd            m7, m5             ; B0             b0
+    movq             m5, [coeffs + 144] ; -C5    -C1     -C5     -C1
+    pmaddwd          m5, m3             ; -C5R7-C1R5     -C5r7-C1r5
+    paddd            m7, m4             ; A0+B0          a0+b0
+    paddd            m4, m4             ; 2A0            2a0
+    psubd            m4, m7             ; A0-B0          a0-b0
+    paddd            m5, m2             ; B1             b1
+    psrad            m7, %7
+    psrad            m4, %7
+    movq             m2, m1             ; A1             a1
+    paddd            m1, m5             ; A1+B1          a1+b1
+    psubd            m2, m5             ; A1-B1          a1-b1
+    psrad            m1, %7
+    psrad            m2, %7
+    packssdw         m7, m1             ; A1+B1  a1+b1   A0+B0   a0+b0
+    pshufd           m7, m7, 0xD8
+    packssdw         m2, m4             ; A0-B0  a0-b0   A1-B1   a1-b1
+    pshufd           m2, m2, 0xD8
+    movq           [%5], m7
+    movq             m1, [blockq + %3]  ; R3     R1      r3      r1
+    movq             m4, [coeffs + 160] ; -C1    C5      -C1     C5
+    movq      [24 + %5], m2
+    pmaddwd          m4, m1             ; -C1R3+C5R1     -C1r3+C5r1
+    movq             m7, [coeffs + 176] ; C3     C7      C3      C7
+    pmaddwd          m1, [coeffs + 192] ; -C5R3+C7R1     -C5r3+C7r1
+    pmaddwd          m7, m3             ; C3R7+C7R5      C3r7+C7r5
+    movq             m2, m0             ; A2             a2
+    pmaddwd          m3, [coeffs + 208] ; -C1R7+C3R5     -C1r7+C3r5
+    paddd            m4, m7             ; B2             b2
+    paddd            m2, m4             ; A2+B2          a2+b2
+    psubd            m0, m4             ; a2-B2          a2-b2
+    psrad            m2, %7
+    psrad            m0, %7
+    movq             m4, m6             ; A3             a3
+    paddd            m3, m1             ; B3             b3
+    paddd            m6, m3             ; A3+B3          a3+b3
+    psubd            m4, m3             ; a3-B3          a3-b3
+    psrad            m6, %7
+    packssdw         m2, m6             ; A3+B3  a3+b3   A2+B2   a2+b2
+    pshufd           m2, m2, 0xD8
+    movq       [8 + %5], m2
+    psrad            m4, %7
+    packssdw         m4, m0             ; A2-B2  a2-b2   A3-B3   a3-b3
+    pshufd           m4, m4, 0xD8
+    movq      [16 + %5], m4
 %endmacro
 
 %macro IDCT1 6
-    movq            mm0, %1             ; R4     R0      r4      r0
-    movq            mm1, %2             ; R6     R2      r6      r2
-    movq            mm2, %3             ; R3     R1      r3      r1
-    movq            mm3, %4             ; R7     R5      r7      r5
-    movq            mm4, [coeffs + 16]  ; C4     C4      C4      C4
-    pmaddwd         mm4, mm0            ; C4R4+C4R0      C4r4+C4r0
-    movq            mm5, [coeffs + 24]  ; -C4    C4      -C4     C4
-    pmaddwd         mm0, mm5            ; -C4R4+C4R0     -C4r4+C4r0
-    movq            mm5, [coeffs + 32]  ; C6     C2      C6      C2
-    pmaddwd         mm5, mm1            ; C6R6+C2R2      C6r6+C2r2
-    movq            mm6, [coeffs + 40]  ; -C2    C6      -C2     C6
-    pmaddwd         mm1, mm6            ; -C2R6+C6R2     -C2r6+C6r2
-    movq            mm6, mm4            ; C4R4+C4R0      C4r4+C4r0
-    movq            mm7, [coeffs + 48]  ; C3     C1      C3      C1
-    pmaddwd         mm7, mm2            ; C3R3+C1R1      C3r3+C1r1
-    paddd           mm4, mm5            ; A0             a0
-    psubd           mm6, mm5            ; A3             a3
-    movq            mm5, mm0            ; -C4R4+C4R0     -C4r4+C4r0
-    paddd           mm0, mm1            ; A1             a1
-    psubd           mm5, mm1            ; A2             a2
-    movq            mm1, [coeffs + 56]  ; C7     C5      C7      C5
-    pmaddwd         mm1, mm3            ; C7R7+C5R5      C7r7+C5r5
-    pmaddwd         mm2, [coeffs + 64]  ; -C7R3+C3R1     -C7r3+C3r1
-    paddd           mm7, mm1            ; B0             b0
-    movq            mm1, [coeffs + 72]  ; -C5    -C1     -C5     -C1
-    pmaddwd         mm1, mm3            ; -C5R7-C1R5     -C5r7-C1r5
-    paddd           mm7, mm4            ; A0+B0          a0+b0
-    paddd           mm4, mm4            ; 2A0            2a0
-    psubd           mm4, mm7            ; A0-B0          a0-b0
-    paddd           mm1, mm2            ; B1             b1
-    psrad           mm7, %6
-    psrad           mm4, %6
-    movq            mm2, mm0            ; A1             a1
-    paddd           mm0, mm1            ; A1+B1          a1+b1
-    psubd           mm2, mm1            ; A1-B1          a1-b1
-    psrad           mm0, %6
-    psrad           mm2, %6
-    packssdw        mm7, mm7            ; A0+B0  a0+b0
-    movd           [%5], mm7
-    packssdw        mm0, mm0            ; A1+B1  a1+b1
-    movd      [16 + %5], mm0
-    packssdw        mm2, mm2            ; A1-B1  a1-b1
-    movd      [96 + %5], mm2
-    packssdw        mm4, mm4            ; A0-B0  a0-b0
-    movd     [112 + %5], mm4
-    movq            mm0, %3             ; R3     R1      r3      r1
-    movq            mm4, [coeffs + 80]  ; -C1    C5      -C1     C5
-    pmaddwd         mm4, mm0            ; -C1R3+C5R1     -C1r3+C5r1
-    movq            mm7, [coeffs + 88]  ; C3     C7      C3      C7
-    pmaddwd         mm0, [coeffs + 96]  ; -C5R3+C7R1     -C5r3+C7r1
-    pmaddwd         mm7, mm3            ; C3R7+C7R5      C3r7+C7r5
-    movq            mm2, mm5            ; A2             a2
-    pmaddwd         mm3, [coeffs + 104] ; -C1R7+C3R5     -C1r7+C3r5
-    paddd           mm4, mm7            ; B2             b2
-    paddd           mm2, mm4            ; A2+B2          a2+b2
-    psubd           mm5, mm4            ; a2-B2          a2-b2
-    psrad           mm2, %6
-    psrad           mm5, %6
-    movq            mm4, mm6            ; A3             a3
-    paddd           mm3, mm0            ; B3             b3
-    paddd           mm6, mm3            ; A3+B3          a3+b3
-    psubd           mm4, mm3            ; a3-B3          a3-b3
-    psrad           mm6, %6
-    psrad           mm4, %6
-    packssdw        mm2, mm2            ; A2+B2  a2+b2
-    packssdw        mm6, mm6            ; A3+B3  a3+b3
-    movd      [32 + %5], mm2
-    packssdw        mm4, mm4            ; A3-B3  a3-b3
-    packssdw        mm5, mm5            ; A2-B2  a2-b2
-    movd      [48 + %5], mm6
-    movd      [64 + %5], mm4
-    movd      [80 + %5], mm5
+    mova             m0, %1             ; R4     R0      r4      r0
+    mova             m1, %2             ; R6     R2      r6      r2
+    mova             m2, %3             ; R3     R1      r3      r1
+    mova             m3, %4             ; R7     R5      r7      r5
+    mova             m4, [coeffs + 32]  ; C4     C4      C4      C4
+    pmaddwd          m4, m0             ; C4R4+C4R0      C4r4+C4r0
+    mova             m5, [coeffs + 48]  ; -C4    C4      -C4     C4
+    pmaddwd          m0, m5             ; -C4R4+C4R0     -C4r4+C4r0
+    mova             m5, [coeffs + 64]  ; C6     C2      C6      C2
+    pmaddwd          m5, m1             ; C6R6+C2R2      C6r6+C2r2
+    mova             m6, [coeffs + 80]  ; -C2    C6      -C2     C6
+    pmaddwd          m1, m6             ; -C2R6+C6R2     -C2r6+C6r2
+    mova             m6, m4             ; C4R4+C4R0      C4r4+C4r0
+    mova             m7, [coeffs + 96]  ; C3     C1      C3      C1
+    pmaddwd          m7, m2             ; C3R3+C1R1      C3r3+C1r1
+    paddd            m4, m5             ; A0             a0
+    psubd            m6, m5             ; A3             a3
+    mova             m5, m0             ; -C4R4+C4R0     -C4r4+C4r0
+    paddd            m0, m1             ; A1             a1
+    psubd            m5, m1             ; A2             a2
+    mova             m1, [coeffs + 112] ; C7     C5      C7      C5
+    pmaddwd          m1, m3             ; C7R7+C5R5      C7r7+C5r5
+    pmaddwd          m2, [coeffs + 128] ; -C7R3+C3R1     -C7r3+C3r1
+    paddd            m7, m1             ; B0             b0
+    mova             m1, [coeffs + 144] ; -C5    -C1     -C5     -C1
+    pmaddwd          m1, m3             ; -C5R7-C1R5     -C5r7-C1r5
+    paddd            m7, m4             ; A0+B0          a0+b0
+    paddd            m4, m4             ; 2A0            2a0
+    psubd            m4, m7             ; A0-B0          a0-b0
+    paddd            m1, m2             ; B1             b1
+    psrad            m7, %6
+    psrad            m4, %6
+    mova             m2, m0             ; A1             a1
+    paddd            m0, m1             ; A1+B1          a1+b1
+    psubd            m2, m1             ; A1-B1          a1-b1
+    psrad            m0, %6
+    psrad            m2, %6
+    packssdw         m7, m7             ; A0+B0  a0+b0
+    movq           [%5], m7
+    packssdw         m0, m0             ; A1+B1  a1+b1
+    movq      [16 + %5], m0
+    packssdw         m2, m2             ; A1-B1  a1-b1
+    movq      [96 + %5], m2
+    packssdw         m4, m4             ; A0-B0  a0-b0
+    movq     [112 + %5], m4
+    mova             m0, %3             ; R3     R1      r3      r1
+    mova             m4, [coeffs + 160] ; -C1    C5      -C1     C5
+    pmaddwd          m4, m0             ; -C1R3+C5R1     -C1r3+C5r1
+    mova             m7, [coeffs + 176] ; C3     C7      C3      C7
+    pmaddwd          m0, [coeffs + 192] ; -C5R3+C7R1     -C5r3+C7r1
+    pmaddwd          m7, m3             ; C3R7+C7R5      C3r7+C7r5
+    mova             m2, m5             ; A2             a2
+    pmaddwd          m3, [coeffs + 208] ; -C1R7+C3R5     -C1r7+C3r5
+    paddd            m4, m7             ; B2             b2
+    paddd            m2, m4             ; A2+B2          a2+b2
+    psubd            m5, m4             ; a2-B2          a2-b2
+    psrad            m2, %6
+    psrad            m5, %6
+    mova             m4, m6             ; A3             a3
+    paddd            m3, m0             ; B3             b3
+    paddd            m6, m3             ; A3+B3          a3+b3
+    psubd            m4, m3             ; a3-B3          a3-b3
+    psrad            m6, %6
+    psrad            m4, %6
+    packssdw         m2, m2             ; A2+B2  a2+b2
+    packssdw         m6, m6             ; A3+B3  a3+b3
+    movq      [32 + %5], m2
+    packssdw         m4, m4             ; A3-B3  a3-b3
+    packssdw         m5, m5             ; A2-B2  a2-b2
+    movq      [48 + %5], m6
+    movq      [64 + %5], m4
+    movq      [80 + %5], m5
 %endmacro
 
 %macro IDCT2 6
-    movq            mm0, %1             ; R4     R0      r4      r0
-    movq            mm1, %2             ; R6     R2      r6      r2
-    movq            mm3, %4             ; R7     R5      r7      r5
-    movq            mm4, [coeffs + 16]  ; C4     C4      C4      C4
-    pmaddwd         mm4, mm0            ; C4R4+C4R0      C4r4+C4r0
-    movq            mm5, [coeffs + 24]  ; -C4    C4      -C4     C4
-    pmaddwd         mm0, mm5            ; -C4R4+C4R0     -C4r4+C4r0
-    movq            mm5, [coeffs + 32]  ; C6     C2      C6      C2
-    pmaddwd         mm5, mm1            ; C6R6+C2R2      C6r6+C2r2
-    movq            mm6, [coeffs + 40]  ; -C2    C6      -C2     C6
-    pmaddwd         mm1, mm6            ; -C2R6+C6R2     -C2r6+C6r2
-    movq            mm6, mm4            ; C4R4+C4R0      C4r4+C4r0
-    paddd           mm4, mm5            ; A0             a0
-    psubd           mm6, mm5            ; A3             a3
-    movq            mm5, mm0            ; -C4R4+C4R0     -C4r4+C4r0
-    paddd           mm0, mm1            ; A1             a1
-    psubd           mm5, mm1            ; A2             a2
-    movq            mm1, [coeffs + 56]  ; C7     C5      C7      C5
-    pmaddwd         mm1, mm3            ; C7R7+C5R5      C7r7+C5r5
-    movq            mm7, [coeffs + 72]  ; -C5    -C1     -C5     -C1
-    pmaddwd         mm7, mm3            ; -C5R7-C1R5     -C5r7-C1r5
-    paddd           mm1, mm4            ; A0+B0          a0+b0
-    paddd           mm4, mm4            ; 2A0            2a0
-    psubd           mm4, mm1            ; A0-B0          a0-b0
-    psrad           mm1, %6
-    psrad           mm4, %6
-    movq            mm2, mm0            ; A1             a1
-    paddd           mm0, mm7            ; A1+B1          a1+b1
-    psubd           mm2, mm7            ; A1-B1          a1-b1
-    psrad           mm0, %6
-    psrad           mm2, %6
-    packssdw        mm1, mm1            ; A0+B0  a0+b0
-    movd           [%5], mm1
-    packssdw        mm0, mm0            ; A1+B1  a1+b1
-    movd      [16 + %5], mm0
-    packssdw        mm2, mm2            ; A1-B1  a1-b1
-    movd      [96 + %5], mm2
-    packssdw        mm4, mm4            ; A0-B0  a0-b0
-    movd     [112 + %5], mm4
-    movq            mm1, [coeffs + 88]  ; C3     C7      C3      C7
-    pmaddwd         mm1, mm3            ; C3R7+C7R5      C3r7+C7r5
-    movq            mm2, mm5            ; A2             a2
-    pmaddwd         mm3, [coeffs + 104] ; -C1R7+C3R5     -C1r7+C3r5
-    paddd           mm2, mm1            ; A2+B2          a2+b2
-    psubd           mm5, mm1            ; a2-B2          a2-b2
-    psrad           mm2, %6
-    psrad           mm5, %6
-    movq            mm1, mm6            ; A3             a3
-    paddd           mm6, mm3            ; A3+B3          a3+b3
-    psubd           mm1, mm3            ; a3-B3          a3-b3
-    psrad           mm6, %6
-    psrad           mm1, %6
-    packssdw        mm2, mm2            ; A2+B2  a2+b2
-    packssdw        mm6, mm6            ; A3+B3  a3+b3
-    movd      [32 + %5], mm2
-    packssdw        mm1, mm1            ; A3-B3  a3-b3
-    packssdw        mm5, mm5            ; A2-B2  a2-b2
-    movd      [48 + %5], mm6
-    movd      [64 + %5], mm1
-    movd      [80 + %5], mm5
+    mova             m0, %1             ; R4     R0      r4      r0
+    mova             m1, %2             ; R6     R2      r6      r2
+    mova             m3, %4             ; R7     R5      r7      r5
+    mova             m4, [coeffs + 32]  ; C4     C4      C4      C4
+    pmaddwd          m4, m0             ; C4R4+C4R0      C4r4+C4r0
+    mova             m5, [coeffs + 48]  ; -C4    C4      -C4     C4
+    pmaddwd          m0, m5             ; -C4R4+C4R0     -C4r4+C4r0
+    mova             m5, [coeffs + 64]  ; C6     C2      C6      C2
+    pmaddwd          m5, m1             ; C6R6+C2R2      C6r6+C2r2
+    mova             m6, [coeffs + 80]  ; -C2    C6      -C2     C6
+    pmaddwd          m1, m6             ; -C2R6+C6R2     -C2r6+C6r2
+    mova             m6, m4             ; C4R4+C4R0      C4r4+C4r0
+    paddd            m4, m5             ; A0             a0
+    psubd            m6, m5             ; A3             a3
+    mova             m5, m0             ; -C4R4+C4R0     -C4r4+C4r0
+    paddd            m0, m1             ; A1             a1
+    psubd            m5, m1             ; A2             a2
+    mova             m1, [coeffs + 112] ; C7     C5      C7      C5
+    pmaddwd          m1, m3             ; C7R7+C5R5      C7r7+C5r5
+    mova             m7, [coeffs + 144] ; -C5    -C1     -C5     -C1
+    pmaddwd          m7, m3             ; -C5R7-C1R5     -C5r7-C1r5
+    paddd            m1, m4             ; A0+B0          a0+b0
+    paddd            m4, m4             ; 2A0            2a0
+    psubd            m4, m1             ; A0-B0          a0-b0
+    psrad            m1, %6
+    psrad            m4, %6
+    mova             m2, m0             ; A1             a1
+    paddd            m0, m7             ; A1+B1          a1+b1
+    psubd            m2, m7             ; A1-B1          a1-b1
+    psrad            m0, %6
+    psrad            m2, %6
+    packssdw         m1, m1             ; A0+B0  a0+b0
+    movq           [%5], m1
+    packssdw         m0, m0             ; A1+B1  a1+b1
+    movq      [16 + %5], m0
+    packssdw         m2, m2             ; A1-B1  a1-b1
+    movq      [96 + %5], m2
+    packssdw         m4, m4             ; A0-B0  a0-b0
+    movq     [112 + %5], m4
+    mova             m1, [coeffs + 176] ; C3     C7      C3      C7
+    pmaddwd          m1, m3             ; C3R7+C7R5      C3r7+C7r5
+    mova             m2, m5             ; A2             a2
+    pmaddwd          m3, [coeffs + 208] ; -C1R7+C3R5     -C1r7+C3r5
+    paddd            m2, m1             ; A2+B2          a2+b2
+    psubd            m5, m1             ; a2-B2          a2-b2
+    psrad            m2, %6
+    psrad            m5, %6
+    mova             m1, m6             ; A3             a3
+    paddd            m6, m3             ; A3+B3          a3+b3
+    psubd            m1, m3             ; a3-B3          a3-b3
+    psrad            m6, %6
+    psrad            m1, %6
+    packssdw         m2, m2             ; A2+B2  a2+b2
+    packssdw         m6, m6             ; A3+B3  a3+b3
+    movq      [32 + %5], m2
+    packssdw         m1, m1             ; A3-B3  a3-b3
+    packssdw         m5, m5             ; A2-B2  a2-b2
+    movq      [48 + %5], m6
+    movq      [64 + %5], m1
+    movq      [80 + %5], m5
 %endmacro
 
 %macro IDCT3 6
-    movq            mm0, %1             ; R4     R0      r4      r0
-    movq            mm3, %4             ; R7     R5      r7      r5
-    movq            mm4, [coeffs + 16]  ; C4     C4      C4      C4
-    pmaddwd         mm4, mm0            ; C4R4+C4R0      C4r4+C4r0
-    movq            mm5, [coeffs + 24]  ; -C4    C4      -C4     C4
-    pmaddwd         mm0, mm5            ; -C4R4+C4R0     -C4r4+C4r0
-    movq            mm6, mm4            ; C4R4+C4R0      C4r4+C4r0
-    movq            mm5, mm0            ; -C4R4+C4R0     -C4r4+C4r0
-    movq            mm1, [coeffs + 56]  ; C7     C5      C7      C5
-    pmaddwd         mm1, mm3            ; C7R7+C5R5      C7r7+C5r5
-    movq            mm7, [coeffs + 72]  ; -C5    -C1     -C5     -C1
-    pmaddwd         mm7, mm3            ; -C5R7-C1R5     -C5r7-C1r5
-    paddd           mm1, mm4            ; A0+B0          a0+b0
-    paddd           mm4, mm4            ; 2A0            2a0
-    psubd           mm4, mm1            ; A0-B0          a0-b0
-    psrad           mm1, %6
-    psrad           mm4, %6
-    movq            mm2, mm0            ; A1             a1
-    paddd           mm0, mm7            ; A1+B1          a1+b1
-    psubd           mm2, mm7            ; A1-B1          a1-b1
-    psrad           mm0, %6
-    psrad           mm2, %6
-    packssdw        mm1, mm1            ; A0+B0  a0+b0
-    movd           [%5], mm1
-    packssdw        mm0, mm0            ; A1+B1  a1+b1
-    movd      [16 + %5], mm0
-    packssdw        mm2, mm2            ; A1-B1  a1-b1
-    movd      [96 + %5], mm2
-    packssdw        mm4, mm4            ; A0-B0  a0-b0
-    movd     [112 + %5], mm4
-    movq            mm1, [coeffs + 88]  ; C3     C7      C3      C7
-    pmaddwd         mm1, mm3            ; C3R7+C7R5      C3r7+C7r5
-    movq            mm2, mm5            ; A2             a2
-    pmaddwd         mm3, [coeffs + 104] ; -C1R7+C3R5     -C1r7+C3r5
-    paddd           mm2, mm1            ; A2+B2          a2+b2
-    psubd           mm5, mm1            ; a2-B2          a2-b2
-    psrad           mm2, %6
-    psrad           mm5, %6
-    movq            mm1, mm6            ; A3             a3
-    paddd           mm6, mm3            ; A3+B3          a3+b3
-    psubd           mm1, mm3            ; a3-B3          a3-b3
-    psrad           mm6, %6
-    psrad           mm1, %6
-    packssdw        mm2, mm2            ; A2+B2  a2+b2
-    packssdw        mm6, mm6            ; A3+B3  a3+b3
-    movd      [32 + %5], mm2
-    packssdw        mm1, mm1            ; A3-B3  a3-b3
-    packssdw        mm5, mm5            ; A2-B2  a2-b2
-    movd      [48 + %5], mm6
-    movd      [64 + %5], mm1
-    movd      [80 + %5], mm5
+    mova             m0, %1             ; R4     R0      r4      r0
+    mova             m3, %4             ; R7     R5      r7      r5
+    mova             m4, [coeffs + 32]  ; C4     C4      C4      C4
+    pmaddwd          m4, m0             ; C4R4+C4R0      C4r4+C4r0
+    mova             m5, [coeffs + 48]  ; -C4    C4      -C4     C4
+    pmaddwd          m0, m5             ; -C4R4+C4R0     -C4r4+C4r0
+    mova             m6, m4             ; C4R4+C4R0      C4r4+C4r0
+    mova             m5, m0             ; -C4R4+C4R0     -C4r4+C4r0
+    mova             m1, [coeffs + 112] ; C7     C5      C7      C5
+    pmaddwd          m1, m3             ; C7R7+C5R5      C7r7+C5r5
+    mova             m7, [coeffs + 144] ; -C5    -C1     -C5     -C1
+    pmaddwd          m7, m3             ; -C5R7-C1R5     -C5r7-C1r5
+    paddd            m1, m4             ; A0+B0          a0+b0
+    paddd            m4, m4             ; 2A0            2a0
+    psubd            m4, m1             ; A0-B0          a0-b0
+    psrad            m1, %6
+    psrad            m4, %6
+    mova             m2, m0             ; A1             a1
+    paddd            m0, m7             ; A1+B1          a1+b1
+    psubd            m2, m7             ; A1-B1          a1-b1
+    psrad            m0, %6
+    psrad            m2, %6
+    packssdw         m1, m1             ; A0+B0  a0+b0
+    movq           [%5], m1
+    packssdw         m0, m0             ; A1+B1  a1+b1
+    movq      [16 + %5], m0
+    packssdw         m2, m2             ; A1-B1  a1-b1
+    movq      [96 + %5], m2
+    packssdw         m4, m4             ; A0-B0  a0-b0
+    movq     [112 + %5], m4
+    mova             m1, [coeffs + 176] ; C3     C7      C3      C7
+    pmaddwd          m1, m3             ; C3R7+C7R5      C3r7+C7r5
+    mova             m2, m5             ; A2             a2
+    pmaddwd          m3, [coeffs + 208] ; -C1R7+C3R5     -C1r7+C3r5
+    paddd            m2, m1             ; A2+B2          a2+b2
+    psubd            m5, m1             ; a2-B2          a2-b2
+    psrad            m2, %6
+    psrad            m5, %6
+    mova             m1, m6             ; A3             a3
+    paddd            m6, m3             ; A3+B3          a3+b3
+    psubd            m1, m3             ; a3-B3          a3-b3
+    psrad            m6, %6
+    psrad            m1, %6
+    packssdw         m2, m2             ; A2+B2  a2+b2
+    packssdw         m6, m6             ; A3+B3  a3+b3
+    movq      [32 + %5], m2
+    packssdw         m1, m1             ; A3-B3  a3-b3
+    packssdw         m5, m5             ; A2-B2  a2-b2
+    movq      [48 + %5], m6
+    movq      [64 + %5], m1
+    movq      [80 + %5], m5
 %endmacro
 
 %macro IDCT4 6
-    movq            mm0, %1             ; R4     R0      r4      r0
-    movq            mm2, %3             ; R3     R1      r3      r1
-    movq            mm3, %4             ; R7     R5      r7      r5
-    movq            mm4, [coeffs + 16]  ; C4     C4      C4      C4
-    pmaddwd         mm4, mm0            ; C4R4+C4R0      C4r4+C4r0
-    movq            mm5, [coeffs + 24]  ; -C4    C4      -C4     C4
-    pmaddwd         mm0, mm5            ; -C4R4+C4R0     -C4r4+C4r0
-    movq            mm6, mm4            ; C4R4+C4R0      C4r4+C4r0
-    movq            mm7, [coeffs + 48]  ; C3     C1      C3      C1
-    pmaddwd         mm7, mm2            ; C3R3+C1R1      C3r3+C1r1
-    movq            mm5, mm0            ; -C4R4+C4R0     -C4r4+C4r0
-    movq            mm1, [coeffs + 56]  ; C7     C5      C7      C5
-    pmaddwd         mm1, mm3            ; C7R7+C5R5      C7r7+C5r5
-    pmaddwd         mm2, [coeffs + 64]  ; -C7R3+C3R1     -C7r3+C3r1
-    paddd           mm7, mm1            ; B0             b0
-    movq            mm1, [coeffs + 72]  ; -C5    -C1     -C5     -C1
-    pmaddwd         mm1, mm3            ; -C5R7-C1R5     -C5r7-C1r5
-    paddd           mm7, mm4            ; A0+B0          a0+b0
-    paddd           mm4, mm4            ; 2A0            2a0
-    psubd           mm4, mm7            ; A0-B0          a0-b0
-    paddd           mm1, mm2            ; B1             b1
-    psrad           mm7, %6
-    psrad           mm4, %6
-    movq            mm2, mm0            ; A1             a1
-    paddd           mm0, mm1            ; A1+B1          a1+b1
-    psubd           mm2, mm1            ; A1-B1          a1-b1
-    psrad           mm0, %6
-    psrad           mm2, %6
-    packssdw        mm7, mm7            ; A0+B0  a0+b0
-    movd           [%5], mm7
-    packssdw        mm0, mm0            ; A1+B1  a1+b1
-    movd      [16 + %5], mm0
-    packssdw        mm2, mm2            ; A1-B1  a1-b1
-    movd      [96 + %5], mm2
-    packssdw        mm4, mm4            ; A0-B0  a0-b0
-    movd     [112 + %5], mm4
-    movq            mm0, %3             ; R3     R1      r3      r1
-    movq            mm4, [coeffs + 80]  ; -C1    C5      -C1     C5
-    pmaddwd         mm4, mm0            ; -C1R3+C5R1     -C1r3+C5r1
-    movq            mm7, [coeffs + 88]  ; C3     C7      C3      C7
-    pmaddwd         mm0, [coeffs + 96]  ; -C5R3+C7R1     -C5r3+C7r1
-    pmaddwd         mm7, mm3            ; C3R7+C7R5      C3r7+C7r5
-    movq            mm2, mm5            ; A2             a2
-    pmaddwd         mm3, [coeffs + 104] ; -C1R7+C3R5     -C1r7+C3r5
-    paddd           mm4, mm7            ; B2             b2
-    paddd           mm2, mm4            ; A2+B2          a2+b2
-    psubd           mm5, mm4            ; a2-B2          a2-b2
-    psrad           mm2, %6
-    psrad           mm5, %6
-    movq            mm4, mm6            ; A3             a3
-    paddd           mm3, mm0            ; B3             b3
-    paddd           mm6, mm3            ; A3+B3          a3+b3
-    psubd           mm4, mm3            ; a3-B3          a3-b3
-    psrad           mm6, %6
-    psrad           mm4, %6
-    packssdw        mm2, mm2            ; A2+B2  a2+b2
-    packssdw        mm6, mm6            ; A3+B3  a3+b3
-    movd      [32 + %5], mm2
-    packssdw        mm4, mm4            ; A3-B3  a3-b3
-    packssdw        mm5, mm5            ; A2-B2  a2-b2
-    movd      [48 + %5], mm6
-    movd      [64 + %5], mm4
-    movd      [80 + %5], mm5
+    mova             m0, %1             ; R4     R0      r4      r0
+    mova             m2, %3             ; R3     R1      r3      r1
+    mova             m3, %4             ; R7     R5      r7      r5
+    mova             m4, [coeffs + 32]  ; C4     C4      C4      C4
+    pmaddwd          m4, m0             ; C4R4+C4R0      C4r4+C4r0
+    mova             m5, [coeffs + 48]  ; -C4    C4      -C4     C4
+    pmaddwd          m0, m5             ; -C4R4+C4R0     -C4r4+C4r0
+    mova             m6, m4             ; C4R4+C4R0      C4r4+C4r0
+    mova             m7, [coeffs + 96]  ; C3     C1      C3      C1
+    pmaddwd          m7, m2             ; C3R3+C1R1      C3r3+C1r1
+    mova             m5, m0             ; -C4R4+C4R0     -C4r4+C4r0
+    mova             m1, [coeffs + 112] ; C7     C5      C7      C5
+    pmaddwd          m1, m3             ; C7R7+C5R5      C7r7+C5r5
+    pmaddwd          m2, [coeffs + 128] ; -C7R3+C3R1     -C7r3+C3r1
+    paddd            m7, m1             ; B0             b0
+    mova             m1, [coeffs + 144] ; -C5    -C1     -C5     -C1
+    pmaddwd          m1, m3             ; -C5R7-C1R5     -C5r7-C1r5
+    paddd            m7, m4             ; A0+B0          a0+b0
+    paddd            m4, m4             ; 2A0            2a0
+    psubd            m4, m7             ; A0-B0          a0-b0
+    paddd            m1, m2             ; B1             b1
+    psrad            m7, %6
+    psrad            m4, %6
+    mova             m2, m0             ; A1             a1
+    paddd            m0, m1             ; A1+B1          a1+b1
+    psubd            m2, m1             ; A1-B1          a1-b1
+    psrad            m0, %6
+    psrad            m2, %6
+    packssdw         m7, m7             ; A0+B0  a0+b0
+    movq           [%5], m7
+    packssdw         m0, m0             ; A1+B1  a1+b1
+    movq      [16 + %5], m0
+    packssdw         m2, m2             ; A1-B1  a1-b1
+    movq      [96 + %5], m2
+    packssdw         m4, m4             ; A0-B0  a0-b0
+    movq     [112 + %5], m4
+    mova             m0, %3             ; R3     R1      r3      r1
+    mova             m4, [coeffs + 160] ; -C1    C5      -C1     C5
+    pmaddwd          m4, m0             ; -C1R3+C5R1     -C1r3+C5r1
+    mova             m7, [coeffs + 176] ; C3     C7      C3      C7
+    pmaddwd          m0, [coeffs + 192] ; -C5R3+C7R1     -C5r3+C7r1
+    pmaddwd          m7, m3             ; C3R7+C7R5      C3r7+C7r5
+    mova             m2, m5             ; A2             a2
+    pmaddwd          m3, [coeffs + 208] ; -C1R7+C3R5     -C1r7+C3r5
+    paddd            m4, m7             ; B2             b2
+    paddd            m2, m4             ; A2+B2          a2+b2
+    psubd            m5, m4             ; a2-B2          a2-b2
+    psrad            m2, %6
+    psrad            m5, %6
+    mova             m4, m6             ; A3             a3
+    paddd            m3, m0             ; B3             b3
+    paddd            m6, m3             ; A3+B3          a3+b3
+    psubd            m4, m3             ; a3-B3          a3-b3
+    psrad            m6, %6
+    psrad            m4, %6
+    packssdw         m2, m2             ; A2+B2  a2+b2
+    packssdw         m6, m6             ; A3+B3  a3+b3
+    movq      [32 + %5], m2
+    packssdw         m4, m4             ; A3-B3  a3-b3
+    packssdw         m5, m5             ; A2-B2  a2-b2
+    movq      [48 + %5], m6
+    movq      [64 + %5], m4
+    movq      [80 + %5], m5
 %endmacro
 
 %macro IDCT5 6
-    movq            mm0, %1             ; R4     R0      r4      r0
-    movq            mm2, %3             ; R3     R1      r3      r1
-    movq            mm4, [coeffs + 16]  ; C4     C4      C4      C4
-    pmaddwd         mm4, mm0            ; C4R4+C4R0      C4r4+C4r0
-    movq            mm5, [coeffs + 24]  ; -C4    C4      -C4     C4
-    pmaddwd         mm0, mm5            ; -C4R4+C4R0     -C4r4+C4r0
-    movq            mm6, mm4            ; C4R4+C4R0      C4r4+C4r0
-    movq            mm7, [coeffs + 48]  ; C3     C1      C3      C1
-    pmaddwd         mm7, mm2            ; C3R3+C1R1      C3r3+C1r1
-    movq            mm5, mm0            ; -C4R4+C4R0     -C4r4+C4r0
-    movq            mm3, [coeffs + 64]
-    pmaddwd         mm3, mm2            ; -C7R3+C3R1     -C7r3+C3r1
-    paddd           mm7, mm4            ; A0+B0          a0+b0
-    paddd           mm4, mm4            ; 2A0            2a0
-    psubd           mm4, mm7            ; A0-B0          a0-b0
-    psrad           mm7, %6
-    psrad           mm4, %6
-    movq            mm1, mm0            ; A1             a1
-    paddd           mm0, mm3            ; A1+B1          a1+b1
-    psubd           mm1, mm3            ; A1-B1          a1-b1
-    psrad           mm0, %6
-    psrad           mm1, %6
-    packssdw        mm7, mm7            ; A0+B0  a0+b0
-    movd           [%5], mm7
-    packssdw        mm0, mm0            ; A1+B1  a1+b1
-    movd      [16 + %5], mm0
-    packssdw        mm1, mm1            ; A1-B1  a1-b1
-    movd      [96 + %5], mm1
-    packssdw        mm4, mm4            ; A0-B0  a0-b0
-    movd     [112 + %5], mm4
-    movq            mm4, [coeffs + 80]  ; -C1    C5      -C1     C5
-    pmaddwd         mm4, mm2            ; -C1R3+C5R1     -C1r3+C5r1
-    pmaddwd         mm2, [coeffs + 96]  ; -C5R3+C7R1     -C5r3+C7r1
-    movq            mm1, mm5            ; A2             a2
-    paddd           mm1, mm4            ; A2+B2          a2+b2
-    psubd           mm5, mm4            ; a2-B2          a2-b2
-    psrad           mm1, %6
-    psrad           mm5, %6
-    movq            mm4, mm6            ; A3             a3
-    paddd           mm6, mm2            ; A3+B3          a3+b3
-    psubd           mm4, mm2            ; a3-B3          a3-b3
-    psrad           mm6, %6
-    psrad           mm4, %6
-    packssdw        mm1, mm1            ; A2+B2  a2+b2
-    packssdw        mm6, mm6            ; A3+B3  a3+b3
-    movd      [32 + %5], mm1
-    packssdw        mm4, mm4            ; A3-B3  a3-b3
-    packssdw        mm5, mm5            ; A2-B2  a2-b2
-    movd      [48 + %5], mm6
-    movd      [64 + %5], mm4
-    movd      [80 + %5], mm5
+    mova             m0, %1             ; R4     R0      r4      r0
+    mova             m2, %3             ; R3     R1      r3      r1
+    mova             m4, [coeffs + 32]  ; C4     C4      C4      C4
+    pmaddwd          m4, m0             ; C4R4+C4R0      C4r4+C4r0
+    mova             m5, [coeffs + 48]  ; -C4    C4      -C4     C4
+    pmaddwd          m0, m5             ; -C4R4+C4R0     -C4r4+C4r0
+    mova             m6, m4             ; C4R4+C4R0      C4r4+C4r0
+    mova             m7, [coeffs + 96]  ; C3     C1      C3      C1
+    pmaddwd          m7, m2             ; C3R3+C1R1      C3r3+C1r1
+    mova             m5, m0             ; -C4R4+C4R0     -C4r4+C4r0
+    mova             m3, [coeffs + 128]
+    pmaddwd          m3, m2             ; -C7R3+C3R1     -C7r3+C3r1
+    paddd            m7, m4             ; A0+B0          a0+b0
+    paddd            m4, m4             ; 2A0            2a0
+    psubd            m4, m7             ; A0-B0          a0-b0
+    psrad            m7, %6
+    psrad            m4, %6
+    mova             m1, m0             ; A1             a1
+    paddd            m0, m3             ; A1+B1          a1+b1
+    psubd            m1, m3             ; A1-B1          a1-b1
+    psrad            m0, %6
+    psrad            m1, %6
+    packssdw         m7, m7             ; A0+B0  a0+b0
+    movq           [%5], m7
+    packssdw         m0, m0             ; A1+B1  a1+b1
+    movq      [16 + %5], m0
+    packssdw         m1, m1             ; A1-B1  a1-b1
+    movq      [96 + %5], m1
+    packssdw         m4, m4             ; A0-B0  a0-b0
+    movq     [112 + %5], m4
+    mova             m4, [coeffs + 160] ; -C1    C5      -C1     C5
+    pmaddwd          m4, m2             ; -C1R3+C5R1     -C1r3+C5r1
+    pmaddwd          m2, [coeffs + 192] ; -C5R3+C7R1     -C5r3+C7r1
+    mova             m1, m5             ; A2             a2
+    paddd            m1, m4             ; A2+B2          a2+b2
+    psubd            m5, m4             ; a2-B2          a2-b2
+    psrad            m1, %6
+    psrad            m5, %6
+    mova             m4, m6             ; A3             a3
+    paddd            m6, m2             ; A3+B3          a3+b3
+    psubd            m4, m2             ; a3-B3          a3-b3
+    psrad            m6, %6
+    psrad            m4, %6
+    packssdw         m1, m1             ; A2+B2  a2+b2
+    packssdw         m6, m6             ; A3+B3  a3+b3
+    movq      [32 + %5], m1
+    packssdw         m4, m4             ; A3-B3  a3-b3
+    packssdw         m5, m5             ; A2-B2  a2-b2
+    movq      [48 + %5], m6
+    movq      [64 + %5], m4
+    movq      [80 + %5], m5
 %endmacro
 
 %macro IDCT6 6
-    movq            mm0, [%1]           ; R4     R0      r4      r0
-    movq            mm1, [%2]           ; R6     R2      r6      r2
-    movq            mm4, [coeffs + 16]  ; C4     C4      C4      C4
-    pmaddwd         mm4, mm0            ; C4R4+C4R0      C4r4+C4r0
-    movq            mm5, [coeffs + 24]  ; -C4    C4      -C4     C4
-    pmaddwd         mm0, mm5            ; -C4R4+C4R0     -C4r4+C4r0
-    movq            mm5, [coeffs + 32]  ; C6     C2      C6      C2
-    pmaddwd         mm5, mm1            ; C6R6+C2R2      C6r6+C2r2
-    movq            mm6, [coeffs + 40]  ; -C2    C6      -C2     C6
-    pmaddwd         mm1, mm6            ; -C2R6+C6R2     -C2r6+C6r2
-    movq            mm6, mm4            ; C4R4+C4R0      C4r4+C4r0
-    paddd           mm4, mm5            ; A0             a0
-    psubd           mm6, mm5            ; A3             a3
-    movq            mm5, mm0            ; -C4R4+C4R0     -C4r4+C4r0
-    paddd           mm0, mm1            ; A1             a1
-    psubd           mm5, mm1            ; A2             a2
-    movq            mm2, [8 + %1]       ; R4     R0      r4      r0
-    movq            mm3, [8 + %2]       ; R6     R2      r6      r2
-    movq            mm1, [coeffs + 16]  ; C4     C4      C4      C4
-    pmaddwd         mm1, mm2            ; C4R4+C4R0      C4r4+C4r0
-    movq            mm7, [coeffs + 24]  ; -C4    C4      -C4     C4
-    pmaddwd         mm2, mm7            ; -C4R4+C4R0     -C4r4+C4r0
-    movq            mm7, [coeffs + 32]  ; C6     C2      C6      C2
-    pmaddwd         mm7, mm3            ; C6R6+C2R2      C6r6+C2r2
-    pmaddwd         mm3, [coeffs + 40]  ; -C2R6+C6R2     -C2r6+C6r2
-    paddd           mm7, mm1            ; A0             a0
-    paddd           mm1, mm1            ; 2C0            2c0
-    psubd           mm1, mm7            ; A3             a3
-    paddd           mm3, mm2            ; A1             a1
-    paddd           mm2, mm2            ; 2C1            2c1
-    psubd           mm2, mm3            ; A2             a2
-    psrad           mm4, %6
-    psrad           mm7, %6
-    psrad           mm3, %6
-    packssdw        mm4, mm7            ; A0     a0
-    movq           [%5], mm4
-    psrad           mm0, %6
-    packssdw        mm0, mm3            ; A1     a1
-    movq      [16 + %5], mm0
-    movq      [96 + %5], mm0
-    movq     [112 + %5], mm4
-    psrad           mm5, %6
-    psrad           mm6, %6
-    psrad           mm2, %6
-    packssdw        mm5, mm2            ; A2-B2  a2-b2
-    movq      [32 + %5], mm5
-    psrad           mm1, %6
-    packssdw        mm6, mm1            ; A3+B3  a3+b3
-    movq      [48 + %5], mm6
-    movq      [64 + %5], mm6
-    movq      [80 + %5], mm5
+    movq             m0, [%1]           ; R4     R0      r4      r0
+    movhps           m0, [%1 + 16]
+    movq             m1, [%2]           ; R6     R2      r6      r2
+    movhps           m1, [%2 + 16]
+    mova             m4, [coeffs + 32]  ; C4     C4      C4      C4
+    pmaddwd          m4, m0             ; C4R4+C4R0      C4r4+C4r0
+    mova             m5, [coeffs + 48]  ; -C4    C4      -C4     C4
+    pmaddwd          m0, m5             ; -C4R4+C4R0     -C4r4+C4r0
+    mova             m5, [coeffs + 64]  ; C6     C2      C6      C2
+    pmaddwd          m5, m1             ; C6R6+C2R2      C6r6+C2r2
+    mova             m6, [coeffs + 80]  ; -C2    C6      -C2     C6
+    pmaddwd          m1, m6             ; -C2R6+C6R2     -C2r6+C6r2
+    mova             m6, m4             ; C4R4+C4R0      C4r4+C4r0
+    paddd            m4, m5             ; A0             a0
+    psubd            m6, m5             ; A3             a3
+    mova             m5, m0             ; -C4R4+C4R0     -C4r4+C4r0
+    paddd            m0, m1             ; A1             a1
+    psubd            m5, m1             ; A2             a2
+    movq             m2, [%1 + 8]       ; R4     R0      r4      r0
+    movhps           m2, [%1 + 24]
+    movq             m3, [%2 + 8]       ; R6     R2      r6      r2
+    movhps           m3, [%2 + 24]
+    mova             m1, [coeffs + 32]  ; C4     C4      C4      C4
+    pmaddwd          m1, m2             ; C4R4+C4R0      C4r4+C4r0
+    mova             m7, [coeffs + 48]  ; -C4    C4      -C4     C4
+    pmaddwd          m2, m7             ; -C4R4+C4R0     -C4r4+C4r0
+    mova             m7, [coeffs + 64]  ; C6     C2      C6      C2
+    pmaddwd          m7, m3             ; C6R6+C2R2      C6r6+C2r2
+    pmaddwd          m3, [coeffs + 80]  ; -C2R6+C6R2     -C2r6+C6r2
+    paddd            m7, m1             ; A0             a0
+    paddd            m1, m1             ; 2C0            2c0
+    psubd            m1, m7             ; A3             a3
+    paddd            m3, m2             ; A1             a1
+    paddd            m2, m2             ; 2C1            2c1
+    psubd            m2, m3             ; A2             a2
+    psrad            m4, %6
+    psrad            m7, %6
+    psrad            m3, %6
+    packssdw         m4, m7             ; A0     a0
+    pshufd           m4, m4, 0xD8
+    mova           [%5], m4
+    psrad            m0, %6
+    packssdw         m0, m3             ; A1     a1
+    pshufd           m0, m0, 0xD8
+    mova      [16 + %5], m0
+    mova      [96 + %5], m0
+    mova     [112 + %5], m4
+    psrad            m5, %6
+    psrad            m6, %6
+    psrad            m2, %6
+    packssdw         m5, m2             ; A2-B2  a2-b2
+    pshufd           m5, m5, 0xD8
+    mova      [32 + %5], m5
+    psrad            m1, %6
+    packssdw         m6, m1             ; A3+B3  a3+b3
+    pshufd           m6, m6, 0xD8
+    mova      [48 + %5], m6
+    mova      [64 + %5], m6
+    mova      [80 + %5], m5
 %endmacro
 
 %macro IDCT7 6
-    movq            mm0, %1             ; R4     R0      r4      r0
-    movq            mm1, %2             ; R6     R2      r6      r2
-    movq            mm2, %3             ; R3     R1      r3      r1
-    movq            mm4, [coeffs + 16]  ; C4     C4      C4      C4
-    pmaddwd         mm4, mm0            ; C4R4+C4R0      C4r4+C4r0
-    movq            mm5, [coeffs + 24]  ; -C4    C4      -C4     C4
-    pmaddwd         mm0, mm5            ; -C4R4+C4R0     -C4r4+C4r0
-    movq            mm5, [coeffs + 32]  ; C6     C2      C6      C2
-    pmaddwd         mm5, mm1            ; C6R6+C2R2      C6r6+C2r2
-    movq            mm6, [coeffs + 40]  ; -C2    C6      -C2     C6
-    pmaddwd         mm1, mm6            ; -C2R6+C6R2     -C2r6+C6r2
-    movq            mm6, mm4            ; C4R4+C4R0      C4r4+C4r0
-    movq            mm7, [coeffs + 48]  ; C3     C1      C3      C1
-    pmaddwd         mm7, mm2            ; C3R3+C1R1      C3r3+C1r1
-    paddd           mm4, mm5            ; A0             a0
-    psubd           mm6, mm5            ; A3             a3
-    movq            mm5, mm0            ; -C4R4+C4R0     -C4r4+C4r0
-    paddd           mm0, mm1            ; A1             a1
-    psubd           mm5, mm1            ; A2             a2
-    movq            mm1, [coeffs + 64]
-    pmaddwd         mm1, mm2            ; -C7R3+C3R1     -C7r3+C3r1
-    paddd           mm7, mm4            ; A0+B0          a0+b0
-    paddd           mm4, mm4            ; 2A0            2a0
-    psubd           mm4, mm7            ; A0-B0          a0-b0
-    psrad           mm7, %6
-    psrad           mm4, %6
-    movq            mm3, mm0            ; A1             a1
-    paddd           mm0, mm1            ; A1+B1          a1+b1
-    psubd           mm3, mm1            ; A1-B1          a1-b1
-    psrad           mm0, %6
-    psrad           mm3, %6
-    packssdw        mm7, mm7            ; A0+B0  a0+b0
-    movd           [%5], mm7
-    packssdw        mm0, mm0            ; A1+B1  a1+b1
-    movd      [16 + %5], mm0
-    packssdw        mm3, mm3            ; A1-B1  a1-b1
-    movd      [96 + %5], mm3
-    packssdw        mm4, mm4            ; A0-B0  a0-b0
-    movd     [112 + %5], mm4
-    movq            mm4, [coeffs + 80]  ; -C1    C5      -C1     C5
-    pmaddwd         mm4, mm2            ; -C1R3+C5R1     -C1r3+C5r1
-    pmaddwd         mm2, [coeffs + 96]  ; -C5R3+C7R1     -C5r3+C7r1
-    movq            mm3, mm5            ; A2             a2
-    paddd           mm3, mm4            ; A2+B2          a2+b2
-    psubd           mm5, mm4            ; a2-B2          a2-b2
-    psrad           mm3, %6
-    psrad           mm5, %6
-    movq            mm4, mm6            ; A3             a3
-    paddd           mm6, mm2            ; A3+B3          a3+b3
-    psubd           mm4, mm2            ; a3-B3          a3-b3
-    psrad           mm6, %6
-    packssdw        mm3, mm3            ; A2+B2  a2+b2
-    movd      [32 + %5], mm3
-    psrad           mm4, %6
-    packssdw        mm6, mm6            ; A3+B3  a3+b3
-    movd      [48 + %5], mm6
-    packssdw        mm4, mm4            ; A3-B3  a3-b3
-    packssdw        mm5, mm5            ; A2-B2  a2-b2
-    movd      [64 + %5], mm4
-    movd      [80 + %5], mm5
+    mova             m0, %1             ; R4     R0      r4      r0
+    mova             m1, %2             ; R6     R2      r6      r2
+    mova             m2, %3             ; R3     R1      r3      r1
+    mova             m4, [coeffs + 32]  ; C4     C4      C4      C4
+    pmaddwd          m4, m0             ; C4R4+C4R0      C4r4+C4r0
+    mova             m5, [coeffs + 48]  ; -C4    C4      -C4     C4
+    pmaddwd          m0, m5             ; -C4R4+C4R0     -C4r4+C4r0
+    mova             m5, [coeffs + 64]  ; C6     C2      C6      C2
+    pmaddwd          m5, m1             ; C6R6+C2R2      C6r6+C2r2
+    mova             m6, [coeffs + 80]  ; -C2    C6      -C2     C6
+    pmaddwd          m1, m6             ; -C2R6+C6R2     -C2r6+C6r2
+    mova             m6, m4             ; C4R4+C4R0      C4r4+C4r0
+    mova             m7, [coeffs + 96]  ; C3     C1      C3      C1
+    pmaddwd          m7, m2             ; C3R3+C1R1      C3r3+C1r1
+    paddd            m4, m5             ; A0             a0
+    psubd            m6, m5             ; A3             a3
+    mova             m5, m0             ; -C4R4+C4R0     -C4r4+C4r0
+    paddd            m0, m1             ; A1             a1
+    psubd            m5, m1             ; A2             a2
+    mova             m1, [coeffs + 128]
+    pmaddwd          m1, m2             ; -C7R3+C3R1     -C7r3+C3r1
+    paddd            m7, m4             ; A0+B0          a0+b0
+    paddd            m4, m4             ; 2A0            2a0
+    psubd            m4, m7             ; A0-B0          a0-b0
+    psrad            m7, %6
+    psrad            m4, %6
+    mova             m3, m0             ; A1             a1
+    paddd            m0, m1             ; A1+B1          a1+b1
+    psubd            m3, m1             ; A1-B1          a1-b1
+    psrad            m0, %6
+    psrad            m3, %6
+    packssdw         m7, m7             ; A0+B0  a0+b0
+    movq           [%5], m7
+    packssdw         m0, m0             ; A1+B1  a1+b1
+    movq      [16 + %5], m0
+    packssdw         m3, m3             ; A1-B1  a1-b1
+    movq      [96 + %5], m3
+    packssdw         m4, m4             ; A0-B0  a0-b0
+    movq     [112 + %5], m4
+    mova             m4, [coeffs + 160] ; -C1    C5      -C1     C5
+    pmaddwd          m4, m2             ; -C1R3+C5R1     -C1r3+C5r1
+    pmaddwd          m2, [coeffs + 192] ; -C5R3+C7R1     -C5r3+C7r1
+    mova             m3, m5             ; A2             a2
+    paddd            m3, m4             ; A2+B2          a2+b2
+    psubd            m5, m4             ; a2-B2          a2-b2
+    psrad            m3, %6
+    psrad            m5, %6
+    mova             m4, m6             ; A3             a3
+    paddd            m6, m2             ; A3+B3          a3+b3
+    psubd            m4, m2             ; a3-B3          a3-b3
+    psrad            m6, %6
+    packssdw         m3, m3             ; A2+B2  a2+b2
+    movq      [32 + %5], m3
+    psrad            m4, %6
+    packssdw         m6, m6             ; A3+B3  a3+b3
+    movq      [48 + %5], m6
+    packssdw         m4, m4             ; A3-B3  a3-b3
+    packssdw         m5, m5             ; A2-B2  a2-b2
+    movq      [64 + %5], m4
+    movq      [80 + %5], m5
 %endmacro
 
 %macro IDCT8 6
-    movq            mm0, [%1]           ; R4     R0      r4      r0
-    movq            mm4, [coeffs + 16]  ; C4     C4      C4      C4
-    pmaddwd         mm4, mm0            ; C4R4+C4R0      C4r4+C4r0
-    movq            mm5, [coeffs + 24]  ; -C4    C4      -C4     C4
-    pmaddwd         mm0, mm5            ; -C4R4+C4R0     -C4r4+C4r0
-    psrad           mm4, %6
-    psrad           mm0, %6
-    movq            mm2, [8 + %1]       ; R4     R0      r4      r0
-    movq            mm1, [coeffs + 16]  ; C4     C4      C4      C4
-    pmaddwd         mm1, mm2            ; C4R4+C4R0      C4r4+C4r0
-    movq            mm7, [coeffs + 24]  ; -C4    C4      -C4     C4
-    pmaddwd         mm2, mm7            ; -C4R4+C4R0     -C4r4+C4r0
-    movq            mm7, [coeffs + 32]  ; C6     C2      C6      C2
-    psrad           mm1, %6
-    packssdw        mm4, mm1            ; A0     a0
-    movq           [%5], mm4
-    psrad           mm2, %6
-    packssdw        mm0, mm2            ; A1     a1
-    movq      [16 + %5], mm0
-    movq      [96 + %5], mm0
-    movq     [112 + %5], mm4
-    movq      [32 + %5], mm0
-    movq      [48 + %5], mm4
-    movq      [64 + %5], mm4
-    movq      [80 + %5], mm0
+    movq             m0, [%1]           ; R4     R0      r4      r0
+    movhps           m0, [%1 + 16]
+    mova             m4, [coeffs + 32]  ; C4     C4      C4      C4
+    pmaddwd          m4, m0             ; C4R4+C4R0      C4r4+C4r0
+    mova             m5, [coeffs + 48]  ; -C4    C4      -C4     C4
+    pmaddwd          m0, m5             ; -C4R4+C4R0     -C4r4+C4r0
+    psrad            m4, %6
+    psrad            m0, %6
+    movq             m2, [%1 + 8]       ; R4     R0      r4      r0
+    movhps           m2, [%1 + 24]
+    mova             m1, [coeffs + 32]  ; C4     C4      C4      C4
+    pmaddwd          m1, m2             ; C4R4+C4R0      C4r4+C4r0
+    mova             m7, [coeffs + 48]  ; -C4    C4      -C4     C4
+    pmaddwd          m2, m7             ; -C4R4+C4R0     -C4r4+C4r0
+    mova             m7, [coeffs + 64]  ; C6     C2      C6      C2
+    psrad            m1, %6
+    packssdw         m4, m1             ; A0     a0
+    pshufd           m4, m4, 0xD8
+    mova           [%5], m4
+    psrad            m2, %6
+    packssdw         m0, m2             ; A1     a1
+    pshufd           m0, m0, 0xD8
+    mova      [16 + %5], m0
+    mova      [96 + %5], m0
+    mova     [112 + %5], m4
+    mova      [32 + %5], m0
+    mova      [48 + %5], m4
+    mova      [64 + %5], m4
+    mova      [80 + %5], m0
 %endmacro
 
 %macro IDCT 0
@@ -710,9 +735,7 @@ SECTION .text
     Z_COND_IDCT  96, 104, 112, 120, rsp + 96, null, 11, %%1
 
     IDCT1 [rsp +  0], [rsp + 64], [rsp + 32], [rsp +  96], blockq +  0, 20
-    IDCT1 [rsp +  8], [rsp + 72], [rsp + 40], [rsp + 104], blockq +  4, 20
     IDCT1 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq +  8, 20
-    IDCT1 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20
     jmp %%9
 
     ALIGN 16
@@ -721,9 +744,7 @@ SECTION .text
     Z_COND_IDCT 96, 104, 112, 120, rsp + 96, null, 11, %%5
 
     IDCT2 [rsp +  0], [rsp + 64], [rsp + 32], [rsp +  96], blockq +  0, 20
-    IDCT2 [rsp +  8], [rsp + 72], [rsp + 40], [rsp + 104], blockq +  4, 20
     IDCT2 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq +  8, 20
-    IDCT2 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20
     jmp %%9
 
     ALIGN 16
@@ -731,9 +752,7 @@ SECTION .text
     Z_COND_IDCT 96, 104, 112, 120, rsp + 96, null, 11, %%7
 
     IDCT3 [rsp +  0], [rsp + 64], [rsp + 32], [rsp +  96], blockq +  0, 20
-    IDCT3 [rsp +  8], [rsp + 72], [rsp + 40], [rsp + 104], blockq +  4, 20
     IDCT3 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq +  8, 20
-    IDCT3 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20
     jmp %%9
 
     ALIGN 16
@@ -741,41 +760,33 @@ SECTION .text
     Z_COND_IDCT 96, 104, 112, 120, rsp + 96, null, 11, %%3
 
     IDCT4 [rsp +  0], [rsp + 64], [rsp + 32], [rsp +  96], blockq +  0, 20
-    IDCT4 [rsp +  8], [rsp + 72], [rsp + 40], [rsp + 104], blockq +  4, 20
     IDCT4 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq +  8, 20
-    IDCT4 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20
     jmp %%9
 
     ALIGN 16
     %%3:
 
     IDCT5 [rsp +  0], [rsp + 64], [rsp + 32], [rsp +  96], blockq +  0, 20
-    IDCT5 [rsp +  8], [rsp + 72], [rsp + 40], [rsp + 104], blockq +  4, 20
     IDCT5 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq +  8, 20
-    IDCT5 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20
     jmp %%9
 
     ALIGN 16
     %%5:
 
     IDCT6 rsp +  0, rsp + 64, rsp + 32, rsp +  96, blockq +  0, 20
-    IDCT6 rsp + 16, rsp + 80, rsp + 48, rsp + 112, blockq +  8, 20
     jmp %%9
 
     ALIGN 16
     %%1:
 
     IDCT7 [rsp +  0], [rsp + 64], [rsp + 32], [rsp +  96], blockq +  0, 20
-    IDCT7 [rsp +  8], [rsp + 72], [rsp + 40], [rsp + 104], blockq +  4, 20
     IDCT7 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq +  8, 20
-    IDCT7 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20
     jmp %%9
 
     ALIGN 16
     %%7:
 
     IDCT8 rsp +  0, rsp + 64, rsp + 32, rsp +  96, blockq +  0, 20
-    IDCT8 rsp + 16, rsp + 80, rsp + 48, rsp + 112, blockq +  8, 20
 
     %%9:
 %endmacro
@@ -805,15 +816,12 @@ SECTION .text
     movhps     [pixelsq+lsizeq], m0
 %endmacro
 
-INIT_MMX mmx
+INIT_XMM sse2
 
 cglobal simple_idct, 1, 2, 8, 128, block, t0
     IDCT
-    emms
 RET
 
-INIT_XMM sse2
-
 cglobal simple_idct_put, 3, 5, 8, 128, pixels, lsize, block, lsize3, t0
     IDCT
     lea lsize3q, [lsizeq*3]
diff --git a/libavcodec/x86/simple_idct.h b/libavcodec/x86/simple_idct.h
index 9b64cfe9bc..c9ba6aedaf 100644
--- a/libavcodec/x86/simple_idct.h
+++ b/libavcodec/x86/simple_idct.h
@@ -22,10 +22,7 @@
 #include <stddef.h>
 #include <stdint.h>
 
-void ff_simple_idct_mmx(int16_t *block);
-void ff_simple_idct_add_mmx(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
-void ff_simple_idct_put_mmx(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
-
+void ff_simple_idct_sse2(int16_t *block);
 void ff_simple_idct_add_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
 void ff_simple_idct_put_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
 

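The recurring packssdw/pshufd 0xD8 pair in IDCT6 and IDCT8 above packs two
vectors of four 32-bit sums into eight saturated 16-bit words and then swaps
the middle dword lanes, reproducing the pairwise layout the two old 64-bit
MMX stores wrote; it is also why each pair of per-half IDCT call sites could
be collapsed into one. A minimal C intrinsics sketch of the idiom
(pack_interleave is a hypothetical helper name, not part of the patch):

    #include <emmintrin.h>

    /* _mm_packs_epi32 yields words a0 a1 a2 a3 b0 b1 b2 b3; shuffling the
     * 32-bit lanes with 0xD8 (= _MM_SHUFFLE(3,1,2,0)) reorders them to
     * a0 a1 b0 b1 a2 a3 b2 b3, i.e. pairs from the two inputs interleaved. */
    static __m128i pack_interleave(__m128i a, __m128i b)
    {
        __m128i packed = _mm_packs_epi32(a, b);
        return _mm_shuffle_epi32(packed, _MM_SHUFFLE(3, 1, 2, 0)); /* 0xD8 */
    }
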
commit 625f5c993cf99a2adf446d8eba7b947999f14267
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Wed Nov 5 04:04:02 2025 +0100
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Sat Nov 8 18:48:54 2025 +0100

    avcodec/tests/dct: Remove unnecessary emms_c
    
    Unnecessary since the Xvid IDCT no longer uses MMX registers at all.
    (Notice that the simple MMX IDCT issues emms and is therefore
    ABI-compliant.)
    
    Reviewed-by: Lynne <[email protected]>
    Signed-off-by: Andreas Rheinhardt <[email protected]>

diff --git a/libavcodec/tests/dct.c b/libavcodec/tests/dct.c
index 784b49276c..eb74f3559e 100644
--- a/libavcodec/tests/dct.c
+++ b/libavcodec/tests/dct.c
@@ -37,7 +37,6 @@
 
 #include "libavutil/cpu.h"
 #include "libavutil/common.h"
-#include "libavutil/emms.h"
 #include "libavutil/internal.h"
 #include "libavutil/lfg.h"
 #include "libavutil/mem_internal.h"
@@ -212,7 +211,6 @@ static int dct_error(const struct algo *dct, int test, int is_idct, int speed, c
         permute(block, block1, dct->perm_type);
 
         dct->func(block);
-        emms_c();
 
         if (!strcmp(dct->name, "IJG-AAN-INT")) {
             for (i = 0; i < 64; i++) {
@@ -287,7 +285,6 @@ static int dct_error(const struct algo *dct, int test, int is_idct, int speed, c
             memcpy(block, block1, sizeof(block));
             dct->func(block);
         }
-        emms_c();
         it1 += NB_ITS_SPEED;
         ti1 = av_gettime_relative() - ti;
     } while (ti1 < 1000000);
@@ -449,7 +446,6 @@ static void idct248_error(const char *name,
                 block[i] = block1[i];
             idct248_put(img_dest, 8, block);
         }
-        emms_c();
         it1 += NB_ITS_SPEED;
         ti1 = av_gettime_relative() - ti;
     } while (ti1 < 1000000);

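For context, emms_c() (from libavutil/emms.h) executes EMMS after code that
may have used MMX, since MMX aliases the x87 register file. A rough C sketch
of the pattern these removals retire (illustrative only; run_idct_legacy is
a hypothetical name, not from the tree):

    #include <stdint.h>
    #include "libavutil/emms.h"

    /* Old pattern: any call that might execute MMX instructions had to be
     * followed by emms_c() before floating-point code could run again. */
    static void run_idct_legacy(void (*idct)(int16_t *), int16_t *block)
    {
        idct(block); /* may leave the FPU tag word in MMX state */
        emms_c();    /* resets it; a no-op on non-x86 targets */
    }

With no MMX left in the tested IDCTs, the call became dead code.
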
commit a26b99f7933cffd209342905669a6ffa2a537faf
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Wed Nov 5 03:58:12 2025 +0100
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Sat Nov 8 18:48:54 2025 +0100

    avcodec/x86/xvididct: Remove remnants of MMX
    
    The non-MMX code only uses the first six rounders.
    
    Reviewed-by: Lynne <[email protected]>
    Signed-off-by: Andreas Rheinhardt <[email protected]>

diff --git a/libavcodec/x86/xvididct.asm b/libavcodec/x86/xvididct.asm
index 0daa2edd42..c3bfabb955 100644
--- a/libavcodec/x86/xvididct.asm
+++ b/libavcodec/x86/xvididct.asm
@@ -24,7 +24,7 @@
 ;
 ; More details at http://skal.planet-d.net/coding/dct.html
 ;
-; =======     MMX and XMM forward discrete cosine transform     =======
+; ===========     XMM forward discrete cosine transform     ===========
 ;
 ; Copyright(C) 2001 Peter Ross <[email protected]>
 ;
@@ -67,7 +67,6 @@
 %include "libavutil/x86/x86util.asm"
 
 SECTION_RODATA
-; Similar to tg_1_16 in MMX code
 tan1:   times 8 dw 13036
 tan2:   times 8 dw 27146
 tan3:   times 8 dw 43790
@@ -91,7 +90,6 @@ iTab4:  dw 0x4b42, 0x6254, 0xb4be, 0x9dac, 0x4b42, 0xd746, 0x4b42, 0xd746
         dw 0x3b21, 0x14c3, 0x587e, 0xeb3d, 0x14c3, 0x587e, 0x14c3, 0xc4df
         dw 0x6862, 0x587e, 0x979e, 0xc4df, 0x3b21, 0x979e, 0x587e, 0x979e
 
-; Similar to rounder_0 in MMX code
 ; 4 first similar, then: 4*8->6*16  5*8->4*16  6/7*8->5*16
 walkenIdctRounders: times 4 dd 65536
                     times 4 dd  3597
@@ -99,7 +97,6 @@ walkenIdctRounders: times 4 dd 65536
                     times 4 dd  1203
                     times 4 dd   120
                     times 4 dd   512
-                    times 2 dd     0
 
 SECTION .text
 

commit b03b09aeda1cd890f71f9e6b0bec0a062af4e3be
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Wed Nov 5 02:59:59 2025 +0100
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Sat Nov 8 18:48:54 2025 +0100

    avcodec/x86/xvididct: Don't use MMX registers in SSE2 function
    
    It is highly surprising and would necessitate emms in order to be
    ABI-compliant; it is better just not to use them in the first place.
    
    Reviewed-by: Lynne <[email protected]>
    Signed-off-by: Andreas Rheinhardt <[email protected]>

diff --git a/libavcodec/x86/xvididct.asm b/libavcodec/x86/xvididct.asm
index 4197551cdf..0daa2edd42 100644
--- a/libavcodec/x86/xvididct.asm
+++ b/libavcodec/x86/xvididct.asm
@@ -101,8 +101,6 @@ walkenIdctRounders: times 4 dd 65536
                     times 4 dd   512
                     times 2 dd     0
 
-pb_127: times 8 db 127
-
 SECTION .text
 
 ; Temporary storage before the column pass
@@ -167,36 +165,47 @@ SECTION .text
 %define TAN1  xmm2
 %endif
 
-%macro JZ  2
-    test      %1, %1
+%macro JZ  3
+    test    %1%3, %1%3
     jz       .%2
 %endmacro
 
-%macro JNZ  2
-    test      %1, %1
+%macro JNZ  3
+    test    %1%3, %1%3
     jnz      .%2
 %endmacro
 
 %macro TEST_ONE_ROW 4 ; src, reg, clear, arg
     %3        %4
-    movq     mm1, [%1]
-    por      mm1, [%1 + 8]
-    paddusb  mm1, mm0
-    pmovmskb  %2, mm1
+    mova       m1, [%1]
+    ; due to signed saturation, m1 is all zero iff m1 is all zero after packing
+    packsswb   m1, m1
+%if ARCH_X86_64
+    movq       %2, m1
+%else
+    packsswb   m1, m1
+    movd       %2, m1
+%endif
 %endmacro
 
 ;row1, row2, reg1, reg2, clear1, arg1, clear2, arg2
 %macro  TEST_TWO_ROWS  8
     %5         %6
     %7         %8
-    movq      mm1, [%1 + 0]
-    por       mm1, [%1 + 8]
-    movq      mm2, [%2 + 0]
-    por       mm2, [%2 + 8]
-    paddusb   mm1, mm0
-    paddusb   mm2, mm0
-    pmovmskb   %3, mm1
-    pmovmskb   %4, mm2
+    mova       m1, [%1]
+    packsswb   m1, [%2]
+    packsswb   m1, m1
+%if ARCH_X86_64
+    movq       %4, m1
+    mov       %3d, %4d
+    shr       %4q, 32
+%else
+    packsswb   m1, m1
+    movd       %3, m1
+    mov        %4, %3
+    shr        %4, 16
+    and        %3, 0xFFFF
+%endif
 %endmacro
 
 ; IDCT pass on rows.
@@ -499,16 +508,16 @@ SECTION .text
 
 %macro IDCT_SSE2 1 ; 0=normal  1=put  2=add
 %if %1 == 0 || ARCH_X86_32
-    %define GPR0  r1d
-    %define GPR1  r2d
-    %define GPR2  r3d
-    %define GPR3  r4d
+    %define GPR0  r1
+    %define GPR1  r2
+    %define GPR2  r3
+    %define GPR3  r4
     %define NUM_GPRS 5
 %else
-    %define GPR0  r3d
-    %define GPR1  r4d
-    %define GPR2  r5d
-    %define GPR3  r6d
+    %define GPR0  r3
+    %define GPR1  r4
+    %define GPR2  r5
+    %define GPR3  r6
     %define NUM_GPRS 7
 %endif
 %if %1 == 0
@@ -527,34 +536,33 @@ cglobal xvid_idct_add, 0, NUM_GPRS, 8+7*ARCH_X86_64, dest, stride, block
     %xdefine BLOCK r0q
     %endif
 %endif
-    movq           mm0, [pb_127]
     iMTX_MULT      BLOCK + 0*16, iTab1, PUT_EVEN, ROW0, 0*16
     iMTX_MULT      BLOCK + 1*16, iTab2, PUT_ODD, ROW1,  1*16
     iMTX_MULT      BLOCK + 2*16, iTab3, PUT_EVEN, ROW2, 2*16
 
     TEST_TWO_ROWS  BLOCK + 3*16, BLOCK + 4*16, GPR0, GPR1, CLEAR_ODD, ROW3, CLEAR_EVEN, ROW4 ; a, c
-    JZ   GPR0, col1
+    JZ   GPR0, col1, d
     iMTX_MULT      BLOCK + 3*16, iTab4, PUT_ODD, ROW3,  3*16
 .col1:
     TEST_TWO_ROWS  BLOCK + 5*16, BLOCK + 6*16, GPR0, GPR2, CLEAR_ODD, ROW5, CLEAR_EVEN, ROW6 ; a, d
     TEST_ONE_ROW   BLOCK + 7*16, GPR3, CLEAR_ODD, ROW7 ; esi
 
     iLLM_HEAD
-    JNZ  GPR1, 2
-    JNZ  GPR0, 3
-    JNZ  GPR2, 4
-    JNZ  GPR3, 5
+    JNZ  GPR1, 2, d
+    JNZ  GPR0, 3, d
+    JNZ  GPR2, 4, d
+    JNZ  GPR3, 5, q
     iLLM_PASS_SPARSE BLOCK, %1
     jmp .6
 .2:
     iMTX_MULT     BLOCK + 4*16, iTab1, PUT_EVEN, ROW4
 .3:
     iMTX_MULT     BLOCK + 5*16, iTab4, PUT_ODD, ROW5,  4*16
-    JZ   GPR2, col2
+    JZ   GPR2, col2, d
 .4:
     iMTX_MULT     BLOCK + 6*16, iTab3, PUT_EVEN, ROW6, 5*16
 .col2:
-    JZ   GPR3, col3
+    JZ   GPR3, col3, q
 .5:
     iMTX_MULT     BLOCK + 7*16, iTab2, PUT_ODD, ROW7,  5*16
 .col3:

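The new TEST_ONE_ROW/TEST_TWO_ROWS above replace the old pb_127/pmovmskb
trick with signed packing: packsswb maps 0 to 0 and every nonzero word to a
nonzero byte, so the packed half is zero iff the whole row is zero. A hedged
intrinsics sketch of the one-row test on x86-64 (row_nonzero is a
hypothetical helper, not part of the patch):

    #include <stdint.h>
    #include <emmintrin.h>

    /* Nonzero iff any of the eight int16_t coefficients is nonzero. */
    static int row_nonzero(const int16_t *row)
    {
        __m128i v = _mm_loadu_si128((const __m128i *)row);
        v = _mm_packs_epi16(v, v); /* saturating 16->8 bit pack */
        return (uint64_t)_mm_cvtsi128_si64(v) != 0;
    }
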
commit a7013f813c7cd13b3ccb066bd0cb1231b9749818
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Tue Nov 4 17:53:30 2025 +0100
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Sat Nov 8 18:48:54 2025 +0100

    avcodec/tests/x86/dct: Test 32bit simple idct
    
    The test was removed in bfb28b5ce89f3e950214b67ea95b45e3355c2caf,
    when the MMX idctdsp functions overridden by SSE2 were removed;
    that commit completely disabled ff_simple_idct_mmx() on x64, so the
    test should have been disabled on x64 instead of being removed.
    
    Reviewed-by: Lynne <[email protected]>
    Signed-off-by: Andreas Rheinhardt <[email protected]>

diff --git a/libavcodec/tests/x86/dct.c b/libavcodec/tests/x86/dct.c
index 7800abc7f7..e864de6904 100644
--- a/libavcodec/tests/x86/dct.c
+++ b/libavcodec/tests/x86/dct.c
@@ -88,6 +88,10 @@ static const struct algo idct_tab_arch[] = {
     { "SIMPLE10-AVX",   ff_simple_idct10_avx,  FF_IDCT_PERM_TRANSPOSE, 
AV_CPU_FLAG_AVX},
     { "SIMPLE12-AVX",   ff_simple_idct12_avx,  FF_IDCT_PERM_TRANSPOSE, 
AV_CPU_FLAG_AVX,  1 },
 #endif
+#else
+#if HAVE_SSE2_EXTERNAL
+    { "SIMPLE-SSE2",   ff_simple_idct_mmx,  FF_IDCT_PERM_SIMPLE, 
AV_CPU_FLAG_SSE2},
+#endif
 #endif
 #endif
     { 0 }

commit 86f8adc58e0443024f5ad992ecdb959f0d6d8d95
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Tue Nov 4 13:56:01 2025 +0100
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Sat Nov 8 18:48:54 2025 +0100

    avcodec/x86/idctdsp_init: Fix IDCT permutation for 32bit without SSE2
    
    bfb28b5ce89f3e950214b67ea95b45e3355c2caf removed the MMX idct_put
    and idct_add functions, because they were overridden by SSE2 versions
    (which use SSE2 only for the put/add part, not the actual IDCT).
    This meant that the idct functions were no longer set in unison on
    the MMX path, so the permutation that is meant to apply to all three
    was incorrect on 32-bit systems if SSE2 is unavailable/disabled.
    
    Fix this by setting the MMX version only if SSE2 is enabled.
    
    (No one complained, so apparently no one runs a new FFmpeg
    on non-SSE2-capable systems.)
    
    Reviewed-by: Lynne <[email protected]>
    Signed-off-by: Andreas Rheinhardt <[email protected]>

diff --git a/libavcodec/x86/idctdsp_init.c b/libavcodec/x86/idctdsp_init.c
index 2d165b975b..281d143ade 100644
--- a/libavcodec/x86/idctdsp_init.c
+++ b/libavcodec/x86/idctdsp_init.c
@@ -65,18 +65,6 @@ av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
 {
     int cpu_flags = av_get_cpu_flags();
 
-#if ARCH_X86_32
-    if (EXTERNAL_MMX(cpu_flags)) {
-        if (!high_bit_depth &&
-            avctx->lowres == 0 &&
-            (avctx->idct_algo == FF_IDCT_AUTO ||
-                avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
-                avctx->idct_algo == FF_IDCT_SIMPLEMMX)) {
-                c->idct      = ff_simple_idct_mmx;
-        }
-    }
-#endif
-
     if (EXTERNAL_SSE2(cpu_flags)) {
         c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_sse2;
         c->put_pixels_clamped        = ff_put_pixels_clamped_sse2;
@@ -88,6 +76,7 @@ av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
             (avctx->idct_algo == FF_IDCT_AUTO ||
                 avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
                 avctx->idct_algo == FF_IDCT_SIMPLEMMX)) {
+                c->idct      = ff_simple_idct_mmx;
                 c->idct_put  = ff_simple_idct_put_sse2;
                 c->idct_add  = ff_simple_idct_add_sse2;
                 c->perm_type = FF_IDCT_PERM_SIMPLE;

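The underlying rule: perm_type names the coefficient order that all of idct,
idct_put and idct_add expect, so the three callbacks are only valid when set
as a unit. A condensed sketch of that invariant (mini_idct_ctx and
set_simple_idct are hypothetical stand-ins for IDCTDSPContext and the init
code; prototypes as in simple_idct.h above):

    #include <stddef.h>
    #include <stdint.h>

    void ff_simple_idct_mmx(int16_t *block);
    void ff_simple_idct_put_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
    void ff_simple_idct_add_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block);

    struct mini_idct_ctx {
        void (*idct)(int16_t *block);
        void (*idct_put)(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
        void (*idct_add)(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
        int perm_type; /* stands in for FF_IDCT_PERM_SIMPLE etc. */
    };

    /* Setting only a subset (as after bfb28b5) leaves perm_type describing
     * an order that one of the remaining callbacks does not use. */
    static void set_simple_idct(struct mini_idct_ctx *c, int perm_simple)
    {
        c->idct      = ff_simple_idct_mmx;      /* 32-bit x86 only */
        c->idct_put  = ff_simple_idct_put_sse2;
        c->idct_add  = ff_simple_idct_add_sse2;
        c->perm_type = perm_simple;
    }
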
-----------------------------------------------------------------------

Summary of changes:
 libavcodec/dvdec.c             |    3 -
 libavcodec/mjpegdec.c          |    2 -
 libavcodec/tests/dct.c         |    4 -
 libavcodec/tests/x86/dct.c     |    4 +
 libavcodec/x86/idctdsp_init.c  |   13 +-
 libavcodec/x86/simple_idct.asm | 1244 ++++++++++++++++++++--------------------
 libavcodec/x86/simple_idct.h   |    5 +-
 libavcodec/x86/xvididct.asm    |   81 +--
 8 files changed, 675 insertions(+), 681 deletions(-)


hooks/post-receive
-- 
