The branch 'master' has been updated
via 32f32537b6242364b42140443bd8e03a0c2a0b92 (commit)
via ade54335b2feea2b8c661449d2bf6eaced3fb48c (commit)
via 625f5c993cf99a2adf446d8eba7b947999f14267 (commit)
via a26b99f7933cffd209342905669a6ffa2a537faf (commit)
via b03b09aeda1cd890f71f9e6b0bec0a062af4e3be (commit)
via a7013f813c7cd13b3ccb066bd0cb1231b9749818 (commit)
via 86f8adc58e0443024f5ad992ecdb959f0d6d8d95 (commit)
from d4e0d5ed48aa9c0e11b9ddeea8c2d14632314089 (commit)
- Log -----------------------------------------------------------------
commit 32f32537b6242364b42140443bd8e03a0c2a0b92
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Wed Nov 5 05:08:28 2025 +0100
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Sat Nov 8 18:48:54 2025 +0100
avcodec/dvdec,mjpegdec: Remove emms_c
It is no longer necessary now that the IDCTDSP is always ABI-compliant
(and free of MMX).
Reviewed-by: Lynne <[email protected]>
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavcodec/dvdec.c b/libavcodec/dvdec.c
index 242708c70a..4799ec96dc 100644
--- a/libavcodec/dvdec.c
+++ b/libavcodec/dvdec.c
@@ -36,7 +36,6 @@
*/
#include "libavutil/avassert.h"
-#include "libavutil/emms.h"
#include "libavutil/internal.h"
#include "libavutil/mem_internal.h"
#include "libavutil/thread.h"
@@ -683,8 +682,6 @@ static int dvvideo_decode_frame(AVCodecContext *avctx,
AVFrame *frame,
avctx->execute(avctx, dv_decode_video_segment, s->work_chunks, NULL,
dv_work_pool_size(s->sys), sizeof(DVwork_chunk));
- emms_c();
-
/* return image */
*got_frame = 1;
diff --git a/libavcodec/mjpegdec.c b/libavcodec/mjpegdec.c
index 5fd77073da..fb39c4e9fd 100644
--- a/libavcodec/mjpegdec.c
+++ b/libavcodec/mjpegdec.c
@@ -33,7 +33,6 @@
#include "config_components.h"
#include "libavutil/attributes.h"
-#include "libavutil/emms.h"
#include "libavutil/imgutils.h"
#include "libavutil/avassert.h"
#include "libavutil/mem.h"
@@ -1824,7 +1823,6 @@ next_field:
}
}
- emms_c();
return 0;
out_of_range:
av_log(s->avctx, AV_LOG_ERROR, "decode_sos: ac/dc index out of range\n");
commit ade54335b2feea2b8c661449d2bf6eaced3fb48c
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Tue Nov 4 14:25:54 2025 +0100
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Sat Nov 8 18:48:54 2025 +0100
avcodec/x86/simple_idct: Port to SSE2
Before this commit, the (32-bit only) simple idct came in three
versions: A pure MMX IDCT and idct-put and idct-add versions
which use SSE2 at the put and add stage, but still use pure MMX
for the actual IDCT.
This commit ports said IDCT to SSE2; this was entirely trivial
for the IDCT1-5 and IDCT7 parts (where one can directly use
the full register width) and was easy for IDCT6 and IDCT8
(involving a few movhps and pshufds). Unfortunately, DC_COND_INIT
and Z_COND_INIT still use only the lower half of the registers.
This saved 4658B here; the benchmarking option of the dct test tool
showed a 15% speedup.
Reviewed-by: Lynne <[email protected]>
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavcodec/tests/x86/dct.c b/libavcodec/tests/x86/dct.c
index e864de6904..f879ab1d42 100644
--- a/libavcodec/tests/x86/dct.c
+++ b/libavcodec/tests/x86/dct.c
@@ -90,7 +90,7 @@ static const struct algo idct_tab_arch[] = {
#endif
#else
#if HAVE_SSE2_EXTERNAL
- { "SIMPLE-SSE2", ff_simple_idct_mmx, FF_IDCT_PERM_SIMPLE,
AV_CPU_FLAG_SSE2},
+ { "SIMPLE-SSE2", ff_simple_idct_sse2, FF_IDCT_PERM_SIMPLE,
AV_CPU_FLAG_SSE2},
#endif
#endif
#endif
diff --git a/libavcodec/x86/idctdsp_init.c b/libavcodec/x86/idctdsp_init.c
index 281d143ade..9c7f235b3f 100644
--- a/libavcodec/x86/idctdsp_init.c
+++ b/libavcodec/x86/idctdsp_init.c
@@ -76,7 +76,7 @@ av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c,
AVCodecContext *avctx,
(avctx->idct_algo == FF_IDCT_AUTO ||
avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
avctx->idct_algo == FF_IDCT_SIMPLEMMX)) {
- c->idct = ff_simple_idct_mmx;
+ c->idct = ff_simple_idct_sse2;
c->idct_put = ff_simple_idct_put_sse2;
c->idct_add = ff_simple_idct_add_sse2;
c->perm_type = FF_IDCT_PERM_SIMPLE;
diff --git a/libavcodec/x86/simple_idct.asm b/libavcodec/x86/simple_idct.asm
index c79519372a..0dc03738e4 100644
--- a/libavcodec/x86/simple_idct.asm
+++ b/libavcodec/x86/simple_idct.asm
@@ -1,5 +1,5 @@
;
-; Simple IDCT MMX
+; Simple IDCT SSE2
;
; Copyright (c) 2001, 2002 Michael Niedermayer <[email protected]>
;
@@ -30,8 +30,8 @@ SECTION_RODATA
%if ARCH_X86_32
cextern pb_80
+d40000: dd 4 << 16, 0 ; must be 16-byte aligned
wm1010: dw 0, 0xffff, 0, 0xffff
-d40000: dd 4 << 16, 0
; 23170.475006
; 22725.260826
@@ -57,650 +57,675 @@ d40000: dd 4 << 16, 0
coeffs:
dw 1 << (ROW_SHIFT - 1), 0
dw 1 << (ROW_SHIFT - 1), 0
+ dw 1 << (ROW_SHIFT - 1), 0
+ dw 1 << (ROW_SHIFT - 1), 0
+ dw 1 << (ROW_SHIFT - 1), 1
+ dw 1 << (ROW_SHIFT - 1), 0
dw 1 << (ROW_SHIFT - 1), 1
dw 1 << (ROW_SHIFT - 1), 0
- dw C4, C4, C4, C4
- dw C4, -C4, C4, -C4
+ dw C4, C4, C4, C4, C4, C4, C4, C4
+ dw C4, -C4, C4, -C4, C4, -C4, C4, -C4
- dw C2, C6, C2, C6
- dw C6, -C2, C6, -C2
+ dw C2, C6, C2, C6, C2, C6, C2, C6
+ dw C6, -C2, C6, -C2, C6, -C2, C6, -C2
- dw C1, C3, C1, C3
- dw C5, C7, C5, C7
+ dw C1, C3, C1, C3, C1, C3, C1, C3
+ dw C5, C7, C5, C7, C5, C7, C5, C7
- dw C3, -C7, C3, -C7
- dw -C1, -C5, -C1, -C5
+ dw C3, -C7, C3, -C7, C3, -C7, C3, -C7
+ dw -C1, -C5, -C1, -C5, -C1, -C5, -C1, -C5
- dw C5, -C1, C5, -C1
- dw C7, C3, C7, C3
+ dw C5, -C1, C5, -C1, C5, -C1, C5, -C1
+ dw C7, C3, C7, C3, C7, C3, C7, C3
- dw C7, -C5, C7, -C5
- dw C3, -C1, C3, -C1
+ dw C7, -C5, C7, -C5, C7, -C5, C7, -C5
+ dw C3, -C1, C3, -C1, C3, -C1, C3, -C1
SECTION .text
%macro DC_COND_IDCT 7
- movq mm0, [blockq + %1] ; R4 R0 r4 r0
- movq mm1, [blockq + %2] ; R6 R2 r6 r2
- movq mm2, [blockq + %3] ; R3 R1 r3 r1
- movq mm3, [blockq + %4] ; R7 R5 r7 r5
- movq mm4, [wm1010]
- pand mm4, mm0
- por mm4, mm1
- por mm4, mm2
- por mm4, mm3
- packssdw mm4, mm4
- movd t0d, mm4
+ movq m0, [blockq + %1] ; R4 R0 r4 r0
+ movq m1, [blockq + %2] ; R6 R2 r6 r2
+ movq m2, [blockq + %3] ; R3 R1 r3 r1
+ movq m3, [blockq + %4] ; R7 R5 r7 r5
+ movq m4, [wm1010]
+ pand m4, m0
+ por m4, m1
+ por m4, m2
+ por m4, m3
+ packssdw m4, m4
+ movd t0d, m4
or t0d, t0d
jz %%1
- movq mm4, [coeffs + 16] ; C4 C4 C4 C4
- pmaddwd mm4, mm0 ; C4R4+C4R0 C4r4+C4r0
- movq mm5, [coeffs + 24] ; -C4 C4 -C4 C4
- pmaddwd mm0, mm5 ; -C4R4+C4R0 -C4r4+C4r0
- movq mm5, [coeffs + 32] ; C6 C2 C6 C2
- pmaddwd mm5, mm1 ; C6R6+C2R2 C6r6+C2r2
- movq mm6, [coeffs + 40] ; -C2 C6 -C2 C6
- pmaddwd mm1, mm6 ; -C2R6+C6R2 -C2r6+C6r2
- movq mm7, [coeffs + 48] ; C3 C1 C3 C1
- pmaddwd mm7, mm2 ; C3R3+C1R1 C3r3+C1r1
- paddd mm4, [coeffs + 8]
- movq mm6, mm4 ; C4R4+C4R0 C4r4+C4r0
- paddd mm4, mm5 ; A0 a0
- psubd mm6, mm5 ; A3 a3
- movq mm5, [coeffs + 56] ; C7 C5 C7 C5
- pmaddwd mm5, mm3 ; C7R7+C5R5 C7r7+C5r5
- paddd mm0, [coeffs + 8]
- paddd mm1, mm0 ; A1 a1
- paddd mm0, mm0
- psubd mm0, mm1 ; A2 a2
- pmaddwd mm2, [coeffs + 64] ; -C7R3+C3R1 -C7r3+C3r1
- paddd mm7, mm5 ; B0 b0
- movq mm5, [coeffs + 72] ; -C5 -C1 -C5 -C1
- pmaddwd mm5, mm3 ; -C5R7-C1R5 -C5r7-C1r5
- paddd mm7, mm4 ; A0+B0 a0+b0
- paddd mm4, mm4 ; 2A0 2a0
- psubd mm4, mm7 ; A0-B0 a0-b0
- paddd mm5, mm2 ; B1 b1
- psrad mm7, %7
- psrad mm4, %7
- movq mm2, mm1 ; A1 a1
- paddd mm1, mm5 ; A1+B1 a1+b1
- psubd mm2, mm5 ; A1-B1 a1-b1
- psrad mm1, %7
- psrad mm2, %7
- packssdw mm7, mm1 ; A1+B1 a1+b1 A0+B0 a0+b0
- packssdw mm2, mm4 ; A0-B0 a0-b0 A1-B1 a1-b1
- movq [%5], mm7
- movq mm1, [blockq + %3] ; R3 R1 r3 r1
- movq mm4, [coeffs + 80] ; -C1 C5 -C1 C5
- movq [24 + %5], mm2
- pmaddwd mm4, mm1 ; -C1R3+C5R1 -C1r3+C5r1
- movq mm7, [coeffs + 88] ; C3 C7 C3 C7
- pmaddwd mm1, [coeffs + 96] ; -C5R3+C7R1 -C5r3+C7r1
- pmaddwd mm7, mm3 ; C3R7+C7R5 C3r7+C7r5
- movq mm2, mm0 ; A2 a2
- pmaddwd mm3, [coeffs + 104] ; -C1R7+C3R5 -C1r7+C3r5
- paddd mm4, mm7 ; B2 b2
- paddd mm2, mm4 ; A2+B2 a2+b2
- psubd mm0, mm4 ; a2-B2 a2-b2
- psrad mm2, %7
- psrad mm0, %7
- movq mm4, mm6 ; A3 a3
- paddd mm3, mm1 ; B3 b3
- paddd mm6, mm3 ; A3+B3 a3+b3
- psubd mm4, mm3 ; a3-B3 a3-b3
- psrad mm6, %7
- packssdw mm2, mm6 ; A3+B3 a3+b3 A2+B2 a2+b2
- movq [8 + %5], mm2
- psrad mm4, %7
- packssdw mm4, mm0 ; A2-B2 a2-b2 A3-B3 a3-b3
- movq [16 + %5], mm4
+ movq m4, [coeffs + 32] ; C4 C4 C4 C4
+ pmaddwd m4, m0 ; C4R4+C4R0 C4r4+C4r0
+ movq m5, [coeffs + 48] ; -C4 C4 -C4 C4
+ pmaddwd m0, m5 ; -C4R4+C4R0 -C4r4+C4r0
+ movq m5, [coeffs + 64] ; C6 C2 C6 C2
+ pmaddwd m5, m1 ; C6R6+C2R2 C6r6+C2r2
+ movq m6, [coeffs + 80] ; -C2 C6 -C2 C6
+ pmaddwd m1, m6 ; -C2R6+C6R2 -C2r6+C6r2
+ movq m7, [coeffs + 96] ; C3 C1 C3 C1
+ pmaddwd m7, m2 ; C3R3+C1R1 C3r3+C1r1
+ paddd m4, [coeffs + 16]
+ movq m6, m4 ; C4R4+C4R0 C4r4+C4r0
+ paddd m4, m5 ; A0 a0
+ psubd m6, m5 ; A3 a3
+ movq m5, [coeffs + 112] ; C7 C5 C7 C5
+ pmaddwd m5, m3 ; C7R7+C5R5 C7r7+C5r5
+ paddd m0, [coeffs + 16]
+ paddd m1, m0 ; A1 a1
+ paddd m0, m0
+ psubd m0, m1 ; A2 a2
+ pmaddwd m2, [coeffs + 128] ; -C7R3+C3R1 -C7r3+C3r1
+ paddd m7, m5 ; B0 b0
+ movq m5, [coeffs + 144] ; -C5 -C1 -C5 -C1
+ pmaddwd m5, m3 ; -C5R7-C1R5 -C5r7-C1r5
+ paddd m7, m4 ; A0+B0 a0+b0
+ paddd m4, m4 ; 2A0 2a0
+ psubd m4, m7 ; A0-B0 a0-b0
+ paddd m5, m2 ; B1 b1
+ psrad m7, %7
+ psrad m4, %7
+ movq m2, m1 ; A1 a1
+ paddd m1, m5 ; A1+B1 a1+b1
+ psubd m2, m5 ; A1-B1 a1-b1
+ psrad m1, %7
+ psrad m2, %7
+ packssdw m7, m1 ; A1+B1 a1+b1 A0+B0 a0+b0
+ pshufd m7, m7, 0xD8
+ packssdw m2, m4 ; A0-B0 a0-b0 A1-B1 a1-b1
+ pshufd m2, m2, 0xD8
+ movq [%5], m7
+ movq m1, [blockq + %3] ; R3 R1 r3 r1
+ movq m4, [coeffs + 160] ; -C1 C5 -C1 C5
+ movq [24 + %5], m2
+ pmaddwd m4, m1 ; -C1R3+C5R1 -C1r3+C5r1
+ movq m7, [coeffs + 176] ; C3 C7 C3 C7
+ pmaddwd m1, [coeffs + 192] ; -C5R3+C7R1 -C5r3+C7r1
+ pmaddwd m7, m3 ; C3R7+C7R5 C3r7+C7r5
+ movq m2, m0 ; A2 a2
+ pmaddwd m3, [coeffs + 208] ; -C1R7+C3R5 -C1r7+C3r5
+ paddd m4, m7 ; B2 b2
+ paddd m2, m4 ; A2+B2 a2+b2
+ psubd m0, m4 ; a2-B2 a2-b2
+ psrad m2, %7
+ psrad m0, %7
+ movq m4, m6 ; A3 a3
+ paddd m3, m1 ; B3 b3
+ paddd m6, m3 ; A3+B3 a3+b3
+ psubd m4, m3 ; a3-B3 a3-b3
+ psrad m6, %7
+ packssdw m2, m6 ; A3+B3 a3+b3 A2+B2 a2+b2
+ pshufd m2, m2, 0xD8
+ movq [8 + %5], m2
+ psrad m4, %7
+ packssdw m4, m0 ; A2-B2 a2-b2 A3-B3 a3-b3
+ pshufd m4, m4, 0xD8
+ movq [16 + %5], m4
jmp %%2
%%1:
- pslld mm0, 16
- paddd mm0, [d40000]
- psrad mm0, 13
- packssdw mm0, mm0
- movq [%5], mm0
- movq [8 + %5], mm0
- movq [16 + %5], mm0
- movq [24 + %5], mm0
+ pslld m0, 16
+ ; d40000 is only eight bytes long, so this will clobber
+ ; the upper half of m0 with wm1010. It doesn't matter due to pshufd below.
+ paddd m0, [d40000]
+ psrad m0, 13
+ packssdw m0, m0
+ pshufd m0, m0, 0x0
+ mova [%5], m0
+ mova [16 + %5], m0
%%2:
%endmacro
%macro Z_COND_IDCT 8
- movq mm0, [blockq + %1] ; R4 R0 r4 r0
- movq mm1, [blockq + %2] ; R6 R2 r6 r2
- movq mm2, [blockq + %3] ; R3 R1 r3 r1
- movq mm3, [blockq + %4] ; R7 R5 r7 r5
- movq mm4, mm0
- por mm4, mm1
- por mm4, mm2
- por mm4, mm3
- packssdw mm4, mm4
- movd t0d, mm4
+ movq m0, [blockq + %1] ; R4 R0 r4 r0
+ movq m1, [blockq + %2] ; R6 R2 r6 r2
+ movq m2, [blockq + %3] ; R3 R1 r3 r1
+ movq m3, [blockq + %4] ; R7 R5 r7 r5
+ movq m4, m0
+ por m4, m1
+ por m4, m2
+ por m4, m3
+ packssdw m4, m4
+ movd t0d, m4
or t0d, t0d
jz %8
- movq mm4, [coeffs + 16] ; C4 C4 C4 C4
- pmaddwd mm4, mm0 ; C4R4+C4R0 C4r4+C4r0
- movq mm5, [coeffs + 24] ; -C4 C4 -C4 C4
- pmaddwd mm0, mm5 ; -C4R4+C4R0 -C4r4+C4r0
- movq mm5, [coeffs + 32] ; C6 C2 C6 C2
- pmaddwd mm5, mm1 ; C6R6+C2R2 C6r6+C2r2
- movq mm6, [coeffs + 40] ; -C2 C6 -C2 C6
- pmaddwd mm1, mm6 ; -C2R6+C6R2 -C2r6+C6r2
- movq mm7, [coeffs + 48] ; C3 C1 C3 C1
- pmaddwd mm7, mm2 ; C3R3+C1R1 C3r3+C1r1
- paddd mm4, [coeffs]
- movq mm6, mm4 ; C4R4+C4R0 C4r4+C4r0
- paddd mm4, mm5 ; A0 a0
- psubd mm6, mm5 ; A3 a3
- movq mm5, [coeffs + 56] ; C7 C5 C7 C5
- pmaddwd mm5, mm3 ; C7R7+C5R5 C7r7+C5r5
- paddd mm0, [coeffs]
- paddd mm1, mm0 ; A1 a1
- paddd mm0, mm0
- psubd mm0, mm1 ; A2 a2
- pmaddwd mm2, [coeffs + 64] ; -C7R3+C3R1 -C7r3+C3r1
- paddd mm7, mm5 ; B0 b0
- movq mm5, [coeffs + 72] ; -C5 -C1 -C5 -C1
- pmaddwd mm5, mm3 ; -C5R7-C1R5 -C5r7-C1r5
- paddd mm7, mm4 ; A0+B0 a0+b0
- paddd mm4, mm4 ; 2A0 2a0
- psubd mm4, mm7 ; A0-B0 a0-b0
- paddd mm5, mm2 ; B1 b1
- psrad mm7, %7
- psrad mm4, %7
- movq mm2, mm1 ; A1 a1
- paddd mm1, mm5 ; A1+B1 a1+b1
- psubd mm2, mm5 ; A1-B1 a1-b1
- psrad mm1, %7
- psrad mm2, %7
- packssdw mm7, mm1 ; A1+B1 a1+b1 A0+B0 a0+b0
- packssdw mm2, mm4 ; A0-B0 a0-b0 A1-B1 a1-b1
- movq [%5], mm7
- movq mm1, [blockq + %3] ; R3 R1 r3 r1
- movq mm4, [coeffs + 80] ; -C1 C5 -C1 C5
- movq [24 + %5], mm2
- pmaddwd mm4, mm1 ; -C1R3+C5R1 -C1r3+C5r1
- movq mm7, [coeffs + 88] ; C3 C7 C3 C7
- pmaddwd mm1, [coeffs + 96] ; -C5R3+C7R1 -C5r3+C7r1
- pmaddwd mm7, mm3 ; C3R7+C7R5 C3r7+C7r5
- movq mm2, mm0 ; A2 a2
- pmaddwd mm3, [coeffs + 104] ; -C1R7+C3R5 -C1r7+C3r5
- paddd mm4, mm7 ; B2 b2
- paddd mm2, mm4 ; A2+B2 a2+b2
- psubd mm0, mm4 ; a2-B2 a2-b2
- psrad mm2, %7
- psrad mm0, %7
- movq mm4, mm6 ; A3 a3
- paddd mm3, mm1 ; B3 b3
- paddd mm6, mm3 ; A3+B3 a3+b3
- psubd mm4, mm3 ; a3-B3 a3-b3
- psrad mm6, %7
- packssdw mm2, mm6 ; A3+B3 a3+b3 A2+B2 a2+b2
- movq [8 + %5], mm2
- psrad mm4, %7
- packssdw mm4, mm0 ; A2-B2 a2-b2 A3-B3 a3-b3
- movq [16 + %5], mm4
+ movq m4, [coeffs + 32] ; C4 C4 C4 C4
+ pmaddwd m4, m0 ; C4R4+C4R0 C4r4+C4r0
+ movq m5, [coeffs + 48] ; -C4 C4 -C4 C4
+ pmaddwd m0, m5 ; -C4R4+C4R0 -C4r4+C4r0
+ movq m5, [coeffs + 64] ; C6 C2 C6 C2
+ pmaddwd m5, m1 ; C6R6+C2R2 C6r6+C2r2
+ movq m6, [coeffs + 80] ; -C2 C6 -C2 C6
+ pmaddwd m1, m6 ; -C2R6+C6R2 -C2r6+C6r2
+ movq m7, [coeffs + 96] ; C3 C1 C3 C1
+ pmaddwd m7, m2 ; C3R3+C1R1 C3r3+C1r1
+ paddd m4, [coeffs]
+ movq m6, m4 ; C4R4+C4R0 C4r4+C4r0
+ paddd m4, m5 ; A0 a0
+ psubd m6, m5 ; A3 a3
+ movq m5, [coeffs + 112] ; C7 C5 C7 C5
+ pmaddwd m5, m3 ; C7R7+C5R5 C7r7+C5r5
+ paddd m0, [coeffs]
+ paddd m1, m0 ; A1 a1
+ paddd m0, m0
+ psubd m0, m1 ; A2 a2
+ pmaddwd m2, [coeffs + 128] ; -C7R3+C3R1 -C7r3+C3r1
+ paddd m7, m5 ; B0 b0
+ movq m5, [coeffs + 144] ; -C5 -C1 -C5 -C1
+ pmaddwd m5, m3 ; -C5R7-C1R5 -C5r7-C1r5
+ paddd m7, m4 ; A0+B0 a0+b0
+ paddd m4, m4 ; 2A0 2a0
+ psubd m4, m7 ; A0-B0 a0-b0
+ paddd m5, m2 ; B1 b1
+ psrad m7, %7
+ psrad m4, %7
+ movq m2, m1 ; A1 a1
+ paddd m1, m5 ; A1+B1 a1+b1
+ psubd m2, m5 ; A1-B1 a1-b1
+ psrad m1, %7
+ psrad m2, %7
+ packssdw m7, m1 ; A1+B1 a1+b1 A0+B0 a0+b0
+ pshufd m7, m7, 0xD8
+ packssdw m2, m4 ; A0-B0 a0-b0 A1-B1 a1-b1
+ pshufd m2, m2, 0xD8
+ movq [%5], m7
+ movq m1, [blockq + %3] ; R3 R1 r3 r1
+ movq m4, [coeffs + 160] ; -C1 C5 -C1 C5
+ movq [24 + %5], m2
+ pmaddwd m4, m1 ; -C1R3+C5R1 -C1r3+C5r1
+ movq m7, [coeffs + 176] ; C3 C7 C3 C7
+ pmaddwd m1, [coeffs + 192] ; -C5R3+C7R1 -C5r3+C7r1
+ pmaddwd m7, m3 ; C3R7+C7R5 C3r7+C7r5
+ movq m2, m0 ; A2 a2
+ pmaddwd m3, [coeffs + 208] ; -C1R7+C3R5 -C1r7+C3r5
+ paddd m4, m7 ; B2 b2
+ paddd m2, m4 ; A2+B2 a2+b2
+ psubd m0, m4 ; a2-B2 a2-b2
+ psrad m2, %7
+ psrad m0, %7
+ movq m4, m6 ; A3 a3
+ paddd m3, m1 ; B3 b3
+ paddd m6, m3 ; A3+B3 a3+b3
+ psubd m4, m3 ; a3-B3 a3-b3
+ psrad m6, %7
+ packssdw m2, m6 ; A3+B3 a3+b3 A2+B2 a2+b2
+ pshufd m2, m2, 0xD8
+ movq [8 + %5], m2
+ psrad m4, %7
+ packssdw m4, m0 ; A2-B2 a2-b2 A3-B3 a3-b3
+ pshufd m4, m4, 0xD8
+ movq [16 + %5], m4
%endmacro
%macro IDCT1 6
- movq mm0, %1 ; R4 R0 r4 r0
- movq mm1, %2 ; R6 R2 r6 r2
- movq mm2, %3 ; R3 R1 r3 r1
- movq mm3, %4 ; R7 R5 r7 r5
- movq mm4, [coeffs + 16] ; C4 C4 C4 C4
- pmaddwd mm4, mm0 ; C4R4+C4R0 C4r4+C4r0
- movq mm5, [coeffs + 24] ; -C4 C4 -C4 C4
- pmaddwd mm0, mm5 ; -C4R4+C4R0 -C4r4+C4r0
- movq mm5, [coeffs + 32] ; C6 C2 C6 C2
- pmaddwd mm5, mm1 ; C6R6+C2R2 C6r6+C2r2
- movq mm6, [coeffs + 40] ; -C2 C6 -C2 C6
- pmaddwd mm1, mm6 ; -C2R6+C6R2 -C2r6+C6r2
- movq mm6, mm4 ; C4R4+C4R0 C4r4+C4r0
- movq mm7, [coeffs + 48] ; C3 C1 C3 C1
- pmaddwd mm7, mm2 ; C3R3+C1R1 C3r3+C1r1
- paddd mm4, mm5 ; A0 a0
- psubd mm6, mm5 ; A3 a3
- movq mm5, mm0 ; -C4R4+C4R0 -C4r4+C4r0
- paddd mm0, mm1 ; A1 a1
- psubd mm5, mm1 ; A2 a2
- movq mm1, [coeffs + 56] ; C7 C5 C7 C5
- pmaddwd mm1, mm3 ; C7R7+C5R5 C7r7+C5r5
- pmaddwd mm2, [coeffs + 64] ; -C7R3+C3R1 -C7r3+C3r1
- paddd mm7, mm1 ; B0 b0
- movq mm1, [coeffs + 72] ; -C5 -C1 -C5 -C1
- pmaddwd mm1, mm3 ; -C5R7-C1R5 -C5r7-C1r5
- paddd mm7, mm4 ; A0+B0 a0+b0
- paddd mm4, mm4 ; 2A0 2a0
- psubd mm4, mm7 ; A0-B0 a0-b0
- paddd mm1, mm2 ; B1 b1
- psrad mm7, %6
- psrad mm4, %6
- movq mm2, mm0 ; A1 a1
- paddd mm0, mm1 ; A1+B1 a1+b1
- psubd mm2, mm1 ; A1-B1 a1-b1
- psrad mm0, %6
- psrad mm2, %6
- packssdw mm7, mm7 ; A0+B0 a0+b0
- movd [%5], mm7
- packssdw mm0, mm0 ; A1+B1 a1+b1
- movd [16 + %5], mm0
- packssdw mm2, mm2 ; A1-B1 a1-b1
- movd [96 + %5], mm2
- packssdw mm4, mm4 ; A0-B0 a0-b0
- movd [112 + %5], mm4
- movq mm0, %3 ; R3 R1 r3 r1
- movq mm4, [coeffs + 80] ; -C1 C5 -C1 C5
- pmaddwd mm4, mm0 ; -C1R3+C5R1 -C1r3+C5r1
- movq mm7, [coeffs + 88] ; C3 C7 C3 C7
- pmaddwd mm0, [coeffs + 96] ; -C5R3+C7R1 -C5r3+C7r1
- pmaddwd mm7, mm3 ; C3R7+C7R5 C3r7+C7r5
- movq mm2, mm5 ; A2 a2
- pmaddwd mm3, [coeffs + 104] ; -C1R7+C3R5 -C1r7+C3r5
- paddd mm4, mm7 ; B2 b2
- paddd mm2, mm4 ; A2+B2 a2+b2
- psubd mm5, mm4 ; a2-B2 a2-b2
- psrad mm2, %6
- psrad mm5, %6
- movq mm4, mm6 ; A3 a3
- paddd mm3, mm0 ; B3 b3
- paddd mm6, mm3 ; A3+B3 a3+b3
- psubd mm4, mm3 ; a3-B3 a3-b3
- psrad mm6, %6
- psrad mm4, %6
- packssdw mm2, mm2 ; A2+B2 a2+b2
- packssdw mm6, mm6 ; A3+B3 a3+b3
- movd [32 + %5], mm2
- packssdw mm4, mm4 ; A3-B3 a3-b3
- packssdw mm5, mm5 ; A2-B2 a2-b2
- movd [48 + %5], mm6
- movd [64 + %5], mm4
- movd [80 + %5], mm5
+ mova m0, %1 ; R4 R0 r4 r0
+ mova m1, %2 ; R6 R2 r6 r2
+ mova m2, %3 ; R3 R1 r3 r1
+ mova m3, %4 ; R7 R5 r7 r5
+ mova m4, [coeffs + 32] ; C4 C4 C4 C4
+ pmaddwd m4, m0 ; C4R4+C4R0 C4r4+C4r0
+ mova m5, [coeffs + 48] ; -C4 C4 -C4 C4
+ pmaddwd m0, m5 ; -C4R4+C4R0 -C4r4+C4r0
+ mova m5, [coeffs + 64] ; C6 C2 C6 C2
+ pmaddwd m5, m1 ; C6R6+C2R2 C6r6+C2r2
+ mova m6, [coeffs + 80] ; -C2 C6 -C2 C6
+ pmaddwd m1, m6 ; -C2R6+C6R2 -C2r6+C6r2
+ mova m6, m4 ; C4R4+C4R0 C4r4+C4r0
+ mova m7, [coeffs + 96] ; C3 C1 C3 C1
+ pmaddwd m7, m2 ; C3R3+C1R1 C3r3+C1r1
+ paddd m4, m5 ; A0 a0
+ psubd m6, m5 ; A3 a3
+ mova m5, m0 ; -C4R4+C4R0 -C4r4+C4r0
+ paddd m0, m1 ; A1 a1
+ psubd m5, m1 ; A2 a2
+ mova m1, [coeffs + 112] ; C7 C5 C7 C5
+ pmaddwd m1, m3 ; C7R7+C5R5 C7r7+C5r5
+ pmaddwd m2, [coeffs + 128] ; -C7R3+C3R1 -C7r3+C3r1
+ paddd m7, m1 ; B0 b0
+ mova m1, [coeffs + 144] ; -C5 -C1 -C5 -C1
+ pmaddwd m1, m3 ; -C5R7-C1R5 -C5r7-C1r5
+ paddd m7, m4 ; A0+B0 a0+b0
+ paddd m4, m4 ; 2A0 2a0
+ psubd m4, m7 ; A0-B0 a0-b0
+ paddd m1, m2 ; B1 b1
+ psrad m7, %6
+ psrad m4, %6
+ mova m2, m0 ; A1 a1
+ paddd m0, m1 ; A1+B1 a1+b1
+ psubd m2, m1 ; A1-B1 a1-b1
+ psrad m0, %6
+ psrad m2, %6
+ packssdw m7, m7 ; A0+B0 a0+b0
+ movq [%5], m7
+ packssdw m0, m0 ; A1+B1 a1+b1
+ movq [16 + %5], m0
+ packssdw m2, m2 ; A1-B1 a1-b1
+ movq [96 + %5], m2
+ packssdw m4, m4 ; A0-B0 a0-b0
+ movq [112 + %5], m4
+ mova m0, %3 ; R3 R1 r3 r1
+ mova m4, [coeffs + 160] ; -C1 C5 -C1 C5
+ pmaddwd m4, m0 ; -C1R3+C5R1 -C1r3+C5r1
+ mova m7, [coeffs + 176] ; C3 C7 C3 C7
+ pmaddwd m0, [coeffs + 192] ; -C5R3+C7R1 -C5r3+C7r1
+ pmaddwd m7, m3 ; C3R7+C7R5 C3r7+C7r5
+ mova m2, m5 ; A2 a2
+ pmaddwd m3, [coeffs + 208] ; -C1R7+C3R5 -C1r7+C3r5
+ paddd m4, m7 ; B2 b2
+ paddd m2, m4 ; A2+B2 a2+b2
+ psubd m5, m4 ; a2-B2 a2-b2
+ psrad m2, %6
+ psrad m5, %6
+ mova m4, m6 ; A3 a3
+ paddd m3, m0 ; B3 b3
+ paddd m6, m3 ; A3+B3 a3+b3
+ psubd m4, m3 ; a3-B3 a3-b3
+ psrad m6, %6
+ psrad m4, %6
+ packssdw m2, m2 ; A2+B2 a2+b2
+ packssdw m6, m6 ; A3+B3 a3+b3
+ movq [32 + %5], m2
+ packssdw m4, m4 ; A3-B3 a3-b3
+ packssdw m5, m5 ; A2-B2 a2-b2
+ movq [48 + %5], m6
+ movq [64 + %5], m4
+ movq [80 + %5], m5
%endmacro
%macro IDCT2 6
- movq mm0, %1 ; R4 R0 r4 r0
- movq mm1, %2 ; R6 R2 r6 r2
- movq mm3, %4 ; R7 R5 r7 r5
- movq mm4, [coeffs + 16] ; C4 C4 C4 C4
- pmaddwd mm4, mm0 ; C4R4+C4R0 C4r4+C4r0
- movq mm5, [coeffs + 24] ; -C4 C4 -C4 C4
- pmaddwd mm0, mm5 ; -C4R4+C4R0 -C4r4+C4r0
- movq mm5, [coeffs + 32] ; C6 C2 C6 C2
- pmaddwd mm5, mm1 ; C6R6+C2R2 C6r6+C2r2
- movq mm6, [coeffs + 40] ; -C2 C6 -C2 C6
- pmaddwd mm1, mm6 ; -C2R6+C6R2 -C2r6+C6r2
- movq mm6, mm4 ; C4R4+C4R0 C4r4+C4r0
- paddd mm4, mm5 ; A0 a0
- psubd mm6, mm5 ; A3 a3
- movq mm5, mm0 ; -C4R4+C4R0 -C4r4+C4r0
- paddd mm0, mm1 ; A1 a1
- psubd mm5, mm1 ; A2 a2
- movq mm1, [coeffs + 56] ; C7 C5 C7 C5
- pmaddwd mm1, mm3 ; C7R7+C5R5 C7r7+C5r5
- movq mm7, [coeffs + 72] ; -C5 -C1 -C5 -C1
- pmaddwd mm7, mm3 ; -C5R7-C1R5 -C5r7-C1r5
- paddd mm1, mm4 ; A0+B0 a0+b0
- paddd mm4, mm4 ; 2A0 2a0
- psubd mm4, mm1 ; A0-B0 a0-b0
- psrad mm1, %6
- psrad mm4, %6
- movq mm2, mm0 ; A1 a1
- paddd mm0, mm7 ; A1+B1 a1+b1
- psubd mm2, mm7 ; A1-B1 a1-b1
- psrad mm0, %6
- psrad mm2, %6
- packssdw mm1, mm1 ; A0+B0 a0+b0
- movd [%5], mm1
- packssdw mm0, mm0 ; A1+B1 a1+b1
- movd [16 + %5], mm0
- packssdw mm2, mm2 ; A1-B1 a1-b1
- movd [96 + %5], mm2
- packssdw mm4, mm4 ; A0-B0 a0-b0
- movd [112 + %5], mm4
- movq mm1, [coeffs + 88] ; C3 C7 C3 C7
- pmaddwd mm1, mm3 ; C3R7+C7R5 C3r7+C7r5
- movq mm2, mm5 ; A2 a2
- pmaddwd mm3, [coeffs + 104] ; -C1R7+C3R5 -C1r7+C3r5
- paddd mm2, mm1 ; A2+B2 a2+b2
- psubd mm5, mm1 ; a2-B2 a2-b2
- psrad mm2, %6
- psrad mm5, %6
- movq mm1, mm6 ; A3 a3
- paddd mm6, mm3 ; A3+B3 a3+b3
- psubd mm1, mm3 ; a3-B3 a3-b3
- psrad mm6, %6
- psrad mm1, %6
- packssdw mm2, mm2 ; A2+B2 a2+b2
- packssdw mm6, mm6 ; A3+B3 a3+b3
- movd [32 + %5], mm2
- packssdw mm1, mm1 ; A3-B3 a3-b3
- packssdw mm5, mm5 ; A2-B2 a2-b2
- movd [48 + %5], mm6
- movd [64 + %5], mm1
- movd [80 + %5], mm5
+ mova m0, %1 ; R4 R0 r4 r0
+ mova m1, %2 ; R6 R2 r6 r2
+ mova m3, %4 ; R7 R5 r7 r5
+ mova m4, [coeffs + 32] ; C4 C4 C4 C4
+ pmaddwd m4, m0 ; C4R4+C4R0 C4r4+C4r0
+ mova m5, [coeffs + 48] ; -C4 C4 -C4 C4
+ pmaddwd m0, m5 ; -C4R4+C4R0 -C4r4+C4r0
+ mova m5, [coeffs + 64] ; C6 C2 C6 C2
+ pmaddwd m5, m1 ; C6R6+C2R2 C6r6+C2r2
+ mova m6, [coeffs + 80] ; -C2 C6 -C2 C6
+ pmaddwd m1, m6 ; -C2R6+C6R2 -C2r6+C6r2
+ mova m6, m4 ; C4R4+C4R0 C4r4+C4r0
+ paddd m4, m5 ; A0 a0
+ psubd m6, m5 ; A3 a3
+ mova m5, m0 ; -C4R4+C4R0 -C4r4+C4r0
+ paddd m0, m1 ; A1 a1
+ psubd m5, m1 ; A2 a2
+ mova m1, [coeffs + 112] ; C7 C5 C7 C5
+ pmaddwd m1, m3 ; C7R7+C5R5 C7r7+C5r5
+ mova m7, [coeffs + 144] ; -C5 -C1 -C5 -C1
+ pmaddwd m7, m3 ; -C5R7-C1R5 -C5r7-C1r5
+ paddd m1, m4 ; A0+B0 a0+b0
+ paddd m4, m4 ; 2A0 2a0
+ psubd m4, m1 ; A0-B0 a0-b0
+ psrad m1, %6
+ psrad m4, %6
+ mova m2, m0 ; A1 a1
+ paddd m0, m7 ; A1+B1 a1+b1
+ psubd m2, m7 ; A1-B1 a1-b1
+ psrad m0, %6
+ psrad m2, %6
+ packssdw m1, m1 ; A0+B0 a0+b0
+ movq [%5], m1
+ packssdw m0, m0 ; A1+B1 a1+b1
+ movq [16 + %5], m0
+ packssdw m2, m2 ; A1-B1 a1-b1
+ movq [96 + %5], m2
+ packssdw m4, m4 ; A0-B0 a0-b0
+ movq [112 + %5], m4
+ mova m1, [coeffs + 176] ; C3 C7 C3 C7
+ pmaddwd m1, m3 ; C3R7+C7R5 C3r7+C7r5
+ mova m2, m5 ; A2 a2
+ pmaddwd m3, [coeffs + 208] ; -C1R7+C3R5 -C1r7+C3r5
+ paddd m2, m1 ; A2+B2 a2+b2
+ psubd m5, m1 ; a2-B2 a2-b2
+ psrad m2, %6
+ psrad m5, %6
+ mova m1, m6 ; A3 a3
+ paddd m6, m3 ; A3+B3 a3+b3
+ psubd m1, m3 ; a3-B3 a3-b3
+ psrad m6, %6
+ psrad m1, %6
+ packssdw m2, m2 ; A2+B2 a2+b2
+ packssdw m6, m6 ; A3+B3 a3+b3
+ movq [32 + %5], m2
+ packssdw m1, m1 ; A3-B3 a3-b3
+ packssdw m5, m5 ; A2-B2 a2-b2
+ movq [48 + %5], m6
+ movq [64 + %5], m1
+ movq [80 + %5], m5
%endmacro
%macro IDCT3 6
- movq mm0, %1 ; R4 R0 r4 r0
- movq mm3, %4 ; R7 R5 r7 r5
- movq mm4, [coeffs + 16] ; C4 C4 C4 C4
- pmaddwd mm4, mm0 ; C4R4+C4R0 C4r4+C4r0
- movq mm5, [coeffs + 24] ; -C4 C4 -C4 C4
- pmaddwd mm0, mm5 ; -C4R4+C4R0 -C4r4+C4r0
- movq mm6, mm4 ; C4R4+C4R0 C4r4+C4r0
- movq mm5, mm0 ; -C4R4+C4R0 -C4r4+C4r0
- movq mm1, [coeffs + 56] ; C7 C5 C7 C5
- pmaddwd mm1, mm3 ; C7R7+C5R5 C7r7+C5r5
- movq mm7, [coeffs + 72] ; -C5 -C1 -C5 -C1
- pmaddwd mm7, mm3 ; -C5R7-C1R5 -C5r7-C1r5
- paddd mm1, mm4 ; A0+B0 a0+b0
- paddd mm4, mm4 ; 2A0 2a0
- psubd mm4, mm1 ; A0-B0 a0-b0
- psrad mm1, %6
- psrad mm4, %6
- movq mm2, mm0 ; A1 a1
- paddd mm0, mm7 ; A1+B1 a1+b1
- psubd mm2, mm7 ; A1-B1 a1-b1
- psrad mm0, %6
- psrad mm2, %6
- packssdw mm1, mm1 ; A0+B0 a0+b0
- movd [%5], mm1
- packssdw mm0, mm0 ; A1+B1 a1+b1
- movd [16 + %5], mm0
- packssdw mm2, mm2 ; A1-B1 a1-b1
- movd [96 + %5], mm2
- packssdw mm4, mm4 ; A0-B0 a0-b0
- movd [112 + %5], mm4
- movq mm1, [coeffs + 88] ; C3 C7 C3 C7
- pmaddwd mm1, mm3 ; C3R7+C7R5 C3r7+C7r5
- movq mm2, mm5 ; A2 a2
- pmaddwd mm3, [coeffs + 104] ; -C1R7+C3R5 -C1r7+C3r5
- paddd mm2, mm1 ; A2+B2 a2+b2
- psubd mm5, mm1 ; a2-B2 a2-b2
- psrad mm2, %6
- psrad mm5, %6
- movq mm1, mm6 ; A3 a3
- paddd mm6, mm3 ; A3+B3 a3+b3
- psubd mm1, mm3 ; a3-B3 a3-b3
- psrad mm6, %6
- psrad mm1, %6
- packssdw mm2, mm2 ; A2+B2 a2+b2
- packssdw mm6, mm6 ; A3+B3 a3+b3
- movd [32 + %5], mm2
- packssdw mm1, mm1 ; A3-B3 a3-b3
- packssdw mm5, mm5 ; A2-B2 a2-b2
- movd [48 + %5], mm6
- movd [64 + %5], mm1
- movd [80 + %5], mm5
+ mova m0, %1 ; R4 R0 r4 r0
+ mova m3, %4 ; R7 R5 r7 r5
+ mova m4, [coeffs + 32] ; C4 C4 C4 C4
+ pmaddwd m4, m0 ; C4R4+C4R0 C4r4+C4r0
+ mova m5, [coeffs + 48] ; -C4 C4 -C4 C4
+ pmaddwd m0, m5 ; -C4R4+C4R0 -C4r4+C4r0
+ mova m6, m4 ; C4R4+C4R0 C4r4+C4r0
+ mova m5, m0 ; -C4R4+C4R0 -C4r4+C4r0
+ mova m1, [coeffs + 112] ; C7 C5 C7 C5
+ pmaddwd m1, m3 ; C7R7+C5R5 C7r7+C5r5
+ mova m7, [coeffs + 144] ; -C5 -C1 -C5 -C1
+ pmaddwd m7, m3 ; -C5R7-C1R5 -C5r7-C1r5
+ paddd m1, m4 ; A0+B0 a0+b0
+ paddd m4, m4 ; 2A0 2a0
+ psubd m4, m1 ; A0-B0 a0-b0
+ psrad m1, %6
+ psrad m4, %6
+ mova m2, m0 ; A1 a1
+ paddd m0, m7 ; A1+B1 a1+b1
+ psubd m2, m7 ; A1-B1 a1-b1
+ psrad m0, %6
+ psrad m2, %6
+ packssdw m1, m1 ; A0+B0 a0+b0
+ movq [%5], m1
+ packssdw m0, m0 ; A1+B1 a1+b1
+ movq [16 + %5], m0
+ packssdw m2, m2 ; A1-B1 a1-b1
+ movq [96 + %5], m2
+ packssdw m4, m4 ; A0-B0 a0-b0
+ movq [112 + %5], m4
+ mova m1, [coeffs + 176] ; C3 C7 C3 C7
+ pmaddwd m1, m3 ; C3R7+C7R5 C3r7+C7r5
+ mova m2, m5 ; A2 a2
+ pmaddwd m3, [coeffs + 208] ; -C1R7+C3R5 -C1r7+C3r5
+ paddd m2, m1 ; A2+B2 a2+b2
+ psubd m5, m1 ; a2-B2 a2-b2
+ psrad m2, %6
+ psrad m5, %6
+ mova m1, m6 ; A3 a3
+ paddd m6, m3 ; A3+B3 a3+b3
+ psubd m1, m3 ; a3-B3 a3-b3
+ psrad m6, %6
+ psrad m1, %6
+ packssdw m2, m2 ; A2+B2 a2+b2
+ packssdw m6, m6 ; A3+B3 a3+b3
+ movq [32 + %5], m2
+ packssdw m1, m1 ; A3-B3 a3-b3
+ packssdw m5, m5 ; A2-B2 a2-b2
+ movq [48 + %5], m6
+ movq [64 + %5], m1
+ movq [80 + %5], m5
%endmacro
%macro IDCT4 6
- movq mm0, %1 ; R4 R0 r4 r0
- movq mm2, %3 ; R3 R1 r3 r1
- movq mm3, %4 ; R7 R5 r7 r5
- movq mm4, [coeffs + 16] ; C4 C4 C4 C4
- pmaddwd mm4, mm0 ; C4R4+C4R0 C4r4+C4r0
- movq mm5, [coeffs + 24] ; -C4 C4 -C4 C4
- pmaddwd mm0, mm5 ; -C4R4+C4R0 -C4r4+C4r0
- movq mm6, mm4 ; C4R4+C4R0 C4r4+C4r0
- movq mm7, [coeffs + 48] ; C3 C1 C3 C1
- pmaddwd mm7, mm2 ; C3R3+C1R1 C3r3+C1r1
- movq mm5, mm0 ; -C4R4+C4R0 -C4r4+C4r0
- movq mm1, [coeffs + 56] ; C7 C5 C7 C5
- pmaddwd mm1, mm3 ; C7R7+C5R5 C7r7+C5r5
- pmaddwd mm2, [coeffs + 64] ; -C7R3+C3R1 -C7r3+C3r1
- paddd mm7, mm1 ; B0 b0
- movq mm1, [coeffs + 72] ; -C5 -C1 -C5 -C1
- pmaddwd mm1, mm3 ; -C5R7-C1R5 -C5r7-C1r5
- paddd mm7, mm4 ; A0+B0 a0+b0
- paddd mm4, mm4 ; 2A0 2a0
- psubd mm4, mm7 ; A0-B0 a0-b0
- paddd mm1, mm2 ; B1 b1
- psrad mm7, %6
- psrad mm4, %6
- movq mm2, mm0 ; A1 a1
- paddd mm0, mm1 ; A1+B1 a1+b1
- psubd mm2, mm1 ; A1-B1 a1-b1
- psrad mm0, %6
- psrad mm2, %6
- packssdw mm7, mm7 ; A0+B0 a0+b0
- movd [%5], mm7
- packssdw mm0, mm0 ; A1+B1 a1+b1
- movd [16 + %5], mm0
- packssdw mm2, mm2 ; A1-B1 a1-b1
- movd [96 + %5], mm2
- packssdw mm4, mm4 ; A0-B0 a0-b0
- movd [112 + %5], mm4
- movq mm0, %3 ; R3 R1 r3 r1
- movq mm4, [coeffs + 80] ; -C1 C5 -C1 C5
- pmaddwd mm4, mm0 ; -C1R3+C5R1 -C1r3+C5r1
- movq mm7, [coeffs + 88] ; C3 C7 C3 C7
- pmaddwd mm0, [coeffs + 96] ; -C5R3+C7R1 -C5r3+C7r1
- pmaddwd mm7, mm3 ; C3R7+C7R5 C3r7+C7r5
- movq mm2, mm5 ; A2 a2
- pmaddwd mm3, [coeffs + 104] ; -C1R7+C3R5 -C1r7+C3r5
- paddd mm4, mm7 ; B2 b2
- paddd mm2, mm4 ; A2+B2 a2+b2
- psubd mm5, mm4 ; a2-B2 a2-b2
- psrad mm2, %6
- psrad mm5, %6
- movq mm4, mm6 ; A3 a3
- paddd mm3, mm0 ; B3 b3
- paddd mm6, mm3 ; A3+B3 a3+b3
- psubd mm4, mm3 ; a3-B3 a3-b3
- psrad mm6, %6
- psrad mm4, %6
- packssdw mm2, mm2 ; A2+B2 a2+b2
- packssdw mm6, mm6 ; A3+B3 a3+b3
- movd [32 + %5], mm2
- packssdw mm4, mm4 ; A3-B3 a3-b3
- packssdw mm5, mm5 ; A2-B2 a2-b2
- movd [48 + %5], mm6
- movd [64 + %5], mm4
- movd [80 + %5], mm5
+ mova m0, %1 ; R4 R0 r4 r0
+ mova m2, %3 ; R3 R1 r3 r1
+ mova m3, %4 ; R7 R5 r7 r5
+ mova m4, [coeffs + 32] ; C4 C4 C4 C4
+ pmaddwd m4, m0 ; C4R4+C4R0 C4r4+C4r0
+ mova m5, [coeffs + 48] ; -C4 C4 -C4 C4
+ pmaddwd m0, m5 ; -C4R4+C4R0 -C4r4+C4r0
+ mova m6, m4 ; C4R4+C4R0 C4r4+C4r0
+ mova m7, [coeffs + 96] ; C3 C1 C3 C1
+ pmaddwd m7, m2 ; C3R3+C1R1 C3r3+C1r1
+ mova m5, m0 ; -C4R4+C4R0 -C4r4+C4r0
+ mova m1, [coeffs + 112] ; C7 C5 C7 C5
+ pmaddwd m1, m3 ; C7R7+C5R5 C7r7+C5r5
+ pmaddwd m2, [coeffs + 128] ; -C7R3+C3R1 -C7r3+C3r1
+ paddd m7, m1 ; B0 b0
+ mova m1, [coeffs + 144] ; -C5 -C1 -C5 -C1
+ pmaddwd m1, m3 ; -C5R7-C1R5 -C5r7-C1r5
+ paddd m7, m4 ; A0+B0 a0+b0
+ paddd m4, m4 ; 2A0 2a0
+ psubd m4, m7 ; A0-B0 a0-b0
+ paddd m1, m2 ; B1 b1
+ psrad m7, %6
+ psrad m4, %6
+ mova m2, m0 ; A1 a1
+ paddd m0, m1 ; A1+B1 a1+b1
+ psubd m2, m1 ; A1-B1 a1-b1
+ psrad m0, %6
+ psrad m2, %6
+ packssdw m7, m7 ; A0+B0 a0+b0
+ movq [%5], m7
+ packssdw m0, m0 ; A1+B1 a1+b1
+ movq [16 + %5], m0
+ packssdw m2, m2 ; A1-B1 a1-b1
+ movq [96 + %5], m2
+ packssdw m4, m4 ; A0-B0 a0-b0
+ movq [112 + %5], m4
+ mova m0, %3 ; R3 R1 r3 r1
+ mova m4, [coeffs + 160] ; -C1 C5 -C1 C5
+ pmaddwd m4, m0 ; -C1R3+C5R1 -C1r3+C5r1
+ mova m7, [coeffs + 176] ; C3 C7 C3 C7
+ pmaddwd m0, [coeffs + 192] ; -C5R3+C7R1 -C5r3+C7r1
+ pmaddwd m7, m3 ; C3R7+C7R5 C3r7+C7r5
+ mova m2, m5 ; A2 a2
+ pmaddwd m3, [coeffs + 208] ; -C1R7+C3R5 -C1r7+C3r5
+ paddd m4, m7 ; B2 b2
+ paddd m2, m4 ; A2+B2 a2+b2
+ psubd m5, m4 ; a2-B2 a2-b2
+ psrad m2, %6
+ psrad m5, %6
+ mova m4, m6 ; A3 a3
+ paddd m3, m0 ; B3 b3
+ paddd m6, m3 ; A3+B3 a3+b3
+ psubd m4, m3 ; a3-B3 a3-b3
+ psrad m6, %6
+ psrad m4, %6
+ packssdw m2, m2 ; A2+B2 a2+b2
+ packssdw m6, m6 ; A3+B3 a3+b3
+ movq [32 + %5], m2
+ packssdw m4, m4 ; A3-B3 a3-b3
+ packssdw m5, m5 ; A2-B2 a2-b2
+ movq [48 + %5], m6
+ movq [64 + %5], m4
+ movq [80 + %5], m5
%endmacro
%macro IDCT5 6
- movq mm0, %1 ; R4 R0 r4 r0
- movq mm2, %3 ; R3 R1 r3 r1
- movq mm4, [coeffs + 16] ; C4 C4 C4 C4
- pmaddwd mm4, mm0 ; C4R4+C4R0 C4r4+C4r0
- movq mm5, [coeffs + 24] ; -C4 C4 -C4 C4
- pmaddwd mm0, mm5 ; -C4R4+C4R0 -C4r4+C4r0
- movq mm6, mm4 ; C4R4+C4R0 C4r4+C4r0
- movq mm7, [coeffs + 48] ; C3 C1 C3 C1
- pmaddwd mm7, mm2 ; C3R3+C1R1 C3r3+C1r1
- movq mm5, mm0 ; -C4R4+C4R0 -C4r4+C4r0
- movq mm3, [coeffs + 64]
- pmaddwd mm3, mm2 ; -C7R3+C3R1 -C7r3+C3r1
- paddd mm7, mm4 ; A0+B0 a0+b0
- paddd mm4, mm4 ; 2A0 2a0
- psubd mm4, mm7 ; A0-B0 a0-b0
- psrad mm7, %6
- psrad mm4, %6
- movq mm1, mm0 ; A1 a1
- paddd mm0, mm3 ; A1+B1 a1+b1
- psubd mm1, mm3 ; A1-B1 a1-b1
- psrad mm0, %6
- psrad mm1, %6
- packssdw mm7, mm7 ; A0+B0 a0+b0
- movd [%5], mm7
- packssdw mm0, mm0 ; A1+B1 a1+b1
- movd [16 + %5], mm0
- packssdw mm1, mm1 ; A1-B1 a1-b1
- movd [96 + %5], mm1
- packssdw mm4, mm4 ; A0-B0 a0-b0
- movd [112 + %5], mm4
- movq mm4, [coeffs + 80] ; -C1 C5 -C1 C5
- pmaddwd mm4, mm2 ; -C1R3+C5R1 -C1r3+C5r1
- pmaddwd mm2, [coeffs + 96] ; -C5R3+C7R1 -C5r3+C7r1
- movq mm1, mm5 ; A2 a2
- paddd mm1, mm4 ; A2+B2 a2+b2
- psubd mm5, mm4 ; a2-B2 a2-b2
- psrad mm1, %6
- psrad mm5, %6
- movq mm4, mm6 ; A3 a3
- paddd mm6, mm2 ; A3+B3 a3+b3
- psubd mm4, mm2 ; a3-B3 a3-b3
- psrad mm6, %6
- psrad mm4, %6
- packssdw mm1, mm1 ; A2+B2 a2+b2
- packssdw mm6, mm6 ; A3+B3 a3+b3
- movd [32 + %5], mm1
- packssdw mm4, mm4 ; A3-B3 a3-b3
- packssdw mm5, mm5 ; A2-B2 a2-b2
- movd [48 + %5], mm6
- movd [64 + %5], mm4
- movd [80 + %5], mm5
+ mova m0, %1 ; R4 R0 r4 r0
+ mova m2, %3 ; R3 R1 r3 r1
+ mova m4, [coeffs + 32] ; C4 C4 C4 C4
+ pmaddwd m4, m0 ; C4R4+C4R0 C4r4+C4r0
+ mova m5, [coeffs + 48] ; -C4 C4 -C4 C4
+ pmaddwd m0, m5 ; -C4R4+C4R0 -C4r4+C4r0
+ mova m6, m4 ; C4R4+C4R0 C4r4+C4r0
+ mova m7, [coeffs + 96] ; C3 C1 C3 C1
+ pmaddwd m7, m2 ; C3R3+C1R1 C3r3+C1r1
+ mova m5, m0 ; -C4R4+C4R0 -C4r4+C4r0
+ mova m3, [coeffs + 128]
+ pmaddwd m3, m2 ; -C7R3+C3R1 -C7r3+C3r1
+ paddd m7, m4 ; A0+B0 a0+b0
+ paddd m4, m4 ; 2A0 2a0
+ psubd m4, m7 ; A0-B0 a0-b0
+ psrad m7, %6
+ psrad m4, %6
+ mova m1, m0 ; A1 a1
+ paddd m0, m3 ; A1+B1 a1+b1
+ psubd m1, m3 ; A1-B1 a1-b1
+ psrad m0, %6
+ psrad m1, %6
+ packssdw m7, m7 ; A0+B0 a0+b0
+ movq [%5], m7
+ packssdw m0, m0 ; A1+B1 a1+b1
+ movq [16 + %5], m0
+ packssdw m1, m1 ; A1-B1 a1-b1
+ movq [96 + %5], m1
+ packssdw m4, m4 ; A0-B0 a0-b0
+ movq [112 + %5], m4
+ mova m4, [coeffs + 160] ; -C1 C5 -C1 C5
+ pmaddwd m4, m2 ; -C1R3+C5R1 -C1r3+C5r1
+ pmaddwd m2, [coeffs + 192] ; -C5R3+C7R1 -C5r3+C7r1
+ mova m1, m5 ; A2 a2
+ paddd m1, m4 ; A2+B2 a2+b2
+ psubd m5, m4 ; a2-B2 a2-b2
+ psrad m1, %6
+ psrad m5, %6
+ mova m4, m6 ; A3 a3
+ paddd m6, m2 ; A3+B3 a3+b3
+ psubd m4, m2 ; a3-B3 a3-b3
+ psrad m6, %6
+ psrad m4, %6
+ packssdw m1, m1 ; A2+B2 a2+b2
+ packssdw m6, m6 ; A3+B3 a3+b3
+ movq [32 + %5], m1
+ packssdw m4, m4 ; A3-B3 a3-b3
+ packssdw m5, m5 ; A2-B2 a2-b2
+ movq [48 + %5], m6
+ movq [64 + %5], m4
+ movq [80 + %5], m5
%endmacro
%macro IDCT6 6
- movq mm0, [%1] ; R4 R0 r4 r0
- movq mm1, [%2] ; R6 R2 r6 r2
- movq mm4, [coeffs + 16] ; C4 C4 C4 C4
- pmaddwd mm4, mm0 ; C4R4+C4R0 C4r4+C4r0
- movq mm5, [coeffs + 24] ; -C4 C4 -C4 C4
- pmaddwd mm0, mm5 ; -C4R4+C4R0 -C4r4+C4r0
- movq mm5, [coeffs + 32] ; C6 C2 C6 C2
- pmaddwd mm5, mm1 ; C6R6+C2R2 C6r6+C2r2
- movq mm6, [coeffs + 40] ; -C2 C6 -C2 C6
- pmaddwd mm1, mm6 ; -C2R6+C6R2 -C2r6+C6r2
- movq mm6, mm4 ; C4R4+C4R0 C4r4+C4r0
- paddd mm4, mm5 ; A0 a0
- psubd mm6, mm5 ; A3 a3
- movq mm5, mm0 ; -C4R4+C4R0 -C4r4+C4r0
- paddd mm0, mm1 ; A1 a1
- psubd mm5, mm1 ; A2 a2
- movq mm2, [8 + %1] ; R4 R0 r4 r0
- movq mm3, [8 + %2] ; R6 R2 r6 r2
- movq mm1, [coeffs + 16] ; C4 C4 C4 C4
- pmaddwd mm1, mm2 ; C4R4+C4R0 C4r4+C4r0
- movq mm7, [coeffs + 24] ; -C4 C4 -C4 C4
- pmaddwd mm2, mm7 ; -C4R4+C4R0 -C4r4+C4r0
- movq mm7, [coeffs + 32] ; C6 C2 C6 C2
- pmaddwd mm7, mm3 ; C6R6+C2R2 C6r6+C2r2
- pmaddwd mm3, [coeffs + 40] ; -C2R6+C6R2 -C2r6+C6r2
- paddd mm7, mm1 ; A0 a0
- paddd mm1, mm1 ; 2C0 2c0
- psubd mm1, mm7 ; A3 a3
- paddd mm3, mm2 ; A1 a1
- paddd mm2, mm2 ; 2C1 2c1
- psubd mm2, mm3 ; A2 a2
- psrad mm4, %6
- psrad mm7, %6
- psrad mm3, %6
- packssdw mm4, mm7 ; A0 a0
- movq [%5], mm4
- psrad mm0, %6
- packssdw mm0, mm3 ; A1 a1
- movq [16 + %5], mm0
- movq [96 + %5], mm0
- movq [112 + %5], mm4
- psrad mm5, %6
- psrad mm6, %6
- psrad mm2, %6
- packssdw mm5, mm2 ; A2-B2 a2-b2
- movq [32 + %5], mm5
- psrad mm1, %6
- packssdw mm6, mm1 ; A3+B3 a3+b3
- movq [48 + %5], mm6
- movq [64 + %5], mm6
- movq [80 + %5], mm5
+ movq m0, [%1] ; R4 R0 r4 r0
+ movhps m0, [%1 + 16]
+ movq m1, [%2] ; R6 R2 r6 r2
+ movhps m1, [%2 + 16]
+ mova m4, [coeffs + 32] ; C4 C4 C4 C4
+ pmaddwd m4, m0 ; C4R4+C4R0 C4r4+C4r0
+ mova m5, [coeffs + 48] ; -C4 C4 -C4 C4
+ pmaddwd m0, m5 ; -C4R4+C4R0 -C4r4+C4r0
+ mova m5, [coeffs + 64] ; C6 C2 C6 C2
+ pmaddwd m5, m1 ; C6R6+C2R2 C6r6+C2r2
+ mova m6, [coeffs + 80] ; -C2 C6 -C2 C6
+ pmaddwd m1, m6 ; -C2R6+C6R2 -C2r6+C6r2
+ mova m6, m4 ; C4R4+C4R0 C4r4+C4r0
+ paddd m4, m5 ; A0 a0
+ psubd m6, m5 ; A3 a3
+ mova m5, m0 ; -C4R4+C4R0 -C4r4+C4r0
+ paddd m0, m1 ; A1 a1
+ psubd m5, m1 ; A2 a2
+ movq m2, [%1 + 8] ; R4 R0 r4 r0
+ movhps m2, [%1 + 24]
+ movq m3, [%2 + 8] ; R6 R2 r6 r2
+ movhps m3, [%2 + 24]
+ mova m1, [coeffs + 32] ; C4 C4 C4 C4
+ pmaddwd m1, m2 ; C4R4+C4R0 C4r4+C4r0
+ mova m7, [coeffs + 48] ; -C4 C4 -C4 C4
+ pmaddwd m2, m7 ; -C4R4+C4R0 -C4r4+C4r0
+ mova m7, [coeffs + 64] ; C6 C2 C6 C2
+ pmaddwd m7, m3 ; C6R6+C2R2 C6r6+C2r2
+ pmaddwd m3, [coeffs + 80] ; -C2R6+C6R2 -C2r6+C6r2
+ paddd m7, m1 ; A0 a0
+ paddd m1, m1 ; 2C0 2c0
+ psubd m1, m7 ; A3 a3
+ paddd m3, m2 ; A1 a1
+ paddd m2, m2 ; 2C1 2c1
+ psubd m2, m3 ; A2 a2
+ psrad m4, %6
+ psrad m7, %6
+ psrad m3, %6
+ packssdw m4, m7 ; A0 a0
+ pshufd m4, m4, 0xD8
+ mova [%5], m4
+ psrad m0, %6
+ packssdw m0, m3 ; A1 a1
+ pshufd m0, m0, 0xD8
+ mova [16 + %5], m0
+ mova [96 + %5], m0
+ mova [112 + %5], m4
+ psrad m5, %6
+ psrad m6, %6
+ psrad m2, %6
+ packssdw m5, m2 ; A2-B2 a2-b2
+ pshufd m5, m5, 0xD8
+ mova [32 + %5], m5
+ psrad m1, %6
+ packssdw m6, m1 ; A3+B3 a3+b3
+ pshufd m6, m6, 0xD8
+ mova [48 + %5], m6
+ mova [64 + %5], m6
+ mova [80 + %5], m5
%endmacro
%macro IDCT7 6
- movq mm0, %1 ; R4 R0 r4 r0
- movq mm1, %2 ; R6 R2 r6 r2
- movq mm2, %3 ; R3 R1 r3 r1
- movq mm4, [coeffs + 16] ; C4 C4 C4 C4
- pmaddwd mm4, mm0 ; C4R4+C4R0 C4r4+C4r0
- movq mm5, [coeffs + 24] ; -C4 C4 -C4 C4
- pmaddwd mm0, mm5 ; -C4R4+C4R0 -C4r4+C4r0
- movq mm5, [coeffs + 32] ; C6 C2 C6 C2
- pmaddwd mm5, mm1 ; C6R6+C2R2 C6r6+C2r2
- movq mm6, [coeffs + 40] ; -C2 C6 -C2 C6
- pmaddwd mm1, mm6 ; -C2R6+C6R2 -C2r6+C6r2
- movq mm6, mm4 ; C4R4+C4R0 C4r4+C4r0
- movq mm7, [coeffs + 48] ; C3 C1 C3 C1
- pmaddwd mm7, mm2 ; C3R3+C1R1 C3r3+C1r1
- paddd mm4, mm5 ; A0 a0
- psubd mm6, mm5 ; A3 a3
- movq mm5, mm0 ; -C4R4+C4R0 -C4r4+C4r0
- paddd mm0, mm1 ; A1 a1
- psubd mm5, mm1 ; A2 a2
- movq mm1, [coeffs + 64]
- pmaddwd mm1, mm2 ; -C7R3+C3R1 -C7r3+C3r1
- paddd mm7, mm4 ; A0+B0 a0+b0
- paddd mm4, mm4 ; 2A0 2a0
- psubd mm4, mm7 ; A0-B0 a0-b0
- psrad mm7, %6
- psrad mm4, %6
- movq mm3, mm0 ; A1 a1
- paddd mm0, mm1 ; A1+B1 a1+b1
- psubd mm3, mm1 ; A1-B1 a1-b1
- psrad mm0, %6
- psrad mm3, %6
- packssdw mm7, mm7 ; A0+B0 a0+b0
- movd [%5], mm7
- packssdw mm0, mm0 ; A1+B1 a1+b1
- movd [16 + %5], mm0
- packssdw mm3, mm3 ; A1-B1 a1-b1
- movd [96 + %5], mm3
- packssdw mm4, mm4 ; A0-B0 a0-b0
- movd [112 + %5], mm4
- movq mm4, [coeffs + 80] ; -C1 C5 -C1 C5
- pmaddwd mm4, mm2 ; -C1R3+C5R1 -C1r3+C5r1
- pmaddwd mm2, [coeffs + 96] ; -C5R3+C7R1 -C5r3+C7r1
- movq mm3, mm5 ; A2 a2
- paddd mm3, mm4 ; A2+B2 a2+b2
- psubd mm5, mm4 ; a2-B2 a2-b2
- psrad mm3, %6
- psrad mm5, %6
- movq mm4, mm6 ; A3 a3
- paddd mm6, mm2 ; A3+B3 a3+b3
- psubd mm4, mm2 ; a3-B3 a3-b3
- psrad mm6, %6
- packssdw mm3, mm3 ; A2+B2 a2+b2
- movd [32 + %5], mm3
- psrad mm4, %6
- packssdw mm6, mm6 ; A3+B3 a3+b3
- movd [48 + %5], mm6
- packssdw mm4, mm4 ; A3-B3 a3-b3
- packssdw mm5, mm5 ; A2-B2 a2-b2
- movd [64 + %5], mm4
- movd [80 + %5], mm5
+ mova m0, %1 ; R4 R0 r4 r0
+ mova m1, %2 ; R6 R2 r6 r2
+ mova m2, %3 ; R3 R1 r3 r1
+ mova m4, [coeffs + 32] ; C4 C4 C4 C4
+ pmaddwd m4, m0 ; C4R4+C4R0 C4r4+C4r0
+ mova m5, [coeffs + 48] ; -C4 C4 -C4 C4
+ pmaddwd m0, m5 ; -C4R4+C4R0 -C4r4+C4r0
+ mova m5, [coeffs + 64] ; C6 C2 C6 C2
+ pmaddwd m5, m1 ; C6R6+C2R2 C6r6+C2r2
+ mova m6, [coeffs + 80] ; -C2 C6 -C2 C6
+ pmaddwd m1, m6 ; -C2R6+C6R2 -C2r6+C6r2
+ mova m6, m4 ; C4R4+C4R0 C4r4+C4r0
+ mova m7, [coeffs + 96] ; C3 C1 C3 C1
+ pmaddwd m7, m2 ; C3R3+C1R1 C3r3+C1r1
+ paddd m4, m5 ; A0 a0
+ psubd m6, m5 ; A3 a3
+ mova m5, m0 ; -C4R4+C4R0 -C4r4+C4r0
+ paddd m0, m1 ; A1 a1
+ psubd m5, m1 ; A2 a2
+ mova m1, [coeffs + 128]
+ pmaddwd m1, m2 ; -C7R3+C3R1 -C7r3+C3r1
+ paddd m7, m4 ; A0+B0 a0+b0
+ paddd m4, m4 ; 2A0 2a0
+ psubd m4, m7 ; A0-B0 a0-b0
+ psrad m7, %6
+ psrad m4, %6
+ mova m3, m0 ; A1 a1
+ paddd m0, m1 ; A1+B1 a1+b1
+ psubd m3, m1 ; A1-B1 a1-b1
+ psrad m0, %6
+ psrad m3, %6
+ packssdw m7, m7 ; A0+B0 a0+b0
+ movq [%5], m7
+ packssdw m0, m0 ; A1+B1 a1+b1
+ movq [16 + %5], m0
+ packssdw m3, m3 ; A1-B1 a1-b1
+ movq [96 + %5], m3
+ packssdw m4, m4 ; A0-B0 a0-b0
+ movq [112 + %5], m4
+ mova m4, [coeffs + 160] ; -C1 C5 -C1 C5
+ pmaddwd m4, m2 ; -C1R3+C5R1 -C1r3+C5r1
+ pmaddwd m2, [coeffs + 192] ; -C5R3+C7R1 -C5r3+C7r1
+ mova m3, m5 ; A2 a2
+ paddd m3, m4 ; A2+B2 a2+b2
+ psubd m5, m4 ; a2-B2 a2-b2
+ psrad m3, %6
+ psrad m5, %6
+ mova m4, m6 ; A3 a3
+ paddd m6, m2 ; A3+B3 a3+b3
+ psubd m4, m2 ; a3-B3 a3-b3
+ psrad m6, %6
+ packssdw m3, m3 ; A2+B2 a2+b2
+ movq [32 + %5], m3
+ psrad m4, %6
+ packssdw m6, m6 ; A3+B3 a3+b3
+ movq [48 + %5], m6
+ packssdw m4, m4 ; A3-B3 a3-b3
+ packssdw m5, m5 ; A2-B2 a2-b2
+ movq [64 + %5], m4
+ movq [80 + %5], m5
%endmacro
%macro IDCT8 6
- movq mm0, [%1] ; R4 R0 r4 r0
- movq mm4, [coeffs + 16] ; C4 C4 C4 C4
- pmaddwd mm4, mm0 ; C4R4+C4R0 C4r4+C4r0
- movq mm5, [coeffs + 24] ; -C4 C4 -C4 C4
- pmaddwd mm0, mm5 ; -C4R4+C4R0 -C4r4+C4r0
- psrad mm4, %6
- psrad mm0, %6
- movq mm2, [8 + %1] ; R4 R0 r4 r0
- movq mm1, [coeffs + 16] ; C4 C4 C4 C4
- pmaddwd mm1, mm2 ; C4R4+C4R0 C4r4+C4r0
- movq mm7, [coeffs + 24] ; -C4 C4 -C4 C4
- pmaddwd mm2, mm7 ; -C4R4+C4R0 -C4r4+C4r0
- movq mm7, [coeffs + 32] ; C6 C2 C6 C2
- psrad mm1, %6
- packssdw mm4, mm1 ; A0 a0
- movq [%5], mm4
- psrad mm2, %6
- packssdw mm0, mm2 ; A1 a1
- movq [16 + %5], mm0
- movq [96 + %5], mm0
- movq [112 + %5], mm4
- movq [32 + %5], mm0
- movq [48 + %5], mm4
- movq [64 + %5], mm4
- movq [80 + %5], mm0
+ movq m0, [%1] ; R4 R0 r4 r0
+ movhps m0, [%1 + 16]
+ mova m4, [coeffs + 32] ; C4 C4 C4 C4
+ pmaddwd m4, m0 ; C4R4+C4R0 C4r4+C4r0
+ mova m5, [coeffs + 48] ; -C4 C4 -C4 C4
+ pmaddwd m0, m5 ; -C4R4+C4R0 -C4r4+C4r0
+ psrad m4, %6
+ psrad m0, %6
+ movq m2, [%1 + 8] ; R4 R0 r4 r0
+ movhps m2, [%1 + 24]
+ mova m1, [coeffs + 32] ; C4 C4 C4 C4
+ pmaddwd m1, m2 ; C4R4+C4R0 C4r4+C4r0
+ mova m7, [coeffs + 48] ; -C4 C4 -C4 C4
+ pmaddwd m2, m7 ; -C4R4+C4R0 -C4r4+C4r0
+ mova m7, [coeffs + 64] ; C6 C2 C6 C2
+ psrad m1, %6
+ packssdw m4, m1 ; A0 a0
+ pshufd m4, m4, 0xD8
+ mova [%5], m4
+ psrad m2, %6
+ packssdw m0, m2 ; A1 a1
+ pshufd m0, m0, 0xD8
+ mova [16 + %5], m0
+ mova [96 + %5], m0
+ mova [112 + %5], m4
+ mova [32 + %5], m0
+ mova [48 + %5], m4
+ mova [64 + %5], m4
+ mova [80 + %5], m0
%endmacro
%macro IDCT 0
@@ -710,9 +735,7 @@ SECTION .text
Z_COND_IDCT 96, 104, 112, 120, rsp + 96, null, 11, %%1
IDCT1 [rsp + 0], [rsp + 64], [rsp + 32], [rsp + 96], blockq + 0, 20
- IDCT1 [rsp + 8], [rsp + 72], [rsp + 40], [rsp + 104], blockq + 4, 20
IDCT1 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq + 8, 20
- IDCT1 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20
jmp %%9
ALIGN 16
@@ -721,9 +744,7 @@ SECTION .text
Z_COND_IDCT 96, 104, 112, 120, rsp + 96, null, 11, %%5
IDCT2 [rsp + 0], [rsp + 64], [rsp + 32], [rsp + 96], blockq + 0, 20
- IDCT2 [rsp + 8], [rsp + 72], [rsp + 40], [rsp + 104], blockq + 4, 20
IDCT2 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq + 8, 20
- IDCT2 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20
jmp %%9
ALIGN 16
@@ -731,9 +752,7 @@ SECTION .text
Z_COND_IDCT 96, 104, 112, 120, rsp + 96, null, 11, %%7
IDCT3 [rsp + 0], [rsp + 64], [rsp + 32], [rsp + 96], blockq + 0, 20
- IDCT3 [rsp + 8], [rsp + 72], [rsp + 40], [rsp + 104], blockq + 4, 20
IDCT3 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq + 8, 20
- IDCT3 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20
jmp %%9
ALIGN 16
@@ -741,41 +760,33 @@ SECTION .text
Z_COND_IDCT 96, 104, 112, 120, rsp + 96, null, 11, %%3
IDCT4 [rsp + 0], [rsp + 64], [rsp + 32], [rsp + 96], blockq + 0, 20
- IDCT4 [rsp + 8], [rsp + 72], [rsp + 40], [rsp + 104], blockq + 4, 20
IDCT4 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq + 8, 20
- IDCT4 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20
jmp %%9
ALIGN 16
%%3:
IDCT5 [rsp + 0], [rsp + 64], [rsp + 32], [rsp + 96], blockq + 0, 20
- IDCT5 [rsp + 8], [rsp + 72], [rsp + 40], [rsp + 104], blockq + 4, 20
IDCT5 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq + 8, 20
- IDCT5 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20
jmp %%9
ALIGN 16
%%5:
IDCT6 rsp + 0, rsp + 64, rsp + 32, rsp + 96, blockq + 0, 20
- IDCT6 rsp + 16, rsp + 80, rsp + 48, rsp + 112, blockq + 8, 20
jmp %%9
ALIGN 16
%%1:
IDCT7 [rsp + 0], [rsp + 64], [rsp + 32], [rsp + 96], blockq + 0, 20
- IDCT7 [rsp + 8], [rsp + 72], [rsp + 40], [rsp + 104], blockq + 4, 20
IDCT7 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq + 8, 20
- IDCT7 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20
jmp %%9
ALIGN 16
%%7:
IDCT8 rsp + 0, rsp + 64, rsp + 32, rsp + 96, blockq + 0, 20
- IDCT8 rsp + 16, rsp + 80, rsp + 48, rsp + 112, blockq + 8, 20
%%9:
%endmacro
@@ -805,15 +816,12 @@ SECTION .text
movhps [pixelsq+lsizeq], m0
%endmacro
-INIT_MMX mmx
+INIT_XMM sse2
cglobal simple_idct, 1, 2, 8, 128, block, t0
IDCT
- emms
RET
-INIT_XMM sse2
-
cglobal simple_idct_put, 3, 5, 8, 128, pixels, lsize, block, lsize3, t0
IDCT
lea lsize3q, [lsizeq*3]
diff --git a/libavcodec/x86/simple_idct.h b/libavcodec/x86/simple_idct.h
index 9b64cfe9bc..c9ba6aedaf 100644
--- a/libavcodec/x86/simple_idct.h
+++ b/libavcodec/x86/simple_idct.h
@@ -22,10 +22,7 @@
#include <stddef.h>
#include <stdint.h>
-void ff_simple_idct_mmx(int16_t *block);
-void ff_simple_idct_add_mmx(uint8_t *dest, ptrdiff_t line_size, int16_t
*block);
-void ff_simple_idct_put_mmx(uint8_t *dest, ptrdiff_t line_size, int16_t
*block);
-
+void ff_simple_idct_sse2(int16_t *block);
void ff_simple_idct_add_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t
*block);
void ff_simple_idct_put_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t
*block);
commit 625f5c993cf99a2adf446d8eba7b947999f14267
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Wed Nov 5 04:04:02 2025 +0100
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Sat Nov 8 18:48:54 2025 +0100
avcodec/tests/dct: Remove unnecessary emms_c
Unnecessary since the Xvid IDCT no longer uses MMX registers at all.
(Notice that the simple MMX IDCT issues emms and is therefore ABI
compliant.)
Reviewed-by: Lynne <[email protected]>
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavcodec/tests/dct.c b/libavcodec/tests/dct.c
index 784b49276c..eb74f3559e 100644
--- a/libavcodec/tests/dct.c
+++ b/libavcodec/tests/dct.c
@@ -37,7 +37,6 @@
#include "libavutil/cpu.h"
#include "libavutil/common.h"
-#include "libavutil/emms.h"
#include "libavutil/internal.h"
#include "libavutil/lfg.h"
#include "libavutil/mem_internal.h"
@@ -212,7 +211,6 @@ static int dct_error(const struct algo *dct, int test, int
is_idct, int speed, c
permute(block, block1, dct->perm_type);
dct->func(block);
- emms_c();
if (!strcmp(dct->name, "IJG-AAN-INT")) {
for (i = 0; i < 64; i++) {
@@ -287,7 +285,6 @@ static int dct_error(const struct algo *dct, int test, int
is_idct, int speed, c
memcpy(block, block1, sizeof(block));
dct->func(block);
}
- emms_c();
it1 += NB_ITS_SPEED;
ti1 = av_gettime_relative() - ti;
} while (ti1 < 1000000);
@@ -449,7 +446,6 @@ static void idct248_error(const char *name,
block[i] = block1[i];
idct248_put(img_dest, 8, block);
}
- emms_c();
it1 += NB_ITS_SPEED;
ti1 = av_gettime_relative() - ti;
} while (ti1 < 1000000);
commit a26b99f7933cffd209342905669a6ffa2a537faf
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Wed Nov 5 03:58:12 2025 +0100
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Sat Nov 8 18:48:54 2025 +0100
avcodec/x86/xvididct: Remove remnants of MMX
The non-MMX code only uses the first six rounders.
Reviewed-by: Lynne <[email protected]>
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavcodec/x86/xvididct.asm b/libavcodec/x86/xvididct.asm
index 0daa2edd42..c3bfabb955 100644
--- a/libavcodec/x86/xvididct.asm
+++ b/libavcodec/x86/xvididct.asm
@@ -24,7 +24,7 @@
;
; More details at http://skal.planet-d.net/coding/dct.html
;
-; ======= MMX and XMM forward discrete cosine transform =======
+; =========== XMM forward discrete cosine transform ===========
;
; Copyright(C) 2001 Peter Ross <[email protected]>
;
@@ -67,7 +67,6 @@
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
-; Similar to tg_1_16 in MMX code
tan1: times 8 dw 13036
tan2: times 8 dw 27146
tan3: times 8 dw 43790
@@ -91,7 +90,6 @@ iTab4: dw 0x4b42, 0x6254, 0xb4be, 0x9dac, 0x4b42, 0xd746,
0x4b42, 0xd746
dw 0x3b21, 0x14c3, 0x587e, 0xeb3d, 0x14c3, 0x587e, 0x14c3, 0xc4df
dw 0x6862, 0x587e, 0x979e, 0xc4df, 0x3b21, 0x979e, 0x587e, 0x979e
-; Similar to rounder_0 in MMX code
; 4 first similar, then: 4*8->6*16 5*8->4*16 6/7*8->5*16
walkenIdctRounders: times 4 dd 65536
times 4 dd 3597
@@ -99,7 +97,6 @@ walkenIdctRounders: times 4 dd 65536
times 4 dd 1203
times 4 dd 120
times 4 dd 512
- times 2 dd 0
SECTION .text
commit b03b09aeda1cd890f71f9e6b0bec0a062af4e3be
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Wed Nov 5 02:59:59 2025 +0100
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Sat Nov 8 18:48:54 2025 +0100
avcodec/x86/xvididct: Don't use MMX registers in SSE2 function
It is highly surprising and would necessitate emms in order to be ABI
compliant; but it is better just not to use them in the first place.
Reviewed-by: Lynne <[email protected]>
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavcodec/x86/xvididct.asm b/libavcodec/x86/xvididct.asm
index 4197551cdf..0daa2edd42 100644
--- a/libavcodec/x86/xvididct.asm
+++ b/libavcodec/x86/xvididct.asm
@@ -101,8 +101,6 @@ walkenIdctRounders: times 4 dd 65536
times 4 dd 512
times 2 dd 0
-pb_127: times 8 db 127
-
SECTION .text
; Temporary storage before the column pass
@@ -167,36 +165,47 @@ SECTION .text
%define TAN1 xmm2
%endif
-%macro JZ 2
- test %1, %1
+%macro JZ 3
+ test %1%3, %1%3
jz .%2
%endmacro
-%macro JNZ 2
- test %1, %1
+%macro JNZ 3
+ test %1%3, %1%3
jnz .%2
%endmacro
%macro TEST_ONE_ROW 4 ; src, reg, clear, arg
%3 %4
- movq mm1, [%1]
- por mm1, [%1 + 8]
- paddusb mm1, mm0
- pmovmskb %2, mm1
+ mova m1, [%1]
+ ; due to signed saturation, m1 is all zero iff m1 is all zero after packing
+ packsswb m1, m1
+%if ARCH_X86_64
+ movq %2, m1
+%else
+ packsswb m1, m1
+ movd %2, m1
+%endif
%endmacro
;row1, row2, reg1, reg2, clear1, arg1, clear2, arg2
%macro TEST_TWO_ROWS 8
%5 %6
%7 %8
- movq mm1, [%1 + 0]
- por mm1, [%1 + 8]
- movq mm2, [%2 + 0]
- por mm2, [%2 + 8]
- paddusb mm1, mm0
- paddusb mm2, mm0
- pmovmskb %3, mm1
- pmovmskb %4, mm2
+ mova m1, [%1]
+ packsswb m1, [%2]
+ packsswb m1, m1
+%if ARCH_X86_64
+ movq %4, m1
+ mov %3d, %4d
+ shr %4q, 32
+%else
+ packsswb m1, m1
+ movd %3, m1
+ mov %4, %3
+ shr %4, 16
+ and %3, 0xFFFF
+%endif
%endmacro
; IDCT pass on rows.
@@ -499,16 +508,16 @@ SECTION .text
%macro IDCT_SSE2 1 ; 0=normal 1=put 2=add
%if %1 == 0 || ARCH_X86_32
- %define GPR0 r1d
- %define GPR1 r2d
- %define GPR2 r3d
- %define GPR3 r4d
+ %define GPR0 r1
+ %define GPR1 r2
+ %define GPR2 r3
+ %define GPR3 r4
%define NUM_GPRS 5
%else
- %define GPR0 r3d
- %define GPR1 r4d
- %define GPR2 r5d
- %define GPR3 r6d
+ %define GPR0 r3
+ %define GPR1 r4
+ %define GPR2 r5
+ %define GPR3 r6
%define NUM_GPRS 7
%endif
%if %1 == 0
@@ -527,34 +536,33 @@ cglobal xvid_idct_add, 0, NUM_GPRS, 8+7*ARCH_X86_64,
dest, stride, block
%xdefine BLOCK r0q
%endif
%endif
- movq mm0, [pb_127]
iMTX_MULT BLOCK + 0*16, iTab1, PUT_EVEN, ROW0, 0*16
iMTX_MULT BLOCK + 1*16, iTab2, PUT_ODD, ROW1, 1*16
iMTX_MULT BLOCK + 2*16, iTab3, PUT_EVEN, ROW2, 2*16
TEST_TWO_ROWS BLOCK + 3*16, BLOCK + 4*16, GPR0, GPR1, CLEAR_ODD, ROW3,
CLEAR_EVEN, ROW4 ; a, c
- JZ GPR0, col1
+ JZ GPR0, col1, d
iMTX_MULT BLOCK + 3*16, iTab4, PUT_ODD, ROW3, 3*16
.col1:
TEST_TWO_ROWS BLOCK + 5*16, BLOCK + 6*16, GPR0, GPR2, CLEAR_ODD, ROW5,
CLEAR_EVEN, ROW6 ; a, d
TEST_ONE_ROW BLOCK + 7*16, GPR3, CLEAR_ODD, ROW7 ; esi
iLLM_HEAD
- JNZ GPR1, 2
- JNZ GPR0, 3
- JNZ GPR2, 4
- JNZ GPR3, 5
+ JNZ GPR1, 2, d
+ JNZ GPR0, 3, d
+ JNZ GPR2, 4, d
+ JNZ GPR3, 5, q
iLLM_PASS_SPARSE BLOCK, %1
jmp .6
.2:
iMTX_MULT BLOCK + 4*16, iTab1, PUT_EVEN, ROW4
.3:
iMTX_MULT BLOCK + 5*16, iTab4, PUT_ODD, ROW5, 4*16
- JZ GPR2, col2
+ JZ GPR2, col2, d
.4:
iMTX_MULT BLOCK + 6*16, iTab3, PUT_EVEN, ROW6, 5*16
.col2:
- JZ GPR3, col3
+ JZ GPR3, col3, q
.5:
iMTX_MULT BLOCK + 7*16, iTab2, PUT_ODD, ROW7, 5*16
.col3:
commit a7013f813c7cd13b3ccb066bd0cb1231b9749818
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Tue Nov 4 17:53:30 2025 +0100
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Sat Nov 8 18:48:54 2025 +0100
avcodec/tests/x86/dct: Test 32bit simple idct
The test has been removed in bfb28b5ce89f3e950214b67ea95b45e3355c2caf
when MMX idctdsp functions overridden by SSE2 were removed;
ff_simple_idct_mmx() has been completely disabled in this patch
for x64 and so the test should have been disabled on x64 instead
of removing it.
Reviewed-by: Lynne <[email protected]>
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavcodec/tests/x86/dct.c b/libavcodec/tests/x86/dct.c
index 7800abc7f7..e864de6904 100644
--- a/libavcodec/tests/x86/dct.c
+++ b/libavcodec/tests/x86/dct.c
@@ -88,6 +88,10 @@ static const struct algo idct_tab_arch[] = {
{ "SIMPLE10-AVX", ff_simple_idct10_avx, FF_IDCT_PERM_TRANSPOSE,
AV_CPU_FLAG_AVX},
{ "SIMPLE12-AVX", ff_simple_idct12_avx, FF_IDCT_PERM_TRANSPOSE,
AV_CPU_FLAG_AVX, 1 },
#endif
+#else
+#if HAVE_SSE2_EXTERNAL
+ { "SIMPLE-SSE2", ff_simple_idct_mmx, FF_IDCT_PERM_SIMPLE,
AV_CPU_FLAG_SSE2},
+#endif
#endif
#endif
{ 0 }
commit 86f8adc58e0443024f5ad992ecdb959f0d6d8d95
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Tue Nov 4 13:56:01 2025 +0100
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Sat Nov 8 18:48:54 2025 +0100
avcodec/x86/idctdsp_init: Fix IDCT permutation for 32bit without SSE2
bfb28b5ce89f3e950214b67ea95b45e3355c2caf removed the MMX idct_put
and idct_add functions, because they were overridden by SSE2 versions
(which use SSE2 only for the put/add part, not the actual IDCT).
This meant that for MMX, the idct functions are not set in unison,
so that the permutation which is meant to apply to all three
is incorrect on 32bit systems if SSE2 is unavailable/disabled.
Fix this by setting the MMX version only if SSE2 is enabled.
(No one complained, so apparently no one uses a new FFmpeg
with non-SSE2 capable systems.)
Reviewed-by: Lynne <[email protected]>
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavcodec/x86/idctdsp_init.c b/libavcodec/x86/idctdsp_init.c
index 2d165b975b..281d143ade 100644
--- a/libavcodec/x86/idctdsp_init.c
+++ b/libavcodec/x86/idctdsp_init.c
@@ -65,18 +65,6 @@ av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c,
AVCodecContext *avctx,
{
int cpu_flags = av_get_cpu_flags();
-#if ARCH_X86_32
- if (EXTERNAL_MMX(cpu_flags)) {
- if (!high_bit_depth &&
- avctx->lowres == 0 &&
- (avctx->idct_algo == FF_IDCT_AUTO ||
- avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
- avctx->idct_algo == FF_IDCT_SIMPLEMMX)) {
- c->idct = ff_simple_idct_mmx;
- }
- }
-#endif
-
if (EXTERNAL_SSE2(cpu_flags)) {
c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_sse2;
c->put_pixels_clamped = ff_put_pixels_clamped_sse2;
@@ -88,6 +76,7 @@ av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c,
AVCodecContext *avctx,
(avctx->idct_algo == FF_IDCT_AUTO ||
avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
avctx->idct_algo == FF_IDCT_SIMPLEMMX)) {
+ c->idct = ff_simple_idct_mmx;
c->idct_put = ff_simple_idct_put_sse2;
c->idct_add = ff_simple_idct_add_sse2;
c->perm_type = FF_IDCT_PERM_SIMPLE;
-----------------------------------------------------------------------
Summary of changes:
libavcodec/dvdec.c | 3 -
libavcodec/mjpegdec.c | 2 -
libavcodec/tests/dct.c | 4 -
libavcodec/tests/x86/dct.c | 4 +
libavcodec/x86/idctdsp_init.c | 13 +-
libavcodec/x86/simple_idct.asm | 1244 ++++++++++++++++++++--------------------
libavcodec/x86/simple_idct.h | 5 +-
libavcodec/x86/xvididct.asm | 81 +--
8 files changed, 675 insertions(+), 681 deletions(-)
hooks/post-receive
--
_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]