---
 libavcodec/tests/x86/dct.c     |  3 +++
 libavcodec/x86/idctdsp_init.c  |  1 +
 libavcodec/x86/simple_idct.asm | 45 ++++++++++++++++++++++++++++++++++++++++++
 libavcodec/x86/simple_idct.h   |  1 +
 4 files changed, 50 insertions(+)
diff --git a/libavcodec/tests/x86/dct.c b/libavcodec/tests/x86/dct.c index 34f5b8767b..97116570f4 100644 --- a/libavcodec/tests/x86/dct.c +++ b/libavcodec/tests/x86/dct.c @@ -97,6 +97,9 @@ static const struct algo idct_tab_arch[] = { #endif #endif #endif +#if HAVE_SSE2_EXTERNAL + { "SIMPLE-SSE2", ff_simple_idct_sse2, FF_IDCT_PERM_SIMPLE, AV_CPU_FLAG_SSE2 }, +#endif { 0 } }; diff --git a/libavcodec/x86/idctdsp_init.c b/libavcodec/x86/idctdsp_init.c index f1c915aa00..82530a5cc4 100644 --- a/libavcodec/x86/idctdsp_init.c +++ b/libavcodec/x86/idctdsp_init.c @@ -92,6 +92,7 @@ av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx, avctx->idct_algo == FF_IDCT_SIMPLEMMX)) { c->idct_put = ff_simple_idct_put_sse2; c->idct_add = ff_simple_idct_add_sse2; + c->idct = ff_simple_idct_sse2; c->perm_type = FF_IDCT_PERM_SIMPLE; } } diff --git a/libavcodec/x86/simple_idct.asm b/libavcodec/x86/simple_idct.asm index 3b62a4f9d3..a6eb42464b 100644 --- a/libavcodec/x86/simple_idct.asm +++ b/libavcodec/x86/simple_idct.asm @@ -151,6 +151,10 @@ SECTION .text psrad m2, %7 packssdw m7, m1 ; A1+B1 a1+b1 A0+B0 a0+b0 packssdw m2, m4 ; A0-B0 a0-b0 A1-B1 a1-b1 +%if mmsize == 16 +pshufd m7, m7, 8 +pshufd m2, m2, 8 +%endif movq [%5], m7 movq m1, [blockq + %3] ; R3 R1 r3 r1 movq m4, [coeffs + 80] ; -C1 C5 -C1 C5 @@ -172,9 +176,15 @@ SECTION .text psubd m4, m3 ; a3-B3 a3-b3 psrad m6, %7 packssdw m2, m6 ; A3+B3 a3+b3 A2+B2 a2+b2 +%if mmsize == 16 +pshufd m2, m2, 8 +%endif movq [8 + %5], m2 psrad m4, %7 packssdw m4, m0 ; A2-B2 a2-b2 A3-B3 a3-b3 +%if mmsize == 16 +pshufd m4, m4, 8 +%endif movq [16 + %5], m4 jmp %%2 %%1: @@ -182,6 +192,9 @@ SECTION .text paddd m0, [d40000] psrad m0, 13 packssdw m0, m0 +%if mmsize == 16 +pshufd m0, m0, 8 +%endif movq [%5], m0 movq [8 + %5], m0 movq [16 + %5], m0 @@ -239,6 +252,10 @@ SECTION .text psrad m2, %7 packssdw m7, m1 ; A1+B1 a1+b1 A0+B0 a0+b0 packssdw m2, m4 ; A0-B0 a0-b0 A1-B1 a1-b1 +%if mmsize == 16 +pshufd m7, m7, 8 +pshufd m2, m2, 8 +%endif 
movq [%5], m7 movq m1, [blockq + %3] ; R3 R1 r3 r1 movq m4, [coeffs + 80] ; -C1 C5 -C1 C5 @@ -260,9 +277,15 @@ SECTION .text psubd m4, m3 ; a3-B3 a3-b3 psrad m6, %7 packssdw m2, m6 ; A3+B3 a3+b3 A2+B2 a2+b2 +%if mmsize == 16 +pshufd m2, m2, 8 +%endif movq [8 + %5], m2 psrad m4, %7 packssdw m4, m0 ; A2-B2 a2-b2 A3-B3 a3-b3 +%if mmsize == 16 +pshufd m4, m4, 8 +%endif movq [16 + %5], m4 %endmacro @@ -614,9 +637,15 @@ SECTION .text psrad m7, %6 psrad m3, %6 packssdw m4, m7 ; A0 a0 +%if mmsize == 16 +pshufd m4, m4, q0020 +%endif movq [%5], m4 psrad m0, %6 packssdw m0, m3 ; A1 a1 +%if mmsize == 16 +pshufd m0, m0, q0020 +%endif movq [16 + %5], m0 movq [96 + %5], m0 movq [112 + %5], m4 @@ -624,9 +653,15 @@ SECTION .text psrad m6, %6 psrad m2, %6 packssdw m5, m2 ; A2-B2 a2-b2 +%if mmsize == 16 +pshufd m5, m5, q0020 +%endif movq [32 + %5], m5 psrad m1, %6 packssdw m6, m1 ; A3+B3 a3+b3 +%if mmsize == 16 +pshufd m6, m6, q0020 +%endif movq [48 + %5], m6 movq [64 + %5], m6 movq [80 + %5], m5 @@ -711,9 +746,15 @@ SECTION .text movq m7, [coeffs + 32] ; C6 C2 C6 C2 psrad m1, %6 packssdw m4, m1 ; A0 a0 +%if mmsize == 16 +pshufd m4, m4, 8 +%endif movq [%5], m4 psrad m2, %6 packssdw m0, m2 ; A1 a1 +%if mmsize == 16 +pshufd m0, m0, 8 +%endif movq [16 + %5], m0 movq [96 + %5], m0 movq [112 + %5], m4 @@ -889,6 +930,10 @@ RET INIT_XMM sse2 +cglobal simple_idct, 1, 2, 8, 128, block, t0 + IDCT +RET + cglobal simple_idct_put, 3, 5, 8, 128, pixels, lsize, block, lsize3, t0 IDCT lea lsize3q, [lsizeq*3] diff --git a/libavcodec/x86/simple_idct.h b/libavcodec/x86/simple_idct.h index d17ef6a462..b19e910372 100644 --- a/libavcodec/x86/simple_idct.h +++ b/libavcodec/x86/simple_idct.h @@ -26,6 +26,7 @@ void ff_simple_idct_mmx(int16_t *block); void ff_simple_idct_add_mmx(uint8_t *dest, ptrdiff_t line_size, int16_t *block); void ff_simple_idct_put_mmx(uint8_t *dest, ptrdiff_t line_size, int16_t *block); +void ff_simple_idct_sse2(int16_t *block); void ff_simple_idct_add_sse2(uint8_t *dest, ptrdiff_t 
line_size, int16_t *block); void ff_simple_idct_put_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block); -- 2.12.2 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel