---
 libavcodec/x86/idctdsp_init.c    |  2 ++
 libavcodec/x86/simple_idct.h     |  3 +++
 libavcodec/x86/simple_idct10.asm | 23 +++++++++++++++++++++++
 3 files changed, 28 insertions(+)

diff --git a/libavcodec/x86/idctdsp_init.c b/libavcodec/x86/idctdsp_init.c index 4b2145e478..1826d01e0e 100644 --- a/libavcodec/x86/idctdsp_init.c +++ b/libavcodec/x86/idctdsp_init.c @@ -102,6 +102,7 @@ av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx, avctx->idct_algo == FF_IDCT_SIMPLEAUTO || avctx->idct_algo == FF_IDCT_SIMPLEMMX)) { c->idct = ff_simple_idct8_sse2; + c->idct_put = ff_simple_idct8_put_sse2; c->perm_type = FF_IDCT_PERM_TRANSPOSE; } } @@ -113,6 +114,7 @@ av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx, avctx->idct_algo == FF_IDCT_SIMPLEAUTO || avctx->idct_algo == FF_IDCT_SIMPLEMMX)) { c->idct = ff_simple_idct8_avx; + c->idct_put = ff_simple_idct8_put_avx; c->perm_type = FF_IDCT_PERM_TRANSPOSE; } diff --git a/libavcodec/x86/simple_idct.h b/libavcodec/x86/simple_idct.h index d17a855312..b559f8527c 100644 --- a/libavcodec/x86/simple_idct.h +++ b/libavcodec/x86/simple_idct.h @@ -32,6 +32,9 @@ void ff_simple_idct_put_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block) void ff_simple_idct8_sse2(int16_t *block); void ff_simple_idct8_avx(int16_t *block); +void ff_simple_idct8_put_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block); +void ff_simple_idct8_put_avx(uint8_t *dest, ptrdiff_t line_size, int16_t *block); + void ff_simple_idct10_sse2(int16_t *block); void ff_simple_idct10_avx(int16_t *block); diff --git a/libavcodec/x86/simple_idct10.asm b/libavcodec/x86/simple_idct10.asm index 168b6a08e0..f31fb5cfa5 100644 --- a/libavcodec/x86/simple_idct10.asm +++ b/libavcodec/x86/simple_idct10.asm @@ -71,11 +71,34 @@ CONST_DEC w7_min_w5, W7sh2, -W5sh2 SECTION .text +%macro STORE_HI_LO 12 + movq %1, %9 + movq %3, %10 + movq %5, %11 + movq %7, %12 + movhps %2, %9 + movhps %4, %10 + movhps %6, %11 + movhps %8, %12 +%endmacro + %macro idct_fn 0 cglobal simple_idct8, 1, 1, 16, block IDCT_FN "", 11, pw_round_20_div_w4, 20, "store" RET +; TODO: optimise by not writing the final data to the block. 
+cglobal simple_idct8_put, 3, 4, 16, pixels, lsize, block + IDCT_FN "", 11, pw_round_20_div_w4, 20 + lea r3, [3*lsizeq] + lea r2, [pixelsq + r3] + packuswb m8, m0 + packuswb m1, m2 + packuswb m4, m11 + packuswb m9, m10 + STORE_HI_LO PASS8ROWS(pixelsq, r2, lsizeq, r3), m8, m1, m4, m9 +RET + cglobal simple_idct10, 1, 1, 16, block IDCT_FN "", 12, "", 19, "store" RET -- 2.13.0 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel