--- libavcodec/x86/idctdsp_init.c | 2 ++ libavcodec/x86/simple_idct.h | 3 ++ libavcodec/x86/simple_idct10.asm | 61 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 66 insertions(+)
diff --git a/libavcodec/x86/idctdsp_init.c b/libavcodec/x86/idctdsp_init.c
index 1826d01e0e..9da60d1a1e 100644
--- a/libavcodec/x86/idctdsp_init.c
+++ b/libavcodec/x86/idctdsp_init.c
@@ -103,6 +103,7 @@ av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
                 avctx->idct_algo == FF_IDCT_SIMPLEMMX)) {
                 c->idct      = ff_simple_idct8_sse2;
                 c->idct_put  = ff_simple_idct8_put_sse2;
+                c->idct_add  = ff_simple_idct8_add_sse2;
                 c->perm_type = FF_IDCT_PERM_TRANSPOSE;
         }
     }
@@ -115,6 +116,7 @@ av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
                 avctx->idct_algo == FF_IDCT_SIMPLEMMX)) {
                 c->idct      = ff_simple_idct8_avx;
                 c->idct_put  = ff_simple_idct8_put_avx;
+                c->idct_add  = ff_simple_idct8_add_avx;
                 c->perm_type = FF_IDCT_PERM_TRANSPOSE;
         }
 
diff --git a/libavcodec/x86/simple_idct.h b/libavcodec/x86/simple_idct.h
index b559f8527c..9b64cfe9bc 100644
--- a/libavcodec/x86/simple_idct.h
+++ b/libavcodec/x86/simple_idct.h
@@ -35,6 +35,9 @@ void ff_simple_idct8_avx(int16_t *block);
 
 void ff_simple_idct8_put_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
 void ff_simple_idct8_put_avx(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
 
+void ff_simple_idct8_add_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
+void ff_simple_idct8_add_avx(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
+
 void ff_simple_idct10_sse2(int16_t *block);
 void ff_simple_idct10_avx(int16_t *block);
 
diff --git a/libavcodec/x86/simple_idct10.asm b/libavcodec/x86/simple_idct10.asm
index f31fb5cfa5..29e18fe6a6 100644
--- a/libavcodec/x86/simple_idct10.asm
+++ b/libavcodec/x86/simple_idct10.asm
@@ -82,6 +82,31 @@ SECTION .text
     movhps %8, %12
 %endmacro
 
+%macro LOAD_ZXBW_8 16 ; dst regs %1-%8, src memory operands %9-%16: bytes zero-extended to words
+    pmovzxbw %1, %9
+    pmovzxbw %2, %10
+    pmovzxbw %3, %11
+    pmovzxbw %4, %12
+    pmovzxbw %5, %13
+    pmovzxbw %6, %14
+    pmovzxbw %7, %15
+    pmovzxbw %8, %16
+%endmacro
+
+%macro LOAD_ZXBW_4 9 ; dst regs %1-%4, src memory operands %5-%8, %9 = register the caller has zeroed
+    movh %1, %5
+    movh %2, %6
+    movh %3, %7
+    movh %4, %8
+    punpcklbw %1, %9 ; interleave with zero bytes: unsigned byte -> word
+    punpcklbw %2, %9
+    punpcklbw %3, %9
+    punpcklbw %4, %9
+%endmacro
+
+%define PASS4ROWS(base, stride, stride3) \
+    [base], [base + stride], [base + 2*stride], [base + stride3]
+
 %macro idct_fn 0
 cglobal simple_idct8, 1, 1, 16, block
     IDCT_FN    "", 11, pw_round_20_div_w4, 20, "store"
@@ -99,6 +124,42 @@ cglobal simple_idct8_put, 3, 4, 16, pixels, lsize, block
     STORE_HI_LO PASS8ROWS(pixelsq, r2, lsizeq, r3), m8, m1, m4, m9
     RET
 
+; TODO: optimise by not writing the final data to the block.
+cglobal simple_idct8_add, 3, 4, 16, pixels, lsize, block
+    IDCT_FN    "", 11, pw_round_20_div_w4, 20
+    lea r2, [3*lsizeq]      ; r2 = 3 * lsize
+    lea r3, [pixelsq + r2]  ; r3 = address of row 3; must stay valid until STORE_HI_LO
+    %if cpuflag(sse4)
+        LOAD_ZXBW_8 m3, m5, m6, m7, m12, m13, m14, m15, PASS8ROWS(pixelsq, r3, lsizeq, r2)
+        paddsw m8, m3       ; rows 0-7: IDCT result + existing pixels (as words)
+        paddsw m0, m5
+        paddsw m1, m6
+        paddsw m2, m7
+        paddsw m4, m12
+        paddsw m11, m13
+        paddsw m9, m14
+        paddsw m10, m15
+    %else
+        pxor m12, m12       ; zero register for the punpcklbw-based zero extension
+        LOAD_ZXBW_4 m3, m5, m6, m7, PASS4ROWS(pixelsq, lsizeq, r2), m12
+        paddsw m8, m3       ; rows 0-3
+        paddsw m0, m5
+        paddsw m1, m6
+        paddsw m2, m7
+        ; rows 4-7 are addressed from r3 (row 3): pixelsq must still point at row 0 for STORE_HI_LO
+        LOAD_ZXBW_4 m3, m5, m6, m7, [r3 + lsizeq], [r3 + 2*lsizeq], [r3 + r2], [r3 + 4*lsizeq], m12
+        paddsw m4, m3       ; rows 4-7
+        paddsw m11, m5
+        paddsw m9, m6
+        paddsw m10, m7
+    %endif
+    packuswb m8, m0         ; saturate words to unsigned bytes, two rows per register
+    packuswb m1, m2
+    packuswb m4, m11
+    packuswb m9, m10
+    STORE_HI_LO PASS8ROWS(pixelsq, r3, lsizeq, r2), m8, m1, m4, m9
+RET
+
 cglobal simple_idct10, 1, 1, 16, block
     IDCT_FN    "", 12, "", 19, "store"
     RET
-- 
2.13.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel