On 12 frames of a 444p 12-bit DNxHR sequence, for the _put function: C: 78902 decicycles in idct, 262071 runs, 73 skips; AVX: 32478 decicycles in idct, 262045 runs, 99 skips.
Difference between the two: stddev: 0.39, PSNR: 104.47, MAXDIFF: 2. This is unavoidable and due to the scale factors used in the x86 version, which cannot match the C ones, as this would cause overflows (there is one bit less of precision). In particular, the trick of merging an addition into a multiplication in the first butterfly of the pass can cause an overflow (the 15-bit coefficients would then need 16 bits). --- libavcodec/x86/idctdsp_init.c | 21 +++++++++++++++++++-- libavcodec/x86/simple_idct.h | 6 ++++++ libavcodec/x86/simple_idct10.asm | 17 +++++++++++++++++ 3 files changed, 42 insertions(+), 2 deletions(-) diff --git a/libavcodec/x86/idctdsp_init.c b/libavcodec/x86/idctdsp_init.c index 4fc9b0d..be563c2 100644 --- a/libavcodec/x86/idctdsp_init.c +++ b/libavcodec/x86/idctdsp_init.c @@ -86,8 +86,8 @@ av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx, c->add_pixels_clamped = ff_add_pixels_clamped_sse2; } - if (ARCH_X86_64 && - avctx->bits_per_raw_sample == 10 && avctx->lowres == 0 && + if (ARCH_X86_64 && avctx->lowres == 0) { + if (avctx->bits_per_raw_sample == 10 && (avctx->idct_algo == FF_IDCT_AUTO || avctx->idct_algo == FF_IDCT_SIMPLEAUTO || avctx->idct_algo == FF_IDCT_SIMPLE)) { @@ -102,5 +102,22 @@ av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx, c->idct_put = ff_simple_idct10_put_avx; c->perm_type = FF_IDCT_PERM_TRANSPOSE; } + } + + if (avctx->bits_per_raw_sample == 12 && + (avctx->idct_algo == FF_IDCT_AUTO || + avctx->idct_algo == FF_IDCT_SIMPLEMMX)) { + if (EXTERNAL_SSE2(cpu_flags)) { + c->idct = ff_simple_idct12_sse2; + c->idct_put = ff_simple_idct12_put_sse2; + c->perm_type = FF_IDCT_PERM_TRANSPOSE; + + } + if (EXTERNAL_AVX(cpu_flags)) { + c->idct = ff_simple_idct12_avx; + c->idct_put = ff_simple_idct12_put_avx; + c->perm_type = FF_IDCT_PERM_TRANSPOSE; + } + } } } diff --git a/libavcodec/x86/simple_idct.h b/libavcodec/x86/simple_idct.h index e8f59c1..8eeb31e 100644 --- a/libavcodec/x86/simple_idct.h +++ 
b/libavcodec/x86/simple_idct.h @@ -31,4 +31,10 @@ void ff_simple_idct10_avx(int16_t *block); void ff_simple_idct10_put_sse2(uint8_t *dest, int line_size, int16_t *block); void ff_simple_idct10_put_avx(uint8_t *dest, int line_size, int16_t *block); +void ff_simple_idct12_sse2(int16_t *block); +void ff_simple_idct12_avx(int16_t *block); + +void ff_simple_idct12_put_sse2(uint8_t *dest, int line_size, int16_t *block); +void ff_simple_idct12_put_avx(uint8_t *dest, int line_size, int16_t *block); + #endif /* AVCODEC_X86_SIMPLE_IDCT_H */ diff --git a/libavcodec/x86/simple_idct10.asm b/libavcodec/x86/simple_idct10.asm index b1f45ea..a410191 100644 --- a/libavcodec/x86/simple_idct10.asm +++ b/libavcodec/x86/simple_idct10.asm @@ -29,9 +29,13 @@ SECTION_RODATA +cextern pw_1 +cextern pw_2 cextern pw_8 cextern pw_1023 +cextern pw_4095 pd_round: times 4 dd 1<<(13-1) +pd_round2: times 4 dd 1<<(15-1) %include "libavcodec/x86/simple_idct10_template.asm" @@ -47,6 +51,19 @@ cglobal simple_idct10_put, 3, 3, 16 mova m15, [pd_round] IDCT_FN "", 13, pw_8, 18, 0, pw_1023 RET + +cglobal simple_idct12, 1, 1, 16 + mova m15, [pd_round2] + IDCT_FN "", 15, pw_2, 16 + RET + +cglobal simple_idct12_put, 3, 3, 16 + ; range isn't known, so the C simple_idct range is used + ; Also, using a bias on input overflows, so use the bias + ; on output of the first butterfly instead + mova m15, [pd_round2] + IDCT_FN "", 15, pw_2, 16, 0, pw_4095 + RET %endmacro INIT_XMM sse2 -- 2.6.0 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel