On 12 frames of a 444p 12-bit DNxHR sequence: C: 78902 decicycles in idct, 262071 runs, 73 skips; AVX: 32478 decicycles in idct, 262045 runs, 99 skips.
Difference between the two: stddev: 0.39 PSNR: 104.47 MAXDIFF: 2. This is unavoidable and due to the scale factors used in the x86 version, which cannot match the C ones, as that would cause overflows (there is one bit less of precision). --- libavcodec/x86/idctdsp_init.c | 19 +++++++++++++++++-- libavcodec/x86/simple_idct.h | 3 +++ libavcodec/x86/simple_idct10.asm | 18 +++++++++++++++--- 3 files changed, 35 insertions(+), 5 deletions(-) diff --git a/libavcodec/x86/idctdsp_init.c b/libavcodec/x86/idctdsp_init.c index 17ddc9e..0051461 100644 --- a/libavcodec/x86/idctdsp_init.c +++ b/libavcodec/x86/idctdsp_init.c @@ -86,8 +86,8 @@ av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx, c->add_pixels_clamped = ff_add_pixels_clamped_sse2; } - if (ARCH_X86_64 && - avctx->bits_per_raw_sample == 10 && avctx->lowres == 0 && + if (ARCH_X86_64 && avctx->lowres == 0) { + if (avctx->bits_per_raw_sample == 10 && (avctx->idct_algo == FF_IDCT_AUTO || avctx->idct_algo == FF_IDCT_SIMPLEAUTO || avctx->idct_algo == FF_IDCT_SIMPLE)) { @@ -100,5 +100,20 @@ av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx, c->idct_put = ff_simple_idct10_put_avx; c->perm_type = FF_IDCT_PERM_TRANSPOSE; } + } + + if (avctx->bits_per_raw_sample == 12 && + (avctx->idct_algo == FF_IDCT_AUTO || + avctx->idct_algo == FF_IDCT_SIMPLEMMX)) { + if (EXTERNAL_SSE2(cpu_flags)) { + c->idct_put = ff_simple_idct12_put_sse2; + c->perm_type = FF_IDCT_PERM_TRANSPOSE; + + } + if (EXTERNAL_AVX(cpu_flags)) { + c->idct_put = ff_simple_idct12_put_avx; + c->perm_type = FF_IDCT_PERM_TRANSPOSE; + } + } } } diff --git a/libavcodec/x86/simple_idct.h b/libavcodec/x86/simple_idct.h index d886434..0a90c36 100644 --- a/libavcodec/x86/simple_idct.h +++ b/libavcodec/x86/simple_idct.h @@ -28,4 +28,7 @@ void ff_simple_idct_put_mmx(uint8_t *dest, int line_size, int16_t *block); void ff_simple_idct10_put_sse2(uint8_t *dest, int line_size, int16_t *block); void ff_simple_idct10_put_avx(uint8_t *dest, int 
line_size, int16_t *block); +void ff_simple_idct12_put_sse2(uint8_t *dest, int line_size, int16_t *block); +void ff_simple_idct12_put_avx(uint8_t *dest, int line_size, int16_t *block); + #endif /* AVCODEC_X86_SIMPLE_IDCT_H */ diff --git a/libavcodec/x86/simple_idct10.asm b/libavcodec/x86/simple_idct10.asm index 77db0a7..06290ae 100644 --- a/libavcodec/x86/simple_idct10.asm +++ b/libavcodec/x86/simple_idct10.asm @@ -29,25 +29,37 @@ SECTION_RODATA +cextern pw_1 +cextern pw_2 cextern pw_8 cextern pw_1023 +cextern pw_4095 pd_round: times 4 dd 1<<(13-1) %include "libavcodec/x86/simple_idct10_template.asm" section .text align=16 -%macro idct_put_fn 1 +%macro idct_put_fn10 1 cglobal simple_idct10_put, 3, 3, %1 IDCT_PUT_FN "", 13, pw_8, 18, 0, pw_1023 RET %endmacro +%macro idct_put_fn12 1 +cglobal simple_idct12_put, 3, 3, %1 + ; range isn't known, so the C simple_idct range is used + IDCT_PUT_FN pw_1, 15, pw_2, 16, 0, pw_4095 + RET +%endmacro + INIT_XMM sse2 -idct_put_fn 16 +idct_put_fn10 16 +idct_put_fn12 16 %if HAVE_AVX_EXTERNAL INIT_XMM avx -idct_put_fn 16 +idct_put_fn10 16 +idct_put_fn12 16 %endif %endif -- 2.6.0 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel