Results for omse on the 3 idct in dct-test. C: 0.16915859 0.11848359 0.12913125 x86: 0.16883281 0.11849063 0.19041875
Using 14 and 17 as shifts substantially improves those, but actually causes overflows and incorrect decoding of 12bpp content. --- libavcodec/simple_idct_template.c | 17 ++++------------- libavcodec/x86/idctdsp_init.c | 8 +++----- libavcodec/x86/simple_idct10.asm | 7 +++---- 3 files changed, 10 insertions(+), 22 deletions(-) diff --git a/libavcodec/simple_idct_template.c b/libavcodec/simple_idct_template.c index 0585679..c94c583 100644 --- a/libavcodec/simple_idct_template.c +++ b/libavcodec/simple_idct_template.c @@ -66,7 +66,6 @@ #elif BIT_DEPTH == 10 || BIT_DEPTH == 12 -# if BIT_DEPTH == 10 #define W1 22725 // 90901 #define W2 21407 // 85627 #define W3 19265 // 77062 @@ -75,6 +74,7 @@ #define W6 8867 // 35468 #define W7 4520 // 18081 +# if BIT_DEPTH == 10 # ifdef EXTRA_SHIFT #define ROW_SHIFT 13 #define COL_SHIFT 18 @@ -84,19 +84,10 @@ #define COL_SHIFT 19 #define DC_SHIFT 2 # endif - # else -#define W1 45451 -#define W2 42813 -#define W3 38531 -#define W4 32767 -#define W5 25746 -#define W6 17734 -#define W7 9041 - -#define ROW_SHIFT 16 -#define COL_SHIFT 17 -#define DC_SHIFT -1 +#define ROW_SHIFT 15 +#define COL_SHIFT 16 +#define DC_SHIFT -1 # endif #define MUL(a, b) ((a) * (b)) diff --git a/libavcodec/x86/idctdsp_init.c b/libavcodec/x86/idctdsp_init.c index bcf7e5b..8b25ff9 100644 --- a/libavcodec/x86/idctdsp_init.c +++ b/libavcodec/x86/idctdsp_init.c @@ -86,11 +86,11 @@ av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx, c->add_pixels_clamped = ff_add_pixels_clamped_sse2; } - if (ARCH_X86_64 && avctx->lowres == 0) { - if (avctx->bits_per_raw_sample == 10 && + if (ARCH_X86_64 && avctx->lowres == 0 && (avctx->idct_algo == FF_IDCT_AUTO || avctx->idct_algo == FF_IDCT_SIMPLEAUTO || avctx->idct_algo == FF_IDCT_SIMPLE)) { + if (avctx->bits_per_raw_sample == 10) { if (EXTERNAL_SSE2(cpu_flags)) { c->idct_put = ff_simple_idct10_put_sse2; c->idct_add = NULL; @@ -106,9 +106,7 @@ av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext 
*avctx, } } - if (avctx->bits_per_raw_sample == 12 && - (avctx->idct_algo == FF_IDCT_AUTO || - avctx->idct_algo == FF_IDCT_SIMPLEMMX)) { + if (avctx->bits_per_raw_sample == 12) { if (EXTERNAL_SSE2(cpu_flags)) { c->idct_put = ff_simple_idct12_put_sse2; c->idct_add = NULL; diff --git a/libavcodec/x86/simple_idct10.asm b/libavcodec/x86/simple_idct10.asm index cd83d61..c5ee05c 100644 --- a/libavcodec/x86/simple_idct10.asm +++ b/libavcodec/x86/simple_idct10.asm @@ -29,12 +29,11 @@ SECTION_RODATA -cextern pw_2 -cextern pw_16 cextern pw_1023 cextern pw_4095 pd_round_12: times 4 dd 1<<(12-1) pd_round_15: times 4 dd 1<<(15-1) +pd_round_16: times 4 dd 1<<(16-1) pd_round_19: times 4 dd 1<<(19-1) %macro CONST_DEC 3 @@ -79,14 +78,14 @@ cglobal simple_idct10_put, 3, 3, 16 cglobal simple_idct12, 1, 1, 16 ; coeffs are already 15bits, adding the offset would cause ; overflow in the input - IDCT_FN "", 15, pw_2, 16 + IDCT_FN "", 15, "", 16 RET cglobal simple_idct12_put, 3, 3, 16 ; range isn't known, so the C simple_idct range is used ; Also, using a bias on input overflows, so use the bias ; on output of the first butterfly instead - IDCT_FN "", 15, pw_2, 16, 0, pw_4095 + IDCT_FN "", 15, "", 16, 0, pw_4095 RET %endmacro -- 2.6.0 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel