1.04x faster (521±1.7 vs. 501±1.1 decicycles) compared with mmxext
---
 libavcodec/x86/h264_idct.asm  | 21 +++++++++++++++++++++
 libavcodec/x86/h264dsp_init.c |  2 ++
 2 files changed, 23 insertions(+)
diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm index ca8ffdb..c4b6e55 100644 --- a/libavcodec/x86/h264_idct.asm +++ b/libavcodec/x86/h264_idct.asm @@ -1158,6 +1158,18 @@ INIT_XMM avx movd [%7+%8], %4 %endmacro +%macro DC_ADD_INIT 1 + add %1d, 32 + sar %1d, 6 + movd m0, %1d + SPLATW m0, m0, 0 + lea %1, [3*stride_q] + pxor m1, m1 + psubw m1, m0 + packuswb m0, m0 + packuswb m1, m1 +%endmacro + cglobal h264_idct_add_8, 3, 3, 8, dst_, block_, stride_ movsxdifnidn stride_q, stride_d IDCT4_ADD dst_q, block_q, stride_q @@ -1167,3 +1179,12 @@ cglobal h264_idct8_add_8, 3, 4, 10, dst_, block_, stride_ movsxdifnidn stride_q, stride_d IDCT8_ADD_SSE dst_q, block_q, stride_q, r3 RET + +; Not any faster +cglobal h264_idct_dc_add_8, 3, 4, 0, dst_, block_, stride_ + movsxdifnidn stride_q, stride_d + movsx r3d, word [block_q] + mov dword [block_q], 0 + DC_ADD_INIT r3 + DC_ADD_MMXEXT_OP movd, dst_q, stride_q, r3 +RET diff --git a/libavcodec/x86/h264dsp_init.c b/libavcodec/x86/h264dsp_init.c index 2172a71..1aa66a8 100644 --- a/libavcodec/x86/h264dsp_init.c +++ b/libavcodec/x86/h264dsp_init.c @@ -35,6 +35,7 @@ IDCT_ADD_FUNC(, 8, mmx) IDCT_ADD_FUNC(, 8, avx) IDCT_ADD_FUNC(, 10, sse2) IDCT_ADD_FUNC(_dc, 8, mmxext) +IDCT_ADD_FUNC(_dc, 8, avx) IDCT_ADD_FUNC(_dc, 10, mmxext) IDCT_ADD_FUNC(8_dc, 8, mmxext) IDCT_ADD_FUNC(8_dc, 10, sse2) @@ -342,6 +343,7 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, c->h264_idct_add = ff_h264_idct_add_8_avx; c->h264_idct8_add = ff_h264_idct8_add_8_avx; + c->h264_idct_dc_add = ff_h264_idct_dc_add_8_avx; } } else if (bit_depth == 10) { if (EXTERNAL_MMXEXT(cpu_flags)) { -- 2.8.3 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel