1.00x faster (2884±63.9 vs. 2880±21.1 decicycles) compared with sse2 --- libavcodec/x86/h264_idct.asm | 60 +++++++++++++++++++++++++++++++++++++++++++ libavcodec/x86/h264dsp_init.c | 2 ++ 2 files changed, 62 insertions(+)
diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
index f1f2ce7..1515ea5 100644
--- a/libavcodec/x86/h264_idct.asm
+++ b/libavcodec/x86/h264_idct.asm
@@ -1237,3 +1237,63 @@ cglobal h264_idct_add16_8, 5, 5 + ARCH_X86_64, 8, dst_, block_offset_, block_, s
     add16_sse2_cycle 6, 0x1e
     add16_sse2_cycle 7, 0x26
 RET
+
+; dst, block_offset, block, stride, nnzc, counter, coeff, dst2, picreg   <- register names, in cglobal order
+; 0 1 2 3 4 5 6 7 8                                                      <- matching rN index for each name above
+cglobal h264_idct8_add4_8, 5, 8 + npicregs, 10, dst_, block_offset_, block_, stride_, nnzc_, counter_, coeff_, dst2_, picreg ; walks 4 blocks (counter 0,4,8,12); per block: skip, DC-only add, or full IDCT8 add
+    movsxdifnidn stride_q, stride_d ; x86inc helper: sign-extend the 32-bit stride argument where required
+    xor counter_q, counter_q ; counter = 0
+    %ifdef PIC
+    lea picregq, [scan8_mem] ; PIC builds: keep the address of scan8_mem in picreg (presumably what `scan8` resolves through -- confirm)
+    %endif
+
+    .next_block: ; loop head, entered once per block
+    movzx coeff_d, byte [scan8 + counter_q] ; coeff = scan8[counter] (byte table defined outside this hunk)
+    movzx coeff_d, byte [nnzc_q + coeff_q] ; coeff = nnzc[coeff]
+    test coeff_d, coeff_d
+    jz .skip_block ; nnzc entry is zero: nothing to add for this block
+
+    cmp coeff_d, 1
+    jnz .no_dc ; nnzc entry other than 1: take the full IDCT path
+
+    movsx coeff_d, word [block_q] ; coeff = first 16-bit coefficient of the block, sign-extended
+    test coeff_d, coeff_d
+    jz .no_dc ; first coefficient is zero: also take the full IDCT path
+
+    mov word [block_q], 0 ; clear the first coefficient in the block buffer
+    DC_ADD_INIT r6 ; macro defined elsewhere; r6 is coeff (see mapping above) -- presumably sets up the DC value in vector regs, confirm
+    %define stride3 r6 ; NOTE(review): r6 is reused as stride3 from here on; assumes DC_ADD_INIT leaves the expected value in r6 -- confirm
+    %if ARCH_X86_64 == 0
+    %define dst2_q r1 ; 32-bit builds: dst2 aliases r1, the block_offset register
+    %define dst2_d r1d
+    %endif
+
+    mov dst2_d, dword [block_offset_q + 4*counter_q] ; dst2 = block_offset[counter] (32-bit entries)
+    add dst2_q, dst_q ; dst2 = dst + block_offset[counter]
+    DC_ADD_MMXEXT_OP movq, dst2_q, stride_q, stride3 ; macro defined elsewhere; presumably adds the DC to the first 4 rows -- confirm
+    lea dst2_q, [dst2_q + 4*stride_q] ; advance dst2 by 4 rows
+    DC_ADD_MMXEXT_OP movq, dst2_q, stride_q, stride3 ; same macro applied again 4 rows further down
+    %if ARCH_X86_64 == 0
+    mov block_offset_q, block_offset_m ; 32-bit builds: reload block_offset from its argument slot, r1 was overwritten as dst2
+    %endif
+
+    add counter_q, 4 ; next block index (same loop tail as .skip_block, duplicated here)
+    add block_q, 128 ; advance block pointer by 128 bytes (64 16-bit coefficients)
+    cmp counter_q, 16
+    jl .next_block ; loop while counter < 16
+    RET
+
+    .no_dc: ; full IDCT path
+    mov dst2_d, dword [block_offset_q + 4*counter_q] ; dst2 = block_offset[counter]
+    add dst2_q, dst_q ; dst2 = dst + block_offset[counter]
+    IDCT8_ADD_SSE dst2_q, block_q, stride_q, stride3 ; macro defined elsewhere; stride3 (r6) is passed as its fourth argument
+    %if ARCH_X86_64 == 0
+    mov block_offset_q, block_offset_m ; 32-bit builds: reload block_offset, r1 was overwritten as dst2
+    %endif
+
+    .skip_block: ; shared loop tail for the skip and full-IDCT paths
+    add counter_q, 4 ; next block index
+    add block_q, 128 ; advance block pointer by 128 bytes
+    cmp counter_q, 16
+    jl .next_block ; loop while counter < 16
+RET
diff --git a/libavcodec/x86/h264dsp_init.c b/libavcodec/x86/h264dsp_init.c
index 3396fd8..4050276 100644
--- a/libavcodec/x86/h264dsp_init.c
+++ b/libavcodec/x86/h264dsp_init.c
@@ -57,6 +57,7 @@ void ff_h264_idct ## NUM ## _add ## REP ## _ ## DEPTH ## _ ## OPT \
 IDCT_ADD_REP_FUNC(8, 4, 8, mmx)
IDCT_ADD_REP_FUNC(8, 4, 8, mmxext) IDCT_ADD_REP_FUNC(8, 4, 8, sse2) +IDCT_ADD_REP_FUNC(8, 4, 8, avx) IDCT_ADD_REP_FUNC(8, 4, 10, sse2) IDCT_ADD_REP_FUNC(8, 4, 10, avx) IDCT_ADD_REP_FUNC(, 16, 8, mmx) @@ -348,6 +349,7 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, c->h264_idct_dc_add = ff_h264_idct_dc_add_8_avx; c->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_avx; c->h264_idct_add16 = ff_h264_idct_add16_8_avx; + c->h264_idct8_add4 = ff_h264_idct8_add4_8_avx; } } else if (bit_depth == 10) { if (EXTERNAL_MMXEXT(cpu_flags)) { -- 2.8.3 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel