The AVX version of h264_idct_add16 is 1.01x faster than the SSE2 version (SSE2: 2150±46.1 vs. AVX: 2118±29.0 decicycles). --- libavcodec/x86/h264_idct.asm | 40 +++++++++++++++++++++++++++++++++++++++- libavcodec/x86/h264dsp_init.c | 2 ++ 2 files changed, 41 insertions(+), 1 deletion(-)
diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm index a74e095..f1f2ce7 100644 --- a/libavcodec/x86/h264_idct.asm +++ b/libavcodec/x86/h264_idct.asm @@ -858,7 +858,7 @@ h264_add8x4_idct_sse2: %else add r0, r0m %endif - call h264_add8x4_idct_sse2 + call h264_add8x4_idct_ %+ cpuname %%skip: %if %1 < 7 add r2, 64 @@ -1142,6 +1142,29 @@ IDCT_DC_DEQUANT 7 INIT_XMM avx +ALIGN 16 +; r0 = uint8_t *dst (clobbered), r2 = int16_t *block, r3 = int stride +h264_add8x4_idct_avx: + movu m0, [r2 + 0] + movu m1, [r2 + 32] + movu m2, [r2 + 16] + movu m3, [r2 + 48] + SBUTTERFLY qdq, 0, 1, 4 + SBUTTERFLY qdq, 2, 3, 5 + IDCT4_1D w,0,1,2,3,4,5 + TRANSPOSE2x4x4W 0,1,2,3,4 + paddw m0, [pw_32] + IDCT4_1D w,0,1,2,3,4,5 + pxor m7, m7 + mova [r2+ 0], m7 + mova [r2+16], m7 + mova [r2+32], m7 + mova [r2+48], m7 + STORE_DIFFx2 m0, m1, m4, m5, m7, 6, r0, r3 + lea r0, [r0+r3*2] + STORE_DIFFx2 m2, m3, m4, m5, m7, 6, r0, r3 +ret + ; %unmacro STORE_DIFFx2 8 ; remove macro from x86util.asm but yasm doesn't have this yet %macro STORE_DIFFx2 8 ; add1, add2, reg1, reg2, zero, shift, source, stride movd %3, [%7] @@ -1199,3 +1222,18 @@ cglobal h264_idct8_dc_add_8, 3, 4, 0 lea dst_q, [dst_q + stride_q*4] DC_ADD_MMXEXT_OP movq, dst_q, stride_q, r3 RET + +cglobal h264_idct_add16_8, 5, 5 + ARCH_X86_64, 8, dst_, block_offset_, block_, stride_, nnzc_ + movsxdifnidn stride_q, stride_d + %if ARCH_X86_64 + mov r5, r0 + %endif + add16_sse2_cycle 0, 0xc + add16_sse2_cycle 1, 0x14 + add16_sse2_cycle 2, 0xe + add16_sse2_cycle 3, 0x16 + add16_sse2_cycle 4, 0x1c + add16_sse2_cycle 5, 0x24 + add16_sse2_cycle 6, 0x1e + add16_sse2_cycle 7, 0x26 +RET diff --git a/libavcodec/x86/h264dsp_init.c b/libavcodec/x86/h264dsp_init.c index de7becf..3396fd8 100644 --- a/libavcodec/x86/h264dsp_init.c +++ b/libavcodec/x86/h264dsp_init.c @@ -62,6 +62,7 @@ IDCT_ADD_REP_FUNC(8, 4, 10, avx) IDCT_ADD_REP_FUNC(, 16, 8, mmx) IDCT_ADD_REP_FUNC(, 16, 8, mmxext) IDCT_ADD_REP_FUNC(, 16, 8, sse2) +IDCT_ADD_REP_FUNC(, 16, 8, 
avx) IDCT_ADD_REP_FUNC(, 16, 10, sse2) IDCT_ADD_REP_FUNC(, 16intra, 8, mmx) IDCT_ADD_REP_FUNC(, 16intra, 8, mmxext) @@ -346,6 +347,7 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, c->h264_idct8_add = ff_h264_idct8_add_8_avx; c->h264_idct_dc_add = ff_h264_idct_dc_add_8_avx; c->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_avx; + c->h264_idct_add16 = ff_h264_idct_add16_8_avx; } } else if (bit_depth == 10) { if (EXTERNAL_MMXEXT(cpu_flags)) { -- 2.8.3 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel