On 4/4/2017 10:53 PM, James Darnley wrote: > Haswell: > - 1.02x faster (405±0.7 vs. 397±0.8 decicycles) compared with mmxext > > Skylake-U: > - 1.06x faster (498±1.8 vs. 470±1.3 decicycles) compared with mmxext > --- > libavcodec/x86/h264_idct.asm | 20 ++++++++++++++++++++ > libavcodec/x86/h264dsp_init.c | 2 ++ > 2 files changed, 22 insertions(+) > > diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm > index 24fb4d2..7fd57d3 100644 > --- a/libavcodec/x86/h264_idct.asm > +++ b/libavcodec/x86/h264_idct.asm > @@ -1158,7 +1158,27 @@ INIT_XMM avx > movd [%7+%8], %4 > %endmacro > > +%macro DC_ADD_INIT 1 > + add %1d, 32 > + sar %1d, 6 > + movd m0, %1d > + SPLATW m0, m0, 0
Considering DC_ADD_MMXEXT_OP works with dwords, a single pshuflw should be enough. This macro calls two instructions to fill the entire XMM register, and there's no need for that. You could for that matter try to optimize DC_ADD_MMXEXT_OP a bit, combining said dwords with punpk* into fewer registers to reduce the amount of padd* and psub* needed afterwards. See ADD_RES_MMX_4_8 in hevc_add_res.asm And again, SSE2 first, AVX only if measurably faster. But since you're not making use of the wider XMM regs here at all, the only chips that will see any real speed up are those slow in mmx (like Skylake seems to be). > + lea %1, [3*stride_q] > + pxor m1, m1 > + psubw m1, m0 > + packuswb m0, m0 > + packuswb m1, m1 > +%endmacro > + > cglobal h264_idct_add_8, 3, 3, 8, dst_, block_, stride_ > movsxdifnidn stride_q, stride_d > IDCT4_ADD dst_q, block_q, stride_q > RET > + > +cglobal h264_idct_dc_add_8, 3, 4, 0, dst_, block_, stride_ > + movsxdifnidn stride_q, stride_d > + movsx r3d, word [block_q] > + mov dword [block_q], 0 > + DC_ADD_INIT r3 > + DC_ADD_MMXEXT_OP movd, dst_q, stride_q, r3 > +RET > diff --git a/libavcodec/x86/h264dsp_init.c b/libavcodec/x86/h264dsp_init.c > index 8ba085f..bf74937 100644 > --- a/libavcodec/x86/h264dsp_init.c > +++ b/libavcodec/x86/h264dsp_init.c > @@ -35,6 +35,7 @@ IDCT_ADD_FUNC(, 8, mmx) > IDCT_ADD_FUNC(, 8, avx) > IDCT_ADD_FUNC(, 10, sse2) > IDCT_ADD_FUNC(_dc, 8, mmxext) > +IDCT_ADD_FUNC(_dc, 8, avx) > IDCT_ADD_FUNC(_dc, 10, mmxext) > IDCT_ADD_FUNC(8_dc, 8, mmxext) > IDCT_ADD_FUNC(8_dc, 10, sse2) > @@ -340,6 +341,7 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const > int bit_depth, > } > > c->h264_idct_add = ff_h264_idct_add_8_avx; > + c->h264_idct_dc_add = ff_h264_idct_dc_add_8_avx; > } > } else if (bit_depth == 10) { > if (EXTERNAL_MMXEXT(cpu_flags)) { > _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel