On 4/4/2017 10:53 PM, James Darnley wrote:
> Haswell:
>  - 1.11x faster (522±0.4 vs. 469±1.8 decicycles) compared with mmxext
>
> Skylake-U:
>  - 1.21x faster (671±5.5 vs. 555±1.4 decicycles) compared with mmxext

Again, you should add an SSE2 version first, then an AVX one if it's measurably faster than the SSE2 one.

> ---
>  libavcodec/x86/h264_idct.asm | 33 ++++++++++++++++++++++++++++++++-
>  libavcodec/x86/h264dsp_init.c | 3 +++
>  2 files changed, 35 insertions(+), 1 deletion(-)
>
> diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
> index bc4dce4..24fb4d2 100644
> --- a/libavcodec/x86/h264_idct.asm
> +++ b/libavcodec/x86/h264_idct.asm
> @@ -65,7 +65,15 @@ SECTION .text
>
>      IDCT4_1D w, 0, 1, 2, 3, 4, 5
>      mova m6, [pw_32]
> -    TRANSPOSE4x4W 0, 1, 2, 3, 4
> +    %if mmsize == 8
> +        TRANSPOSE4x4W 0, 1, 2, 3, 4
> +    %else
> +        punpcklwd m0, m1
> +        punpcklwd m2, m3
> +        SBUTTERFLY dq, 0, 2, 4
> +        MOVHL m1, m0
> +        MOVHL m3, m2
> +    %endif
>      paddw m0, m6
>      IDCT4_1D w, 0, 1, 2, 3, 4, 5
>      pxor m7, m7
> @@ -1131,3 +1139,26 @@ INIT_MMX mmx
>  IDCT_DC_DEQUANT 0
>  INIT_MMX sse2
>  IDCT_DC_DEQUANT 7
> +
> +INIT_XMM avx
> +
> +; %unmacro STORE_DIFFx2 8 ; remove macro from x86util.asm but yasm doesn't have this yet
> +%macro STORE_DIFFx2 8 ; add1, add2, reg1, reg2, zero, shift, source, stride
> +    movd %3, [%7]
> +    movd %4, [%7+%8]
> +    psraw %1, %6
> +    psraw %2, %6
> +    punpcklbw %3, %5
> +    punpcklbw %4, %5
> +    paddw %3, %1
> +    paddw %4, %2
> +    packuswb %3, %5
> +    packuswb %4, %5
> +    movd [%7], %3
> +    movd [%7+%8], %4
> +%endmacro
> +
> +cglobal h264_idct_add_8, 3, 3, 8, dst_, block_, stride_
> +    movsxdifnidn stride_q, stride_d
> +    IDCT4_ADD dst_q, block_q, stride_q
> +RET
> diff --git a/libavcodec/x86/h264dsp_init.c b/libavcodec/x86/h264dsp_init.c
> index 0643b37..8ba085f 100644
> --- a/libavcodec/x86/h264dsp_init.c
> +++ b/libavcodec/x86/h264dsp_init.c
> @@ -32,6 +32,7 @@ void ff_h264_idct ## NUM ## _add_ ## DEPTH ## _ ## OPT(uint8_t *dst, \
>                                                         int stride);
>
>  IDCT_ADD_FUNC(, 8, mmx)
> +IDCT_ADD_FUNC(, 8, avx)
>  IDCT_ADD_FUNC(, 10, sse2)
>  IDCT_ADD_FUNC(_dc, 8, mmxext)
>  IDCT_ADD_FUNC(_dc, 10, mmxext)
> @@ -337,6 +338,8 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
>              c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_8_avx;
>              c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma422_intra_8_avx;
>          }
> +
> +        c->h264_idct_add = ff_h264_idct_add_8_avx;
>      }
>  } else if (bit_depth == 10) {
>      if (EXTERNAL_MMXEXT(cpu_flags)) {
> _______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel