---
Review note: the 8-byte-wide avg16 function is assembled with INIT_MMX mmxext
and uses pavgw on MMX registers. pavgw is not part of baseline MMX (it was
added with the SSE integer extensions, i.e. MMXEXT), so the function pointer
must be installed under EXTERNAL_MMXEXT(), not EXTERNAL_MMX(); otherwise
MMX-only CPUs would hit an illegal instruction.

 libavcodec/x86/vp9dsp_init_16bpp.c | 42 ++++++++++++++++++++++++++++++--------
 libavcodec/x86/vp9mc.asm           | 24 ++++++++++++++++++++++
 2 files changed, 58 insertions(+), 8 deletions(-)

diff --git a/libavcodec/x86/vp9dsp_init_16bpp.c b/libavcodec/x86/vp9dsp_init_16bpp.c
index 3319012..25a7b2a 100644
--- a/libavcodec/x86/vp9dsp_init_16bpp.c
+++ b/libavcodec/x86/vp9dsp_init_16bpp.c
@@ -36,14 +36,22 @@ void ff_vp9_##avg##sz##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
                               const uint8_t *src, ptrdiff_t src_stride, \
                               int h, int mx, int my)
 
-fpel_func(put,   8, mmx);
-fpel_func(put,  16, sse);
-fpel_func(put,  32, sse);
-fpel_func(put,  64, sse);
-fpel_func(put, 128, sse);
-fpel_func(put,  32, avx);
-fpel_func(put,  64, avx);
-fpel_func(put, 128, avx);
+fpel_func(put,     8, mmx);
+fpel_func(avg16,   8, mmxext);
+fpel_func(put,    16, sse);
+fpel_func(put,    32, sse);
+fpel_func(put,    64, sse);
+fpel_func(put,   128, sse);
+fpel_func(avg16,  16, sse2);
+fpel_func(avg16,  32, sse2);
+fpel_func(avg16,  64, sse2);
+fpel_func(avg16, 128, sse2);
+fpel_func(put,    32, avx);
+fpel_func(put,    64, avx);
+fpel_func(put,   128, avx);
+fpel_func(avg16,  32, avx2);
+fpel_func(avg16,  64, avx2);
+fpel_func(avg16, 128, avx2);
 #undef fpel_func
 
 #endif /* HAVE_YASM */
@@ -67,18 +75,36 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp, int bpp)
         init_fpel(4, 0,   8, put, mmx);
     }
 
+    if (EXTERNAL_MMXEXT(cpu_flags)) {
+        init_fpel(4, 1,   8, avg16, mmxext);
+    }
+
     if (EXTERNAL_SSE(cpu_flags)) {
         init_fpel(3, 0,  16, put, sse);
         init_fpel(2, 0,  32, put, sse);
         init_fpel(1, 0,  64, put, sse);
         init_fpel(0, 0, 128, put, sse);
     }
+
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        init_fpel(3, 1,  16, avg16, sse2);
+        init_fpel(2, 1,  32, avg16, sse2);
+        init_fpel(1, 1,  64, avg16, sse2);
+        init_fpel(0, 1, 128, avg16, sse2);
+    }
+
     if (EXTERNAL_AVX_FAST(cpu_flags)) {
         init_fpel(2, 0,  32, put, avx);
         init_fpel(1, 0,  64, put, avx);
         init_fpel(0, 0, 128, put, avx);
     }
 
+    if (EXTERNAL_AVX2(cpu_flags)) {
+        init_fpel(2, 1,  32, avg16, avx2);
+        init_fpel(1, 1,  64, avg16, avx2);
+        init_fpel(0, 1, 128, avg16, avx2);
+    }
+
 #undef init_fpel
 
 #endif /* HAVE_YASM */
diff --git a/libavcodec/x86/vp9mc.asm b/libavcodec/x86/vp9mc.asm
index fb5b1e9..ebfb200 100644
--- a/libavcodec/x86/vp9mc.asm
+++ b/libavcodec/x86/vp9mc.asm
@@ -586,6 +586,17 @@ cglobal vp9_%1%2, 5, 5, %7, dst, dstride, src, sstride, h
     pavgb      m1, [dstq+d%3]
     pavgb      m2, [dstq+d%4]
     pavgb      m3, [dstq+d%5]
+%elifidn %1, avg16
+    pavgw      m0, [dstq]
+    pavgw      m1, [dstq+d%3]
+    pavgw      m2, [dstq+d%4]
+    pavgw      m3, [dstq+d%5]
+%if %2/mmsize == 8
+    pavgw      m4, [dstq+mmsize*4]
+    pavgw      m5, [dstq+mmsize*5]
+    pavgw      m6, [dstq+mmsize*6]
+    pavgw      m7, [dstq+mmsize*7]
+%endif
 %endif
     %%dstfn  [dstq], m0
     %%dstfn  [dstq+d%3], m1
@@ -631,6 +642,19 @@ INIT_YMM avx2
 fpel_fn avg, 32, strideq, strideq*2, stride3q, 4
 fpel_fn avg, 64, mmsize, strideq, strideq+mmsize, 2
 %endif
+INIT_MMX mmxext
+fpel_fn avg16, 8, strideq, strideq*2, stride3q, 4
+INIT_XMM sse2
+fpel_fn avg16, 16, strideq, strideq*2, stride3q, 4
+fpel_fn avg16, 32, mmsize, strideq, strideq+mmsize, 2
+fpel_fn avg16, 64, mmsize, mmsize*2, mmsize*3, 1
+fpel_fn avg16, 128, mmsize, mmsize*2, mmsize*3, 1, 8
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+fpel_fn avg16, 32, strideq, strideq*2, stride3q, 4
+fpel_fn avg16, 64, mmsize, strideq, strideq+mmsize, 2
+fpel_fn avg16, 128, mmsize, mmsize*2, mmsize*3, 1
+%endif
 %undef s16
 %undef d16
 %undef s32
-- 
2.1.2

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel