On 9/15/2015 9:24 PM, Ronald S. Bultje wrote: > --- > libavcodec/x86/vp9dsp_init_16bpp.c | 42 > ++++++++++++++++++++++++++++++--------
Why not just add all this to vp9dsp_init.c and selectively initialize everything by checking the existing bpp argument in ff_vp9dsp_init_x86()? If you think you wont be able to reuse the existing macros in vp9dsp_init.c then i guess a new file would indeed be better to avoid bloat. Alternatively, macros that can be shared could be moved to a header and then used by both init files (like fpel_func and init_fpel). > libavcodec/x86/vp9mc.asm | 24 ++++++++++++++++++++++ > 2 files changed, 58 insertions(+), 8 deletions(-) > > diff --git a/libavcodec/x86/vp9dsp_init_16bpp.c > b/libavcodec/x86/vp9dsp_init_16bpp.c > index 3319012..25a7b2a 100644 > --- a/libavcodec/x86/vp9dsp_init_16bpp.c > +++ b/libavcodec/x86/vp9dsp_init_16bpp.c > @@ -36,14 +36,22 @@ void ff_vp9_##avg##sz##_##opt(uint8_t *dst, ptrdiff_t > dst_stride, \ The naming scheme for other dsp functions in libavcodec is more like ff_codec_function_size_{8,10,12}_cpuflag(). Not too important for these as most are shared between all bpps, but for other functions it may be a good idea to rename the 8bit functions to follow the above naming scheme in preparation for the new 10/12bits ones. > const uint8_t *src, ptrdiff_t src_stride, \ > int h, int mx, int my) > > -fpel_func(put, 8, mmx); > -fpel_func(put, 16, sse); > -fpel_func(put, 32, sse); > -fpel_func(put, 64, sse); > -fpel_func(put, 128, sse); > -fpel_func(put, 32, avx); > -fpel_func(put, 64, avx); > -fpel_func(put, 128, avx); > +fpel_func(put, 8, mmx); > +fpel_func(avg16, 8, mmxext); > +fpel_func(put, 16, sse); > +fpel_func(put, 32, sse); > +fpel_func(put, 64, sse); > +fpel_func(put, 128, sse); > +fpel_func(avg16, 16, sse2); > +fpel_func(avg16, 32, sse2); > +fpel_func(avg16, 64, sse2); > +fpel_func(avg16, 128, sse2); > +fpel_func(put, 32, avx); > +fpel_func(put, 64, avx); > +fpel_func(put, 128, avx); > +fpel_func(avg16, 32, avx2); > +fpel_func(avg16, 64, avx2); > +fpel_func(avg16, 128, avx2); > #undef fpel_func > > #endif /* HAVE_YASM */ > @@ -67,18 +75,36 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp, > int bpp) > init_fpel(4, 0, 8, put, mmx); > } > > + if (EXTERNAL_MMX(cpu_flags)) { > + init_fpel(4, 1, 8, avg16, mmxext); > + } > + > if (EXTERNAL_SSE(cpu_flags)) { > init_fpel(3, 0, 16, put, sse); > init_fpel(2, 0, 32, put, sse); > init_fpel(1, 0, 64, put, sse); > init_fpel(0, 0, 128, put, sse); > } > + > + if (EXTERNAL_SSE2(cpu_flags)) { > + init_fpel(3, 1, 16, avg16, sse2); > + init_fpel(2, 1, 32, avg16, sse2); > + init_fpel(1, 1, 64, avg16, sse2); > + init_fpel(0, 1, 128, avg16, sse2); > + } > + > if (EXTERNAL_AVX_FAST(cpu_flags)) { > init_fpel(2, 0, 32, put, avx); > init_fpel(1, 0, 64, put, avx); > init_fpel(0, 0, 128, put, avx); > } > > + if (EXTERNAL_AVX2(cpu_flags)) { > + init_fpel(2, 1, 32, avg16, avx2); > + init_fpel(1, 1, 64, avg16, avx2); > + init_fpel(0, 1, 128, avg16, avx2); > + } > + > #undef init_fpel > > #endif /* HAVE_YASM */ > diff --git a/libavcodec/x86/vp9mc.asm b/libavcodec/x86/vp9mc.asm > index fb5b1e9..ebfb200 100644 > --- a/libavcodec/x86/vp9mc.asm > +++ b/libavcodec/x86/vp9mc.asm > @@ -586,6 +586,17 @@ cglobal vp9_%1%2, 5, 5, %7, dst, dstride, src, sstride, h > pavgb m1, [dstq+d%3] > pavgb m2, [dstq+d%4] > pavgb m3, [dstq+d%5] > +%elifidn %1, avg16 Nit: Maybe a separate parameter that's either b or w to make this pavg%# instead? > + pavgw m0, [dstq] > + pavgw m1, [dstq+d%3] > + pavgw m2, [dstq+d%4] > + pavgw m3, [dstq+d%5] > +%if %2/mmsize == 8 > + pavgw m4, [dstq+mmsize*4] > + pavgw m5, [dstq+mmsize*5] > + pavgw m6, [dstq+mmsize*6] > + pavgw m7, [dstq+mmsize*7] > +%endif > %endif > %%dstfn [dstq], m0 > %%dstfn [dstq+d%3], m1 > @@ -631,6 +642,19 @@ INIT_YMM avx2 > fpel_fn avg, 32, strideq, strideq*2, stride3q, 4 > fpel_fn avg, 64, mmsize, strideq, strideq+mmsize, 2 > %endif > +INIT_MMX mmxext > +fpel_fn avg16, 8, strideq, strideq*2, stride3q, 4 > +INIT_XMM sse2 > +fpel_fn avg16, 16, strideq, strideq*2, stride3q, 4 > +fpel_fn avg16, 32, mmsize, strideq, strideq+mmsize, 2 > +fpel_fn avg16, 64, mmsize, mmsize*2, mmsize*3, 1 > +fpel_fn avg16, 128, mmsize, mmsize*2, mmsize*3, 1, 8 > +%if HAVE_AVX2_EXTERNAL > +INIT_YMM avx2 > +fpel_fn avg16, 32, strideq, strideq*2, stride3q, 4 > +fpel_fn avg16, 64, mmsize, strideq, strideq+mmsize, 2 > +fpel_fn avg16, 128, mmsize, mmsize*2, mmsize*3, 1 > +%endif > %undef s16 > %undef d16 > %undef s32 > _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel