>I know unaligned loads are not as slow as they used to be, >but could m1 be produced by m2 and palignr?
I am not sure, can you clarify your question? >From the comment I assume you don't use the extra two bytes >that you get from the load, as you mark them as "*" >generic undefined values No, those two extra bytes are actually used: they hold the above/left corner pixel. If you look in the vp9dsp_template.c file, the diag_downright_ macro defined there refers to that pixel as top[-1] in its body. Sorry for this ambiguous marking, but it's used in other ipred_dr functions so I decided to follow it. >"cnt" doesn't seem to be used. Yes indeed, I overlooked that, thanks. On Fri, Jun 9, 2017 at 6:03 PM, Ivan Kalvachev <ikalvac...@gmail.com> wrote: > On 6/8/17, Ilia Valiakhmetov <zakne...@gmail.com> wrote: > > vp9_diag_downright_16x16_12bpp_c: 149.0 > > vp9_diag_downright_16x16_12bpp_sse2: 67.8 > > vp9_diag_downright_16x16_12bpp_ssse3: 45.6 > > vp9_diag_downright_16x16_12bpp_avx: 36.6 > > vp9_diag_downright_16x16_12bpp_avx2: 25.5 > > > > ~30% faster than avx > > > > Signed-off-by: Ilia Valiakhmetov <zakne...@gmail.com> > > --- > > libavcodec/x86/vp9dsp_init_16bpp.c | 2 ++ > > libavcodec/x86/vp9intrapred_16bpp.asm | 56 > > +++++++++++++++++++++++++++++++++++ > > 2 files changed, 58 insertions(+) > > > > diff --git a/libavcodec/x86/vp9dsp_init_16bpp.c > > b/libavcodec/x86/vp9dsp_init_16bpp.c > > index d1b8fcd..8d1aa13 100644 > > --- a/libavcodec/x86/vp9dsp_init_16bpp.c > > +++ b/libavcodec/x86/vp9dsp_init_16bpp.c > > @@ -52,6 +52,7 @@ decl_ipred_fns(dc, 16, mmxext, sse2); > > decl_ipred_fns(dc_top, 16, mmxext, sse2); > > decl_ipred_fns(dc_left, 16, mmxext, sse2); > > decl_ipred_fn(dl, 16, 16, avx2); > > +decl_ipred_fn(dr, 16, 16, avx2); > > decl_ipred_fn(dl, 32, 16, avx2); > > > > #define decl_ipred_dir_funcs(type) \ > > @@ -136,6 +137,7 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext > > *dsp) > > init_fpel_func(1, 1, 64, avg, _16, avx2); > > init_fpel_func(0, 1, 128, avg, _16, avx2); > > init_ipred_func(dl, DIAG_DOWN_LEFT, 16, 16, avx2); > > + init_ipred_func(dr, DIAG_DOWN_RIGHT, 16, 
16, avx2); > > init_ipred_func(dl, DIAG_DOWN_LEFT, 32, 16, avx2); > > } > > > > diff --git a/libavcodec/x86/vp9intrapred_16bpp.asm > > b/libavcodec/x86/vp9intrapred_16bpp.asm > > index 92333bc..67b98b1 100644 > > --- a/libavcodec/x86/vp9intrapred_16bpp.asm > > +++ b/libavcodec/x86/vp9intrapred_16bpp.asm > > @@ -1170,6 +1170,62 @@ DR_FUNCS 2 > > INIT_XMM avx > > DR_FUNCS 2 > > > > +%if HAVE_AVX2_EXTERNAL > > +INIT_YMM avx2 > > +cglobal vp9_ipred_dr_16x16_16, 4, 6, 7, dst, stride, l, a > > + mova m0, [lq] ; > klmnopqrstuvwxyz > > + movu m1, [aq-2] ; > *abcdefghijklmno > > + mova m2, [aq] ; > abcdefghijklmnop > > From the comment I assume you don't use the extra two bytes > that you get from the load, as you mark them as "*" > generic undefined values > > > + vperm2i128 m4, m2, m2, q2001 ; > ijklmnop........ > > + vpalignr m5, m4, m2, 2 ; > bcdefghijklmnop. > > + vperm2i128 m3, m0, m1, q0201 ; > stuvwxyz*abcdefg > > + LOWPASS 1, 2, 5 ; > ABCDEFGHIJKLMNO. > > + vpalignr m4, m3, m0, 2 ; > lmnopqrstuvwxyz* > > + vpalignr m5, m3, m0, 4 ; > mnopqrstuvwxyz*a > > + LOWPASS 0, 4, 5 ; > LMNOPQRSTUVWXYZ# > > + vperm2i128 m5, m0, m1, q0201 ; > TUVWXYZ#ABCDEFGH > > + DEFINE_ARGS dst, stride, stride3, stride5, dst3, cnt > > "cnt" doesn't seem to be used. > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > http://ffmpeg.org/mailman/listinfo/ffmpeg-devel > _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel