vp9_vert_left_16x16_12bpp_c: 273.8
vp9_vert_left_16x16_12bpp_sse2: 69.4
vp9_vert_left_16x16_12bpp_ssse3: 35.3
vp9_vert_left_16x16_12bpp_avx: 34.6
vp9_vert_left_16x16_12bpp_avx2: 22.4
~35% faster than avx Signed-off-by: Ilia Valiakhmetov <zakne...@gmail.com> --- libavcodec/x86/vp9dsp_init_16bpp.c | 2 ++ libavcodec/x86/vp9intrapred_16bpp.asm | 53 +++++++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+) diff --git a/libavcodec/x86/vp9dsp_init_16bpp.c b/libavcodec/x86/vp9dsp_init_16bpp.c index 60d10a1..da8b74c 100644 --- a/libavcodec/x86/vp9dsp_init_16bpp.c +++ b/libavcodec/x86/vp9dsp_init_16bpp.c @@ -55,6 +55,7 @@ decl_ipred_fn(dl, 16, 16, avx2); decl_ipred_fn(dl, 32, 16, avx2); decl_ipred_fn(dr, 16, 16, avx2); decl_ipred_fn(dr, 32, 16, avx2); +decl_ipred_fn(vl, 16, 16, avx2); #define decl_ipred_dir_funcs(type) \ decl_ipred_fns(type, 16, sse2, sse2); \ @@ -143,6 +144,7 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp) #if ARCH_X86_64 init_ipred_func(dr, DIAG_DOWN_RIGHT, 32, 16, avx2); #endif + init_ipred_func(vl, VERT_LEFT, 16, 16, avx2); } #endif /* HAVE_X86ASM */ diff --git a/libavcodec/x86/vp9intrapred_16bpp.asm b/libavcodec/x86/vp9intrapred_16bpp.asm index 32b6982..8d8d65e 100644 --- a/libavcodec/x86/vp9intrapred_16bpp.asm +++ b/libavcodec/x86/vp9intrapred_16bpp.asm @@ -1538,6 +1538,59 @@ VL_FUNCS 1 INIT_XMM avx VL_FUNCS 1 +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +cglobal vp9_ipred_vl_16x16_16, 2, 4, 6, dst, stride, l, a + movifnidn aq, amp + mova m0, [aq] ; abcdefghijklmnop + vpbroadcastw xm5, [aq+30] ; pppppppp + vperm2i128 m1, m0, m5, q0201 ; ijklmnoppppppppp + vpalignr m2, m1, m0, 2 ; bcdefghijklmnopp + vpalignr m3, m1, m0, 4 ; cdefghijklmnoppp + mova m4, m2 + pavgw m4, m0 + LOWPASS 0, 2, 3 ; BCDEFGHIJKLMNOPp + vperm2i128 m2, m0, m5, q0201 + vperm2i128 m3, m4, m5, q0201 + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + + mova [dstq+strideq*0], m4 + mova [dstq+strideq*1], m0 + vpalignr m1, m2, m0, 2 + vpalignr m5, m3, m4, 2 + mova [dstq+strideq*2], m5 + mova [dstq+stride3q ], m1 + vpalignr m1, m2, m0, 4 + vpalignr m5, m3, m4, 4 + lea dstq, [dstq+strideq*4] + mova [dstq+strideq*0], m5 + mova 
[dstq+strideq*1], m1 + vpalignr m1, m2, m0, 6 + vpalignr m5, m3, m4, 6 + mova [dstq+strideq*2], m5 + mova [dstq+stride3q ], m1 + vpalignr m1, m2, m0, 8 + vpalignr m5, m3, m4, 8 + lea dstq, [dstq+strideq*4] + mova [dstq+strideq*0], m5 + mova [dstq+strideq*1], m1 + vpalignr m1, m2, m0, 10 + vpalignr m5, m3, m4, 10 + mova [dstq+strideq*2], m5 + mova [dstq+stride3q ], m1 + vpalignr m1, m2, m0, 12 + vpalignr m5, m3, m4, 12 + lea dstq, [dstq+strideq*4] + mova [dstq+strideq*0], m5 + mova [dstq+strideq*1], m1 + vpalignr m1, m2, m0, 14 + vpalignr m5, m3, m4, 14 + mova [dstq+strideq*2], m5 + mova [dstq+stride3q ], m1 + RET +%endif + %macro VR_FUNCS 0 cglobal vp9_ipred_vr_4x4_16, 4, 4, 3, dst, stride, l, a movu m0, [aq-2] -- 2.8.3 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel