Use symmetry properties of the ipred_dl function for better performance.

vp9_diag_downleft_32x32_12bpp_c: 1534.2
vp9_diag_downleft_32x32_12bpp_sse2: 145.9
vp9_diag_downleft_32x32_12bpp_ssse3: 140.0
vp9_diag_downleft_32x32_12bpp_avx: 134.8
vp9_diag_downleft_32x32_12bpp_avx2: 78.9
~40% faster than avx Signed-off-by: Ilia Valiakhmetov <zakne...@gmail.com> --- libavcodec/x86/vp9intrapred_16bpp.asm | 47 ++++++++++++++++++++++++----------- 1 file changed, 33 insertions(+), 14 deletions(-) diff --git a/libavcodec/x86/vp9intrapred_16bpp.asm b/libavcodec/x86/vp9intrapred_16bpp.asm index 8d8d65e..33a8a7f 100644 --- a/libavcodec/x86/vp9intrapred_16bpp.asm +++ b/libavcodec/x86/vp9intrapred_16bpp.asm @@ -901,49 +901,68 @@ cglobal vp9_ipred_dl_32x32_16, 2, 6, 7, dst, stride, l, a LOWPASS 1, 2, 3 ; RSTUVWXYZ......5 vperm2i128 m2, m1, m4, q0201 ; Z......555555555 vperm2i128 m5, m0, m1, q0201 ; JKLMNOPQRSTUVWXY - DEFINE_ARGS dst, stride, stride3, cnt + vperm2i128 m6, m2, m2, q0101 + DEFINE_ARGS dst, stride, stride3, dst16, cnt lea stride3q, [strideq*3] - mov cntd, 4 + lea dst16q, [dstq+strideq*8] + lea dst16q, [dst16q+strideq*8] + mov cntd, 2 .loop: mova [dstq+strideq*0 + 0], m0 mova [dstq+strideq*0 +32], m1 + mova [dst16q+strideq*0+ 0], m1 + mova [dst16q+strideq*0+32], m6 vpalignr m3, m5, m0, 2 vpalignr m4, m2, m1, 2 mova [dstq+strideq*1 + 0], m3 mova [dstq+strideq*1 +32], m4 + mova [dst16q+strideq*1 +0], m4 + mova [dst16q+strideq*1 +32], m6 vpalignr m3, m5, m0, 4 vpalignr m4, m2, m1, 4 mova [dstq+strideq*2 + 0], m3 mova [dstq+strideq*2 +32], m4 + mova [dst16q+strideq*2+0], m4 + mova [dst16q+strideq*2+32], m6 vpalignr m3, m5, m0, 6 - vpalignr m4, m2, m1, 6 + vpalignr m4, m2, m1, 6 mova [dstq+stride3q*1+ 0], m3 mova [dstq+stride3q*1+32], m4 - lea dstq, [dstq+strideq*4] + mova [dst16q+stride3q*1+0], m4 + mova [dst16q+stride3q*1+32], m6 vpalignr m3, m5, m0, 8 vpalignr m4, m2, m1, 8 + lea dstq, [dstq+strideq*4] + lea dst16q, [dst16q+strideq*4] mova [dstq+strideq*0 + 0], m3 mova [dstq+strideq*0 +32], m4 + mova [dst16q+strideq*0 +0], m4 + mova [dst16q+strideq*0 +32], m6 vpalignr m3, m5, m0, 10 vpalignr m4, m2, m1, 10 mova [dstq+strideq*1 + 0], m3 mova [dstq+strideq*1 +32], m4 + mova [dst16q+strideq*1 +0], m4 + mova [dst16q+strideq*1 +32], m6 vpalignr m3, m5, 
m0, 12 vpalignr m4, m2, m1, 12 - mova [dstq+strideq*2+ 0], m3 - mova [dstq+strideq*2+32], m4 + mova [dstq+strideq*2+ 0], m3 + mova [dstq+strideq*2+32], m4 + mova [dst16q+strideq*2+0], m4 + mova [dst16q+strideq*2+32], m6 vpalignr m3, m5, m0, 14 vpalignr m4, m2, m1, 14 - mova [dstq+stride3q+ 0], m3 - mova [dstq+stride3q+ 32], m4 - vpalignr m3, m5, m0, 16 - vpalignr m4, m2, m1, 16 - vperm2i128 m5, m3, m4, q0201 - vperm2i128 m2, m4, m4, q0101 - mova m0, m3 - mova m1, m4 + mova [dstq+stride3q+ 0], m3 + mova [dstq+stride3q+ 32], m4 + mova [dst16q+stride3q+ 0], m4 + mova [dst16q+stride3q+32], m6 + mova m0, m5 + mova m1, m2 + vperm2i128 m5, m5, m2, q0201 + mova m2, m6 lea dstq, [dstq+strideq*4] + lea dst16q, [dst16q+strideq*4] dec cntd jg .loop RET -- 2.8.3 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel