[FFmpeg-devel] [PATCH] avcodec/vp9: avx2 implementation of ipred_dl_16x16_16
vp9_diag_downleft_16x16_10bpp_c: 263.0 vp9_diag_downleft_16x16_10bpp_sse2: 44.7 vp9_diag_downleft_16x16_10bpp_ssse3: 32.5 vp9_diag_downleft_16x16_10bpp_avx: 31.9 vp9_diag_downleft_16x16_10bpp_avx2: 25.7 vp9_diag_downleft_16x16_12bpp_c: 264.7 vp9_diag_downleft_16x16_12bpp_sse2: 44.4 vp9_diag_downleft_16x16_12bpp_ssse3: 32.0 vp9_diag_downleft_16x16_12bpp_avx: 32.4 vp9_diag_downleft_16x16_12bpp_avx2: 25.5 Benchmarked with 1 runs Signed-off-by: Ilia --- libavcodec/x86/vp9dsp_init_16bpp.c| 2 ++ libavcodec/x86/vp9intrapred_16bpp.asm | 39 +++ 2 files changed, 41 insertions(+) diff --git a/libavcodec/x86/vp9dsp_init_16bpp.c b/libavcodec/x86/vp9dsp_init_16bpp.c index eb67499..4576ff1 100644 --- a/libavcodec/x86/vp9dsp_init_16bpp.c +++ b/libavcodec/x86/vp9dsp_init_16bpp.c @@ -51,6 +51,7 @@ decl_ipred_fns(h, 16, mmxext, sse2); decl_ipred_fns(dc, 16, mmxext, sse2); decl_ipred_fns(dc_top, 16, mmxext, sse2); decl_ipred_fns(dc_left, 16, mmxext, sse2); +decl_ipred_fn(dl, 16, 16, avx2); #define decl_ipred_dir_funcs(type) \ decl_ipred_fns(type, 16, sse2, sse2); \ @@ -133,6 +134,7 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp) init_fpel_func(2, 1, 32, avg, _16, avx2); init_fpel_func(1, 1, 64, avg, _16, avx2); init_fpel_func(0, 1, 128, avg, _16, avx2); +init_ipred_func(dl, DIAG_DOWN_LEFT, 16, 16, avx2); } #endif /* HAVE_YASM */ diff --git a/libavcodec/x86/vp9intrapred_16bpp.asm b/libavcodec/x86/vp9intrapred_16bpp.asm index c0ac16d..212e413 100644 --- a/libavcodec/x86/vp9intrapred_16bpp.asm +++ b/libavcodec/x86/vp9intrapred_16bpp.asm @@ -847,6 +847,45 @@ DL_FUNCS INIT_XMM avx DL_FUNCS +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +cglobal vp9_ipred_dl_16x16_16, 2, 4, 5, dst, stride, l, a +movifnidn aq, amp +movam0, [aq] ; abcdefghijklmnop +vpbroadcastw xm1, [aq+30]; +vperm2i128 m2, m0, m1, q0201 ; ijklmnop +vpalignrm3, m2, m0, 2 ; bcdefghijklmnopp +vpalignrm4, m2, m0, 4 ; cdefghijklmnoppp +LOWPASS 0, 3, 4 ; BCDEFGHIJKLMNOPp +vperm2i128 m2, m0, m1, q0201 ; JKLMNOPp +DEFINE_ARGS 
dst, stride, stride3, cnt +mov cntd, 2 +lea stride3q, [strideq*3] +.loop: +mova [dstq+strideq*0], m0 +vpalignrm3, m2, m0, 2 +vpalignrm4, m2, m0, 4 +mova [dstq+strideq*1], m3 +mova [dstq+strideq*2], m4 +vpalignrm3, m2, m0, 6 +vpalignrm4, m2, m0, 8 +mova [dstq+stride3q ], m3 +lea dstq, [dstq+strideq*4] +mova [dstq+strideq*0], m4 +vpalignrm3, m2, m0, 10 +vpalignrm4, m2, m0, 12 +mova [dstq+strideq*1], m3 +mova [dstq+strideq*2], m4 +vpalignrm3, m2, m0, 14 +mova [dstq+stride3q ], m3 +lea dstq, [dstq+strideq*4] +movam0, m2 +vperm2i128 m2, m2, m2, q0101 ; +dec cntd +jg .loop +RET +%endif + %macro DR_FUNCS 1 ; stack_mem_for_32x32_32bit_function cglobal vp9_ipred_dr_4x4_16, 4, 4, 3, dst, stride, l, a movhm0, [lq]; wxyz -- 2.8.3 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
[FFmpeg-devel] [PATCH] libavcodec/vp9: ipred_dl_32x32_16 avx2 version
vp9_diag_downleft_32x32_8bpp_c: 580.2 vp9_diag_downleft_32x32_8bpp_sse2: 75.6 vp9_diag_downleft_32x32_8bpp_ssse3: 73.7 vp9_diag_downleft_32x32_8bpp_avx: 72.7 vp9_diag_downleft_32x32_10bpp_c: 1101.2 vp9_diag_downleft_32x32_10bpp_sse2: 145.4 vp9_diag_downleft_32x32_10bpp_ssse3: 137.5 vp9_diag_downleft_32x32_10bpp_avx: 134.8 vp9_diag_downleft_32x32_10bpp_avx2: 94.0 vp9_diag_downleft_32x32_12bpp_c: 1108.5 vp9_diag_downleft_32x32_12bpp_sse2: 145.5 vp9_diag_downleft_32x32_12bpp_ssse3: 137.3 vp9_diag_downleft_32x32_12bpp_avx: 135.2 vp9_diag_downleft_32x32_12bpp_avx2: 94.0 ~30% faster than avx --- libavcodec/x86/vp9dsp_init_16bpp.c| 4 +- libavcodec/x86/vp9intrapred_16bpp.asm | 75 +++ 2 files changed, 60 insertions(+), 19 deletions(-) diff --git a/libavcodec/x86/vp9dsp_init_16bpp.c b/libavcodec/x86/vp9dsp_init_16bpp.c index 4e1f24f..d1b8fcd 100644 --- a/libavcodec/x86/vp9dsp_init_16bpp.c +++ b/libavcodec/x86/vp9dsp_init_16bpp.c @@ -52,7 +52,7 @@ decl_ipred_fns(dc, 16, mmxext, sse2); decl_ipred_fns(dc_top, 16, mmxext, sse2); decl_ipred_fns(dc_left, 16, mmxext, sse2); decl_ipred_fn(dl, 16, 16, avx2); -decl_ipred_fn(dl, 32, 32, avx2); +decl_ipred_fn(dl, 32, 16, avx2); #define decl_ipred_dir_funcs(type) \ decl_ipred_fns(type, 16, sse2, sse2); \ @@ -136,7 +136,7 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp) init_fpel_func(1, 1, 64, avg, _16, avx2); init_fpel_func(0, 1, 128, avg, _16, avx2); init_ipred_func(dl, DIAG_DOWN_LEFT, 16, 16, avx2); -init_ipred_func(dl, DIAG_DOWN_LEFT, 32, 32, avx2); +init_ipred_func(dl, DIAG_DOWN_LEFT, 32, 16, avx2); } #endif /* HAVE_YASM */ diff --git a/libavcodec/x86/vp9intrapred_16bpp.asm b/libavcodec/x86/vp9intrapred_16bpp.asm index 2ec5381..10a0994 100644 --- a/libavcodec/x86/vp9intrapred_16bpp.asm +++ b/libavcodec/x86/vp9intrapred_16bpp.asm @@ -861,6 +861,7 @@ cglobal vp9_ipred_dl_16x16_16, 2, 4, 5, dst, stride, l, a DEFINE_ARGS dst, stride, stride3, cnt mov cntd, 2 lea stride3q, [strideq*3] + .loop: mova [dstq+strideq*0], m0 
vpalignrm3, m2, m0, 2 @@ -887,24 +888,64 @@ cglobal vp9_ipred_dl_16x16_16, 2, 4, 5, dst, stride, l, a cglobal vp9_ipred_dl_32x32_16, 2, 6, 7, dst, stride, l, a movifnidn aq, amp -movam0, [aq+mmsize*0] ; abcdefghijklmnop -movam1, [aq+mmsize*1] ; qrstuvwxyz012345 -vpbroadcastw xm4, [aq+mmsize*1+30]; -vpalignrm2, m1, m0, 2 ; bcdefghijklmnopq -vpalignrm3, m1, m0, 4 ; cdefghijklmnopqr -vperm2i128 m5, m1, m4, q0201 ; yz012345 -LOWPASS 0, 2, 3 ; BCDEFGHIJKLMNOPQ -vpalignrm2, m5, m1, 2 ; rstuvwxyz0123455 -vpalignrm3, m5, m1, 4 ; stuvwxyz01234555 -LOWPASS 1, 2, 3 ; RSTUVWXYZ..5 -vperm2i128 m2, m1, m4, q0201 ; Z..5 +movam0, [aq+mmsize*0+ 0] ; abcdefghijklmnop +movam1, [aq+mmsize*1+ 0] ; qrstuvwxyz012345 +vpbroadcastw xm4, [aq+mmsize*1+30] ; +vperm2i128 m5, m0, m1, q0201 ; ijklmnopqrstuvwx +vpalignrm2, m5, m0, 2 ; bcdefghijklmnopq +vpalignrm3, m5, m0, 4 ; cdefghijklmnopqr +LOWPASS 0, 2, 3 ; BCDEFGHIJKLMNOPQ +vperm2i128 m5, m1, m4, q0201 ; yz012345 +vpalignrm2, m5, m1, 2 ; rstuvwxyz0123455 +vpalignrm3, m5, m1, 4 ; stuvwxyz01234555 +LOWPASS 1, 2, 3 ; RSTUVWXYZ..5 +vperm2i128 m2, m1, m4, q0201 ; Z..5 +vperm2i128 m5, m0, m1, q0201 ; JKLMNOPQRSTUVWXY +DEFINE_ARGS dst, stride, stride3, stride5, cnt +lea stride3q, [strideq*3] +lea stride5q, [strideq*5] +mov cntd, 4 -mova [dstq+strideq*0+0 ], m0 -mova [dstq+strideq*0+32], m1 -vpalignrm3, m1, m0, 2 -vpalignrm4, m2, m1, 2 -mova [dstq+strideq*1+0 ], m3 -mova [dstq+strideq*1+32], m4 +.loop: +mova [dstq+strideq*0 + 0], m0 +mova [dstq+strideq*0 +32], m1 +vpalignr m3, m5, m0, 2 +vpalignr m4, m2, m1, 2 +mova [dstq+strideq*1 + 0], m3 +mova [dstq+strideq*1 +32], m4 +vpalignr m3, m5, m0, 4 +vpalignr m4, m2, m1, 4 +mova [dstq+strideq*2 + 0], m3 +mova [dstq+strideq*2 +32], m4 +vpa
[FFmpeg-devel] [PATCH] libavcodec/vp9: ipred_dl_32x32_16 avx2 implementation
vp9_diag_downleft_32x32_8bpp_c: 580.2 vp9_diag_downleft_32x32_8bpp_sse2: 75.6 vp9_diag_downleft_32x32_8bpp_ssse3: 73.7 vp9_diag_downleft_32x32_8bpp_avx: 72.7 vp9_diag_downleft_32x32_10bpp_c: 1101.2 vp9_diag_downleft_32x32_10bpp_sse2: 145.4 vp9_diag_downleft_32x32_10bpp_ssse3: 137.5 vp9_diag_downleft_32x32_10bpp_avx: 134.8 vp9_diag_downleft_32x32_10bpp_avx2: 94.0 vp9_diag_downleft_32x32_12bpp_c: 1108.5 vp9_diag_downleft_32x32_12bpp_sse2: 145.5 vp9_diag_downleft_32x32_12bpp_ssse3: 137.3 vp9_diag_downleft_32x32_12bpp_avx: 135.2 vp9_diag_downleft_32x32_12bpp_avx2: 94.0 ~30% faster than avx implementation --- libavcodec/x86/vp9dsp_init_16bpp.c| 2 ++ libavcodec/x86/vp9intrapred_16bpp.asm | 63 +++ 2 files changed, 65 insertions(+) diff --git a/libavcodec/x86/vp9dsp_init_16bpp.c b/libavcodec/x86/vp9dsp_init_16bpp.c index 4576ff1..d1b8fcd 100644 --- a/libavcodec/x86/vp9dsp_init_16bpp.c +++ b/libavcodec/x86/vp9dsp_init_16bpp.c @@ -52,6 +52,7 @@ decl_ipred_fns(dc, 16, mmxext, sse2); decl_ipred_fns(dc_top, 16, mmxext, sse2); decl_ipred_fns(dc_left, 16, mmxext, sse2); decl_ipred_fn(dl, 16, 16, avx2); +decl_ipred_fn(dl, 32, 16, avx2); #define decl_ipred_dir_funcs(type) \ decl_ipred_fns(type, 16, sse2, sse2); \ @@ -135,6 +136,7 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp) init_fpel_func(1, 1, 64, avg, _16, avx2); init_fpel_func(0, 1, 128, avg, _16, avx2); init_ipred_func(dl, DIAG_DOWN_LEFT, 16, 16, avx2); +init_ipred_func(dl, DIAG_DOWN_LEFT, 32, 16, avx2); } #endif /* HAVE_YASM */ diff --git a/libavcodec/x86/vp9intrapred_16bpp.asm b/libavcodec/x86/vp9intrapred_16bpp.asm index 212e413..5cd6a3e 100644 --- a/libavcodec/x86/vp9intrapred_16bpp.asm +++ b/libavcodec/x86/vp9intrapred_16bpp.asm @@ -861,6 +861,7 @@ cglobal vp9_ipred_dl_16x16_16, 2, 4, 5, dst, stride, l, a DEFINE_ARGS dst, stride, stride3, cnt mov cntd, 2 lea stride3q, [strideq*3] + .loop: mova [dstq+strideq*0], m0 vpalignrm3, m2, m0, 2 @@ -884,6 +885,68 @@ cglobal vp9_ipred_dl_16x16_16, 2, 4, 5, dst, 
stride, l, a dec cntd jg .loop RET + +cglobal vp9_ipred_dl_32x32_16, 2, 6, 7, dst, stride, l, a +movifnidn aq, amp +movam0, [aq+mmsize*0+ 0] ; abcdefghijklmnop +movam1, [aq+mmsize*1+ 0] ; qrstuvwxyz012345 +vpbroadcastw xm4, [aq+mmsize*1+30] ; +vperm2i128 m5, m0, m1, q0201 ; ijklmnopqrstuvwx +vpalignrm2, m5, m0, 2 ; bcdefghijklmnopq +vpalignrm3, m5, m0, 4 ; cdefghijklmnopqr +LOWPASS 0, 2, 3 ; BCDEFGHIJKLMNOPQ +vperm2i128 m5, m1, m4, q0201 ; yz012345 +vpalignrm2, m5, m1, 2 ; rstuvwxyz0123455 +vpalignrm3, m5, m1, 4 ; stuvwxyz01234555 +LOWPASS 1, 2, 3 ; RSTUVWXYZ..5 +vperm2i128 m2, m1, m4, q0201 ; Z..5 +vperm2i128 m5, m0, m1, q0201 ; JKLMNOPQRSTUVWXY +DEFINE_ARGS dst, stride, stride3, cnt +lea stride3q, [strideq*3] +mov cntd, 4 + +.loop: +mova [dstq+strideq*0 + 0], m0 +mova [dstq+strideq*0 +32], m1 +vpalignr m3, m5, m0, 2 +vpalignr m4, m2, m1, 2 +mova [dstq+strideq*1 + 0], m3 +mova [dstq+strideq*1 +32], m4 +vpalignr m3, m5, m0, 4 +vpalignr m4, m2, m1, 4 +mova [dstq+strideq*2 + 0], m3 +mova [dstq+strideq*2 +32], m4 +vpalignr m3, m5, m0, 6 +vpalignr m4, m2, m1, 6 +mova [dstq+stride3q*1+ 0], m3 +mova [dstq+stride3q*1+32], m4 +leadstq, [dstq+strideq*4] +vpalignr m3, m5, m0, 8 +vpalignr m4, m2, m1, 8 +mova [dstq+strideq*0 + 0], m3 +mova [dstq+strideq*0 +32], m4 +vpalignr m3, m5, m0, 10 +vpalignr m4, m2, m1, 10 +mova [dstq+strideq*1 + 0], m3 +mova [dstq+strideq*1 +32], m4 +vpalignr m3, m5, m0, 12 +vpalignr m4, m2, m1, 12 +mova [dstq+strideq*2+ 0], m3 +mova [dstq+strideq*2+32], m4 +vpalignr m3, m5, m0, 14 +vpalignr m4, m2, m1, 14 +mova [dstq+stride3q+ 0], m3 +mova [dstq+stride3q+ 32], m4 +vpalignr m3, m5, m0, 16 +vpalignr m4, m2, m1, 16 +vperm2i128 m5, m3, m4, q0201 +vperm2i128 m2, m4, m4, q0101 +mova m0, m3 +mova m1, m4 +
[FFmpeg-devel] [PATCH] avcodec/vp9: ipred_dr_16x16_16 avx2 implementation
vp9_diag_downright_16x16_12bpp_c: 149.0 vp9_diag_downright_16x16_12bpp_sse2: 67.8 vp9_diag_downright_16x16_12bpp_ssse3: 45.6 vp9_diag_downright_16x16_12bpp_avx: 36.6 vp9_diag_downright_16x16_12bpp_avx2: 25.5 ~30% faster than avx Signed-off-by: Ilia Valiakhmetov --- libavcodec/x86/vp9dsp_init_16bpp.c| 2 ++ libavcodec/x86/vp9intrapred_16bpp.asm | 56 +++ 2 files changed, 58 insertions(+) diff --git a/libavcodec/x86/vp9dsp_init_16bpp.c b/libavcodec/x86/vp9dsp_init_16bpp.c index d1b8fcd..8d1aa13 100644 --- a/libavcodec/x86/vp9dsp_init_16bpp.c +++ b/libavcodec/x86/vp9dsp_init_16bpp.c @@ -52,6 +52,7 @@ decl_ipred_fns(dc, 16, mmxext, sse2); decl_ipred_fns(dc_top, 16, mmxext, sse2); decl_ipred_fns(dc_left, 16, mmxext, sse2); decl_ipred_fn(dl, 16, 16, avx2); +decl_ipred_fn(dr, 16, 16, avx2); decl_ipred_fn(dl, 32, 16, avx2); #define decl_ipred_dir_funcs(type) \ @@ -136,6 +137,7 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp) init_fpel_func(1, 1, 64, avg, _16, avx2); init_fpel_func(0, 1, 128, avg, _16, avx2); init_ipred_func(dl, DIAG_DOWN_LEFT, 16, 16, avx2); +init_ipred_func(dr, DIAG_DOWN_RIGHT, 16, 16, avx2); init_ipred_func(dl, DIAG_DOWN_LEFT, 32, 16, avx2); } diff --git a/libavcodec/x86/vp9intrapred_16bpp.asm b/libavcodec/x86/vp9intrapred_16bpp.asm index 92333bc..67b98b1 100644 --- a/libavcodec/x86/vp9intrapred_16bpp.asm +++ b/libavcodec/x86/vp9intrapred_16bpp.asm @@ -1170,6 +1170,62 @@ DR_FUNCS 2 INIT_XMM avx DR_FUNCS 2 +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +cglobal vp9_ipred_dr_16x16_16, 4, 6, 7, dst, stride, l, a +movam0, [lq] ; klmnopqrstuvwxyz +movum1, [aq-2] ; *abcdefghijklmno +movam2, [aq] ; abcdefghijklmnop +vperm2i128 m4, m2, m2, q2001 ; ijklmnop +vpalignrm5, m4, m2, 2 ; bcdefghijklmnop. +vperm2i128 m3, m0, m1, q0201 ; stuvwxyz*abcdefg +LOWPASS 1, 2, 5 ; ABCDEFGHIJKLMNO. 
+vpalignrm4, m3, m0, 2 ; lmnopqrstuvwxyz* +vpalignrm5, m3, m0, 4 ; mnopqrstuvwxyz*a +LOWPASS 0, 4, 5 ; LMNOPQRSTUVWXYZ# +vperm2i128 m5, m0, m1, q0201 ; TUVWXYZ#ABCDEFGH +DEFINE_ARGS dst, stride, stride3, stride5, dst3, cnt +lea dst3q, [dstq+strideq*4] +lea stride3q, [strideq*3] +lea stride5q, [stride3q+strideq*2] + +vpalignrm3, m5, m0, 2 +vpalignrm4, m1, m5, 2 +mova[dst3q+stride5q*2], m3 ; 14 +mova[ dstq+stride3q*2], m4 ; 6 +vpalignrm3, m5, m0, 4 +vpalignrm4, m1, m5, 4 +sub dst3q, strideq +mova[dst3q+stride5q*2], m3 ; 13 +mova[dst3q+strideq*2 ], m4 ; 5 +mova[dst3q+stride3q*4], m0 ; 15 +vpalignrm3, m5, m0, 6 +vpalignrm4, m1, m5, 6 +mova [dstq+stride3q*4], m3 ; 12 +mova [dst3q+strideq*1], m4 ; 4 +vpalignrm3, m5, m0, 8 +vpalignrm4, m1, m5, 8 +mova [dst3q+strideq*8], m3 ; 11 +mova [dst3q+strideq*0], m4 ; 3 +vpalignrm3, m5, m0, 12 +vpalignrm4, m1, m5, 12 +mova[dst3q+stride3q*2], m3 ; 9 +mova [dstq+strideq*1 ], m4 ; 1 +vpalignrm3, m5, m0, 10 +vpalignrm4, m1, m5, 10 +mova [dstq+stride5q*2], m3 ; 10 +mova [dstq+strideq*2 ], m4 ; 2 +vpalignrm3, m5, m0, 14 +vpalignrm4, m1, m5, 14 +mova [dstq+strideq*8], m3 ; 8 +mova [dstq+strideq*0], m4 ; 0 +sub dstq, strideq +mova [dst3q+strideq*4], m5 ; 7 +mova [ dstq+strideq*0], m1 ; -1 +RET +%endif + + %macro VL_FUNCS 1 ; stack_mem_for_32x32_32bit_function cglobal vp9_ipred_vl_4x4_16, 2, 4, 3, dst, stride, l, a movifnidn aq, amp -- 2.8.3 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
[FFmpeg-devel] [PATCH] avcodec/vp9: ipred_dr_16x16_16 avx2 implementation
Signed-off-by: Ilia Valiakhmetov --- libavcodec/x86/vp9dsp_init_16bpp.c| 2 ++ libavcodec/x86/vp9intrapred_16bpp.asm | 56 +++ 2 files changed, 58 insertions(+) diff --git a/libavcodec/x86/vp9dsp_init_16bpp.c b/libavcodec/x86/vp9dsp_init_16bpp.c index d1b8fcd..8d1aa13 100644 --- a/libavcodec/x86/vp9dsp_init_16bpp.c +++ b/libavcodec/x86/vp9dsp_init_16bpp.c @@ -52,6 +52,7 @@ decl_ipred_fns(dc, 16, mmxext, sse2); decl_ipred_fns(dc_top, 16, mmxext, sse2); decl_ipred_fns(dc_left, 16, mmxext, sse2); decl_ipred_fn(dl, 16, 16, avx2); +decl_ipred_fn(dr, 16, 16, avx2); decl_ipred_fn(dl, 32, 16, avx2); #define decl_ipred_dir_funcs(type) \ @@ -136,6 +137,7 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp) init_fpel_func(1, 1, 64, avg, _16, avx2); init_fpel_func(0, 1, 128, avg, _16, avx2); init_ipred_func(dl, DIAG_DOWN_LEFT, 16, 16, avx2); +init_ipred_func(dr, DIAG_DOWN_RIGHT, 16, 16, avx2); init_ipred_func(dl, DIAG_DOWN_LEFT, 32, 16, avx2); } diff --git a/libavcodec/x86/vp9intrapred_16bpp.asm b/libavcodec/x86/vp9intrapred_16bpp.asm index 92333bc..7230de2 100644 --- a/libavcodec/x86/vp9intrapred_16bpp.asm +++ b/libavcodec/x86/vp9intrapred_16bpp.asm @@ -1170,6 +1170,62 @@ DR_FUNCS 2 INIT_XMM avx DR_FUNCS 2 +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +cglobal vp9_ipred_dr_16x16_16, 4, 4, 6, dst, stride, l, a +movam0, [lq] ; klmnopqrstuvwxyz +movum1, [aq-2] ; *abcdefghijklmno +movam2, [aq] ; abcdefghijklmnop +vperm2i128 m4, m2, m2, q2001 ; ijklmnop +vpalignrm5, m4, m2, 2 ; bcdefghijklmnop. +vperm2i128 m3, m0, m1, q0201 ; stuvwxyz*abcdefg +LOWPASS 1, 2, 5 ; ABCDEFGHIJKLMNO. 
+vpalignrm4, m3, m0, 2 ; lmnopqrstuvwxyz* +vpalignrm5, m3, m0, 4 ; mnopqrstuvwxyz*a +LOWPASS 0, 4, 5 ; LMNOPQRSTUVWXYZ# +vperm2i128 m5, m0, m1, q0201 ; TUVWXYZ#ABCDEFGH +DEFINE_ARGS dst, stride, stride3, stride5, dst3 +lea dst3q, [dstq+strideq*4] +lea stride3q, [strideq*3] +lea stride5q, [stride3q+strideq*2] + +vpalignrm3, m5, m0, 2 +vpalignrm4, m1, m5, 2 +mova[dst3q+stride5q*2], m3 ; 14 +mova[ dstq+stride3q*2], m4 ; 6 +vpalignrm3, m5, m0, 4 +vpalignrm4, m1, m5, 4 +sub dst3q, strideq +mova[dst3q+stride5q*2], m3 ; 13 +mova[dst3q+strideq*2 ], m4 ; 5 +mova[dst3q+stride3q*4], m0 ; 15 +vpalignrm3, m5, m0, 6 +vpalignrm4, m1, m5, 6 +mova [dstq+stride3q*4], m3 ; 12 +mova [dst3q+strideq*1], m4 ; 4 +vpalignrm3, m5, m0, 8 +vpalignrm4, m1, m5, 8 +mova [dst3q+strideq*8], m3 ; 11 +mova [dst3q+strideq*0], m4 ; 3 +vpalignrm3, m5, m0, 10 +vpalignrm4, m1, m5, 10 +mova [dstq+stride5q*2], m3 ; 10 +mova [dstq+strideq*2 ], m4 ; 2 +vpalignrm3, m5, m0, 12 +vpalignrm4, m1, m5, 12 +mova[dst3q+stride3q*2], m3 ; 9 +mova [dstq+strideq*1 ], m4 ; 1 +vpalignrm3, m5, m0, 14 +vpalignrm4, m1, m5, 14 +mova [dstq+strideq*8], m3 ; 8 +mova [dstq+strideq*0], m4 ; 0 +sub dstq, strideq +mova [dst3q+strideq*4], m5 ; 7 +mova [ dstq+strideq*0], m1 ; -1 +RET +%endif + + %macro VL_FUNCS 1 ; stack_mem_for_32x32_32bit_function cglobal vp9_ipred_vl_4x4_16, 2, 4, 3, dst, stride, l, a movifnidn aq, amp -- 2.8.3 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
[FFmpeg-devel] [PATCH] avcodec/vp9: ipred_dr_16x16_16 avx2 implementation
Signed-off-by: Ilia Valiakhmetov --- libavcodec/x86/vp9dsp_init_16bpp.c| 2 ++ libavcodec/x86/vp9intrapred_16bpp.asm | 56 +++ 2 files changed, 58 insertions(+) diff --git a/libavcodec/x86/vp9dsp_init_16bpp.c b/libavcodec/x86/vp9dsp_init_16bpp.c index d1b8fcd..8d1aa13 100644 --- a/libavcodec/x86/vp9dsp_init_16bpp.c +++ b/libavcodec/x86/vp9dsp_init_16bpp.c @@ -52,6 +52,7 @@ decl_ipred_fns(dc, 16, mmxext, sse2); decl_ipred_fns(dc_top, 16, mmxext, sse2); decl_ipred_fns(dc_left, 16, mmxext, sse2); decl_ipred_fn(dl, 16, 16, avx2); +decl_ipred_fn(dr, 16, 16, avx2); decl_ipred_fn(dl, 32, 16, avx2); #define decl_ipred_dir_funcs(type) \ @@ -136,6 +137,7 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp) init_fpel_func(1, 1, 64, avg, _16, avx2); init_fpel_func(0, 1, 128, avg, _16, avx2); init_ipred_func(dl, DIAG_DOWN_LEFT, 16, 16, avx2); +init_ipred_func(dr, DIAG_DOWN_RIGHT, 16, 16, avx2); init_ipred_func(dl, DIAG_DOWN_LEFT, 32, 16, avx2); } diff --git a/libavcodec/x86/vp9intrapred_16bpp.asm b/libavcodec/x86/vp9intrapred_16bpp.asm index 92333bc..764f704 100644 --- a/libavcodec/x86/vp9intrapred_16bpp.asm +++ b/libavcodec/x86/vp9intrapred_16bpp.asm @@ -1170,6 +1170,62 @@ DR_FUNCS 2 INIT_XMM avx DR_FUNCS 2 +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +cglobal vp9_ipred_dr_16x16_16, 4, 5, 6, dst, stride, l, a +movam0, [lq] ; klmnopqrstuvwxyz +movum1, [aq-2] ; *abcdefghijklmno +movam2, [aq] ; abcdefghijklmnop +vperm2i128 m4, m2, m2, q2001 ; ijklmnop +vpalignrm5, m4, m2, 2 ; bcdefghijklmnop. +vperm2i128 m3, m0, m1, q0201 ; stuvwxyz*abcdefg +LOWPASS 1, 2, 5 ; ABCDEFGHIJKLMNO. 
+vpalignrm4, m3, m0, 2 ; lmnopqrstuvwxyz* +vpalignrm5, m3, m0, 4 ; mnopqrstuvwxyz*a +LOWPASS 0, 4, 5 ; LMNOPQRSTUVWXYZ# +vperm2i128 m5, m0, m1, q0201 ; TUVWXYZ#ABCDEFGH +DEFINE_ARGS dst, stride, stride3, stride5, dst3 +lea dst3q, [dstq+strideq*4] +lea stride3q, [strideq*3] +lea stride5q, [stride3q+strideq*2] + +vpalignrm3, m5, m0, 2 +vpalignrm4, m1, m5, 2 +mova[dst3q+stride5q*2], m3 ; 14 +mova[ dstq+stride3q*2], m4 ; 6 +vpalignrm3, m5, m0, 4 +vpalignrm4, m1, m5, 4 +sub dst3q, strideq +mova[dst3q+stride5q*2], m3 ; 13 +mova[dst3q+strideq*2 ], m4 ; 5 +mova[dst3q+stride3q*4], m0 ; 15 +vpalignrm3, m5, m0, 6 +vpalignrm4, m1, m5, 6 +mova [dstq+stride3q*4], m3 ; 12 +mova [dst3q+strideq*1], m4 ; 4 +vpalignrm3, m5, m0, 8 +vpalignrm4, m1, m5, 8 +mova [dst3q+strideq*8], m3 ; 11 +mova [dst3q+strideq*0], m4 ; 3 +vpalignrm3, m5, m0, 10 +vpalignrm4, m1, m5, 10 +mova [dstq+stride5q*2], m3 ; 10 +mova [dstq+strideq*2 ], m4 ; 2 +vpalignrm3, m5, m0, 12 +vpalignrm4, m1, m5, 12 +mova[dst3q+stride3q*2], m3 ; 9 +mova [dstq+strideq*1 ], m4 ; 1 +vpalignrm3, m5, m0, 14 +vpalignrm4, m1, m5, 14 +mova [dstq+strideq*8], m3 ; 8 +mova [dstq+strideq*0], m4 ; 0 +sub dstq, strideq +mova [dst3q+strideq*4], m5 ; 7 +mova [ dstq+strideq*0], m1 ; -1 +RET +%endif + + %macro VL_FUNCS 1 ; stack_mem_for_32x32_32bit_function cglobal vp9_ipred_vl_4x4_16, 2, 4, 3, dst, stride, l, a movifnidn aq, amp -- 2.8.3 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
[FFmpeg-devel] [PATCHv2 1/2] avcodec: add execute3() API to utilize the main function of avpriv_slicethread_create().
Signed-off-by: Ilia Valiakhmetov --- libavcodec/avcodec.h | 7 ++- libavcodec/options.c | 1 + libavcodec/pthread_slice.c | 26 -- libavcodec/utils.c | 14 ++ 4 files changed, 45 insertions(+), 3 deletions(-) diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h index 650..712f40c 100644 --- a/libavcodec/avcodec.h +++ b/libavcodec/avcodec.h @@ -1089,6 +1089,10 @@ typedef struct RcOverride{ */ #define AV_CODEC_CAP_AVOID_PROBING (1 << 17) /** + * Codec initializes slice-based threading with a main function + */ +#define AV_CODEC_SLICE_THREAD_HAS_MF (1 << 18) +/** * Codec is intra only. */ #define AV_CODEC_CAP_INTRA_ONLY 0x4000 @@ -3233,7 +3237,7 @@ typedef struct AVCodecContext { * - decoding: Set by libavcodec, user can override. */ int (*execute2)(struct AVCodecContext *c, int (*func)(struct AVCodecContext *c2, void *arg, int jobnr, int threadnr), void *arg2, int *ret, int count); - +int (*execute3)(struct AVCodecContext *c, int (*func)(struct AVCodecContext *c2, void *arg, int jobnr, int threadnr), int (*m_func)(struct AVCodecContext *c3), void *arg2, int *ret, int count); /** * noise vs. sse weight for the nsse comparison function * - encoding: Set by user. 
@@ -5774,6 +5778,7 @@ const char *avcodec_profile_name(enum AVCodecID codec_id, int profile); int avcodec_default_execute(AVCodecContext *c, int (*func)(AVCodecContext *c2, void *arg2),void *arg, int *ret, int count, int size); int avcodec_default_execute2(AVCodecContext *c, int (*func)(AVCodecContext *c2, void *arg2, int, int),void *arg, int *ret, int count); +int avcodec_default_execute3(AVCodecContext *c, int (*func)(AVCodecContext *c2, void *arg2, int jobnr, int threadnr), int (*m_func)(struct AVCodecContext *c3), void *arg, int *ret, int count); //FIXME func typedef /** diff --git a/libavcodec/options.c b/libavcodec/options.c index 82e1217..6d63bdb 100644 --- a/libavcodec/options.c +++ b/libavcodec/options.c @@ -117,6 +117,7 @@ static int init_context_defaults(AVCodecContext *s, const AVCodec *codec) s->get_format = avcodec_default_get_format; s->execute = avcodec_default_execute; s->execute2= avcodec_default_execute2; +s->execute3= avcodec_default_execute3; s->sample_aspect_ratio = (AVRational){0,1}; s->pix_fmt = AV_PIX_FMT_NONE; s->sw_pix_fmt = AV_PIX_FMT_NONE; diff --git a/libavcodec/pthread_slice.c b/libavcodec/pthread_slice.c index c781d35..3aff816 100644 --- a/libavcodec/pthread_slice.c +++ b/libavcodec/pthread_slice.c @@ -38,11 +38,13 @@ typedef int (action_func)(AVCodecContext *c, void *arg); typedef int (action_func2)(AVCodecContext *c, void *arg, int jobnr, int threadnr); +typedef int (main_func)(AVCodecContext *c); typedef struct SliceThreadContext { AVSliceThread *thread; action_func *func; action_func2 *func2; +main_func *m_func; void *args; int *rets; int job_size; @@ -54,6 +56,12 @@ typedef struct SliceThreadContext { pthread_mutex_t *progress_mutex; } SliceThreadContext; +static void main_function(void *priv) { +AVCodecContext *avctx = priv; +SliceThreadContext *c = avctx->internal->thread_ctx; +c->m_func(avctx); +} + static void worker_func(void *priv, int jobnr, int threadnr, int nb_jobs, int nb_threads) { AVCodecContext *avctx = priv; @@ 
-99,7 +107,8 @@ static int thread_execute(AVCodecContext *avctx, action_func* func, void *arg, i c->func = func; c->rets = ret; -avpriv_slicethread_execute(c->thread, job_count, 0); +avpriv_slicethread_execute(c->thread, job_count, !!c->m_func); + return 0; } @@ -110,10 +119,20 @@ static int thread_execute2(AVCodecContext *avctx, action_func2* func2, void *arg return thread_execute(avctx, NULL, arg, ret, job_count, 0); } +static int thread_execute3(AVCodecContext *avctx, action_func2* func2, main_func* m_func, void *arg, int *ret, int job_count) +{ +SliceThreadContext *c = avctx->internal->thread_ctx; +c->func2 = func2; +c->m_func = m_func; +return thread_execute(avctx, NULL, arg, ret, job_count, 0); +} + + int ff_slice_thread_init(AVCodecContext *avctx) { SliceThreadContext *c; int thread_count = avctx->thread_count; +static void (*m_f)(void *); #if HAVE_W32THREADS w32thread_init(); @@ -142,7 +161,9 @@ int ff_slice_thread_init(AVCodecContext *avctx) } avctx->internal->thread_ctx = c = av_mallocz(sizeof(*c)); -if (!c || (thread_count = avpriv_slicethread_create(&c->thread, avctx, worker_func, NULL, thread_count)) <= 1) { +m_f = avctx->codec->capabilities & AV_CODEC_SLICE_THREAD_HAS_MF ? &main_function : NULL; + +if (!c || (thread_count =
[FFmpeg-devel] [PATCH 2/2] avcodec/pthread_slice: add main function support for avpriv_slicethread_create()
--- libavcodec/internal.h | 4 libavcodec/pthread_slice.c | 33 ++--- libavcodec/thread.h| 1 + libavutil/slicethread.h| 18 ++ 4 files changed, 37 insertions(+), 19 deletions(-) diff --git a/libavcodec/internal.h b/libavcodec/internal.h index 64120ea..4668952 100644 --- a/libavcodec/internal.h +++ b/libavcodec/internal.h @@ -64,6 +64,10 @@ * dimensions to coded rather than display values. */ #define FF_CODEC_CAP_EXPORTS_CROPPING (1 << 4) +/** + * Codec initializes slice-based threading with a main function + */ +#define FF_CODEC_CAP_SLICE_THREAD_HAS_MF(1 << 5) #ifdef TRACE # define ff_tlog(ctx, ...) av_log(ctx, AV_LOG_TRACE, __VA_ARGS__) diff --git a/libavcodec/pthread_slice.c b/libavcodec/pthread_slice.c index c781d35..65e5abf 100644 --- a/libavcodec/pthread_slice.c +++ b/libavcodec/pthread_slice.c @@ -38,21 +38,13 @@ typedef int (action_func)(AVCodecContext *c, void *arg); typedef int (action_func2)(AVCodecContext *c, void *arg, int jobnr, int threadnr); +typedef int (main_func)(AVCodecContext *c); -typedef struct SliceThreadContext { -AVSliceThread *thread; -action_func *func; -action_func2 *func2; -void *args; -int *rets; -int job_size; - -int *entries; -int entries_count; -int thread_count; -pthread_cond_t *progress_cond; -pthread_mutex_t *progress_mutex; -} SliceThreadContext; +static void main_function(void *priv) { +AVCodecContext *avctx = priv; +SliceThreadContext *c = avctx->internal->thread_ctx; +c->m_func(avctx); +} static void worker_func(void *priv, int jobnr, int threadnr, int nb_jobs, int nb_threads) { @@ -84,7 +76,7 @@ void ff_slice_thread_free(AVCodecContext *avctx) av_freep(&avctx->internal->thread_ctx); } -static int thread_execute(AVCodecContext *avctx, action_func* func, void *arg, int *ret, int job_count, int job_size) +int ff_thread_execute(AVCodecContext *avctx, action_func* func, void *arg, int *ret, int job_count, int job_size) { SliceThreadContext *c = avctx->internal->thread_ctx; @@ -99,7 +91,7 @@ static int thread_execute(AVCodecContext 
*avctx, action_func* func, void *arg, i c->func = func; c->rets = ret; -avpriv_slicethread_execute(c->thread, job_count, 0); +avpriv_slicethread_execute(c->thread, job_count, !!c->m_func); return 0; } @@ -107,13 +99,14 @@ static int thread_execute2(AVCodecContext *avctx, action_func2* func2, void *arg { SliceThreadContext *c = avctx->internal->thread_ctx; c->func2 = func2; -return thread_execute(avctx, NULL, arg, ret, job_count, 0); +return ff_thread_execute(avctx, NULL, arg, ret, job_count, 0); } int ff_slice_thread_init(AVCodecContext *avctx) { SliceThreadContext *c; int thread_count = avctx->thread_count; +static void (*main_f)(void *); #if HAVE_W32THREADS w32thread_init(); @@ -142,7 +135,8 @@ int ff_slice_thread_init(AVCodecContext *avctx) } avctx->internal->thread_ctx = c = av_mallocz(sizeof(*c)); -if (!c || (thread_count = avpriv_slicethread_create(&c->thread, avctx, worker_func, NULL, thread_count)) <= 1) { +main_f = avctx->codec->caps_internal & FF_CODEC_CAP_SLICE_THREAD_HAS_MF ? 
&main_function : NULL; +if (!c || (thread_count = avpriv_slicethread_create(&c->thread, avctx, worker_func, main_f, thread_count)) <= 1) { if (c) avpriv_slicethread_free(&c->thread); av_freep(&avctx->internal->thread_ctx); @@ -150,9 +144,10 @@ int ff_slice_thread_init(AVCodecContext *avctx) avctx->active_thread_type = 0; return 0; } +c->m_func = NULL; avctx->thread_count = thread_count; -avctx->execute = thread_execute; +avctx->execute = ff_thread_execute; avctx->execute2 = thread_execute2; return 0; } diff --git a/libavcodec/thread.h b/libavcodec/thread.h index 90864b5..dd8f5fe 100644 --- a/libavcodec/thread.h +++ b/libavcodec/thread.h @@ -133,6 +133,7 @@ void ff_thread_release_buffer(AVCodecContext *avctx, ThreadFrame *f); int ff_thread_ref_frame(ThreadFrame *dst, ThreadFrame *src); int ff_thread_init(AVCodecContext *s); +int ff_thread_execute(AVCodecContext *avctx, int (*func)(AVCodecContext *c, void *arg), void *arg, int *ret, int job_count, int job_size); void ff_thread_free(AVCodecContext *s); int ff_alloc_entries(AVCodecContext *avctx, int count); diff --git a/libavutil/slicethread.h b/libavutil/slicethread.h index f6f6f30..9d15c96 100644 --- a/libavutil/slicethread.h +++ b/libavutil/slicethread.h @@ -16,11 +16,29 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +#include "libavcodec/avcodec.h" + #ifndef AVUTIL_SLICETHREAD_H #define AVUTIL_SLICETHREAD_H typedef struct AVSliceThread AVSliceThread; +typedef struct SliceThreadContext { +AVSliceThread *thread; +int (*func)(AVCodecContext *c, void *arg); +int (*func2)(AVCodecContex
[FFmpeg-devel] [PATCHv2 2/2] avcodec/pthread_slice: add ff_slice_thread_execute_with_mainfunc()
Signed-off-by: Ilia Valiakhmetov v2: --- libavcodec/internal.h | 4 libavcodec/pthread_slice.c | 22 -- libavcodec/thread.h| 4 +++- 3 files changed, 27 insertions(+), 3 deletions(-) diff --git a/libavcodec/internal.h b/libavcodec/internal.h index 64120ea..4668952 100644 --- a/libavcodec/internal.h +++ b/libavcodec/internal.h @@ -64,6 +64,10 @@ * dimensions to coded rather than display values. */ #define FF_CODEC_CAP_EXPORTS_CROPPING (1 << 4) +/** + * Codec initializes slice-based threading with a main function + */ +#define FF_CODEC_CAP_SLICE_THREAD_HAS_MF(1 << 5) #ifdef TRACE # define ff_tlog(ctx, ...) av_log(ctx, AV_LOG_TRACE, __VA_ARGS__) diff --git a/libavcodec/pthread_slice.c b/libavcodec/pthread_slice.c index c781d35..d659f9b 100644 --- a/libavcodec/pthread_slice.c +++ b/libavcodec/pthread_slice.c @@ -38,11 +38,13 @@ typedef int (action_func)(AVCodecContext *c, void *arg); typedef int (action_func2)(AVCodecContext *c, void *arg, int jobnr, int threadnr); +typedef int (main_func)(AVCodecContext *c); typedef struct SliceThreadContext { AVSliceThread *thread; action_func *func; action_func2 *func2; +main_func *mainfunc; void *args; int *rets; int job_size; @@ -54,6 +56,12 @@ typedef struct SliceThreadContext { pthread_mutex_t *progress_mutex; } SliceThreadContext; +static void main_function(void *priv) { +AVCodecContext *avctx = priv; +SliceThreadContext *c = avctx->internal->thread_ctx; +c->mainfunc(avctx); +} + static void worker_func(void *priv, int jobnr, int threadnr, int nb_jobs, int nb_threads) { AVCodecContext *avctx = priv; @@ -99,7 +107,7 @@ static int thread_execute(AVCodecContext *avctx, action_func* func, void *arg, i c->func = func; c->rets = ret; -avpriv_slicethread_execute(c->thread, job_count, 0); +avpriv_slicethread_execute(c->thread, job_count, !!c->mainfunc ); return 0; } @@ -110,10 +118,19 @@ static int thread_execute2(AVCodecContext *avctx, action_func2* func2, void *arg return thread_execute(avctx, NULL, arg, ret, job_count, 0); } +int 
ff_slice_thread_execute_with_mainfunc(AVCodecContext *avctx, action_func2* func2, main_func *mainfunc, void *arg, int *ret, int job_count) +{ +SliceThreadContext *c = avctx->internal->thread_ctx; +c->func2 = func2; +c->mainfunc = mainfunc; +return thread_execute(avctx, NULL, arg, ret, job_count, 0); +} + int ff_slice_thread_init(AVCodecContext *avctx) { SliceThreadContext *c; int thread_count = avctx->thread_count; +static void (*mainfunc)(void *); #if HAVE_W32THREADS w32thread_init(); @@ -142,7 +159,8 @@ int ff_slice_thread_init(AVCodecContext *avctx) } avctx->internal->thread_ctx = c = av_mallocz(sizeof(*c)); -if (!c || (thread_count = avpriv_slicethread_create(&c->thread, avctx, worker_func, NULL, thread_count)) <= 1) { +mainfunc = avctx->codec->caps_internal & FF_CODEC_CAP_SLICE_THREAD_HAS_MF ? &main_function : NULL; +if (!c || (thread_count = avpriv_slicethread_create(&c->thread, avctx, worker_func, mainfunc, thread_count)) <= 1) { if (c) avpriv_slicethread_free(&c->thread); av_freep(&avctx->internal->thread_ctx); diff --git a/libavcodec/thread.h b/libavcodec/thread.h index 90864b5..3186193 100644 --- a/libavcodec/thread.h +++ b/libavcodec/thread.h @@ -133,8 +133,10 @@ void ff_thread_release_buffer(AVCodecContext *avctx, ThreadFrame *f); int ff_thread_ref_frame(ThreadFrame *dst, ThreadFrame *src); int ff_thread_init(AVCodecContext *s); +int ff_slice_thread_execute_with_mainfunc(AVCodecContext *avctx, +int (*action_func2)(AVCodecContext *c, void *arg, int jobnr, int threadnr), +int (*main_func)(AVCodecContext *c), void *arg, int *ret, int job_count); void ff_thread_free(AVCodecContext *s); - int ff_alloc_entries(AVCodecContext *avctx, int count); void ff_reset_entries(AVCodecContext *avctx); void ff_thread_report_progress2(AVCodecContext *avctx, int field, int thread, int n); -- 2.8.3 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
[FFmpeg-devel] [PATCH 2/2] avcodec/vp9: change avctx->execute3 in favor of ff_slice_thread_execute_with_mainfunc()
Signed-off-by: Ilia Valiakhmetov v8: --- libavcodec/vp9.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/libavcodec/vp9.c b/libavcodec/vp9.c index b780262..a71045e 100644 --- a/libavcodec/vp9.c +++ b/libavcodec/vp9.c @@ -1628,7 +1628,7 @@ FF_ENABLE_DEPRECATION_WARNINGS } } -avctx->execute3(avctx, decode_tiles_mt, loopfilter_proc, s->td, NULL, s->s.h.tiling.tile_cols); +ff_slice_thread_execute_with_mainfunc(avctx, decode_tiles_mt, loopfilter_proc, s->td, NULL, s->s.h.tiling.tile_cols); } else { ret = decode_tiles(avctx, data, size); if (ret < 0) @@ -1776,7 +1776,8 @@ AVCodec ff_vp9_decoder = { .init = vp9_decode_init, .close = vp9_decode_free, .decode= vp9_decode_frame, -.capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS | AV_CODEC_CAP_SLICE_THREADS | AV_CODEC_SLICE_THREAD_HAS_MF, +.capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS | AV_CODEC_CAP_SLICE_THREADS, +.caps_internal = FF_CODEC_CAP_SLICE_THREAD_HAS_MF, .flush = vp9_decode_flush, .init_thread_copy = ONLY_IF_THREADS_ENABLED(vp9_decode_init_thread_copy), .update_thread_context = ONLY_IF_THREADS_ENABLED(vp9_decode_update_thread_context), -- 2.8.3 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
[FFmpeg-devel] [PATCH] avcodec/vp9: add tile threading support
Signed-off-by: Ilia Valiakhmetov --- Changelog | 1 + 1 file changed, 1 insertion(+) diff --git a/Changelog b/Changelog index cae5254..8a4818a 100644 --- a/Changelog +++ b/Changelog @@ -43,6 +43,7 @@ version : - add --disable-autodetect build switch - drop deprecated qtkit input device (use avfoundation instead) - despill video filter +- tile threading support for VP9 version 3.3: - CrystalHD decoder moved to new decode API -- 2.8.3 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
[FFmpeg-devel] [PATCH] Changelog: add vp9 tile threading support
Signed-off-by: Ilia Valiakhmetov --- Changelog | 1 + 1 file changed, 1 insertion(+) diff --git a/Changelog b/Changelog index 22928de..ca0758a 100644 --- a/Changelog +++ b/Changelog @@ -46,6 +46,7 @@ version : - haas audio filter - SUP/PGS subtitle muxer - convolve video filter +- VP9 tile threading support version 3.3: - CrystalHD decoder moved to new decode API -- 2.8.3 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
[FFmpeg-devel] [PATCH] avcodec/vp9: add 64-bit ipred_dr_32x32_16 avx2 implementation
vp9_diag_downright_32x32_12bpp_c: 429.7 vp9_diag_downright_32x32_12bpp_sse2: 158.9 vp9_diag_downright_32x32_12bpp_ssse3: 144.6 vp9_diag_downright_32x32_12bpp_avx: 141.0 vp9_diag_downright_32x32_12bpp_avx2: 73.8 Almost 50% faster than avx implementation --- libavcodec/x86/vp9dsp_init_16bpp.c| 6 +- libavcodec/x86/vp9intrapred_16bpp.asm | 103 +- 2 files changed, 106 insertions(+), 3 deletions(-) diff --git a/libavcodec/x86/vp9dsp_init_16bpp.c b/libavcodec/x86/vp9dsp_init_16bpp.c index 8d1aa13..54216f0 100644 --- a/libavcodec/x86/vp9dsp_init_16bpp.c +++ b/libavcodec/x86/vp9dsp_init_16bpp.c @@ -52,8 +52,9 @@ decl_ipred_fns(dc, 16, mmxext, sse2); decl_ipred_fns(dc_top, 16, mmxext, sse2); decl_ipred_fns(dc_left, 16, mmxext, sse2); decl_ipred_fn(dl, 16, 16, avx2); -decl_ipred_fn(dr, 16, 16, avx2); decl_ipred_fn(dl, 32, 16, avx2); +decl_ipred_fn(dr, 16, 16, avx2); +decl_ipred_fn(dr, 32, 16, avx2); #define decl_ipred_dir_funcs(type) \ decl_ipred_fns(type, 16, sse2, sse2); \ @@ -137,8 +138,9 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp) init_fpel_func(1, 1, 64, avg, _16, avx2); init_fpel_func(0, 1, 128, avg, _16, avx2); init_ipred_func(dl, DIAG_DOWN_LEFT, 16, 16, avx2); -init_ipred_func(dr, DIAG_DOWN_RIGHT, 16, 16, avx2); init_ipred_func(dl, DIAG_DOWN_LEFT, 32, 16, avx2); +init_ipred_func(dr, DIAG_DOWN_RIGHT, 16, 16, avx2); +init_ipred_func(dr, DIAG_DOWN_RIGHT, 32, 16, avx2); } #endif /* HAVE_YASM */ diff --git a/libavcodec/x86/vp9intrapred_16bpp.asm b/libavcodec/x86/vp9intrapred_16bpp.asm index 6d4400b..32b6982 100644 --- a/libavcodec/x86/vp9intrapred_16bpp.asm +++ b/libavcodec/x86/vp9intrapred_16bpp.asm @@ -1221,8 +1221,109 @@ cglobal vp9_ipred_dr_16x16_16, 4, 5, 6, dst, stride, l, a mova [dstq+strideq*0], m4 ; 0 mova [dst3q+strideq*4], m5 ; 7 RET -%endif +%if ARCH_X86_64 +cglobal vp9_ipred_dr_32x32_16, 4, 7, 10, dst, stride, l, a +movam0, [lq+mmsize*0+0]; l[0-15] +movam1, [lq+mmsize*1+0]; l[16-31] +movum2, [aq+mmsize*0-2]; *abcdefghijklmno +movam3, 
[aq+mmsize*0+0]; abcdefghijklmnop +movam4, [aq+mmsize*1+0]; qrstuvwxyz012345 +vperm2i128 m5, m0, m1, q0201 ; lmnopqrstuvwxyz0 +vpalignrm6, m5, m0, 2 ; mnopqrstuvwxyz01 +vpalignrm7, m5, m0, 4 ; nopqrstuvwxyz012 +LOWPASS 0, 6, 7 ; L[0-15] +vperm2i128 m7, m1, m2, q0201 ; stuvwxyz*abcdefg +vpalignrm5, m7, m1, 2 ; lmnopqrstuvwxyz* +vpalignrm6, m7, m1, 4 ; mnopqrstuvwxyz*a +LOWPASS 1, 5, 6 ; L[16-31]# +vperm2i128 m5, m3, m4, q0201 ; ijklmnopqrstuvwx +vpalignrm6, m5, m3, 2 ; bcdefghijklmnopq +LOWPASS 2, 3, 6 ; A[0-15] +movum3, [aq+mmsize*1-2]; pqrstuvwxyz01234 +vperm2i128 m6, m4, m4, q2001 ; yz012345 +vpalignrm7, m6, m4, 2 ; rstuvwxyz012345. +LOWPASS 3, 4, 7 ; A[16-31]. +vperm2i128 m4, m1, m2, q0201 ; TUVWXYZ#ABCDEFGH +vperm2i128 m5, m0, m1, q0201 ; L[7-15]L[16-23] +vperm2i128 m8, m2, m3, q0201 ; IJKLMNOPQRSTUVWX +DEFINE_ARGS dst8, stride, stride3, stride7, stride5, dst24, cnt +lea stride3q, [strideq*3] +lea stride5q, [stride3q+strideq*2] +lea stride7q, [strideq*4+stride3q] +lea dst24q, [dst8q+stride3q*8] +lea dst8q, [dst8q+strideq*8] +mov cntd, 2 + +.loop: +mova [dst24q+stride7q+0 ], m0 ; 31 23 15 7 +mova [dst24q+stride7q+32], m1 +mova[dst8q+stride7q+0], m1 +mova [dst8q+stride7q+32], m2 +vpalignrm6, m4, m1, 2 +vpalignrm7, m5, m0, 2 +vpalignrm9, m8, m2, 2 +mova [dst24q+stride3q*2+0], m7 ; 30 22 14 6 +mova [dst24q+stride3q*2+32], m6 +mova [dst8q+stride3q*2+0], m6 +mova [dst8q+stride3q*2+32], m9 +vpalignrm6, m4, m1, 4 +vpalignrm7, m5, m0, 4 +vpalignrm9, m8, m2, 4 +mova [dst24q+stride5q+0], m7 ; 29 21 13 5 +mova [dst24q+stride5q+32], m6 +mova[dst8q+stride5q+0], m6 +mova [dst8q+stride5q+32], m9 +vpalignrm6, m4, m1, 6 +v
[FFmpeg-devel] [PATCH] avcodec/vp9: AVX2 ipred_dl_32x32 improvement
Use symmetry properties of the ipred_dl function for better performance. vp9_diag_downleft_32x32_12bpp_c: 1534.2 vp9_diag_downleft_32x32_12bpp_sse2: 145.9 vp9_diag_downleft_32x32_12bpp_ssse3: 140.0 vp9_diag_downleft_32x32_12bpp_avx: 134.8 vp9_diag_downleft_32x32_12bpp_avx2: 78.9 ~40% faster than avx Signed-off-by: Ilia Valiakhmetov --- libavcodec/x86/vp9intrapred_16bpp.asm | 47 --- 1 file changed, 33 insertions(+), 14 deletions(-) diff --git a/libavcodec/x86/vp9intrapred_16bpp.asm b/libavcodec/x86/vp9intrapred_16bpp.asm index 8d8d65e..33a8a7f 100644 --- a/libavcodec/x86/vp9intrapred_16bpp.asm +++ b/libavcodec/x86/vp9intrapred_16bpp.asm @@ -901,49 +901,68 @@ cglobal vp9_ipred_dl_32x32_16, 2, 6, 7, dst, stride, l, a LOWPASS 1, 2, 3 ; RSTUVWXYZ..5 vperm2i128 m2, m1, m4, q0201 ; Z..5 vperm2i128 m5, m0, m1, q0201 ; JKLMNOPQRSTUVWXY -DEFINE_ARGS dst, stride, stride3, cnt +vperm2i128 m6, m2, m2, q0101 +DEFINE_ARGS dst, stride, stride3, dst16, cnt lea stride3q, [strideq*3] -mov cntd, 4 +lea dst16q, [dstq+strideq*8] +lea dst16q, [dst16q+strideq*8] +mov cntd, 2 .loop: mova [dstq+strideq*0 + 0], m0 mova [dstq+strideq*0 +32], m1 +mova [dst16q+strideq*0+ 0], m1 +mova [dst16q+strideq*0+32], m6 vpalignr m3, m5, m0, 2 vpalignr m4, m2, m1, 2 mova [dstq+strideq*1 + 0], m3 mova [dstq+strideq*1 +32], m4 +mova [dst16q+strideq*1 +0], m4 +mova [dst16q+strideq*1 +32], m6 vpalignr m3, m5, m0, 4 vpalignr m4, m2, m1, 4 mova [dstq+strideq*2 + 0], m3 mova [dstq+strideq*2 +32], m4 +mova [dst16q+strideq*2+0], m4 +mova [dst16q+strideq*2+32], m6 vpalignr m3, m5, m0, 6 -vpalignr m4, m2, m1, 6 +vpalignr m4, m2, m1, 6 mova [dstq+stride3q*1+ 0], m3 mova [dstq+stride3q*1+32], m4 -leadstq, [dstq+strideq*4] +mova [dst16q+stride3q*1+0], m4 +mova [dst16q+stride3q*1+32], m6 vpalignr m3, m5, m0, 8 vpalignr m4, m2, m1, 8 +leadstq, [dstq+strideq*4] +lea dst16q, [dst16q+strideq*4] mova [dstq+strideq*0 + 0], m3 mova [dstq+strideq*0 +32], m4 +mova [dst16q+strideq*0 +0], m4 +mova [dst16q+strideq*0 +32], m6 
vpalignr m3, m5, m0, 10 vpalignr m4, m2, m1, 10 mova [dstq+strideq*1 + 0], m3 mova [dstq+strideq*1 +32], m4 +mova [dst16q+strideq*1 +0], m4 +mova [dst16q+strideq*1 +32], m6 vpalignr m3, m5, m0, 12 vpalignr m4, m2, m1, 12 -mova [dstq+strideq*2+ 0], m3 -mova [dstq+strideq*2+32], m4 +mova[dstq+strideq*2+ 0], m3 +mova[dstq+strideq*2+32], m4 +mova [dst16q+strideq*2+0], m4 +mova [dst16q+strideq*2+32], m6 vpalignr m3, m5, m0, 14 vpalignr m4, m2, m1, 14 -mova [dstq+stride3q+ 0], m3 -mova [dstq+stride3q+ 32], m4 -vpalignr m3, m5, m0, 16 -vpalignr m4, m2, m1, 16 -vperm2i128 m5, m3, m4, q0201 -vperm2i128 m2, m4, m4, q0101 -mova m0, m3 -mova m1, m4 +mova[dstq+stride3q+ 0], m3 +mova[dstq+stride3q+ 32], m4 +mova [dst16q+stride3q+ 0], m4 +mova [dst16q+stride3q+32], m6 +mova m0, m5 +mova m1, m2 +vperm2i128 m5, m5, m2, q0201 +mova m2, m6 leadstq, [dstq+strideq*4] +lea dst16q, [dst16q+strideq*4] deccntd jg .loop RET -- 2.8.3 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
[FFmpeg-devel] [PATCH] avcodec/vp9: AVX2 ipred_vl_16x16
vp9_vert_left_16x16_12bpp_c: 273.8 vp9_vert_left_16x16_12bpp_sse2: 69.4 vp9_vert_left_16x16_12bpp_ssse3: 35.3 vp9_vert_left_16x16_12bpp_avx: 34.6 vp9_vert_left_16x16_12bpp_avx2: 22.4 ~35% faster than avx Signed-off-by: Ilia Valiakhmetov --- libavcodec/x86/vp9dsp_init_16bpp.c| 2 ++ libavcodec/x86/vp9intrapred_16bpp.asm | 53 +++ 2 files changed, 55 insertions(+) diff --git a/libavcodec/x86/vp9dsp_init_16bpp.c b/libavcodec/x86/vp9dsp_init_16bpp.c index 60d10a1..da8b74c 100644 --- a/libavcodec/x86/vp9dsp_init_16bpp.c +++ b/libavcodec/x86/vp9dsp_init_16bpp.c @@ -55,6 +55,7 @@ decl_ipred_fn(dl, 16, 16, avx2); decl_ipred_fn(dl, 32, 16, avx2); decl_ipred_fn(dr, 16, 16, avx2); decl_ipred_fn(dr, 32, 16, avx2); +decl_ipred_fn(vl, 16, 16, avx2); #define decl_ipred_dir_funcs(type) \ decl_ipred_fns(type, 16, sse2, sse2); \ @@ -143,6 +144,7 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp) #if ARCH_X86_64 init_ipred_func(dr, DIAG_DOWN_RIGHT, 32, 16, avx2); #endif +init_ipred_func(vl, VERT_LEFT, 16, 16, avx2); } #endif /* HAVE_X86ASM */ diff --git a/libavcodec/x86/vp9intrapred_16bpp.asm b/libavcodec/x86/vp9intrapred_16bpp.asm index 32b6982..8d8d65e 100644 --- a/libavcodec/x86/vp9intrapred_16bpp.asm +++ b/libavcodec/x86/vp9intrapred_16bpp.asm @@ -1538,6 +1538,59 @@ VL_FUNCS 1 INIT_XMM avx VL_FUNCS 1 +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +cglobal vp9_ipred_vl_16x16_16, 2, 4, 6, dst, stride, l, a +movifnidn aq, amp +movam0, [aq]; abcdefghijklmnop +vpbroadcastw xm5, [aq+30] ; +vperm2i128 m1, m0, m5, q0201 ; ijklmnop +vpalignrm2, m1, m0, 2 ; bcdefghijklmnopp +vpalignrm3, m1, m0, 4 ; cdefghijklmnoppp +movam4, m2 +pavgw m4, m0 +LOWPASS 0, 2, 3 ; BCDEFGHIJKLMNOPp +vperm2i128 m2, m0, m5, q0201 +vperm2i128 m3, m4, m5, q0201 +DEFINE_ARGS dst, stride, stride3 +lea stride3q, [strideq*3] + +mova [dstq+strideq*0], m4 +mova [dstq+strideq*1], m0 +vpalignrm1, m2, m0, 2 +vpalignrm5, m3, m4, 2 +mova [dstq+strideq*2], m5 +mova [dstq+stride3q ], m1 +vpalignrm1, m2, m0, 4 +vpalignrm5, m3, 
m4, 4 +lea dstq, [dstq+strideq*4] +mova [dstq+strideq*0], m5 +mova [dstq+strideq*1], m1 +vpalignrm1, m2, m0, 6 +vpalignrm5, m3, m4, 6 +mova [dstq+strideq*2], m5 +mova [dstq+stride3q ], m1 +vpalignrm1, m2, m0, 8 +vpalignrm5, m3, m4, 8 +lea dstq, [dstq+strideq*4] +mova [dstq+strideq*0], m5 +mova [dstq+strideq*1], m1 +vpalignrm1, m2, m0, 10 +vpalignrm5, m3, m4, 10 +mova [dstq+strideq*2], m5 +mova [dstq+stride3q ], m1 +vpalignrm1, m2, m0, 12 +vpalignrm5, m3, m4, 12 +lea dstq, [dstq+strideq*4] +mova [dstq+strideq*0], m5 +mova [dstq+strideq*1], m1 +vpalignrm1, m2, m0, 14 +vpalignrm5, m3, m4, 14 +mova [dstq+strideq*2], m5 +mova [dstq+stride3q ], m1 +RET +%endif + %macro VR_FUNCS 0 cglobal vp9_ipred_vr_4x4_16, 4, 4, 3, dst, stride, l, a movum0, [aq-2] -- 2.8.3 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
[FFmpeg-devel] [PATCH 1/2] avcodec: add execute3() api to utilize the main function of avpriv_slicethread_create().
Signed-off-by: Ilia Valiakhmetov --- libavcodec/avcodec.h | 7 ++- libavcodec/options.c | 1 + libavcodec/pthread_slice.c | 27 +-- libavcodec/utils.c | 13 + 4 files changed, 45 insertions(+), 3 deletions(-) diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h index 650..712f40c 100644 --- a/libavcodec/avcodec.h +++ b/libavcodec/avcodec.h @@ -1089,6 +1089,10 @@ typedef struct RcOverride{ */ #define AV_CODEC_CAP_AVOID_PROBING (1 << 17) /** + * Codec initializes slice-based threading with a main function + */ +#define AV_CODEC_SLICE_THREAD_HAS_MF (1 << 18) +/** * Codec is intra only. */ #define AV_CODEC_CAP_INTRA_ONLY 0x4000 @@ -3233,7 +3237,7 @@ typedef struct AVCodecContext { * - decoding: Set by libavcodec, user can override. */ int (*execute2)(struct AVCodecContext *c, int (*func)(struct AVCodecContext *c2, void *arg, int jobnr, int threadnr), void *arg2, int *ret, int count); - +int (*execute3)(struct AVCodecContext *c, int (*func)(struct AVCodecContext *c2, void *arg, int jobnr, int threadnr), int (*m_func)(struct AVCodecContext *c3), void *arg2, int *ret, int count); /** * noise vs. sse weight for the nsse comparison function * - encoding: Set by user. 
@@ -5774,6 +5778,7 @@ const char *avcodec_profile_name(enum AVCodecID codec_id, int profile); int avcodec_default_execute(AVCodecContext *c, int (*func)(AVCodecContext *c2, void *arg2),void *arg, int *ret, int count, int size); int avcodec_default_execute2(AVCodecContext *c, int (*func)(AVCodecContext *c2, void *arg2, int, int),void *arg, int *ret, int count); +int avcodec_default_execute3(AVCodecContext *c, int (*func)(AVCodecContext *c2, void *arg2, int jobnr, int threadnr), int (*m_func)(struct AVCodecContext *c3), void *arg, int *ret, int count); //FIXME func typedef /** diff --git a/libavcodec/options.c b/libavcodec/options.c index 82e1217..6d63bdb 100644 --- a/libavcodec/options.c +++ b/libavcodec/options.c @@ -117,6 +117,7 @@ static int init_context_defaults(AVCodecContext *s, const AVCodec *codec) s->get_format = avcodec_default_get_format; s->execute = avcodec_default_execute; s->execute2= avcodec_default_execute2; +s->execute3= avcodec_default_execute3; s->sample_aspect_ratio = (AVRational){0,1}; s->pix_fmt = AV_PIX_FMT_NONE; s->sw_pix_fmt = AV_PIX_FMT_NONE; diff --git a/libavcodec/pthread_slice.c b/libavcodec/pthread_slice.c index c781d35..08d19b9 100644 --- a/libavcodec/pthread_slice.c +++ b/libavcodec/pthread_slice.c @@ -38,11 +38,13 @@ typedef int (action_func)(AVCodecContext *c, void *arg); typedef int (action_func2)(AVCodecContext *c, void *arg, int jobnr, int threadnr); +typedef int (main_func)(AVCodecContext *c); typedef struct SliceThreadContext { AVSliceThread *thread; action_func *func; action_func2 *func2; +main_func *m_func; void *args; int *rets; int job_size; @@ -54,6 +56,12 @@ typedef struct SliceThreadContext { pthread_mutex_t *progress_mutex; } SliceThreadContext; +static void main_function(void *priv) { +AVCodecContext *avctx = priv; +SliceThreadContext *c = avctx->internal->thread_ctx; +c->m_func(avctx); +} + static void worker_func(void *priv, int jobnr, int threadnr, int nb_jobs, int nb_threads) { AVCodecContext *avctx = priv; @@ 
-99,7 +107,8 @@ static int thread_execute(AVCodecContext *avctx, action_func* func, void *arg, i c->func = func; c->rets = ret; -avpriv_slicethread_execute(c->thread, job_count, 0); +avpriv_slicethread_execute(c->thread, job_count, !!c->m_func); + return 0; } @@ -110,10 +119,20 @@ static int thread_execute2(AVCodecContext *avctx, action_func2* func2, void *arg return thread_execute(avctx, NULL, arg, ret, job_count, 0); } +static int thread_execute3(AVCodecContext *avctx, action_func2* func2, main_func* m_func, void *arg, int *ret, int job_count) +{ +SliceThreadContext *c = avctx->internal->thread_ctx; +c->func2 = func2; +c->m_func = m_func; +return thread_execute(avctx, NULL, arg, ret, job_count, 0); +} + + int ff_slice_thread_init(AVCodecContext *avctx) { SliceThreadContext *c; int thread_count = avctx->thread_count; +static void (*m_f)(void *); #if HAVE_W32THREADS w32thread_init(); @@ -142,7 +161,9 @@ int ff_slice_thread_init(AVCodecContext *avctx) } avctx->internal->thread_ctx = c = av_mallocz(sizeof(*c)); -if (!c || (thread_count = avpriv_slicethread_create(&c->thread, avctx, worker_func, NULL, thread_count)) <= 1) { +m_f = avctx->codec->capabilities & AV_CODEC_SLICE_THREAD_HAS_MF ? &main_function : NULL; + +if (!c || (thread_count =
[FFmpeg-devel] [PATCH 0/2] Tile threading support for vp9
These patches introduce tile threading support for vp9. Tile threading is ~45% faster at 2 threads vs 1. Frame threading is ~55% faster at 2 threads vs 1. ffvp9 tile threading is ~25% faster than libvpx-vp9 at 2 threads. The execute3() function is similar to execute2(), except it has an extra argument - a main function for avpriv_slicethread_create(); it is used for the loopfilter. Ilia Valiakhmetov (2): avcodec: add execute3() api to utilize the main function of avpriv_slicethread_create(). avcodec/vp9: Add tile threading support libavcodec/avcodec.h | 7 +- libavcodec/options.c | 1 + libavcodec/pthread_slice.c | 27 +- libavcodec/utils.c | 13 + libavcodec/vp9.c | 591 +-- libavcodec/vp9_mc_template.c | 202 +++ libavcodec/vp9block.c| 526 +++--- libavcodec/vp9dec.h | 106 +--- libavcodec/vp9mvs.c | 97 +++ libavcodec/vp9prob.c | 64 ++--- libavcodec/vp9recon.c| 157 ++-- 11 files changed, 1036 insertions(+), 755 deletions(-) -- 2.8.3 ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel