[FFmpeg-devel] [PATCH] avcodec/vp9: avx2 implementation of ipred_dl_16x16_16

2017-03-12 Thread Ilia
vp9_diag_downleft_16x16_10bpp_c: 263.0
vp9_diag_downleft_16x16_10bpp_sse2: 44.7
vp9_diag_downleft_16x16_10bpp_ssse3: 32.5
vp9_diag_downleft_16x16_10bpp_avx: 31.9
vp9_diag_downleft_16x16_10bpp_avx2: 25.7
vp9_diag_downleft_16x16_12bpp_c: 264.7
vp9_diag_downleft_16x16_12bpp_sse2: 44.4
vp9_diag_downleft_16x16_12bpp_ssse3: 32.0
vp9_diag_downleft_16x16_12bpp_avx: 32.4
vp9_diag_downleft_16x16_12bpp_avx2: 25.5

Benchmarked with 1 run

Signed-off-by: Ilia 
---
 libavcodec/x86/vp9dsp_init_16bpp.c|  2 ++
 libavcodec/x86/vp9intrapred_16bpp.asm | 39 +++
 2 files changed, 41 insertions(+)

diff --git a/libavcodec/x86/vp9dsp_init_16bpp.c 
b/libavcodec/x86/vp9dsp_init_16bpp.c
index eb67499..4576ff1 100644
--- a/libavcodec/x86/vp9dsp_init_16bpp.c
+++ b/libavcodec/x86/vp9dsp_init_16bpp.c
@@ -51,6 +51,7 @@ decl_ipred_fns(h,   16, mmxext, sse2);
 decl_ipred_fns(dc,  16, mmxext, sse2);
 decl_ipred_fns(dc_top,  16, mmxext, sse2);
 decl_ipred_fns(dc_left, 16, mmxext, sse2);
+decl_ipred_fn(dl,   16, 16, avx2);
 
 #define decl_ipred_dir_funcs(type) \
 decl_ipred_fns(type, 16, sse2,  sse2); \
@@ -133,6 +134,7 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp)
 init_fpel_func(2, 1,  32, avg, _16, avx2);
 init_fpel_func(1, 1,  64, avg, _16, avx2);
 init_fpel_func(0, 1, 128, avg, _16, avx2);
+init_ipred_func(dl, DIAG_DOWN_LEFT, 16, 16, avx2);
 }
 
 #endif /* HAVE_YASM */
diff --git a/libavcodec/x86/vp9intrapred_16bpp.asm 
b/libavcodec/x86/vp9intrapred_16bpp.asm
index c0ac16d..212e413 100644
--- a/libavcodec/x86/vp9intrapred_16bpp.asm
+++ b/libavcodec/x86/vp9intrapred_16bpp.asm
@@ -847,6 +847,45 @@ DL_FUNCS
 INIT_XMM avx
 DL_FUNCS
 
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+cglobal vp9_ipred_dl_16x16_16, 2, 4, 5, dst, stride, l, a
+movifnidn   aq, amp
+movam0, [aq]   ; abcdefghijklmnop
+vpbroadcastw   xm1, [aq+30]; 
+vperm2i128  m2, m0, m1, q0201  ; ijklmnop
+vpalignrm3, m2, m0, 2  ; bcdefghijklmnopp
+vpalignrm4, m2, m0, 4  ; cdefghijklmnoppp
+LOWPASS  0,  3,  4 ; BCDEFGHIJKLMNOPp
+vperm2i128  m2, m0, m1, q0201  ; JKLMNOPp
+DEFINE_ARGS dst, stride, stride3, cnt
+mov   cntd, 2
+lea   stride3q, [strideq*3]
+.loop:
+mova  [dstq+strideq*0], m0
+vpalignrm3, m2, m0, 2
+vpalignrm4, m2, m0, 4
+mova  [dstq+strideq*1], m3
+mova  [dstq+strideq*2], m4
+vpalignrm3, m2, m0, 6
+vpalignrm4, m2, m0, 8
+mova  [dstq+stride3q ], m3
+lea   dstq, [dstq+strideq*4]
+mova  [dstq+strideq*0], m4
+vpalignrm3, m2, m0, 10
+vpalignrm4, m2, m0, 12
+mova  [dstq+strideq*1], m3
+mova  [dstq+strideq*2], m4
+vpalignrm3, m2, m0, 14
+mova  [dstq+stride3q ], m3
+lea   dstq, [dstq+strideq*4]
+movam0, m2
+vperm2i128  m2, m2, m2, q0101  ; 
+dec   cntd
+jg .loop
+RET
+%endif
+
 %macro DR_FUNCS 1 ; stack_mem_for_32x32_32bit_function
 cglobal vp9_ipred_dr_4x4_16, 4, 4, 3, dst, stride, l, a
 movhm0, [lq]; wxyz
-- 
2.8.3

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH] libavcodec/vp9 ipred_dl_32x32_16 avx2 version

2017-06-04 Thread Ilia Valiakhmetov
vp9_diag_downleft_32x32_8bpp_c: 580.2
vp9_diag_downleft_32x32_8bpp_sse2: 75.6
vp9_diag_downleft_32x32_8bpp_ssse3: 73.7
vp9_diag_downleft_32x32_8bpp_avx: 72.7
vp9_diag_downleft_32x32_10bpp_c: 1101.2
vp9_diag_downleft_32x32_10bpp_sse2: 145.4
vp9_diag_downleft_32x32_10bpp_ssse3: 137.5
vp9_diag_downleft_32x32_10bpp_avx: 134.8
vp9_diag_downleft_32x32_10bpp_avx2: 94.0
vp9_diag_downleft_32x32_12bpp_c: 1108.5
vp9_diag_downleft_32x32_12bpp_sse2: 145.5
vp9_diag_downleft_32x32_12bpp_ssse3: 137.3
vp9_diag_downleft_32x32_12bpp_avx: 135.2
vp9_diag_downleft_32x32_12bpp_avx2: 94.0

~30% faster than avx

---
 libavcodec/x86/vp9dsp_init_16bpp.c|  4 +-
 libavcodec/x86/vp9intrapred_16bpp.asm | 75 +++
 2 files changed, 60 insertions(+), 19 deletions(-)

diff --git a/libavcodec/x86/vp9dsp_init_16bpp.c 
b/libavcodec/x86/vp9dsp_init_16bpp.c
index 4e1f24f..d1b8fcd 100644
--- a/libavcodec/x86/vp9dsp_init_16bpp.c
+++ b/libavcodec/x86/vp9dsp_init_16bpp.c
@@ -52,7 +52,7 @@ decl_ipred_fns(dc,  16, mmxext, sse2);
 decl_ipred_fns(dc_top,  16, mmxext, sse2);
 decl_ipred_fns(dc_left, 16, mmxext, sse2);
 decl_ipred_fn(dl,   16, 16, avx2);
-decl_ipred_fn(dl,   32, 32, avx2);
+decl_ipred_fn(dl,   32, 16, avx2);
 
 #define decl_ipred_dir_funcs(type) \
 decl_ipred_fns(type, 16, sse2,  sse2); \
@@ -136,7 +136,7 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp)
 init_fpel_func(1, 1,  64, avg, _16, avx2);
 init_fpel_func(0, 1, 128, avg, _16, avx2);
 init_ipred_func(dl, DIAG_DOWN_LEFT, 16, 16, avx2);
-init_ipred_func(dl, DIAG_DOWN_LEFT, 32, 32, avx2);
+init_ipred_func(dl, DIAG_DOWN_LEFT, 32, 16, avx2);
 }
 
 #endif /* HAVE_YASM */
diff --git a/libavcodec/x86/vp9intrapred_16bpp.asm 
b/libavcodec/x86/vp9intrapred_16bpp.asm
index 2ec5381..10a0994 100644
--- a/libavcodec/x86/vp9intrapred_16bpp.asm
+++ b/libavcodec/x86/vp9intrapred_16bpp.asm
@@ -861,6 +861,7 @@ cglobal vp9_ipred_dl_16x16_16, 2, 4, 5, dst, stride, l, a
 DEFINE_ARGS dst, stride, stride3, cnt
 mov   cntd, 2
 lea   stride3q, [strideq*3]
+
 .loop:
 mova  [dstq+strideq*0], m0
 vpalignrm3, m2, m0, 2
@@ -887,24 +888,64 @@ cglobal vp9_ipred_dl_16x16_16, 2, 4, 5, dst, stride, l, a
 
 cglobal vp9_ipred_dl_32x32_16, 2, 6, 7, dst, stride, l, a
 movifnidn   aq, amp
-movam0, [aq+mmsize*0]   ; abcdefghijklmnop
-movam1, [aq+mmsize*1]   ; qrstuvwxyz012345
-vpbroadcastw   xm4, [aq+mmsize*1+30]; 
-vpalignrm2, m1, m0, 2   ; bcdefghijklmnopq
-vpalignrm3, m1, m0, 4   ; cdefghijklmnopqr
-vperm2i128  m5, m1, m4, q0201   ; yz012345
-LOWPASS  0,  2,  3  ; BCDEFGHIJKLMNOPQ
-vpalignrm2, m5, m1, 2   ; rstuvwxyz0123455
-vpalignrm3, m5, m1, 4   ; stuvwxyz01234555
-LOWPASS  1,  2,  3  ; RSTUVWXYZ..5
-vperm2i128  m2, m1, m4, q0201   ; Z..5
+movam0, [aq+mmsize*0+ 0]   ; abcdefghijklmnop
+movam1, [aq+mmsize*1+ 0]   ; qrstuvwxyz012345
+vpbroadcastw   xm4, [aq+mmsize*1+30]   ; 
+vperm2i128  m5, m0, m1, q0201  ; ijklmnopqrstuvwx
+vpalignrm2, m5, m0, 2  ; bcdefghijklmnopq
+vpalignrm3, m5, m0, 4  ; cdefghijklmnopqr
+LOWPASS  0,  2,  3 ; BCDEFGHIJKLMNOPQ
+vperm2i128  m5, m1, m4, q0201  ; yz012345
+vpalignrm2, m5, m1, 2  ; rstuvwxyz0123455
+vpalignrm3, m5, m1, 4  ; stuvwxyz01234555
+LOWPASS  1,  2,  3 ; RSTUVWXYZ..5
+vperm2i128  m2, m1, m4, q0201  ; Z..5
+vperm2i128  m5, m0, m1, q0201  ; JKLMNOPQRSTUVWXY
+DEFINE_ARGS dst, stride, stride3, stride5, cnt
+lea   stride3q, [strideq*3]
+lea   stride5q, [strideq*5]
+mov   cntd, 4
 
-mova   [dstq+strideq*0+0 ], m0
-mova   [dstq+strideq*0+32], m1
-vpalignrm3, m1, m0, 2
-vpalignrm4, m2, m1, 2
-mova   [dstq+strideq*1+0 ], m3
-mova   [dstq+strideq*1+32], m4
+.loop:
+mova   [dstq+strideq*0 + 0], m0
+mova   [dstq+strideq*0 +32], m1
+vpalignr m3, m5, m0, 2
+vpalignr m4, m2, m1, 2
+mova   [dstq+strideq*1 + 0], m3
+mova   [dstq+strideq*1 +32], m4
+vpalignr m3, m5, m0, 4
+vpalignr m4, m2, m1, 4
+mova   [dstq+strideq*2 + 0], m3
+mova   [dstq+strideq*2 +32], m4
+vpa

[FFmpeg-devel] [PATCH] libavcodec/vp9: ipred_dl_32x32_16 avx2 implementation

2017-06-04 Thread Ilia Valiakhmetov
vp9_diag_downleft_32x32_8bpp_c: 580.2
vp9_diag_downleft_32x32_8bpp_sse2: 75.6
vp9_diag_downleft_32x32_8bpp_ssse3: 73.7
vp9_diag_downleft_32x32_8bpp_avx: 72.7
vp9_diag_downleft_32x32_10bpp_c: 1101.2
vp9_diag_downleft_32x32_10bpp_sse2: 145.4
vp9_diag_downleft_32x32_10bpp_ssse3: 137.5
vp9_diag_downleft_32x32_10bpp_avx: 134.8
vp9_diag_downleft_32x32_10bpp_avx2: 94.0
vp9_diag_downleft_32x32_12bpp_c: 1108.5
vp9_diag_downleft_32x32_12bpp_sse2: 145.5
vp9_diag_downleft_32x32_12bpp_ssse3: 137.3
vp9_diag_downleft_32x32_12bpp_avx: 135.2
vp9_diag_downleft_32x32_12bpp_avx2: 94.0

~30% faster than avx implementation

---
 libavcodec/x86/vp9dsp_init_16bpp.c|  2 ++
 libavcodec/x86/vp9intrapred_16bpp.asm | 63 +++
 2 files changed, 65 insertions(+)

diff --git a/libavcodec/x86/vp9dsp_init_16bpp.c 
b/libavcodec/x86/vp9dsp_init_16bpp.c
index 4576ff1..d1b8fcd 100644
--- a/libavcodec/x86/vp9dsp_init_16bpp.c
+++ b/libavcodec/x86/vp9dsp_init_16bpp.c
@@ -52,6 +52,7 @@ decl_ipred_fns(dc,  16, mmxext, sse2);
 decl_ipred_fns(dc_top,  16, mmxext, sse2);
 decl_ipred_fns(dc_left, 16, mmxext, sse2);
 decl_ipred_fn(dl,   16, 16, avx2);
+decl_ipred_fn(dl,   32, 16, avx2);
 
 #define decl_ipred_dir_funcs(type) \
 decl_ipred_fns(type, 16, sse2,  sse2); \
@@ -135,6 +136,7 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp)
 init_fpel_func(1, 1,  64, avg, _16, avx2);
 init_fpel_func(0, 1, 128, avg, _16, avx2);
 init_ipred_func(dl, DIAG_DOWN_LEFT, 16, 16, avx2);
+init_ipred_func(dl, DIAG_DOWN_LEFT, 32, 16, avx2);
 }
 
 #endif /* HAVE_YASM */
diff --git a/libavcodec/x86/vp9intrapred_16bpp.asm 
b/libavcodec/x86/vp9intrapred_16bpp.asm
index 212e413..5cd6a3e 100644
--- a/libavcodec/x86/vp9intrapred_16bpp.asm
+++ b/libavcodec/x86/vp9intrapred_16bpp.asm
@@ -861,6 +861,7 @@ cglobal vp9_ipred_dl_16x16_16, 2, 4, 5, dst, stride, l, a
 DEFINE_ARGS dst, stride, stride3, cnt
 mov   cntd, 2
 lea   stride3q, [strideq*3]
+
 .loop:
 mova  [dstq+strideq*0], m0
 vpalignrm3, m2, m0, 2
@@ -884,6 +885,68 @@ cglobal vp9_ipred_dl_16x16_16, 2, 4, 5, dst, stride, l, a
 dec   cntd
 jg .loop
 RET
+
+cglobal vp9_ipred_dl_32x32_16, 2, 6, 7, dst, stride, l, a
+movifnidn   aq, amp
+movam0, [aq+mmsize*0+ 0]   ; abcdefghijklmnop
+movam1, [aq+mmsize*1+ 0]   ; qrstuvwxyz012345
+vpbroadcastw   xm4, [aq+mmsize*1+30]   ; 
+vperm2i128  m5, m0, m1, q0201  ; ijklmnopqrstuvwx
+vpalignrm2, m5, m0, 2  ; bcdefghijklmnopq
+vpalignrm3, m5, m0, 4  ; cdefghijklmnopqr
+LOWPASS  0,  2,  3 ; BCDEFGHIJKLMNOPQ
+vperm2i128  m5, m1, m4, q0201  ; yz012345
+vpalignrm2, m5, m1, 2  ; rstuvwxyz0123455
+vpalignrm3, m5, m1, 4  ; stuvwxyz01234555
+LOWPASS  1,  2,  3 ; RSTUVWXYZ..5
+vperm2i128  m2, m1, m4, q0201  ; Z..5
+vperm2i128  m5, m0, m1, q0201  ; JKLMNOPQRSTUVWXY
+DEFINE_ARGS dst, stride, stride3, cnt
+lea   stride3q, [strideq*3]
+mov   cntd, 4
+
+.loop:
+mova   [dstq+strideq*0 + 0], m0
+mova   [dstq+strideq*0 +32], m1
+vpalignr m3, m5, m0, 2
+vpalignr m4, m2, m1, 2
+mova   [dstq+strideq*1 + 0], m3
+mova   [dstq+strideq*1 +32], m4
+vpalignr m3, m5, m0, 4
+vpalignr m4, m2, m1, 4
+mova   [dstq+strideq*2 + 0], m3
+mova   [dstq+strideq*2 +32], m4
+vpalignr m3, m5, m0, 6
+vpalignr m4, m2, m1, 6
+mova   [dstq+stride3q*1+ 0], m3
+mova   [dstq+stride3q*1+32], m4
+leadstq, [dstq+strideq*4]
+vpalignr m3, m5, m0, 8
+vpalignr m4, m2, m1, 8
+mova   [dstq+strideq*0 + 0], m3
+mova   [dstq+strideq*0 +32], m4
+vpalignr m3, m5, m0, 10
+vpalignr m4, m2, m1, 10
+mova   [dstq+strideq*1 + 0], m3
+mova   [dstq+strideq*1 +32], m4
+vpalignr m3, m5, m0, 12
+vpalignr m4, m2, m1, 12
+mova   [dstq+strideq*2+ 0], m3
+mova   [dstq+strideq*2+32], m4
+vpalignr m3, m5, m0, 14
+vpalignr m4, m2, m1, 14
+mova   [dstq+stride3q+  0], m3
+mova   [dstq+stride3q+ 32], m4
+vpalignr m3, m5, m0, 16
+vpalignr m4, m2, m1, 16
+vperm2i128   m5, m3, m4, q0201
+vperm2i128   m2, m4, m4, q0101
+mova m0, m3
+mova m1, m4
+

[FFmpeg-devel] [PATCH] avcodec/vp9: ipred_dr_16x16_16 avx2 implementation

2017-06-08 Thread Ilia Valiakhmetov
vp9_diag_downright_16x16_12bpp_c: 149.0
vp9_diag_downright_16x16_12bpp_sse2: 67.8
vp9_diag_downright_16x16_12bpp_ssse3: 45.6
vp9_diag_downright_16x16_12bpp_avx: 36.6
vp9_diag_downright_16x16_12bpp_avx2: 25.5

~30% faster than avx

Signed-off-by: Ilia Valiakhmetov 
---
 libavcodec/x86/vp9dsp_init_16bpp.c|  2 ++
 libavcodec/x86/vp9intrapred_16bpp.asm | 56 +++
 2 files changed, 58 insertions(+)

diff --git a/libavcodec/x86/vp9dsp_init_16bpp.c 
b/libavcodec/x86/vp9dsp_init_16bpp.c
index d1b8fcd..8d1aa13 100644
--- a/libavcodec/x86/vp9dsp_init_16bpp.c
+++ b/libavcodec/x86/vp9dsp_init_16bpp.c
@@ -52,6 +52,7 @@ decl_ipred_fns(dc,  16, mmxext, sse2);
 decl_ipred_fns(dc_top,  16, mmxext, sse2);
 decl_ipred_fns(dc_left, 16, mmxext, sse2);
 decl_ipred_fn(dl,   16, 16, avx2);
+decl_ipred_fn(dr,   16, 16, avx2);
 decl_ipred_fn(dl,   32, 16, avx2);
 
 #define decl_ipred_dir_funcs(type) \
@@ -136,6 +137,7 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp)
 init_fpel_func(1, 1,  64, avg, _16, avx2);
 init_fpel_func(0, 1, 128, avg, _16, avx2);
 init_ipred_func(dl, DIAG_DOWN_LEFT, 16, 16, avx2);
+init_ipred_func(dr, DIAG_DOWN_RIGHT, 16, 16, avx2);
 init_ipred_func(dl, DIAG_DOWN_LEFT, 32, 16, avx2);
 }
 
diff --git a/libavcodec/x86/vp9intrapred_16bpp.asm 
b/libavcodec/x86/vp9intrapred_16bpp.asm
index 92333bc..67b98b1 100644
--- a/libavcodec/x86/vp9intrapred_16bpp.asm
+++ b/libavcodec/x86/vp9intrapred_16bpp.asm
@@ -1170,6 +1170,62 @@ DR_FUNCS 2
 INIT_XMM avx
 DR_FUNCS 2
 
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+cglobal vp9_ipred_dr_16x16_16, 4, 6, 7, dst, stride, l, a
+movam0, [lq]   ; klmnopqrstuvwxyz
+movum1, [aq-2] ; *abcdefghijklmno
+movam2, [aq]   ; abcdefghijklmnop
+vperm2i128  m4, m2, m2, q2001  ; ijklmnop
+vpalignrm5, m4, m2, 2  ; bcdefghijklmnop.
+vperm2i128  m3, m0, m1, q0201  ; stuvwxyz*abcdefg
+LOWPASS  1,  2,  5 ; ABCDEFGHIJKLMNO.
+vpalignrm4, m3, m0, 2  ; lmnopqrstuvwxyz*
+vpalignrm5, m3, m0, 4  ; mnopqrstuvwxyz*a
+LOWPASS  0,  4,  5 ; LMNOPQRSTUVWXYZ#
+vperm2i128  m5, m0, m1, q0201  ; TUVWXYZ#ABCDEFGH
+DEFINE_ARGS dst, stride, stride3, stride5, dst3, cnt
+lea  dst3q, [dstq+strideq*4]
+lea   stride3q, [strideq*3]
+lea   stride5q, [stride3q+strideq*2]
+
+vpalignrm3, m5, m0, 2
+vpalignrm4, m1, m5, 2
+mova[dst3q+stride5q*2], m3 ; 14
+mova[ dstq+stride3q*2], m4 ; 6
+vpalignrm3, m5, m0, 4
+vpalignrm4, m1, m5, 4
+sub  dst3q, strideq
+mova[dst3q+stride5q*2], m3 ; 13
+mova[dst3q+strideq*2 ], m4 ; 5
+mova[dst3q+stride3q*4], m0 ; 15
+vpalignrm3, m5, m0, 6
+vpalignrm4, m1, m5, 6
+mova [dstq+stride3q*4], m3 ; 12
+mova [dst3q+strideq*1], m4 ; 4
+vpalignrm3, m5, m0, 8
+vpalignrm4, m1, m5, 8
+mova [dst3q+strideq*8], m3 ; 11
+mova [dst3q+strideq*0], m4 ; 3
+vpalignrm3, m5, m0, 12
+vpalignrm4, m1, m5, 12
+mova[dst3q+stride3q*2], m3 ; 9
+mova [dstq+strideq*1 ], m4 ; 1
+vpalignrm3, m5, m0, 10
+vpalignrm4, m1, m5, 10
+mova [dstq+stride5q*2], m3 ; 10
+mova [dstq+strideq*2 ], m4 ; 2
+vpalignrm3, m5, m0, 14
+vpalignrm4, m1, m5, 14
+mova  [dstq+strideq*8], m3 ; 8
+mova  [dstq+strideq*0], m4 ; 0
+sub   dstq, strideq
+mova [dst3q+strideq*4], m5 ; 7
+mova [ dstq+strideq*0], m1 ; -1
+RET
+%endif
+
+
 %macro VL_FUNCS 1 ; stack_mem_for_32x32_32bit_function
 cglobal vp9_ipred_vl_4x4_16, 2, 4, 3, dst, stride, l, a
 movifnidn   aq, amp
-- 
2.8.3

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH] avcodec/vp9: ipred_dr_16x16_16 avx2 implementation

2017-06-09 Thread Ilia Valiakhmetov
Signed-off-by: Ilia Valiakhmetov 
---
 libavcodec/x86/vp9dsp_init_16bpp.c|  2 ++
 libavcodec/x86/vp9intrapred_16bpp.asm | 56 +++
 2 files changed, 58 insertions(+)

diff --git a/libavcodec/x86/vp9dsp_init_16bpp.c 
b/libavcodec/x86/vp9dsp_init_16bpp.c
index d1b8fcd..8d1aa13 100644
--- a/libavcodec/x86/vp9dsp_init_16bpp.c
+++ b/libavcodec/x86/vp9dsp_init_16bpp.c
@@ -52,6 +52,7 @@ decl_ipred_fns(dc,  16, mmxext, sse2);
 decl_ipred_fns(dc_top,  16, mmxext, sse2);
 decl_ipred_fns(dc_left, 16, mmxext, sse2);
 decl_ipred_fn(dl,   16, 16, avx2);
+decl_ipred_fn(dr,   16, 16, avx2);
 decl_ipred_fn(dl,   32, 16, avx2);
 
 #define decl_ipred_dir_funcs(type) \
@@ -136,6 +137,7 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp)
 init_fpel_func(1, 1,  64, avg, _16, avx2);
 init_fpel_func(0, 1, 128, avg, _16, avx2);
 init_ipred_func(dl, DIAG_DOWN_LEFT, 16, 16, avx2);
+init_ipred_func(dr, DIAG_DOWN_RIGHT, 16, 16, avx2);
 init_ipred_func(dl, DIAG_DOWN_LEFT, 32, 16, avx2);
 }
 
diff --git a/libavcodec/x86/vp9intrapred_16bpp.asm 
b/libavcodec/x86/vp9intrapred_16bpp.asm
index 92333bc..7230de2 100644
--- a/libavcodec/x86/vp9intrapred_16bpp.asm
+++ b/libavcodec/x86/vp9intrapred_16bpp.asm
@@ -1170,6 +1170,62 @@ DR_FUNCS 2
 INIT_XMM avx
 DR_FUNCS 2
 
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+cglobal vp9_ipred_dr_16x16_16, 4, 4, 6, dst, stride, l, a
+movam0, [lq]   ; klmnopqrstuvwxyz
+movum1, [aq-2] ; *abcdefghijklmno
+movam2, [aq]   ; abcdefghijklmnop
+vperm2i128  m4, m2, m2, q2001  ; ijklmnop
+vpalignrm5, m4, m2, 2  ; bcdefghijklmnop.
+vperm2i128  m3, m0, m1, q0201  ; stuvwxyz*abcdefg
+LOWPASS  1,  2,  5 ; ABCDEFGHIJKLMNO.
+vpalignrm4, m3, m0, 2  ; lmnopqrstuvwxyz*
+vpalignrm5, m3, m0, 4  ; mnopqrstuvwxyz*a
+LOWPASS  0,  4,  5 ; LMNOPQRSTUVWXYZ#
+vperm2i128  m5, m0, m1, q0201  ; TUVWXYZ#ABCDEFGH
+DEFINE_ARGS dst, stride, stride3, stride5, dst3
+lea  dst3q, [dstq+strideq*4]
+lea   stride3q, [strideq*3]
+lea   stride5q, [stride3q+strideq*2]
+
+vpalignrm3, m5, m0, 2
+vpalignrm4, m1, m5, 2
+mova[dst3q+stride5q*2], m3 ; 14
+mova[ dstq+stride3q*2], m4 ; 6
+vpalignrm3, m5, m0, 4
+vpalignrm4, m1, m5, 4
+sub  dst3q, strideq
+mova[dst3q+stride5q*2], m3 ; 13
+mova[dst3q+strideq*2 ], m4 ; 5
+mova[dst3q+stride3q*4], m0 ; 15
+vpalignrm3, m5, m0, 6
+vpalignrm4, m1, m5, 6
+mova [dstq+stride3q*4], m3 ; 12
+mova [dst3q+strideq*1], m4 ; 4
+vpalignrm3, m5, m0, 8
+vpalignrm4, m1, m5, 8
+mova [dst3q+strideq*8], m3 ; 11
+mova [dst3q+strideq*0], m4 ; 3
+vpalignrm3, m5, m0, 10
+vpalignrm4, m1, m5, 10
+mova [dstq+stride5q*2], m3 ; 10
+mova [dstq+strideq*2 ], m4 ; 2
+vpalignrm3, m5, m0, 12
+vpalignrm4, m1, m5, 12
+mova[dst3q+stride3q*2], m3 ; 9
+mova [dstq+strideq*1 ], m4 ; 1
+vpalignrm3, m5, m0, 14
+vpalignrm4, m1, m5, 14
+mova  [dstq+strideq*8], m3 ; 8
+mova  [dstq+strideq*0], m4 ; 0
+sub   dstq, strideq
+mova [dst3q+strideq*4], m5 ; 7
+mova [ dstq+strideq*0], m1 ; -1
+RET
+%endif
+
+
 %macro VL_FUNCS 1 ; stack_mem_for_32x32_32bit_function
 cglobal vp9_ipred_vl_4x4_16, 2, 4, 3, dst, stride, l, a
 movifnidn   aq, amp
-- 
2.8.3

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH] avcodec/vp9: ipred_dr_16x16_16 avx2 implementation

2017-06-10 Thread Ilia Valiakhmetov
Signed-off-by: Ilia Valiakhmetov 
---
 libavcodec/x86/vp9dsp_init_16bpp.c|  2 ++
 libavcodec/x86/vp9intrapred_16bpp.asm | 56 +++
 2 files changed, 58 insertions(+)

diff --git a/libavcodec/x86/vp9dsp_init_16bpp.c 
b/libavcodec/x86/vp9dsp_init_16bpp.c
index d1b8fcd..8d1aa13 100644
--- a/libavcodec/x86/vp9dsp_init_16bpp.c
+++ b/libavcodec/x86/vp9dsp_init_16bpp.c
@@ -52,6 +52,7 @@ decl_ipred_fns(dc,  16, mmxext, sse2);
 decl_ipred_fns(dc_top,  16, mmxext, sse2);
 decl_ipred_fns(dc_left, 16, mmxext, sse2);
 decl_ipred_fn(dl,   16, 16, avx2);
+decl_ipred_fn(dr,   16, 16, avx2);
 decl_ipred_fn(dl,   32, 16, avx2);
 
 #define decl_ipred_dir_funcs(type) \
@@ -136,6 +137,7 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp)
 init_fpel_func(1, 1,  64, avg, _16, avx2);
 init_fpel_func(0, 1, 128, avg, _16, avx2);
 init_ipred_func(dl, DIAG_DOWN_LEFT, 16, 16, avx2);
+init_ipred_func(dr, DIAG_DOWN_RIGHT, 16, 16, avx2);
 init_ipred_func(dl, DIAG_DOWN_LEFT, 32, 16, avx2);
 }
 
diff --git a/libavcodec/x86/vp9intrapred_16bpp.asm 
b/libavcodec/x86/vp9intrapred_16bpp.asm
index 92333bc..764f704 100644
--- a/libavcodec/x86/vp9intrapred_16bpp.asm
+++ b/libavcodec/x86/vp9intrapred_16bpp.asm
@@ -1170,6 +1170,62 @@ DR_FUNCS 2
 INIT_XMM avx
 DR_FUNCS 2
 
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+cglobal vp9_ipred_dr_16x16_16, 4, 5, 6, dst, stride, l, a
+movam0, [lq]   ; klmnopqrstuvwxyz
+movum1, [aq-2] ; *abcdefghijklmno
+movam2, [aq]   ; abcdefghijklmnop
+vperm2i128  m4, m2, m2, q2001  ; ijklmnop
+vpalignrm5, m4, m2, 2  ; bcdefghijklmnop.
+vperm2i128  m3, m0, m1, q0201  ; stuvwxyz*abcdefg
+LOWPASS  1,  2,  5 ; ABCDEFGHIJKLMNO.
+vpalignrm4, m3, m0, 2  ; lmnopqrstuvwxyz*
+vpalignrm5, m3, m0, 4  ; mnopqrstuvwxyz*a
+LOWPASS  0,  4,  5 ; LMNOPQRSTUVWXYZ#
+vperm2i128  m5, m0, m1, q0201  ; TUVWXYZ#ABCDEFGH
+DEFINE_ARGS dst, stride, stride3, stride5, dst3
+lea  dst3q, [dstq+strideq*4]
+lea   stride3q, [strideq*3]
+lea   stride5q, [stride3q+strideq*2]
+
+vpalignrm3, m5, m0, 2
+vpalignrm4, m1, m5, 2
+mova[dst3q+stride5q*2], m3 ; 14
+mova[ dstq+stride3q*2], m4 ; 6
+vpalignrm3, m5, m0, 4
+vpalignrm4, m1, m5, 4
+sub  dst3q, strideq
+mova[dst3q+stride5q*2], m3 ; 13
+mova[dst3q+strideq*2 ], m4 ; 5
+mova[dst3q+stride3q*4], m0 ; 15
+vpalignrm3, m5, m0, 6
+vpalignrm4, m1, m5, 6
+mova [dstq+stride3q*4], m3 ; 12
+mova [dst3q+strideq*1], m4 ; 4
+vpalignrm3, m5, m0, 8
+vpalignrm4, m1, m5, 8
+mova [dst3q+strideq*8], m3 ; 11
+mova [dst3q+strideq*0], m4 ; 3
+vpalignrm3, m5, m0, 10
+vpalignrm4, m1, m5, 10
+mova [dstq+stride5q*2], m3 ; 10
+mova [dstq+strideq*2 ], m4 ; 2
+vpalignrm3, m5, m0, 12
+vpalignrm4, m1, m5, 12
+mova[dst3q+stride3q*2], m3 ; 9
+mova [dstq+strideq*1 ], m4 ; 1
+vpalignrm3, m5, m0, 14
+vpalignrm4, m1, m5, 14
+mova  [dstq+strideq*8], m3 ; 8
+mova  [dstq+strideq*0], m4 ; 0
+sub   dstq, strideq
+mova [dst3q+strideq*4], m5 ; 7
+mova [ dstq+strideq*0], m1 ; -1
+RET
+%endif
+
+
 %macro VL_FUNCS 1 ; stack_mem_for_32x32_32bit_function
 cglobal vp9_ipred_vl_4x4_16, 2, 4, 3, dst, stride, l, a
 movifnidn   aq, amp
-- 
2.8.3

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCHv2 1/2] avcodec: add execute3() api to utilize the main function of avpriv_slicethread_create().

2017-09-05 Thread Ilia Valiakhmetov
Signed-off-by: Ilia Valiakhmetov 
---
 libavcodec/avcodec.h   |  7 ++-
 libavcodec/options.c   |  1 +
 libavcodec/pthread_slice.c | 26 --
 libavcodec/utils.c | 14 ++
 4 files changed, 45 insertions(+), 3 deletions(-)

diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h
index 650..712f40c 100644
--- a/libavcodec/avcodec.h
+++ b/libavcodec/avcodec.h
@@ -1089,6 +1089,10 @@ typedef struct RcOverride{
  */
 #define AV_CODEC_CAP_AVOID_PROBING   (1 << 17)
 /**
+ * Codec initializes slice-based threading with a main function
+ */
+#define AV_CODEC_SLICE_THREAD_HAS_MF (1 << 18)
+/**
  * Codec is intra only.
  */
 #define AV_CODEC_CAP_INTRA_ONLY   0x4000
@@ -3233,7 +3237,7 @@ typedef struct AVCodecContext {
  * - decoding: Set by libavcodec, user can override.
  */
 int (*execute2)(struct AVCodecContext *c, int (*func)(struct 
AVCodecContext *c2, void *arg, int jobnr, int threadnr), void *arg2, int *ret, 
int count);
-
+int (*execute3)(struct AVCodecContext *c, int (*func)(struct 
AVCodecContext *c2, void *arg, int jobnr, int threadnr), int (*m_func)(struct 
AVCodecContext *c3), void *arg2, int *ret, int count);
 /**
  * noise vs. sse weight for the nsse comparison function
  * - encoding: Set by user.
@@ -5774,6 +5778,7 @@ const char *avcodec_profile_name(enum AVCodecID codec_id, 
int profile);
 
 int avcodec_default_execute(AVCodecContext *c, int (*func)(AVCodecContext *c2, 
void *arg2),void *arg, int *ret, int count, int size);
 int avcodec_default_execute2(AVCodecContext *c, int (*func)(AVCodecContext 
*c2, void *arg2, int, int),void *arg, int *ret, int count);
+int avcodec_default_execute3(AVCodecContext *c, int (*func)(AVCodecContext 
*c2, void *arg2, int jobnr, int threadnr), int (*m_func)(struct AVCodecContext 
*c3), void *arg, int *ret, int count);
 //FIXME func typedef
 
 /**
diff --git a/libavcodec/options.c b/libavcodec/options.c
index 82e1217..6d63bdb 100644
--- a/libavcodec/options.c
+++ b/libavcodec/options.c
@@ -117,6 +117,7 @@ static int init_context_defaults(AVCodecContext *s, const 
AVCodec *codec)
 s->get_format  = avcodec_default_get_format;
 s->execute = avcodec_default_execute;
 s->execute2= avcodec_default_execute2;
+s->execute3= avcodec_default_execute3;
 s->sample_aspect_ratio = (AVRational){0,1};
 s->pix_fmt = AV_PIX_FMT_NONE;
 s->sw_pix_fmt  = AV_PIX_FMT_NONE;
diff --git a/libavcodec/pthread_slice.c b/libavcodec/pthread_slice.c
index c781d35..3aff816 100644
--- a/libavcodec/pthread_slice.c
+++ b/libavcodec/pthread_slice.c
@@ -38,11 +38,13 @@
 
 typedef int (action_func)(AVCodecContext *c, void *arg);
 typedef int (action_func2)(AVCodecContext *c, void *arg, int jobnr, int 
threadnr);
+typedef int (main_func)(AVCodecContext *c);
 
 typedef struct SliceThreadContext {
 AVSliceThread *thread;
 action_func *func;
 action_func2 *func2;
+main_func *m_func;
 void *args;
 int *rets;
 int job_size;
@@ -54,6 +56,12 @@ typedef struct SliceThreadContext {
 pthread_mutex_t *progress_mutex;
 } SliceThreadContext;
 
+static void main_function(void *priv) {
+AVCodecContext *avctx = priv;
+SliceThreadContext *c = avctx->internal->thread_ctx;
+c->m_func(avctx);
+}
+
 static void worker_func(void *priv, int jobnr, int threadnr, int nb_jobs, int 
nb_threads)
 {
 AVCodecContext *avctx = priv;
@@ -99,7 +107,8 @@ static int thread_execute(AVCodecContext *avctx, 
action_func* func, void *arg, i
 c->func = func;
 c->rets = ret;
 
-avpriv_slicethread_execute(c->thread, job_count, 0);
+avpriv_slicethread_execute(c->thread, job_count, !!c->m_func);
+
 return 0;
 }
 
@@ -110,10 +119,20 @@ static int thread_execute2(AVCodecContext *avctx, 
action_func2* func2, void *arg
 return thread_execute(avctx, NULL, arg, ret, job_count, 0);
 }
 
+static int thread_execute3(AVCodecContext *avctx, action_func2* func2, 
main_func* m_func, void *arg, int *ret, int job_count)
+{
+SliceThreadContext *c = avctx->internal->thread_ctx;
+c->func2 = func2;
+c->m_func = m_func;
+return thread_execute(avctx, NULL, arg, ret, job_count, 0);
+}
+
+
 int ff_slice_thread_init(AVCodecContext *avctx)
 {
 SliceThreadContext *c;
 int thread_count = avctx->thread_count;
+static void (*m_f)(void *);
 
 #if HAVE_W32THREADS
 w32thread_init();
@@ -142,7 +161,9 @@ int ff_slice_thread_init(AVCodecContext *avctx)
 }
 
 avctx->internal->thread_ctx = c = av_mallocz(sizeof(*c));
-if (!c || (thread_count = avpriv_slicethread_create(&c->thread, avctx, 
worker_func, NULL, thread_count)) <= 1) {
+m_f = avctx->codec->capabilities & AV_CODEC_SLICE_THREAD_HAS_MF ? 
&main_function : NULL;
+
+if (!c || (thread_count =

[FFmpeg-devel] [PATCH 2/2] avcodec/pthread_slice: add main function support for avpriv_slicethread_create()

2017-09-07 Thread Ilia Valiakhmetov
---
 libavcodec/internal.h  |  4 
 libavcodec/pthread_slice.c | 33 ++---
 libavcodec/thread.h|  1 +
 libavutil/slicethread.h| 18 ++
 4 files changed, 37 insertions(+), 19 deletions(-)

diff --git a/libavcodec/internal.h b/libavcodec/internal.h
index 64120ea..4668952 100644
--- a/libavcodec/internal.h
+++ b/libavcodec/internal.h
@@ -64,6 +64,10 @@
  * dimensions to coded rather than display values.
  */
 #define FF_CODEC_CAP_EXPORTS_CROPPING   (1 << 4)
+/**
+ * Codec initializes slice-based threading with a main function
+ */
+#define FF_CODEC_CAP_SLICE_THREAD_HAS_MF(1 << 5)
 
 #ifdef TRACE
 #   define ff_tlog(ctx, ...) av_log(ctx, AV_LOG_TRACE, __VA_ARGS__)
diff --git a/libavcodec/pthread_slice.c b/libavcodec/pthread_slice.c
index c781d35..65e5abf 100644
--- a/libavcodec/pthread_slice.c
+++ b/libavcodec/pthread_slice.c
@@ -38,21 +38,13 @@
 
 typedef int (action_func)(AVCodecContext *c, void *arg);
 typedef int (action_func2)(AVCodecContext *c, void *arg, int jobnr, int 
threadnr);
+typedef int (main_func)(AVCodecContext *c);
 
-typedef struct SliceThreadContext {
-AVSliceThread *thread;
-action_func *func;
-action_func2 *func2;
-void *args;
-int *rets;
-int job_size;
-
-int *entries;
-int entries_count;
-int thread_count;
-pthread_cond_t *progress_cond;
-pthread_mutex_t *progress_mutex;
-} SliceThreadContext;
+static void main_function(void *priv) {
+AVCodecContext *avctx = priv;
+SliceThreadContext *c = avctx->internal->thread_ctx;
+c->m_func(avctx);
+}
 
 static void worker_func(void *priv, int jobnr, int threadnr, int nb_jobs, int 
nb_threads)
 {
@@ -84,7 +76,7 @@ void ff_slice_thread_free(AVCodecContext *avctx)
 av_freep(&avctx->internal->thread_ctx);
 }
 
-static int thread_execute(AVCodecContext *avctx, action_func* func, void *arg, 
int *ret, int job_count, int job_size)
+int ff_thread_execute(AVCodecContext *avctx, action_func* func, void *arg, int 
*ret, int job_count, int job_size)
 {
 SliceThreadContext *c = avctx->internal->thread_ctx;
 
@@ -99,7 +91,7 @@ static int thread_execute(AVCodecContext *avctx, action_func* 
func, void *arg, i
 c->func = func;
 c->rets = ret;
 
-avpriv_slicethread_execute(c->thread, job_count, 0);
+avpriv_slicethread_execute(c->thread, job_count, !!c->m_func);
 return 0;
 }
 
@@ -107,13 +99,14 @@ static int thread_execute2(AVCodecContext *avctx, 
action_func2* func2, void *arg
 {
 SliceThreadContext *c = avctx->internal->thread_ctx;
 c->func2 = func2;
-return thread_execute(avctx, NULL, arg, ret, job_count, 0);
+return ff_thread_execute(avctx, NULL, arg, ret, job_count, 0);
 }
 
 int ff_slice_thread_init(AVCodecContext *avctx)
 {
 SliceThreadContext *c;
 int thread_count = avctx->thread_count;
+static void (*main_f)(void *);
 
 #if HAVE_W32THREADS
 w32thread_init();
@@ -142,7 +135,8 @@ int ff_slice_thread_init(AVCodecContext *avctx)
 }
 
 avctx->internal->thread_ctx = c = av_mallocz(sizeof(*c));
-if (!c || (thread_count = avpriv_slicethread_create(&c->thread, avctx, 
worker_func, NULL, thread_count)) <= 1) {
+main_f = avctx->codec->caps_internal & FF_CODEC_CAP_SLICE_THREAD_HAS_MF ? 
&main_function : NULL;
+if (!c || (thread_count = avpriv_slicethread_create(&c->thread, avctx, 
worker_func, main_f, thread_count)) <= 1) {
 if (c)
 avpriv_slicethread_free(&c->thread);
 av_freep(&avctx->internal->thread_ctx);
@@ -150,9 +144,10 @@ int ff_slice_thread_init(AVCodecContext *avctx)
 avctx->active_thread_type = 0;
 return 0;
 }
+c->m_func = NULL;
 avctx->thread_count = thread_count;
 
-avctx->execute = thread_execute;
+avctx->execute = ff_thread_execute;
 avctx->execute2 = thread_execute2;
 return 0;
 }
diff --git a/libavcodec/thread.h b/libavcodec/thread.h
index 90864b5..dd8f5fe 100644
--- a/libavcodec/thread.h
+++ b/libavcodec/thread.h
@@ -133,6 +133,7 @@ void ff_thread_release_buffer(AVCodecContext *avctx, 
ThreadFrame *f);
 int ff_thread_ref_frame(ThreadFrame *dst, ThreadFrame *src);
 
 int ff_thread_init(AVCodecContext *s);
+int ff_thread_execute(AVCodecContext *avctx, int (*func)(AVCodecContext *c, 
void *arg), void *arg, int *ret, int job_count, int job_size);
 void ff_thread_free(AVCodecContext *s);
 
 int ff_alloc_entries(AVCodecContext *avctx, int count);
diff --git a/libavutil/slicethread.h b/libavutil/slicethread.h
index f6f6f30..9d15c96 100644
--- a/libavutil/slicethread.h
+++ b/libavutil/slicethread.h
@@ -16,11 +16,29 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+#include "libavcodec/avcodec.h"
+
 #ifndef AVUTIL_SLICETHREAD_H
 #define AVUTIL_SLICETHREAD_H
 
 typedef struct AVSliceThread AVSliceThread;
 
+typedef struct SliceThreadContext {
+AVSliceThread *thread;
+int (*func)(AVCodecContext *c, void *arg);
+int (*func2)(AVCodecContex

[FFmpeg-devel] [PATCHv2 2/2] avcodec/pthread_slice: add ff_slice_thread_execute_with_mainfunc()

2017-09-07 Thread Ilia Valiakhmetov
Signed-off-by: Ilia Valiakhmetov 

v2:
---
 libavcodec/internal.h  |  4 
 libavcodec/pthread_slice.c | 22 --
 libavcodec/thread.h|  4 +++-
 3 files changed, 27 insertions(+), 3 deletions(-)

diff --git a/libavcodec/internal.h b/libavcodec/internal.h
index 64120ea..4668952 100644
--- a/libavcodec/internal.h
+++ b/libavcodec/internal.h
@@ -64,6 +64,10 @@
  * dimensions to coded rather than display values.
  */
 #define FF_CODEC_CAP_EXPORTS_CROPPING   (1 << 4)
+/**
+ * Codec initializes slice-based threading with a main function
+ */
+#define FF_CODEC_CAP_SLICE_THREAD_HAS_MF(1 << 5)
 
 #ifdef TRACE
 #   define ff_tlog(ctx, ...) av_log(ctx, AV_LOG_TRACE, __VA_ARGS__)
diff --git a/libavcodec/pthread_slice.c b/libavcodec/pthread_slice.c
index c781d35..d659f9b 100644
--- a/libavcodec/pthread_slice.c
+++ b/libavcodec/pthread_slice.c
@@ -38,11 +38,13 @@
 
 typedef int (action_func)(AVCodecContext *c, void *arg);
 typedef int (action_func2)(AVCodecContext *c, void *arg, int jobnr, int 
threadnr);
+typedef int (main_func)(AVCodecContext *c);
 
 typedef struct SliceThreadContext {
 AVSliceThread *thread;
 action_func *func;
 action_func2 *func2;
+main_func *mainfunc;
 void *args;
 int *rets;
 int job_size;
@@ -54,6 +56,12 @@ typedef struct SliceThreadContext {
 pthread_mutex_t *progress_mutex;
 } SliceThreadContext;
 
+static void main_function(void *priv) {
+AVCodecContext *avctx = priv;
+SliceThreadContext *c = avctx->internal->thread_ctx;
+c->mainfunc(avctx);
+}
+
 static void worker_func(void *priv, int jobnr, int threadnr, int nb_jobs, int 
nb_threads)
 {
 AVCodecContext *avctx = priv;
@@ -99,7 +107,7 @@ static int thread_execute(AVCodecContext *avctx, 
action_func* func, void *arg, i
 c->func = func;
 c->rets = ret;
 
-avpriv_slicethread_execute(c->thread, job_count, 0);
+avpriv_slicethread_execute(c->thread, job_count, !!c->mainfunc  );
 return 0;
 }
 
@@ -110,10 +118,19 @@ static int thread_execute2(AVCodecContext *avctx, 
action_func2* func2, void *arg
 return thread_execute(avctx, NULL, arg, ret, job_count, 0);
 }
 
+int ff_slice_thread_execute_with_mainfunc(AVCodecContext *avctx, action_func2* 
func2, main_func *mainfunc, void *arg, int *ret, int job_count)
+{
+SliceThreadContext *c = avctx->internal->thread_ctx;
+c->func2 = func2;
+c->mainfunc = mainfunc;
+return thread_execute(avctx, NULL, arg, ret, job_count, 0);
+}
+
 int ff_slice_thread_init(AVCodecContext *avctx)
 {
 SliceThreadContext *c;
 int thread_count = avctx->thread_count;
+static void (*mainfunc)(void *);
 
 #if HAVE_W32THREADS
 w32thread_init();
@@ -142,7 +159,8 @@ int ff_slice_thread_init(AVCodecContext *avctx)
 }
 
 avctx->internal->thread_ctx = c = av_mallocz(sizeof(*c));
-if (!c || (thread_count = avpriv_slicethread_create(&c->thread, avctx, 
worker_func, NULL, thread_count)) <= 1) {
+mainfunc = avctx->codec->caps_internal & FF_CODEC_CAP_SLICE_THREAD_HAS_MF 
? &main_function : NULL;
+if (!c || (thread_count = avpriv_slicethread_create(&c->thread, avctx, 
worker_func, mainfunc, thread_count)) <= 1) {
 if (c)
 avpriv_slicethread_free(&c->thread);
 av_freep(&avctx->internal->thread_ctx);
diff --git a/libavcodec/thread.h b/libavcodec/thread.h
index 90864b5..3186193 100644
--- a/libavcodec/thread.h
+++ b/libavcodec/thread.h
@@ -133,8 +133,10 @@ void ff_thread_release_buffer(AVCodecContext *avctx, 
ThreadFrame *f);
 int ff_thread_ref_frame(ThreadFrame *dst, ThreadFrame *src);
 
 int ff_thread_init(AVCodecContext *s);
+int ff_slice_thread_execute_with_mainfunc(AVCodecContext *avctx,
+int (*action_func2)(AVCodecContext *c, void *arg, int jobnr, int 
threadnr),
+int (*main_func)(AVCodecContext *c), void *arg, int *ret, int 
job_count);
 void ff_thread_free(AVCodecContext *s);
-
 int ff_alloc_entries(AVCodecContext *avctx, int count);
 void ff_reset_entries(AVCodecContext *avctx);
 void ff_thread_report_progress2(AVCodecContext *avctx, int field, int thread, 
int n);
-- 
2.8.3

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH 2/2] avcodec/vp9: change avctx->execute3 in favor of ff_slice_thread_execute_with_mainfunc()

2017-09-07 Thread Ilia Valiakhmetov
Signed-off-by: Ilia Valiakhmetov 

v8:
---
 libavcodec/vp9.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/libavcodec/vp9.c b/libavcodec/vp9.c
index b780262..a71045e 100644
--- a/libavcodec/vp9.c
+++ b/libavcodec/vp9.c
@@ -1628,7 +1628,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
 }
 }
 
-avctx->execute3(avctx, decode_tiles_mt, loopfilter_proc, s->td, 
NULL, s->s.h.tiling.tile_cols);
+ff_slice_thread_execute_with_mainfunc(avctx, decode_tiles_mt, 
loopfilter_proc, s->td, NULL, s->s.h.tiling.tile_cols);
 } else {
 ret = decode_tiles(avctx, data, size);
 if (ret < 0)
@@ -1776,7 +1776,8 @@ AVCodec ff_vp9_decoder = {
 .init  = vp9_decode_init,
 .close = vp9_decode_free,
 .decode= vp9_decode_frame,
-.capabilities  = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS | 
AV_CODEC_CAP_SLICE_THREADS | AV_CODEC_SLICE_THREAD_HAS_MF,
+.capabilities  = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS | 
AV_CODEC_CAP_SLICE_THREADS,
+.caps_internal = FF_CODEC_CAP_SLICE_THREAD_HAS_MF,
 .flush = vp9_decode_flush,
 .init_thread_copy  = 
ONLY_IF_THREADS_ENABLED(vp9_decode_init_thread_copy),
 .update_thread_context = 
ONLY_IF_THREADS_ENABLED(vp9_decode_update_thread_context),
-- 
2.8.3

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH] avcodec/vp9: add tile threading support

2017-09-08 Thread Ilia Valiakhmetov
Signed-off-by: Ilia Valiakhmetov 
---
 Changelog | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Changelog b/Changelog
index cae5254..8a4818a 100644
--- a/Changelog
+++ b/Changelog
@@ -43,6 +43,7 @@ version :
 - add --disable-autodetect build switch
 - drop deprecated qtkit input device (use avfoundation instead)
 - despill video filter
+- tile threading support for VP9
 
 version 3.3:
 - CrystalHD decoder moved to new decode API
-- 
2.8.3

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH] Changelog: add vp9 tile threading support

2017-09-10 Thread Ilia Valiakhmetov
Signed-off-by: Ilia Valiakhmetov 
---
 Changelog | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Changelog b/Changelog
index 22928de..ca0758a 100644
--- a/Changelog
+++ b/Changelog
@@ -46,6 +46,7 @@ version :
 - haas audio filter
 - SUP/PGS subtitle muxer
 - convolve video filter
+- VP9 tile threading support
 
 version 3.3:
 - CrystalHD decoder moved to new decode API
-- 
2.8.3

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH] avcodec/vp9: add 64-bit ipred_dr_32x32_16 avx2 implementation

2017-06-25 Thread Ilia Valiakhmetov
vp9_diag_downright_32x32_12bpp_c: 429.7
vp9_diag_downright_32x32_12bpp_sse2: 158.9
vp9_diag_downright_32x32_12bpp_ssse3: 144.6
vp9_diag_downright_32x32_12bpp_avx: 141.0
vp9_diag_downright_32x32_12bpp_avx2: 73.8

Almost 50% faster than the avx implementation
---
 libavcodec/x86/vp9dsp_init_16bpp.c|   6 +-
 libavcodec/x86/vp9intrapred_16bpp.asm | 103 +-
 2 files changed, 106 insertions(+), 3 deletions(-)

diff --git a/libavcodec/x86/vp9dsp_init_16bpp.c 
b/libavcodec/x86/vp9dsp_init_16bpp.c
index 8d1aa13..54216f0 100644
--- a/libavcodec/x86/vp9dsp_init_16bpp.c
+++ b/libavcodec/x86/vp9dsp_init_16bpp.c
@@ -52,8 +52,9 @@ decl_ipred_fns(dc,  16, mmxext, sse2);
 decl_ipred_fns(dc_top,  16, mmxext, sse2);
 decl_ipred_fns(dc_left, 16, mmxext, sse2);
 decl_ipred_fn(dl,   16, 16, avx2);
-decl_ipred_fn(dr,   16, 16, avx2);
 decl_ipred_fn(dl,   32, 16, avx2);
+decl_ipred_fn(dr,   16, 16, avx2);
+decl_ipred_fn(dr,   32, 16, avx2);
 
 #define decl_ipred_dir_funcs(type) \
 decl_ipred_fns(type, 16, sse2,  sse2); \
@@ -137,8 +138,9 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp)
 init_fpel_func(1, 1,  64, avg, _16, avx2);
 init_fpel_func(0, 1, 128, avg, _16, avx2);
 init_ipred_func(dl, DIAG_DOWN_LEFT, 16, 16, avx2);
-init_ipred_func(dr, DIAG_DOWN_RIGHT, 16, 16, avx2);
 init_ipred_func(dl, DIAG_DOWN_LEFT, 32, 16, avx2);
+init_ipred_func(dr, DIAG_DOWN_RIGHT, 16, 16, avx2);
+init_ipred_func(dr, DIAG_DOWN_RIGHT, 32, 16, avx2);
 }
 
 #endif /* HAVE_YASM */
diff --git a/libavcodec/x86/vp9intrapred_16bpp.asm 
b/libavcodec/x86/vp9intrapred_16bpp.asm
index 6d4400b..32b6982 100644
--- a/libavcodec/x86/vp9intrapred_16bpp.asm
+++ b/libavcodec/x86/vp9intrapred_16bpp.asm
@@ -1221,8 +1221,109 @@ cglobal vp9_ipred_dr_16x16_16, 4, 5, 6, dst, stride, l, 
a
 mova  [dstq+strideq*0], m4 ; 0
 mova [dst3q+strideq*4], m5 ; 7
 RET
-%endif
 
+%if ARCH_X86_64
+cglobal vp9_ipred_dr_32x32_16, 4, 7, 10, dst, stride, l, a
+movam0, [lq+mmsize*0+0]; l[0-15]
+movam1, [lq+mmsize*1+0]; l[16-31]
+movum2, [aq+mmsize*0-2]; *abcdefghijklmno
+movam3, [aq+mmsize*0+0]; abcdefghijklmnop
+movam4, [aq+mmsize*1+0]; qrstuvwxyz012345
+vperm2i128  m5, m0, m1, q0201  ; lmnopqrstuvwxyz0
+vpalignrm6, m5, m0, 2  ; mnopqrstuvwxyz01
+vpalignrm7, m5, m0, 4  ; nopqrstuvwxyz012
+LOWPASS  0,  6,  7 ; L[0-15]
+vperm2i128  m7, m1, m2, q0201  ; stuvwxyz*abcdefg
+vpalignrm5, m7, m1, 2  ; lmnopqrstuvwxyz*
+vpalignrm6, m7, m1, 4  ; mnopqrstuvwxyz*a
+LOWPASS  1,  5,  6 ; L[16-31]#
+vperm2i128  m5, m3, m4, q0201  ; ijklmnopqrstuvwx
+vpalignrm6, m5, m3, 2  ; bcdefghijklmnopq
+LOWPASS  2,  3,  6 ; A[0-15]
+movum3, [aq+mmsize*1-2]; pqrstuvwxyz01234
+vperm2i128  m6, m4, m4, q2001  ; yz012345
+vpalignrm7, m6, m4, 2  ; rstuvwxyz012345.
+LOWPASS  3,  4,  7 ; A[16-31].
+vperm2i128  m4, m1, m2, q0201  ; TUVWXYZ#ABCDEFGH
+vperm2i128  m5, m0, m1, q0201  ; L[7-15]L[16-23]
+vperm2i128  m8, m2, m3, q0201  ; IJKLMNOPQRSTUVWX
+DEFINE_ARGS dst8, stride, stride3, stride7, stride5, dst24, cnt
+lea   stride3q, [strideq*3]
+lea   stride5q, [stride3q+strideq*2]
+lea   stride7q, [strideq*4+stride3q]
+lea dst24q, [dst8q+stride3q*8]
+lea  dst8q, [dst8q+strideq*8]
+mov   cntd, 2
+
+.loop:
+mova  [dst24q+stride7q+0 ], m0 ; 31 23 15 7
+mova  [dst24q+stride7q+32], m1
+mova[dst8q+stride7q+0], m1
+mova   [dst8q+stride7q+32], m2
+vpalignrm6, m4, m1, 2
+vpalignrm7, m5, m0, 2
+vpalignrm9, m8, m2, 2
+mova [dst24q+stride3q*2+0], m7 ; 30 22 14 6
+mova [dst24q+stride3q*2+32], m6
+mova  [dst8q+stride3q*2+0], m6
+mova [dst8q+stride3q*2+32], m9
+vpalignrm6, m4, m1, 4
+vpalignrm7, m5, m0, 4
+vpalignrm9, m8, m2, 4
+mova   [dst24q+stride5q+0], m7 ; 29 21 13 5
+mova  [dst24q+stride5q+32], m6
+mova[dst8q+stride5q+0], m6
+mova   [dst8q+stride5q+32], m9
+vpalignrm6, m4, m1, 6
+v

[FFmpeg-devel] [PATCH] avcodec/vp9: AVX2 ipred_dl_32x32 improvement

2017-07-03 Thread Ilia Valiakhmetov
Use symmetry properties of the ipred_dl function for better performance.

vp9_diag_downleft_32x32_12bpp_c: 1534.2
vp9_diag_downleft_32x32_12bpp_sse2: 145.9
vp9_diag_downleft_32x32_12bpp_ssse3: 140.0
vp9_diag_downleft_32x32_12bpp_avx: 134.8
vp9_diag_downleft_32x32_12bpp_avx2: 78.9

~40% faster than avx

Signed-off-by: Ilia Valiakhmetov 
---
 libavcodec/x86/vp9intrapred_16bpp.asm | 47 ---
 1 file changed, 33 insertions(+), 14 deletions(-)

diff --git a/libavcodec/x86/vp9intrapred_16bpp.asm 
b/libavcodec/x86/vp9intrapred_16bpp.asm
index 8d8d65e..33a8a7f 100644
--- a/libavcodec/x86/vp9intrapred_16bpp.asm
+++ b/libavcodec/x86/vp9intrapred_16bpp.asm
@@ -901,49 +901,68 @@ cglobal vp9_ipred_dl_32x32_16, 2, 6, 7, dst, stride, l, a
 LOWPASS  1,  2,  3 ; RSTUVWXYZ..5
 vperm2i128  m2, m1, m4, q0201  ; Z..5
 vperm2i128  m5, m0, m1, q0201  ; JKLMNOPQRSTUVWXY
-DEFINE_ARGS dst, stride, stride3, cnt
+vperm2i128  m6, m2, m2, q0101
+DEFINE_ARGS dst, stride, stride3, dst16, cnt
 lea   stride3q, [strideq*3]
-mov   cntd, 4
+lea dst16q, [dstq+strideq*8]
+lea dst16q, [dst16q+strideq*8]
+mov   cntd, 2
 
 .loop:
 mova   [dstq+strideq*0 + 0], m0
 mova   [dstq+strideq*0 +32], m1
+mova  [dst16q+strideq*0+ 0], m1
+mova  [dst16q+strideq*0+32], m6
 vpalignr m3, m5, m0, 2
 vpalignr m4, m2, m1, 2
 mova   [dstq+strideq*1 + 0], m3
 mova   [dstq+strideq*1 +32], m4
+mova  [dst16q+strideq*1 +0], m4
+mova [dst16q+strideq*1 +32], m6
 vpalignr m3, m5, m0, 4
 vpalignr m4, m2, m1, 4
 mova   [dstq+strideq*2 + 0], m3
 mova   [dstq+strideq*2 +32], m4
+mova   [dst16q+strideq*2+0], m4
+mova  [dst16q+strideq*2+32], m6
 vpalignr m3, m5, m0, 6
-vpalignr m4, m2, m1, 6
+vpalignr m4, m2, m1, 6  
 mova   [dstq+stride3q*1+ 0], m3
 mova   [dstq+stride3q*1+32], m4
-leadstq, [dstq+strideq*4]
+mova  [dst16q+stride3q*1+0], m4
+mova [dst16q+stride3q*1+32], m6
 vpalignr m3, m5, m0, 8
 vpalignr m4, m2, m1, 8
+leadstq, [dstq+strideq*4]
+lea  dst16q, [dst16q+strideq*4]
 mova   [dstq+strideq*0 + 0], m3
 mova   [dstq+strideq*0 +32], m4
+mova  [dst16q+strideq*0 +0], m4
+mova [dst16q+strideq*0 +32], m6
 vpalignr m3, m5, m0, 10
 vpalignr m4, m2, m1, 10
 mova   [dstq+strideq*1 + 0], m3
 mova   [dstq+strideq*1 +32], m4
+mova  [dst16q+strideq*1 +0], m4
+mova [dst16q+strideq*1 +32], m6
 vpalignr m3, m5, m0, 12
 vpalignr m4, m2, m1, 12
-mova   [dstq+strideq*2+ 0], m3
-mova   [dstq+strideq*2+32], m4
+mova[dstq+strideq*2+ 0], m3
+mova[dstq+strideq*2+32], m4
+mova   [dst16q+strideq*2+0], m4
+mova  [dst16q+strideq*2+32], m6
 vpalignr m3, m5, m0, 14
 vpalignr m4, m2, m1, 14
-mova   [dstq+stride3q+  0], m3
-mova   [dstq+stride3q+ 32], m4
-vpalignr m3, m5, m0, 16
-vpalignr m4, m2, m1, 16
-vperm2i128   m5, m3, m4, q0201
-vperm2i128   m2, m4, m4, q0101
-mova m0, m3
-mova m1, m4
+mova[dstq+stride3q+  0], m3
+mova[dstq+stride3q+ 32], m4
+mova   [dst16q+stride3q+ 0], m4
+mova   [dst16q+stride3q+32], m6
+mova m0, m5
+mova m1, m2
+vperm2i128   m5, m5, m2, q0201
+mova m2, m6
 leadstq, [dstq+strideq*4]
+lea  dst16q, [dst16q+strideq*4]
 deccntd
 jg .loop
 RET
-- 
2.8.3

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH] avcodec/vp9: AVX2 ipred_vl_16x16

2017-07-03 Thread Ilia Valiakhmetov
vp9_vert_left_16x16_12bpp_c: 273.8
vp9_vert_left_16x16_12bpp_sse2: 69.4
vp9_vert_left_16x16_12bpp_ssse3: 35.3
vp9_vert_left_16x16_12bpp_avx: 34.6
vp9_vert_left_16x16_12bpp_avx2: 22.4

~35% faster than avx

Signed-off-by: Ilia Valiakhmetov 
---
 libavcodec/x86/vp9dsp_init_16bpp.c|  2 ++
 libavcodec/x86/vp9intrapred_16bpp.asm | 53 +++
 2 files changed, 55 insertions(+)

diff --git a/libavcodec/x86/vp9dsp_init_16bpp.c 
b/libavcodec/x86/vp9dsp_init_16bpp.c
index 60d10a1..da8b74c 100644
--- a/libavcodec/x86/vp9dsp_init_16bpp.c
+++ b/libavcodec/x86/vp9dsp_init_16bpp.c
@@ -55,6 +55,7 @@ decl_ipred_fn(dl,   16, 16, avx2);
 decl_ipred_fn(dl,   32, 16, avx2);
 decl_ipred_fn(dr,   16, 16, avx2);
 decl_ipred_fn(dr,   32, 16, avx2);
+decl_ipred_fn(vl,   16, 16, avx2);
 
 #define decl_ipred_dir_funcs(type) \
 decl_ipred_fns(type, 16, sse2,  sse2); \
@@ -143,6 +144,7 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp)
 #if ARCH_X86_64
 init_ipred_func(dr, DIAG_DOWN_RIGHT, 32, 16, avx2);
 #endif
+init_ipred_func(vl, VERT_LEFT, 16, 16, avx2);
 }
 
 #endif /* HAVE_X86ASM */
diff --git a/libavcodec/x86/vp9intrapred_16bpp.asm 
b/libavcodec/x86/vp9intrapred_16bpp.asm
index 32b6982..8d8d65e 100644
--- a/libavcodec/x86/vp9intrapred_16bpp.asm
+++ b/libavcodec/x86/vp9intrapred_16bpp.asm
@@ -1538,6 +1538,59 @@ VL_FUNCS 1
 INIT_XMM avx
 VL_FUNCS 1
 
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+cglobal vp9_ipred_vl_16x16_16, 2, 4, 6, dst, stride, l, a
+movifnidn   aq, amp
+movam0, [aq]; abcdefghijklmnop
+vpbroadcastw   xm5, [aq+30] ; 
+vperm2i128  m1, m0, m5, q0201   ; ijklmnop
+vpalignrm2, m1, m0, 2   ; bcdefghijklmnopp
+vpalignrm3, m1, m0, 4   ; cdefghijklmnoppp
+movam4, m2
+pavgw   m4, m0
+LOWPASS  0,  2,  3  ; BCDEFGHIJKLMNOPp
+vperm2i128  m2, m0, m5, q0201
+vperm2i128  m3, m4, m5, q0201
+DEFINE_ARGS dst, stride, stride3
+lea   stride3q, [strideq*3]
+
+mova  [dstq+strideq*0], m4
+mova  [dstq+strideq*1], m0
+vpalignrm1, m2, m0, 2
+vpalignrm5, m3, m4, 2
+mova  [dstq+strideq*2], m5
+mova  [dstq+stride3q ], m1
+vpalignrm1, m2, m0, 4
+vpalignrm5, m3, m4, 4
+lea   dstq, [dstq+strideq*4]
+mova  [dstq+strideq*0], m5
+mova  [dstq+strideq*1], m1
+vpalignrm1, m2, m0, 6
+vpalignrm5, m3, m4, 6
+mova  [dstq+strideq*2], m5
+mova  [dstq+stride3q ], m1
+vpalignrm1, m2, m0, 8
+vpalignrm5, m3, m4, 8
+lea   dstq, [dstq+strideq*4]
+mova  [dstq+strideq*0], m5
+mova  [dstq+strideq*1], m1
+vpalignrm1, m2, m0, 10
+vpalignrm5, m3, m4, 10
+mova  [dstq+strideq*2], m5
+mova  [dstq+stride3q ], m1
+vpalignrm1, m2, m0, 12
+vpalignrm5, m3, m4, 12
+lea   dstq, [dstq+strideq*4]
+mova  [dstq+strideq*0], m5
+mova  [dstq+strideq*1], m1
+vpalignrm1, m2, m0, 14
+vpalignrm5, m3, m4, 14
+mova  [dstq+strideq*2], m5
+mova  [dstq+stride3q ], m1
+RET
+%endif
+
 %macro VR_FUNCS 0
 cglobal vp9_ipred_vr_4x4_16, 4, 4, 3, dst, stride, l, a
 movum0, [aq-2]
-- 
2.8.3

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH 1/2] avcodec: add execute3() api to utilize the main function of avpriv_slicethread_create().

2017-08-27 Thread Ilia Valiakhmetov
Signed-off-by: Ilia Valiakhmetov 
---
 libavcodec/avcodec.h   |  7 ++-
 libavcodec/options.c   |  1 +
 libavcodec/pthread_slice.c | 27 +--
 libavcodec/utils.c | 13 +
 4 files changed, 45 insertions(+), 3 deletions(-)

diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h
index 650..712f40c 100644
--- a/libavcodec/avcodec.h
+++ b/libavcodec/avcodec.h
@@ -1089,6 +1089,10 @@ typedef struct RcOverride{
  */
 #define AV_CODEC_CAP_AVOID_PROBING   (1 << 17)
 /**
+ * Codec initializes slice-based threading with a main function
+ */
+#define AV_CODEC_SLICE_THREAD_HAS_MF (1 << 18)
+/**
  * Codec is intra only.
  */
 #define AV_CODEC_CAP_INTRA_ONLY   0x4000
@@ -3233,7 +3237,7 @@ typedef struct AVCodecContext {
  * - decoding: Set by libavcodec, user can override.
  */
 int (*execute2)(struct AVCodecContext *c, int (*func)(struct 
AVCodecContext *c2, void *arg, int jobnr, int threadnr), void *arg2, int *ret, 
int count);
-
+int (*execute3)(struct AVCodecContext *c, int (*func)(struct 
AVCodecContext *c2, void *arg, int jobnr, int threadnr), int (*m_func)(struct 
AVCodecContext *c3), void *arg2, int *ret, int count);
 /**
  * noise vs. sse weight for the nsse comparison function
  * - encoding: Set by user.
@@ -5774,6 +5778,7 @@ const char *avcodec_profile_name(enum AVCodecID codec_id, 
int profile);
 
 int avcodec_default_execute(AVCodecContext *c, int (*func)(AVCodecContext *c2, 
void *arg2),void *arg, int *ret, int count, int size);
 int avcodec_default_execute2(AVCodecContext *c, int (*func)(AVCodecContext 
*c2, void *arg2, int, int),void *arg, int *ret, int count);
+int avcodec_default_execute3(AVCodecContext *c, int (*func)(AVCodecContext 
*c2, void *arg2, int jobnr, int threadnr), int (*m_func)(struct AVCodecContext 
*c3), void *arg, int *ret, int count);
 //FIXME func typedef
 
 /**
diff --git a/libavcodec/options.c b/libavcodec/options.c
index 82e1217..6d63bdb 100644
--- a/libavcodec/options.c
+++ b/libavcodec/options.c
@@ -117,6 +117,7 @@ static int init_context_defaults(AVCodecContext *s, const 
AVCodec *codec)
 s->get_format  = avcodec_default_get_format;
 s->execute = avcodec_default_execute;
 s->execute2= avcodec_default_execute2;
+s->execute3= avcodec_default_execute3;
 s->sample_aspect_ratio = (AVRational){0,1};
 s->pix_fmt = AV_PIX_FMT_NONE;
 s->sw_pix_fmt  = AV_PIX_FMT_NONE;
diff --git a/libavcodec/pthread_slice.c b/libavcodec/pthread_slice.c
index c781d35..08d19b9 100644
--- a/libavcodec/pthread_slice.c
+++ b/libavcodec/pthread_slice.c
@@ -38,11 +38,13 @@
 
 typedef int (action_func)(AVCodecContext *c, void *arg);
 typedef int (action_func2)(AVCodecContext *c, void *arg, int jobnr, int 
threadnr);
+typedef int (main_func)(AVCodecContext *c);
 
 typedef struct SliceThreadContext {
 AVSliceThread *thread;
 action_func *func;
 action_func2 *func2;
+main_func *m_func;
 void *args;
 int *rets;
 int job_size;
@@ -54,6 +56,12 @@ typedef struct SliceThreadContext {
 pthread_mutex_t *progress_mutex;
 } SliceThreadContext;
 
+static void main_function(void *priv) {
+AVCodecContext *avctx = priv;
+SliceThreadContext *c = avctx->internal->thread_ctx;
+c->m_func(avctx);
+}
+
 static void worker_func(void *priv, int jobnr, int threadnr, int nb_jobs, int 
nb_threads)
 {
 AVCodecContext *avctx = priv;
@@ -99,7 +107,8 @@ static int thread_execute(AVCodecContext *avctx, 
action_func* func, void *arg, i
 c->func = func;
 c->rets = ret;
 
-avpriv_slicethread_execute(c->thread, job_count, 0);
+avpriv_slicethread_execute(c->thread, job_count, !!c->m_func);
+
 return 0;
 }
 
@@ -110,10 +119,20 @@ static int thread_execute2(AVCodecContext *avctx, 
action_func2* func2, void *arg
 return thread_execute(avctx, NULL, arg, ret, job_count, 0);
 }
 
+static int thread_execute3(AVCodecContext *avctx, action_func2* func2, 
main_func* m_func, void *arg, int *ret, int job_count)
+{
+SliceThreadContext *c = avctx->internal->thread_ctx;
+c->func2 = func2;
+c->m_func = m_func;
+return thread_execute(avctx, NULL, arg, ret, job_count, 0);
+}
+
+
 int ff_slice_thread_init(AVCodecContext *avctx)
 {
 SliceThreadContext *c;
 int thread_count = avctx->thread_count;
+static void (*m_f)(void *);
 
 #if HAVE_W32THREADS
 w32thread_init();
@@ -142,7 +161,9 @@ int ff_slice_thread_init(AVCodecContext *avctx)
 }
 
 avctx->internal->thread_ctx = c = av_mallocz(sizeof(*c));
-if (!c || (thread_count = avpriv_slicethread_create(&c->thread, avctx, 
worker_func, NULL, thread_count)) <= 1) {
+m_f = avctx->codec->capabilities & AV_CODEC_SLICE_THREAD_HAS_MF ? 
&main_function : NULL;
+
+if (!c || (thread_count =

[FFmpeg-devel] [PATCH 0/2] Tile threading support for vp9

2017-08-27 Thread Ilia Valiakhmetov
These patches introduce tile threading support for vp9.

Tile threading is ~45% faster at 2 threads vs 1.
Frame threading is ~55% faster at 2 threads vs 1.
ffvp9 tile threading is ~25% faster than libvpx-vp9 at 2 threads

The execute3() function is similar to execute2(), except that it takes
an extra argument: the main function for avpriv_slicethread_create(), which is
used for the loopfilter.

Ilia Valiakhmetov (2):
  avcodec: add execute3() api to utilize the main function of
avpriv_slicethread_create().
  avcodec/vp9: Add tile threading support

 libavcodec/avcodec.h |   7 +-
 libavcodec/options.c |   1 +
 libavcodec/pthread_slice.c   |  27 +-
 libavcodec/utils.c   |  13 +
 libavcodec/vp9.c | 591 +--
 libavcodec/vp9_mc_template.c | 202 +++
 libavcodec/vp9block.c| 526 +++---
 libavcodec/vp9dec.h  | 106 +---
 libavcodec/vp9mvs.c  |  97 +++
 libavcodec/vp9prob.c |  64 ++---
 libavcodec/vp9recon.c| 157 ++--
 11 files changed, 1036 insertions(+), 755 deletions(-)

-- 
2.8.3

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel