Hi,

2015-02-02 17:16 GMT+01:00 Mickaël Raulet <mrau...@insa-rennes.fr>:
> https://github.com/OpenHEVC/FFmpeg/commit/940300945995c20f7583394ebe6907e72829b4a
That commit no longer applies cleanly, as multiple fixes and improvements have been committed since then. The attached patch fixes that, and passes on a non-AVX2 machine. I can't test it beyond that, and I'm not looking forward to debugging through an SSH shell.

Also, who is the actual author? It has been committed under your name, but shouldn't that rather be P-E Lepere? And I guess I'll drop the previous patch for now.

-- 
Christophe
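For reference, taking the decicycle figures in the commit message below at face value, the AVX2 code comes out roughly 1.65x faster for luma_bi_1 (33304/20185), 1.55x for luma_bi_2 (38138/24620) and 1.30x for luma_uni (13490/10397). That is my arithmetic, and it assumes the timer-harness numbers scale linearly.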
From f326724af77a65acc42eaabf17db6c30e8b7f75c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micka=C3=ABl=20Raulet?= <mrau...@insa-rennes.fr>
Date: Thu, 31 Jul 2014 01:15:20 +0200
Subject: [PATCH] x86/hevc: add MC AVX2 optimizations

before
33304 decicycles in luma_bi_1, 523066 runs, 1222 skips
38138 decicycles in luma_bi_2, 523427 runs, 861 skips
13490 decicycles in luma_uni, 516138 runs, 8150 skips

after
20185 decicycles in luma_bi_1, 519970 runs, 4318 skips
24620 decicycles in luma_bi_2, 521024 runs, 3264 skips
10397 decicycles in luma_uni, 515715 runs, 8573 skips

Conflicts:
	libavcodec/x86/hevc_mc.asm
	libavcodec/x86/hevcdsp_init.c
---
 libavcodec/x86/hevc_mc.asm    | 584 +++++++++++++++++++++++++++++++-----------
 libavcodec/x86/hevcdsp.h      | 105 ++++++++
 libavcodec/x86/hevcdsp_init.c | 371 ++++++++++++++++++++++++++-
 3 files changed, 910 insertions(+), 150 deletions(-)

diff --git a/libavcodec/x86/hevc_mc.asm b/libavcodec/x86/hevc_mc.asm
index 8f9f939..e8d1e3a 100644
--- a/libavcodec/x86/hevc_mc.asm
+++ b/libavcodec/x86/hevc_mc.asm
@@ -20,19 +20,20 @@
 ; */
 %include "libavutil/x86/x86util.asm"
 
-SECTION_RODATA
-pw_8:          times 8 dw (1 << 9)
-pw_10:         times 8 dw (1 << 11)
-pw_12:         times 8 dw (1 << 13)
-pw_bi_8:       times 8 dw (1 << 8)
-pw_bi_10:      times 8 dw (1 << 10)
-pw_bi_12:      times 8 dw (1 << 12)
-max_pixels_10: times 8 dw ((1 << 10)-1)
-max_pixels_12: times 8 dw ((1 << 12)-1)
-zero:          times 4 dd 0
-one_per_32:    times 4 dd 1
-
-SECTION .text
+SECTION_RODATA 32
+pw_8:          times 16 dw (1 << 9)
+pw_10:         times 16 dw (1 << 11)
+pw_12:         times 16 dw (1 << 13)
+pw_bi_8:       times 16 dw (1 << 8)
+pw_bi_10:      times 16 dw (1 << 10)
+pw_bi_12:      times 16 dw (1 << 12)
+max_pixels_8:  times 16 dw ((1 << 8)-1)
+max_pixels_10: times 16 dw ((1 << 10)-1)
+max_pixels_12: times 16 dw ((1 << 12)-1)
+zero:          times 8 dd 0
+one_per_32:    times 8 dd 1
+
+SECTION_TEXT 32
 %macro EPEL_TABLE 4
 hevc_epel_filters_%4_%1 times %2 d%3 -2, 58
                         times %2 d%3 10, -2
@@ -51,6 +52,8 @@ hevc_epel_filters_%4_%1 times %2 d%3 -2, 58
 %endmacro
 
+EPEL_TABLE  8,16, b, avx2
+EPEL_TABLE 10, 8, w, avx2
 EPEL_TABLE  8, 8, b, sse4
 EPEL_TABLE 10, 4, w, sse4
@@ -75,10 +78,15 @@ QPEL_TABLE  8, 8, b, sse4
 QPEL_TABLE 10, 4, w, sse4
 QPEL_TABLE 12, 4, w, sse4
 
+QPEL_TABLE  8,16, b, avx2
+QPEL_TABLE 10, 8, w, avx2
+
 %define MAX_PB_SIZE 64
 
 %define hevc_qpel_filters_sse4_14 hevc_qpel_filters_sse4_10
 
+%define hevc_qpel_filters_avx2_14 hevc_qpel_filters_avx2_10
+
 %if ARCH_X86_64
 
 %macro SIMPLE_BILOAD 4   ;width, tab, r1, r2
@@ -87,11 +95,22 @@ QPEL_TABLE 12, 4, w, sse4
 %elif %1 <= 8
     movdqa        %3, [%2]          ; load data from source2
 %elif %1 <= 12
+%if avx_enabled
+    mova          %3, [%2]
+%else
     movdqa        %3, [%2]          ; load data from source2
     movq          %4, [%2+16]       ; load data from source2
+%endif ;avx
+%elif %1 <= 16
+%if avx_enabled
+    movu          %3, [%2]
 %else
     movdqa        %3, [%2]          ; load data from source2
     movdqa        %4, [%2+16]       ; load data from source2
+%endif ; avx
+%else ; %1 = 32
+    movu          %3, [%2]
+    movu          %4, [%2+32]
 %endif
 %endmacro
@@ -100,71 +119,108 @@ QPEL_TABLE 12, 4, w, sse4
     movd          %4, [%3]          ; load data from source
 %elif %1 == 4 || (%2 == 8 && %1 <= 8)
     movq          %4, [%3]          ; load data from source
+%elif notcpuflag(avx)
+    movu          %4, [%3]          ; load data from source
+%elif %1 <= 8 || (%2 == 8 && %1 <= 16)
+    movdqu        %4, [%3]
 %else
-    movdqu        %4, [%3]          ; load data from source
+    movu          %4, [%3]
 %endif
 %endmacro
 
-%macro SIMPLE_8LOAD 5    ;width, bitd, tab, r1, r2
-%if %1 == 2 || (%2 == 8 && %1 <= 4)
-    movq          %4, [%3]          ; load data from source2
-%elif %1 == 4 || (%2 == 8 && %1 <= 8)
-    movdqa        %4, [%3]          ; load data from source2
-%elif %1 <= 12
-    movdqa        %4, [%3]          ; load data from source2
-    movq          %5, [%3+16]       ; load data from source2
-%else
-    movdqa        %4, [%3]          ; load data from source2
-    movdqa        %5, [%3+16]       ; load data from source2
-%endif
-%endmacro
 
 %macro EPEL_FILTER 2-4 ; bit depth, filter index
+%if avx_enabled
+%assign %%offset 32
+%ifdef PIC
+    lea           rfilterq, [hevc_epel_filters_avx2_%1]
+%else
+    %define rfilterq hevc_epel_filters_avx2_%1
+%endif
+%else
+%assign %%offset 16
 %ifdef PIC
     lea           rfilterq, [hevc_epel_filters_sse4_%1]
 %else
    %define rfilterq hevc_epel_filters_sse4_%1
 %endif
+%endif ;avx_enabled
    sub           %2q, 1
+%if avx_enabled
+    shl           %2q, 6                      ; multiply by 64
+%else
    shl           %2q, 5                      ; multiply by 32
-    movdqa        %3, [rfilterq + %2q]        ; get 2 first values of filters
-    movdqa        %4, [rfilterq + %2q+16]     ; get 2 last values of filters
+%endif
+%if %0 == 2
+    mova          m14, [rfilterq + %2q]           ; get 2 first values of filters
+    mova          m15, [rfilterq + %2q+%%offset]  ; get 2 last values of filters
+%else
+    mova          %3, [rfilterq + %2q]            ; get 2 first values of filters
+    mova          %4, [rfilterq + %2q+%%offset]   ; get 2 last values of filters
+%endif
 %endmacro
 
 %macro EPEL_HV_FILTER 1
+%if avx_enabled
+%assign %%offset 32
+%assign %%shift  6
+%define %%table  hevc_epel_filters_avx2_%1
+%else
+%assign %%offset 16
+%assign %%shift  5
+%define %%table  hevc_epel_filters_sse4_%1
+%endif
+
 %ifdef PIC
-    lea           rfilterq, [hevc_epel_filters_sse4_%1]
+    lea           rfilterq, [%%table]
 %else
-    %define rfilterq hevc_epel_filters_sse4_%1
+    %define rfilterq %%table
 %endif
    sub           mxq, 1
    sub           myq, 1
-    shl           mxq, 5                      ; multiply by 32
-    shl           myq, 5                      ; multiply by 32
-    movdqa        m14, [rfilterq + mxq]       ; get 2 first values of filters
-    movdqa        m15, [rfilterq + mxq+16]    ; get 2 last values of filters
+    shl           mxq, %%shift                ; multiply by 32
+    shl           myq, %%shift                ; multiply by 32
+    mova          m14, [rfilterq + mxq]           ; get 2 first values of filters
+    mova          m15, [rfilterq + mxq+%%offset]  ; get 2 last values of filters
    lea           r3srcq, [srcstrideq*3]
 
+%if avx_enabled
+%define %%table  hevc_epel_filters_avx2_10
+%else
+%define %%table  hevc_epel_filters_sse4_10
+%endif
 %ifdef PIC
-    lea           rfilterq, [hevc_epel_filters_sse4_10]
+    lea           rfilterq, [%%table]
 %else
-    %define rfilterq hevc_epel_filters_sse4_10
+    %define rfilterq %%table
 %endif
-    movdqa        m12, [rfilterq + myq]       ; get 2 first values of filters
-    movdqa        m13, [rfilterq + myq+16]    ; get 2 last values of filters
+    mova          m12, [rfilterq + myq]           ; get 2 first values of filters
+    mova          m13, [rfilterq + myq+%%offset]  ; get 2 last values of filters
 %endmacro
 
 %macro QPEL_FILTER 2
+
+%if avx_enabled
+%assign %%offset 32
+%assign %%shift  7
+%define %%table  hevc_qpel_filters_avx2_%1
+%else
+%assign %%offset 16
+%assign %%shift  6
+%define %%table  hevc_qpel_filters_sse4_%1
+%endif
+
 %ifdef PIC
-    lea           rfilterq, [hevc_qpel_filters_sse4_%1]
+    lea           rfilterq, [%%table]
 %else
-    %define rfilterq hevc_qpel_filters_sse4_%1
+    %define rfilterq %%table
 %endif
-    lea           %2q, [%2q*8-8]
-    movdqa        m12, [rfilterq + %2q*8]       ; get 4 first values of filters
-    movdqa        m13, [rfilterq + %2q*8 + 16]  ; get 4 first values of filters
-    movdqa        m14, [rfilterq + %2q*8 + 32]  ; get 4 first values of filters
-    movdqa        m15, [rfilterq + %2q*8 + 48]  ; get 4 first values of filters
+    sub           %2q, 1
+    shl           %2q, %%shift                     ; multiply by 32
+    mova          m12, [rfilterq + %2q]            ; get 4 first values of filters
+    mova          m13, [rfilterq + %2q + %%offset]   ; get 4 first values of filters
+    mova          m14, [rfilterq + %2q + 2*%%offset] ; get 4 first values of filters
+    mova          m15, [rfilterq + %2q + 3*%%offset] ; get 4 first values of filters
 %endmacro
 
 %macro EPEL_LOAD 4
@@ -191,19 +247,18 @@ QPEL_TABLE 12, 4, w, sse4
 %%load        m2, [rfilterq+2*%3q]
 %%load        m3, [rfilterq+r3srcq]
 %endif
-
 %if %1 == 8
 %if %4 > 8
-    SBUTTERFLY    bw, 0, 1, 10
-    SBUTTERFLY    bw, 2, 3, 10
+    SBUTTERFLY    bw, 0, 1, 7
+    SBUTTERFLY    bw, 2, 3, 7
 %else
    punpcklbw     m0, m1
    punpcklbw     m2, m3
 %endif
 %else
 %if %4 > 4
-    SBUTTERFLY    wd, 0, 1, 10
-    SBUTTERFLY    wd, 2, 3, 10
+    SBUTTERFLY    wd, 0, 1, 7
+    SBUTTERFLY    wd, 2, 3, 7
 %else
    punpcklwd     m0, m1
    punpcklwd     m2, m3
@@ -220,7 +275,7 @@ QPEL_TABLE 12, 4, w, sse4
 %elif %3 == 8
 %define %%load movq
 %else
-%define %%load movdqu
+%define %%load movu
 %endif
 %else
 %if %3 == 2
@@ -228,7 +283,7 @@ QPEL_TABLE 12, 4, w, sse4
 %elif %3 == 4
 %define %%load movq
 %else
-%define %%load movdqu
+%define %%load movu
 %endif
 %endif
 %%load        m0, [%2-3*%%stride]        ;load data from source
@@ -247,10 +302,10 @@ QPEL_TABLE 12, 4, w, sse4
    SBUTTERFLY    wd, 4, 5, %4
    SBUTTERFLY    wd, 6, 7, %4
 %else
-    punpcklwd     m0, m1
-    punpcklwd     m2, m3
-    punpcklwd     m4, m5
-    punpcklwd     m6, m7
+    punpcklbw     m0, m1
+    punpcklbw     m2, m3
+    punpcklbw     m4, m5
+    punpcklbw     m6, m7
 %endif
 %else
 %if %3 > 4
@@ -259,10 +314,10 @@ QPEL_TABLE 12, 4, w, sse4
    SBUTTERFLY    dq, 4, 5, %4
    SBUTTERFLY    dq, 6, 7, %4
 %else
-    punpckldq     m0, m1
-    punpckldq     m2, m3
-    punpckldq     m4, m5
-    punpckldq     m6, m7
+    punpcklwd     m0, m1
+    punpcklwd     m2, m3
+    punpcklwd     m4, m5
+    punpcklwd     m6, m7
 %endif
 %endif
 %endmacro
@@ -270,14 +325,14 @@ QPEL_TABLE 12, 4, w, sse4
 %macro QPEL_V_LOAD 5
    lea           %5q, [%2]
    sub           %5q, r3srcq
-    movdqu        m0, [%5q ]            ;load x- 3*srcstride
-    movdqu        m1, [%5q+ %3q ]       ;load x- 2*srcstride
-    movdqu        m2, [%5q+ 2*%3q ]     ;load x-srcstride
-    movdqu        m3, [%2 ]             ;load x
-    movdqu        m4, [%2+ %3q]         ;load x+stride
-    movdqu        m5, [%2+ 2*%3q]       ;load x+2*stride
-    movdqu        m6, [%2+r3srcq]       ;load x+3*stride
-    movdqu        m7, [%2+ 4*%3q]       ;load x+4*stride
+    movu          m0, [%5q ]            ;load x- 3*srcstride
+    movu          m1, [%5q+ %3q ]       ;load x- 2*srcstride
+    movu          m2, [%5q+ 2*%3q ]     ;load x-srcstride
+    movu          m3, [%2 ]             ;load x
+    movu          m4, [%2+ %3q]         ;load x+stride
+    movu          m5, [%2+ 2*%3q]       ;load x+2*stride
+    movu          m6, [%2+r3srcq]       ;load x+3*stride
+    movu          m7, [%2+ 4*%3q]       ;load x+4*stride
 %if %1 == 8
 %if %4 > 8
    SBUTTERFLY    bw, 0, 1, 8
@@ -347,8 +402,17 @@ QPEL_TABLE 12, 4, w, sse4
    movq          [%1+16], %3
 %endmacro
 %macro PEL_10STORE16 3
+%if avx_enabled
+    movu          [%1], %2
+%else
    PEL_10STORE8  %1, %2, %3
    movdqa        [%1+16], %3
+%endif
+%endmacro
+
+%macro PEL_10STORE32 3
+    PEL_10STORE16 %1, %2, %3
+    movu          [%1+32], %3
 %endmacro
 
 %macro PEL_8STORE2 3
@@ -370,7 +434,14 @@ QPEL_TABLE 12, 4, w, sse4
    movd          [%1+8], %2
 %endmacro
 %macro PEL_8STORE16 3
-    movdqa        [%1], %2
+%if avx_enabled
+    movdqu        [%1], %2
+%else
+    mova          [%1], %2
+%endif ; avx
+%endmacro
+%macro PEL_8STORE32 3
+    movu          [%1], %2
 %endmacro
 
 %macro LOOP_END 3
@@ -381,65 +452,109 @@ QPEL_TABLE 12, 4, w, sse4
 %endmacro
 
-
-%macro MC_PIXEL_COMPUTE 2 ;width, bitdepth
+%macro MC_PIXEL_COMPUTE 2-3 ;width, bitdepth
 %if %2 == 8
+%if avx_enabled && %0 ==3
+%if %1 > 16
+    vextracti128  xm1, m0, 1
+    pmovzxbw      m1, xm1
+    psllw         m1, 14-%2
+%endif
+    pmovzxbw      m0, xm0
+%else ; not avx
 %if %1 > 8
-    punpckhbw     m1, m0, m2
-    psllw         m1, 14-%2
+    punpckhbw    m1, m0, m2
+    psllw        m1, 14-%2
 %endif
-    punpcklbw     m0, m2
+    punpcklbw    m0, m2
 %endif
-    psllw         m0, 14-%2
+%endif ;avx
+    psllw        m0, 14-%2
 %endmacro
 
-%macro EPEL_COMPUTE 4 ; bitdepth, width, filter1, filter2
+%macro EPEL_COMPUTE 4-8 ; bitdepth, width, filter1, filter2, HV/m0, m2, m1, m3
+%if %0 == 8
+%define %%reg0 %5
+%define %%reg2 %6
+%define %%reg1 %7
+%define %%reg3 %8
+%else
+%define %%reg0 m0
+%define %%reg2 m2
+%define %%reg1 m1
+%define %%reg3 m3
+%endif
 %if %1 == 8
-    pmaddubsw     m0, %3   ;x1*c1+x2*c2
-    pmaddubsw     m2, %4   ;x3*c3+x4*c4
-    paddw         m0, m2
+%if avx_enabled && (%0 == 5)
+%if %2 > 16
+    vextracti128  xm10, m0, 1
+    vinserti128   m10, m1, xm10, 0
+%endif
+    vinserti128   m0, m0, xm1, 1
+    mova          m1, m10
+%if %2 > 16
+    vextracti128  xm10, m2, 1
+    vinserti128   m10, m3, xm10, 0
+%endif
+    vinserti128   m2, m2, xm3, 1
+    mova          m3, m10
+%endif
+    pmaddubsw     %%reg0, %3   ;x1*c1+x2*c2
+    pmaddubsw     %%reg2, %4   ;x3*c3+x4*c4
+    paddw         %%reg0, %%reg2
 %if %2 > 8
-    pmaddubsw     m1, %3
-    pmaddubsw     m3, %4
-    paddw         m1, m3
+    pmaddubsw     %%reg1, %3
+    pmaddubsw     %%reg3, %4
+    paddw         %%reg1, %%reg3
 %endif
 %else
-    pmaddwd       m0, %3
-    pmaddwd       m2, %4
-    paddd         m0, m2
+    pmaddwd       %%reg0, %3
+    pmaddwd       %%reg2, %4
+    paddd         %%reg0, %%reg2
 %if %2 > 4
-    pmaddwd       m1, %3
-    pmaddwd       m3, %4
-    paddd         m1, m3
+    pmaddwd       %%reg1, %3
+    pmaddwd       %%reg3, %4
+    paddd         %%reg1, %%reg3
+%if %1 != 8
+    psrad         %%reg1, %1-8
+%endif
 %endif
 %if %1 != 8
-    psrad         m0, %1-8
-    psrad         m1, %1-8
+    psrad         %%reg0, %1-8
 %endif
-    packssdw      m0, m1
+    packssdw      %%reg0, %%reg1
 %endif
 %endmacro
 
 %macro QPEL_HV_COMPUTE 4     ; width, bitdepth, filter idx
+
+%if avx_enabled
+%assign %%offset 32
+%define %%table  hevc_qpel_filters_avx2_%2
+%else
+%assign %%offset 16
+%define %%table  hevc_qpel_filters_sse4_%2
+%endif
+
 %ifdef PIC
-    lea           rfilterq, [hevc_qpel_filters_sse4_%2]
+    lea           rfilterq, [%%table]
 %else
-    %define rfilterq hevc_qpel_filters_sse4_%2
+    %define rfilterq %%table
 %endif
 
 %if %2 == 8
    pmaddubsw     m0, [rfilterq + %3q*8   ]   ;x1*c1+x2*c2
-    pmaddubsw     m2, [rfilterq + %3q*8+16]   ;x3*c3+x4*c4
-    pmaddubsw     m4, [rfilterq + %3q*8+32]   ;x5*c5+x6*c6
-    pmaddubsw     m6, [rfilterq + %3q*8+48]   ;x7*c7+x8*c8
+    pmaddubsw     m2, [rfilterq + %3q*8+%%offset]   ;x3*c3+x4*c4
+    pmaddubsw     m4, [rfilterq + %3q*8+2*%%offset] ;x5*c5+x6*c6
+    pmaddubsw     m6, [rfilterq + %3q*8+3*%%offset] ;x7*c7+x8*c8
    paddw         m0, m2
    paddw         m4, m6
    paddw         m0, m4
 %else
    pmaddwd       m0, [rfilterq + %3q*8   ]
-    pmaddwd       m2, [rfilterq + %3q*8+16]
-    pmaddwd       m4, [rfilterq + %3q*8+32]
-    pmaddwd       m6, [rfilterq + %3q*8+48]
+    pmaddwd       m2, [rfilterq + %3q*8+%%offset]
+    pmaddwd       m4, [rfilterq + %3q*8+2*%%offset]
+    pmaddwd       m6, [rfilterq + %3q*8+3*%%offset]
    paddd         m0, m2
    paddd         m4, m6
    paddd         m0, m4
@@ -448,9 +563,9 @@ QPEL_TABLE 12, 4, w, sse4
 %endif
 %if %1 > 4
    pmaddwd       m1, [rfilterq + %3q*8   ]
-    pmaddwd       m3, [rfilterq + %3q*8+16]
-    pmaddwd       m5, [rfilterq + %3q*8+32]
-    pmaddwd       m7, [rfilterq + %3q*8+48]
+    pmaddwd       m3, [rfilterq + %3q*8+%%offset]
+    pmaddwd       m5, [rfilterq + %3q*8+2*%%offset]
+    pmaddwd       m7, [rfilterq + %3q*8+3*%%offset]
    paddd         m1, m3
    paddd         m5, m7
    paddd         m1, m5
@@ -462,8 +577,32 @@ QPEL_TABLE 12, 4, w, sse4
 %endif
 %endmacro
 
-%macro QPEL_COMPUTE 2 ; width, bitdepth
+%macro QPEL_COMPUTE 2-3 ; width, bitdepth
 %if %2 == 8
+%if avx_enabled && (%0 == 3)
+
+    vextracti128  xm10, m0, 1
+    vinserti128   m10, m1, xm10, 0
+    vinserti128   m0, m0, xm1, 1
+    mova          m1, m10
+
+    vextracti128  xm10, m2, 1
+    vinserti128   m10, m3, xm10, 0
+    vinserti128   m2, m2, xm3, 1
+    mova          m3, m10
+
+
+    vextracti128  xm10, m4, 1
+    vinserti128   m10, m5, xm10, 0
+    vinserti128   m4, m4, xm5, 1
+    mova          m5, m10
+
+    vextracti128  xm10, m6, 1
+    vinserti128   m10, m7, xm10, 0
+    vinserti128   m6, m6, xm7, 1
+    mova          m7, m10
+%endif
+
    pmaddubsw     m0, m12   ;x1*c1+x2*c2
    pmaddubsw     m2, m13   ;x3*c3+x4*c4
    pmaddubsw     m4, m14   ;x5*c5+x6*c6
@@ -506,12 +645,16 @@ QPEL_TABLE 12, 4, w, sse4
 %endif
 %endmacro
 
-%macro BI_COMPUTE 7 ; width, bitd, src1l, src1h, scr2l, scr2h, pw
+%macro BI_COMPUTE 7-8 ; width, bitd, src1l, src1h, scr2l, scr2h, pw
    paddsw        %3, %5
 %if %1 > 8
    paddsw        %4, %6
 %endif
    UNI_COMPUTE   %1, %2, %3, %4, %7
+%if %0 == 8 && avx_enabled && (%2 == 8)
+    vpermq        %3, %3, 216
+    vpermq        %4, %4, 216
+%endif
 %endmacro
 
 %macro UNI_COMPUTE 5
@@ -524,14 +667,14 @@ QPEL_TABLE 12, 4, w, sse4
 %else
    pminsw        %3, [max_pixels_%2]
    pmaxsw        %3, [zero]
-%if %1 > 8
+%if (%1 > 8 && notcpuflag(avx)) || %1 > 16
    pminsw        %4, [max_pixels_%2]
    pmaxsw        %4, [zero]
 %endif
 %endif
 %endmacro
 
-INIT_XMM sse4                                ; adds ff_ and _sse4 to function name
+
 ; ******************************
 ; void put_hevc_mc_pixels(int16_t *dst, ptrdiff_t dststride,
 ;                         uint8_t *_src, ptrdiff_t _srcstride,
@@ -539,16 +682,25 @@ INIT_XMM sse4 ; adds ff_ and _sse4 to functio
 ; ******************************
 
 %macro HEVC_PUT_HEVC_PEL_PIXELS 2
+HEVC_PEL_PIXELS     %1, %2
+HEVC_UNI_PEL_PIXELS %1, %2
+HEVC_BI_PEL_PIXELS  %1, %2
+%endmacro
+
+%macro HEVC_PEL_PIXELS 2
 cglobal hevc_put_hevc_pel_pixels%1_%2, 4, 4, 3, dst, src, srcstride,height
    pxor              m2, m2
 .loop
    SIMPLE_LOAD       %1, %2, srcq, m0
-    MC_PIXEL_COMPUTE  %1, %2
+    MC_PIXEL_COMPUTE  %1, %2, 1
    PEL_10STORE%1     dstq, m0, m1
    LOOP_END          dst, src, srcstride
    RET
+
 %endmacro
 
-cglobal hevc_put_hevc_uni_pel_pixels%1_%2, 5, 5, 2, dst, dststride, src, srcstride,height
+%macro HEVC_UNI_PEL_PIXELS 2
+cglobal hevc_put_hevc_uni_pel_pixels%1_%2, 5, 5, 3, dst, dststride, src, srcstride,height
+    pxor              m2, m2
 .loop
    SIMPLE_LOAD       %1, %2, srcq, m0
    PEL_%2STORE%1     dstq, m0, m1
@@ -557,15 +709,17 @@ cglobal hevc_put_hevc_uni_pel_pixels%1_%2, 5, 5, 2, dst, dststride, src, srcstri
    dec               heightd                     ; cmp height
    jnz               .loop                       ; height loop
    RET
+%endmacro
 
+%macro HEVC_BI_PEL_PIXELS 2
 cglobal hevc_put_hevc_bi_pel_pixels%1_%2, 6, 6, 6, dst, dststride, src, srcstride, src2, height
    pxor              m2, m2
    movdqa            m5, [pw_bi_%2]
 .loop
    SIMPLE_LOAD       %1, %2, srcq, m0
    SIMPLE_BILOAD     %1, src2q, m3, m4
-    MC_PIXEL_COMPUTE  %1, %2
-    BI_COMPUTE        %1, %2, m0, m1, m3, m4, m5
+    MC_PIXEL_COMPUTE  %1, %2, 1
+    BI_COMPUTE        %1, %2, m0, m1, m3, m4, m5, 1
    PEL_%2STORE%1     dstq, m0, m1
    add               dstq, dststrideq            ; dst += dststride
    add               srcq, srcstrideq            ; src += srcstride
@@ -573,7 +727,6 @@ cglobal hevc_put_hevc_bi_pel_pixels%1_%2, 6, 6, 6, dst, dststride, src, srcstrid
    dec               heightd                     ; cmp height
    jnz               .loop                       ; height loop
    RET
-
 %endmacro
@@ -591,7 +744,7 @@ cglobal hevc_put_hevc_epel_h%1_%2, 5, 6, 11, dst, src, srcstride, height, mx, rf
    EPEL_FILTER       %2, mx, m4, m5
 .loop
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
-    EPEL_COMPUTE      %2, %1, m4, m5
+    EPEL_COMPUTE      %2, %1, m4, m5, 1
    PEL_10STORE%1     dstq, m0, m1
    LOOP_END          dst, src, srcstride
    RET
@@ -616,9 +769,9 @@ cglobal hevc_put_hevc_bi_epel_h%1_%2, 7, 8, 11, dst, dststride, src, srcstride,
    EPEL_FILTER       %2, mx, m4, m5
 .loop
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
-    EPEL_COMPUTE      %2, %1, m4, m5
+    EPEL_COMPUTE      %2, %1, m4, m5, 1
    SIMPLE_BILOAD     %1, src2q, m2, m3
-    BI_COMPUTE        %1, %2, m0, m1, m2, m3, m6
+    BI_COMPUTE        %1, %2, m0, m1, m2, m3, m6, 1
    PEL_%2STORE%1     dstq, m0, m1
    add               dstq, dststrideq            ; dst += dststride
    add               srcq, srcstrideq            ; src += srcstride
@@ -640,7 +793,7 @@ cglobal hevc_put_hevc_epel_v%1_%2, 6, 7, 11, dst, src, srcstride, height, r3src,
    EPEL_FILTER       %2, my, m4, m5
 .loop
    EPEL_LOAD         %2, srcq, srcstride, %1
-    EPEL_COMPUTE      %2, %1, m4, m5
+    EPEL_COMPUTE      %2, %1, m4, m5, 1
    PEL_10STORE%1     dstq, m0, m1
    LOOP_END          dst, src, srcstride
    RET
@@ -669,9 +822,9 @@ cglobal hevc_put_hevc_bi_epel_v%1_%2, 8, 9, 11, dst, dststride, src, srcstride,
    EPEL_FILTER       %2, my, m4, m5
 .loop
    EPEL_LOAD         %2, srcq, srcstride, %1
-    EPEL_COMPUTE      %2, %1, m4, m5
+    EPEL_COMPUTE      %2, %1, m4, m5, 1
    SIMPLE_BILOAD     %1, src2q, m2, m3
-    BI_COMPUTE        %1, %2, m0, m1, m2, m3, m6
+    BI_COMPUTE        %1, %2, m0, m1, m2, m3, m6, 1
    PEL_%2STORE%1     dstq, m0, m1
    add               dstq, dststrideq            ; dst += dststride
    add               srcq, srcstrideq            ; src += srcstride
@@ -695,19 +848,31 @@ cglobal hevc_put_hevc_epel_hv%1_%2, 6, 8, 16 , dst, src, srcstride, height, mx,
    EPEL_HV_FILTER    %2
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE      %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+    SWAP m8, m1
+%endif
    SWAP m4, m0
    add               srcq, srcstrideq
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE      %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+    SWAP m9, m1
+%endif
    SWAP m5, m0
    add               srcq, srcstrideq
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE      %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+    SWAP m10, m1
+%endif
    SWAP m6, m0
    add               srcq, srcstrideq
 .loop
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE      %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+    SWAP m11, m1
+%endif
    SWAP m7, m0
    punpcklwd         m0, m4, m5
    punpcklwd         m2, m6, m7
@@ -716,10 +881,31 @@ cglobal hevc_put_hevc_epel_hv%1_%2, 6, 8, 16 , dst, src, srcstride, height, mx,
    punpckhwd         m3, m6, m7
 %endif
    EPEL_COMPUTE      14, %1, m12, m13
+%if (%1 > 8 && (%2 == 8))
+    punpcklwd         m4, m8, m9
+    punpcklwd         m2, m10, m11
+    punpckhwd         m8, m8, m9
+    punpckhwd         m3, m10, m11
+    EPEL_COMPUTE      14, %1, m12, m13, m4, m2, m8, m3
+%if avx_enabled
+    vinserti128       m2, m0, xm4, 1
+    vextracti128      xm3, m0, 1
+    vinserti128       m3, m4, xm3, 0
+    PEL_10STORE%1     dstq, m2, m3
+%else
+    PEL_10STORE%1     dstq, m0, m4
+%endif
+%else
    PEL_10STORE%1     dstq, m0, m1
+%endif
    movdqa            m4, m5
    movdqa            m5, m6
    movdqa            m6, m7
+%if (%1 > 8 && (%2 == 8))
+    mova              m8, m9
+    mova              m9, m10
+    mova              m10, m11
+%endif
    LOOP_END          dst, src, srcstride
    RET
@@ -729,20 +915,32 @@ cglobal hevc_put_hevc_uni_epel_hv%1_%2, 7, 9, 16 , dst, dststride, src, srcstrid
    EPEL_HV_FILTER    %2
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE      %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+    SWAP m8, m1
+%endif
    SWAP m4, m0
    add               srcq, srcstrideq
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE      %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+    SWAP m9, m1
+%endif
    SWAP m5, m0
    add               srcq, srcstrideq
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE      %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+    SWAP m10, m1
+%endif
    SWAP m6, m0
    add               srcq, srcstrideq
 .loop
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE      %2, %1, m14, m15
-    SWAP m7, m0
+%if (%1 > 8 && (%2 == 8))
+    SWAP m11, m1
+%endif
+    mova              m7, m0
    punpcklwd         m0, m4, m5
    punpcklwd         m2, m6, m7
 %if %1 > 4
@@ -750,37 +948,62 @@ cglobal hevc_put_hevc_uni_epel_hv%1_%2, 7, 9, 16 , dst, dststride, src, srcstrid
    punpckhwd         m3, m6, m7
 %endif
    EPEL_COMPUTE      14, %1, m12, m13
+%if (%1 > 8 && (%2 == 8))
+    punpcklwd         m4, m8, m9
+    punpcklwd         m2, m10, m11
+    punpckhwd         m8, m8, m9
+    punpckhwd         m3, m10, m11
+    EPEL_COMPUTE      14, %1, m12, m13, m4, m2, m8, m3
+    UNI_COMPUTE       %1, %2, m0, m4, [pw_%2]
+%else
    UNI_COMPUTE       %1, %2, m0, m1, [pw_%2]
+%endif
    PEL_%2STORE%1     dstq, m0, m1
-    movdqa            m4, m5
-    movdqa            m5, m6
-    movdqa            m6, m7
+    mova              m4, m5
+    mova              m5, m6
+    mova              m6, m7
+%if (%1 > 8 && (%2 == 8))
+    mova              m8, m9
+    mova              m9, m10
+    mova              m10, m11
+%endif
    add               dstq, dststrideq            ; dst += dststride
    add               srcq, srcstrideq            ; src += srcstride
    dec               heightd                     ; cmp height
    jnz               .loop                       ; height loop
    RET
-
 cglobal hevc_put_hevc_bi_epel_hv%1_%2, 8, 10, 16, dst, dststride, src, srcstride, src2, height, mx, my, r3src, rfilter
 %assign %%stride ((%2 + 7)/8)
    sub               srcq, srcstrideq
    EPEL_HV_FILTER    %2
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE      %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+    SWAP m8, m1
+%endif
    SWAP m4, m0
    add               srcq, srcstrideq
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE      %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+    SWAP m9, m1
+%endif
    SWAP m5, m0
    add               srcq, srcstrideq
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE      %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+    SWAP m10, m1
+%endif
    SWAP m6, m0
    add               srcq, srcstrideq
 .loop
    EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE      %2, %1, m14, m15
+%if (%1 > 8 && (%2 == 8))
+    SWAP m11, m1
+%endif
    SWAP m7, m0
    punpcklwd         m0, m4, m5
    punpcklwd         m2, m6, m7
@@ -789,12 +1012,34 @@ cglobal hevc_put_hevc_bi_epel_hv%1_%2, 8, 10, 16, dst, dststride, src, srcstride
    punpckhwd         m3, m6, m7
 %endif
    EPEL_COMPUTE      14, %1, m12, m13
+%if (%1 > 8 && (%2 == 8))
+    punpcklwd         m4, m8, m9
+    punpcklwd         m2, m10, m11
+    punpckhwd         m8, m8, m9
+    punpckhwd         m3, m10, m11
+    EPEL_COMPUTE      14, %1, m12, m13, m4, m2, m8, m3
+    SIMPLE_BILOAD     %1, src2q, m8, m3
+%if avx_enabled
+    vinserti128       m1, m8, xm3, 1
+    vextracti128      xm8, m8, 1
+    vinserti128       m2, m3, xm8, 0
+    BI_COMPUTE        %1, %2, m0, m4, m1, m2, [pw_bi_%2]
+%else
+    BI_COMPUTE        %1, %2, m0, m4, m8, m3, [pw_bi_%2]
+%endif
+%else
    SIMPLE_BILOAD     %1, src2q, m8, m9
    BI_COMPUTE        %1, %2, m0, m1, m8, m9, [pw_bi_%2]
-    PEL_%2STORE%1     dstq, m0, m1
-    movdqa            m4, m5
-    movdqa            m5, m6
-    movdqa            m6, m7
+%endif
+    PEL_%2STORE%1     dstq, m0, m4
+    mova              m4, m5
+    mova              m5, m6
+    mova              m6, m7
+%if (%1 > 8 && (%2 == 8))
+    mova              m8, m9
+    mova              m9, m10
+    mova              m10, m11
+%endif
    add               dstq, dststrideq            ; dst += dststride
    add               srcq, srcstrideq            ; src += srcstride
    add               src2q, 2*MAX_PB_SIZE        ; src += srcstride
@@ -814,7 +1059,7 @@ cglobal hevc_put_hevc_qpel_h%1_%2, 5, 6, 16, dst, src, srcstride, height, mx, rf
    QPEL_FILTER       %2, mx
 .loop
    QPEL_H_LOAD       %2, srcq, %1, 10
-    QPEL_COMPUTE      %1, %2
+    QPEL_COMPUTE      %1, %2, 1
 %if %2 > 8
    packssdw          m0, m1
 %endif
    PEL_10STORE%1     dstq, m0, m1
    LOOP_END          dst, src, srcstride
    RET
@@ -823,7 +1068,7 @@ cglobal hevc_put_hevc_qpel_h%1_%2, 5, 6, 16, dst, src, srcstride, height, mx, rf
    RET
 
 cglobal hevc_put_hevc_uni_qpel_h%1_%2, 6, 7, 16 , dst, dststride, src, srcstride, height, mx, rfilter
-    movdqa            m9, [pw_%2]
+    mova              m9, [pw_%2]
    QPEL_FILTER       %2, mx
 .loop
    QPEL_H_LOAD       %2, srcq, %1, 10
@@ -844,12 +1089,12 @@ cglobal hevc_put_hevc_bi_qpel_h%1_%2, 7, 8, 16 , dst, dststride, src, srcstride,
    QPEL_FILTER       %2, mx
 .loop
    QPEL_H_LOAD       %2, srcq, %1, 10
-    QPEL_COMPUTE      %1, %2
+    QPEL_COMPUTE      %1, %2, 1
 %if %2 > 8
    packssdw          m0, m1
 %endif
    SIMPLE_BILOAD     %1, src2q, m10, m11
-    BI_COMPUTE        %1, %2, m0, m1, m10, m11, m9
+    BI_COMPUTE        %1, %2, m0, m1, m10, m11, m9, 1
    PEL_%2STORE%1     dstq, m0, m1
    add               dstq, dststrideq            ; dst += dststride
    add               srcq, srcstrideq            ; src += srcstride
@@ -870,7 +1115,7 @@ cglobal hevc_put_hevc_qpel_v%1_%2, 6, 8, 16, dst, src, srcstride, height, r3src,
    QPEL_FILTER       %2, my
 .loop
    QPEL_V_LOAD       %2, srcq, srcstride, %1, r7
-    QPEL_COMPUTE      %1, %2
+    QPEL_COMPUTE      %1, %2, 1
 %if %2 > 8
    packssdw          m0, m1
 %endif
@@ -901,13 +1146,13 @@ cglobal hevc_put_hevc_bi_qpel_v%1_%2, 8, 10, 16, dst, dststride, src, srcstride,
    lea               r3srcq, [srcstrideq*3]
    QPEL_FILTER       %2, my
 .loop
-    SIMPLE_BILOAD     %1, src2q, m10, m11
    QPEL_V_LOAD       %2, srcq, srcstride, %1, r9
-    QPEL_COMPUTE      %1, %2
+    QPEL_COMPUTE      %1, %2, 1
 %if %2 > 8
    packssdw          m0, m1
 %endif
-    BI_COMPUTE        %1, %2, m0, m1, m10, m11, m9
+    SIMPLE_BILOAD     %1, src2q, m10, m11
+    BI_COMPUTE        %1, %2, m0, m1, m10, m11, m9, 1
    PEL_%2STORE%1     dstq, m0, m1
    add               dstq, dststrideq            ; dst += dststride
    add               srcq, srcstrideq            ; src += srcstride
@@ -925,8 +1170,15 @@ cglobal hevc_put_hevc_bi_qpel_v%1_%2, 8, 10, 16, dst, dststride, src, srcstride,
 ; ******************************
 %macro HEVC_PUT_HEVC_QPEL_HV 2
 cglobal hevc_put_hevc_qpel_hv%1_%2, 6, 8, 16, dst, src, srcstride, height, mx, my, r3src, rfilter
-    lea               mxq, [mxq*8-8]
-    lea               myq, [myq*8-8]
+%if avx_enabled
+%assign %%shift  4
+%else
+%assign %%shift  3
+%endif
+    sub               mxq, 1
+    sub               myq, 1
+    shl               mxq, %%shift                ; multiply by 32
+    shl               myq, %%shift                ; multiply by 32
    lea               r3srcq, [srcstrideq*3]
    sub               srcq, r3srcq
    QPEL_H_LOAD       %2, srcq, %1, 15
@@ -994,8 +1246,15 @@ cglobal hevc_put_hevc_qpel_hv%1_%2, 6, 8, 16, dst, src, srcstride, height, mx, m
    RET
 
 cglobal hevc_put_hevc_uni_qpel_hv%1_%2, 7, 9, 16 , dst, dststride, src, srcstride, height, mx, my, r3src, rfilter
-    lea               mxq, [mxq*8-8]
-    lea               myq, [myq*8-8]
+%if avx_enabled
+%assign %%shift  4
+%else
+%assign %%shift  3
+%endif
+    sub               mxq, 1
+    sub               myq, 1
+    shl               mxq, %%shift                ; multiply by 32
+    shl               myq, %%shift                ; multiply by 32
    lea               r3srcq, [srcstrideq*3]
    sub               srcq, r3srcq
    QPEL_H_LOAD       %2, srcq, %1, 15
@@ -1053,13 +1312,13 @@ cglobal hevc_put_hevc_uni_qpel_hv%1_%2, 7, 9, 16 , dst, dststride, src, srcstrid
    movq              m13, m14
    movq              m14, m15
 %else
-    movdqa            m8, m9
-    movdqa            m9, m10
-    movdqa            m10, m11
-    movdqa            m11, m12
-    movdqa            m12, m13
-    movdqa            m13, m14
-    movdqa            m14, m15
+    mova              m8, m9
+    mova              m9, m10
+    mova              m10, m11
+    mova              m11, m12
+    mova              m12, m13
+    mova              m13, m14
+    mova              m14, m15
 %endif
    add               dstq, dststrideq            ; dst += dststride
    add               srcq, srcstrideq            ; src += srcstride
@@ -1068,8 +1327,15 @@ cglobal hevc_put_hevc_uni_qpel_hv%1_%2, 7, 9, 16 , dst, dststride, src, srcstrid
    RET
 
 cglobal hevc_put_hevc_bi_qpel_hv%1_%2, 8, 10, 16, dst, dststride, src, srcstride, src2, height, mx, my, r3src, rfilter
-    lea               mxq, [mxq*8-8]
-    lea               myq, [myq*8-8]
+%if avx_enabled
+%assign %%shift  4
+%else
+%assign %%shift  3
+%endif
+    sub               mxq, 1
+    sub               myq, 1
+    shl               mxq, %%shift                ; multiply by 32
+    shl               myq, %%shift                ; multiply by 32
    lea               r3srcq, [srcstrideq*3]
    sub               srcq, r3srcq
    QPEL_H_LOAD       %2, srcq, %1, 15
@@ -1286,6 +1552,8 @@ cglobal hevc_put_hevc_bi_w%1_%2, 5, 7, 10, dst, dststride, src, srcstride, src2,
    RET
 %endmacro
 
+INIT_XMM sse4                                ; adds ff_ and _sse4 to function name
+
 WEIGHTING_FUNCS 2, 8
 WEIGHTING_FUNCS 4, 8
 WEIGHTING_FUNCS 6, 8
@@ -1340,6 +1608,7 @@ HEVC_PUT_HEVC_EPEL_HV 2, 8
 HEVC_PUT_HEVC_EPEL_HV 4, 8
 HEVC_PUT_HEVC_EPEL_HV 6, 8
 HEVC_PUT_HEVC_EPEL_HV 8, 8
+HEVC_PUT_HEVC_EPEL_HV 16, 8
 
 HEVC_PUT_HEVC_EPEL_HV 2, 10
 HEVC_PUT_HEVC_EPEL_HV 4, 10
@@ -1377,4 +1646,23 @@ HEVC_PUT_HEVC_QPEL_HV 4, 12
 HEVC_PUT_HEVC_QPEL_HV 6, 12
 HEVC_PUT_HEVC_QPEL_HV 8, 12
 
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2  ; adds ff_ and _avx2 to function name & enables 256b registers : m0 for 256b, xm0 for 128b. avx_enabled = 1 / notcpuflag(avx) = 0
+
+HEVC_PUT_HEVC_PEL_PIXELS 32, 8
+HEVC_PUT_HEVC_PEL_PIXELS 16, 10
+
+HEVC_PUT_HEVC_EPEL 32, 8
+HEVC_PUT_HEVC_EPEL 16, 10
+
+HEVC_PUT_HEVC_EPEL_HV 16, 10
+HEVC_PUT_HEVC_EPEL_HV 32, 8
+
+HEVC_PUT_HEVC_QPEL 32, 8
+
+HEVC_PUT_HEVC_QPEL 16, 10
+
+HEVC_PUT_HEVC_QPEL_HV 16, 10
+
+%endif ;AVX2
 %endif ; ARCH_X86_64
diff --git a/libavcodec/x86/hevcdsp.h b/libavcodec/x86/hevcdsp.h
index 8dea142..7864163 100644
--- a/libavcodec/x86/hevcdsp.h
+++ b/libavcodec/x86/hevcdsp.h
@@ -96,6 +96,40 @@ void ff_hevc_put_hevc_bi_w##width##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dstst
 EPEL_PROTOTYPES(pel_pixels , 8, sse4);
 EPEL_PROTOTYPES(pel_pixels , 10, sse4);
 EPEL_PROTOTYPES(pel_pixels , 12, sse4);
+
+void ff_hevc_put_hevc_pel_pixels16_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels24_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels32_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels48_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels64_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+
+void ff_hevc_put_hevc_pel_pixels16_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels24_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels32_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels48_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_pel_pixels64_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+
+
+
+void ff_hevc_put_hevc_uni_pel_pixels32_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_uni_pel_pixels48_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_uni_pel_pixels64_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);
+void ff_hevc_put_hevc_uni_pel_pixels96_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width); //used for 10bit
+void ff_hevc_put_hevc_uni_pel_pixels128_8_avx2(uint8_t *dst, ptrdiff_t dststride,uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my,int width);//used for 10bit
+
+
+void ff_hevc_put_hevc_bi_pel_pixels16_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels24_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels32_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels48_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels64_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+
+void ff_hevc_put_hevc_bi_pel_pixels16_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels24_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels32_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels48_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_hevc_bi_pel_pixels64_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width);
+
 ///////////////////////////////////////////////////////////////////////////////
 // EPEL
 ///////////////////////////////////////////////////////////////////////////////
@@ -111,6 +145,42 @@ EPEL_PROTOTYPES(epel_hv , 8, sse4);
 EPEL_PROTOTYPES(epel_hv , 10, sse4);
 EPEL_PROTOTYPES(epel_hv , 12, sse4);
+
+PEL_PROTOTYPE(epel_h16, 8, avx2);
+PEL_PROTOTYPE(epel_h24, 8, avx2);
+PEL_PROTOTYPE(epel_h32, 8, avx2);
+PEL_PROTOTYPE(epel_h48, 8, avx2);
+PEL_PROTOTYPE(epel_h64, 8, avx2);
+
+PEL_PROTOTYPE(epel_h16,10, avx2);
+PEL_PROTOTYPE(epel_h24,10, avx2);
+PEL_PROTOTYPE(epel_h32,10, avx2);
+PEL_PROTOTYPE(epel_h48,10, avx2);
+PEL_PROTOTYPE(epel_h64,10, avx2);
+
+PEL_PROTOTYPE(epel_v16, 8, avx2);
+PEL_PROTOTYPE(epel_v24, 8, avx2);
+PEL_PROTOTYPE(epel_v32, 8, avx2);
+PEL_PROTOTYPE(epel_v48, 8, avx2);
+PEL_PROTOTYPE(epel_v64, 8, avx2);
+
+PEL_PROTOTYPE(epel_v16,10, avx2);
+PEL_PROTOTYPE(epel_v24,10, avx2);
+PEL_PROTOTYPE(epel_v32,10, avx2);
+PEL_PROTOTYPE(epel_v48,10, avx2);
+PEL_PROTOTYPE(epel_v64,10, avx2);
+
+PEL_PROTOTYPE(epel_hv16, 8, avx2);
+PEL_PROTOTYPE(epel_hv24, 8, avx2);
+PEL_PROTOTYPE(epel_hv32, 8, avx2);
+PEL_PROTOTYPE(epel_hv48, 8, avx2);
+PEL_PROTOTYPE(epel_hv64, 8, avx2);
+
+PEL_PROTOTYPE(epel_hv16,10, avx2);
+PEL_PROTOTYPE(epel_hv24,10, avx2);
+PEL_PROTOTYPE(epel_hv32,10, avx2);
+PEL_PROTOTYPE(epel_hv48,10, avx2);
+PEL_PROTOTYPE(epel_hv64,10, avx2);
+
 ///////////////////////////////////////////////////////////////////////////////
 // QPEL
 ///////////////////////////////////////////////////////////////////////////////
@@ -126,6 +196,41 @@ QPEL_PROTOTYPES(qpel_hv, 8, sse4);
 QPEL_PROTOTYPES(qpel_hv, 10, sse4);
 QPEL_PROTOTYPES(qpel_hv, 12, sse4);
+
+PEL_PROTOTYPE(qpel_h16, 8, avx2);
+PEL_PROTOTYPE(qpel_h24, 8, avx2);
+PEL_PROTOTYPE(qpel_h32, 8, avx2);
+PEL_PROTOTYPE(qpel_h48, 8, avx2);
+PEL_PROTOTYPE(qpel_h64, 8, avx2);
+
+PEL_PROTOTYPE(qpel_h16,10, avx2);
+PEL_PROTOTYPE(qpel_h24,10, avx2);
+PEL_PROTOTYPE(qpel_h32,10, avx2);
+PEL_PROTOTYPE(qpel_h48,10, avx2);
+PEL_PROTOTYPE(qpel_h64,10, avx2);
+
+PEL_PROTOTYPE(qpel_v16, 8, avx2);
+PEL_PROTOTYPE(qpel_v24, 8, avx2);
+PEL_PROTOTYPE(qpel_v32, 8, avx2);
+PEL_PROTOTYPE(qpel_v48, 8, avx2);
+PEL_PROTOTYPE(qpel_v64, 8, avx2);
+
+PEL_PROTOTYPE(qpel_v16,10, avx2);
+PEL_PROTOTYPE(qpel_v24,10, avx2);
+PEL_PROTOTYPE(qpel_v32,10, avx2);
+PEL_PROTOTYPE(qpel_v48,10, avx2);
+PEL_PROTOTYPE(qpel_v64,10, avx2);
+
+PEL_PROTOTYPE(qpel_hv16, 8, avx2);
+PEL_PROTOTYPE(qpel_hv24, 8, avx2);
+PEL_PROTOTYPE(qpel_hv32, 8, avx2);
+PEL_PROTOTYPE(qpel_hv48, 8, avx2);
+PEL_PROTOTYPE(qpel_hv64, 8, avx2);
+
+PEL_PROTOTYPE(qpel_hv16,10, avx2);
+PEL_PROTOTYPE(qpel_hv24,10, avx2);
+PEL_PROTOTYPE(qpel_hv32,10, avx2);
+PEL_PROTOTYPE(qpel_hv48,10, avx2);
+PEL_PROTOTYPE(qpel_hv64,10, avx2);
 
 WEIGHTING_PROTOTYPES(8, sse4);
 WEIGHTING_PROTOTYPES(10, sse4);
diff --git a/libavcodec/x86/hevcdsp_init.c b/libavcodec/x86/hevcdsp_init.c
index 5a01ff6..dd5f49a 100644
--- a/libavcodec/x86/hevcdsp_init.c
+++ b/libavcodec/x86/hevcdsp_init.c
@@ -165,6 +165,149 @@ void ff_hevc_put_hevc_bi_##name##W##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dsts
 
 #if ARCH_X86_64 && HAVE_SSE4_EXTERNAL
 
+#define mc_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4)                      \
+void ff_hevc_put_hevc_##name##width1##_10_##opt1(int16_t *dst,                               \
+                                                 uint8_t *src, ptrdiff_t _srcstride, int height,   \
+                                                 intptr_t mx, intptr_t my, int width)              \
+{                                                                                            \
+    ff_hevc_put_hevc_##name##width2##_10_##opt1(dst, src, _srcstride, height, mx, my, width);      \
+    ff_hevc_put_hevc_##name##width3##_10_##opt2(dst+ width2, src+ width4, _srcstride, height, mx, my, width); \
+}
+
+#define mc_bi_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4)                   \
+void ff_hevc_put_hevc_bi_##name##width1##_10_##opt1(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
+                                                    ptrdiff_t _srcstride, int16_t*src2,      \
+                                                    int height, intptr_t mx, intptr_t my, int width) \
+{                                                                                            \
+    ff_hevc_put_hevc_bi_##name##width2##_10_##opt1(dst, dststride, src, _srcstride, src2,    \
+                                                   height, mx, my, width);                   \
+    ff_hevc_put_hevc_bi_##name##width3##_10_##opt2(dst+width4, dststride, src+width4, _srcstride, src2+width2, \
+                                                   height, mx, my, width);                   \
+}
+
+#define mc_uni_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4)                  \
+void ff_hevc_put_hevc_uni_##name##width1##_10_##opt1(uint8_t *dst, ptrdiff_t dststride,      \
+                                                     uint8_t *src, ptrdiff_t _srcstride, int height, \
+                                                     intptr_t mx, intptr_t my, int width)    \
+{                                                                                            \
+    ff_hevc_put_hevc_uni_##name##width2##_10_##opt1(dst, dststride, src, _srcstride,         \
+                                                    height, mx, my, width);                  \
+    ff_hevc_put_hevc_uni_##name##width3##_10_##opt2(dst+width4, dststride, src+width4, _srcstride, \
+                                                    height, mx, my, width);                  \
+}
+
+#define mc_rep_mixs_10(name, width1, width2, width3, opt1, opt2, width4)                     \
+mc_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4);                             \
+mc_bi_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4);                          \
+mc_uni_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4)
+
+#define mc_rep_mix_8(name, width1, width2, width3, opt1, opt2)                               \
+void ff_hevc_put_hevc_##name##width1##_8_##opt1(int16_t *dst,                                \
+                                                uint8_t *src, ptrdiff_t _srcstride, int height,    \
+                                                intptr_t mx, intptr_t my, int width)               \
+{                                                                                            \
+    ff_hevc_put_hevc_##name##width2##_8_##opt1(dst, src, _srcstride, height, mx, my, width);       \
+    ff_hevc_put_hevc_##name##width3##_8_##opt2(dst+ width2, src+ width2, _srcstride, height, mx, my, width); \
+}
+
+#define mc_bi_rep_mix_8(name, width1, width2, width3, opt1, opt2)                            \
+void ff_hevc_put_hevc_bi_##name##width1##_8_##opt1(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
+                                                   ptrdiff_t _srcstride, int16_t*src2,       \
+                                                   int height, intptr_t mx, intptr_t my, int width) \
+{                                                                                            \
+    ff_hevc_put_hevc_bi_##name##width2##_8_##opt1(dst, dststride, src, _srcstride, src2,     \
+                                                  height, mx, my, width);                    \
+    ff_hevc_put_hevc_bi_##name##width3##_8_##opt2(dst+width2, dststride, src+width2, _srcstride, src2+width2, \
+                                                  height, mx, my, width);                    \
+}
+
+#define mc_uni_rep_mix_8(name, width1, width2, width3, opt1, opt2)                           \
+void ff_hevc_put_hevc_uni_##name##width1##_8_##opt1(uint8_t *dst, ptrdiff_t dststride,       \
+                                                    uint8_t *src, ptrdiff_t _srcstride, int height, \
+                                                    intptr_t mx, intptr_t my, int width)     \
+{                                                                                            \
+    ff_hevc_put_hevc_uni_##name##width2##_8_##opt1(dst, dststride, src, _srcstride,          \
+                                                   height, mx, my, width);                   \
+    ff_hevc_put_hevc_uni_##name##width3##_8_##opt2(dst+width2, dststride, src+width2, _srcstride, \
+                                                   height, mx, my, width);                   \
+}
+
+#define mc_rep_mixs_8(name, width1, width2, width3, opt1, opt2)                              \
+mc_rep_mix_8(name, width1, width2, width3, opt1, opt2);                                      \
+mc_bi_rep_mix_8(name, width1, width2, width3, opt1, opt2);                                   \
+mc_uni_rep_mix_8(name, width1, width2, width3, opt1, opt2)
+
+#if HAVE_AVX2_EXTERNAL
+
+mc_rep_mixs_8(pel_pixels, 48, 32, 16, avx2, sse4);
+mc_rep_mixs_8(epel_hv,    48, 32, 16, avx2, sse4);
+mc_rep_mixs_8(epel_h ,    48, 32, 16, avx2, sse4);
+mc_rep_mixs_8(epel_v ,    48, 32, 16, avx2, sse4);
+
+mc_rep_mix_10(pel_pixels,    24, 16, 8, avx2, sse4, 32);
+mc_bi_rep_mix_10(pel_pixels, 24, 16, 8, avx2, sse4, 32);
+mc_rep_mixs_10(epel_hv,      24, 16, 8, avx2, sse4, 32);
+mc_rep_mixs_10(epel_h ,      24, 16, 8, avx2, sse4, 32);
+mc_rep_mixs_10(epel_v ,      24, 16, 8, avx2, sse4, 32);
+
+
+mc_rep_mixs_10(qpel_h ,      24, 16, 8, avx2, sse4, 32);
+mc_rep_mixs_10(qpel_v ,      24, 16, 8, avx2, sse4, 32);
+mc_rep_mixs_10(qpel_hv,      24, 16, 8, avx2, sse4, 32);
+
+
+mc_rep_uni_func(pel_pixels, 8, 64, 128, avx2);//used for 10bit
+mc_rep_uni_func(pel_pixels, 8, 32, 96, avx2); //used for 10bit
+
+mc_rep_funcs(pel_pixels, 8, 32, 64, avx2);
+
+mc_rep_func(pel_pixels, 10, 16, 32, avx2);
+mc_rep_func(pel_pixels, 10, 16, 48, avx2);
+mc_rep_func(pel_pixels, 10, 32, 64, avx2);
+
+mc_rep_bi_func(pel_pixels, 10, 16, 32, avx2);
+mc_rep_bi_func(pel_pixels, 10, 16, 48, avx2);
+mc_rep_bi_func(pel_pixels, 10, 32, 64, avx2);
+
+mc_rep_funcs(epel_h, 8, 32, 64, avx2);
+
+mc_rep_funcs(epel_v, 8, 32, 64, avx2);
+
+mc_rep_funcs(epel_h, 10, 16, 32, avx2);
+mc_rep_funcs(epel_h, 10, 16, 48, avx2);
+mc_rep_funcs(epel_h, 10, 32, 64, avx2);
+
+mc_rep_funcs(epel_v, 10, 16, 32, avx2);
+mc_rep_funcs(epel_v, 10, 16, 48, avx2);
+mc_rep_funcs(epel_v, 10, 32, 64, avx2);
+
+
+mc_rep_funcs(epel_hv, 8, 32, 64, avx2);
+
+mc_rep_funcs(epel_hv, 10, 16, 32, avx2);
+mc_rep_funcs(epel_hv, 10, 16, 48, avx2);
+mc_rep_funcs(epel_hv, 10, 32, 64, avx2);
+
+mc_rep_funcs(qpel_h, 8, 32, 64, avx2);
+mc_rep_mixs_8(qpel_h , 48, 32, 16, avx2, sse4);
+
+mc_rep_funcs(qpel_v, 8, 32, 64, avx2);
+mc_rep_mixs_8(qpel_v, 48, 32, 16, avx2, sse4);
+
+mc_rep_funcs(qpel_h, 10, 16, 32, avx2);
+mc_rep_funcs(qpel_h, 10, 16, 48, avx2);
+mc_rep_funcs(qpel_h, 10, 32, 64, avx2);
+
+mc_rep_funcs(qpel_v, 10, 16, 32, avx2);
+mc_rep_funcs(qpel_v, 10, 16, 48, avx2);
+mc_rep_funcs(qpel_v, 10, 32, 64, avx2);
+
+mc_rep_funcs(qpel_hv, 10, 16, 32, avx2);
+mc_rep_funcs(qpel_hv, 10, 16, 48, avx2);
+mc_rep_funcs(qpel_hv, 10, 32, 64, avx2);
+
+#endif //AVX2
+
 mc_rep_funcs(pel_pixels, 8, 16, 64, sse4);
 mc_rep_funcs(pel_pixels, 8, 16, 48, sse4);
 mc_rep_funcs(pel_pixels, 8, 16, 32, sse4);
@@ -218,7 +361,6 @@ mc_rep_funcs(epel_hv, 8, 8, 64, sse4);
 mc_rep_funcs(epel_hv, 8, 8, 48, sse4);
 mc_rep_funcs(epel_hv, 8, 8, 32, sse4);
 mc_rep_funcs(epel_hv, 8, 8, 24, sse4);
-mc_rep_funcs(epel_hv, 8, 8, 16, sse4);
 mc_rep_funcs2(epel_hv,8, 8, 4, 12, sse4);
 mc_rep_funcs(epel_hv,10, 8, 64, sse4);
 mc_rep_funcs(epel_hv,10, 8, 48, sse4);
@@ -500,7 +642,7 @@ SAO_BAND_FILTER_FUNCS(8, avx2);
 SAO_BAND_FILTER_FUNCS(10, avx2);
 SAO_BAND_FILTER_FUNCS(12, avx2);
 
-#define EPEL_LINKS(pointer, my, mx, fname, bitd, opt )                           \
+#define EPEL_LINKS(pointer, my, mx, fname, bitd, opt)                            \
    PEL_LINK(pointer, 1, my , mx , fname##4 ,  bitd, opt );                      \
    PEL_LINK(pointer, 2, my , mx , fname##6 ,  bitd, opt );                      \
    PEL_LINK(pointer, 3, my , mx , fname##8 ,  bitd, opt );                      \
@@ -589,6 +731,89 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
            c->idct_dc[3] = ff_hevc_idct32x32_dc_8_avx2;
            if (ARCH_X86_64) {
                SAO_BAND_INIT(8, avx2);
+                c->put_hevc_epel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_8_avx2;
+                c->put_hevc_epel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_8_avx2;
+                c->put_hevc_epel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_8_avx2;
+
+                c->put_hevc_qpel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_8_avx2;
+                c->put_hevc_qpel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_8_avx2;
+                c->put_hevc_qpel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_8_avx2;
+
+                c->put_hevc_epel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
+                c->put_hevc_epel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
+                c->put_hevc_epel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
+
+                c->put_hevc_qpel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
+                c->put_hevc_qpel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
+                c->put_hevc_qpel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
+
+                c->put_hevc_qpel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_8_avx2;
+                c->put_hevc_qpel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_8_avx2;
+                c->put_hevc_qpel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_8_avx2;
+
+                c->put_hevc_epel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_8_avx2;
+                c->put_hevc_epel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_8_avx2;
+                c->put_hevc_epel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_8_avx2;
+
+                c->put_hevc_epel[7][0][1] = ff_hevc_put_hevc_epel_h32_8_avx2;
+                c->put_hevc_epel[8][0][1] = ff_hevc_put_hevc_epel_h48_8_avx2;
+                c->put_hevc_epel[9][0][1] = ff_hevc_put_hevc_epel_h64_8_avx2;
+
+                c->put_hevc_epel_uni[7][0][1] = ff_hevc_put_hevc_uni_epel_h32_8_avx2;
+                c->put_hevc_epel_uni[8][0][1] = ff_hevc_put_hevc_uni_epel_h48_8_avx2;
+                c->put_hevc_epel_uni[9][0][1] = ff_hevc_put_hevc_uni_epel_h64_8_avx2;
+
+                c->put_hevc_epel_bi[7][0][1] = ff_hevc_put_hevc_bi_epel_h32_8_avx2;
+                c->put_hevc_epel_bi[8][0][1] = ff_hevc_put_hevc_bi_epel_h48_8_avx2;
+                c->put_hevc_epel_bi[9][0][1] = ff_hevc_put_hevc_bi_epel_h64_8_avx2;
+
+                c->put_hevc_epel[7][1][0] = ff_hevc_put_hevc_epel_v32_8_avx2;
+                c->put_hevc_epel[8][1][0] = ff_hevc_put_hevc_epel_v48_8_avx2;
+                c->put_hevc_epel[9][1][0] = ff_hevc_put_hevc_epel_v64_8_avx2;
+
+                c->put_hevc_epel_uni[7][1][0] = ff_hevc_put_hevc_uni_epel_v32_8_avx2;
+                c->put_hevc_epel_uni[8][1][0] = ff_hevc_put_hevc_uni_epel_v48_8_avx2;
+                c->put_hevc_epel_uni[9][1][0] = ff_hevc_put_hevc_uni_epel_v64_8_avx2;
+
+                c->put_hevc_epel_bi[7][1][0] = ff_hevc_put_hevc_bi_epel_v32_8_avx2;
+                c->put_hevc_epel_bi[8][1][0] = ff_hevc_put_hevc_bi_epel_v48_8_avx2;
+                c->put_hevc_epel_bi[9][1][0] = ff_hevc_put_hevc_bi_epel_v64_8_avx2;
+
+                c->put_hevc_epel[7][1][1] = ff_hevc_put_hevc_epel_hv32_8_avx2;
+                c->put_hevc_epel[8][1][1] = ff_hevc_put_hevc_epel_hv48_8_avx2;
+                c->put_hevc_epel[9][1][1] = ff_hevc_put_hevc_epel_hv64_8_avx2;
+
+                c->put_hevc_epel_uni[7][1][1] = ff_hevc_put_hevc_uni_epel_hv32_8_avx2;
+                c->put_hevc_epel_uni[8][1][1] = ff_hevc_put_hevc_uni_epel_hv48_8_avx2;
+                c->put_hevc_epel_uni[9][1][1] = ff_hevc_put_hevc_uni_epel_hv64_8_avx2;
+
+                c->put_hevc_epel_bi[7][1][1] = ff_hevc_put_hevc_bi_epel_hv32_8_avx2;
+                c->put_hevc_epel_bi[8][1][1] = ff_hevc_put_hevc_bi_epel_hv48_8_avx2;
+                c->put_hevc_epel_bi[9][1][1] = ff_hevc_put_hevc_bi_epel_hv64_8_avx2;
+
+                c->put_hevc_qpel[7][0][1] = ff_hevc_put_hevc_qpel_h32_8_avx2;
+                c->put_hevc_qpel[8][0][1] = ff_hevc_put_hevc_qpel_h48_8_avx2;
+                c->put_hevc_qpel[9][0][1] = ff_hevc_put_hevc_qpel_h64_8_avx2;
+
+                c->put_hevc_qpel[7][1][0] = ff_hevc_put_hevc_qpel_v32_8_avx2;
+                c->put_hevc_qpel[8][1][0] = ff_hevc_put_hevc_qpel_v48_8_avx2;
+                c->put_hevc_qpel[9][1][0] = ff_hevc_put_hevc_qpel_v64_8_avx2;
+
+                c->put_hevc_qpel_uni[7][0][1] = ff_hevc_put_hevc_uni_qpel_h32_8_avx2;
+                c->put_hevc_qpel_uni[8][0][1] = ff_hevc_put_hevc_uni_qpel_h48_8_avx2;
+                c->put_hevc_qpel_uni[9][0][1] = ff_hevc_put_hevc_uni_qpel_h64_8_avx2;
+
+                c->put_hevc_qpel_uni[7][1][0] = ff_hevc_put_hevc_uni_qpel_v32_8_avx2;
+                c->put_hevc_qpel_uni[8][1][0] = ff_hevc_put_hevc_uni_qpel_v48_8_avx2;
+                c->put_hevc_qpel_uni[9][1][0] = ff_hevc_put_hevc_uni_qpel_v64_8_avx2;
+
+                c->put_hevc_qpel_bi[7][0][1] = ff_hevc_put_hevc_bi_qpel_h32_8_avx2;
+                c->put_hevc_qpel_bi[8][0][1] = ff_hevc_put_hevc_bi_qpel_h48_8_avx2;
+                c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_bi_qpel_h64_8_avx2;
+
+                c->put_hevc_qpel_bi[7][1][0] = ff_hevc_put_hevc_bi_qpel_v32_8_avx2;
+                c->put_hevc_qpel_bi[8][1][0] = ff_hevc_put_hevc_bi_qpel_v48_8_avx2;
+                c->put_hevc_qpel_bi[9][1][0] = ff_hevc_put_hevc_bi_qpel_v64_8_avx2;
            }
            c->transform_add[3] = ff_hevc_transform_add32_8_avx2;
@@ -648,6 +873,148 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
            c->idct_dc[3] = ff_hevc_idct32x32_dc_10_avx2;
            if (ARCH_X86_64) {
                SAO_BAND_INIT(10, avx2);
+                c->put_hevc_epel[5][0][0] = ff_hevc_put_hevc_pel_pixels16_10_avx2;
+                c->put_hevc_epel[6][0][0] = ff_hevc_put_hevc_pel_pixels24_10_avx2;
+                c->put_hevc_epel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_10_avx2;
+                c->put_hevc_epel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_10_avx2;
+                c->put_hevc_epel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_10_avx2;
+
+                c->put_hevc_qpel[5][0][0] = ff_hevc_put_hevc_pel_pixels16_10_avx2;
+                c->put_hevc_qpel[6][0][0] = ff_hevc_put_hevc_pel_pixels24_10_avx2;
+                c->put_hevc_qpel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_10_avx2;
+                c->put_hevc_qpel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_10_avx2;
+                c->put_hevc_qpel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_10_avx2;
+
+                c->put_hevc_epel_uni[5][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
+                c->put_hevc_epel_uni[6][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
+                c->put_hevc_epel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
+                c->put_hevc_epel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels96_8_avx2;
+                c->put_hevc_epel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels128_8_avx2;
+
+                c->put_hevc_qpel_uni[5][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
+                c->put_hevc_qpel_uni[6][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
+                c->put_hevc_qpel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
+                c->put_hevc_qpel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels96_8_avx2;
+                c->put_hevc_qpel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels128_8_avx2;
+
+                c->put_hevc_epel_bi[5][0][0] = ff_hevc_put_hevc_bi_pel_pixels16_10_avx2;
+                c->put_hevc_epel_bi[6][0][0] = ff_hevc_put_hevc_bi_pel_pixels24_10_avx2;
+                c->put_hevc_epel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_10_avx2;
+                c->put_hevc_epel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_10_avx2;
+                c->put_hevc_epel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_10_avx2;
+                c->put_hevc_qpel_bi[5][0][0] = ff_hevc_put_hevc_bi_pel_pixels16_10_avx2;
+                c->put_hevc_qpel_bi[6][0][0] = ff_hevc_put_hevc_bi_pel_pixels24_10_avx2;
+                c->put_hevc_qpel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_10_avx2;
+                c->put_hevc_qpel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_10_avx2;
+                c->put_hevc_qpel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_10_avx2;
+
+                c->put_hevc_epel[5][0][1] = ff_hevc_put_hevc_epel_h16_10_avx2;
+                c->put_hevc_epel[6][0][1] = ff_hevc_put_hevc_epel_h24_10_avx2;
+                c->put_hevc_epel[7][0][1] = ff_hevc_put_hevc_epel_h32_10_avx2;
+                c->put_hevc_epel[8][0][1] = ff_hevc_put_hevc_epel_h48_10_avx2;
+                c->put_hevc_epel[9][0][1] = ff_hevc_put_hevc_epel_h64_10_avx2;
+
+                c->put_hevc_epel_uni[5][0][1] = ff_hevc_put_hevc_uni_epel_h16_10_avx2;
+                c->put_hevc_epel_uni[6][0][1] = ff_hevc_put_hevc_uni_epel_h24_10_avx2;
+                c->put_hevc_epel_uni[7][0][1] = ff_hevc_put_hevc_uni_epel_h32_10_avx2;
+                c->put_hevc_epel_uni[8][0][1] = ff_hevc_put_hevc_uni_epel_h48_10_avx2;
+                c->put_hevc_epel_uni[9][0][1] = ff_hevc_put_hevc_uni_epel_h64_10_avx2;
+
+                c->put_hevc_epel_bi[5][0][1] = ff_hevc_put_hevc_bi_epel_h16_10_avx2;
+                c->put_hevc_epel_bi[6][0][1] = ff_hevc_put_hevc_bi_epel_h24_10_avx2;
+                c->put_hevc_epel_bi[7][0][1] = ff_hevc_put_hevc_bi_epel_h32_10_avx2;
+                c->put_hevc_epel_bi[8][0][1] = ff_hevc_put_hevc_bi_epel_h48_10_avx2;
+                c->put_hevc_epel_bi[9][0][1] = ff_hevc_put_hevc_bi_epel_h64_10_avx2;
+
+                c->put_hevc_epel[5][1][0] = ff_hevc_put_hevc_epel_v16_10_avx2;
+                c->put_hevc_epel[6][1][0] = ff_hevc_put_hevc_epel_v24_10_avx2;
+                c->put_hevc_epel[7][1][0] = ff_hevc_put_hevc_epel_v32_10_avx2;
+                c->put_hevc_epel[8][1][0] = ff_hevc_put_hevc_epel_v48_10_avx2;
+                c->put_hevc_epel[9][1][0] = ff_hevc_put_hevc_epel_v64_10_avx2;
+
+                c->put_hevc_epel_uni[5][1][0] = ff_hevc_put_hevc_uni_epel_v16_10_avx2;
+                c->put_hevc_epel_uni[6][1][0] = ff_hevc_put_hevc_uni_epel_v24_10_avx2;
+                c->put_hevc_epel_uni[7][1][0] = ff_hevc_put_hevc_uni_epel_v32_10_avx2;
+                c->put_hevc_epel_uni[8][1][0] = ff_hevc_put_hevc_uni_epel_v48_10_avx2;
+                c->put_hevc_epel_uni[9][1][0] = ff_hevc_put_hevc_uni_epel_v64_10_avx2;
+
+                c->put_hevc_epel_bi[5][1][0] = ff_hevc_put_hevc_bi_epel_v16_10_avx2;
+                c->put_hevc_epel_bi[6][1][0] = ff_hevc_put_hevc_bi_epel_v24_10_avx2;
+                c->put_hevc_epel_bi[7][1][0] = ff_hevc_put_hevc_bi_epel_v32_10_avx2;
+                c->put_hevc_epel_bi[8][1][0] = ff_hevc_put_hevc_bi_epel_v48_10_avx2;
+                c->put_hevc_epel_bi[9][1][0] = ff_hevc_put_hevc_bi_epel_v64_10_avx2;
+
+                c->put_hevc_epel[5][1][1] = ff_hevc_put_hevc_epel_hv16_10_avx2;
+                c->put_hevc_epel[6][1][1] = ff_hevc_put_hevc_epel_hv24_10_avx2;
+                c->put_hevc_epel[7][1][1] = ff_hevc_put_hevc_epel_hv32_10_avx2;
+                c->put_hevc_epel[8][1][1] = ff_hevc_put_hevc_epel_hv48_10_avx2;
+                c->put_hevc_epel[9][1][1] = ff_hevc_put_hevc_epel_hv64_10_avx2;
+
+                c->put_hevc_epel_uni[5][1][1] = ff_hevc_put_hevc_uni_epel_hv16_10_avx2;
+                c->put_hevc_epel_uni[6][1][1] = ff_hevc_put_hevc_uni_epel_hv24_10_avx2;
+                c->put_hevc_epel_uni[7][1][1] = ff_hevc_put_hevc_uni_epel_hv32_10_avx2;
+                c->put_hevc_epel_uni[8][1][1] = ff_hevc_put_hevc_uni_epel_hv48_10_avx2;
+                c->put_hevc_epel_uni[9][1][1] = ff_hevc_put_hevc_uni_epel_hv64_10_avx2;
+
+                c->put_hevc_epel_bi[5][1][1] = ff_hevc_put_hevc_bi_epel_hv16_10_avx2;
+                c->put_hevc_epel_bi[6][1][1] = ff_hevc_put_hevc_bi_epel_hv24_10_avx2;
+                c->put_hevc_epel_bi[7][1][1] = ff_hevc_put_hevc_bi_epel_hv32_10_avx2;
+                c->put_hevc_epel_bi[8][1][1] = ff_hevc_put_hevc_bi_epel_hv48_10_avx2;
+                c->put_hevc_epel_bi[9][1][1] = ff_hevc_put_hevc_bi_epel_hv64_10_avx2;
+
+                c->put_hevc_qpel[5][0][1] = ff_hevc_put_hevc_qpel_h16_10_avx2;
+                c->put_hevc_qpel[6][0][1] = ff_hevc_put_hevc_qpel_h24_10_avx2;
+                c->put_hevc_qpel[7][0][1] = ff_hevc_put_hevc_qpel_h32_10_avx2;
+                c->put_hevc_qpel[8][0][1] = ff_hevc_put_hevc_qpel_h48_10_avx2;
+                c->put_hevc_qpel[9][0][1] = ff_hevc_put_hevc_qpel_h64_10_avx2;
+
+                c->put_hevc_qpel_uni[5][0][1] = ff_hevc_put_hevc_uni_qpel_h16_10_avx2;
+                c->put_hevc_qpel_uni[6][0][1] = ff_hevc_put_hevc_uni_qpel_h24_10_avx2;
+                c->put_hevc_qpel_uni[7][0][1] = ff_hevc_put_hevc_uni_qpel_h32_10_avx2;
+                c->put_hevc_qpel_uni[8][0][1] = ff_hevc_put_hevc_uni_qpel_h48_10_avx2;
+                c->put_hevc_qpel_uni[9][0][1] = ff_hevc_put_hevc_uni_qpel_h64_10_avx2;
+
+                c->put_hevc_qpel_bi[5][0][1] = ff_hevc_put_hevc_bi_qpel_h16_10_avx2;
+                c->put_hevc_qpel_bi[6][0][1] = ff_hevc_put_hevc_bi_qpel_h24_10_avx2;
+                c->put_hevc_qpel_bi[7][0][1] = ff_hevc_put_hevc_bi_qpel_h32_10_avx2;
+                c->put_hevc_qpel_bi[8][0][1] = ff_hevc_put_hevc_bi_qpel_h48_10_avx2;
+                c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_bi_qpel_h64_10_avx2;
+
+                c->put_hevc_qpel[5][1][0] = ff_hevc_put_hevc_qpel_v16_10_avx2;
+                c->put_hevc_qpel[6][1][0] = ff_hevc_put_hevc_qpel_v24_10_avx2;
+                c->put_hevc_qpel[7][1][0] = ff_hevc_put_hevc_qpel_v32_10_avx2;
+                c->put_hevc_qpel[8][1][0] = ff_hevc_put_hevc_qpel_v48_10_avx2;
+                c->put_hevc_qpel[9][1][0] = ff_hevc_put_hevc_qpel_v64_10_avx2;
+
+                c->put_hevc_qpel_uni[5][1][0] = ff_hevc_put_hevc_uni_qpel_v16_10_avx2;
+                c->put_hevc_qpel_uni[6][1][0] = ff_hevc_put_hevc_uni_qpel_v24_10_avx2;
+                c->put_hevc_qpel_uni[7][1][0] = ff_hevc_put_hevc_uni_qpel_v32_10_avx2;
+                c->put_hevc_qpel_uni[8][1][0] = ff_hevc_put_hevc_uni_qpel_v48_10_avx2;
+                c->put_hevc_qpel_uni[9][1][0] = ff_hevc_put_hevc_uni_qpel_v64_10_avx2;
+
+                c->put_hevc_qpel_bi[5][1][0] = ff_hevc_put_hevc_bi_qpel_v16_10_avx2;
+                c->put_hevc_qpel_bi[6][1][0] = ff_hevc_put_hevc_bi_qpel_v24_10_avx2;
+                c->put_hevc_qpel_bi[7][1][0] = ff_hevc_put_hevc_bi_qpel_v32_10_avx2;
+                c->put_hevc_qpel_bi[8][1][0] = ff_hevc_put_hevc_bi_qpel_v48_10_avx2;
+                c->put_hevc_qpel_bi[9][1][0] = ff_hevc_put_hevc_bi_qpel_v64_10_avx2;
+
+                c->put_hevc_qpel[5][1][1] = ff_hevc_put_hevc_qpel_hv16_10_avx2;
+                c->put_hevc_qpel[6][1][1] = ff_hevc_put_hevc_qpel_hv24_10_avx2;
+                c->put_hevc_qpel[7][1][1] = ff_hevc_put_hevc_qpel_hv32_10_avx2;
+                c->put_hevc_qpel[8][1][1] = ff_hevc_put_hevc_qpel_hv48_10_avx2;
+                c->put_hevc_qpel[9][1][1] = ff_hevc_put_hevc_qpel_hv64_10_avx2;
+
+                c->put_hevc_qpel_uni[5][1][1] = ff_hevc_put_hevc_uni_qpel_hv16_10_avx2;
+                c->put_hevc_qpel_uni[6][1][1] = ff_hevc_put_hevc_uni_qpel_hv24_10_avx2;
+                c->put_hevc_qpel_uni[7][1][1] = ff_hevc_put_hevc_uni_qpel_hv32_10_avx2;
+                c->put_hevc_qpel_uni[8][1][1] = ff_hevc_put_hevc_uni_qpel_hv48_10_avx2;
+                c->put_hevc_qpel_uni[9][1][1] = ff_hevc_put_hevc_uni_qpel_hv64_10_avx2;
+
+                c->put_hevc_qpel_bi[5][1][1] = ff_hevc_put_hevc_bi_qpel_hv16_10_avx2;
+                c->put_hevc_qpel_bi[6][1][1] = ff_hevc_put_hevc_bi_qpel_hv24_10_avx2;
+                c->put_hevc_qpel_bi[7][1][1] = ff_hevc_put_hevc_bi_qpel_hv32_10_avx2;
+                c->put_hevc_qpel_bi[8][1][1] = ff_hevc_put_hevc_bi_qpel_hv48_10_avx2;
+                c->put_hevc_qpel_bi[9][1][1] = ff_hevc_put_hevc_bi_qpel_hv64_10_avx2;
            }
            c->transform_add[2] = ff_hevc_transform_add16_10_avx2;
-- 
1.9.2.msysgit.0
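A note for readers less familiar with hevcdsp_init.c: the [w][v][h] assignments in the patch follow FFmpeg's usual function-pointer dispatch, where w is a block-width index (in the 10-bit block above, 5 = 16 pixels, 6 = 24, 7 = 32, 8 = 48, 9 = 64) and v/h select whether a vertical and/or horizontal filter is applied. The following is a minimal, hypothetical C sketch of that pattern, not the real FFmpeg definitions; the type names, mini_dsp_init, and mc_pixels_c are illustrative only, and the kernel signature mirrors the prototypes declared in hevcdsp.h above:

#include <stddef.h>
#include <stdint.h>

/* Hypothetical stand-ins for the real FFmpeg types (illustration only). */
typedef void (*mc_fn)(int16_t *dst, uint8_t *src, ptrdiff_t srcstride,
                      int height, intptr_t mx, intptr_t my, int width);

typedef struct MiniDSPContext {
    /* [width index][vertical filter?][horizontal filter?] */
    mc_fn put_hevc_qpel[10][2][2];
} MiniDSPContext;

#define MAX_PB_SIZE 64   /* intermediate-buffer stride, as in the asm above */

/* Reference "pel_pixels" kernel for 8-bit input: no filtering, just a copy
 * shifted left by 14 - bitdepth = 6, matching MC_PIXEL_COMPUTE. */
static void mc_pixels_c(int16_t *dst, uint8_t *src, ptrdiff_t srcstride,
                        int height, intptr_t mx, intptr_t my, int width)
{
    (void)mx; (void)my;                 /* unused for unfiltered copies */
    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x++)
            dst[x] = src[x] << 6;
        dst += MAX_PB_SIZE;
        src += srcstride;
    }
}

static void mini_dsp_init(MiniDSPContext *c, int have_avx2)
{
    /* Fill every slot with a placeholder first... */
    for (int w = 0; w < 10; w++)
        for (int v = 0; v < 2; v++)
            for (int h = 0; h < 2; h++)
                c->put_hevc_qpel[w][v][h] = mc_pixels_c;
    if (have_avx2) {
        /* ...then overwrite the wide-block slots with AVX2 kernels,
         * exactly as the patch does, e.g.
         * c->put_hevc_qpel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_8_avx2; */
    }
}

This also shows why the mc_rep_mix_* macros exist: a 48-wide block has no native AVX2 kernel, so the patch composes one 32-wide AVX2 call with one 16-wide SSE4 call at the appropriate offsets.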