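The only SSE4 instructions used are pextrw with a memory destination, which appears only in rather minor functions for small blocks, and packusdw in the weighting functions. For the former, extract the output word through whichever GPR is available; for the latter, emulate it with packssdw by biasing the inputs by 0x8000 before the pack and the result by 0x8000 after it. With both replaced, the functions can be built and selected as SSSE3.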
Before (sse4), for block_w == 6: 4627 decicycles in epel_uni, 16377 runs, 7 skips 7422 decicycles in epel_bi, 65501 runs, 35 skips After: 4649 decicycles in epel_uni, 16371 runs, 13 skips 7432 decicycles in epel_bi, 65505 runs, 31 skips --- libavcodec/x86/hevc_mc.asm | 80 +++++-- libavcodec/x86/hevcdsp.h | 48 ++-- libavcodec/x86/hevcdsp_init.c | 522 +++++++++++++++++++++--------------------- 3 files changed, 338 insertions(+), 312 deletions(-) diff --git a/libavcodec/x86/hevc_mc.asm b/libavcodec/x86/hevc_mc.asm index 9ce6bd1..52cc66e 100644 --- a/libavcodec/x86/hevc_mc.asm +++ b/libavcodec/x86/hevc_mc.asm @@ -31,6 +31,8 @@ max_pixels_10: times 8 dw ((1 << 10)-1) max_pixels_12: times 8 dw ((1 << 12)-1) zero: times 4 dd 0 one_per_32: times 4 dd 1 +pd_8000: times 4 dd 0x8000 +pw_8000: times 8 dw 0x8000 SECTION .text %macro EPEL_TABLE 4 @@ -52,9 +54,9 @@ hevc_epel_filters_%4_%1 times %2 d%3 -2, 58 -EPEL_TABLE 8, 8, b, sse4 -EPEL_TABLE 10, 4, w, sse4 -EPEL_TABLE 12, 4, w, sse4 +EPEL_TABLE 8, 8, b, ssse3 +EPEL_TABLE 10, 4, w, ssse3 +EPEL_TABLE 12, 4, w, ssse3 %macro QPEL_TABLE 4 hevc_qpel_filters_%4_%1 times %2 d%3 -1, 4 @@ -71,13 +73,13 @@ hevc_qpel_filters_%4_%1 times %2 d%3 -1, 4 times %2 d%3 4, -1 %endmacro -QPEL_TABLE 8, 8, b, sse4 -QPEL_TABLE 10, 4, w, sse4 -QPEL_TABLE 12, 4, w, sse4 +QPEL_TABLE 8, 8, b, ssse3 +QPEL_TABLE 10, 4, w, ssse3 +QPEL_TABLE 12, 4, w, ssse3 %define MAX_PB_SIZE 64 -%define hevc_qpel_filters_sse4_14 hevc_qpel_filters_sse4_10 +%define hevc_qpel_filters_ssse3_14 hevc_qpel_filters_ssse3_10 %if ARCH_X86_64 @@ -121,9 +123,9 @@ QPEL_TABLE 12, 4, w, sse4 %macro EPEL_FILTER 2-4 ; bit depth, filter index %ifdef PIC - lea rfilterq, [hevc_epel_filters_sse4_%1] + lea rfilterq, [hevc_epel_filters_ssse3_%1] %else - %define rfilterq hevc_epel_filters_sse4_%1 + %define rfilterq hevc_epel_filters_ssse3_%1 %endif sub %2q, 1 shl %2q, 5 ; multiply by 32 @@ -138,9 +140,9 @@ QPEL_TABLE 12, 4, w, sse4 %macro EPEL_HV_FILTER 1 %ifdef PIC - lea rfilterq, 
[hevc_epel_filters_sse4_%1] + lea rfilterq, [hevc_epel_filters_ssse3_%1] %else - %define rfilterq hevc_epel_filters_sse4_%1 + %define rfilterq hevc_epel_filters_ssse3_%1 %endif sub mxq, 1 sub myq, 1 @@ -151,9 +153,9 @@ QPEL_TABLE 12, 4, w, sse4 lea r3srcq, [srcstrideq*3] %ifdef PIC - lea rfilterq, [hevc_epel_filters_sse4_10] + lea rfilterq, [hevc_epel_filters_ssse3_10] %else - %define rfilterq hevc_epel_filters_sse4_10 + %define rfilterq hevc_epel_filters_ssse3_10 %endif movdqa m12, [rfilterq + myq] ; get 2 first values of filters movdqa m13, [rfilterq + myq+16] ; get 2 last values of filters @@ -161,9 +163,9 @@ QPEL_TABLE 12, 4, w, sse4 %macro QPEL_FILTER 2 %ifdef PIC - lea rfilterq, [hevc_qpel_filters_sse4_%1] + lea rfilterq, [hevc_qpel_filters_ssse3_%1] %else - %define rfilterq hevc_qpel_filters_sse4_%1 + %define rfilterq hevc_qpel_filters_ssse3_%1 %endif lea %2q, [%2q*8-8] movdqa m12, [rfilterq + %2q*8] ; get 4 first values of filters @@ -357,14 +359,25 @@ QPEL_TABLE 12, 4, w, sse4 %endmacro %macro PEL_8STORE2 3 +%if cpuflag(sse4) pextrw [%1], %2, 0 +%else + movd rfilterd, %2 + mov [%1], rfilterw +%endif %endmacro %macro PEL_8STORE4 3 movd [%1], %2 %endmacro %macro PEL_8STORE6 3 movd [%1], %2 +%if cpuflag(sse4) pextrw [%1+4], %2, 2 +%else + psrldq %2, 4 + movd rfilterd, %2 + mov [%1+4], rfilterw +%endif %endmacro %macro PEL_8STORE8 3 movq [%1], %2 @@ -426,7 +439,7 @@ QPEL_TABLE 12, 4, w, sse4 %endmacro %macro QPEL_SET_POINTER 2 - lea rfilterq, [hevc_qpel_filters_sse4_%1] + lea rfilterq, [hevc_qpel_filters_ssse3_%1] lea %2q, [rfilterq + 8*%2q] %endmacro @@ -535,7 +548,6 @@ QPEL_TABLE 12, 4, w, sse4 %endif %endmacro -INIT_XMM sse4 ; adds ff_ and _sse4 to function name ; ****************************** ; void put_hevc_mc_pixels(int16_t *dst, ptrdiff_t dststride, ; uint8_t *_src, ptrdiff_t _srcstride, @@ -543,7 +555,7 @@ INIT_XMM sse4 ; adds ff_ and _sse4 to functio ; ****************************** %macro HEVC_PUT_HEVC_PEL_PIXELS 2 -cglobal 
hevc_put_hevc_pel_pixels%1_%2, 4, 4, 3, dst, src, srcstride,height +cglobal hevc_put_hevc_pel_pixels%1_%2, 4, 5, 3, dst, src, srcstride, height, rfilter pxor m2, m2 .loop SIMPLE_LOAD %1, %2, srcq, m0 @@ -552,7 +564,7 @@ cglobal hevc_put_hevc_pel_pixels%1_%2, 4, 4, 3, dst, src, srcstride,height LOOP_END dst, src, srcstride RET -cglobal hevc_put_hevc_uni_pel_pixels%1_%2, 5, 5, 2, dst, dststride, src, srcstride,height +cglobal hevc_put_hevc_uni_pel_pixels%1_%2, 5, 6, 2, dst, dststride, src, srcstride, height, rfilter .loop SIMPLE_LOAD %1, %2, srcq, m0 PEL_%2STORE%1 dstq, m0, m1 @@ -562,7 +574,7 @@ cglobal hevc_put_hevc_uni_pel_pixels%1_%2, 5, 5, 2, dst, dststride, src, srcstri jnz .loop ; height loop RET -cglobal hevc_put_hevc_bi_pel_pixels%1_%2, 6, 6, 6, dst, dststride, src, srcstride, src2, height +cglobal hevc_put_hevc_bi_pel_pixels%1_%2, 6, 7, 6, dst, dststride, src, srcstride, src2, height, rfilter pxor m2, m2 movdqa m5, [pw_bi_%2] .loop @@ -1154,14 +1166,29 @@ cglobal hevc_put_hevc_bi_qpel_hv%1_%2, 8, 10, 16, dst, dststride, src, srcstride RET %endmacro +%macro PACKUSDW 2 +%if cpuflag(sse4) + packusdw %1, %2 +%else + psubd %1, [pd_8000] + psubd %2, [pd_8000] + packssdw %1, %2 + paddw %1, [pw_8000] +%endif +%endmacro + %macro WEIGHTING_FUNCS 2 %if WIN64 || ARCH_X86_32 -cglobal hevc_put_hevc_uni_w%1_%2, 4, 5, 7, dst, dststride, src, srcstride, height, denom, wx, ox +cglobal hevc_put_hevc_uni_w%1_%2, 4, 6, 7, dst, dststride, src, srcstride, height, denom, wx, ox mov r4d, denomm %define SHIFT r4d +%define rfilterd r5d +%define rfilterw r5w %else -cglobal hevc_put_hevc_uni_w%1_%2, 6, 6, 7, dst, dststride, src, srcstride, height, denom, wx, ox +cglobal hevc_put_hevc_uni_w%1_%2, 6, 7, 7, dst, dststride, src, srcstride, height, denom, wx, ox %define SHIFT denomd +%define rfilterd r6d +%define rfilterw r6w %endif lea SHIFT, [SHIFT+14-%2] ; shift = 14 - bitd + denom %if %1 <= 4 @@ -1208,7 +1235,7 @@ cglobal hevc_put_hevc_uni_w%1_%2, 6, 6, 7, dst, dststride, src, 
srcstride, heigh paddd m0, m3 paddd m1, m3 %endif - packusdw m0, m1 + PACKUSDW m0, m1 %if %2 == 8 packuswb m0, m0 %else @@ -1221,7 +1248,9 @@ cglobal hevc_put_hevc_uni_w%1_%2, 6, 6, 7, dst, dststride, src, srcstride, heigh jnz .loop ; height loop RET -cglobal hevc_put_hevc_bi_w%1_%2, 5, 7, 10, dst, dststride, src, srcstride, src2, height, denom, wx0, wx1, ox0, ox1 +cglobal hevc_put_hevc_bi_w%1_%2, 5, 8, 10, dst, dststride, src, srcstride, src2, height, denom, wx0, wx1, ox0, ox1 +%define rfilterd r7d +%define rfilterw r7w mov r6d, denomm %if %1 <= 4 pxor m1, m1 @@ -1279,7 +1308,7 @@ cglobal hevc_put_hevc_bi_w%1_%2, 5, 7, 10, dst, dststride, src, srcstride, src2, psrad m0, m5 psrad m1, m5 %endif - packusdw m0, m1 + PACKUSDW m0, m1 %if %2 == 8 packuswb m0, m0 %else @@ -1294,6 +1323,7 @@ cglobal hevc_put_hevc_bi_w%1_%2, 5, 7, 10, dst, dststride, src, srcstride, src2, RET %endmacro +INIT_XMM ssse3 WEIGHTING_FUNCS 2, 8 WEIGHTING_FUNCS 4, 8 WEIGHTING_FUNCS 6, 8 diff --git a/libavcodec/x86/hevcdsp.h b/libavcodec/x86/hevcdsp.h index df49269..a652cbf 100644 --- a/libavcodec/x86/hevcdsp.h +++ b/libavcodec/x86/hevcdsp.h @@ -91,43 +91,43 @@ void ff_hevc_put_hevc_bi_w##width##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dstst /////////////////////////////////////////////////////////////////////////////// // QPEL_PIXELS EPEL_PIXELS /////////////////////////////////////////////////////////////////////////////// -EPEL_PROTOTYPES(pel_pixels , 8, sse4); -EPEL_PROTOTYPES(pel_pixels , 10, sse4); -EPEL_PROTOTYPES(pel_pixels , 12, sse4); +EPEL_PROTOTYPES(pel_pixels , 8, ssse3); +EPEL_PROTOTYPES(pel_pixels , 10, ssse3); +EPEL_PROTOTYPES(pel_pixels , 12, ssse3); /////////////////////////////////////////////////////////////////////////////// // EPEL /////////////////////////////////////////////////////////////////////////////// -EPEL_PROTOTYPES(epel_h , 8, sse4); -EPEL_PROTOTYPES(epel_h , 10, sse4); -EPEL_PROTOTYPES(epel_h , 12, sse4); +EPEL_PROTOTYPES(epel_h , 8, ssse3); +EPEL_PROTOTYPES(epel_h 
, 10, ssse3); +EPEL_PROTOTYPES(epel_h , 12, ssse3); -EPEL_PROTOTYPES(epel_v , 8, sse4); -EPEL_PROTOTYPES(epel_v , 10, sse4); -EPEL_PROTOTYPES(epel_v , 12, sse4); +EPEL_PROTOTYPES(epel_v , 8, ssse3); +EPEL_PROTOTYPES(epel_v , 10, ssse3); +EPEL_PROTOTYPES(epel_v , 12, ssse3); -EPEL_PROTOTYPES(epel_hv , 8, sse4); -EPEL_PROTOTYPES(epel_hv , 10, sse4); -EPEL_PROTOTYPES(epel_hv , 12, sse4); +EPEL_PROTOTYPES(epel_hv , 8, ssse3); +EPEL_PROTOTYPES(epel_hv , 10, ssse3); +EPEL_PROTOTYPES(epel_hv , 12, ssse3); /////////////////////////////////////////////////////////////////////////////// // QPEL /////////////////////////////////////////////////////////////////////////////// -QPEL_PROTOTYPES(qpel_h , 8, sse4); -QPEL_PROTOTYPES(qpel_h , 10, sse4); -QPEL_PROTOTYPES(qpel_h , 12, sse4); +QPEL_PROTOTYPES(qpel_h , 8, ssse3); +QPEL_PROTOTYPES(qpel_h , 10, ssse3); +QPEL_PROTOTYPES(qpel_h , 12, ssse3); -QPEL_PROTOTYPES(qpel_v, 8, sse4); -QPEL_PROTOTYPES(qpel_v, 10, sse4); -QPEL_PROTOTYPES(qpel_v, 12, sse4); +QPEL_PROTOTYPES(qpel_v, 8, ssse3); +QPEL_PROTOTYPES(qpel_v, 10, ssse3); +QPEL_PROTOTYPES(qpel_v, 12, ssse3); -QPEL_PROTOTYPES(qpel_hv, 8, sse4); -QPEL_PROTOTYPES(qpel_hv, 10, sse4); -QPEL_PROTOTYPES(qpel_hv, 12, sse4); +QPEL_PROTOTYPES(qpel_hv, 8, ssse3); +QPEL_PROTOTYPES(qpel_hv, 10, ssse3); +QPEL_PROTOTYPES(qpel_hv, 12, ssse3); -WEIGHTING_PROTOTYPES(8, sse4); -WEIGHTING_PROTOTYPES(10, sse4); -WEIGHTING_PROTOTYPES(12, sse4); +WEIGHTING_PROTOTYPES(8, ssse3); +WEIGHTING_PROTOTYPES(10, ssse3); +WEIGHTING_PROTOTYPES(12, ssse3); /////////////////////////////////////////////////////////////////////////////// // TRANSFORM_ADD diff --git a/libavcodec/x86/hevcdsp_init.c b/libavcodec/x86/hevcdsp_init.c index acb82c4..95f69e0 100644 --- a/libavcodec/x86/hevcdsp_init.c +++ b/libavcodec/x86/hevcdsp_init.c @@ -163,126 +163,126 @@ void ff_hevc_put_hevc_bi_##name##W##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dsts mc_rep_uni_func2(name, bitd, step1, step2, W, opt); \ mc_rep_bi_func2(name, bitd, 
step1, step2, W, opt) -#if ARCH_X86_64 && HAVE_SSE4_EXTERNAL - -mc_rep_funcs(pel_pixels, 8, 16, 64, sse4); -mc_rep_funcs(pel_pixels, 8, 16, 48, sse4); -mc_rep_funcs(pel_pixels, 8, 16, 32, sse4); -mc_rep_funcs(pel_pixels, 8, 8, 24, sse4); -mc_rep_funcs(pel_pixels,10, 8, 64, sse4); -mc_rep_funcs(pel_pixels,10, 8, 48, sse4); -mc_rep_funcs(pel_pixels,10, 8, 32, sse4); -mc_rep_funcs(pel_pixels,10, 8, 24, sse4); -mc_rep_funcs(pel_pixels,10, 8, 16, sse4); -mc_rep_funcs(pel_pixels,10, 4, 12, sse4); -mc_rep_funcs(pel_pixels,12, 8, 64, sse4); -mc_rep_funcs(pel_pixels,12, 8, 48, sse4); -mc_rep_funcs(pel_pixels,12, 8, 32, sse4); -mc_rep_funcs(pel_pixels,12, 8, 24, sse4); -mc_rep_funcs(pel_pixels,12, 8, 16, sse4); -mc_rep_funcs(pel_pixels,12, 4, 12, sse4); - -mc_rep_funcs(epel_h, 8, 16, 64, sse4); -mc_rep_funcs(epel_h, 8, 16, 48, sse4); -mc_rep_funcs(epel_h, 8, 16, 32, sse4); -mc_rep_funcs(epel_h, 8, 8, 24, sse4); -mc_rep_funcs(epel_h,10, 8, 64, sse4); -mc_rep_funcs(epel_h,10, 8, 48, sse4); -mc_rep_funcs(epel_h,10, 8, 32, sse4); -mc_rep_funcs(epel_h,10, 8, 24, sse4); -mc_rep_funcs(epel_h,10, 8, 16, sse4); -mc_rep_funcs(epel_h,10, 4, 12, sse4); -mc_rep_funcs(epel_h,12, 8, 64, sse4); -mc_rep_funcs(epel_h,12, 8, 48, sse4); -mc_rep_funcs(epel_h,12, 8, 32, sse4); -mc_rep_funcs(epel_h,12, 8, 24, sse4); -mc_rep_funcs(epel_h,12, 8, 16, sse4); -mc_rep_funcs(epel_h,12, 4, 12, sse4); -mc_rep_funcs(epel_v, 8, 16, 64, sse4); -mc_rep_funcs(epel_v, 8, 16, 48, sse4); -mc_rep_funcs(epel_v, 8, 16, 32, sse4); -mc_rep_funcs(epel_v, 8, 8, 24, sse4); -mc_rep_funcs(epel_v,10, 8, 64, sse4); -mc_rep_funcs(epel_v,10, 8, 48, sse4); -mc_rep_funcs(epel_v,10, 8, 32, sse4); -mc_rep_funcs(epel_v,10, 8, 24, sse4); -mc_rep_funcs(epel_v,10, 8, 16, sse4); -mc_rep_funcs(epel_v,10, 4, 12, sse4); -mc_rep_funcs(epel_v,12, 8, 64, sse4); -mc_rep_funcs(epel_v,12, 8, 48, sse4); -mc_rep_funcs(epel_v,12, 8, 32, sse4); -mc_rep_funcs(epel_v,12, 8, 24, sse4); -mc_rep_funcs(epel_v,12, 8, 16, sse4); -mc_rep_funcs(epel_v,12, 4, 
12, sse4); -mc_rep_funcs(epel_hv, 8, 8, 64, sse4); -mc_rep_funcs(epel_hv, 8, 8, 48, sse4); -mc_rep_funcs(epel_hv, 8, 8, 32, sse4); -mc_rep_funcs(epel_hv, 8, 8, 24, sse4); -mc_rep_funcs(epel_hv, 8, 8, 16, sse4); -mc_rep_funcs2(epel_hv,8, 8, 4, 12, sse4); -mc_rep_funcs(epel_hv,10, 8, 64, sse4); -mc_rep_funcs(epel_hv,10, 8, 48, sse4); -mc_rep_funcs(epel_hv,10, 8, 32, sse4); -mc_rep_funcs(epel_hv,10, 8, 24, sse4); -mc_rep_funcs(epel_hv,10, 8, 16, sse4); -mc_rep_funcs(epel_hv,10, 4, 12, sse4); -mc_rep_funcs(epel_hv,12, 8, 64, sse4); -mc_rep_funcs(epel_hv,12, 8, 48, sse4); -mc_rep_funcs(epel_hv,12, 8, 32, sse4); -mc_rep_funcs(epel_hv,12, 8, 24, sse4); -mc_rep_funcs(epel_hv,12, 8, 16, sse4); -mc_rep_funcs(epel_hv,12, 4, 12, sse4); - -mc_rep_funcs(qpel_h, 8, 16, 64, sse4); -mc_rep_funcs(qpel_h, 8, 16, 48, sse4); -mc_rep_funcs(qpel_h, 8, 16, 32, sse4); -mc_rep_funcs(qpel_h, 8, 8, 24, sse4); -mc_rep_funcs(qpel_h,10, 8, 64, sse4); -mc_rep_funcs(qpel_h,10, 8, 48, sse4); -mc_rep_funcs(qpel_h,10, 8, 32, sse4); -mc_rep_funcs(qpel_h,10, 8, 24, sse4); -mc_rep_funcs(qpel_h,10, 8, 16, sse4); -mc_rep_funcs(qpel_h,10, 4, 12, sse4); -mc_rep_funcs(qpel_h,12, 8, 64, sse4); -mc_rep_funcs(qpel_h,12, 8, 48, sse4); -mc_rep_funcs(qpel_h,12, 8, 32, sse4); -mc_rep_funcs(qpel_h,12, 8, 24, sse4); -mc_rep_funcs(qpel_h,12, 8, 16, sse4); -mc_rep_funcs(qpel_h,12, 4, 12, sse4); -mc_rep_funcs(qpel_v, 8, 16, 64, sse4); -mc_rep_funcs(qpel_v, 8, 16, 48, sse4); -mc_rep_funcs(qpel_v, 8, 16, 32, sse4); -mc_rep_funcs(qpel_v, 8, 8, 24, sse4); -mc_rep_funcs(qpel_v,10, 8, 64, sse4); -mc_rep_funcs(qpel_v,10, 8, 48, sse4); -mc_rep_funcs(qpel_v,10, 8, 32, sse4); -mc_rep_funcs(qpel_v,10, 8, 24, sse4); -mc_rep_funcs(qpel_v,10, 8, 16, sse4); -mc_rep_funcs(qpel_v,10, 4, 12, sse4); -mc_rep_funcs(qpel_v,12, 8, 64, sse4); -mc_rep_funcs(qpel_v,12, 8, 48, sse4); -mc_rep_funcs(qpel_v,12, 8, 32, sse4); -mc_rep_funcs(qpel_v,12, 8, 24, sse4); -mc_rep_funcs(qpel_v,12, 8, 16, sse4); -mc_rep_funcs(qpel_v,12, 4, 12, sse4); 
-mc_rep_funcs(qpel_hv, 8, 8, 64, sse4); -mc_rep_funcs(qpel_hv, 8, 8, 48, sse4); -mc_rep_funcs(qpel_hv, 8, 8, 32, sse4); -mc_rep_funcs(qpel_hv, 8, 8, 24, sse4); -mc_rep_funcs(qpel_hv, 8, 8, 16, sse4); -mc_rep_funcs2(qpel_hv,8, 8, 4, 12, sse4); -mc_rep_funcs(qpel_hv,10, 8, 64, sse4); -mc_rep_funcs(qpel_hv,10, 8, 48, sse4); -mc_rep_funcs(qpel_hv,10, 8, 32, sse4); -mc_rep_funcs(qpel_hv,10, 8, 24, sse4); -mc_rep_funcs(qpel_hv,10, 8, 16, sse4); -mc_rep_funcs(qpel_hv,10, 4, 12, sse4); -mc_rep_funcs(qpel_hv,12, 8, 64, sse4); -mc_rep_funcs(qpel_hv,12, 8, 48, sse4); -mc_rep_funcs(qpel_hv,12, 8, 32, sse4); -mc_rep_funcs(qpel_hv,12, 8, 24, sse4); -mc_rep_funcs(qpel_hv,12, 8, 16, sse4); -mc_rep_funcs(qpel_hv,12, 4, 12, sse4); +#if ARCH_X86_64 && HAVE_SSSE3_EXTERNAL + +mc_rep_funcs(pel_pixels, 8, 16, 64, ssse3); +mc_rep_funcs(pel_pixels, 8, 16, 48, ssse3); +mc_rep_funcs(pel_pixels, 8, 16, 32, ssse3); +mc_rep_funcs(pel_pixels, 8, 8, 24, ssse3); +mc_rep_funcs(pel_pixels,10, 8, 64, ssse3); +mc_rep_funcs(pel_pixels,10, 8, 48, ssse3); +mc_rep_funcs(pel_pixels,10, 8, 32, ssse3); +mc_rep_funcs(pel_pixels,10, 8, 24, ssse3); +mc_rep_funcs(pel_pixels,10, 8, 16, ssse3); +mc_rep_funcs(pel_pixels,10, 4, 12, ssse3); +mc_rep_funcs(pel_pixels,12, 8, 64, ssse3); +mc_rep_funcs(pel_pixels,12, 8, 48, ssse3); +mc_rep_funcs(pel_pixels,12, 8, 32, ssse3); +mc_rep_funcs(pel_pixels,12, 8, 24, ssse3); +mc_rep_funcs(pel_pixels,12, 8, 16, ssse3); +mc_rep_funcs(pel_pixels,12, 4, 12, ssse3); + +mc_rep_funcs(epel_h, 8, 16, 64, ssse3); +mc_rep_funcs(epel_h, 8, 16, 48, ssse3); +mc_rep_funcs(epel_h, 8, 16, 32, ssse3); +mc_rep_funcs(epel_h, 8, 8, 24, ssse3); +mc_rep_funcs(epel_h,10, 8, 64, ssse3); +mc_rep_funcs(epel_h,10, 8, 48, ssse3); +mc_rep_funcs(epel_h,10, 8, 32, ssse3); +mc_rep_funcs(epel_h,10, 8, 24, ssse3); +mc_rep_funcs(epel_h,10, 8, 16, ssse3); +mc_rep_funcs(epel_h,10, 4, 12, ssse3); +mc_rep_funcs(epel_h,12, 8, 64, ssse3); +mc_rep_funcs(epel_h,12, 8, 48, ssse3); +mc_rep_funcs(epel_h,12, 8, 32, ssse3); 
+mc_rep_funcs(epel_h,12, 8, 24, ssse3); +mc_rep_funcs(epel_h,12, 8, 16, ssse3); +mc_rep_funcs(epel_h,12, 4, 12, ssse3); +mc_rep_funcs(epel_v, 8, 16, 64, ssse3); +mc_rep_funcs(epel_v, 8, 16, 48, ssse3); +mc_rep_funcs(epel_v, 8, 16, 32, ssse3); +mc_rep_funcs(epel_v, 8, 8, 24, ssse3); +mc_rep_funcs(epel_v,10, 8, 64, ssse3); +mc_rep_funcs(epel_v,10, 8, 48, ssse3); +mc_rep_funcs(epel_v,10, 8, 32, ssse3); +mc_rep_funcs(epel_v,10, 8, 24, ssse3); +mc_rep_funcs(epel_v,10, 8, 16, ssse3); +mc_rep_funcs(epel_v,10, 4, 12, ssse3); +mc_rep_funcs(epel_v,12, 8, 64, ssse3); +mc_rep_funcs(epel_v,12, 8, 48, ssse3); +mc_rep_funcs(epel_v,12, 8, 32, ssse3); +mc_rep_funcs(epel_v,12, 8, 24, ssse3); +mc_rep_funcs(epel_v,12, 8, 16, ssse3); +mc_rep_funcs(epel_v,12, 4, 12, ssse3); +mc_rep_funcs(epel_hv, 8, 8, 64, ssse3); +mc_rep_funcs(epel_hv, 8, 8, 48, ssse3); +mc_rep_funcs(epel_hv, 8, 8, 32, ssse3); +mc_rep_funcs(epel_hv, 8, 8, 24, ssse3); +mc_rep_funcs(epel_hv, 8, 8, 16, ssse3); +mc_rep_funcs2(epel_hv,8, 8, 4, 12, ssse3); +mc_rep_funcs(epel_hv,10, 8, 64, ssse3); +mc_rep_funcs(epel_hv,10, 8, 48, ssse3); +mc_rep_funcs(epel_hv,10, 8, 32, ssse3); +mc_rep_funcs(epel_hv,10, 8, 24, ssse3); +mc_rep_funcs(epel_hv,10, 8, 16, ssse3); +mc_rep_funcs(epel_hv,10, 4, 12, ssse3); +mc_rep_funcs(epel_hv,12, 8, 64, ssse3); +mc_rep_funcs(epel_hv,12, 8, 48, ssse3); +mc_rep_funcs(epel_hv,12, 8, 32, ssse3); +mc_rep_funcs(epel_hv,12, 8, 24, ssse3); +mc_rep_funcs(epel_hv,12, 8, 16, ssse3); +mc_rep_funcs(epel_hv,12, 4, 12, ssse3); + +mc_rep_funcs(qpel_h, 8, 16, 64, ssse3); +mc_rep_funcs(qpel_h, 8, 16, 48, ssse3); +mc_rep_funcs(qpel_h, 8, 16, 32, ssse3); +mc_rep_funcs(qpel_h, 8, 8, 24, ssse3); +mc_rep_funcs(qpel_h,10, 8, 64, ssse3); +mc_rep_funcs(qpel_h,10, 8, 48, ssse3); +mc_rep_funcs(qpel_h,10, 8, 32, ssse3); +mc_rep_funcs(qpel_h,10, 8, 24, ssse3); +mc_rep_funcs(qpel_h,10, 8, 16, ssse3); +mc_rep_funcs(qpel_h,10, 4, 12, ssse3); +mc_rep_funcs(qpel_h,12, 8, 64, ssse3); +mc_rep_funcs(qpel_h,12, 8, 48, ssse3); 
+mc_rep_funcs(qpel_h,12, 8, 32, ssse3); +mc_rep_funcs(qpel_h,12, 8, 24, ssse3); +mc_rep_funcs(qpel_h,12, 8, 16, ssse3); +mc_rep_funcs(qpel_h,12, 4, 12, ssse3); +mc_rep_funcs(qpel_v, 8, 16, 64, ssse3); +mc_rep_funcs(qpel_v, 8, 16, 48, ssse3); +mc_rep_funcs(qpel_v, 8, 16, 32, ssse3); +mc_rep_funcs(qpel_v, 8, 8, 24, ssse3); +mc_rep_funcs(qpel_v,10, 8, 64, ssse3); +mc_rep_funcs(qpel_v,10, 8, 48, ssse3); +mc_rep_funcs(qpel_v,10, 8, 32, ssse3); +mc_rep_funcs(qpel_v,10, 8, 24, ssse3); +mc_rep_funcs(qpel_v,10, 8, 16, ssse3); +mc_rep_funcs(qpel_v,10, 4, 12, ssse3); +mc_rep_funcs(qpel_v,12, 8, 64, ssse3); +mc_rep_funcs(qpel_v,12, 8, 48, ssse3); +mc_rep_funcs(qpel_v,12, 8, 32, ssse3); +mc_rep_funcs(qpel_v,12, 8, 24, ssse3); +mc_rep_funcs(qpel_v,12, 8, 16, ssse3); +mc_rep_funcs(qpel_v,12, 4, 12, ssse3); +mc_rep_funcs(qpel_hv, 8, 8, 64, ssse3); +mc_rep_funcs(qpel_hv, 8, 8, 48, ssse3); +mc_rep_funcs(qpel_hv, 8, 8, 32, ssse3); +mc_rep_funcs(qpel_hv, 8, 8, 24, ssse3); +mc_rep_funcs(qpel_hv, 8, 8, 16, ssse3); +mc_rep_funcs2(qpel_hv,8, 8, 4, 12, ssse3); +mc_rep_funcs(qpel_hv,10, 8, 64, ssse3); +mc_rep_funcs(qpel_hv,10, 8, 48, ssse3); +mc_rep_funcs(qpel_hv,10, 8, 32, ssse3); +mc_rep_funcs(qpel_hv,10, 8, 24, ssse3); +mc_rep_funcs(qpel_hv,10, 8, 16, ssse3); +mc_rep_funcs(qpel_hv,10, 4, 12, ssse3); +mc_rep_funcs(qpel_hv,12, 8, 64, ssse3); +mc_rep_funcs(qpel_hv,12, 8, 48, ssse3); +mc_rep_funcs(qpel_hv,12, 8, 32, ssse3); +mc_rep_funcs(qpel_hv,12, 8, 24, ssse3); +mc_rep_funcs(qpel_hv,12, 8, 16, ssse3); +mc_rep_funcs(qpel_hv,12, 4, 12, ssse3); #define mc_rep_uni_w(bitd, step, W, opt) \ void ff_hevc_put_hevc_uni_w##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride, int16_t *_src, ptrdiff_t _srcstride,\ @@ -299,26 +299,26 @@ void ff_hevc_put_hevc_uni_w##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststri } \ } -mc_rep_uni_w(8, 6, 12, sse4); -mc_rep_uni_w(8, 8, 16, sse4); -mc_rep_uni_w(8, 8, 24, sse4); -mc_rep_uni_w(8, 8, 32, sse4); -mc_rep_uni_w(8, 8, 48, sse4); -mc_rep_uni_w(8, 8, 64, 
sse4); - -mc_rep_uni_w(10, 6, 12, sse4); -mc_rep_uni_w(10, 8, 16, sse4); -mc_rep_uni_w(10, 8, 24, sse4); -mc_rep_uni_w(10, 8, 32, sse4); -mc_rep_uni_w(10, 8, 48, sse4); -mc_rep_uni_w(10, 8, 64, sse4); - -mc_rep_uni_w(12, 6, 12, sse4); -mc_rep_uni_w(12, 8, 16, sse4); -mc_rep_uni_w(12, 8, 24, sse4); -mc_rep_uni_w(12, 8, 32, sse4); -mc_rep_uni_w(12, 8, 48, sse4); -mc_rep_uni_w(12, 8, 64, sse4); +mc_rep_uni_w(8, 6, 12, ssse3); +mc_rep_uni_w(8, 8, 16, ssse3); +mc_rep_uni_w(8, 8, 24, ssse3); +mc_rep_uni_w(8, 8, 32, ssse3); +mc_rep_uni_w(8, 8, 48, ssse3); +mc_rep_uni_w(8, 8, 64, ssse3); + +mc_rep_uni_w(10, 6, 12, ssse3); +mc_rep_uni_w(10, 8, 16, ssse3); +mc_rep_uni_w(10, 8, 24, ssse3); +mc_rep_uni_w(10, 8, 32, ssse3); +mc_rep_uni_w(10, 8, 48, ssse3); +mc_rep_uni_w(10, 8, 64, ssse3); + +mc_rep_uni_w(12, 6, 12, ssse3); +mc_rep_uni_w(12, 8, 16, ssse3); +mc_rep_uni_w(12, 8, 24, ssse3); +mc_rep_uni_w(12, 8, 32, ssse3); +mc_rep_uni_w(12, 8, 48, ssse3); +mc_rep_uni_w(12, 8, 64, ssse3); #define mc_rep_bi_w(bitd, step, W, opt) \ void ff_hevc_put_hevc_bi_w##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride, int16_t *_src, ptrdiff_t _srcstride, \ @@ -338,26 +338,26 @@ void ff_hevc_put_hevc_bi_w##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststrid } \ } -mc_rep_bi_w(8, 6, 12, sse4); -mc_rep_bi_w(8, 8, 16, sse4); -mc_rep_bi_w(8, 8, 24, sse4); -mc_rep_bi_w(8, 8, 32, sse4); -mc_rep_bi_w(8, 8, 48, sse4); -mc_rep_bi_w(8, 8, 64, sse4); - -mc_rep_bi_w(10, 6, 12, sse4); -mc_rep_bi_w(10, 8, 16, sse4); -mc_rep_bi_w(10, 8, 24, sse4); -mc_rep_bi_w(10, 8, 32, sse4); -mc_rep_bi_w(10, 8, 48, sse4); -mc_rep_bi_w(10, 8, 64, sse4); - -mc_rep_bi_w(12, 6, 12, sse4); -mc_rep_bi_w(12, 8, 16, sse4); -mc_rep_bi_w(12, 8, 24, sse4); -mc_rep_bi_w(12, 8, 32, sse4); -mc_rep_bi_w(12, 8, 48, sse4); -mc_rep_bi_w(12, 8, 64, sse4); +mc_rep_bi_w(8, 6, 12, ssse3); +mc_rep_bi_w(8, 8, 16, ssse3); +mc_rep_bi_w(8, 8, 24, ssse3); +mc_rep_bi_w(8, 8, 32, ssse3); +mc_rep_bi_w(8, 8, 48, ssse3); +mc_rep_bi_w(8, 8, 64, 
ssse3); + +mc_rep_bi_w(10, 6, 12, ssse3); +mc_rep_bi_w(10, 8, 16, ssse3); +mc_rep_bi_w(10, 8, 24, ssse3); +mc_rep_bi_w(10, 8, 32, ssse3); +mc_rep_bi_w(10, 8, 48, ssse3); +mc_rep_bi_w(10, 8, 64, ssse3); + +mc_rep_bi_w(12, 6, 12, ssse3); +mc_rep_bi_w(12, 8, 16, ssse3); +mc_rep_bi_w(12, 8, 24, ssse3); +mc_rep_bi_w(12, 8, 32, ssse3); +mc_rep_bi_w(12, 8, 48, ssse3); +mc_rep_bi_w(12, 8, 64, ssse3); #define mc_uni_w_func(name, bitd, W, opt) \ static void put_hevc_uni_w_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t _dststride, \ @@ -381,41 +381,41 @@ static void put_hevc_uni_w_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t _ mc_uni_w_func(name, bitd, 48, opt); \ mc_uni_w_func(name, bitd, 64, opt) -mc_uni_w_funcs(pel_pixels, 8, sse4); -mc_uni_w_func(pel_pixels, 8, 6, sse4); -mc_uni_w_funcs(epel_h, 8, sse4); -mc_uni_w_func(epel_h, 8, 6, sse4); -mc_uni_w_funcs(epel_v, 8, sse4); -mc_uni_w_func(epel_v, 8, 6, sse4); -mc_uni_w_funcs(epel_hv, 8, sse4); -mc_uni_w_func(epel_hv, 8, 6, sse4); -mc_uni_w_funcs(qpel_h, 8, sse4); -mc_uni_w_funcs(qpel_v, 8, sse4); -mc_uni_w_funcs(qpel_hv, 8, sse4); - -mc_uni_w_funcs(pel_pixels, 10, sse4); -mc_uni_w_func(pel_pixels, 10, 6, sse4); -mc_uni_w_funcs(epel_h, 10, sse4); -mc_uni_w_func(epel_h, 10, 6, sse4); -mc_uni_w_funcs(epel_v, 10, sse4); -mc_uni_w_func(epel_v, 10, 6, sse4); -mc_uni_w_funcs(epel_hv, 10, sse4); -mc_uni_w_func(epel_hv, 10, 6, sse4); -mc_uni_w_funcs(qpel_h, 10, sse4); -mc_uni_w_funcs(qpel_v, 10, sse4); -mc_uni_w_funcs(qpel_hv, 10, sse4); - -mc_uni_w_funcs(pel_pixels, 12, sse4); -mc_uni_w_func(pel_pixels, 12, 6, sse4); -mc_uni_w_funcs(epel_h, 12, sse4); -mc_uni_w_func(epel_h, 12, 6, sse4); -mc_uni_w_funcs(epel_v, 12, sse4); -mc_uni_w_func(epel_v, 12, 6, sse4); -mc_uni_w_funcs(epel_hv, 12, sse4); -mc_uni_w_func(epel_hv, 12, 6, sse4); -mc_uni_w_funcs(qpel_h, 12, sse4); -mc_uni_w_funcs(qpel_v, 12, sse4); -mc_uni_w_funcs(qpel_hv, 12, sse4); +mc_uni_w_funcs(pel_pixels, 8, ssse3); +mc_uni_w_func(pel_pixels, 8, 6, ssse3); 
+mc_uni_w_funcs(epel_h, 8, ssse3); +mc_uni_w_func(epel_h, 8, 6, ssse3); +mc_uni_w_funcs(epel_v, 8, ssse3); +mc_uni_w_func(epel_v, 8, 6, ssse3); +mc_uni_w_funcs(epel_hv, 8, ssse3); +mc_uni_w_func(epel_hv, 8, 6, ssse3); +mc_uni_w_funcs(qpel_h, 8, ssse3); +mc_uni_w_funcs(qpel_v, 8, ssse3); +mc_uni_w_funcs(qpel_hv, 8, ssse3); + +mc_uni_w_funcs(pel_pixels, 10, ssse3); +mc_uni_w_func(pel_pixels, 10, 6, ssse3); +mc_uni_w_funcs(epel_h, 10, ssse3); +mc_uni_w_func(epel_h, 10, 6, ssse3); +mc_uni_w_funcs(epel_v, 10, ssse3); +mc_uni_w_func(epel_v, 10, 6, ssse3); +mc_uni_w_funcs(epel_hv, 10, ssse3); +mc_uni_w_func(epel_hv, 10, 6, ssse3); +mc_uni_w_funcs(qpel_h, 10, ssse3); +mc_uni_w_funcs(qpel_v, 10, ssse3); +mc_uni_w_funcs(qpel_hv, 10, ssse3); + +mc_uni_w_funcs(pel_pixels, 12, ssse3); +mc_uni_w_func(pel_pixels, 12, 6, ssse3); +mc_uni_w_funcs(epel_h, 12, ssse3); +mc_uni_w_func(epel_h, 12, 6, ssse3); +mc_uni_w_funcs(epel_v, 12, ssse3); +mc_uni_w_func(epel_v, 12, 6, ssse3); +mc_uni_w_funcs(epel_hv, 12, ssse3); +mc_uni_w_func(epel_hv, 12, 6, ssse3); +mc_uni_w_funcs(qpel_h, 12, ssse3); +mc_uni_w_funcs(qpel_v, 12, ssse3); +mc_uni_w_funcs(qpel_hv, 12, ssse3); #define mc_bi_w_func(name, bitd, W, opt) \ static void put_hevc_bi_w_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t _dststride, \ @@ -441,42 +441,42 @@ static void put_hevc_bi_w_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t _d mc_bi_w_func(name, bitd, 48, opt); \ mc_bi_w_func(name, bitd, 64, opt) -mc_bi_w_funcs(pel_pixels, 8, sse4); -mc_bi_w_func(pel_pixels, 8, 6, sse4); -mc_bi_w_funcs(epel_h, 8, sse4); -mc_bi_w_func(epel_h, 8, 6, sse4); -mc_bi_w_funcs(epel_v, 8, sse4); -mc_bi_w_func(epel_v, 8, 6, sse4); -mc_bi_w_funcs(epel_hv, 8, sse4); -mc_bi_w_func(epel_hv, 8, 6, sse4); -mc_bi_w_funcs(qpel_h, 8, sse4); -mc_bi_w_funcs(qpel_v, 8, sse4); -mc_bi_w_funcs(qpel_hv, 8, sse4); - -mc_bi_w_funcs(pel_pixels, 10, sse4); -mc_bi_w_func(pel_pixels, 10, 6, sse4); -mc_bi_w_funcs(epel_h, 10, sse4); -mc_bi_w_func(epel_h, 10, 6, sse4); 
-mc_bi_w_funcs(epel_v, 10, sse4); -mc_bi_w_func(epel_v, 10, 6, sse4); -mc_bi_w_funcs(epel_hv, 10, sse4); -mc_bi_w_func(epel_hv, 10, 6, sse4); -mc_bi_w_funcs(qpel_h, 10, sse4); -mc_bi_w_funcs(qpel_v, 10, sse4); -mc_bi_w_funcs(qpel_hv, 10, sse4); - -mc_bi_w_funcs(pel_pixels, 12, sse4); -mc_bi_w_func(pel_pixels, 12, 6, sse4); -mc_bi_w_funcs(epel_h, 12, sse4); -mc_bi_w_func(epel_h, 12, 6, sse4); -mc_bi_w_funcs(epel_v, 12, sse4); -mc_bi_w_func(epel_v, 12, 6, sse4); -mc_bi_w_funcs(epel_hv, 12, sse4); -mc_bi_w_func(epel_hv, 12, 6, sse4); -mc_bi_w_funcs(qpel_h, 12, sse4); -mc_bi_w_funcs(qpel_v, 12, sse4); -mc_bi_w_funcs(qpel_hv, 12, sse4); -#endif //ARCH_X86_64 && HAVE_SSE4_EXTERNAL +mc_bi_w_funcs(pel_pixels, 8, ssse3); +mc_bi_w_func(pel_pixels, 8, 6, ssse3); +mc_bi_w_funcs(epel_h, 8, ssse3); +mc_bi_w_func(epel_h, 8, 6, ssse3); +mc_bi_w_funcs(epel_v, 8, ssse3); +mc_bi_w_func(epel_v, 8, 6, ssse3); +mc_bi_w_funcs(epel_hv, 8, ssse3); +mc_bi_w_func(epel_hv, 8, 6, ssse3); +mc_bi_w_funcs(qpel_h, 8, ssse3); +mc_bi_w_funcs(qpel_v, 8, ssse3); +mc_bi_w_funcs(qpel_hv, 8, ssse3); + +mc_bi_w_funcs(pel_pixels, 10, ssse3); +mc_bi_w_func(pel_pixels, 10, 6, ssse3); +mc_bi_w_funcs(epel_h, 10, ssse3); +mc_bi_w_func(epel_h, 10, 6, ssse3); +mc_bi_w_funcs(epel_v, 10, ssse3); +mc_bi_w_func(epel_v, 10, 6, ssse3); +mc_bi_w_funcs(epel_hv, 10, ssse3); +mc_bi_w_func(epel_hv, 10, 6, ssse3); +mc_bi_w_funcs(qpel_h, 10, ssse3); +mc_bi_w_funcs(qpel_v, 10, ssse3); +mc_bi_w_funcs(qpel_hv, 10, ssse3); + +mc_bi_w_funcs(pel_pixels, 12, ssse3); +mc_bi_w_func(pel_pixels, 12, 6, ssse3); +mc_bi_w_funcs(epel_h, 12, ssse3); +mc_bi_w_func(epel_h, 12, 6, ssse3); +mc_bi_w_funcs(epel_v, 12, ssse3); +mc_bi_w_func(epel_v, 12, 6, ssse3); +mc_bi_w_funcs(epel_hv, 12, ssse3); +mc_bi_w_func(epel_hv, 12, 6, ssse3); +mc_bi_w_funcs(qpel_h, 12, ssse3); +mc_bi_w_funcs(qpel_v, 12, ssse3); +mc_bi_w_funcs(qpel_hv, 12, ssse3); +#endif //ARCH_X86_64 && HAVE_SSSE3_EXTERNAL #define EPEL_LINKS(pointer, my, mx, fname, bitd, opt ) \ @@ 
-528,18 +528,16 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth) if (EXTERNAL_SSSE3(cpu_flags) && ARCH_X86_64) { c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_ssse3; c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_ssse3; - } - if (EXTERNAL_SSE4(cpu_flags) && ARCH_X86_64) { - EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels, 8, sse4); - EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h, 8, sse4); - EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v, 8, sse4); - EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv, 8, sse4); + EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels, 8, ssse3); + EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h, 8, ssse3); + EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v, 8, ssse3); + EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv, 8, ssse3); - QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 8, sse4); - QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h, 8, sse4); - QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v, 8, sse4); - QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv, 8, sse4); + QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 8, ssse3); + QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h, 8, ssse3); + QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v, 8, ssse3); + QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv, 8, ssse3); } if (EXTERNAL_AVX(cpu_flags)) { c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_8_avx; @@ -581,17 +579,16 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth) if (EXTERNAL_SSSE3(cpu_flags) && ARCH_X86_64) { c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_ssse3; c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_ssse3; - } - if (EXTERNAL_SSE4(cpu_flags) && ARCH_X86_64) { - EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels, 10, sse4); - EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h, 10, sse4); - EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v, 10, sse4); - EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv, 10, sse4); - - QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 10, sse4); - QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h, 10, sse4); - 
QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v, 10, sse4); - QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv, 10, sse4); + + EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels, 10, ssse3); + EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h, 10, ssse3); + EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v, 10, ssse3); + EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv, 10, ssse3); + + QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 10, ssse3); + QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h, 10, ssse3); + QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v, 10, ssse3); + QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv, 10, ssse3); } if (EXTERNAL_AVX(cpu_flags)) { c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_10_avx; @@ -630,17 +627,16 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth) if (EXTERNAL_SSSE3(cpu_flags) && ARCH_X86_64) { c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_ssse3; c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_ssse3; - } - if (EXTERNAL_SSE4(cpu_flags) && ARCH_X86_64) { - EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels, 12, sse4); - EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h, 12, sse4); - EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v, 12, sse4); - EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv, 12, sse4); - - QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 12, sse4); - QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h, 12, sse4); - QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v, 12, sse4); - QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv, 12, sse4); + + EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels, 12, ssse3); + EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h, 12, ssse3); + EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v, 12, ssse3); + EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv, 12, ssse3); + + QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 12, ssse3); + QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h, 12, ssse3); + QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v, 12, ssse3); + QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv, 12, ssse3); } if (EXTERNAL_AVX(cpu_flags)) { c->hevc_v_loop_filter_chroma = 
ff_hevc_v_loop_filter_chroma_12_avx; -- 1.9.2.msysgit.0 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel