---
 libavcodec/x86/vp9dsp_init.h                |  6 ++
 libavcodec/x86/vp9dsp_init_16bpp_template.c | 21 ++++-
 libavcodec/x86/vp9itxfm.asm                 | 58 -----------------
 libavcodec/x86/vp9itxfm_16bpp.asm           | 96 ++++++++++++++++++++++-------
 libavcodec/x86/vp9itxfm_template.asm        | 58 +++++++++++++++++
 5 files changed, 157 insertions(+), 82 deletions(-)
diff --git a/libavcodec/x86/vp9dsp_init.h b/libavcodec/x86/vp9dsp_init.h
index 5d07b62..b3b0558 100644
--- a/libavcodec/x86/vp9dsp_init.h
+++ b/libavcodec/x86/vp9dsp_init.h
@@ -62,6 +62,12 @@ void cat(ff_vp9_##typea##_##typeb##_##size##x##size##_add_, bpp, _##opt)(uint8_t
                                                             int16_t *block, \
                                                             int eob)
 
+#define decl_itxfm_funcs(size, bpp, opt) \
+decl_itxfm_func(idct,  idct,  size, bpp, opt); \
+decl_itxfm_func(iadst, idct,  size, bpp, opt); \
+decl_itxfm_func(idct,  iadst, size, bpp, opt); \
+decl_itxfm_func(iadst, iadst, size, bpp, opt)
+
 #define mc_rep_func(avg, sz, hsz, hszb, dir, opt, type, f_sz, bpp) \
 static av_always_inline void \
 ff_vp9_##avg##_8tap_1d_##dir##_##sz##_##bpp##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
diff --git a/libavcodec/x86/vp9dsp_init_16bpp_template.c b/libavcodec/x86/vp9dsp_init_16bpp_template.c
index 6e12af3..93fc684 100644
--- a/libavcodec/x86/vp9dsp_init_16bpp_template.c
+++ b/libavcodec/x86/vp9dsp_init_16bpp_template.c
@@ -126,8 +126,11 @@ decl_ipred_fns(tm, BPC, mmxext, sse2);
 
 decl_itxfm_func(iwht, iwht, 4, BPC, mmxext);
 #if BPC == 10
-decl_itxfm_func(idct, idct, 4, BPC, mmxext);
-decl_itxfm_func(idct, idct, 4, BPC, ssse3);
+decl_itxfm_func(idct,  idct,  4, BPC, mmxext);
+decl_itxfm_func(idct,  iadst, 4, BPC, sse2);
+decl_itxfm_func(iadst, idct,  4, BPC, sse2);
+decl_itxfm_func(iadst, iadst, 4, BPC, sse2);
+decl_itxfm_funcs(4, BPC, ssse3);
 #endif
 #endif /* HAVE_YASM */
 
@@ -169,6 +172,11 @@ av_cold void INIT_FUNC(VP9DSPContext *dsp, int bitexact)
     init_itx_func(idx, ADST_DCT, typea, typeb, size, bpp, opt); \
     init_itx_func(idx, DCT_ADST, typea, typeb, size, bpp, opt); \
     init_itx_func(idx, ADST_ADST, typea, typeb, size, bpp, opt)
+#define init_itx_funcs(idx, size, bpp, opt) \
+    init_itx_func(idx, DCT_DCT, idct, idct, size, bpp, opt); \
+    init_itx_func(idx, ADST_DCT, idct, iadst, size, bpp, opt); \
+    init_itx_func(idx, DCT_ADST, iadst, idct, size, bpp, opt); \
+    init_itx_func(idx, ADST_ADST, iadst, iadst, size, bpp, opt); \
 
     if (EXTERNAL_MMXEXT(cpu_flags)) {
         init_ipred_func(tm, TM_VP8, 4, BPC, mmxext);
@@ -185,13 +193,20 @@ av_cold void INIT_FUNC(VP9DSPContext *dsp, int bitexact)
         init_subpel3(1, avg, BPC, sse2);
         init_lpf_funcs(BPC, sse2);
         init_8_16_32_ipred_funcs(tm, TM_VP8, BPC, sse2);
+#if BPC == 10
+        if (!bitexact) {
+            init_itx_func(TX_4X4, ADST_DCT, idct, iadst, 4, 10, sse2);
+            init_itx_func(TX_4X4, DCT_ADST, iadst, idct, 4, 10, sse2);
+            init_itx_func(TX_4X4, ADST_ADST, iadst, iadst, 4, 10, sse2);
+        }
+#endif
     }
 
     if (EXTERNAL_SSSE3(cpu_flags)) {
         init_lpf_funcs(BPC, ssse3);
 #if BPC == 10
         if (!bitexact) {
-            init_itx_func(TX_4X4, DCT_DCT, idct, idct, 4, 10, ssse3);
+            init_itx_funcs(TX_4X4, 4, BPC, ssse3);
         }
 #endif
     }
diff --git a/libavcodec/x86/vp9itxfm.asm b/libavcodec/x86/vp9itxfm.asm
index 200f15e..a3e0f86 100644
--- a/libavcodec/x86/vp9itxfm.asm
+++ b/libavcodec/x86/vp9itxfm.asm
@@ -289,64 +289,6 @@ IDCT_4x4_FN ssse3
 ; void vp9_iadst_iadst_4x4_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
 ;-------------------------------------------------------------------------------------------
 
-%macro VP9_IADST4_1D 0
-    movq2dq xmm0, m0
-    movq2dq xmm1, m1
-    movq2dq xmm2, m2
-    movq2dq xmm3, m3
-%if cpuflag(ssse3)
-    paddw m3, m0
-%endif
-    punpcklwd xmm0, xmm1
-    punpcklwd xmm2, xmm3
-    pmaddwd xmm1, xmm0, [pw_5283_13377]
-    pmaddwd xmm4, xmm0, [pw_9929_13377]
-%if notcpuflag(ssse3)
-    pmaddwd xmm6, xmm0, [pw_13377_0]
-%endif
-    pmaddwd xmm0, [pw_15212_m13377]
-    pmaddwd xmm3, xmm2, [pw_15212_9929]
-%if notcpuflag(ssse3)
-    pmaddwd xmm7, xmm2, [pw_m13377_13377]
-%endif
-    pmaddwd xmm2, [pw_m5283_m15212]
-%if cpuflag(ssse3)
-    psubw m3, m2
-%else
-    paddd xmm6, xmm7
-%endif
-    paddd xmm0, xmm2
-    paddd xmm3, xmm5
-    paddd xmm2, xmm5
-%if notcpuflag(ssse3)
-    paddd xmm6, xmm5
-%endif
-    paddd xmm1, xmm3
-    paddd xmm0, xmm3
-    paddd xmm4, xmm2
-    psrad xmm1, 14
-    psrad xmm0, 14
-    psrad xmm4, 14
-%if cpuflag(ssse3)
-    pmulhrsw m3, [pw_13377x2]   ; out2
-%else
-    psrad xmm6, 14
-%endif
-    packssdw xmm0, xmm0
-    packssdw xmm1, xmm1
-    packssdw xmm4, xmm4
-%if notcpuflag(ssse3)
-    packssdw xmm6, xmm6
-%endif
-    movdq2q m0, xmm0            ; out3
-    movdq2q m1, xmm1            ; out0
-    movdq2q m2, xmm4            ; out1
-%if notcpuflag(ssse3)
-    movdq2q m3, xmm6            ; out2
-%endif
-    SWAP 0, 1, 2, 3
-%endmacro
-
 %macro IADST4_FN 5
 INIT_MMX %5
 cglobal vp9_%1_%3_4x4_add, 3, 3, 0, dst, stride, block, eob
diff --git a/libavcodec/x86/vp9itxfm_16bpp.asm b/libavcodec/x86/vp9itxfm_16bpp.asm
index e067438..cf7c5d6 100644
--- a/libavcodec/x86/vp9itxfm_16bpp.asm
+++ b/libavcodec/x86/vp9itxfm_16bpp.asm
@@ -38,6 +38,15 @@ pw_m15137_6270: times 4 dw -15137, 6270
 pw_6270_15137: times 4 dw 6270, 15137
 pw_11585x2: times 8 dw 11585*2
 
+pw_5283_13377: times 4 dw 5283, 13377
+pw_9929_13377: times 4 dw 9929, 13377
+pw_15212_m13377: times 4 dw 15212, -13377
+pw_15212_9929: times 4 dw 15212, 9929
+pw_m5283_m15212: times 4 dw -5283, -15212
+pw_13377x2: times 8 dw 13377*2
+pw_m13377_13377: times 4 dw -13377, 13377
+pw_13377_0: times 4 dw 13377, 0
+
 SECTION .text
 
 %macro VP9_STORE_2X 6-7 dstq ; reg1, reg2, tmp1, tmp2, min, max, dst
@@ -129,6 +138,30 @@ IWHT4_FN 10, 1023
 INIT_MMX mmxext
 IWHT4_FN 12, 4095
 
+%macro VP9_IDCT4_WRITEOUT 0
+%if cpuflag(ssse3)
+    mova m5, [pw_2048]
+    pmulhrsw m0, m5
+    pmulhrsw m1, m5
+    pmulhrsw m2, m5
+    pmulhrsw m3, m5
+%else
+    mova m5, [pw_8]
+    paddw m0, m5
+    paddw m1, m5
+    paddw m2, m5
+    paddw m3, m5
+    psraw m0, 4
+    psraw m1, 4
+    psraw m2, 4
+    psraw m3, 4
+%endif
+    mova m5, [pw_1023]
+    VP9_STORE_2X 0, 1, 6, 7, 4, 5
+    lea dstq, [dstq+2*strideq]
+    VP9_STORE_2X 2, 3, 6, 7, 4, 5
+%endmacro
+
 ; 4x4 coefficients are 5+depth+sign bits, so for 10bpp, everything still fits
 ; in 15+1 words without additional effort, since the coefficients are 15bpp.
 
@@ -186,27 +219,7 @@ cglobal vp9_idct_idct_4x4_add_10, 4, 4, 8, dst, stride, block, eob
 
     pxor m4, m4
     ZERO_BLOCK blockq, 16, 4, m4
-%if cpuflag(ssse3)
-    mova m5, [pw_2048]
-    pmulhrsw m0, m5
-    pmulhrsw m1, m5
-    pmulhrsw m2, m5
-    pmulhrsw m3, m5
-%else
-    mova m5, [pw_8]
-    paddw m0, m5
-    paddw m1, m5
-    paddw m2, m5
-    paddw m3, m5
-    psraw m0, 4
-    psraw m1, 4
-    psraw m2, 4
-    psraw m3, 4
-%endif
-    mova m5, [pw_1023]
-    VP9_STORE_2X 0, 1, 6, 7, 4, 5
-    lea dstq, [dstq+2*strideq]
-    VP9_STORE_2X 2, 3, 6, 7, 4, 5
+    VP9_IDCT4_WRITEOUT
     RET
 %endmacro
 
@@ -214,3 +227,44 @@ INIT_MMX mmxext
 IDCT4_10_FN
 INIT_MMX ssse3
 IDCT4_10_FN
+
+%macro IADST4_FN 4
+cglobal vp9_%1_%3_4x4_add_10, 3, 3, 0, dst, stride, block, eob
+%if WIN64 && notcpuflag(ssse3)
+    WIN64_SPILL_XMM 8
+%endif
+    movdqa xmm5, [pd_8192]
+    mova m0, [blockq+0*16+0]
+    mova m1, [blockq+1*16+0]
+    packssdw m0, [blockq+0*16+8]
+    packssdw m1, [blockq+1*16+8]
+    mova m2, [blockq+2*16+0]
+    mova m3, [blockq+3*16+0]
+    packssdw m2, [blockq+2*16+8]
+    packssdw m3, [blockq+3*16+8]
+
+%if cpuflag(ssse3)
+    mova m6, [pw_11585x2]
+%endif
+%ifnidn %1%3, iadstiadst
+    movdq2q m7, xmm5
+%endif
+    VP9_%2_1D
+    TRANSPOSE4x4W 0, 1, 2, 3, 4
+    VP9_%4_1D
+
+    pxor m4, m4
+    ZERO_BLOCK blockq, 16, 4, m4
+    VP9_IDCT4_WRITEOUT
+    RET
+%endmacro
+
+INIT_MMX sse2
+IADST4_FN idct, IDCT4, iadst, IADST4
+IADST4_FN iadst, IADST4, idct, IDCT4
+IADST4_FN iadst, IADST4, iadst, IADST4
+
+INIT_MMX ssse3
+IADST4_FN idct, IDCT4, iadst, IADST4
+IADST4_FN iadst, IADST4, idct, IDCT4
+IADST4_FN iadst, IADST4, iadst, IADST4
diff --git a/libavcodec/x86/vp9itxfm_template.asm b/libavcodec/x86/vp9itxfm_template.asm
index f1a05a5..d2f2257 100644
--- a/libavcodec/x86/vp9itxfm_template.asm
+++ b/libavcodec/x86/vp9itxfm_template.asm
@@ -82,3 +82,61 @@
     VP9_UNPACK_MULSUB_2W_4X 1, 3, 15137, 6270, m7, 4, 5 ; m1=t2, m3=t3
     VP9_IDCT4_1D_FINALIZE
 %endmacro
+
+%macro VP9_IADST4_1D 0
+    movq2dq xmm0, m0
+    movq2dq xmm1, m1
+    movq2dq xmm2, m2
+    movq2dq xmm3, m3
+%if cpuflag(ssse3)
+    paddw m3, m0
+%endif
+    punpcklwd xmm0, xmm1
+    punpcklwd xmm2, xmm3
+    pmaddwd xmm1, xmm0, [pw_5283_13377]
+    pmaddwd xmm4, xmm0, [pw_9929_13377]
+%if notcpuflag(ssse3)
+    pmaddwd xmm6, xmm0, [pw_13377_0]
+%endif
+    pmaddwd xmm0, [pw_15212_m13377]
+    pmaddwd xmm3, xmm2, [pw_15212_9929]
+%if notcpuflag(ssse3)
+    pmaddwd xmm7, xmm2, [pw_m13377_13377]
+%endif
+    pmaddwd xmm2, [pw_m5283_m15212]
+%if cpuflag(ssse3)
+    psubw m3, m2
+%else
+    paddd xmm6, xmm7
+%endif
+    paddd xmm0, xmm2
+    paddd xmm3, xmm5
+    paddd xmm2, xmm5
+%if notcpuflag(ssse3)
+    paddd xmm6, xmm5
+%endif
+    paddd xmm1, xmm3
+    paddd xmm0, xmm3
+    paddd xmm4, xmm2
+    psrad xmm1, 14
+    psrad xmm0, 14
+    psrad xmm4, 14
+%if cpuflag(ssse3)
+    pmulhrsw m3, [pw_13377x2]   ; out2
+%else
+    psrad xmm6, 14
+%endif
+    packssdw xmm0, xmm0
+    packssdw xmm1, xmm1
+    packssdw xmm4, xmm4
+%if notcpuflag(ssse3)
+    packssdw xmm6, xmm6
+%endif
+    movdq2q m0, xmm0            ; out3
+    movdq2q m1, xmm1            ; out0
+    movdq2q m2, xmm4            ; out1
+%if notcpuflag(ssse3)
+    movdq2q m3, xmm6            ; out2
+%endif
+    SWAP 0, 1, 2, 3
+%endmacro
-- 
2.1.2