This is not a clean equivalent of the OpenHEVC patch, as we don't have the same history on hevc_mc.asm.
-- Christophe
From 0eedfb7d5902cc388d018111c670cc31ad22db60 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micka=C3=ABl=20Raulet?= <mrau...@insa-rennes.fr> Date: Thu, 31 Jul 2014 19:26:57 +0200 Subject: [PATCH] x86: hevc_mc: remove non necessary moves Signed-off-by: Christophe Gisquet <christophe.gisq...@gmail.com> --- libavcodec/x86/hevc_mc.asm | 48 +++++++++++++++++----------------------------- 1 file changed, 18 insertions(+), 30 deletions(-) diff --git a/libavcodec/x86/hevc_mc.asm b/libavcodec/x86/hevc_mc.asm index 42fe65a..99f8c0a 100644 --- a/libavcodec/x86/hevc_mc.asm +++ b/libavcodec/x86/hevc_mc.asm @@ -81,20 +81,6 @@ QPEL_TABLE 12, 4, w, sse4 %if ARCH_X86_64 -%macro SIMPLE_BILOAD 4 ;width, tab, r1, r2 -%if %1 <= 4 - movq %3, [%2] ; load data from source2 -%elif %1 <= 8 - movdqa %3, [%2] ; load data from source2 -%elif %1 <= 12 - movdqa %3, [%2] ; load data from source2 - movq %4, [%2+16] ; load data from source2 -%else - movdqa %3, [%2] ; load data from source2 - movdqa %4, [%2+16] ; load data from source2 -%endif -%endmacro - %macro SIMPLE_LOAD 4 ;width, bitd, tab, r1 %if %1 == 2 || (%2 == 8 && %1 <= 4) movd %4, [%3] ; load data from source @@ -505,11 +491,20 @@ QPEL_TABLE 12, 4, w, sse4 %endif %endmacro -%macro BI_COMPUTE 7 ; width, bitd, src1l, src1h, scr2l, scr2h, pw +%macro SIMPLE_BILOAD 5 ;width, tab, r1, r2, tmp +%if %1 <= 4 + movq %5, [%2] ; load data from source2 paddsw %3, %5 +%else + paddsw %3, [%2] %if %1 > 8 - paddsw %4, %6 + paddsw %4, [%2+16] ; load data from source2 +%endif %endif +%endmacro + +%macro BI_COMPUTE 7 ; width, bitd, src1l, src1h, tmp, src2, pw + SIMPLE_BILOAD %1, %6, %3, %4, %5 UNI_COMPUTE %1, %2, %3, %4, %7 %endmacro @@ -562,9 +557,8 @@ cglobal hevc_put_hevc_bi_pel_pixels%1_%2, 6, 6, 6, dst, dststride, src, srcstrid movdqa m5, [pw_bi_%2] .loop SIMPLE_LOAD %1, %2, srcq, m0 - SIMPLE_BILOAD %1, src2q, m3, m4 MC_PIXEL_COMPUTE %1, %2 - BI_COMPUTE %1, %2, m0, m1, m3, m4, m5 + BI_COMPUTE %1, %2, m0, m1, m3, src2q, m5 PEL_%2STORE%1 dstq, m0, m1 add dstq, 
dststrideq ; dst += dststride add srcq, srcstrideq ; src += srcstride @@ -616,8 +610,7 @@ cglobal hevc_put_hevc_bi_epel_h%1_%2, 7, 8, 11, dst, dststride, src, srcstride, .loop EPEL_LOAD %2, srcq-%%stride, %%stride, %1 EPEL_COMPUTE %2, %1, m4, m5 - SIMPLE_BILOAD %1, src2q, m2, m3 - BI_COMPUTE %1, %2, m0, m1, m2, m3, m6 + BI_COMPUTE %1, %2, m0, m1, m2, src2q, m6 PEL_%2STORE%1 dstq, m0, m1 add dstq, dststrideq ; dst += dststride add srcq, srcstrideq ; src += srcstride @@ -669,8 +662,7 @@ cglobal hevc_put_hevc_bi_epel_v%1_%2, 8, 9, 11, dst, dststride, src, srcstride, .loop EPEL_LOAD %2, srcq, srcstride, %1 EPEL_COMPUTE %2, %1, m4, m5 - SIMPLE_BILOAD %1, src2q, m2, m3 - BI_COMPUTE %1, %2, m0, m1, m2, m3, m6 + BI_COMPUTE %1, %2, m0, m1, m2, src2q, m6 PEL_%2STORE%1 dstq, m0, m1 add dstq, dststrideq ; dst += dststride add srcq, srcstrideq ; src += srcstride @@ -788,8 +780,7 @@ cglobal hevc_put_hevc_bi_epel_hv%1_%2, 8, 10, 16, dst, dststride, src, srcstride punpckhwd m3, m6, m7 %endif EPEL_COMPUTE 14, %1, m12, m13 - SIMPLE_BILOAD %1, src2q, m8, m9 - BI_COMPUTE %1, %2, m0, m1, m8, m9, [pw_bi_%2] + BI_COMPUTE %1, %2, m0, m1, m8, src2q, [pw_bi_%2] PEL_%2STORE%1 dstq, m0, m1 movdqa m4, m5 movdqa m5, m6 @@ -847,8 +838,7 @@ cglobal hevc_put_hevc_bi_qpel_h%1_%2, 7, 8, 16 , dst, dststride, src, srcstride, %if %2 > 8 packssdw m0, m1 %endif - SIMPLE_BILOAD %1, src2q, m10, m11 - BI_COMPUTE %1, %2, m0, m1, m10, m11, m9 + BI_COMPUTE %1, %2, m0, m1, m10, src2q, m9 PEL_%2STORE%1 dstq, m0, m1 add dstq, dststrideq ; dst += dststride add srcq, srcstrideq ; src += srcstride @@ -900,13 +890,12 @@ cglobal hevc_put_hevc_bi_qpel_v%1_%2, 8, 10, 16, dst, dststride, src, srcstride, lea r3srcq, [srcstrideq*3] QPEL_FILTER %2, my .loop - SIMPLE_BILOAD %1, src2q, m10, m11 QPEL_V_LOAD %2, srcq, srcstride, %1, r9 QPEL_COMPUTE %1, %2 %if %2 > 8 packssdw m0, m1 %endif - BI_COMPUTE %1, %2, m0, m1, m10, m11, m9 + BI_COMPUTE %1, %2, m0, m1, m10, src2q, m9 PEL_%2STORE%1 dstq, m0, m1 add dstq, dststrideq ; dst 
+= dststride add srcq, srcstrideq ; src += srcstride @@ -1120,8 +1109,7 @@ cglobal hevc_put_hevc_bi_qpel_hv%1_%2, 8, 10, 16, dst, dststride, src, srcstride punpckhwd m7, m14, m15 %endif QPEL_HV_COMPUTE %1, 14, my, ackssdw - SIMPLE_BILOAD %1, src2q, m8, m9 ;m9 not used in this case - BI_COMPUTE %1, %2, m0, m1, m8, m9, [pw_bi_%2] + BI_COMPUTE %1, %2, m0, m1, m8, src2q, [pw_bi_%2] PEL_%2STORE%1 dstq, m0, m1 %if %1 <= 4 -- 1.9.2.msysgit.0
_______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel