This is not a clean equivalent of the openhevc patch, as we don't have
the same history on hevc_mc.

-- 
Christophe
From 0eedfb7d5902cc388d018111c670cc31ad22db60 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micka=C3=ABl=20Raulet?= <mrau...@insa-rennes.fr>
Date: Thu, 31 Jul 2014 19:26:57 +0200
Subject: [PATCH] x86: hevc_mc: remove unnecessary moves

Signed-off-by: Christophe Gisquet <christophe.gisq...@gmail.com>
---
 libavcodec/x86/hevc_mc.asm | 48 +++++++++++++++++-----------------------------
 1 file changed, 18 insertions(+), 30 deletions(-)

diff --git a/libavcodec/x86/hevc_mc.asm b/libavcodec/x86/hevc_mc.asm
index 42fe65a..99f8c0a 100644
--- a/libavcodec/x86/hevc_mc.asm
+++ b/libavcodec/x86/hevc_mc.asm
@@ -81,20 +81,6 @@ QPEL_TABLE 12, 4, w, sse4
 
 %if ARCH_X86_64
 
-%macro SIMPLE_BILOAD 4   ;width, tab, r1, r2
-%if %1 <= 4
-    movq              %3, [%2]                                              ; load data from source2
-%elif %1 <= 8
-    movdqa            %3, [%2]                                              ; load data from source2
-%elif %1 <= 12
-    movdqa            %3, [%2]                                              ; load data from source2
-    movq              %4, [%2+16]                                           ; load data from source2
-%else
-    movdqa            %3, [%2]                                              ; load data from source2
-    movdqa            %4, [%2+16]                                           ; load data from source2
-%endif
-%endmacro
-
 %macro SIMPLE_LOAD 4    ;width, bitd, tab, r1
 %if %1 == 2 || (%2 == 8 && %1 <= 4)
     movd              %4, [%3]                                               ; load data from source
@@ -505,11 +491,20 @@ QPEL_TABLE 12, 4, w, sse4
 %endif
 %endmacro
 
-%macro BI_COMPUTE 7     ; width, bitd, src1l, src1h, scr2l, scr2h, pw
+%macro SIMPLE_BILOAD 5   ; width, tab (source2 address), acc_lo, acc_hi, tmp -- adds source2 into acc_lo/acc_hi with signed saturation
+%if %1 <= 4
+    movq              %5, [%2]                                              ; width <= 4: fetch only 8 bytes of source2 into tmp
     paddsw            %3, %5
+%else
+    paddsw            %3, [%2]                                              ; add source2 straight from memory, no temporary register (NOTE(review): assumes [%2] is 16-byte aligned, as the former movdqa load did -- confirm)
 %if %1 > 8
-    paddsw            %4, %6
+    paddsw            %4, [%2+16]                                           ; width > 8: add the next 16 bytes of source2 into acc_hi
+%endif
 %endif
+%endmacro
+
+%macro BI_COMPUTE 7     ; width, bitd, src1l, src1h, tmp (scratch reg for width <= 4), src2 address, pw
+    SIMPLE_BILOAD     %1, %6, %3, %4, %5
     UNI_COMPUTE       %1, %2, %3, %4, %7
 %endmacro
 
@@ -562,9 +557,8 @@ cglobal hevc_put_hevc_bi_pel_pixels%1_%2, 6, 6, 6, dst, dststride, src, srcstrid
     movdqa            m5, [pw_bi_%2]
 .loop
     SIMPLE_LOAD       %1, %2, srcq, m0
-    SIMPLE_BILOAD     %1, src2q, m3, m4
     MC_PIXEL_COMPUTE  %1, %2
-    BI_COMPUTE        %1, %2, m0, m1, m3, m4, m5
+    BI_COMPUTE        %1, %2, m0, m1, m3, src2q, m5
     PEL_%2STORE%1   dstq, m0, m1
     add             dstq, dststrideq             ; dst += dststride
     add             srcq, srcstrideq             ; src += srcstride
@@ -616,8 +610,7 @@ cglobal hevc_put_hevc_bi_epel_h%1_%2, 7, 8, 11, dst, dststride, src, srcstride,
 .loop
     EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
     EPEL_COMPUTE      %2, %1, m4, m5
-    SIMPLE_BILOAD     %1, src2q, m2, m3
-    BI_COMPUTE        %1, %2, m0, m1, m2, m3, m6
+    BI_COMPUTE        %1, %2, m0, m1, m2, src2q, m6
     PEL_%2STORE%1   dstq, m0, m1
     add             dstq, dststrideq             ; dst += dststride
     add             srcq, srcstrideq             ; src += srcstride
@@ -669,8 +662,7 @@ cglobal hevc_put_hevc_bi_epel_v%1_%2, 8, 9, 11, dst, dststride, src, srcstride,
 .loop
     EPEL_LOAD         %2, srcq, srcstride, %1
     EPEL_COMPUTE      %2, %1, m4, m5
-    SIMPLE_BILOAD     %1, src2q, m2, m3
-    BI_COMPUTE        %1, %2, m0, m1, m2, m3, m6
+    BI_COMPUTE        %1, %2, m0, m1, m2, src2q, m6
     PEL_%2STORE%1   dstq, m0, m1
     add             dstq, dststrideq             ; dst += dststride
     add             srcq, srcstrideq             ; src += srcstride
@@ -788,8 +780,7 @@ cglobal hevc_put_hevc_bi_epel_hv%1_%2, 8, 10, 16, dst, dststride, src, srcstride
     punpckhwd         m3, m6, m7
 %endif
     EPEL_COMPUTE      14, %1, m12, m13
-    SIMPLE_BILOAD     %1, src2q, m8, m9
-    BI_COMPUTE        %1, %2, m0, m1, m8, m9, [pw_bi_%2]
+    BI_COMPUTE        %1, %2, m0, m1, m8, src2q, [pw_bi_%2]
     PEL_%2STORE%1   dstq, m0, m1
     movdqa            m4, m5
     movdqa            m5, m6
@@ -847,8 +838,7 @@ cglobal hevc_put_hevc_bi_qpel_h%1_%2, 7, 8, 16 , dst, dststride, src, srcstride,
 %if %2 > 8
     packssdw          m0, m1
 %endif
-    SIMPLE_BILOAD     %1, src2q, m10, m11
-    BI_COMPUTE        %1, %2, m0, m1, m10, m11, m9
+    BI_COMPUTE        %1, %2, m0, m1, m10, src2q, m9
     PEL_%2STORE%1   dstq, m0, m1
     add             dstq, dststrideq             ; dst += dststride
     add             srcq, srcstrideq             ; src += srcstride
@@ -900,13 +890,12 @@ cglobal hevc_put_hevc_bi_qpel_v%1_%2, 8, 10, 16, dst, dststride, src, srcstride,
     lea           r3srcq, [srcstrideq*3]
     QPEL_FILTER       %2, my
 .loop
-    SIMPLE_BILOAD     %1, src2q, m10, m11
     QPEL_V_LOAD       %2, srcq, srcstride, %1, r9
     QPEL_COMPUTE      %1, %2
 %if %2 > 8
     packssdw          m0, m1
 %endif
-    BI_COMPUTE        %1, %2, m0, m1, m10, m11, m9
+    BI_COMPUTE        %1, %2, m0, m1, m10, src2q, m9
     PEL_%2STORE%1   dstq, m0, m1
     add             dstq, dststrideq             ; dst += dststride
     add             srcq, srcstrideq             ; src += srcstride
@@ -1120,8 +1109,7 @@ cglobal hevc_put_hevc_bi_qpel_hv%1_%2, 8, 10, 16, dst, dststride, src, srcstride
     punpckhwd         m7, m14, m15
 %endif
     QPEL_HV_COMPUTE   %1, 14, my, ackssdw
-    SIMPLE_BILOAD     %1, src2q, m8, m9 ;m9 not used in this case
-    BI_COMPUTE        %1, %2, m0, m1, m8, m9, [pw_bi_%2]
+    BI_COMPUTE        %1, %2, m0, m1, m8, src2q, [pw_bi_%2]
     PEL_%2STORE%1   dstq, m0, m1
 
 %if %1 <= 4
-- 
1.9.2.msysgit.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel

Reply via email to