2015-02-16 10:43 GMT+01:00 Christophe Gisquet <christophe.gisq...@gmail.com>:
> Obviously I shouldn't unconditionally use r3srcq or equivalent, as
> !PIC just directly access the %%table
> I probably need to define an intermediate, say TABLE, which is either
> r3srcq or %%table, and use it for loading the xmm regs.

The newly attached patch might work, but it is a shot in the dark.

I'll send the following ones if this one is fine.

-- 
Christophe
From 96a5a2e6ceab9350468ea2697c2cf05d5b88cac6 Mon Sep 17 00:00:00 2001
From: Christophe Gisquet <christophe.gisq...@gmail.com>
Date: Sat, 7 Feb 2015 18:49:38 +0000
Subject: [PATCH 1/3] x86: hevc_mc: save 1 gpr in epel filter loading

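The 3*stride value stored in r3src is not needed until after the filter
coefficients have been loaded, so it can be computed much later. Use r3src
as the temporary holding the filter table address instead of a dedicated
gpr when possible.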
---
 libavcodec/x86/hevc_mc.asm | 71 +++++++++++++++++++++++-----------------------
 1 file changed, 35 insertions(+), 36 deletions(-)

diff --git a/libavcodec/x86/hevc_mc.asm b/libavcodec/x86/hevc_mc.asm
index 9a7367a..6ef8a60 100644
--- a/libavcodec/x86/hevc_mc.asm
+++ b/libavcodec/x86/hevc_mc.asm
@@ -136,20 +136,22 @@ QPEL_TABLE 10, 8, w, avx2
 %endmacro
 
 
-%macro EPEL_FILTER 2-4                            ; bit depth, filter index
+%macro EPEL_FILTER 5 ; bit depth, filter index, xmma, xmmb, gprtmp
 %if cpuflag(avx2)
 %assign %%offset 32
 %ifdef PIC
-    lea         rfilterq, [hevc_epel_filters_avx2_%1]
+    lea              %5q, [hevc_epel_filters_avx2_%1]
+    %define FILTER %5q
 %else
-    %define rfilterq hevc_epel_filters_avx2_%1
+    %define FILTER hevc_epel_filters_avx2_%1
 %endif
 %else
 %assign %%offset 16
 %ifdef PIC
-    lea         rfilterq, [hevc_epel_filters_sse4_%1]
+    lea              %5q, [hevc_epel_filters_sse4_%1]
+    %define FILTER %5q
 %else
-    %define rfilterq hevc_epel_filters_sse4_%1
+    %define FILTER hevc_epel_filters_sse4_%1
 %endif
 %endif ;cpuflag(avx2)
     sub              %2q, 1
@@ -158,13 +160,8 @@ QPEL_TABLE 10, 8, w, avx2
   %else
     shl              %2q, 5                      ; multiply by 32
 %endif
-%if %0 == 2
-    mova           m14, [rfilterq + %2q]        ; get 2 first values of filters
-    mova           m15, [rfilterq + %2q+%%offset]     ; get 2 last values of filters
-%else
-    mova           %3, [rfilterq + %2q]        ; get 2 first values of filters
-    mova           %4, [rfilterq + %2q+%%offset]     ; get 2 last values of filters
-%endif
+    mova           %3, [FILTER + %2q]        ; get 2 first values of filters
+    mova           %4, [FILTER + %2q+%%offset]     ; get 2 last values of filters
 %endmacro
 
 %macro EPEL_HV_FILTER 1
@@ -179,17 +176,17 @@ QPEL_TABLE 10, 8, w, avx2
 %endif
 
 %ifdef PIC
-    lea         rfilterq, [%%table]
+    lea           r3srcq, [%%table]
+    %define FILTER r3srcq
 %else
-    %define rfilterq %%table
+    %define FILTER %%table
 %endif
     sub              mxq, 1
     sub              myq, 1
     shl              mxq, %%shift                ; multiply by 32
     shl              myq, %%shift                ; multiply by 32
-    mova             m14, [rfilterq + mxq]        ; get 2 first values of filters
-    mova             m15, [rfilterq + mxq+%%offset]     ; get 2 last values of filters
-    lea           r3srcq, [srcstrideq*3]
+    mova             m14, [FILTER + mxq]        ; get 2 first values of filters
+    mova             m15, [FILTER + mxq+%%offset]     ; get 2 last values of filters
 
 %if cpuflag(avx2)
 %define %%table  hevc_epel_filters_avx2_10
@@ -197,12 +194,14 @@ QPEL_TABLE 10, 8, w, avx2
 %define %%table  hevc_epel_filters_sse4_10
 %endif
 %ifdef PIC
-    lea         rfilterq, [%%table]
+    lea           r3srcq, [%%table]
+    %define FILTER r3srcq
 %else
-    %define rfilterq %%table
+    %define FILTER %%table
 %endif
-    mova             m12, [rfilterq + myq]        ; get 2 first values of filters
-    mova             m13, [rfilterq + myq+%%offset]     ; get 2 last values of filters
+    mova             m12, [FILTER + myq]        ; get 2 first values of filters
+    mova             m13, [FILTER + myq+%%offset]     ; get 2 last values of filters
+    lea           r3srcq, [srcstrideq*3]
 %endmacro
 
 %macro QPEL_FILTER 2
@@ -733,7 +732,7 @@ cglobal hevc_put_hevc_bi_pel_pixels%1_%2, 6, 6, 6, dst, dststride, src, srcstrid
 %macro HEVC_PUT_HEVC_EPEL 2
 cglobal hevc_put_hevc_epel_h%1_%2, 5, 6, 11, dst, src, srcstride, height, mx, rfilter
 %assign %%stride ((%2 + 7)/8)
-    EPEL_FILTER       %2, mx, m4, m5
+    EPEL_FILTER       %2, mx, m4, m5, rfilter
 .loop
     EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
     EPEL_COMPUTE      %2, %1, m4, m5, 1
@@ -744,7 +743,7 @@ cglobal hevc_put_hevc_epel_h%1_%2, 5, 6, 11, dst, src, srcstride, height, mx, rf
 cglobal hevc_put_hevc_uni_epel_h%1_%2, 6, 7, 11, dst, dststride, src, srcstride, height, mx, rfilter
 %assign %%stride ((%2 + 7)/8)
     movdqa            m6, [pw_%2]
-    EPEL_FILTER       %2, mx, m4, m5
+    EPEL_FILTER       %2, mx, m4, m5, rfilter
 .loop
     EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
     EPEL_COMPUTE      %2, %1, m4, m5
@@ -758,7 +757,7 @@ cglobal hevc_put_hevc_uni_epel_h%1_%2, 6, 7, 11, dst, dststride, src, srcstride,
 
 cglobal hevc_put_hevc_bi_epel_h%1_%2, 7, 8, 11, dst, dststride, src, srcstride, src2, height, mx, rfilter
     movdqa            m6, [pw_bi_%2]
-    EPEL_FILTER       %2, mx, m4, m5
+    EPEL_FILTER       %2, mx, m4, m5, rfilter
 .loop
     EPEL_LOAD         %2, srcq-%%stride, %%stride, %1
     EPEL_COMPUTE      %2, %1, m4, m5, 1
@@ -778,11 +777,11 @@ cglobal hevc_put_hevc_bi_epel_h%1_%2, 7, 8, 11, dst, dststride, src, srcstride,
 ;                      int height, int mx, int my, int width)
 ; ******************************
 
-cglobal hevc_put_hevc_epel_v%1_%2, 4, 7, 11, dst, src, srcstride, height, r3src, my, rfilter
+cglobal hevc_put_hevc_epel_v%1_%2, 4, 6, 11, dst, src, srcstride, height, r3src, my
     movifnidn        myd, mym
-    lea           r3srcq, [srcstrideq*3]
     sub             srcq, srcstrideq
-    EPEL_FILTER       %2, my, m4, m5
+    EPEL_FILTER       %2, my, m4, m5, r3src
+    lea           r3srcq, [srcstrideq*3]
 .loop
     EPEL_LOAD         %2, srcq, srcstride, %1
     EPEL_COMPUTE      %2, %1, m4, m5, 1
@@ -790,12 +789,12 @@ cglobal hevc_put_hevc_epel_v%1_%2, 4, 7, 11, dst, src, srcstride, height, r3src,
     LOOP_END          dst, src, srcstride
     RET
 
-cglobal hevc_put_hevc_uni_epel_v%1_%2, 5, 8, 11, dst, dststride, src, srcstride, height, r3src, my, rfilter
+cglobal hevc_put_hevc_uni_epel_v%1_%2, 5, 7, 11, dst, dststride, src, srcstride, height, r3src, my
     movifnidn        myd, mym
-    lea           r3srcq, [srcstrideq*3]
     movdqa            m6, [pw_%2]
     sub             srcq, srcstrideq
-    EPEL_FILTER       %2, my, m4, m5
+    EPEL_FILTER       %2, my, m4, m5, r3src
+    lea           r3srcq, [srcstrideq*3]
 .loop
     EPEL_LOAD         %2, srcq, srcstride, %1
     EPEL_COMPUTE      %2, %1, m4, m5
@@ -808,12 +807,12 @@ cglobal hevc_put_hevc_uni_epel_v%1_%2, 5, 8, 11, dst, dststride, src, srcstride,
     RET
 
 
-cglobal hevc_put_hevc_bi_epel_v%1_%2, 6, 9, 11, dst, dststride, src, srcstride, src2, height, r3src, my, rfilter
+cglobal hevc_put_hevc_bi_epel_v%1_%2, 6, 8, 11, dst, dststride, src, srcstride, src2, height, r3src, my
     movifnidn        myd, mym
-    lea           r3srcq, [srcstrideq*3]
     movdqa            m6, [pw_bi_%2]
     sub             srcq, srcstrideq
-    EPEL_FILTER       %2, my, m4, m5
+    EPEL_FILTER       %2, my, m4, m5, r3src
+    lea           r3srcq, [srcstrideq*3]
 .loop
     EPEL_LOAD         %2, srcq, srcstride, %1
     EPEL_COMPUTE      %2, %1, m4, m5, 1
@@ -836,7 +835,7 @@ cglobal hevc_put_hevc_bi_epel_v%1_%2, 6, 9, 11, dst, dststride, src, srcstride,
 ; ******************************
 
 %macro HEVC_PUT_HEVC_EPEL_HV 2
-cglobal hevc_put_hevc_epel_hv%1_%2, 6, 8, 16 , dst, src, srcstride, height, mx, my, r3src, rfilter
+cglobal hevc_put_hevc_epel_hv%1_%2, 6, 7, 16 , dst, src, srcstride, height, mx, my, r3src
 %assign %%stride ((%2 + 7)/8)
     sub             srcq, srcstrideq
     EPEL_HV_FILTER    %2
@@ -902,7 +901,7 @@ cglobal hevc_put_hevc_epel_hv%1_%2, 6, 8, 16 , dst, src, srcstride, height, mx,
     LOOP_END         dst, src, srcstride
     RET
 
-cglobal hevc_put_hevc_uni_epel_hv%1_%2, 7, 9, 16 , dst, dststride, src, srcstride, height, mx, my, r3src, rfilter
+cglobal hevc_put_hevc_uni_epel_hv%1_%2, 7, 8, 16 , dst, dststride, src, srcstride, height, mx, my, r3src
 %assign %%stride ((%2 + 7)/8)
     sub             srcq, srcstrideq
     EPEL_HV_FILTER    %2
@@ -966,7 +965,7 @@ cglobal hevc_put_hevc_uni_epel_hv%1_%2, 7, 9, 16 , dst, dststride, src, srcstrid
     jnz               .loop                      ; height loop
     RET
 
-cglobal hevc_put_hevc_bi_epel_hv%1_%2, 8, 10, 16, dst, dststride, src, srcstride, src2, height, mx, my, r3src, rfilter
+cglobal hevc_put_hevc_bi_epel_hv%1_%2, 8, 9, 16, dst, dststride, src, srcstride, src2, height, mx, my, r3src
 %assign %%stride ((%2 + 7)/8)
     sub             srcq, srcstrideq
     EPEL_HV_FILTER    %2
-- 
1.9.2.msysgit.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel

Reply via email to