Hi,

2015-01-25 2:05 GMT+01:00 James Almer <jamr...@gmail.com>:
> 2 to 2.5 times faster.
>
> Signed-off-by: James Almer <jamr...@gmail.com>
> ---
>  libavcodec/x86/sbrdsp.asm | 114 +++++++++++++++++++++++++++++++++++++++++++
This is not the first time I have noticed it, but memory moves are often
suboptimal when done with the older SSE instructions: while movlhps is fine,
movlps isn't, at least on my old Core i5. You may want to validate this with
the attached patch, where additionally storing ps_mask3 in m8 is a gain on
Win64 (the gain does not match the number of loops, but it is still there).

Benchmarks:
x64:  6023 decicycles in g, 262108 runs, 36 skips
SSE:  3049 decicycles in g, 262130 runs, 14 skips
SSE3: 2843 decicycles in g, 262086 runs, 58 skips
movq: 2693 decicycles in g, 262117 runs, 27 skips
m8:   2648 decicycles in g, 262083 runs, 61 skips

Thanks for doing it; I only had 3-year-old scraps left and no further
motivation to tackle the start/tail parts.

-- 
Christophe
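[Editorial note, not part of the original mail or patch: a minimal C intrinsics
sketch of the movlps-vs-movq point discussed above. _mm_loadl_pi compiles to
movlps, which only replaces the low 64 bits of the destination and therefore
carries a dependency on the register's previous contents, while _mm_loadl_epi64
compiles to movq, which zeroes the upper half and starts a fresh dependency
chain. The function names load_merge/load_zero are purely illustrative.]

#include <xmmintrin.h>  /* SSE:  _mm_loadl_pi   -> movlps */
#include <emmintrin.h>  /* SSE2: _mm_loadl_epi64 -> movq  */

/* movlps: merges 8 bytes into the low half of 'prev', so the result
 * depends on the previous value of the destination register. */
__m128 load_merge(__m128 prev, const float *p)
{
    return _mm_loadl_pi(prev, (const __m64 *)p);
}

/* movq: loads 8 bytes and zeroes the upper half, so the load does not
 * depend on any previous value of the destination register. */
__m128 load_zero(const float *p)
{
    return _mm_castsi128_ps(_mm_loadl_epi64((const __m128i *)p));
}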
From 49e41dd86eb65a774f3561420dd5e9de83f328f2 Mon Sep 17 00:00:00 2001
From: Christophe Gisquet <christophe.gisq...@gmail.com>
Date: Sun, 25 Jan 2015 13:52:16 +0100
Subject: [PATCH 2/2] Use different mem moves.

---
 libavcodec/x86/sbrdsp.asm | 28 ++++++++++++++++++----------
 1 file changed, 18 insertions(+), 10 deletions(-)

diff --git a/libavcodec/x86/sbrdsp.asm b/libavcodec/x86/sbrdsp.asm
index c9f2d88..955d6cc 100644
--- a/libavcodec/x86/sbrdsp.asm
+++ b/libavcodec/x86/sbrdsp.asm
@@ -448,19 +448,27 @@ cglobal sbr_qmf_deint_neg, 2,4,4,v,src,vrev,c
     REP_RET
 
 %macro SBR_AUTOCORRELATE 0
-cglobal sbr_autocorrelate, 2,3,8,32, x, phi, cnt
+cglobal sbr_autocorrelate, 2,3,8+ARCH_X86_64,32, x, phi, cnt
     mov   cntq, 37*8
     add     xq, cntq
     neg   cntq
 
+%if ARCH_X86_64
+    mova      m8, [ps_mask3]
+%define MASK m8
+%else
+%define MASK [ps_mask3]
+%endif
 %if cpuflag(sse3)
+%define MOVH movq
     movddup   m5, [xq+cntq]
 %else
+%define MOVH movlps
     movlps    m5, [xq+cntq]
     movlhps   m5, m5
 %endif
-    movlps    m7, [xq+cntq+8 ]
-    movlps    m1, [xq+cntq+16]
+    MOVH      m7, [xq+cntq+8 ]
+    MOVH      m1, [xq+cntq+16]
     shufps    m7, m7, q0110
     shufps    m1, m1, q0110
     mulps     m3, m5, m7 ; x[0][0] * x[1][0], x[0][1] * x[1][1], x[0][0] * x[1][1], x[0][1] * x[1][0]
@@ -470,7 +478,7 @@ cglobal sbr_autocorrelate, 2,3,8,32, x, phi, cnt
     movaps [rsp+16], m4
 
     add   cntq, 8
-    movlps    m2, [xq+cntq+16]
+    MOVH      m2, [xq+cntq+16]
     movlhps   m7, m7
     shufps    m2, m2, q0110
     mulps     m6, m7, m1 ; real_sum1 = x[1][0] * x[2][0], x[1][1] * x[2][1], x[1][0] * x[2][1], x[1][1] * x[2][0]
@@ -481,7 +489,7 @@ cglobal sbr_autocorrelate, 2,3,8,32, x, phi, cnt
 align 16
 .loop:
     add   cntq, 8
-    movlps    m0, [xq+cntq+16]
+    MOVH      m0, [xq+cntq+16]
     movlhps   m1, m1
     shufps    m0, m0, q0110
     mulps     m3, m1, m2
@@ -491,7 +499,7 @@ align 16
     addps     m5, m4 ; real_sum2 += x[i][0] * x[i + 2][0], x[i][1] * x[i + 2][1], x[i][0] * x[i + 2][1], x[i][1] * x[i + 2][0];
     addps     m7, m1 ; real_sum0 += x[i][0] * x[i][0], x[i][1] * x[i][1];
     add   cntq, 8
-    movlps    m1, [xq+cntq+16]
+    MOVH      m1, [xq+cntq+16]
     movlhps   m2, m2
     shufps    m1, m1, q0110
     mulps     m3, m2, m0
@@ -501,7 +509,7 @@ align 16
     addps     m5, m4 ; real_sum2 += x[i][0] * x[i + 2][0], x[i][1] * x[i + 2][1], x[i][0] * x[i + 2][1], x[i][1] * x[i + 2][0];
     addps     m7, m2 ; real_sum0 += x[i][0] * x[i][0], x[i][1] * x[i][1];
     add   cntq, 8
-    movlps    m2, [xq+cntq+16]
+    MOVH      m2, [xq+cntq+16]
     movlhps   m0, m0
     shufps    m2, m2, q0110
     mulps     m3, m0, m1
@@ -520,9 +528,9 @@ align 16
     addps     m6, [rsp   ] ; real_sum1 + x[ 0][0] * x[ 1][0] + x[ 0][1] * x[ 1][1]; imag_sum1 + x[ 0][0] * x[ 1][1] - x[ 0][1] * x[ 1][0];
     addps     m7, [rsp+16] ; real_sum0 + x[ 0][0] * x[ 0][0] + x[ 0][1] * x[ 0][1];
-    xorps     m4, [ps_mask3]
-    xorps     m5, [ps_mask3]
-    xorps     m6, [ps_mask3]
+    xorps     m4, MASK
+    xorps     m5, MASK
+    xorps     m6, MASK
 %if cpuflag(sse3)
     movshdup  m2, m1
     haddps    m4, m5
-- 
1.9.2.msysgit.0