Hi,

2015-01-25 2:05 GMT+01:00 James Almer <jamr...@gmail.com>:
> 2 to 2.5 times faster.
>
> Signed-off-by: James Almer <jamr...@gmail.com>
> ---
>  libavcodec/x86/sbrdsp.asm | 114 +++++++++++++++++++++++++++++++++++++++++++
This is not the first time I have noticed it, but memory moves are often
suboptimal when done with the older SSE instructions: while movlhps is fine,
movlps isn't, at least on my old Core i5. You may want to validate this with
the attached patch, where additionally storing ps_mask3 in m8 is a gain on
Win64 (the gain does not match the number of loops, but it is still there).

Benchmarks:
x64:  6023 decicycles in g, 262108 runs, 36 skips
SSE:  3049 decicycles in g, 262130 runs, 14 skips
SSE3: 2843 decicycles in g, 262086 runs, 58 skips
movq: 2693 decicycles in g, 262117 runs, 27 skips
m8:   2648 decicycles in g, 262083 runs, 61 skips

Thanks for doing it; I only had 3-year-old scraps left and no further
motivation to tackle the start/tail parts.

-- 
Christophe
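[Editorial note, not part of the original mail or patch: a minimal C intrinsics
sketch of the movlps-vs-movq point discussed above. _mm_loadl_pi compiles to
movlps, which only replaces the low 64 bits of the destination and therefore
carries a dependency on the register's previous contents, while _mm_loadl_epi64
compiles to movq, which zeroes the upper half and starts a fresh dependency
chain. The function names load_merge/load_zero are purely illustrative.]

#include <xmmintrin.h>  /* SSE:  _mm_loadl_pi   -> movlps */
#include <emmintrin.h>  /* SSE2: _mm_loadl_epi64 -> movq  */

/* movlps: merges 8 bytes into the low half of 'prev', so the result
 * depends on the previous value of the destination register. */
__m128 load_merge(__m128 prev, const float *p)
{
    return _mm_loadl_pi(prev, (const __m64 *)p);
}

/* movq: loads 8 bytes and zeroes the upper half, so the load does not
 * depend on any previous value of the destination register. */
__m128 load_zero(const float *p)
{
    return _mm_castsi128_ps(_mm_loadl_epi64((const __m128i *)p));
}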
From 49e41dd86eb65a774f3561420dd5e9de83f328f2 Mon Sep 17 00:00:00 2001
From: Christophe Gisquet <christophe.gisq...@gmail.com>
Date: Sun, 25 Jan 2015 13:52:16 +0100
Subject: [PATCH 2/2] Use different mem moves.

---
 libavcodec/x86/sbrdsp.asm | 28 ++++++++++++++++++----------
 1 file changed, 18 insertions(+), 10 deletions(-)

diff --git a/libavcodec/x86/sbrdsp.asm b/libavcodec/x86/sbrdsp.asm
index c9f2d88..955d6cc 100644
--- a/libavcodec/x86/sbrdsp.asm
+++ b/libavcodec/x86/sbrdsp.asm
@@ -448,19 +448,27 @@ cglobal sbr_qmf_deint_neg, 2,4,4,v,src,vrev,c
     REP_RET
 
 %macro SBR_AUTOCORRELATE 0
-cglobal sbr_autocorrelate, 2,3,8,32, x, phi, cnt
+cglobal sbr_autocorrelate, 2,3,8+ARCH_X86_64,32, x, phi, cnt
     mov   cntq, 37*8
     add     xq, cntq
     neg   cntq
 
+%if ARCH_X86_64
+    mova      m8, [ps_mask3]
+%define MASK m8
+%else
+%define MASK [ps_mask3]
+%endif
 %if cpuflag(sse3)
+%define MOVH movq
     movddup   m5, [xq+cntq]
 %else
+%define MOVH movlps
     movlps    m5, [xq+cntq]
     movlhps   m5, m5
 %endif
-    movlps    m7, [xq+cntq+8 ]
-    movlps    m1, [xq+cntq+16]
+    MOVH      m7, [xq+cntq+8 ]
+    MOVH      m1, [xq+cntq+16]
     shufps    m7, m7, q0110
     shufps    m1, m1, q0110
     mulps     m3, m5, m7 ; x[0][0] * x[1][0], x[0][1] * x[1][1], x[0][0] * x[1][1], x[0][1] * x[1][0]
@@ -470,7 +478,7 @@ cglobal sbr_autocorrelate, 2,3,8,32, x, phi, cnt
     movaps [rsp+16], m4
 
     add   cntq, 8
-    movlps    m2, [xq+cntq+16]
+    MOVH      m2, [xq+cntq+16]
     movlhps   m7, m7
     shufps    m2, m2, q0110
     mulps     m6, m7, m1 ; real_sum1 = x[1][0] * x[2][0], x[1][1] * x[2][1], x[1][0] * x[2][1], x[1][1] * x[2][0]
@@ -481,7 +489,7 @@ cglobal sbr_autocorrelate, 2,3,8,32, x, phi, cnt
 align 16
 .loop:
     add   cntq, 8
-    movlps    m0, [xq+cntq+16]
+    MOVH      m0, [xq+cntq+16]
     movlhps   m1, m1
     shufps    m0, m0, q0110
     mulps     m3, m1, m2
@@ -491,7 +499,7 @@ align 16
     addps     m5, m4 ; real_sum2 += x[i][0] * x[i + 2][0], x[i][1] * x[i + 2][1], x[i][0] * x[i + 2][1], x[i][1] * x[i + 2][0];
     addps     m7, m1 ; real_sum0 += x[i][0] * x[i][0], x[i][1] * x[i][1];
     add   cntq, 8
-    movlps    m1, [xq+cntq+16]
+    MOVH      m1, [xq+cntq+16]
     movlhps   m2, m2
     shufps    m1, m1, q0110
     mulps     m3, m2, m0
@@ -501,7 +509,7 @@ align 16
     addps     m5, m4 ; real_sum2 += x[i][0] * x[i + 2][0], x[i][1] * x[i + 2][1], x[i][0] * x[i + 2][1], x[i][1] * x[i + 2][0];
     addps     m7, m2 ; real_sum0 += x[i][0] * x[i][0], x[i][1] * x[i][1];
     add   cntq, 8
-    movlps    m2, [xq+cntq+16]
+    MOVH      m2, [xq+cntq+16]
     movlhps   m0, m0
     shufps    m2, m2, q0110
     mulps     m3, m0, m1
@@ -520,9 +528,9 @@ align 16
     addps     m6, [rsp   ] ; real_sum1 + x[ 0][0] * x[ 1][0] + x[ 0][1] * x[ 1][1]; imag_sum1 + x[ 0][0] * x[ 1][1] - x[ 0][1] * x[ 1][0];
     addps     m7, [rsp+16] ; real_sum0 + x[ 0][0] * x[ 0][0] + x[ 0][1] * x[ 0][1];
-    xorps     m4, [ps_mask3]
-    xorps     m5, [ps_mask3]
-    xorps     m6, [ps_mask3]
+    xorps     m4, MASK
+    xorps     m5, MASK
+    xorps     m6, MASK
 %if cpuflag(sse3)
     movshdup  m2, m1
     haddps    m4, m5
-- 
1.9.2.msysgit.0