On Fri, Jul 24, 2015 at 11:00:55PM -0300, James Almer wrote: > Between 1.5 and 2.5 times faster > > Signed-off-by: James Almer <jamr...@gmail.com> > --- > There's a couple missing, like ps_stereo_interpolate_ipdopd which i wanted to > write > but couldn't test because it was not used by any of the samples i tried. > > libavcodec/aacps.c | 4 +- > libavcodec/aacpsdsp.h | 1 + > libavcodec/aacpsdsp_template.c | 2 + > libavcodec/x86/Makefile | 6 +- > libavcodec/x86/aacpsdsp.asm | 212 > +++++++++++++++++++++++++++++++++++++++++ > libavcodec/x86/aacpsdsp_init.c | 55 +++++++++++ > 6 files changed, 276 insertions(+), 4 deletions(-) > create mode 100644 libavcodec/x86/aacpsdsp.asm > create mode 100644 libavcodec/x86/aacpsdsp_init.c > > diff --git a/libavcodec/aacps.c b/libavcodec/aacps.c > index bf60475..eec6e30 100644 > --- a/libavcodec/aacps.c > +++ b/libavcodec/aacps.c > @@ -936,8 +936,8 @@ static void stereo_processing(PSContext *ps, INTFLOAT > (*l)[32][2], INTFLOAT (*r) > H22[0][e+1][b] = h22; > } > for (k = 0; k < NR_BANDS[is34]; k++) { > - INTFLOAT h[2][4]; > - INTFLOAT h_step[2][4]; > + LOCAL_ALIGNED_16(INTFLOAT, h, [2], [4]); > + LOCAL_ALIGNED_16(INTFLOAT, h_step, [2], [4]); > int start = ps->border_position[e]; > int stop = ps->border_position[e+1]; > INTFLOAT width = Q30(1.f) / (stop - start); > diff --git a/libavcodec/aacpsdsp.h b/libavcodec/aacpsdsp.h > index 9e3c5aa..c194bbe 100644 > --- a/libavcodec/aacpsdsp.h > +++ b/libavcodec/aacpsdsp.h > @@ -52,5 +52,6 @@ typedef struct PSDSPContext { > void AAC_RENAME(ff_psdsp_init)(PSDSPContext *s); > void ff_psdsp_init_arm(PSDSPContext *s); > void ff_psdsp_init_mips(PSDSPContext *s); > +void ff_psdsp_init_x86(PSDSPContext *s); > > #endif /* LIBAVCODEC_AACPSDSP_H */ > diff --git a/libavcodec/aacpsdsp_template.c b/libavcodec/aacpsdsp_template.c > index bfec828..3049ce8 100644 > --- a/libavcodec/aacpsdsp_template.c > +++ b/libavcodec/aacpsdsp_template.c > @@ -224,5 +224,7 @@ av_cold void 
AAC_RENAME(ff_psdsp_init)(PSDSPContext *s) > ff_psdsp_init_arm(s); > if (ARCH_MIPS) > ff_psdsp_init_mips(s); > + if (ARCH_X86) > + ff_psdsp_init_x86(s); > #endif /* !USE_FIXED */ > } > diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile > index a515ebd..c403770 100644 > --- a/libavcodec/x86/Makefile > +++ b/libavcodec/x86/Makefile > @@ -38,7 +38,8 @@ OBJS-$(CONFIG_VP8DSP) += x86/vp8dsp_init.o > OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o > > # decoders/encoders > -OBJS-$(CONFIG_AAC_DECODER) += x86/sbrdsp_init.o > +OBJS-$(CONFIG_AAC_DECODER) += x86/aacpsdsp_init.o \ > + x86/sbrdsp_init.o > OBJS-$(CONFIG_ADPCM_G722_DECODER) += x86/g722dsp_init.o > OBJS-$(CONFIG_ADPCM_G722_ENCODER) += x86/g722dsp_init.o > OBJS-$(CONFIG_APNG_DECODER) += x86/pngdsp_init.o > @@ -130,7 +131,8 @@ YASM-OBJS-$(CONFIG_VP8DSP) += x86/vp8dsp.o > \ > x86/vp8dsp_loopfilter.o > > # decoders/encoders > -YASM-OBJS-$(CONFIG_AAC_DECODER) += x86/sbrdsp.o > +YASM-OBJS-$(CONFIG_AAC_DECODER) += x86/aacpsdsp.o \ > + x86/sbrdsp.o > YASM-OBJS-$(CONFIG_ADPCM_G722_DECODER) += x86/g722dsp.o > YASM-OBJS-$(CONFIG_ADPCM_G722_ENCODER) += x86/g722dsp.o > YASM-OBJS-$(CONFIG_APNG_DECODER) += x86/pngdsp.o > diff --git a/libavcodec/x86/aacpsdsp.asm b/libavcodec/x86/aacpsdsp.asm > new file mode 100644 > index 0000000..d416944 > --- /dev/null > +++ b/libavcodec/x86/aacpsdsp.asm > @@ -0,0 +1,212 @@ > +;****************************************************************************** > +;* SIMD optimized MPEG-4 Parametric Stereo decoding functions > +;* > +;* Copyright (C) 2015 James Almer > +;* > +;* This file is part of FFmpeg. > +;* > +;* FFmpeg is free software; you can redistribute it and/or > +;* modify it under the terms of the GNU Lesser General Public > +;* License as published by the Free Software Foundation; either > +;* version 2.1 of the License, or (at your option) any later version. 
> +;* > +;* FFmpeg is distributed in the hope that it will be useful, > +;* but WITHOUT ANY WARRANTY; without even the implied warranty of > +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > +;* Lesser General Public License for more details. > +;* > +;* You should have received a copy of the GNU Lesser General Public > +;* License along with FFmpeg; if not, write to the Free Software > +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 > USA > +;****************************************************************************** > + > +%include "libavutil/x86/x86util.asm" > + > +SECTION_RODATA > + > +ps_p1m1p1m1: dd 0, 0x80000000, 0, 0x80000000 > + > +SECTION_TEXT > + > +;************************************************************************* > +;void ff_ps_add_squares_<opt>(float *dst, const float (*src)[2], int n); > +;************************************************************************* > +%macro PS_ADD_SQUARES 1 > +cglobal ps_add_squares, 3, 3, %1, dst, src, n > +.loop: > + movaps m0, [srcq] > + movaps m1, [srcq+mmsize] > + mulps m0, m0 > + mulps m1, m1 > +%if cpuflag(sse3) > + haddps m0, m1 > +%else > + movaps m3, m0 > + movaps m4, m1 > + shufps m3, m3, q0301 > + shufps m4, m4, q0301 > + addps m0, m3 > + addps m1, m4 > + shufps m0, m1, q2020 > +%endif > + addps m0, [dstq] > + movaps [dstq], m0 > + add dstq, mmsize > + add srcq, mmsize*2 > + sub nd, mmsize/4 > + jg .loop > + REP_RET > +%endmacro > + > +INIT_XMM sse > +PS_ADD_SQUARES 3 > +INIT_XMM sse3 > +PS_ADD_SQUARES 5 > + > +;******************************************************************* > +;void ff_ps_mul_pair_single_sse(float (*dst)[2], float (*src0)[2], > +; float *src1, int n); > +;******************************************************************* > +INIT_XMM sse > +cglobal ps_mul_pair_single, 4, 5, 4, dst, src1, src2, n > + xor r4q, r4q > + > +.loop: > + movu m0, [src1q+r4q] > + movu m1, [src1q+r4q+mmsize] > + mova m2, [src2q] > + mova m3, m2 > + 
unpcklps m2, m2 > + unpckhps m3, m3 > + mulps m0, m2 > + mulps m1, m3 > + mova [dstq+r4q], m0 > + mova [dstq+r4q+mmsize], m1 > + add src2q, mmsize > + add r4q, mmsize*2 > + sub nd, mmsize/4 > + jg .loop > + REP_RET > +
> +;*********************************************************************** > +;void ff_ps_stereo_interpolate_sse3(float (*l)[2], float (*r)[2], > +; float h[2][4], float h_step[2][4], > +; int len); > +;*********************************************************************** > +INIT_XMM sse3 > +cglobal ps_stereo_interpolate, 5, 5, 6, l, r, h, h_step, n > + movaps m0, [hq] > + movaps m1, [h_stepq] > + shl nd, 3 > + add lq, nq > + add rq, nq > + neg nq > + > +align 16 > +.loop: This assumes n >= 0; I don't think the calling code guarantees this. Either the calling code should be changed or this should be checked for. [...] -- Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB In a rich man's house there is no place to spit but his face. -- Diogenes of Sinope
signature.asc
Description: Digital signature
_______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel