On 11/26/2017 7:51 PM, James Darnley wrote:
> When compared to the SSE4.2 version, runtime is reduced by 1 to 26%.
> The function itself is around 2 times faster.
> ---
>  libavcodec/x86/flac_dsp_gpl.asm | 56 +++++++++++++++++++++++++++++++----------
>  libavcodec/x86/flacdsp_init.c   |  5 +++-
>  2 files changed, 47 insertions(+), 14 deletions(-)
>
> diff --git a/libavcodec/x86/flac_dsp_gpl.asm b/libavcodec/x86/flac_dsp_gpl.asm
> index 91989ce560..749e66dec8 100644
> --- a/libavcodec/x86/flac_dsp_gpl.asm
> +++ b/libavcodec/x86/flac_dsp_gpl.asm
> @@ -22,11 +22,11 @@
>
>  %include "libavutil/x86/x86util.asm"
>
> -SECTION_RODATA
> +SECTION_RODATA 32
>
> -pd_0_int_min: times 2 dd 0, -2147483648
> -pq_int_min: times 2 dq -2147483648
> -pq_int_max: times 2 dq 2147483647
> +pd_0_int_min: times 4 dd 0, -2147483648
> +pq_int_min: times 4 dq -2147483648
> +pq_int_max: times 4 dq 2147483647
>
>  SECTION .text
>
> @@ -123,7 +123,10 @@ RET
>  %endmacro
>
>  %macro PMINSQ 3
> -    pcmpgtq %3, %2, %1
> +    mova %3, %2
> +    ; We cannot use the 3-operand format because the memory location cannot be
> +    ; the second operand, only the third.
> +    pcmpgtq %3, %1
>      pand %1, %3
>      pandn %3, %2
>      por %1, %3
> @@ -177,11 +180,11 @@ lea resq, [resq+orderq*4]
>  lea smpq, [smpq+orderq*4]
>  lea coefsq, [coefsq+orderq*4]
>  sub length, orderd
> -movd m3, r5m
> +movd xm3, r5m
>  neg orderq
>
>  movu m4, [pd_0_int_min] ; load 1 bit
> -psrad m4, m3 ; turn that into shift+1 bits
> +psrad m4, xm3 ; turn that into shift+1 bits
>  pslld m4, 1 ; reduce that
>  mova [rsp], m4 ; save sign extend mask
>
> @@ -197,8 +200,20 @@ mova [rsp], m4 ; save sign extend mask
>  xor negj, negj
>
>  .looporder1:
> +%if cpuflag(avx)
Either avx2, or check instead for mmsize == 32. cpuflag(avx) is also
set for the avx2 build, so this does work, but what the guard really
selects is the 256-bit integer path, so test for that directly.

> +    vbroadcastss m2, [coefsq+posj*4]

vpbroadcastd: the data is integer, so use the integer broadcast. Or
just use the VPBROADCASTD macro to cover both the avx2 and sse4 cases
without ifdeffery (see the sketch at the end of this mail).

> +%else
>      movd m2, [coefsq+posj*4] ; c = coefs[j]
>      SPLATD m2
> +%endif
> +%if cpuflag(avx)
> +    vpmuldq m1, m2, [smpq+negj*4-4]
> +    vpmuldq m5, m2, [smpq+negj*4-4+mmsize]
> +    vpmuldq m7, m2, [smpq+negj*4-4+mmsize*2]
> +    vpaddq m0, m1
> +    vpaddq m4, m5
> +    vpaddq m6, m7
> +%else
>      movu m1, [smpq+negj*4-4] ; s = smp[i-j-1]
>      movu m5, [smpq+negj*4-4+mmsize]
>      movu m7, [smpq+negj*4-4+mmsize*2]
> @@ -212,14 +227,15 @@ mova [rsp], m4 ; save sign extend mask
>      paddq m0, m1 ; p += c * s
>      paddq m4, m5
>      paddq m6, m7
> +%endif
>
>      dec negj
>      inc posj
>      jnz .looporder1
>
> -    HACK_PSRAQ m0, m3, [rsp], m2 ; p >>= shift
> -    HACK_PSRAQ m4, m3, [rsp], m2
> -    HACK_PSRAQ m6, m3, [rsp], m2
> +    HACK_PSRAQ m0, xm3, [rsp], m2 ; p >>= shift
> +    HACK_PSRAQ m4, xm3, [rsp], m2
> +    HACK_PSRAQ m6, xm3, [rsp], m2
>      CLIPQ m0, [pq_int_min], [pq_int_max], m2 ; clip(p >> shift)
>      CLIPQ m4, [pq_int_min], [pq_int_max], m2
>      CLIPQ m6, [pq_int_min], [pq_int_max], m2
> @@ -241,8 +257,20 @@ mova [rsp], m4 ; save sign extend mask
>      xor negj, negj
>
>  .looporder2:
> +%if cpuflag(avx)
> +    vbroadcastss m2, [coefsq+posj*4]

Same as above.

> +%else
>      movd m2, [coefsq+posj*4] ; c = coefs[j]
>      SPLATD m2
> +%endif
> +%if cpuflag(avx)
> +    vpmuldq m1, m2, [smpq+negj*4]
> +    vpmuldq m5, m2, [smpq+negj*4+mmsize]
> +    vpmuldq m7, m2, [smpq+negj*4+mmsize*2]
> +    vpaddq m0, m1
> +    vpaddq m4, m5
> +    vpaddq m6, m7
> +%else
>      movu m1, [smpq+negj*4] ; s = smp[i-j-1]
>      movu m5, [smpq+negj*4+mmsize]
>      movu m7, [smpq+negj*4+mmsize*2]
> @@ -252,14 +280,15 @@ mova [rsp], m4 ; save sign extend mask
>      paddq m0, m1 ; p += c * s
>      paddq m4, m5
>      paddq m6, m7
> +%endif
>
>      dec negj
>      inc posj
>      jnz .looporder2
>
> -    HACK_PSRAQ m0, m3, [rsp], m2 ; p >>= shift
> -    HACK_PSRAQ m4, m3, [rsp], m2
> -    HACK_PSRAQ m6, m3, [rsp], m2
> +    HACK_PSRAQ m0, xm3, [rsp], m2 ; p >>= shift
> +    HACK_PSRAQ m4, xm3, [rsp], m2
> +    HACK_PSRAQ m6, xm3, [rsp], m2
>      CLIPQ m0, [pq_int_min], [pq_int_max], m2 ; clip(p >> shift)
>      CLIPQ m4, [pq_int_min], [pq_int_max], m2
>      CLIPQ m6, [pq_int_min], [pq_int_max], m2
> @@ -300,3 +329,4 @@ FUNCTION_BODY_32
>
>  INIT_YMM avx2
>  FUNCTION_BODY_16
> +FUNCTION_BODY_32
> diff --git a/libavcodec/x86/flacdsp_init.c b/libavcodec/x86/flacdsp_init.c
> index f827186c26..fbe70894a0 100644
> --- a/libavcodec/x86/flacdsp_init.c
> +++ b/libavcodec/x86/flacdsp_init.c
> @@ -30,6 +30,7 @@ void ff_flac_lpc_32_xop(int32_t *samples, const int coeffs[32], int order,
>  void ff_flac_enc_lpc_16_sse4(int32_t *, const int32_t *, int, int, const int32_t *,int);
>  void ff_flac_enc_lpc_16_avx2(int32_t *, const int32_t *, int, int, const int32_t *,int);
>  void ff_flac_enc_lpc_32_sse42(int32_t *, const int32_t *, int, int, const int32_t *,int);
> +void ff_flac_enc_lpc_32_avx2(int32_t *, const int32_t *, int, int, const int32_t *,int);
>
>  #define DECORRELATE_FUNCS(fmt, opt) \
>  void ff_flac_decorrelate_ls_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \
> @@ -117,8 +118,10 @@ av_cold void ff_flacdsp_init_x86(FLACDSPContext *c, enum AVSampleFormat fmt, int
>          c->lpc32_encode = ff_flac_enc_lpc_32_sse42;
>      }
>      if (EXTERNAL_AVX2(cpu_flags)) {
> -        if (CONFIG_GPL)
> +        if (CONFIG_GPL) {
>              c->lpc16_encode = ff_flac_enc_lpc_16_avx2;
> +            c->lpc32_encode = ff_flac_enc_lpc_32_avx2;
> +        }
>      }
>  #endif
>  #endif /* HAVE_X86ASM */
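To flesh out the VPBROADCASTD suggestion: with the macro, the whole
broadcast+multiply ifdeffery in .looporder1 should collapse into a
single path, roughly like the untested sketch below. I kept the
explicit movu loads instead of folding the memory operands into the
multiplies, because the legacy SSE encoding of pmuldq would require
aligned memory; x86inc rewrites these into the vex 3-operand forms
under avx2 anyway.

.looporder1:
    VPBROADCASTD m2, [coefsq+posj*4]   ; c = coefs[j], splat into every lane
    movu m1, [smpq+negj*4-4]           ; s = smp[i-j-1]
    movu m5, [smpq+negj*4-4+mmsize]
    movu m7, [smpq+negj*4-4+mmsize*2]
    pmuldq m1, m2                      ; c * s
    pmuldq m5, m2
    pmuldq m7, m2
    paddq m0, m1                       ; p += c * s
    paddq m4, m5
    paddq m6, m7

    dec negj
    inc posj
    jnz .looporder1

The same shape works for .looporder2 with the -4 dropped from the
offsets.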
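While we are in this file: the PMINSQ workaround the patch grows is
dense enough that per-step comments would not hurt. The same sequence,
behaviour unchanged, comments only:

%macro PMINSQ 3 ; %1 = a (dst), %2 = b (register or aligned memory), %3 = scratch
    mova %3, %2    ; scratch = b; pcmpgtq cannot take memory as its
                   ; second operand, hence the copy
    pcmpgtq %3, %1 ; per qword: scratch = (b > a) ? all-ones : 0
    pand %1, %3    ; keep a in the lanes where b > a
    pandn %3, %2   ; keep b in the lanes where b <= a
    por %1, %3     ; %1 = signed min(a, b)
%endmacro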