From 1.3 to 2.5 times faster. Runtime is reduced by 4 to 58%. As with the 16-bit version, the speed-up generally increases with compression_level.
Also like the 16-bit version, it is not used with levels less than 3.
After the bug fix made long, long ago in e609cfd697, this 32-bit lpc encoder
is heavily used with 16-bit samples.
---
 libavcodec/x86/flac_dsp_gpl.asm | 106 ++++++++++++++++++++++++++++++++++++++++
 libavcodec/x86/flacdsp_init.c   |   5 ++
 2 files changed, 111 insertions(+)

diff --git a/libavcodec/x86/flac_dsp_gpl.asm b/libavcodec/x86/flac_dsp_gpl.asm
index c461c666be..618306eb5f 100644
--- a/libavcodec/x86/flac_dsp_gpl.asm
+++ b/libavcodec/x86/flac_dsp_gpl.asm
@@ -22,6 +22,12 @@
 
 %include "libavutil/x86/x86util.asm"
 
+SECTION_RODATA
+
+pd_0_int_min: times 2 dd  0, -2147483648 ; per qword: low dword 0, high dword INT32_MIN (only bit 63 set)
+pq_int_min:   times 2 dq -2147483648     ; INT32_MIN as a signed qword, lower clip bound
+pq_int_max:   times 2 dq  2147483647     ; INT32_MAX as a signed qword, upper clip bound
+
 SECTION .text
 
 %macro FUNCTION_BODY_16 0
@@ -116,8 +122,108 @@
 RET
 %endmacro
 
+%macro PMINSQ 3 ; dst, src, tmp: dst = min(dst, src) on signed qwords; clobbers tmp
+    pcmpgtq %3, %2, %1 ; tmp = (src > dst) ? all ones : 0
+    pand    %1, %3     ; keep dst where it is the smaller value
+    pandn   %3, %2     ; take src everywhere else
+    por     %1, %3
+%endmacro
+
+%macro PMAXSQ 3 ; dst, src, tmp: dst = max(dst, src) on signed qwords; clobbers tmp
+    pcmpgtq %3, %1, %2 ; tmp = (dst > src) ? all ones : 0
+    pand    %1, %3     ; keep dst where it is the larger value
+    pandn   %3, %2     ; take src everywhere else
+    por     %1, %3
+%endmacro
+
+%macro CLIPQ 4 ; reg, min, max, tmp
+    PMAXSQ %1, %2, %4 ; reg = max(reg, min)
+    PMINSQ %1, %3, %4 ; reg = min(reg, max)
+%endmacro
+
+%macro HACK_PSRAQ 4 ; dst, src (shift), sign extend mask, tmp
+    pxor    %4, %4 ; zero
+    pcmpgtq %4, %1 ; mask where 0 > dst
+    pand    %4, %3 ; mask & sign extend mask
+    psrlq   %1, %2 ; dst >>= shift
+    por     %1, %4 ; dst | mask
+%endmacro
+
+%macro FUNCTION_BODY_32 0
+
+%if ARCH_X86_64
+    cglobal flac_enc_lpc_32, 5, 7, 5, mmsize, res, smp, len, order, coefs ; 5 XMM registers: m0-m4 are used
+    DECLARE_REG_TMP 5, 6
+    %define length r2d
+
+    movsxd orderq, orderd
+%else
+    cglobal flac_enc_lpc_32, 5, 6, 5, mmsize, res, smp, len, order, coefs ; 5 XMM registers: m0-m4 are used
+    DECLARE_REG_TMP 2, 5
+    %define length r2mp
+%endif
+
+; Here we assume that the maximum order value is 32.  This means that we only
+; need to copy a maximum of 32 samples.  Therefore we let the preprocessor
+; unroll this loop and copy all 32.
+%assign iter 0
+%rep 32/(mmsize/4)
+    movu  m0,         [smpq+iter]
+    movu [resq+iter], m0
+    %assign iter iter+mmsize
+%endrep
+
+lea  resq,   [resq+orderq*4]   ; res   += order
+lea  smpq,   [smpq+orderq*4]   ; smp   += order
+lea  coefsq, [coefsq+orderq*4] ; coefs += order, indexed below with a negative offset
+sub  length, orderd            ; the loop below starts at sample index 'order'
+movd m3,     r5m               ; m3 = sixth argument, used as the shift count
+neg  orderq                    ; -order, start value of the upward-counting index
+
+movu  m4,   [pd_0_int_min] ; load 1 bit
+psrad m4,    m3            ; turn that into shift+1 bits
+pslld m4,    1             ; reduce that
+mova [rsp],  m4            ; save sign extend mask
+
+%define posj t0q
+%define negj t1q
+
+.looplen:                ; each iteration produces two residuals
+    pxor m0,   m0        ; p = 0 in both qword lanes
+    mov  posj, orderq
+    xor  negj, negj
+
+    .looporder:
+        movd     m2, [coefsq+posj*4] ; c = coefs[j]
+        SPLATD   m2
+        pmovzxdq m1, [smpq+negj*4-4] ; s = smp[i-j-1]
+        pmuldq   m1, m2              ; signed 32x32->64 multiply of the low dword of each qword
+        paddq    m0, m1              ; p += c * s
+
+        dec negj
+        inc posj                     ; reaches zero after 'order' iterations
+    jnz .looporder
+
+    HACK_PSRAQ m0, m3, [rsp], m2 ; p >>= shift
+    CLIPQ m0, [pq_int_min], [pq_int_max], m2 ; clip(p >> shift)
+    pshufd m0, m0, q0020 ; pack into first 2 dwords
+    movh   m1, [smpq]
+    psubd  m1, m0 ; smp[i] - p
+    movh [resq], m1 ; res[i] = smp[i] - (p >> shift)
+
+    add resq,   mmsize/2 ; advance by the bytes just stored
+    add smpq,   mmsize/2
+    sub length, mmsize/8 ; samples handled per iteration
+jg .looplen
+RET
+
+%endmacro ; FUNCTION_BODY_32
+
 INIT_XMM sse4
 FUNCTION_BODY_16
 
+INIT_XMM sse42
+FUNCTION_BODY_32
+
 INIT_YMM avx2
 FUNCTION_BODY_16
diff --git a/libavcodec/x86/flacdsp_init.c b/libavcodec/x86/flacdsp_init.c
index 0a5c01859f..f827186c26 100644
--- a/libavcodec/x86/flacdsp_init.c
+++ b/libavcodec/x86/flacdsp_init.c
@@ -29,6 +29,7 @@ void ff_flac_lpc_32_xop(int32_t *samples, const int coeffs[32], int order,
 
 void ff_flac_enc_lpc_16_sse4(int32_t *, const int32_t *, int, int, const int32_t *,int);
 void ff_flac_enc_lpc_16_avx2(int32_t *, const int32_t *, int, int, const int32_t *,int);
+void ff_flac_enc_lpc_32_sse42(int32_t *, const int32_t *, int, int, const int32_t *,int);
 
 #define DECORRELATE_FUNCS(fmt, opt)                                                      \
 void ff_flac_decorrelate_ls_##fmt##_##opt(uint8_t **out, int32_t **in, int channels,     \
@@ -111,6 +112,10 @@ av_cold void ff_flacdsp_init_x86(FLACDSPContext *c, enum AVSampleFormat fmt, int
             if (CONFIG_GPL)
                 c->lpc16_encode = ff_flac_enc_lpc_16_sse4;
         }
+        if (EXTERNAL_SSE42(cpu_flags)) {
+            if (CONFIG_GPL)
+                c->lpc32_encode = ff_flac_enc_lpc_32_sse42;
+        }
         if (EXTERNAL_AVX2(cpu_flags)) {
             if (CONFIG_GPL)
                 c->lpc16_encode = ff_flac_enc_lpc_16_avx2;
-- 
2.15.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel