On 26 November 2017 at 22:51, James Darnley <james.darn...@gmail.com> wrote:
> Around 1.1 times faster and reduces runtime by up to 6%.
> ---
>  libavcodec/x86/flac_dsp_gpl.asm | 91 ++++++++++++++++++++++++++++++++---------
>  1 file changed, 72 insertions(+), 19 deletions(-)
>
> diff --git a/libavcodec/x86/flac_dsp_gpl.asm b/libavcodec/x86/flac_dsp_gpl.asm
> index 952fc8b86b..91989ce560 100644
> --- a/libavcodec/x86/flac_dsp_gpl.asm
> +++ b/libavcodec/x86/flac_dsp_gpl.asm
> @@ -152,13 +152,13 @@ RET
>  %macro FUNCTION_BODY_32 0
>
>  %if ARCH_X86_64
> -    cglobal flac_enc_lpc_32, 5, 7, 8, mmsize, res, smp, len, order, coefs
> +    cglobal flac_enc_lpc_32, 5, 7, 8, mmsize*4, res, smp, len, order, coefs

Why x4, shouldn't this be x2?

>      DECLARE_REG_TMP 5, 6
>      %define length r2d
>
>      movsxd orderq, orderd
>  %else
> -    cglobal flac_enc_lpc_32, 5, 6, 8, mmsize, res, smp, len, order, coefs
> +    cglobal flac_enc_lpc_32, 5, 6, 8, mmsize*4, res, smp, len, order, coefs
>      DECLARE_REG_TMP 2, 5
>      %define length r2mp
>  %endif
> @@ -189,18 +189,23 @@ mova [rsp], m4 ; save sign extend mask
>  %define negj t1q
>
>  .looplen:
> +    ; process "odd" samples
>      pxor m0, m0
>      pxor m4, m4
>      pxor m6, m6
>      mov posj, orderq
>      xor negj, negj
>
> -    .looporder:
> +    .looporder1:
>          movd m2, [coefsq+posj*4] ; c = coefs[j]
>          SPLATD m2
> -        pmovzxdq m1, [smpq+negj*4-4] ; s = smp[i-j-1]
> -        pmovzxdq m5, [smpq+negj*4-4+mmsize/2]
> -        pmovzxdq m7, [smpq+negj*4-4+mmsize]
> +        movu m1, [smpq+negj*4-4] ; s = smp[i-j-1]
> +        movu m5, [smpq+negj*4-4+mmsize]
> +        movu m7, [smpq+negj*4-4+mmsize*2]
> +        ; Rather than explicitly unpack adjacent samples into qwords we can let
> +        ; the pmuldq instruction unpack the 0th and 2nd samples for us when it
> +        ; does its multiply. This saves an unpack for every sample in the inner
> +        ; loop meaning it should be (much) quicker.
>          pmuldq m1, m2
>          pmuldq m5, m2
>          pmuldq m7, m2
> @@ -210,7 +215,7 @@ mova [rsp], m4 ; save sign extend mask
>
>          dec negj
>          inc posj
> -        jnz .looporder
> +        jnz .looporder1
>
>      HACK_PSRAQ m0, m3, [rsp], m2 ; p >>= shift
>      HACK_PSRAQ m4, m3, [rsp], m2
> @@ -218,22 +223,70 @@ mova [rsp], m4 ; save sign extend mask
>      CLIPQ m0, [pq_int_min], [pq_int_max], m2 ; clip(p >> shift)
>      CLIPQ m4, [pq_int_min], [pq_int_max], m2
>      CLIPQ m6, [pq_int_min], [pq_int_max], m2
> -    pshufd m0, m0, q0020 ; pack into first 2 dwords
> -    pshufd m4, m4, q0020
> -    pshufd m6, m6, q0020
> -    movh m1, [smpq]
> -    movh m5, [smpq+mmsize/2]
> -    movh m7, [smpq+mmsize]
> +    movu m1, [smpq]
> +    movu m5, [smpq+mmsize]
> +    movu m7, [smpq+mmsize*2]
>      psubd m1, m0 ; smp[i] - p
>      psubd m5, m4
>      psubd m7, m6
> -    movh [resq], m1 ; res[i] = smp[i] - (p >> shift)
> -    movh [resq+mmsize/2], m5
> -    movh [resq+mmsize], m7
> +    mova [rsp+mmsize], m1 ; res[i] = smp[i] - (p >> shift)
> +    mova [rsp+mmsize*2], m5
> +    mova [rsp+mmsize*3], m7
> +
> +    ; process "even" samples
> +    pxor m0, m0
> +    pxor m4, m4
> +    pxor m6, m6
> +    mov posj, orderq
> +    xor negj, negj
> +
> +    .looporder2:
> +        movd m2, [coefsq+posj*4] ; c = coefs[j]
> +        SPLATD m2
> +        movu m1, [smpq+negj*4] ; s = smp[i-j-1]
> +        movu m5, [smpq+negj*4+mmsize]
> +        movu m7, [smpq+negj*4+mmsize*2]
> +        pmuldq m1, m2
> +        pmuldq m5, m2
> +        pmuldq m7, m2
> +        paddq m0, m1 ; p += c * s
> +        paddq m4, m5
> +        paddq m6, m7
> +
> +        dec negj
> +        inc posj
> +        jnz .looporder2
> +
> +    HACK_PSRAQ m0, m3, [rsp], m2 ; p >>= shift
> +    HACK_PSRAQ m4, m3, [rsp], m2
> +    HACK_PSRAQ m6, m3, [rsp], m2
> +    CLIPQ m0, [pq_int_min], [pq_int_max], m2 ; clip(p >> shift)
> +    CLIPQ m4, [pq_int_min], [pq_int_max], m2
> +    CLIPQ m6, [pq_int_min], [pq_int_max], m2
> +    movu m1, [smpq+4]
> +    movu m5, [smpq+4+mmsize]
> +    movu m7, [smpq+4+mmsize*2]
> +    psubd m1, m0 ; smp[i] - p
> +    psubd m5, m4
> +    psubd m7, m6
> +
> +    ; interleave odd and even samples
> +    pslldq m1, 4
> +    pslldq m5, 4
> +    pslldq m7, 4
> +
> +    pblendw m1, [rsp+mmsize], q0303
> +    pblendw m5, [rsp+mmsize*2], q0303
> +    pblendw m7, [rsp+mmsize*3], q0303
> +
> +    movu [resq], m1
> +    movu [resq+mmsize], m5
> +    movu [resq+mmsize*2], m7
> +
> +    add resq, 3*mmsize
> +    add smpq, 3*mmsize
> +    sub length, (3*mmsize)/4
>
> -    add resq, (3*mmsize)/2
> -    add smpq, (3*mmsize)/2
> -    sub length, (3*mmsize)/8
>  jg .looplen
>  RET
>
> --
> 2.15.0
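For anyone following along, the scalar computation both halves vectorize is, going by the comments in the asm, roughly the following. This is only a sketch to read the patch against (the prototype and names are my approximations, not the real C code); smp points at the first sample to predict, with the previous order samples sitting just before it:

    #include <stdint.h>
    #include <limits.h>

    static void lpc_residual_ref(int32_t *res, const int32_t *smp,
                                 int len, int order,
                                 const int32_t *coefs, int shift)
    {
        for (int i = 0; i < len; i++) {
            int64_t p = 0;
            /* p += c * s, with s = smp[i-j-1] */
            for (int j = 0; j < order; j++)
                p += (int64_t)coefs[j] * smp[i - j - 1];
            p >>= shift;                      /* p >>= shift (HACK_PSRAQ) */
            if (p < INT32_MIN) p = INT32_MIN; /* clip(p >> shift) (CLIPQ) */
            if (p > INT32_MAX) p = INT32_MAX;
            res[i] = smp[i] - (int32_t)p;     /* res[i] = smp[i] - (p >> shift) */
        }
    }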
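The pmuldq comment is worth an illustration. In intrinsics terms the trick is roughly this (a sketch only, SSE4.1; mul_even_dwords is just a name I made up):

    #include <smmintrin.h> /* SSE4.1: _mm_mul_epi32 == pmuldq */
    #include <stdint.h>

    /* pmuldq multiplies only the 0th and 2nd signed dwords of each operand
     * and widens the two products to qwords, so a plain movu load feeds it
     * directly: no pmovzxdq/punpck step is needed to widen the samples
     * first. */
    static __m128i mul_even_dwords(const int32_t *smp, __m128i coef)
    {
        __m128i s = _mm_loadu_si128((const __m128i *)smp); /* movu m1, [smpq+...] */
        return _mm_mul_epi32(s, coef);                     /* pmuldq m1, m2 */
    }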
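Likewise, if I read the q0303 mask right (word-select 0x33), the interleave at the end corresponds to something like this sketch:

    #include <smmintrin.h>

    /* The second pass's residuals sit in dwords 0 and 2; pslldq by 4 bytes
     * moves them to dwords 1 and 3, then pblendw takes dwords 0 and 2
     * (word pairs 0,1 and 4,5, i.e. imm8 0x33) from the first pass's
     * results saved on the stack, restoring the original sample order. */
    static __m128i interleave_res(__m128i even_res, __m128i odd_res)
    {
        even_res = _mm_slli_si128(even_res, 4);          /* pslldq m1, 4 */
        return _mm_blend_epi16(even_res, odd_res, 0x33); /* pblendw m1, [rsp+...], q0303 */
    }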
Apart from that lgtm

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel