On 2/21/15, James Almer <jamr...@gmail.com> wrote: > Zeroing them allows us to use scalarproduct_int16 even if the length of > vectors is not multiple of 16. > > Signed-off-by: James Almer <jamr...@gmail.com> > --- > libavcodec/takdec.c | 28 +++++----------------------- > 1 file changed, 5 insertions(+), 23 deletions(-) > > diff --git a/libavcodec/takdec.c b/libavcodec/takdec.c > index 2f0155d..0f808e0 100644 > --- a/libavcodec/takdec.c > +++ b/libavcodec/takdec.c > @@ -451,6 +451,7 @@ static int decode_subframe(TAKDecContext *s, int32_t > *decoded, > s->filter[j] = x - ((tfilter[i] + y) >> (15 - filter_quant)); > s->filter[i] = x - ((tfilter[j] + y) >> (15 - filter_quant)); > } > + memset(s->filter + filter_order, 0, sizeof(s->filter) - > filter_order*2); > > if ((ret = decode_residues(s, &decoded[filter_order], > subframe_size - filter_order)) < 0) > @@ -467,15 +468,8 @@ static int decode_subframe(TAKDecContext *s, int32_t > *decoded, > for (i = 0; i < tmp; i++) { > int v = 1 << (filter_quant - 1); > > - if (filter_order & -16) > - v += s->adsp.scalarproduct_int16(&s->residues[i], > s->filter, > - filter_order & -16); > - for (j = filter_order & -16; j < filter_order; j += 4) { > - v += s->residues[i + j + 3] * s->filter[j + 3] + > - s->residues[i + j + 2] * s->filter[j + 2] + > - s->residues[i + j + 1] * s->filter[j + 1] + > - s->residues[i + j ] * s->filter[j ]; > - } > + v += s->adsp.scalarproduct_int16(&s->residues[i], s->filter, > + FFALIGN(filter_order, 16)); > v = (av_clip(v >> filter_quant, -8192, 8191) << dshift) - > *decoded; > *decoded++ = v; > s->residues[filter_order + i] = v >> dshift; > @@ -601,6 +595,7 @@ static int decorrelate(TAKDecContext *s, int c1, int c2, > int length) > code_size = 14 - get_bits(gb, 3); > s->filter[i] = get_sbits(gb, code_size); > } > + memset(s->filter + filter_order, 0, sizeof(s->filter) - > filter_order*2); > > order_half = filter_order / 2; > length2 = length - (filter_order - 1); > @@ -638,20 +633,7 @@ static int 
decorrelate(TAKDecContext *s, int c1, int > c2, int length) > for (i = 0; i < tmp; i++) { > int v = 1 << 9; > > - if (filter_order == 16) { > - v += s->adsp.scalarproduct_int16(&s->residues[i], > s->filter, > - filter_order); > - } else { > - v += s->residues[i + 7] * s->filter[7] + > - s->residues[i + 6] * s->filter[6] + > - s->residues[i + 5] * s->filter[5] + > - s->residues[i + 4] * s->filter[4] + > - s->residues[i + 3] * s->filter[3] + > - s->residues[i + 2] * s->filter[2] + > - s->residues[i + 1] * s->filter[1] + > - s->residues[i ] * s->filter[0]; > - } > - > + v += s->adsp.scalarproduct_int16(&s->residues[i], > s->filter, 16); > v = (av_clip(v >> 10, -8192, 8191) << dshift) - *p1; > *p1++ = v; > } > -- > 2.3.0 > > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > http://ffmpeg.org/mailman/listinfo/ffmpeg-devel >
Have you measured the performance before and after this change, to check whether it causes a drop? _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel