On 4/27/2016 2:22 PM, foo86 wrote: > --- > Changelog | 1 + > libavcodec/Makefile | 2 +- > libavcodec/dca_lbr.c | 1858 > ++++++++++++++++++++++++++++++++++++++++++++++++++ > libavcodec/dca_lbr.h | 130 ++++ > libavcodec/dcadata.c | 460 +++++++++++++ > libavcodec/dcadata.h | 46 ++ > libavcodec/dcadec.c | 22 +- > libavcodec/dcadec.h | 5 +- > libavcodec/dcadsp.c | 27 + > libavcodec/dcadsp.h | 4 + > libavcodec/dcahuff.c | 245 ++++++- > libavcodec/dcahuff.h | 13 + > 12 files changed, 2807 insertions(+), 6 deletions(-) > create mode 100644 libavcodec/dca_lbr.c > create mode 100644 libavcodec/dca_lbr.h
[...] > +#define SW0 0.022810893 > +#define SW1 0.41799772 > +#define SW2 0.9084481 > +#define SW3 0.99973983 > + > +#define C1 0.068974845 > +#define C2 0.34675997 > +#define C3 0.29396889 > +#define C4 0.19642374 > + > +#define AL1 0.30865827 > +#define AL2 0.038060233 Make sure these are float constants (add an f suffix, e.g. 0.022810893f). Unsuffixed literals are double, so gcc, clang and icc are all converting the time_samples below to double before multiplying, making the following function much slower than it should be. > + > +static void transform_channel(DCALbrDecoder *s, int ch, float *output) > +{ > + LOCAL_ALIGNED(32, float, values, [DCA_LBR_SUBBANDS ], [4]); > + LOCAL_ALIGNED(32, float, result, [DCA_LBR_SUBBANDS * 2], [4]); Use LOCAL_ALIGNED_32(float, ...) here instead. > + int i, sf, nsubbands = s->nsubbands, noutsubbands = 8 << s->freq_range; > + > + // Clear inactive subbands > + if (nsubbands < noutsubbands) > + memset(values[nsubbands], 0, (noutsubbands - nsubbands) * > sizeof(values[0])); > + > + for (sf = 0; sf < DCA_LBR_TIME_SAMPLES / 4; sf++) { > + // Short window and 8 point forward MDCT According to perf, a lot of CPU time is spent on this and the aliasing cancellation code below, at least with a mono sample I found in the wild. Fixing the constants above helps a lot, though. It looks like this shouldn't be hard to write using SIMD, so it may be a good idea to move this part to dcadsp. 
> + for (i = 0; i < nsubbands; i++) { > + float *samples = &s->time_samples[ch][i][sf * 4]; > + > + float a = samples[-4] * SW0 - samples[-1] * SW3; > + float b = samples[-3] * SW1 - samples[-2] * SW2; > + float c = samples[ 2] * SW1 + samples[ 1] * SW2; > + float d = samples[ 3] * SW0 + samples[ 0] * SW3; > + > + values[i][0] = C1 * b - C2 * c + C4 * a - C3 * d; > + values[i][1] = C1 * d - C2 * a - C4 * b - C3 * c; > + values[i][2] = C3 * b + C2 * d - C4 * c + C1 * a; > + values[i][3] = C3 * a - C2 * b + C4 * d - C1 * c; > + } > + > + // Aliasing cancellation for high frequencies > + for (i = 12; i < nsubbands - 1; i++) { > + float a = values[i ][3] * AL1; > + float b = values[i+1][0] * AL1; > + values[i ][3] += b - a; > + values[i+1][0] -= b + a; > + a = values[i ][2] * AL2; > + b = values[i+1][1] * AL2; > + values[i ][2] += b - a; > + values[i+1][1] -= b + a; > + } > + > + base_func_synth(s, ch, values[0], sf); > + > + s->imdct.imdct_calc(&s->imdct, result[0], values[0]); > + > + // Long window and overlap-add > + s->fdsp->vector_fmul_add(output, result[0], s->window, > + s->history[ch], noutsubbands * 4); > + s->fdsp->vector_fmul_reverse(s->history[ch], result[noutsubbands], > + s->window, noutsubbands * 4); > + output += noutsubbands * 4; > + } > + > + // Update history for LPC and forward MDCT > + for (i = 0; i < nsubbands; i++) { > + float *samples = s->time_samples[ch][i] - DCA_LBR_TIME_HISTORY; > + memcpy(samples, samples + DCA_LBR_TIME_SAMPLES, DCA_LBR_TIME_HISTORY > * sizeof(float)); > + } > +} _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel