On 4/27/2016 2:22 PM, foo86 wrote:
> ---
>  Changelog            |    1 +
>  libavcodec/Makefile  |    2 +-
>  libavcodec/dca_lbr.c | 1858 ++++++++++++++++++++++++++++++++++++++++++++++++++
>  libavcodec/dca_lbr.h |  130 ++++
>  libavcodec/dcadata.c |  460 +++++++++++++
>  libavcodec/dcadata.h |   46 ++
>  libavcodec/dcadec.c  |   22 +-
>  libavcodec/dcadec.h  |    5 +-
>  libavcodec/dcadsp.c  |   27 +
>  libavcodec/dcadsp.h  |    4 +
>  libavcodec/dcahuff.c |  245 ++++++-
>  libavcodec/dcahuff.h |   13 +
>  12 files changed, 2807 insertions(+), 6 deletions(-)
>  create mode 100644 libavcodec/dca_lbr.c
>  create mode 100644 libavcodec/dca_lbr.h


> +#define SW0     0.022810893
> +#define SW1     0.41799772
> +#define SW2     0.9084481
> +#define SW3     0.99973983
> +
> +#define C1      0.068974845
> +#define C2      0.34675997
> +#define C3      0.29396889
> +#define C4      0.19642374
> +
> +#define AL1     0.30865827
> +#define AL2     0.038060233

Make sure these are float constants (add an "f" suffix, e.g. 0.022810893f).
As written they are double literals, so gcc, clang and icc all promote the
expressions below to double before multiplying, making the following function
much slower than it should be.

> +
> +static void transform_channel(DCALbrDecoder *s, int ch, float *output)
> +{
> +    LOCAL_ALIGNED(32, float, values, [DCA_LBR_SUBBANDS    ], [4]);
> +    LOCAL_ALIGNED(32, float, result, [DCA_LBR_SUBBANDS * 2], [4]);

Use LOCAL_ALIGNED_32(float, ...) here instead of LOCAL_ALIGNED(32, float, ...).

> +    int i, sf, nsubbands = s->nsubbands, noutsubbands = 8 << s->freq_range;
> +
> +    // Clear inactive subbands
> +    if (nsubbands < noutsubbands)
> +        memset(values[nsubbands], 0, (noutsubbands - nsubbands) * 
> sizeof(values[0]));
> +
> +    for (sf = 0; sf < DCA_LBR_TIME_SAMPLES / 4; sf++) {
> +        // Short window and 8 point forward MDCT

According to perf, a lot of CPU time is spent on this and the aliasing 
code below, at least with a mono sample I found in the wild. Fixing the 
above helps a lot, though.

It looks like it shouldn't be hard to write using SIMD, so maybe it would be a 
good idea to move this part to dcadsp.

> +        for (i = 0; i < nsubbands; i++) {
> +            float *samples = &s->time_samples[ch][i][sf * 4];
> +
> +            float a = samples[-4] * SW0 - samples[-1] * SW3;
> +            float b = samples[-3] * SW1 - samples[-2] * SW2;
> +            float c = samples[ 2] * SW1 + samples[ 1] * SW2;
> +            float d = samples[ 3] * SW0 + samples[ 0] * SW3;
> +
> +            values[i][0] = C1 * b - C2 * c + C4 * a - C3 * d;
> +            values[i][1] = C1 * d - C2 * a - C4 * b - C3 * c;
> +            values[i][2] = C3 * b + C2 * d - C4 * c + C1 * a;
> +            values[i][3] = C3 * a - C2 * b + C4 * d - C1 * c;
> +        }
> +
> +        // Aliasing cancellation for high frequencies
> +        for (i = 12; i < nsubbands - 1; i++) {
> +            float a = values[i  ][3] * AL1;
> +            float b = values[i+1][0] * AL1;
> +            values[i  ][3] += b - a;
> +            values[i+1][0] -= b + a;
> +            a = values[i  ][2] * AL2;
> +            b = values[i+1][1] * AL2;
> +            values[i  ][2] += b - a;
> +            values[i+1][1] -= b + a;
> +        }
> +
> +        base_func_synth(s, ch, values[0], sf);
> +
> +        s->imdct.imdct_calc(&s->imdct, result[0], values[0]);
> +
> +        // Long window and overlap-add
> +        s->fdsp->vector_fmul_add(output, result[0], s->window,
> +                                 s->history[ch], noutsubbands * 4);
> +        s->fdsp->vector_fmul_reverse(s->history[ch], result[noutsubbands],
> +                                     s->window, noutsubbands * 4);
> +        output += noutsubbands * 4;
> +    }
> +
> +    // Update history for LPC and forward MDCT
> +    for (i = 0; i < nsubbands; i++) {
> +        float *samples = s->time_samples[ch][i] - DCA_LBR_TIME_HISTORY;
> +        memcpy(samples, samples + DCA_LBR_TIME_SAMPLES, DCA_LBR_TIME_HISTORY 
> * sizeof(float));
> +    }
> +}

ffmpeg-devel mailing list

Reply via email to