From: Guillaume Khayat <gkha...@spectre-music.com> Improve the performance (+17%) of the ebur128 filter by using AVX2 and FMA instructions in the body of the filter_frame function.
## Benchmark Tested with hyperfine hyperfine --warmup 2 "./ffmpeg_reference -i ~/test.wav -vn -af ebur128=peak=none:framelog=quiet -f null -" "./ffmpeg_avx -i ~/test.wav -vn -af ebur128=peak=none:framelog=quiet -f null -" Benchmark 1: ./ffmpeg_reference -i ~/test.wav -vn -af ebur128=peak=none:framelog=quiet -f null - Time (mean ± σ): 7.118 s ± 0.037 s [User: 9.114 s, System: 1.038 s] Range (min … max): 7.073 s … 7.177 s 10 runs Benchmark 2: ./ffmpeg_avx -i ~/test.wav -vn -af ebur128=peak=none:framelog=quiet -f null - Time (mean ± σ): 6.073 s ± 0.108 s [User: 7.903 s, System: 1.058 s] Range (min … max): 5.955 s … 6.327 s 10 runs Summary ./ffmpeg_avx -i ~/test.wav -vn -af ebur128=peak=none:framelog=quiet -f null - ran 1.17 ± 0.02 times faster than ./ffmpeg_reference -i ~/test.wav -vn -af ebur128=peak=none:framelog=quiet -f null - ## Tests - all FATE tests pass, tested on Darwin/arm64 and Linux/x86_64 w/ AVX2/FMA support - On AVX2/FMA-capable system, all test files from the EBU yield the exact same output values (I/LRA) after and before optimization. See https://tech.ebu.ch/publications/ebu_loudness_test_set Disclaimer: this is my first ever patch submission to FFmpeg, and first ever time using git send-email to submit a patch anywhere. Signed-off-by: Cesar Matheus <cesar.math...@telecom-paris.fr> Signed-off-by: Guillaume Khayat <gkha...@spectre-music.com> --- libavfilter/f_ebur128.c | 246 ++++++++++++++++++++++++++++++++++------ 1 file changed, 214 insertions(+), 32 deletions(-) diff --git a/libavfilter/f_ebur128.c b/libavfilter/f_ebur128.c index 768f062bac..e305b0a3ce 100644 --- a/libavfilter/f_ebur128.c +++ b/libavfilter/f_ebur128.c @@ -28,7 +28,7 @@ #include <float.h> #include <math.h> - +#include "libavutil/intmath.h" #include "libavutil/avassert.h" #include "libavutil/channel_layout.h" #include "libavutil/dict.h" @@ -199,7 +199,7 @@ static const AVOption ebur128_options[] = { }; AVFILTER_DEFINE_CLASS(ebur128); - +#define MIN(a, b) ((a) < (b) ? 
(a) : (b)) static const uint8_t graph_colors[] = { 0xdd, 0x66, 0x66, // value above 1LU non reached below -1LU (impossible) 0x66, 0x66, 0xdd, // value below 1LU non reached below -1LU @@ -628,13 +628,61 @@ static int gate_update(struct integrator *integ, double power, static int filter_frame(AVFilterLink *inlink, AVFrame *insamples) { - int i, ch, idx_insample, ret; + + int i, ch, idx_insample, ret,bin_id_400,bin_id_3000; AVFilterContext *ctx = inlink->dst; EBUR128Context *ebur128 = ctx->priv; const int nb_channels = ebur128->nb_channels; const int nb_samples = insamples->nb_samples; const double *samples = (double *)insamples->data[0]; AVFrame *pic; + +#if HAVE_AVX2_EXTERNAL && HAVE_AVX2 + double bin[4]; + __m256d pre_b_0,pre_b_1,pre_b_2,pre_a_1,pre_a_2,rlb_b_0,rlb_b_1,rlb_b_2,rlb_a_1,rlb_a_2,x1,x2,x0,y1,y2,y0,z1,z2,z0; // + + /** + * Set each coeeficients value of pre_b,pre_a,rlb_b, rlb_a as vector variable of size 4, each element + * of the vectors corresponds to a channel, here we coded the case where there is 2 channel to deal with, case where nb_channel =3 or + * 4 can easily be implemented following the exact same methodology + */ + + // Case where nb_channel = 3 : pre_b_0 = _mm256_setr_ps(ebur128->pre_b[0],ebur128->pre_b[0], ebur128->pre_b[0], 0.0); + bin[0] = 0.0; + bin[1] = 0.0; + bin[2] = 0.0; + bin[3] = 0.0; + + // Load pre_b coefficients in 3 4*64 bits vector + pre_b_0 = _mm256_set1_pd(ebur128->pre_b[0]); + pre_b_1 = _mm256_set1_pd(ebur128->pre_b[1]); + pre_b_2 = _mm256_set1_pd(ebur128->pre_b[2]); + + // Load pre_a coefficients in 2 4*64 bits vector pre_a_0 is not used here so no need to lad it + pre_a_1 = _mm256_set1_pd(ebur128->pre_a[1]); + pre_a_2 = _mm256_set1_pd(ebur128->pre_a[2]); + + // Load rlb_b + rlb_b_0 = _mm256_set1_pd(ebur128->rlb_b[0]); + rlb_b_1 = _mm256_set1_pd(ebur128->rlb_b[1]); + rlb_b_2 = _mm256_set1_pd(ebur128->rlb_b[2]); + + // Load rlb_a + rlb_a_1 = _mm256_set1_pd(ebur128->rlb_a[1]); + rlb_a_2 = _mm256_set1_pd(ebur128->rlb_a[2]); 
+ + // At the start all the buffer filter are set at 0 in the start + x1 = _mm256_set1_pd(0.0); + x2 = _mm256_set1_pd(0.0); + + y0 = _mm256_set1_pd(0.0); + y1 = _mm256_set1_pd(0.0); + y2 = _mm256_set1_pd(0.0); + + z0 =_mm256_set1_pd(0.0); + z1 = _mm256_set1_pd(0.0); + z2 = _mm256_set1_pd(0.0); +#endif #if CONFIG_SWRESAMPLE if (ebur128->peak_mode & PEAK_MODE_TRUE_PEAKS && ebur128->idx_insample == 0) { @@ -657,8 +705,8 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *insamples) #endif for (idx_insample = ebur128->idx_insample; idx_insample < nb_samples; idx_insample++) { - const int bin_id_400 = ebur128->i400.cache_pos; - const int bin_id_3000 = ebur128->i3000.cache_pos; + bin_id_400 = ebur128->i400.cache_pos; + bin_id_3000 = ebur128->i3000.cache_pos; #define MOVE_TO_NEXT_CACHED_ENTRY(time) do { \ ebur128->i##time.cache_pos++; \ @@ -671,46 +719,180 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *insamples) MOVE_TO_NEXT_CACHED_ENTRY(400); MOVE_TO_NEXT_CACHED_ENTRY(3000); + +#if HAVE_AVX2_EXTERNAL && HAVE_AVX2 + // Performs filter computation in parallel for the first 4 channels of the audio file + for (ch = 0; ch < MIN(4,nb_channels); ch++) { + if (ebur128->peak_mode & PEAK_MODE_SAMPLES_PEAKS){ + ebur128->sample_peaks[ch] = FFMAX(ebur128->sample_peaks[0], fabs(samples[idx_insample * nb_channels ])); + } + bin[ch] = samples[idx_insample * nb_channels +ch ]; + } + + // Initialise x0 + x0 = _mm256_setr_pd(bin[0],bin[1], bin[2],bin[3]); + y2 = y1; + y1 = y0; + y0 = _mm256_fmadd_pd(x0,pre_b_0,_mm256_fmadd_pd(x1,pre_b_1,_mm256_fmadd_pd(x2,pre_b_2,_mm256_fnmsub_pd(y1,pre_a_1,_mm256_mul_pd(y2,pre_a_2))))); + + + x2 = x1; + x1 = x0; + + z2 = z1; + z1 = z0; + + z0 = _mm256_fmadd_pd(y0,rlb_b_0,_mm256_fmadd_pd(y1,rlb_b_1,_mm256_fmadd_pd(y2,rlb_b_2,_mm256_fnmsub_pd(z1,rlb_a_1,_mm256_mul_pd(z2,rlb_a_2))))); + + + // Retrieve the filtered values stored in Z0, bin[i] gets the value corresponding to the channel i + _mm256_store_pd(bin, _mm256_mul_pd(z0, z0)); + + + 
+ /** + * Add the new value, and limit the sum to the cache size (400ms or 3s) + * by removing the oldest one + * update sum and cache, demanding on the number of channel + */ + switch(nb_channels){ + case 1: + + ebur128->i400.sum [0] = ebur128->i400.sum [0] + bin[0] - ebur128->i400.cache [0][bin_id_400]; + ebur128->i3000.sum[0] = ebur128->i3000.sum[0] + bin[0] - ebur128->i3000.cache[0][bin_id_3000]; + ebur128->i400.cache [0][bin_id_400 ] = bin[0]; + ebur128->i3000.cache[0][bin_id_3000] = bin[0]; + break; + case 2: + + ebur128->i400.sum [0] = ebur128->i400.sum [0] + bin[0] - ebur128->i400.cache [0][bin_id_400]; + ebur128->i3000.sum[0] = ebur128->i3000.sum[0] + bin[0] - ebur128->i3000.cache[0][bin_id_3000]; + + ebur128->i400.cache [0][bin_id_400 ] = bin[0]; + ebur128->i3000.cache[0][bin_id_3000] = bin[0]; + + ebur128->i400.sum [1] = ebur128->i400.sum [1] + bin[1] - ebur128->i400.cache [1][bin_id_400]; + ebur128->i3000.sum[1] = ebur128->i3000.sum[1] + bin[1] - ebur128->i3000.cache[1][bin_id_3000]; - for (ch = 0; ch < nb_channels; ch++) { - double bin; + ebur128->i400.cache [1][bin_id_400 ] = bin[1]; + ebur128->i3000.cache[1][bin_id_3000] = bin[1]; + break; + + case 3: + + ebur128->i400.sum [0] = ebur128->i400.sum [0] + bin[0] - ebur128->i400.cache [0][bin_id_400]; + ebur128->i3000.sum[0] = ebur128->i3000.sum[0] + bin[0] - ebur128->i3000.cache[0][bin_id_3000]; + ebur128->i400.cache [0][bin_id_400 ] = bin[0]; + ebur128->i3000.cache[0][bin_id_3000] = bin[0]; + + ebur128->i400.sum [1] = ebur128->i400.sum [1] + bin[1] - ebur128->i400.cache [1][bin_id_400]; + ebur128->i3000.sum[1] = ebur128->i3000.sum[1] + bin[1] - ebur128->i3000.cache[1][bin_id_3000]; + ebur128->i400.cache [1][bin_id_400 ] = bin[1]; + ebur128->i3000.cache[1][bin_id_3000] = bin[1]; + + ebur128->i400.sum [2] = ebur128->i400.sum [2] + bin[2] - ebur128->i400.cache [2][bin_id_400]; + ebur128->i3000.sum[2] = ebur128->i3000.sum[2] + bin[2] - ebur128->i3000.cache[2][bin_id_3000]; + ebur128->i400.cache 
[2][bin_id_400 ] = bin[2]; + ebur128->i3000.cache[2][bin_id_3000] = bin[2]; + break; + + default : + ebur128->i400.sum[0] = ebur128->i400.sum [0] + bin[0] - ebur128->i400.cache [0][bin_id_400]; + ebur128->i3000.sum[0] = ebur128->i3000.sum[0] + bin[0] - ebur128->i3000.cache[0][bin_id_3000]; + ebur128->i400.cache[0][bin_id_400 ] = bin[0]; + ebur128->i3000.cache[0][bin_id_3000] = bin[0]; + + ebur128->i400.sum[1] = ebur128->i400.sum [1] + bin[1] - ebur128->i400.cache [1][bin_id_400]; + ebur128->i3000.sum[1] = ebur128->i3000.sum[1] + bin[1] - ebur128->i3000.cache[1][bin_id_3000]; + ebur128->i400.cache[1][bin_id_400 ] = bin[1]; + ebur128->i3000.cache[1][bin_id_3000] = bin[1]; + + ebur128->i400.sum[2] = ebur128->i400.sum [2] + bin[2] - ebur128->i400.cache [2][bin_id_400]; + ebur128->i3000.sum[2] = ebur128->i3000.sum[2] + bin[2] - ebur128->i3000.cache[2][bin_id_3000]; + ebur128->i400.cache[2][bin_id_400 ] = bin[2]; + ebur128->i3000.cache[2][bin_id_3000] = bin[2]; + + ebur128->i400.sum[3] = ebur128->i400.sum [3] + bin[3] - ebur128->i400.cache [3][bin_id_400]; + ebur128->i3000.sum[3] = ebur128->i3000.sum[3] + bin[3] - ebur128->i3000.cache[3][bin_id_3000]; + ebur128->i400.cache [3][bin_id_400 ] = bin[3]; + ebur128->i3000.cache[3][bin_id_3000] = bin[3]; + break; + } + // Use the classic version to compute data from the remainings channels + for (ch = 4; ch < nb_channels; ch++) { + double bin2; if (ebur128->peak_mode & PEAK_MODE_SAMPLES_PEAKS) ebur128->sample_peaks[ch] = FFMAX(ebur128->sample_peaks[ch], fabs(samples[idx_insample * nb_channels + ch])); ebur128->x[ch * 3] = samples[idx_insample * nb_channels + ch]; // set X[i] - if (!ebur128->ch_weighting[ch]) continue; + + #define FILTER(Y, X, NUM, DEN) do { \ + double *dst = ebur128->Y + ch*3; \ + double *src = ebur128->X + ch*3; \ + dst[2] = dst[1]; \ + dst[1] = dst[0]; \ + dst[0] = src[0]*NUM[0] + src[1]*NUM[1] + src[2]*NUM[2] \ + - dst[1]*DEN[1] - dst[2]*DEN[2]; \ + } while (0) + + // TODO: merge both filters in one? 
+ FILTER(y, x, ebur128->pre_b, ebur128->pre_a); // apply pre-filter + ebur128->x[ch * 3 + 2] = ebur128->x[ch * 3 + 1]; + ebur128->x[ch * 3 + 1] = ebur128->x[ch * 3 ]; + FILTER(z, y, ebur128->rlb_b, ebur128->rlb_a); // apply RLB-filter + + bin2 = ebur128->z[ch * 3] * ebur128->z[ch * 3]; + + /* Add the new value, and limit the sum to the cache size (400ms or 3s) + * by removing the oldest one */ + ebur128->i400.sum [ch] = ebur128->i400.sum [ch] + bin2 - ebur128->i400.cache [ch][bin_id_400]; + ebur128->i3000.sum[ch] = ebur128->i3000.sum[ch] + bin2 - ebur128->i3000.cache[ch][bin_id_3000]; + + // Override old cache entry with the new value + ebur128->i400.cache [ch][bin_id_400 ] = bin2; + ebur128->i3000.cache[ch][bin_id_3000] = bin2; + } - /* Y[i] = X[i]*b0 + X[i-1]*b1 + X[i-2]*b2 - Y[i-1]*a1 - Y[i-2]*a2 */ -#define FILTER(Y, X, NUM, DEN) do { \ - double *dst = ebur128->Y + ch*3; \ - double *src = ebur128->X + ch*3; \ - dst[2] = dst[1]; \ - dst[1] = dst[0]; \ - dst[0] = src[0]*NUM[0] + src[1]*NUM[1] + src[2]*NUM[2] \ - - dst[1]*DEN[1] - dst[2]*DEN[2]; \ -} while (0) - - // TODO: merge both filters in one? 
- FILTER(y, x, ebur128->pre_b, ebur128->pre_a); // apply pre-filter - ebur128->x[ch * 3 + 2] = ebur128->x[ch * 3 + 1]; - ebur128->x[ch * 3 + 1] = ebur128->x[ch * 3 ]; - FILTER(z, y, ebur128->rlb_b, ebur128->rlb_a); // apply RLB-filter +#else - bin = ebur128->z[ch * 3] * ebur128->z[ch * 3]; + for (ch = 0; ch < nb_channels; ch++) { + double bin; + if (ebur128->peak_mode & PEAK_MODE_SAMPLES_PEAKS) + ebur128->sample_peaks[ch] = FFMAX(ebur128->sample_peaks[ch], fabs(samples[idx_insample * nb_channels + ch])); - /* add the new value, and limit the sum to the cache size (400ms or 3s) - * by removing the oldest one */ - ebur128->i400.sum [ch] = ebur128->i400.sum [ch] + bin - ebur128->i400.cache [ch][bin_id_400]; - ebur128->i3000.sum[ch] = ebur128->i3000.sum[ch] + bin - ebur128->i3000.cache[ch][bin_id_3000]; + ebur128->x[ch * 3] = samples[idx_insample * nb_channels + ch]; // Set X[i] - /* override old cache entry with the new value */ - ebur128->i400.cache [ch][bin_id_400 ] = bin; - ebur128->i3000.cache[ch][bin_id_3000] = bin; + if (!ebur128->ch_weighting[ch]) + continue; + + #define FILTER(Y, X, NUM, DEN) do { \ + double *dst = ebur128->Y + ch*3; \ + double *src = ebur128->X + ch*3; \ + dst[2] = dst[1]; \ + dst[1] = dst[0]; \ + dst[0] = src[0]*NUM[0] + src[1]*NUM[1] + src[2]*NUM[2] \ + - dst[1]*DEN[1] - dst[2]*DEN[2]; \ + } while (0) + + // TODO: merge both filters in one? 
+ FILTER(y, x, ebur128->pre_b, ebur128->pre_a); // Apply pre-filter + ebur128->x[ch * 3 + 2] = ebur128->x[ch * 3 + 1]; + ebur128->x[ch * 3 + 1] = ebur128->x[ch * 3 ]; + FILTER(z, y, ebur128->rlb_b, ebur128->rlb_a); // Apply RLB-filter + bin = ebur128->z[ch * 3] * ebur128->z[ch * 3]; + /* Add the new value, and limit the sum to the cache size (400ms or 3s) + * by removing the oldest one */ + ebur128->i400.sum [ch] = ebur128->i400.sum [ch] + bin - ebur128->i400.cache [ch][bin_id_400]; + ebur128->i3000.sum[ch] = ebur128->i3000.sum[ch] + bin - ebur128->i3000.cache[ch][bin_id_3000]; + // Override old cache entry with the new value + ebur128->i400.cache [ch][bin_id_400 ] = bin; + ebur128->i3000.cache[ch][bin_id_3000] = bin; } - +#endif + #define FIND_PEAK(global, sp, ptype) do { \ int ch; \ double maxpeak; \ -- 2.49.0 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".