From: Reimar Döffinger <reimar.doeffin...@gmx.de> This requests loops to be vectorized using SIMD instructions. The performance increase is far from hand-optimized assembly but still significant over the plain C version. Typical values are a 2-4x speedup where a hand-written version would achieve 4x-10x. So it is far from a replacement; however, some architectures will get hand-written assembler quite late or not at all, and this is a good improvement for a trivial amount of work. The cause of the remaining gap, besides the compiler being a compiler, is usually that it does not manage to use saturating instructions and thus has to use 32-bit operations where actually saturating 16-bit operations would be sufficient. Other causes are for example the av_clip functions that are not ideal for vectorization (and even as scalar code not optimal for any modern CPU that has either CSEL or MAX/MIN instructions). And of course this only works for relatively simple loops; the IDCT functions, for example, did not seem possible to optimize that way. Also note that while clang may accept the code and sometimes produces warnings, it does not seem to do anything actually useful at all. Here are example measurements using gcc 10 under Linux (in a VM unfortunately) on AArch64 on Apple M1: Command: time ./ffplay_g LG\ 4K\ HDR\ Demo\ -\ New\ York.ts -t 10 -autoexit -threads 1 -noframedrop
Original code: real 0m19.572s user 0m23.386s sys 0m0.213s Changing all put_hevc: real 0m15.648s user 0m19.503s (83.4% of original) sys 0m0.186s In addition changing add_residual: real 0m15.424s user 0m19.278s (82.4% of original) sys 0m0.133s In addition changing planar copy dither: real 0m15.040s user 0m18.874s (80.7% of original) sys 0m0.168s Signed-off-by: Reimar Döffinger <reimar.doeffin...@gmx.de> --- configure | 23 +++++++++++++++++ libavcodec/hevcdsp_template.c | 47 +++++++++++++++++++++++++++++++++++ libavutil/internal.h | 6 +++++ libswscale/swscale_unscaled.c | 3 +++ 4 files changed, 79 insertions(+) diff --git a/configure b/configure index 900505756b..73b7c3daeb 100755 --- a/configure +++ b/configure @@ -406,6 +406,7 @@ Toolchain options: --enable-pic build position-independent code --enable-thumb compile for Thumb instruction set --enable-lto use link-time optimization + --enable-openmp-simd use the "omp simd" pragma to optimize code --env="ENV=override" override the environment variables Advanced options (experts only): @@ -2335,6 +2336,7 @@ HAVE_LIST=" opencl_dxva2 opencl_vaapi_beignet opencl_vaapi_intel_media + openmp_simd perl pod2man texi2html @@ -2446,6 +2448,7 @@ CMDLINE_SELECT=" extra_warnings logging lto + openmp_simd optimizations rpath stripping @@ -6926,6 +6929,26 @@ if enabled lto; then disable inline_asm_direct_symbol_refs fi +if enabled openmp_simd; then + ompopt="-fopenmp" + if ! 
test_cflags $ompopt ; then + test_cflags -Xpreprocessor -fopenmp && ompopt="-Xpreprocessor -fopenmp" + fi + test_cc $ompopt <<EOF && add_cflags "$ompopt" || die "failed to enable openmp SIMD" +#ifndef _OPENMP +#error _OPENMP is not defined +#endif +void test(unsigned char *c) +{ + _Pragma("omp simd") + for (int i = 0; i < 256; i++) + { + c[i] *= 16; + } +} +EOF +fi + enabled ftrapv && check_cflags -ftrapv test_cc -mno-red-zone <<EOF && noredzone_flags="-mno-red-zone" diff --git a/libavcodec/hevcdsp_template.c b/libavcodec/hevcdsp_template.c index 56cd9e605d..1a8b4160ec 100644 --- a/libavcodec/hevcdsp_template.c +++ b/libavcodec/hevcdsp_template.c @@ -50,6 +50,7 @@ static av_always_inline void FUNC(add_residual)(uint8_t *_dst, int16_t *res, stride /= sizeof(pixel); for (y = 0; y < size; y++) { + FF_OMP_SIMD for (x = 0; x < size; x++) { dst[x] = av_clip_pixel(dst[x] + *res); res++; @@ -247,6 +248,7 @@ static void FUNC(idct_ ## H ## x ## H )(int16_t *coeffs, \ int16_t *src = coeffs; \ IDCT_VAR ## H(H); \ \ + FF_OMP_SIMD \ for (i = 0; i < H; i++) { \ TR_ ## H(src, src, H, H, SCALE, limit2); \ if (limit2 < H && i%4 == 0 && !!i) \ @@ -256,6 +258,7 @@ static void FUNC(idct_ ## H ## x ## H )(int16_t *coeffs, \ \ shift = 20 - BIT_DEPTH; \ add = 1 << (shift - 1); \ + FF_OMP_SIMD \ for (i = 0; i < H; i++) { \ TR_ ## H(coeffs, coeffs, 1, 1, SCALE, limit); \ coeffs += H; \ @@ -502,6 +505,7 @@ static void FUNC(put_hevc_pel_pixels)(int16_t *dst, ptrdiff_t srcstride = _srcstride / sizeof(pixel); for (y = 0; y < height; y++) { + FF_OMP_SIMD for (x = 0; x < width; x++) dst[x] = src[x] << (14 - BIT_DEPTH); src += srcstride; @@ -543,6 +547,7 @@ static void FUNC(put_hevc_pel_bi_pixels)(uint8_t *_dst, ptrdiff_t _dststride, ui #endif for (y = 0; y < height; y++) { + FF_OMP_SIMD for (x = 0; x < width; x++) dst[x] = av_clip_pixel(((src[x] << (14 - BIT_DEPTH)) + src2[x] + offset) >> shift); src += srcstride; @@ -568,6 +573,7 @@ static void FUNC(put_hevc_pel_uni_w_pixels)(uint8_t *_dst, 
ptrdiff_t _dststride, ox = ox * (1 << (BIT_DEPTH - 8)); for (y = 0; y < height; y++) { + FF_OMP_SIMD for (x = 0; x < width; x++) dst[x] = av_clip_pixel((((src[x] << (14 - BIT_DEPTH)) * wx + offset) >> shift) + ox); src += srcstride; @@ -592,6 +598,7 @@ static void FUNC(put_hevc_pel_bi_w_pixels)(uint8_t *_dst, ptrdiff_t _dststride, ox0 = ox0 * (1 << (BIT_DEPTH - 8)); ox1 = ox1 * (1 << (BIT_DEPTH - 8)); for (y = 0; y < height; y++) { + FF_OMP_SIMD for (x = 0; x < width; x++) { dst[x] = av_clip_pixel(( (src[x] << (14 - BIT_DEPTH)) * wx1 + src2[x] * wx0 + (ox0 + ox1 + 1) * (1 << log2Wd)) >> (log2Wd + 1)); } @@ -623,6 +630,7 @@ static void FUNC(put_hevc_qpel_h)(int16_t *dst, ptrdiff_t srcstride = _srcstride / sizeof(pixel); const int8_t *filter = ff_hevc_qpel_filters[mx - 1]; for (y = 0; y < height; y++) { + FF_OMP_SIMD for (x = 0; x < width; x++) dst[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); src += srcstride; @@ -639,6 +647,7 @@ static void FUNC(put_hevc_qpel_v)(int16_t *dst, ptrdiff_t srcstride = _srcstride / sizeof(pixel); const int8_t *filter = ff_hevc_qpel_filters[my - 1]; for (y = 0; y < height; y++) { + FF_OMP_SIMD for (x = 0; x < width; x++) dst[x] = QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8); src += srcstride; @@ -662,6 +671,7 @@ static void FUNC(put_hevc_qpel_hv)(int16_t *dst, src -= QPEL_EXTRA_BEFORE * srcstride; filter = ff_hevc_qpel_filters[mx - 1]; for (y = 0; y < height + QPEL_EXTRA; y++) { + FF_OMP_SIMD for (x = 0; x < width; x++) tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); src += srcstride; @@ -671,6 +681,7 @@ static void FUNC(put_hevc_qpel_hv)(int16_t *dst, tmp = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE; filter = ff_hevc_qpel_filters[my - 1]; for (y = 0; y < height; y++) { + FF_OMP_SIMD for (x = 0; x < width; x++) dst[x] = QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6; tmp += MAX_PB_SIZE; @@ -697,6 +708,7 @@ static void FUNC(put_hevc_qpel_uni_h)(uint8_t *_dst, ptrdiff_t _dststride, #endif for (y = 0; y < height; y++) { + FF_OMP_SIMD for (x = 0; x < 
width; x++) dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + offset) >> shift); src += srcstride; @@ -724,6 +736,7 @@ static void FUNC(put_hevc_qpel_bi_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_ #endif for (y = 0; y < height; y++) { + FF_OMP_SIMD for (x = 0; x < width; x++) dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift); src += srcstride; @@ -751,6 +764,7 @@ static void FUNC(put_hevc_qpel_uni_v)(uint8_t *_dst, ptrdiff_t _dststride, #endif for (y = 0; y < height; y++) { + FF_OMP_SIMD for (x = 0; x < width; x++) dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + offset) >> shift); src += srcstride; @@ -779,6 +793,7 @@ static void FUNC(put_hevc_qpel_bi_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_ #endif for (y = 0; y < height; y++) { + FF_OMP_SIMD for (x = 0; x < width; x++) dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift); src += srcstride; @@ -810,6 +825,7 @@ static void FUNC(put_hevc_qpel_uni_hv)(uint8_t *_dst, ptrdiff_t _dststride, src -= QPEL_EXTRA_BEFORE * srcstride; filter = ff_hevc_qpel_filters[mx - 1]; for (y = 0; y < height + QPEL_EXTRA; y++) { + FF_OMP_SIMD for (x = 0; x < width; x++) tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); src += srcstride; @@ -820,6 +836,7 @@ static void FUNC(put_hevc_qpel_uni_hv)(uint8_t *_dst, ptrdiff_t _dststride, filter = ff_hevc_qpel_filters[my - 1]; for (y = 0; y < height; y++) { + FF_OMP_SIMD for (x = 0; x < width; x++) dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + offset) >> shift); tmp += MAX_PB_SIZE; @@ -849,6 +866,7 @@ static void FUNC(put_hevc_qpel_bi_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8 src -= QPEL_EXTRA_BEFORE * srcstride; filter = ff_hevc_qpel_filters[mx - 1]; for (y = 0; y < height + QPEL_EXTRA; y++) { + FF_OMP_SIMD for (x = 0; x < width; x++) tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); src += srcstride; @@ -859,6 +877,7 @@ static void 
FUNC(put_hevc_qpel_bi_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8 filter = ff_hevc_qpel_filters[my - 1]; for (y = 0; y < height; y++) { + FF_OMP_SIMD for (x = 0; x < width; x++) dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + src2[x] + offset) >> shift); tmp += MAX_PB_SIZE; @@ -887,6 +906,7 @@ static void FUNC(put_hevc_qpel_uni_w_h)(uint8_t *_dst, ptrdiff_t _dststride, ox = ox * (1 << (BIT_DEPTH - 8)); for (y = 0; y < height; y++) { + FF_OMP_SIMD for (x = 0; x < width; x++) dst[x] = av_clip_pixel((((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox); src += srcstride; @@ -913,6 +933,7 @@ static void FUNC(put_hevc_qpel_bi_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint ox0 = ox0 * (1 << (BIT_DEPTH - 8)); ox1 = ox1 * (1 << (BIT_DEPTH - 8)); for (y = 0; y < height; y++) { + FF_OMP_SIMD for (x = 0; x < width; x++) dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 + ((ox0 + ox1 + 1) * (1 << log2Wd))) >> (log2Wd + 1)); @@ -942,6 +963,7 @@ static void FUNC(put_hevc_qpel_uni_w_v)(uint8_t *_dst, ptrdiff_t _dststride, ox = ox * (1 << (BIT_DEPTH - 8)); for (y = 0; y < height; y++) { + FF_OMP_SIMD for (x = 0; x < width; x++) dst[x] = av_clip_pixel((((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox); src += srcstride; @@ -968,6 +990,7 @@ static void FUNC(put_hevc_qpel_bi_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint ox0 = ox0 * (1 << (BIT_DEPTH - 8)); ox1 = ox1 * (1 << (BIT_DEPTH - 8)); for (y = 0; y < height; y++) { + FF_OMP_SIMD for (x = 0; x < width; x++) dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 + ((ox0 + ox1 + 1) * (1 << log2Wd))) >> (log2Wd + 1)); @@ -1000,6 +1023,7 @@ static void FUNC(put_hevc_qpel_uni_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, src -= QPEL_EXTRA_BEFORE * srcstride; filter = ff_hevc_qpel_filters[mx - 1]; for (y = 0; y < height + QPEL_EXTRA; y++) { + FF_OMP_SIMD for (x = 0; x < width; x++) tmp[x] = 
QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); src += srcstride; @@ -1011,6 +1035,7 @@ static void FUNC(put_hevc_qpel_uni_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, ox = ox * (1 << (BIT_DEPTH - 8)); for (y = 0; y < height; y++) { + FF_OMP_SIMD for (x = 0; x < width; x++) dst[x] = av_clip_pixel((((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx + offset) >> shift) + ox); tmp += MAX_PB_SIZE; @@ -1037,6 +1062,7 @@ static void FUNC(put_hevc_qpel_bi_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uin src -= QPEL_EXTRA_BEFORE * srcstride; filter = ff_hevc_qpel_filters[mx - 1]; for (y = 0; y < height + QPEL_EXTRA; y++) { + FF_OMP_SIMD for (x = 0; x < width; x++) tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); src += srcstride; @@ -1049,6 +1075,7 @@ static void FUNC(put_hevc_qpel_bi_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uin ox0 = ox0 * (1 << (BIT_DEPTH - 8)); ox1 = ox1 * (1 << (BIT_DEPTH - 8)); for (y = 0; y < height; y++) { + FF_OMP_SIMD for (x = 0; x < width; x++) dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx1 + src2[x] * wx0 + ((ox0 + ox1 + 1) * (1 << log2Wd))) >> (log2Wd + 1)); @@ -1076,6 +1103,7 @@ static void FUNC(put_hevc_epel_h)(int16_t *dst, ptrdiff_t srcstride = _srcstride / sizeof(pixel); const int8_t *filter = ff_hevc_epel_filters[mx - 1]; for (y = 0; y < height; y++) { + FF_OMP_SIMD for (x = 0; x < width; x++) dst[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); src += srcstride; @@ -1093,6 +1121,7 @@ static void FUNC(put_hevc_epel_v)(int16_t *dst, const int8_t *filter = ff_hevc_epel_filters[my - 1]; for (y = 0; y < height; y++) { + FF_OMP_SIMD for (x = 0; x < width; x++) dst[x] = EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8); src += srcstride; @@ -1114,6 +1143,7 @@ static void FUNC(put_hevc_epel_hv)(int16_t *dst, src -= EPEL_EXTRA_BEFORE * srcstride; for (y = 0; y < height + EPEL_EXTRA; y++) { + FF_OMP_SIMD for (x = 0; x < width; x++) tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); src += srcstride; @@ -1124,6 +1154,7 @@ static void 
FUNC(put_hevc_epel_hv)(int16_t *dst, filter = ff_hevc_epel_filters[my - 1]; for (y = 0; y < height; y++) { + FF_OMP_SIMD for (x = 0; x < width; x++) dst[x] = EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6; tmp += MAX_PB_SIZE; @@ -1148,6 +1179,7 @@ static void FUNC(put_hevc_epel_uni_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8 #endif for (y = 0; y < height; y++) { + FF_OMP_SIMD for (x = 0; x < width; x++) dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + offset) >> shift); src += srcstride; @@ -1173,6 +1205,7 @@ static void FUNC(put_hevc_epel_bi_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_ #endif for (y = 0; y < height; y++) { + FF_OMP_SIMD for (x = 0; x < width; x++) { dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift); } @@ -1199,6 +1232,7 @@ static void FUNC(put_hevc_epel_uni_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8 #endif for (y = 0; y < height; y++) { + FF_OMP_SIMD for (x = 0; x < width; x++) dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + offset) >> shift); src += srcstride; @@ -1224,6 +1258,7 @@ static void FUNC(put_hevc_epel_bi_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_ #endif for (y = 0; y < height; y++) { + FF_OMP_SIMD for (x = 0; x < width; x++) dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift); dst += dststride; @@ -1253,6 +1288,7 @@ static void FUNC(put_hevc_epel_uni_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint src -= EPEL_EXTRA_BEFORE * srcstride; for (y = 0; y < height + EPEL_EXTRA; y++) { + FF_OMP_SIMD for (x = 0; x < width; x++) tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); src += srcstride; @@ -1263,6 +1299,7 @@ static void FUNC(put_hevc_epel_uni_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint filter = ff_hevc_epel_filters[my - 1]; for (y = 0; y < height; y++) { + FF_OMP_SIMD for (x = 0; x < width; x++) dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + offset) >> shift); tmp += MAX_PB_SIZE; @@ 
-1292,6 +1329,7 @@ static void FUNC(put_hevc_epel_bi_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8 src -= EPEL_EXTRA_BEFORE * srcstride; for (y = 0; y < height + EPEL_EXTRA; y++) { + FF_OMP_SIMD for (x = 0; x < width; x++) tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); src += srcstride; @@ -1302,6 +1340,7 @@ static void FUNC(put_hevc_epel_bi_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8 filter = ff_hevc_epel_filters[my - 1]; for (y = 0; y < height; y++) { + FF_OMP_SIMD for (x = 0; x < width; x++) dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + src2[x] + offset) >> shift); tmp += MAX_PB_SIZE; @@ -1328,6 +1367,7 @@ static void FUNC(put_hevc_epel_uni_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uin ox = ox * (1 << (BIT_DEPTH - 8)); for (y = 0; y < height; y++) { + FF_OMP_SIMD for (x = 0; x < width; x++) { dst[x] = av_clip_pixel((((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox); } @@ -1353,6 +1393,7 @@ static void FUNC(put_hevc_epel_bi_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint ox0 = ox0 * (1 << (BIT_DEPTH - 8)); ox1 = ox1 * (1 << (BIT_DEPTH - 8)); for (y = 0; y < height; y++) { + FF_OMP_SIMD for (x = 0; x < width; x++) dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 + ((ox0 + ox1 + 1) * (1 << log2Wd))) >> (log2Wd + 1)); @@ -1380,6 +1421,7 @@ static void FUNC(put_hevc_epel_uni_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uin ox = ox * (1 << (BIT_DEPTH - 8)); for (y = 0; y < height; y++) { + FF_OMP_SIMD for (x = 0; x < width; x++) { dst[x] = av_clip_pixel((((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox); } @@ -1405,6 +1447,7 @@ static void FUNC(put_hevc_epel_bi_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint ox0 = ox0 * (1 << (BIT_DEPTH - 8)); ox1 = ox1 * (1 << (BIT_DEPTH - 8)); for (y = 0; y < height; y++) { + FF_OMP_SIMD for (x = 0; x < width; x++) dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 + ((ox0 + ox1 + 
1) * (1 << log2Wd))) >> (log2Wd + 1)); @@ -1435,6 +1478,7 @@ static void FUNC(put_hevc_epel_uni_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, ui src -= EPEL_EXTRA_BEFORE * srcstride; for (y = 0; y < height + EPEL_EXTRA; y++) { + FF_OMP_SIMD for (x = 0; x < width; x++) tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); src += srcstride; @@ -1446,6 +1490,7 @@ static void FUNC(put_hevc_epel_uni_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, ui ox = ox * (1 << (BIT_DEPTH - 8)); for (y = 0; y < height; y++) { + FF_OMP_SIMD for (x = 0; x < width; x++) dst[x] = av_clip_pixel((((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx + offset) >> shift) + ox); tmp += MAX_PB_SIZE; @@ -1472,6 +1517,7 @@ static void FUNC(put_hevc_epel_bi_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uin src -= EPEL_EXTRA_BEFORE * srcstride; for (y = 0; y < height + EPEL_EXTRA; y++) { + FF_OMP_SIMD for (x = 0; x < width; x++) tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); src += srcstride; @@ -1484,6 +1530,7 @@ static void FUNC(put_hevc_epel_bi_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uin ox0 = ox0 * (1 << (BIT_DEPTH - 8)); ox1 = ox1 * (1 << (BIT_DEPTH - 8)); for (y = 0; y < height; y++) { + FF_OMP_SIMD for (x = 0; x < width; x++) dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx1 + src2[x] * wx0 + ((ox0 + ox1 + 1) * (1 << log2Wd))) >> (log2Wd + 1)); diff --git a/libavutil/internal.h b/libavutil/internal.h index 93ea57c324..b0543bbf02 100644 --- a/libavutil/internal.h +++ b/libavutil/internal.h @@ -299,4 +299,10 @@ int avpriv_dict_set_timestamp(AVDictionary **dict, const char *key, int64_t time #define FF_PSEUDOPAL 0 #endif +#if HAVE_OPENMP_SIMD +#define FF_OMP_SIMD _Pragma("omp simd") +#else +#define FF_OMP_SIMD +#endif + #endif /* AVUTIL_INTERNAL_H */ diff --git a/libswscale/swscale_unscaled.c b/libswscale/swscale_unscaled.c index c4dd8a4d83..c112a61037 100644 --- a/libswscale/swscale_unscaled.c +++ b/libswscale/swscale_unscaled.c @@ -1743,6 +1743,7 @@ static int packedCopyWrapper(SwsContext *c, 
const uint8_t *src[], unsigned shift= src_depth-dst_depth, tmp;\ if (c->dither == SWS_DITHER_NONE) {\ for (i = 0; i < height; i++) {\ + FF_OMP_SIMD \ for (j = 0; j < length-7; j+=8) {\ dst[j+0] = dbswap(bswap(src[j+0])>>shift);\ dst[j+1] = dbswap(bswap(src[j+1])>>shift);\ @@ -1762,6 +1763,7 @@ static int packedCopyWrapper(SwsContext *c, const uint8_t *src[], } else if (shiftonly) {\ for (i = 0; i < height; i++) {\ const uint8_t *dither= dithers[shift-1][i&7];\ + FF_OMP_SIMD \ for (j = 0; j < length-7; j+=8) {\ tmp = (bswap(src[j+0]) + dither[0])>>shift; dst[j+0] = dbswap(tmp - (tmp>>dst_depth));\ tmp = (bswap(src[j+1]) + dither[1])>>shift; dst[j+1] = dbswap(tmp - (tmp>>dst_depth));\ @@ -1781,6 +1783,7 @@ static int packedCopyWrapper(SwsContext *c, const uint8_t *src[], } else {\ for (i = 0; i < height; i++) {\ const uint8_t *dither= dithers[shift-1][i&7];\ + FF_OMP_SIMD \ for (j = 0; j < length-7; j+=8) {\ tmp = bswap(src[j+0]); dst[j+0] = dbswap((tmp - (tmp>>dst_depth) + dither[0])>>shift);\ tmp = bswap(src[j+1]); dst[j+1] = dbswap((tmp - (tmp>>dst_depth) + dither[1])>>shift);\ -- 2.24.3 (Apple Git-128) _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".