chrRangeFromJpeg8_1920_c: 3874.8 ( 1.00x) chrRangeFromJpeg8_1920_sse2: 1493.8 ( 2.59x) chrRangeFromJpeg8_1920_avx2: 741.8 ( 5.22x) chrRangeToJpeg8_1920_c: 5232.8 ( 1.00x) chrRangeToJpeg8_1920_sse2: 1673.3 ( 3.13x) chrRangeToJpeg8_1920_avx2: 850.6 ( 6.15x) lumRangeFromJpeg8_1920_c: 2416.3 ( 1.00x) lumRangeFromJpeg8_1920_sse2: 760.1 ( 3.18x) lumRangeFromJpeg8_1920_avx2: 379.6 ( 6.37x) lumRangeToJpeg8_1920_c: 3121.1 ( 1.00x) lumRangeToJpeg8_1920_sse2: 870.1 ( 3.59x) lumRangeToJpeg8_1920_avx2: 434.8 ( 7.18x) --- libswscale/x86/range_convert.asm | 112 ++++++++++++++++++------------- libswscale/x86/swscale.c | 14 ++-- 2 files changed, 73 insertions(+), 53 deletions(-)
diff --git a/libswscale/x86/range_convert.asm b/libswscale/x86/range_convert.asm index 97c7525448..d1aff63d7c 100644 --- a/libswscale/x86/range_convert.asm +++ b/libswscale/x86/range_convert.asm @@ -20,55 +20,53 @@ %include "libavutil/x86/x86util.asm" -SECTION_RODATA - -chr_to_mult: times 4 dw 4663, 0 -chr_to_offset: times 4 dd -9289992 -%define chr_to_shift 12 - -chr_from_mult: times 4 dw 1799, 0 -chr_from_offset: times 4 dd 4081085 -%define chr_from_shift 11 - -lum_to_mult: times 4 dw 19077, 0 -lum_to_offset: times 4 dd -39057361 -%define lum_to_shift 14 - -lum_from_mult: times 4 dw 14071, 0 -lum_from_offset: times 4 dd 33561947 -%define lum_from_shift 14 - SECTION .text -; NOTE: there is no need to clamp the input when converting to jpeg range -; (like we do in the C code) because packssdw will saturate the output. - ;----------------------------------------------------------------------------- ; lumConvertRange ; -; void ff_lumRangeToJpeg_<opt>(int16_t *dst, int width); -; void ff_lumRangeFromJpeg_<opt>(int16_t *dst, int width); +; void ff_lumRangeToJpeg_<opt>(int16_t *dst, int width, +; int amax, int coeff, int64_t offset); +; void ff_lumRangeFromJpeg_<opt>(int16_t *dst, int width, +; int amax, int coeff, int64_t offset); ; ;----------------------------------------------------------------------------- -%macro LUMCONVERTRANGE 4 -cglobal %1, 2, 2, 5, dst, width +%macro LUMCONVERTRANGE 1 +%ifidni %1,To +cglobal lumRange%1Jpeg, 5, 5, 6, dst, width, amax, coeff, offset +%else +cglobal lumRange%1Jpeg, 5, 5, 5, dst, width, amax, coeff, offset +%endif shl widthd, 1 - VBROADCASTI128 m2, [%2] - VBROADCASTI128 m3, [%3] + movd xm2, coeffd + VBROADCASTSS m2, xm2 +%if ARCH_X86_64 + movq xm3, offsetq +%else + movq xm3, offsetm +%endif + VBROADCASTSS m3, xm3 pxor m4, m4 +%ifidni %1,To + movd xm5, amaxd + SPLATW m5, xm5 +%endif add dstq, widthq neg widthq .loop: movu m0, [dstq+widthq] +%ifidni %1,To + pminsw m0, m5 +%endif punpckhwd m1, m0, m4 punpcklwd m0, m4 pmaddwd m0, m2 
pmaddwd m1, m2 paddd m0, m3 paddd m1, m3 - psrad m0, %4 - psrad m1, %4 + psrad m0, 14 + psrad m1, 14 packssdw m0, m1 movu [dstq+widthq], m0 add widthq, mmsize @@ -79,23 +77,43 @@ cglobal %1, 2, 2, 5, dst, width ;----------------------------------------------------------------------------- ; chrConvertRange ; -; void ff_chrRangeToJpeg_<opt>(int16_t *dstU, int16_t *dstV, int width); -; void ff_chrRangeFromJpeg_<opt>(int16_t *dstU, int16_t *dstV, int width); +; void ff_chrRangeToJpeg_<opt>(int16_t *dstU, int16_t *dstV, int width, +; int amax, int coeff, int64_t offset); +; void ff_chrRangeFromJpeg_<opt>(int16_t *dstU, int16_t *dstV, int width, +; int amax, int coeff, int64_t offset); ; ;----------------------------------------------------------------------------- -%macro CHRCONVERTRANGE 4 -cglobal %1, 3, 3, 7, dstU, dstV, width +%macro CHRCONVERTRANGE 1 +%ifidni %1,To +cglobal chrRange%1Jpeg, 6, 6, 8, dstU, dstV, width, amax, coeff, offset +%else +cglobal chrRange%1Jpeg, 6, 6, 7, dstU, dstV, width, amax, coeff, offset +%endif shl widthd, 1 - VBROADCASTI128 m4, [%2] - VBROADCASTI128 m5, [%3] + movd xm4, coeffd + VBROADCASTSS m4, xm4 +%if ARCH_X86_64 + movq xm5, offsetq +%else + movq xm5, offsetm +%endif + VBROADCASTSS m5, xm5 pxor m6, m6 +%ifidni %1,To + movd xm7, amaxd + SPLATW m7, xm7 +%endif add dstUq, widthq add dstVq, widthq neg widthq .loop: movu m0, [dstUq+widthq] movu m2, [dstVq+widthq] +%ifidni %1,To + pminsw m0, m7 + pminsw m2, m7 +%endif punpckhwd m1, m0, m6 punpckhwd m3, m2, m6 punpcklwd m0, m6 @@ -108,10 +126,10 @@ cglobal %1, 3, 3, 7, dstU, dstV, width paddd m1, m5 paddd m2, m5 paddd m3, m5 - psrad m0, %4 - psrad m1, %4 - psrad m2, %4 - psrad m3, %4 + psrad m0, 14 + psrad m1, 14 + psrad m2, 14 + psrad m3, 14 packssdw m0, m1 packssdw m2, m3 movu [dstUq+widthq], m0 @@ -122,15 +140,15 @@ cglobal %1, 3, 3, 7, dstU, dstV, width %endmacro INIT_XMM sse2 -LUMCONVERTRANGE lumRangeToJpeg, lum_to_mult, lum_to_offset, lum_to_shift -CHRCONVERTRANGE chrRangeToJpeg, 
chr_to_mult, chr_to_offset, chr_to_shift -LUMCONVERTRANGE lumRangeFromJpeg, lum_from_mult, lum_from_offset, lum_from_shift -CHRCONVERTRANGE chrRangeFromJpeg, chr_from_mult, chr_from_offset, chr_from_shift +LUMCONVERTRANGE To +CHRCONVERTRANGE To +LUMCONVERTRANGE From +CHRCONVERTRANGE From %if HAVE_AVX2_EXTERNAL INIT_YMM avx2 -LUMCONVERTRANGE lumRangeToJpeg, lum_to_mult, lum_to_offset, lum_to_shift -CHRCONVERTRANGE chrRangeToJpeg, chr_to_mult, chr_to_offset, chr_to_shift -LUMCONVERTRANGE lumRangeFromJpeg, lum_from_mult, lum_from_offset, lum_from_shift -CHRCONVERTRANGE chrRangeFromJpeg, chr_from_mult, chr_from_offset, chr_from_shift +LUMCONVERTRANGE To +CHRCONVERTRANGE To +LUMCONVERTRANGE From +CHRCONVERTRANGE From %endif diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c index d55e45471f..2377365e91 100644 --- a/libswscale/x86/swscale.c +++ b/libswscale/x86/swscale.c @@ -464,24 +464,26 @@ INPUT_PLANAR_RGB_A_ALL_DECL(avx2); } while (0) #define RANGE_CONVERT_FUNCS_DECL(opt) \ -void ff_lumRangeFromJpeg_ ##opt(int16_t *dst, int width); \ -void ff_chrRangeFromJpeg_ ##opt(int16_t *dstU, int16_t *dstV, int width); \ -void ff_lumRangeToJpeg_ ##opt(int16_t *dst, int width); \ -void ff_chrRangeToJpeg_ ##opt(int16_t *dstU, int16_t *dstV, int width); \ +void ff_lumRangeFromJpeg_ ##opt(int16_t *dst, int width, \ + int amax, int coeff, int64_t offset); \ +void ff_chrRangeFromJpeg_ ##opt(int16_t *dstU, int16_t *dstV, int width, \ + int amax, int coeff, int64_t offset); \ +void ff_lumRangeToJpeg_ ##opt(int16_t *dst, int width, \ + int amax, int coeff, int64_t offset); \ +void ff_chrRangeToJpeg_ ##opt(int16_t *dstU, int16_t *dstV, int width, \ + int amax, int coeff, int64_t offset); \ RANGE_CONVERT_FUNCS_DECL(sse2); RANGE_CONVERT_FUNCS_DECL(avx2); av_cold void ff_sws_init_range_convert_x86(SwsContext *c) { -#if 0 int cpu_flags = av_get_cpu_flags(); if (EXTERNAL_AVX2_FAST(cpu_flags)) { RANGE_CONVERT_FUNCS(avx2); } else if (EXTERNAL_SSE2(cpu_flags)) { 
RANGE_CONVERT_FUNCS(sse2); } -#endif } av_cold void ff_sws_init_swscale_x86(SwsContext *c) -- 2.30.2 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".