aarch64 A55: chrRangeFromJpeg8_1920_c: 28835.2 (1.00x) chrRangeFromJpeg8_1920_neon: 5313.9 (5.43x) 5308.4 (5.43x) chrRangeToJpeg8_1920_c: 23074.7 (1.00x) chrRangeToJpeg8_1920_neon: 5551.3 (4.16x) 5549.2 (4.16x) lumRangeFromJpeg8_1920_c: 15389.7 (1.00x) lumRangeFromJpeg8_1920_neon: 3152.3 (4.88x) 3147.7 (4.89x) lumRangeToJpeg8_1920_c: 19227.8 (1.00x) lumRangeToJpeg8_1920_neon: 3628.7 (5.30x) 3630.2 (5.30x)
aarch64 A76: chrRangeFromJpeg8_1920_c: 6324.4 (1.00x) chrRangeFromJpeg8_1920_neon: 2344.5 (2.70x) 2304.2 (2.74x) chrRangeToJpeg8_1920_c: 9656.0 (1.00x) chrRangeToJpeg8_1920_neon: 2824.2 (3.42x) 2794.2 (3.46x) lumRangeFromJpeg8_1920_c: 4422.0 (1.00x) lumRangeFromJpeg8_1920_neon: 1104.5 (4.00x) 1106.2 (4.00x) lumRangeToJpeg8_1920_c: 5949.1 (1.00x) lumRangeToJpeg8_1920_neon: 1329.8 (4.47x) 1328.2 (4.48x) --- libswscale/aarch64/range_convert_neon.S | 59 +++++++++++++------------ libswscale/aarch64/swscale.c | 17 ++++--- 2 files changed, 39 insertions(+), 37 deletions(-) diff --git a/libswscale/aarch64/range_convert_neon.S b/libswscale/aarch64/range_convert_neon.S index 2f418adb24..462ba6f866 100644 --- a/libswscale/aarch64/range_convert_neon.S +++ b/libswscale/aarch64/range_convert_neon.S @@ -20,12 +20,13 @@ #include "libavutil/aarch64/asm.S" -.macro lumConvertRange name, fromto, mult, offset, shift -function ff_\name, export=1 - mov w3, #\mult - dup v25.4s, w3 - movz w3, #(\offset & 0xffff) - movk w3, #((\offset >> 16) & 0xffff), lsl #16 +.macro lumConvertRange fromto +function ff_lumRange\fromto\()Jpeg_neon, export=1 +// x0 int16_t *dst +// w1 int width +// w2 uint32_t coeff +// x3 int64_t offset + dup v25.4s, w2 dup v26.4s, w3 1: ld1 {v0.8h}, [x0] @@ -36,11 +37,11 @@ function ff_\name, export=1 mla v16.4s, v20.4s, v25.4s mla v18.4s, v22.4s, v25.4s .ifc \fromto, To - sqshrn v0.4h, v16.4s, #\shift - sqshrn2 v0.8h, v18.4s, #\shift + sqshrn v0.4h, v16.4s, 14 + sqshrn2 v0.8h, v18.4s, 14 .else - shrn v0.4h, v16.4s, #\shift - shrn2 v0.8h, v18.4s, #\shift + shrn v0.4h, v16.4s, 14 + shrn2 v0.8h, v18.4s, 14 .endif subs w1, w1, #8 st1 {v0.8h}, [x0], #16 @@ -49,13 +50,15 @@ function ff_\name, export=1 endfunc .endm -.macro chrConvertRange name, fromto, mult, offset, shift -function ff_\name, export=1 - mov w3, #\mult +.macro chrConvertRange fromto +function ff_chrRange\fromto\()Jpeg_neon, export=1 +// x0 int16_t *dstU +// x1 int16_t *dstV +// w2 int width +// w3 uint32_t coeff 
+// x4 int64_t offset dup v25.4s, w3 - movz w3, #(\offset & 0xffff) - movk w3, #((\offset >> 16) & 0xffff), lsl #16 - dup v26.4s, w3 + dup v26.4s, w4 1: ld1 {v0.8h}, [x0] ld1 {v1.8h}, [x1] @@ -72,15 +75,15 @@ function ff_\name, export=1 mla v18.4s, v22.4s, v25.4s mla v19.4s, v23.4s, v25.4s .ifc \fromto, To - sqshrn v0.4h, v16.4s, #\shift - sqshrn v1.4h, v17.4s, #\shift - sqshrn2 v0.8h, v18.4s, #\shift - sqshrn2 v1.8h, v19.4s, #\shift + sqshrn v0.4h, v16.4s, 14 + sqshrn v1.4h, v17.4s, 14 + sqshrn2 v0.8h, v18.4s, 14 + sqshrn2 v1.8h, v19.4s, 14 .else - shrn v0.4h, v16.4s, #\shift - shrn v1.4h, v17.4s, #\shift - shrn2 v0.8h, v18.4s, #\shift - shrn2 v1.8h, v19.4s, #\shift + shrn v0.4h, v16.4s, 14 + shrn v1.4h, v17.4s, 14 + shrn2 v0.8h, v18.4s, 14 + shrn2 v1.8h, v19.4s, 14 .endif subs w2, w2, #8 st1 {v0.8h}, [x0], #16 @@ -90,7 +93,7 @@ function ff_\name, export=1 endfunc .endm -lumConvertRange lumRangeToJpeg_neon, To, 19077, -39057361, 14 -chrConvertRange chrRangeToJpeg_neon, To, 4663, -9289992, 12 -lumConvertRange lumRangeFromJpeg_neon, From, 14071, 33561947, 14 -chrConvertRange chrRangeFromJpeg_neon, From, 1799, 4081085, 11 +lumConvertRange To +chrConvertRange To +lumConvertRange From +chrConvertRange From diff --git a/libswscale/aarch64/swscale.c b/libswscale/aarch64/swscale.c index 1fce77df26..b8679734c4 100644 --- a/libswscale/aarch64/swscale.c +++ b/libswscale/aarch64/swscale.c @@ -218,17 +218,17 @@ NEON_INPUT(bgra32); NEON_INPUT(rgb24); NEON_INPUT(rgba32); -void ff_lumRangeFromJpeg_neon(int16_t *dst, int width); -void ff_chrRangeFromJpeg_neon(int16_t *dstU, int16_t *dstV, int width); -void ff_lumRangeToJpeg_neon(int16_t *dst, int width); -void ff_chrRangeToJpeg_neon(int16_t *dstU, int16_t *dstV, int width); +void ff_lumRangeFromJpeg_neon(int16_t *dst, int width, + uint32_t coeff, int64_t offset); +void ff_chrRangeFromJpeg_neon(int16_t *dstU, int16_t *dstV, int width, + uint32_t coeff, int64_t offset); +void ff_lumRangeToJpeg_neon(int16_t *dst, int width, + 
uint32_t coeff, int64_t offset); +void ff_chrRangeToJpeg_neon(int16_t *dstU, int16_t *dstV, int width, + uint32_t coeff, int64_t offset); av_cold void ff_sws_init_range_convert_aarch64(SwsInternal *c) { - /* This code is currently disabled because of changes in the base - * implementation of these functions. This code should be enabled - * again once those changes are ported to this architecture. */ -#if 0 int cpu_flags = av_get_cpu_flags(); if (have_neon(cpu_flags)) { @@ -242,7 +242,6 @@ av_cold void ff_sws_init_range_convert_aarch64(SwsInternal *c) } } } -#endif } av_cold void ff_sws_init_swscale_aarch64(SwsInternal *c) -- 2.39.5 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".