                                 A55                A76
chrRangeFromJpeg8_1920_c:      28842.4             6346.5
chrRangeFromJpeg8_1920_neon:    5310.9 ( 5.43x)    2264.2 ( 2.80x)
chrRangeToJpeg8_1920_c:        36520.7             9514.0
chrRangeToJpeg8_1920_neon:      6033.2 ( 6.05x)    2645.5 ( 3.60x)
lumRangeFromJpeg8_1920_c:      15387.2             4444.5
lumRangeFromJpeg8_1920_neon:    3148.9 ( 4.89x)    1108.0 ( 4.01x)
lumRangeToJpeg8_1920_c:        19226.4             6015.5
lumRangeToJpeg8_1920_neon:      3866.7 ( 4.97x)    1344.8 ( 4.47x)
---
 libswscale/aarch64/range_convert_neon.S | 63 +++++++++++++------------
 libswscale/aarch64/swscale.c            | 14 +++---
 2 files changed, 41 insertions(+), 36 deletions(-)
diff --git a/libswscale/aarch64/range_convert_neon.S b/libswscale/aarch64/range_convert_neon.S index 30991ab2a6..3454ee4932 100644 --- a/libswscale/aarch64/range_convert_neon.S +++ b/libswscale/aarch64/range_convert_neon.S @@ -20,20 +20,21 @@ #include "libavutil/aarch64/asm.S" -.macro lumConvertRange name, max, mult, offset, shift -function ff_\name, export=1 -.if \max != 0 - mov w3, #\max - dup v24.8h, w3 +.macro lumConvertRange fromto +function ff_lumRange\fromto\()Jpeg_neon, export=1 +// x0 int16_t *dst +// w1 int width +// w2 int amax +// w3 int coeff +// x4 int64_t offset +.ifc \fromto, To + dup v24.8h, w2 .endif - mov w3, #\mult dup v25.4s, w3 - movz w3, #(\offset & 0xffff) - movk w3, #((\offset >> 16) & 0xffff), lsl #16 - dup v26.4s, w3 + dup v26.4s, w4 1: ld1 {v0.8h}, [x0] -.if \max != 0 +.ifc \fromto, To smin v0.8h, v0.8h, v24.8h .endif mov v16.16b, v26.16b @@ -42,8 +43,8 @@ function ff_\name, export=1 sxtl2 v22.4s, v0.8h mla v16.4s, v20.4s, v25.4s mla v18.4s, v22.4s, v25.4s - shrn v0.4h, v16.4s, #\shift - shrn2 v0.8h, v18.4s, #\shift + shrn v0.4h, v16.4s, 14 + shrn2 v0.8h, v18.4s, 14 subs w1, w1, #8 st1 {v0.8h}, [x0], #16 b.gt 1b @@ -51,21 +52,23 @@ function ff_\name, export=1 endfunc .endm -.macro chrConvertRange name, max, mult, offset, shift -function ff_\name, export=1 -.if \max != 0 - mov w3, #\max +.macro chrConvertRange fromto +function ff_chrRange\fromto\()Jpeg_neon, export=1 +// x0 int16_t *dstU +// x1 int16_t *dstV +// w2 int width +// w3 int amax +// w4 int coeff +// x5 int64_t offset +.ifc \fromto, To dup v24.8h, w3 .endif - mov w3, #\mult - dup v25.4s, w3 - movz w3, #(\offset & 0xffff) - movk w3, #((\offset >> 16) & 0xffff), lsl #16 - dup v26.4s, w3 + dup v25.4s, w4 + dup v26.4s, w5 1: ld1 {v0.8h}, [x0] ld1 {v1.8h}, [x1] -.if \max != 0 +.ifc \fromto, To smin v0.8h, v0.8h, v24.8h smin v1.8h, v1.8h, v24.8h .endif @@ -81,10 +84,10 @@ function ff_\name, export=1 mla v17.4s, v21.4s, v25.4s mla v18.4s, v22.4s, v25.4s mla v19.4s, v23.4s, v25.4s - 
shrn v0.4h, v16.4s, #\shift - shrn v1.4h, v17.4s, #\shift - shrn2 v0.8h, v18.4s, #\shift - shrn2 v1.8h, v19.4s, #\shift + shrn v0.4h, v16.4s, 14 + shrn v1.4h, v17.4s, 14 + shrn2 v0.8h, v18.4s, 14 + shrn2 v1.8h, v19.4s, 14 subs w2, w2, #8 st1 {v0.8h}, [x0], #16 st1 {v1.8h}, [x1], #16 @@ -93,7 +96,7 @@ function ff_\name, export=1 endfunc .endm -lumConvertRange lumRangeToJpeg_neon, 30189, 19077, -39057361, 14 -chrConvertRange chrRangeToJpeg_neon, 30775, 4663, -9289992, 12 -lumConvertRange lumRangeFromJpeg_neon, 0, 14071, 33561947, 14 -chrConvertRange chrRangeFromJpeg_neon, 0, 1799, 4081085, 11 +lumConvertRange To +chrConvertRange To +lumConvertRange From +chrConvertRange From diff --git a/libswscale/aarch64/swscale.c b/libswscale/aarch64/swscale.c index 21788cad5d..55fb81c1e3 100644 --- a/libswscale/aarch64/swscale.c +++ b/libswscale/aarch64/swscale.c @@ -218,14 +218,17 @@ NEON_INPUT(bgra32); NEON_INPUT(rgb24); NEON_INPUT(rgba32); -void ff_lumRangeFromJpeg_neon(int16_t *dst, int width); -void ff_chrRangeFromJpeg_neon(int16_t *dstU, int16_t *dstV, int width); -void ff_lumRangeToJpeg_neon(int16_t *dst, int width); -void ff_chrRangeToJpeg_neon(int16_t *dstU, int16_t *dstV, int width); +void ff_lumRangeFromJpeg_neon(int16_t *dst, int width, + int amax, int coeff, int64_t offset); +void ff_chrRangeFromJpeg_neon(int16_t *dstU, int16_t *dstV, int width, + int amax, int coeff, int64_t offset); +void ff_lumRangeToJpeg_neon(int16_t *dst, int width, + int amax, int coeff, int64_t offset); +void ff_chrRangeToJpeg_neon(int16_t *dstU, int16_t *dstV, int width, + int amax, int coeff, int64_t offset); av_cold void ff_sws_init_range_convert_aarch64(SwsContext *c) { -#if 0 int cpu_flags = av_get_cpu_flags(); if (have_neon(cpu_flags)) { @@ -239,7 +242,6 @@ av_cold void ff_sws_init_range_convert_aarch64(SwsContext *c) } } } -#endif } av_cold void ff_sws_init_swscale_aarch64(SwsContext *c) -- 2.30.2 _______________________________________________ ffmpeg-devel mailing list 
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".