                                  A55                A76
chrRangeFromJpeg16_1920_c:      28840.6             6323.5
chrRangeFromJpeg16_1920_neon:    8436.5 ( 3.42x)    3365.2 ( 1.88x)
chrRangeToJpeg16_1920_c:        23075.1             9195.6
chrRangeToJpeg16_1920_neon:      9393.6 ( 2.46x)    4084.5 ( 2.25x)
lumRangeFromJpeg16_1920_c:      15383.8             4436.8
lumRangeFromJpeg16_1920_neon:    4586.0 ( 3.35x)    1814.0 ( 2.45x)
lumRangeToJpeg16_1920_c:        19225.5             6017.2
lumRangeToJpeg16_1920_neon:      5067.9 ( 3.79x)    2146.4 ( 2.80x)
---
 libswscale/aarch64/range_convert_neon.S | 98 +++++++++++++++++++++++--
 libswscale/aarch64/swscale.c            | 36 ++++++---
 2 files changed, 116 insertions(+), 18 deletions(-)
diff --git a/libswscale/aarch64/range_convert_neon.S b/libswscale/aarch64/range_convert_neon.S index 462ba6f866..c0eb714333 100644 --- a/libswscale/aarch64/range_convert_neon.S +++ b/libswscale/aarch64/range_convert_neon.S @@ -20,12 +20,42 @@ #include "libavutil/aarch64/asm.S" -.macro lumConvertRange fromto -function ff_lumRange\fromto\()Jpeg_neon, export=1 +.macro lumConvertRange fromto, bit_depth +function ff_lumRange\fromto\()Jpeg\bit_depth\()_neon, export=1 // x0 int16_t *dst // w1 int width // w2 uint32_t coeff // x3 int64_t offset +.if \bit_depth == 16 +.ifc \fromto, To + movi v25.4s, #1 + movi v24.4s, #1<<3, lsl #16 + sub v24.4s, v24.4s, v25.4s +.endif + dup v25.4s, w2 + dup v26.2d, x3 +1: + ld1 {v0.4s, v1.4s}, [x0] + mov v16.16b, v26.16b + mov v17.16b, v26.16b + mov v18.16b, v26.16b + mov v19.16b, v26.16b + smlal v16.2d, v0.2s, v25.2s + smlal2 v17.2d, v0.4s, v25.4s + smlal v18.2d, v1.2s, v25.2s + smlal2 v19.2d, v1.4s, v25.4s + shrn v0.2s, v16.2d, 18 + shrn2 v0.4s, v17.2d, 18 + shrn v1.2s, v18.2d, 18 + shrn2 v1.4s, v19.2d, 18 + subs w1, w1, #8 +.ifc \fromto, To + smin v0.4s, v0.4s, v24.4s + smin v1.4s, v1.4s, v24.4s +.endif + st1 {v0.4s, v1.4s}, [x0], #32 + b.gt 1b +.else dup v25.4s, w2 dup v26.4s, w3 1: @@ -46,17 +76,64 @@ function ff_lumRange\fromto\()Jpeg_neon, export=1 subs w1, w1, #8 st1 {v0.8h}, [x0], #16 b.gt 1b +.endif ret endfunc .endm -.macro chrConvertRange fromto -function ff_chrRange\fromto\()Jpeg_neon, export=1 +.macro chrConvertRange fromto, bit_depth +function ff_chrRange\fromto\()Jpeg\bit_depth\()_neon, export=1 // x0 int16_t *dstU // x1 int16_t *dstV // w2 int width // w3 uint32_t coeff // x4 int64_t offset +.if \bit_depth == 16 +.ifc \fromto, To + movi v25.4s, #1 + movi v24.4s, #1<<3, lsl #16 + sub v24.4s, v24.4s, v25.4s +.endif + dup v25.4s, w3 + dup v26.2d, x4 +1: + ld1 {v0.4s, v1.4s}, [x0] + ld1 {v2.4s, v3.4s}, [x1] + mov v16.16b, v26.16b + mov v17.16b, v26.16b + mov v18.16b, v26.16b + mov v19.16b, v26.16b + mov v20.16b, v26.16b + mov 
v21.16b, v26.16b + mov v22.16b, v26.16b + mov v23.16b, v26.16b + smlal v16.2d, v0.2s, v25.2s + smlal2 v17.2d, v0.4s, v25.4s + smlal v18.2d, v1.2s, v25.2s + smlal2 v19.2d, v1.4s, v25.4s + smlal v20.2d, v2.2s, v25.2s + smlal2 v21.2d, v2.4s, v25.4s + smlal v22.2d, v3.2s, v25.2s + smlal2 v23.2d, v3.4s, v25.4s + shrn v0.2s, v16.2d, 18 + shrn2 v0.4s, v17.2d, 18 + shrn v1.2s, v18.2d, 18 + shrn2 v1.4s, v19.2d, 18 + shrn v2.2s, v20.2d, 18 + shrn2 v2.4s, v21.2d, 18 + shrn v3.2s, v22.2d, 18 + shrn2 v3.4s, v23.2d, 18 + subs w2, w2, #8 +.ifc \fromto, To + smin v0.4s, v0.4s, v24.4s + smin v1.4s, v1.4s, v24.4s + smin v2.4s, v2.4s, v24.4s + smin v3.4s, v3.4s, v24.4s +.endif + st1 {v0.4s, v1.4s}, [x0], #32 + st1 {v2.4s, v3.4s}, [x1], #32 + b.gt 1b +.else dup v25.4s, w3 dup v26.4s, w4 1: @@ -89,11 +166,16 @@ function ff_chrRange\fromto\()Jpeg_neon, export=1 st1 {v0.8h}, [x0], #16 st1 {v1.8h}, [x1], #16 b.gt 1b +.endif ret endfunc .endm -lumConvertRange To -chrConvertRange To -lumConvertRange From -chrConvertRange From +lumConvertRange To, 8 +lumConvertRange To, 16 +chrConvertRange To, 8 +chrConvertRange To, 16 +lumConvertRange From, 8 +lumConvertRange From, 16 +chrConvertRange From, 8 +chrConvertRange From, 16 diff --git a/libswscale/aarch64/swscale.c b/libswscale/aarch64/swscale.c index b8679734c4..92c49dcf3a 100644 --- a/libswscale/aarch64/swscale.c +++ b/libswscale/aarch64/swscale.c @@ -218,14 +218,22 @@ NEON_INPUT(bgra32); NEON_INPUT(rgb24); NEON_INPUT(rgba32); -void ff_lumRangeFromJpeg_neon(int16_t *dst, int width, +void ff_lumRangeFromJpeg8_neon(int16_t *dst, int width, + uint32_t coeff, int64_t offset); +void ff_chrRangeFromJpeg8_neon(int16_t *dstU, int16_t *dstV, int width, + uint32_t coeff, int64_t offset); +void ff_lumRangeToJpeg8_neon(int16_t *dst, int width, + uint32_t coeff, int64_t offset); +void ff_chrRangeToJpeg8_neon(int16_t *dstU, int16_t *dstV, int width, + uint32_t coeff, int64_t offset); +void ff_lumRangeFromJpeg16_neon(int16_t *dst, int width, + uint32_t coeff, 
int64_t offset); +void ff_chrRangeFromJpeg16_neon(int16_t *dstU, int16_t *dstV, int width, + uint32_t coeff, int64_t offset); +void ff_lumRangeToJpeg16_neon(int16_t *dst, int width, uint32_t coeff, int64_t offset); -void ff_chrRangeFromJpeg_neon(int16_t *dstU, int16_t *dstV, int width, +void ff_chrRangeToJpeg16_neon(int16_t *dstU, int16_t *dstV, int width, uint32_t coeff, int64_t offset); -void ff_lumRangeToJpeg_neon(int16_t *dst, int width, - uint32_t coeff, int64_t offset); -void ff_chrRangeToJpeg_neon(int16_t *dstU, int16_t *dstV, int width, - uint32_t coeff, int64_t offset); av_cold void ff_sws_init_range_convert_aarch64(SwsInternal *c) { @@ -234,11 +242,19 @@ av_cold void ff_sws_init_range_convert_aarch64(SwsInternal *c) if (have_neon(cpu_flags)) { if (c->dstBpc <= 14) { if (c->opts.src_range) { - c->lumConvertRange = ff_lumRangeFromJpeg_neon; - c->chrConvertRange = ff_chrRangeFromJpeg_neon; + c->lumConvertRange = ff_lumRangeFromJpeg8_neon; + c->chrConvertRange = ff_chrRangeFromJpeg8_neon; } else { - c->lumConvertRange = ff_lumRangeToJpeg_neon; - c->chrConvertRange = ff_chrRangeToJpeg_neon; + c->lumConvertRange = ff_lumRangeToJpeg8_neon; + c->chrConvertRange = ff_chrRangeToJpeg8_neon; + } + } else { + if (c->opts.src_range) { + c->lumConvertRange = ff_lumRangeFromJpeg16_neon; + c->chrConvertRange = ff_chrRangeFromJpeg16_neon; + } else { + c->lumConvertRange = ff_lumRangeToJpeg16_neon; + c->chrConvertRange = ff_chrRangeToJpeg16_neon; } } } -- 2.39.5 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".