                                  A55               A76
chrRangeFromJpeg16_1920_c:    28848.5            6325.2
chrRangeFromJpeg16_1920_neon:  8433.0 ( 3.42x)   3365.8 ( 1.88x)
chrRangeToJpeg16_1920_c:      36558.7            9479.0
chrRangeToJpeg16_1920_neon:    9395.5 ( 3.89x)   4083.8 ( 2.32x)
lumRangeFromJpeg16_1920_c:    15390.0            4430.5
lumRangeFromJpeg16_1920_neon:  4588.7 ( 3.35x)   1814.5 ( 2.44x)
lumRangeToJpeg16_1920_c:      19223.0            6014.8
lumRangeToJpeg16_1920_neon:    5306.0 ( 3.62x)   2050.8 ( 2.93x)
---
 libswscale/aarch64/range_convert_neon.S | 94 ++++++++++++++++++++++---
 libswscale/aarch64/swscale.c            | 36 +++++++---
 2 files changed, 112 insertions(+), 18 deletions(-)
diff --git a/libswscale/aarch64/range_convert_neon.S b/libswscale/aarch64/range_convert_neon.S index 3454ee4932..067e524195 100644 --- a/libswscale/aarch64/range_convert_neon.S +++ b/libswscale/aarch64/range_convert_neon.S @@ -20,13 +20,41 @@ #include "libavutil/aarch64/asm.S" -.macro lumConvertRange fromto -function ff_lumRange\fromto\()Jpeg_neon, export=1 +.macro lumConvertRange fromto, bit_depth +function ff_lumRange\fromto\()Jpeg\bit_depth\()_neon, export=1 // x0 int16_t *dst // w1 int width // w2 int amax // w3 int coeff // x4 int64_t offset +.if \bit_depth == 16 +.ifc \fromto, To + dup v24.4s, w2 +.endif + dup v25.4s, w3 + dup v26.2d, x4 +1: + ld1 {v0.4s, v1.4s}, [x0] +.ifc \fromto, To + smin v0.4s, v0.4s, v24.4s + smin v1.4s, v1.4s, v24.4s +.endif + mov v16.16b, v26.16b + mov v17.16b, v26.16b + mov v18.16b, v26.16b + mov v19.16b, v26.16b + smlal v16.2d, v0.2s, v25.2s + smlal2 v17.2d, v0.4s, v25.4s + smlal v18.2d, v1.2s, v25.2s + smlal2 v19.2d, v1.4s, v25.4s + shrn v0.2s, v16.2d, 18 + shrn2 v0.4s, v17.2d, 18 + shrn v1.2s, v18.2d, 18 + shrn2 v1.4s, v19.2d, 18 + subs w1, w1, #8 + st1 {v0.4s, v1.4s}, [x0], #32 + b.gt 1b +.else .ifc \fromto, To dup v24.8h, w2 .endif @@ -48,18 +76,63 @@ function ff_lumRange\fromto\()Jpeg_neon, export=1 subs w1, w1, #8 st1 {v0.8h}, [x0], #16 b.gt 1b +.endif ret endfunc .endm -.macro chrConvertRange fromto -function ff_chrRange\fromto\()Jpeg_neon, export=1 +.macro chrConvertRange fromto, bit_depth +function ff_chrRange\fromto\()Jpeg\bit_depth\()_neon, export=1 // x0 int16_t *dstU // x1 int16_t *dstV // w2 int width // w3 int amax // w4 int coeff // x5 int64_t offset +.if \bit_depth == 16 +.ifc \fromto, To + dup v24.4s, w3 +.endif + dup v25.4s, w4 + dup v26.2d, x5 +1: + ld1 {v0.4s, v1.4s}, [x0] + ld1 {v2.4s, v3.4s}, [x1] +.ifc \fromto, To + smin v0.4s, v0.4s, v24.4s + smin v1.4s, v1.4s, v24.4s + smin v2.4s, v2.4s, v24.4s + smin v3.4s, v3.4s, v24.4s +.endif + mov v16.16b, v26.16b + mov v17.16b, v26.16b + mov v18.16b, v26.16b + mov 
v19.16b, v26.16b + mov v20.16b, v26.16b + mov v21.16b, v26.16b + mov v22.16b, v26.16b + mov v23.16b, v26.16b + smlal v16.2d, v0.2s, v25.2s + smlal2 v17.2d, v0.4s, v25.4s + smlal v18.2d, v1.2s, v25.2s + smlal2 v19.2d, v1.4s, v25.4s + smlal v20.2d, v2.2s, v25.2s + smlal2 v21.2d, v2.4s, v25.4s + smlal v22.2d, v3.2s, v25.2s + smlal2 v23.2d, v3.4s, v25.4s + shrn v0.2s, v16.2d, 18 + shrn2 v0.4s, v17.2d, 18 + shrn v1.2s, v18.2d, 18 + shrn2 v1.4s, v19.2d, 18 + shrn v2.2s, v20.2d, 18 + shrn2 v2.4s, v21.2d, 18 + shrn v3.2s, v22.2d, 18 + shrn2 v3.4s, v23.2d, 18 + subs w2, w2, #8 + st1 {v0.4s, v1.4s}, [x0], #32 + st1 {v2.4s, v3.4s}, [x1], #32 + b.gt 1b +.else .ifc \fromto, To dup v24.8h, w3 .endif @@ -92,11 +165,16 @@ function ff_chrRange\fromto\()Jpeg_neon, export=1 st1 {v0.8h}, [x0], #16 st1 {v1.8h}, [x1], #16 b.gt 1b +.endif ret endfunc .endm -lumConvertRange To -chrConvertRange To -lumConvertRange From -chrConvertRange From +lumConvertRange To, 8 +lumConvertRange To, 16 +chrConvertRange To, 8 +chrConvertRange To, 16 +lumConvertRange From, 8 +lumConvertRange From, 16 +chrConvertRange From, 8 +chrConvertRange From, 16 diff --git a/libswscale/aarch64/swscale.c b/libswscale/aarch64/swscale.c index 55fb81c1e3..d6ae6103d6 100644 --- a/libswscale/aarch64/swscale.c +++ b/libswscale/aarch64/swscale.c @@ -218,14 +218,22 @@ NEON_INPUT(bgra32); NEON_INPUT(rgb24); NEON_INPUT(rgba32); -void ff_lumRangeFromJpeg_neon(int16_t *dst, int width, +void ff_lumRangeFromJpeg8_neon(int16_t *dst, int width, int amax, int coeff, int64_t offset); -void ff_chrRangeFromJpeg_neon(int16_t *dstU, int16_t *dstV, int width, +void ff_chrRangeFromJpeg8_neon(int16_t *dstU, int16_t *dstV, int width, + int amax, int coeff, int64_t offset); +void ff_lumRangeToJpeg8_neon(int16_t *dst, int width, + int amax, int coeff, int64_t offset); +void ff_chrRangeToJpeg8_neon(int16_t *dstU, int16_t *dstV, int width, + int amax, int coeff, int64_t offset); +void ff_lumRangeFromJpeg16_neon(int16_t *dst, int width, + int amax, 
int coeff, int64_t offset); +void ff_chrRangeFromJpeg16_neon(int16_t *dstU, int16_t *dstV, int width, + int amax, int coeff, int64_t offset); +void ff_lumRangeToJpeg16_neon(int16_t *dst, int width, + int amax, int coeff, int64_t offset); +void ff_chrRangeToJpeg16_neon(int16_t *dstU, int16_t *dstV, int width, int amax, int coeff, int64_t offset); -void ff_lumRangeToJpeg_neon(int16_t *dst, int width, - int amax, int coeff, int64_t offset); -void ff_chrRangeToJpeg_neon(int16_t *dstU, int16_t *dstV, int width, - int amax, int coeff, int64_t offset); av_cold void ff_sws_init_range_convert_aarch64(SwsContext *c) { @@ -234,11 +242,19 @@ av_cold void ff_sws_init_range_convert_aarch64(SwsContext *c) if (have_neon(cpu_flags)) { if (c->dstBpc <= 14) { if (c->srcRange) { - c->lumConvertRange = ff_lumRangeFromJpeg_neon; - c->chrConvertRange = ff_chrRangeFromJpeg_neon; + c->lumConvertRange = ff_lumRangeFromJpeg8_neon; + c->chrConvertRange = ff_chrRangeFromJpeg8_neon; + } else { + c->lumConvertRange = ff_lumRangeToJpeg8_neon; + c->chrConvertRange = ff_chrRangeToJpeg8_neon; + } + } else { + if (c->srcRange) { + c->lumConvertRange = ff_lumRangeFromJpeg16_neon; + c->chrConvertRange = ff_chrRangeFromJpeg16_neon; } else { - c->lumConvertRange = ff_lumRangeToJpeg_neon; - c->chrConvertRange = ff_chrRangeToJpeg_neon; + c->lumConvertRange = ff_lumRangeToJpeg16_neon; + c->chrConvertRange = ff_chrRangeToJpeg16_neon; } } } -- 2.30.2 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".