range_convert: update neon range_convert functions to new API

Ramiro Polla Sun, 01 Dec 2024 10:21:45 -0800

aarch64 A55:
chrRangeFromJpeg8_1920_c:    28835.2 (1.00x)
chrRangeFromJpeg8_1920_neon:  5313.9 (5.43x)  5308.4 (5.43x)
chrRangeToJpeg8_1920_c:      23074.7 (1.00x)
chrRangeToJpeg8_1920_neon:    5551.3 (4.16x)  5549.2 (4.16x)
lumRangeFromJpeg8_1920_c:    15389.7 (1.00x)
lumRangeFromJpeg8_1920_neon:  3152.3 (4.88x)  3147.7 (4.89x)
lumRangeToJpeg8_1920_c:      19227.8 (1.00x)
lumRangeToJpeg8_1920_neon:    3628.7 (5.30x)  3630.2 (5.30x)


aarch64 A76:
chrRangeFromJpeg8_1920_c:    6324.4 (1.00x)
chrRangeFromJpeg8_1920_neon: 2344.5 (2.70x) 2304.2 (2.74x)
chrRangeToJpeg8_1920_c:      9656.0 (1.00x)
chrRangeToJpeg8_1920_neon:   2824.2 (3.42x) 2794.2 (3.46x)
lumRangeFromJpeg8_1920_c:    4422.0 (1.00x)
lumRangeFromJpeg8_1920_neon: 1104.5 (4.00x) 1106.2 (4.00x)
lumRangeToJpeg8_1920_c:      5949.1 (1.00x)
lumRangeToJpeg8_1920_neon:   1329.8 (4.47x) 1328.2 (4.48x)
---
 libswscale/aarch64/range_convert_neon.S | 59 +++++++++++++------------
 libswscale/aarch64/swscale.c            | 17 ++++---
 2 files changed, 39 insertions(+), 37 deletions(-)

diff --git a/libswscale/aarch64/range_convert_neon.S b/libswscale/aarch64/range_convert_neon.S
index 2f418adb24..462ba6f866 100644
--- a/libswscale/aarch64/range_convert_neon.S
+++ b/libswscale/aarch64/range_convert_neon.S
@@ -20,12 +20,13 @@
 
 #include "libavutil/aarch64/asm.S"
 
-.macro lumConvertRange name, fromto, mult, offset, shift
-function ff_\name, export=1
-        mov             w3, #\mult
-        dup             v25.4s, w3
-        movz            w3, #(\offset & 0xffff)
-        movk            w3, #((\offset >> 16) & 0xffff), lsl #16
+.macro lumConvertRange fromto
+function ff_lumRange\fromto\()Jpeg_neon, export=1
+// x0  int16_t *dst
+// w1  int width
+// w2  uint32_t coeff
+// x3  int64_t offset
+        dup             v25.4s, w2
         dup             v26.4s, w3
 1:
         ld1             {v0.8h}, [x0]
@@ -36,11 +37,11 @@ function ff_\name, export=1
         mla             v16.4s, v20.4s, v25.4s
         mla             v18.4s, v22.4s, v25.4s
 .ifc \fromto, To
-        sqshrn          v0.4h, v16.4s, #\shift
-        sqshrn2         v0.8h, v18.4s, #\shift
+        sqshrn          v0.4h, v16.4s, 14
+        sqshrn2         v0.8h, v18.4s, 14
 .else
-        shrn            v0.4h, v16.4s, #\shift
-        shrn2           v0.8h, v18.4s, #\shift
+        shrn            v0.4h, v16.4s, 14
+        shrn2           v0.8h, v18.4s, 14
 .endif
         subs            w1, w1, #8
         st1             {v0.8h}, [x0], #16
@@ -49,13 +50,15 @@ function ff_\name, export=1
 endfunc
 .endm
 
-.macro chrConvertRange name, fromto, mult, offset, shift
-function ff_\name, export=1
-        mov             w3, #\mult
+.macro chrConvertRange fromto
+function ff_chrRange\fromto\()Jpeg_neon, export=1
+// x0  int16_t *dstU
+// x1  int16_t *dstV
+// w2  int width
+// w3  uint32_t coeff
+// x4  int64_t offset
         dup             v25.4s, w3
-        movz            w3, #(\offset & 0xffff)
-        movk            w3, #((\offset >> 16) & 0xffff), lsl #16
-        dup             v26.4s, w3
+        dup             v26.4s, w4
 1:
         ld1             {v0.8h}, [x0]
         ld1             {v1.8h}, [x1]
@@ -72,15 +75,15 @@ function ff_\name, export=1
         mla             v18.4s, v22.4s, v25.4s
         mla             v19.4s, v23.4s, v25.4s
 .ifc \fromto, To
-        sqshrn          v0.4h, v16.4s, #\shift
-        sqshrn          v1.4h, v17.4s, #\shift
-        sqshrn2         v0.8h, v18.4s, #\shift
-        sqshrn2         v1.8h, v19.4s, #\shift
+        sqshrn          v0.4h, v16.4s, 14
+        sqshrn          v1.4h, v17.4s, 14
+        sqshrn2         v0.8h, v18.4s, 14
+        sqshrn2         v1.8h, v19.4s, 14
 .else
-        shrn            v0.4h, v16.4s, #\shift
-        shrn            v1.4h, v17.4s, #\shift
-        shrn2           v0.8h, v18.4s, #\shift
-        shrn2           v1.8h, v19.4s, #\shift
+        shrn            v0.4h, v16.4s, 14
+        shrn            v1.4h, v17.4s, 14
+        shrn2           v0.8h, v18.4s, 14
+        shrn2           v1.8h, v19.4s, 14
 .endif
         subs            w2, w2, #8
         st1             {v0.8h}, [x0], #16
@@ -90,7 +93,7 @@ function ff_\name, export=1
 endfunc
 .endm
 
-lumConvertRange lumRangeToJpeg_neon,   To,   19077, -39057361, 14
-chrConvertRange chrRangeToJpeg_neon,   To,    4663,  -9289992, 12
-lumConvertRange lumRangeFromJpeg_neon, From, 14071,  33561947, 14
-chrConvertRange chrRangeFromJpeg_neon, From,  1799,   4081085, 11
+lumConvertRange To
+chrConvertRange To
+lumConvertRange From
+chrConvertRange From
diff --git a/libswscale/aarch64/swscale.c b/libswscale/aarch64/swscale.c
index 1fce77df26..b8679734c4 100644
--- a/libswscale/aarch64/swscale.c
+++ b/libswscale/aarch64/swscale.c
@@ -218,17 +218,17 @@ NEON_INPUT(bgra32);
 NEON_INPUT(rgb24);
 NEON_INPUT(rgba32);
 
-void ff_lumRangeFromJpeg_neon(int16_t *dst, int width);
-void ff_chrRangeFromJpeg_neon(int16_t *dstU, int16_t *dstV, int width);
-void ff_lumRangeToJpeg_neon(int16_t *dst, int width);
-void ff_chrRangeToJpeg_neon(int16_t *dstU, int16_t *dstV, int width);
+void ff_lumRangeFromJpeg_neon(int16_t *dst, int width,
+                              uint32_t coeff, int64_t offset);
+void ff_chrRangeFromJpeg_neon(int16_t *dstU, int16_t *dstV, int width,
+                              uint32_t coeff, int64_t offset);
+void ff_lumRangeToJpeg_neon(int16_t *dst, int width,
+                            uint32_t coeff, int64_t offset);
+void ff_chrRangeToJpeg_neon(int16_t *dstU, int16_t *dstV, int width,
+                            uint32_t coeff, int64_t offset);
 
 av_cold void ff_sws_init_range_convert_aarch64(SwsInternal *c)
 {
-    /* This code is currently disabled because of changes in the base
-     * implementation of these functions. This code should be enabled
-     * again once those changes are ported to this architecture. */
-#if 0
     int cpu_flags = av_get_cpu_flags();
 
     if (have_neon(cpu_flags)) {
@@ -242,7 +242,6 @@ av_cold void ff_sws_init_range_convert_aarch64(SwsInternal *c)
             }
         }
     }
-#endif
 }
 
 av_cold void ff_sws_init_swscale_aarch64(SwsInternal *c)
-- 
2.39.5

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH v4 6/8] swscale/aarch64/range_convert: update neon range_convert functions to new API

Reply via email to