On Sun, 8 Sep 2024, Zhao Zhili wrote:

diff --git a/libavcodec/aarch64/h26x/dsp.h b/libavcodec/aarch64/h26x/dsp.h
index f72746ce03..076d01b477 100644
--- a/libavcodec/aarch64/h26x/dsp.h
+++ b/libavcodec/aarch64/h26x/dsp.h
@@ -248,4 +248,26 @@ NEON8_FNPROTO_PARTIAL_4(qpel, (int16_t *dst, const uint8_t 
*_src, ptrdiff_t _src
NEON8_FNPROTO_PARTIAL_4(qpel_uni, (uint8_t *_dst, ptrdiff_t _dststride, const 
uint8_t *_src,
        ptrdiff_t _srcstride, int height, const int8_t *hf, const int8_t *vf, 
int width),)

+#undef NEON8_FNPROTO_PARTIAL_6
+#define NEON8_FNPROTO_PARTIAL_6(fn, args, ext) \
+    void ff_vvc_put_##fn##4_8_neon##ext args; \
+    void ff_vvc_put_##fn##8_8_neon##ext args; \
+    void ff_vvc_put_##fn##16_8_neon##ext args; \
+    void ff_vvc_put_##fn##32_8_neon##ext args; \
+    void ff_vvc_put_##fn##64_8_neon##ext args; \
+    void ff_vvc_put_##fn##128_8_neon##ext args
+
+NEON8_FNPROTO_PARTIAL_6(pel_pixels, (int16_t *dst,
+        const uint8_t *src, ptrdiff_t srcstride, int height,
+        const int8_t *hf, const int8_t *vf, int width),);
+
+NEON8_FNPROTO_PARTIAL_6(pel_uni_pixels, (uint8_t *_dst, ptrdiff_t _dststride,
+        const uint8_t *_src, ptrdiff_t _srcstride, int height,
+        const int8_t *hf, const int8_t *vf, int width),);
+
+NEON8_FNPROTO_PARTIAL_6(pel_uni_w_pixels, (uint8_t *_dst, ptrdiff_t _dststride,
+        const uint8_t *_src, ptrdiff_t _srcstride,
+        int height, int denom, int wx, int ox,
+        const int8_t *hf, const int8_t *vf, int width),);
+
#endif
diff --git a/libavcodec/aarch64/h26x/epel_neon.S 
b/libavcodec/aarch64/h26x/epel_neon.S
index 378b0f7fb2..729395f2f0 100644
--- a/libavcodec/aarch64/h26x/epel_neon.S
+++ b/libavcodec/aarch64/h26x/epel_neon.S
@@ -19,7 +19,8 @@
 */

#include "libavutil/aarch64/asm.S"
-#define MAX_PB_SIZE 64
+#define HEVC_MAX_PB_SIZE 64
+#define VVC_MAX_PB_SIZE 128

const epel_filters, align=4
        .byte  0,  0,  0,  0
@@ -131,8 +132,13 @@ endconst
        b.ne            1b
.endm

+function ff_vvc_put_pel_pixels4_8_neon, export=1
+        mov             x7, #(VVC_MAX_PB_SIZE * 2)
+        b               1f
+endfunc
+
function ff_hevc_put_hevc_pel_pixels4_8_neon, export=1
-        mov             x7, #(MAX_PB_SIZE * 2)
+        mov             x7, #(HEVC_MAX_PB_SIZE * 2)
1:      ld1             {v0.s}[0], [x1], x2
        ushll           v4.8h, v0.8b, #6
        subs            w3, w3, #1
@@ -142,7 +148,7 @@ function ff_hevc_put_hevc_pel_pixels4_8_neon, export=1
endfunc

function ff_hevc_put_hevc_pel_pixels6_8_neon, export=1
-        mov             x7, #(MAX_PB_SIZE * 2 - 8)
+        mov             x7, #(HEVC_MAX_PB_SIZE * 2 - 8)
1:      ld1             {v0.8b}, [x1], x2
        ushll           v4.8h, v0.8b, #6
        st1             {v4.d}[0], [x0], #8
@@ -152,8 +158,13 @@ function ff_hevc_put_hevc_pel_pixels6_8_neon, export=1
        ret
endfunc

+function ff_vvc_put_pel_pixels8_8_neon, export=1
+        mov             x7, #(VVC_MAX_PB_SIZE * 2)
+        b               1f
+endfunc
+
function ff_hevc_put_hevc_pel_pixels8_8_neon, export=1
-        mov             x7, #(MAX_PB_SIZE * 2)
+        mov             x7, #(HEVC_MAX_PB_SIZE * 2)
1:      ld1             {v0.8b}, [x1], x2
        ushll           v4.8h, v0.8b, #6
        subs            w3, w3, #1
@@ -163,7 +174,7 @@ function ff_hevc_put_hevc_pel_pixels8_8_neon, export=1
endfunc

function ff_hevc_put_hevc_pel_pixels12_8_neon, export=1
-        mov             x7, #(MAX_PB_SIZE * 2 - 16)
+        mov             x7, #(HEVC_MAX_PB_SIZE * 2 - 16)
1:      ld1             {v0.8b, v1.8b}, [x1], x2
        ushll           v4.8h, v0.8b, #6
        st1             {v4.8h}, [x0], #16
@@ -174,8 +185,13 @@ function ff_hevc_put_hevc_pel_pixels12_8_neon, export=1
        ret
endfunc

+function ff_vvc_put_pel_pixels16_8_neon, export=1
+        mov             x7, #(VVC_MAX_PB_SIZE * 2)
+        b               1f
+endfunc
+
function ff_hevc_put_hevc_pel_pixels16_8_neon, export=1
-        mov             x7, #(MAX_PB_SIZE * 2)
+        mov             x7, #(HEVC_MAX_PB_SIZE * 2)
1:      ld1             {v0.8b, v1.8b}, [x1], x2
        ushll           v4.8h, v0.8b, #6
        ushll           v5.8h, v1.8b, #6
@@ -186,7 +202,7 @@ function ff_hevc_put_hevc_pel_pixels16_8_neon, export=1
endfunc

function ff_hevc_put_hevc_pel_pixels24_8_neon, export=1
-        mov             x7, #(MAX_PB_SIZE * 2)
+        mov             x7, #(HEVC_MAX_PB_SIZE * 2)
1:      ld1             {v0.8b-v2.8b}, [x1], x2
        ushll           v4.8h, v0.8b, #6
        ushll           v5.8h, v1.8b, #6
@@ -197,8 +213,13 @@ function ff_hevc_put_hevc_pel_pixels24_8_neon, export=1
        ret
endfunc

+function ff_vvc_put_pel_pixels32_8_neon, export=1
+        mov             x7, #(VVC_MAX_PB_SIZE * 2)
+        b               1f
+endfunc
+
function ff_hevc_put_hevc_pel_pixels32_8_neon, export=1
-        mov             x7, #(MAX_PB_SIZE * 2)
+        mov             x7, #(HEVC_MAX_PB_SIZE * 2)
1:      ld1             {v0.8b-v3.8b}, [x1], x2
        ushll           v4.8h, v0.8b, #6
        ushll           v5.8h, v1.8b, #6
@@ -211,7 +232,7 @@ function ff_hevc_put_hevc_pel_pixels32_8_neon, export=1
endfunc

function ff_hevc_put_hevc_pel_pixels48_8_neon, export=1
-        mov             x7, #(MAX_PB_SIZE)
+        mov             x7, #(HEVC_MAX_PB_SIZE)
1:      ld1             {v0.16b-v2.16b}, [x1], x2
        ushll           v4.8h, v0.8b, #6
        ushll2          v5.8h, v0.16b, #6
@@ -226,26 +247,50 @@ function ff_hevc_put_hevc_pel_pixels48_8_neon, export=1
        ret
endfunc

-function ff_hevc_put_hevc_pel_pixels64_8_neon, export=1
-1:      ld1             {v0.16b-v3.16b}, [x1], x2
+.macro put_pel_pixels64_8_neon
        ushll           v4.8h, v0.8b, #6
        ushll2          v5.8h, v0.16b, #6
        ushll           v6.8h, v1.8b, #6
        ushll2          v7.8h, v1.16b, #6
-        st1             {v4.8h-v7.8h}, [x0], #(MAX_PB_SIZE)
+        st1             {v4.8h-v7.8h}, [x0], #64
        ushll           v16.8h, v2.8b, #6
        ushll2          v17.8h, v2.16b, #6
        ushll           v18.8h, v3.8b, #6
        ushll2          v19.8h, v3.16b, #6
-        subs            w3, w3, #1
-        st1             {v16.8h-v19.8h}, [x0], #(MAX_PB_SIZE)
-        b.ne            1b
+        st1             {v16.8h-v19.8h}, [x0], x7
+.endm
+
+function ff_vvc_put_pel_pixels64_8_neon, export=1
+        mov             x7, #(2 * VVC_MAX_PB_SIZE - 64)
+        b               1f
+endfunc
+
+function ff_hevc_put_hevc_pel_pixels64_8_neon, export=1
+        mov             x7, #(HEVC_MAX_PB_SIZE)
+1:
+        ld1             {v0.16b-v3.16b}, [x1], x2
+        sub             w3, w3, #1
+        put_pel_pixels64_8_neon
+        cbnz            w3, 1b

We'd typically use subs + b.ne, rather than sub+cbnz, for loops like these. Or is there anything inside the macros that clobbers the condition flags?

The same applies to most of the other functions you're touching in this patch.

+function ff_vvc_put_pel_uni_pixels128_8_neon, export=1
+1:
+        mov             x5, x2
+        mov             x6, x0
+        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x5], #64
+        ld1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x5]
+        sub             w4, w4, #1
+        add             x2, x2, x3
+        add             x0, x0, x1
+        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], #64
+        st1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x6]
+        cbnz            w4, 1b
+        ret
+endfunc

subs+b.ne rather than sub+cbnz, for consistency if nothing else.

The copying of values back and forth between x2/x5 and x0/x6 seems wasteful here. I'd suggest this instead:

  sub x1, x1, #64
  sub x3, x3, #64
1:
  ld1 [x2], #64
  subs w4, w4, #1
  ld1 [x2], x3
  ...
  st1 [x0], #64
  st1 [x0], x1
  b.ne 1b

The same goes in ff_vvc_put_pel_uni_w_pixels128_8_neon below as well.

// Martin

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

Reply via email to